summaryrefslogtreecommitdiff
path: root/src/third_party/wiredtiger/src/txn
diff options
context:
space:
mode:
Diffstat (limited to 'src/third_party/wiredtiger/src/txn')
-rw-r--r--src/third_party/wiredtiger/src/txn/txn.c2899
-rw-r--r--src/third_party/wiredtiger/src/txn/txn_ckpt.c3249
-rw-r--r--src/third_party/wiredtiger/src/txn/txn_ext.c91
-rw-r--r--src/third_party/wiredtiger/src/txn/txn_log.c1247
-rw-r--r--src/third_party/wiredtiger/src/txn/txn_nsnap.c679
-rw-r--r--src/third_party/wiredtiger/src/txn/txn_recover.c1329
-rw-r--r--src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c872
-rw-r--r--src/third_party/wiredtiger/src/txn/txn_timestamp.c2298
8 files changed, 6059 insertions, 6605 deletions
diff --git a/src/third_party/wiredtiger/src/txn/txn.c b/src/third_party/wiredtiger/src/txn/txn.c
index b3085080956..66a5330258b 100644
--- a/src/third_party/wiredtiger/src/txn/txn.c
+++ b/src/third_party/wiredtiger/src/txn/txn.c
@@ -10,1772 +10,1631 @@
/*
* __snapsort_partition --
- * Custom quick sort partitioning for snapshots.
+ * Custom quick sort partitioning for snapshots.
*/
static uint32_t
__snapsort_partition(uint64_t *array, uint32_t f, uint32_t l, uint64_t pivot)
{
- uint32_t i, j;
-
- i = f - 1;
- j = l + 1;
- for (;;) {
- while (pivot < array[--j])
- ;
- while (array[++i] < pivot)
- ;
- if (i < j) {
- uint64_t tmp = array[i];
- array[i] = array[j];
- array[j] = tmp;
- } else
- return (j);
- }
+ uint32_t i, j;
+
+ i = f - 1;
+ j = l + 1;
+ for (;;) {
+ while (pivot < array[--j])
+ ;
+ while (array[++i] < pivot)
+ ;
+ if (i < j) {
+ uint64_t tmp = array[i];
+ array[i] = array[j];
+ array[j] = tmp;
+ } else
+ return (j);
+ }
}
/*
* __snapsort_impl --
- * Custom quick sort implementation for snapshots.
+ * Custom quick sort implementation for snapshots.
*/
static void
__snapsort_impl(uint64_t *array, uint32_t f, uint32_t l)
{
- while (f + 16 < l) {
- uint64_t v1 = array[f], v2 = array[l], v3 = array[(f + l)/2];
- uint64_t median = v1 < v2 ?
- (v3 < v1 ? v1 : WT_MIN(v2, v3)) :
- (v3 < v2 ? v2 : WT_MIN(v1, v3));
- uint32_t m = __snapsort_partition(array, f, l, median);
- __snapsort_impl(array, f, m);
- f = m + 1;
- }
+ while (f + 16 < l) {
+ uint64_t v1 = array[f], v2 = array[l], v3 = array[(f + l) / 2];
+ uint64_t median =
+ v1 < v2 ? (v3 < v1 ? v1 : WT_MIN(v2, v3)) : (v3 < v2 ? v2 : WT_MIN(v1, v3));
+ uint32_t m = __snapsort_partition(array, f, l, median);
+ __snapsort_impl(array, f, m);
+ f = m + 1;
+ }
}
/*
* __snapsort --
- * Sort an array of transaction IDs.
+ * Sort an array of transaction IDs.
*/
static void
__snapsort(uint64_t *array, uint32_t size)
{
- __snapsort_impl(array, 0, size - 1);
- WT_INSERTION_SORT(array, size, uint64_t, WT_TXNID_LT);
+ __snapsort_impl(array, 0, size - 1);
+ WT_INSERTION_SORT(array, size, uint64_t, WT_TXNID_LT);
}
/*
* __txn_remove_from_global_table --
- * Remove the transaction id from the global transaction table.
+ * Remove the transaction id from the global transaction table.
*/
static inline void
__txn_remove_from_global_table(WT_SESSION_IMPL *session)
{
#ifdef HAVE_DIAGNOSTIC
- WT_TXN *txn;
- WT_TXN_GLOBAL *txn_global;
- WT_TXN_STATE *txn_state;
+ WT_TXN *txn;
+ WT_TXN_GLOBAL *txn_global;
+ WT_TXN_STATE *txn_state;
- txn = &session->txn;
- txn_global = &S2C(session)->txn_global;
- txn_state = WT_SESSION_TXN_STATE(session);
+ txn = &session->txn;
+ txn_global = &S2C(session)->txn_global;
+ txn_state = WT_SESSION_TXN_STATE(session);
- WT_ASSERT(session, !WT_TXNID_LT(txn->id, txn_global->last_running));
- WT_ASSERT(session,
- txn->id != WT_TXN_NONE && txn_state->id != WT_TXN_NONE);
+ WT_ASSERT(session, !WT_TXNID_LT(txn->id, txn_global->last_running));
+ WT_ASSERT(session, txn->id != WT_TXN_NONE && txn_state->id != WT_TXN_NONE);
#else
- WT_TXN_STATE *txn_state;
+ WT_TXN_STATE *txn_state;
- txn_state = WT_SESSION_TXN_STATE(session);
+ txn_state = WT_SESSION_TXN_STATE(session);
#endif
- WT_PUBLISH(txn_state->id, WT_TXN_NONE);
+ WT_PUBLISH(txn_state->id, WT_TXN_NONE);
}
/*
* __txn_sort_snapshot --
- * Sort a snapshot for faster searching and set the min/max bounds.
+ * Sort a snapshot for faster searching and set the min/max bounds.
*/
static void
__txn_sort_snapshot(WT_SESSION_IMPL *session, uint32_t n, uint64_t snap_max)
{
- WT_TXN *txn;
+ WT_TXN *txn;
- txn = &session->txn;
+ txn = &session->txn;
- if (n > 1)
- __snapsort(txn->snapshot, n);
+ if (n > 1)
+ __snapsort(txn->snapshot, n);
- txn->snapshot_count = n;
- txn->snap_max = snap_max;
- txn->snap_min = (n > 0 && WT_TXNID_LE(txn->snapshot[0], snap_max)) ?
- txn->snapshot[0] : snap_max;
- F_SET(txn, WT_TXN_HAS_SNAPSHOT);
- WT_ASSERT(session, n == 0 || txn->snap_min != WT_TXN_NONE);
+ txn->snapshot_count = n;
+ txn->snap_max = snap_max;
+ txn->snap_min =
+ (n > 0 && WT_TXNID_LE(txn->snapshot[0], snap_max)) ? txn->snapshot[0] : snap_max;
+ F_SET(txn, WT_TXN_HAS_SNAPSHOT);
+ WT_ASSERT(session, n == 0 || txn->snap_min != WT_TXN_NONE);
}
/*
* __wt_txn_release_snapshot --
- * Release the snapshot in the current transaction.
+ * Release the snapshot in the current transaction.
*/
void
__wt_txn_release_snapshot(WT_SESSION_IMPL *session)
{
- WT_TXN *txn;
- WT_TXN_GLOBAL *txn_global;
- WT_TXN_STATE *txn_state;
+ WT_TXN *txn;
+ WT_TXN_GLOBAL *txn_global;
+ WT_TXN_STATE *txn_state;
- txn = &session->txn;
- txn_global = &S2C(session)->txn_global;
- txn_state = WT_SESSION_TXN_STATE(session);
+ txn = &session->txn;
+ txn_global = &S2C(session)->txn_global;
+ txn_state = WT_SESSION_TXN_STATE(session);
- WT_ASSERT(session,
- txn_state->pinned_id == WT_TXN_NONE ||
- session->txn.isolation == WT_ISO_READ_UNCOMMITTED ||
- !__wt_txn_visible_all(session, txn_state->pinned_id, WT_TS_NONE));
+ WT_ASSERT(session, txn_state->pinned_id == WT_TXN_NONE ||
+ session->txn.isolation == WT_ISO_READ_UNCOMMITTED ||
+ !__wt_txn_visible_all(session, txn_state->pinned_id, WT_TS_NONE));
- txn_state->metadata_pinned = txn_state->pinned_id = WT_TXN_NONE;
- F_CLR(txn, WT_TXN_HAS_SNAPSHOT);
+ txn_state->metadata_pinned = txn_state->pinned_id = WT_TXN_NONE;
+ F_CLR(txn, WT_TXN_HAS_SNAPSHOT);
- /* Clear a checkpoint's pinned ID. */
- if (WT_SESSION_IS_CHECKPOINT(session)) {
- txn_global->checkpoint_state.pinned_id = WT_TXN_NONE;
- txn_global->checkpoint_timestamp = 0;
- }
+ /* Clear a checkpoint's pinned ID. */
+ if (WT_SESSION_IS_CHECKPOINT(session)) {
+ txn_global->checkpoint_state.pinned_id = WT_TXN_NONE;
+ txn_global->checkpoint_timestamp = 0;
+ }
- __wt_txn_clear_read_timestamp(session);
+ __wt_txn_clear_read_timestamp(session);
}
/*
* __wt_txn_get_snapshot --
- * Allocate a snapshot.
+ * Allocate a snapshot.
*/
void
__wt_txn_get_snapshot(WT_SESSION_IMPL *session)
{
- WT_CONNECTION_IMPL *conn;
- WT_TXN *txn;
- WT_TXN_GLOBAL *txn_global;
- WT_TXN_STATE *s, *txn_state;
- uint64_t commit_gen, current_id, id, prev_oldest_id, pinned_id;
- uint32_t i, n, session_cnt;
-
- conn = S2C(session);
- txn = &session->txn;
- txn_global = &conn->txn_global;
- txn_state = WT_SESSION_TXN_STATE(session);
- n = 0;
-
- /* Fast path if we already have the current snapshot. */
- if ((commit_gen = __wt_session_gen(session, WT_GEN_COMMIT)) != 0) {
- if (F_ISSET(txn, WT_TXN_HAS_SNAPSHOT) &&
- commit_gen == __wt_gen(session, WT_GEN_COMMIT))
- return;
- __wt_session_gen_leave(session, WT_GEN_COMMIT);
- }
- __wt_session_gen_enter(session, WT_GEN_COMMIT);
-
- /* We're going to scan the table: wait for the lock. */
- __wt_readlock(session, &txn_global->rwlock);
-
- current_id = pinned_id = txn_global->current;
- prev_oldest_id = txn_global->oldest_id;
-
- /*
- * Include the checkpoint transaction, if one is running: we should
- * ignore any uncommitted changes the checkpoint has written to the
- * metadata. We don't have to keep the checkpoint's changes pinned so
- * don't including it in the published pinned ID.
- */
- if ((id = txn_global->checkpoint_state.id) != WT_TXN_NONE) {
- txn->snapshot[n++] = id;
- txn_state->metadata_pinned = id;
- }
-
- /* For pure read-only workloads, avoid scanning. */
- if (prev_oldest_id == current_id) {
- txn_state->pinned_id = current_id;
- /* Check that the oldest ID has not moved in the meantime. */
- WT_ASSERT(session, prev_oldest_id == txn_global->oldest_id);
- goto done;
- }
-
- /* Walk the array of concurrent transactions. */
- WT_ORDERED_READ(session_cnt, conn->session_cnt);
- for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) {
- /*
- * Build our snapshot of any concurrent transaction IDs.
- *
- * Ignore:
- * - Our own ID: we always read our own updates.
- * - The ID if it is older than the oldest ID we saw. This
- * can happen if we race with a thread that is allocating
- * an ID -- the ID will not be used because the thread will
- * keep spinning until it gets a valid one.
- * - The ID if it is higher than the current ID we saw. This
- * can happen if the transaction is already finished. In
- * this case, we ignore this transaction because it would
- * not be visible to the current snapshot.
- */
- while (s != txn_state &&
- (id = s->id) != WT_TXN_NONE &&
- WT_TXNID_LE(prev_oldest_id, id) &&
- WT_TXNID_LT(id, current_id)) {
- /*
- * If the transaction is still allocating its ID, then
- * we spin here until it gets its valid ID.
- */
- WT_READ_BARRIER();
- if (!s->is_allocating) {
- /*
- * There is still a chance that fetched ID is
- * not valid after ID allocation, so we check
- * again here. The read of transaction ID
- * should be carefully ordered: we want to
- * re-read ID from transaction state after this
- * transaction completes ID allocation.
- */
- WT_READ_BARRIER();
- if (id == s->id) {
- txn->snapshot[n++] = id;
- if (WT_TXNID_LT(id, pinned_id))
- pinned_id = id;
- break;
- }
- }
- WT_PAUSE();
- }
- }
-
- /*
- * If we got a new snapshot, update the published pinned ID for this
- * session.
- */
- WT_ASSERT(session, WT_TXNID_LE(prev_oldest_id, pinned_id));
- WT_ASSERT(session, prev_oldest_id == txn_global->oldest_id);
- txn_state->pinned_id = pinned_id;
-
-done: __wt_readunlock(session, &txn_global->rwlock);
- __txn_sort_snapshot(session, n, current_id);
+ WT_CONNECTION_IMPL *conn;
+ WT_TXN *txn;
+ WT_TXN_GLOBAL *txn_global;
+ WT_TXN_STATE *s, *txn_state;
+ uint64_t commit_gen, current_id, id, prev_oldest_id, pinned_id;
+ uint32_t i, n, session_cnt;
+
+ conn = S2C(session);
+ txn = &session->txn;
+ txn_global = &conn->txn_global;
+ txn_state = WT_SESSION_TXN_STATE(session);
+ n = 0;
+
+ /* Fast path if we already have the current snapshot. */
+ if ((commit_gen = __wt_session_gen(session, WT_GEN_COMMIT)) != 0) {
+ if (F_ISSET(txn, WT_TXN_HAS_SNAPSHOT) && commit_gen == __wt_gen(session, WT_GEN_COMMIT))
+ return;
+ __wt_session_gen_leave(session, WT_GEN_COMMIT);
+ }
+ __wt_session_gen_enter(session, WT_GEN_COMMIT);
+
+ /* We're going to scan the table: wait for the lock. */
+ __wt_readlock(session, &txn_global->rwlock);
+
+ current_id = pinned_id = txn_global->current;
+ prev_oldest_id = txn_global->oldest_id;
+
+ /*
+ * Include the checkpoint transaction, if one is running: we should ignore any uncommitted
+ * changes the checkpoint has written to the metadata. We don't have to keep the checkpoint's
+     * changes pinned so don't include it in the published pinned ID.
+ */
+ if ((id = txn_global->checkpoint_state.id) != WT_TXN_NONE) {
+ txn->snapshot[n++] = id;
+ txn_state->metadata_pinned = id;
+ }
+
+ /* For pure read-only workloads, avoid scanning. */
+ if (prev_oldest_id == current_id) {
+ txn_state->pinned_id = current_id;
+ /* Check that the oldest ID has not moved in the meantime. */
+ WT_ASSERT(session, prev_oldest_id == txn_global->oldest_id);
+ goto done;
+ }
+
+ /* Walk the array of concurrent transactions. */
+ WT_ORDERED_READ(session_cnt, conn->session_cnt);
+ for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) {
+ /*
+ * Build our snapshot of any concurrent transaction IDs.
+ *
+ * Ignore:
+ * - Our own ID: we always read our own updates.
+ * - The ID if it is older than the oldest ID we saw. This
+ * can happen if we race with a thread that is allocating
+ * an ID -- the ID will not be used because the thread will
+ * keep spinning until it gets a valid one.
+ * - The ID if it is higher than the current ID we saw. This
+ * can happen if the transaction is already finished. In
+ * this case, we ignore this transaction because it would
+ * not be visible to the current snapshot.
+ */
+ while (s != txn_state && (id = s->id) != WT_TXN_NONE && WT_TXNID_LE(prev_oldest_id, id) &&
+ WT_TXNID_LT(id, current_id)) {
+ /*
+ * If the transaction is still allocating its ID, then we spin here until it gets its
+ * valid ID.
+ */
+ WT_READ_BARRIER();
+ if (!s->is_allocating) {
+ /*
+                 * There is still a chance that the fetched ID is not valid after ID allocation, so we
+ * check again here. The read of transaction ID should be carefully ordered: we want
+ * to re-read ID from transaction state after this transaction completes ID
+ * allocation.
+ */
+ WT_READ_BARRIER();
+ if (id == s->id) {
+ txn->snapshot[n++] = id;
+ if (WT_TXNID_LT(id, pinned_id))
+ pinned_id = id;
+ break;
+ }
+ }
+ WT_PAUSE();
+ }
+ }
+
+ /*
+ * If we got a new snapshot, update the published pinned ID for this session.
+ */
+ WT_ASSERT(session, WT_TXNID_LE(prev_oldest_id, pinned_id));
+ WT_ASSERT(session, prev_oldest_id == txn_global->oldest_id);
+ txn_state->pinned_id = pinned_id;
+
+done:
+ __wt_readunlock(session, &txn_global->rwlock);
+ __txn_sort_snapshot(session, n, current_id);
}
/*
* __txn_oldest_scan --
- * Sweep the running transactions to calculate the oldest ID required.
+ * Sweep the running transactions to calculate the oldest ID required.
*/
static void
-__txn_oldest_scan(WT_SESSION_IMPL *session,
- uint64_t *oldest_idp, uint64_t *last_runningp, uint64_t *metadata_pinnedp,
- WT_SESSION_IMPL **oldest_sessionp)
+__txn_oldest_scan(WT_SESSION_IMPL *session, uint64_t *oldest_idp, uint64_t *last_runningp,
+ uint64_t *metadata_pinnedp, WT_SESSION_IMPL **oldest_sessionp)
{
- WT_CONNECTION_IMPL *conn;
- WT_SESSION_IMPL *oldest_session;
- WT_TXN_GLOBAL *txn_global;
- WT_TXN_STATE *s;
- uint64_t id, last_running, metadata_pinned, oldest_id, prev_oldest_id;
- uint32_t i, session_cnt;
-
- conn = S2C(session);
- txn_global = &conn->txn_global;
- oldest_session = NULL;
-
- /* The oldest ID cannot change while we are holding the scan lock. */
- prev_oldest_id = txn_global->oldest_id;
- last_running = oldest_id = txn_global->current;
- if ((metadata_pinned = txn_global->checkpoint_state.id) == WT_TXN_NONE)
- metadata_pinned = oldest_id;
-
- /* Walk the array of concurrent transactions. */
- WT_ORDERED_READ(session_cnt, conn->session_cnt);
- for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) {
- /* Update the last running transaction ID. */
- while ((id = s->id) != WT_TXN_NONE &&
- WT_TXNID_LE(prev_oldest_id, id) &&
- WT_TXNID_LT(id, last_running)) {
- /*
- * If the transaction is still allocating its ID, then
- * we spin here until it gets its valid ID.
- */
- WT_READ_BARRIER();
- if (!s->is_allocating) {
- /*
- * There is still a chance that fetched ID is
- * not valid after ID allocation, so we check
- * again here. The read of transaction ID
- * should be carefully ordered: we want to
- * re-read ID from transaction state after this
- * transaction completes ID allocation.
- */
- WT_READ_BARRIER();
- if (id == s->id) {
- last_running = id;
- break;
- }
- }
- WT_PAUSE();
- }
-
- /* Update the metadata pinned ID. */
- if ((id = s->metadata_pinned) != WT_TXN_NONE &&
- WT_TXNID_LT(id, metadata_pinned))
- metadata_pinned = id;
-
- /*
- * !!!
- * Note: Don't ignore pinned ID values older than the previous
- * oldest ID. Read-uncommitted operations publish pinned ID
- * values without acquiring the scan lock to protect the global
- * table. See the comment in __wt_txn_cursor_op for more
- * details.
- */
- if ((id = s->pinned_id) != WT_TXN_NONE &&
- WT_TXNID_LT(id, oldest_id)) {
- oldest_id = id;
- oldest_session = &conn->sessions[i];
- }
- }
-
- if (WT_TXNID_LT(last_running, oldest_id))
- oldest_id = last_running;
-
- /* The oldest ID can't move past any named snapshots. */
- if ((id = txn_global->nsnap_oldest_id) != WT_TXN_NONE &&
- WT_TXNID_LT(id, oldest_id))
- oldest_id = id;
-
- /* The metadata pinned ID can't move past the oldest ID. */
- if (WT_TXNID_LT(oldest_id, metadata_pinned))
- metadata_pinned = oldest_id;
-
- *last_runningp = last_running;
- *metadata_pinnedp = metadata_pinned;
- *oldest_idp = oldest_id;
- *oldest_sessionp = oldest_session;
+ WT_CONNECTION_IMPL *conn;
+ WT_SESSION_IMPL *oldest_session;
+ WT_TXN_GLOBAL *txn_global;
+ WT_TXN_STATE *s;
+ uint64_t id, last_running, metadata_pinned, oldest_id, prev_oldest_id;
+ uint32_t i, session_cnt;
+
+ conn = S2C(session);
+ txn_global = &conn->txn_global;
+ oldest_session = NULL;
+
+ /* The oldest ID cannot change while we are holding the scan lock. */
+ prev_oldest_id = txn_global->oldest_id;
+ last_running = oldest_id = txn_global->current;
+ if ((metadata_pinned = txn_global->checkpoint_state.id) == WT_TXN_NONE)
+ metadata_pinned = oldest_id;
+
+ /* Walk the array of concurrent transactions. */
+ WT_ORDERED_READ(session_cnt, conn->session_cnt);
+ for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) {
+ /* Update the last running transaction ID. */
+ while ((id = s->id) != WT_TXN_NONE && WT_TXNID_LE(prev_oldest_id, id) &&
+ WT_TXNID_LT(id, last_running)) {
+ /*
+ * If the transaction is still allocating its ID, then we spin here until it gets its
+ * valid ID.
+ */
+ WT_READ_BARRIER();
+ if (!s->is_allocating) {
+ /*
+                 * There is still a chance that the fetched ID is not valid after ID allocation, so we
+ * check again here. The read of transaction ID should be carefully ordered: we want
+ * to re-read ID from transaction state after this transaction completes ID
+ * allocation.
+ */
+ WT_READ_BARRIER();
+ if (id == s->id) {
+ last_running = id;
+ break;
+ }
+ }
+ WT_PAUSE();
+ }
+
+ /* Update the metadata pinned ID. */
+ if ((id = s->metadata_pinned) != WT_TXN_NONE && WT_TXNID_LT(id, metadata_pinned))
+ metadata_pinned = id;
+
+ /*
+ * !!!
+ * Note: Don't ignore pinned ID values older than the previous
+ * oldest ID. Read-uncommitted operations publish pinned ID
+ * values without acquiring the scan lock to protect the global
+ * table. See the comment in __wt_txn_cursor_op for more
+ * details.
+ */
+ if ((id = s->pinned_id) != WT_TXN_NONE && WT_TXNID_LT(id, oldest_id)) {
+ oldest_id = id;
+ oldest_session = &conn->sessions[i];
+ }
+ }
+
+ if (WT_TXNID_LT(last_running, oldest_id))
+ oldest_id = last_running;
+
+ /* The oldest ID can't move past any named snapshots. */
+ if ((id = txn_global->nsnap_oldest_id) != WT_TXN_NONE && WT_TXNID_LT(id, oldest_id))
+ oldest_id = id;
+
+ /* The metadata pinned ID can't move past the oldest ID. */
+ if (WT_TXNID_LT(oldest_id, metadata_pinned))
+ metadata_pinned = oldest_id;
+
+ *last_runningp = last_running;
+ *metadata_pinnedp = metadata_pinned;
+ *oldest_idp = oldest_id;
+ *oldest_sessionp = oldest_session;
}
/*
* __wt_txn_update_oldest --
- * Sweep the running transactions to update the oldest ID required.
+ * Sweep the running transactions to update the oldest ID required.
*/
int
__wt_txn_update_oldest(WT_SESSION_IMPL *session, uint32_t flags)
{
- WT_CONNECTION_IMPL *conn;
- WT_DECL_RET;
- WT_SESSION_IMPL *oldest_session;
- WT_TXN_GLOBAL *txn_global;
- uint64_t current_id, last_running, metadata_pinned, oldest_id;
- uint64_t prev_last_running, prev_metadata_pinned, prev_oldest_id;
- bool strict, wait;
-
- conn = S2C(session);
- txn_global = &conn->txn_global;
- strict = LF_ISSET(WT_TXN_OLDEST_STRICT);
- wait = LF_ISSET(WT_TXN_OLDEST_WAIT);
-
- current_id = last_running = metadata_pinned = txn_global->current;
- prev_last_running = txn_global->last_running;
- prev_metadata_pinned = txn_global->metadata_pinned;
- prev_oldest_id = txn_global->oldest_id;
-
- /* Try to move the pinned timestamp forward. */
- if (strict)
- WT_RET(__wt_txn_update_pinned_timestamp(session, false));
-
- /*
- * For pure read-only workloads, or if the update isn't forced and the
- * oldest ID isn't too far behind, avoid scanning.
- */
- if ((prev_oldest_id == current_id &&
- prev_metadata_pinned == current_id) ||
- (!strict && WT_TXNID_LT(current_id, prev_oldest_id + 100)))
- return (0);
-
- /* First do a read-only scan. */
- if (wait)
- __wt_readlock(session, &txn_global->rwlock);
- else if ((ret =
- __wt_try_readlock(session, &txn_global->rwlock)) != 0)
- return (ret == EBUSY ? 0 : ret);
- __txn_oldest_scan(session,
- &oldest_id, &last_running, &metadata_pinned, &oldest_session);
- __wt_readunlock(session, &txn_global->rwlock);
-
- /*
- * If the state hasn't changed (or hasn't moved far enough for
- * non-forced updates), give up.
- */
- if ((oldest_id == prev_oldest_id ||
- (!strict && WT_TXNID_LT(oldest_id, prev_oldest_id + 100))) &&
- ((last_running == prev_last_running) ||
- (!strict && WT_TXNID_LT(last_running, prev_last_running + 100))) &&
- metadata_pinned == prev_metadata_pinned)
- return (0);
-
- /* It looks like an update is necessary, wait for exclusive access. */
- if (wait)
- __wt_writelock(session, &txn_global->rwlock);
- else if ((ret =
- __wt_try_writelock(session, &txn_global->rwlock)) != 0)
- return (ret == EBUSY ? 0 : ret);
-
- /*
- * If the oldest ID has been updated while we waited, don't bother
- * scanning.
- */
- if (WT_TXNID_LE(oldest_id, txn_global->oldest_id) &&
- WT_TXNID_LE(last_running, txn_global->last_running) &&
- WT_TXNID_LE(metadata_pinned, txn_global->metadata_pinned))
- goto done;
-
- /*
- * Re-scan now that we have exclusive access. This is necessary because
- * threads get transaction snapshots with read locks, and we have to be
- * sure that there isn't a thread that has got a snapshot locally but
- * not yet published its snap_min.
- */
- __txn_oldest_scan(session,
- &oldest_id, &last_running, &metadata_pinned, &oldest_session);
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *oldest_session;
+ WT_TXN_GLOBAL *txn_global;
+ uint64_t current_id, last_running, metadata_pinned, oldest_id;
+ uint64_t prev_last_running, prev_metadata_pinned, prev_oldest_id;
+ bool strict, wait;
+
+ conn = S2C(session);
+ txn_global = &conn->txn_global;
+ strict = LF_ISSET(WT_TXN_OLDEST_STRICT);
+ wait = LF_ISSET(WT_TXN_OLDEST_WAIT);
+
+ current_id = last_running = metadata_pinned = txn_global->current;
+ prev_last_running = txn_global->last_running;
+ prev_metadata_pinned = txn_global->metadata_pinned;
+ prev_oldest_id = txn_global->oldest_id;
+
+ /* Try to move the pinned timestamp forward. */
+ if (strict)
+ WT_RET(__wt_txn_update_pinned_timestamp(session, false));
+
+ /*
+ * For pure read-only workloads, or if the update isn't forced and the oldest ID isn't too far
+ * behind, avoid scanning.
+ */
+ if ((prev_oldest_id == current_id && prev_metadata_pinned == current_id) ||
+ (!strict && WT_TXNID_LT(current_id, prev_oldest_id + 100)))
+ return (0);
+
+ /* First do a read-only scan. */
+ if (wait)
+ __wt_readlock(session, &txn_global->rwlock);
+ else if ((ret = __wt_try_readlock(session, &txn_global->rwlock)) != 0)
+ return (ret == EBUSY ? 0 : ret);
+ __txn_oldest_scan(session, &oldest_id, &last_running, &metadata_pinned, &oldest_session);
+ __wt_readunlock(session, &txn_global->rwlock);
+
+ /*
+ * If the state hasn't changed (or hasn't moved far enough for non-forced updates), give up.
+ */
+ if ((oldest_id == prev_oldest_id ||
+ (!strict && WT_TXNID_LT(oldest_id, prev_oldest_id + 100))) &&
+ ((last_running == prev_last_running) ||
+ (!strict && WT_TXNID_LT(last_running, prev_last_running + 100))) &&
+ metadata_pinned == prev_metadata_pinned)
+ return (0);
+
+ /* It looks like an update is necessary, wait for exclusive access. */
+ if (wait)
+ __wt_writelock(session, &txn_global->rwlock);
+ else if ((ret = __wt_try_writelock(session, &txn_global->rwlock)) != 0)
+ return (ret == EBUSY ? 0 : ret);
+
+ /*
+ * If the oldest ID has been updated while we waited, don't bother scanning.
+ */
+ if (WT_TXNID_LE(oldest_id, txn_global->oldest_id) &&
+ WT_TXNID_LE(last_running, txn_global->last_running) &&
+ WT_TXNID_LE(metadata_pinned, txn_global->metadata_pinned))
+ goto done;
+
+ /*
+ * Re-scan now that we have exclusive access. This is necessary because threads get transaction
+ * snapshots with read locks, and we have to be sure that there isn't a thread that has got a
+ * snapshot locally but not yet published its snap_min.
+ */
+ __txn_oldest_scan(session, &oldest_id, &last_running, &metadata_pinned, &oldest_session);
#ifdef HAVE_DIAGNOSTIC
- {
- /*
- * Make sure the ID doesn't move past any named snapshots.
- *
- * Don't include the read/assignment in the assert statement. Coverity
- * complains if there are assignments only done in diagnostic builds,
- * and when the read is from a volatile.
- */
- uint64_t id = txn_global->nsnap_oldest_id;
- WT_ASSERT(session,
- id == WT_TXN_NONE || !WT_TXNID_LT(id, oldest_id));
- }
+ {
+ /*
+ * Make sure the ID doesn't move past any named snapshots.
+ *
+ * Don't include the read/assignment in the assert statement. Coverity
+ * complains if there are assignments only done in diagnostic builds,
+ * and when the read is from a volatile.
+ */
+ uint64_t id = txn_global->nsnap_oldest_id;
+ WT_ASSERT(session, id == WT_TXN_NONE || !WT_TXNID_LT(id, oldest_id));
+ }
#endif
- /* Update the public IDs. */
- if (WT_TXNID_LT(txn_global->metadata_pinned, metadata_pinned))
- txn_global->metadata_pinned = metadata_pinned;
- if (WT_TXNID_LT(txn_global->oldest_id, oldest_id))
- txn_global->oldest_id = oldest_id;
- if (WT_TXNID_LT(txn_global->last_running, last_running)) {
- txn_global->last_running = last_running;
-
- /* Output a verbose message about long-running transactions,
- * but only when some progress is being made. */
- if (WT_VERBOSE_ISSET(session, WT_VERB_TRANSACTION) &&
- current_id - oldest_id > 10000 && oldest_session != NULL) {
- __wt_verbose(session, WT_VERB_TRANSACTION,
- "old snapshot %" PRIu64
- " pinned in session %" PRIu32 " [%s]"
- " with snap_min %" PRIu64,
- oldest_id, oldest_session->id,
- oldest_session->lastop,
- oldest_session->txn.snap_min);
- }
- }
-
-done: __wt_writeunlock(session, &txn_global->rwlock);
- return (ret);
+ /* Update the public IDs. */
+ if (WT_TXNID_LT(txn_global->metadata_pinned, metadata_pinned))
+ txn_global->metadata_pinned = metadata_pinned;
+ if (WT_TXNID_LT(txn_global->oldest_id, oldest_id))
+ txn_global->oldest_id = oldest_id;
+ if (WT_TXNID_LT(txn_global->last_running, last_running)) {
+ txn_global->last_running = last_running;
+
+ /* Output a verbose message about long-running transactions,
+ * but only when some progress is being made. */
+ if (WT_VERBOSE_ISSET(session, WT_VERB_TRANSACTION) && current_id - oldest_id > 10000 &&
+ oldest_session != NULL) {
+ __wt_verbose(session, WT_VERB_TRANSACTION,
+ "old snapshot %" PRIu64 " pinned in session %" PRIu32
+ " [%s]"
+ " with snap_min %" PRIu64,
+ oldest_id, oldest_session->id, oldest_session->lastop, oldest_session->txn.snap_min);
+ }
+ }
+
+done:
+ __wt_writeunlock(session, &txn_global->rwlock);
+ return (ret);
}
/*
* __wt_txn_config --
- * Configure a transaction.
+ * Configure a transaction.
*/
int
__wt_txn_config(WT_SESSION_IMPL *session, const char *cfg[])
{
- WT_CONFIG_ITEM cval;
- WT_TXN *txn;
- wt_timestamp_t read_ts;
-
- txn = &session->txn;
-
- WT_RET(__wt_config_gets_def(session, cfg, "isolation", 0, &cval));
- if (cval.len != 0)
- txn->isolation =
- WT_STRING_MATCH("snapshot", cval.str, cval.len) ?
- WT_ISO_SNAPSHOT :
- WT_STRING_MATCH("read-committed", cval.str, cval.len) ?
- WT_ISO_READ_COMMITTED : WT_ISO_READ_UNCOMMITTED;
-
- /*
- * The default sync setting is inherited from the connection, but can
- * be overridden by an explicit "sync" setting for this transaction.
- *
- * We want to distinguish between inheriting implicitly and explicitly.
- */
- F_CLR(txn, WT_TXN_SYNC_SET);
- WT_RET(__wt_config_gets_def(
- session, cfg, "sync", (int)UINT_MAX, &cval));
- if (cval.val == 0 || cval.val == 1)
- /*
- * This is an explicit setting of sync. Set the flag so
- * that we know not to overwrite it in commit_transaction.
- */
- F_SET(txn, WT_TXN_SYNC_SET);
-
- /*
- * If sync is turned off explicitly, clear the transaction's sync field.
- */
- if (cval.val == 0)
- txn->txn_logsync = 0;
-
- WT_RET(__wt_config_gets_def(session, cfg, "snapshot", 0, &cval));
- if (cval.len > 0)
- /*
- * The layering here isn't ideal - the named snapshot get
- * function does both validation and setup. Otherwise we'd
- * need to walk the list of named snapshots twice during
- * transaction open.
- */
- WT_RET(__wt_txn_named_snapshot_get(session, &cval));
-
- /* Check if prepared updates should be ignored during reads. */
- WT_RET(__wt_config_gets_def(session, cfg, "ignore_prepare", 0, &cval));
- if (cval.len > 0 &&
- WT_STRING_MATCH("force", cval.str, cval.len))
- F_SET(txn, WT_TXN_IGNORE_PREPARE);
- else if (cval.val)
- F_SET(txn, WT_TXN_IGNORE_PREPARE | WT_TXN_READONLY);
-
- /*
- * Check if the prepare timestamp and the commit timestamp of a
- * prepared transaction need to be rounded up.
- */
- WT_RET(__wt_config_gets_def(
- session, cfg, "roundup_timestamps.prepared", 0, &cval));
- if (cval.val)
- F_SET(txn, WT_TXN_TS_ROUND_PREPARED);
-
- /* Check if read timestamp needs to be rounded up. */
- WT_RET(__wt_config_gets_def(
- session, cfg, "roundup_timestamps.read", 0, &cval));
- if (cval.val)
- F_SET(txn, WT_TXN_TS_ROUND_READ);
-
- WT_RET(__wt_config_gets_def(session, cfg, "read_timestamp", 0, &cval));
- if (cval.len != 0) {
- WT_RET(__wt_txn_parse_timestamp(
- session, "read", &read_ts, &cval));
- WT_RET(__wt_txn_set_read_timestamp(session, read_ts));
- }
-
- return (0);
+ WT_CONFIG_ITEM cval;
+ WT_TXN *txn;
+ wt_timestamp_t read_ts;
+
+ txn = &session->txn;
+
+ WT_RET(__wt_config_gets_def(session, cfg, "isolation", 0, &cval));
+ if (cval.len != 0)
+ txn->isolation = WT_STRING_MATCH("snapshot", cval.str, cval.len) ?
+ WT_ISO_SNAPSHOT :
+ WT_STRING_MATCH("read-committed", cval.str, cval.len) ? WT_ISO_READ_COMMITTED :
+ WT_ISO_READ_UNCOMMITTED;
+
+ /*
+ * The default sync setting is inherited from the connection, but can
+ * be overridden by an explicit "sync" setting for this transaction.
+ *
+ * We want to distinguish between inheriting implicitly and explicitly.
+ */
+ F_CLR(txn, WT_TXN_SYNC_SET);
+ WT_RET(__wt_config_gets_def(session, cfg, "sync", (int)UINT_MAX, &cval));
+ if (cval.val == 0 || cval.val == 1)
+ /*
+ * This is an explicit setting of sync. Set the flag so that we know not to overwrite it in
+ * commit_transaction.
+ */
+ F_SET(txn, WT_TXN_SYNC_SET);
+
+ /*
+ * If sync is turned off explicitly, clear the transaction's sync field.
+ */
+ if (cval.val == 0)
+ txn->txn_logsync = 0;
+
+ WT_RET(__wt_config_gets_def(session, cfg, "snapshot", 0, &cval));
+ if (cval.len > 0)
+ /*
+ * The layering here isn't ideal - the named snapshot get function does both validation and
+ * setup. Otherwise we'd need to walk the list of named snapshots twice during transaction
+ * open.
+ */
+ WT_RET(__wt_txn_named_snapshot_get(session, &cval));
+
+ /* Check if prepared updates should be ignored during reads. */
+ WT_RET(__wt_config_gets_def(session, cfg, "ignore_prepare", 0, &cval));
+ if (cval.len > 0 && WT_STRING_MATCH("force", cval.str, cval.len))
+ F_SET(txn, WT_TXN_IGNORE_PREPARE);
+ else if (cval.val)
+ F_SET(txn, WT_TXN_IGNORE_PREPARE | WT_TXN_READONLY);
+
+ /*
+ * Check if the prepare timestamp and the commit timestamp of a prepared transaction need to be
+ * rounded up.
+ */
+ WT_RET(__wt_config_gets_def(session, cfg, "roundup_timestamps.prepared", 0, &cval));
+ if (cval.val)
+ F_SET(txn, WT_TXN_TS_ROUND_PREPARED);
+
+ /* Check if read timestamp needs to be rounded up. */
+ WT_RET(__wt_config_gets_def(session, cfg, "roundup_timestamps.read", 0, &cval));
+ if (cval.val)
+ F_SET(txn, WT_TXN_TS_ROUND_READ);
+
+ WT_RET(__wt_config_gets_def(session, cfg, "read_timestamp", 0, &cval));
+ if (cval.len != 0) {
+ WT_RET(__wt_txn_parse_timestamp(session, "read", &read_ts, &cval));
+ WT_RET(__wt_txn_set_read_timestamp(session, read_ts));
+ }
+
+ return (0);
}
/*
* __wt_txn_reconfigure --
- * WT_SESSION::reconfigure for transactions.
+ * WT_SESSION::reconfigure for transactions.
*/
int
__wt_txn_reconfigure(WT_SESSION_IMPL *session, const char *config)
{
-    WT_CONFIG_ITEM cval;
-    WT_DECL_RET;
-    WT_TXN *txn;
-
-    txn = &session->txn;
-
-    ret = __wt_config_getones(session, config, "isolation", &cval);
-    if (ret == 0 && cval.len != 0) {
-        session->isolation = txn->isolation =
-            WT_STRING_MATCH("snapshot", cval.str, cval.len) ?
-            WT_ISO_SNAPSHOT :
-            WT_STRING_MATCH("read-uncommitted", cval.str, cval.len) ?
-            WT_ISO_READ_UNCOMMITTED : WT_ISO_READ_COMMITTED;
-    }
-    WT_RET_NOTFOUND_OK(ret);
-
-    return (0);
+    WT_CONFIG_ITEM cval;
+    WT_DECL_RET;
+    WT_TXN *txn;
+
+    txn = &session->txn;
+
+    /* Only the "isolation" setting is handled here; a missing key is fine (WT_RET_NOTFOUND_OK). */
+    ret = __wt_config_getones(session, config, "isolation", &cval);
+    if (cval.len != 0 && ret == 0) {
+        session->isolation = txn->isolation = WT_STRING_MATCH("snapshot", cval.str, cval.len) ?
+          WT_ISO_SNAPSHOT :
+          WT_STRING_MATCH("read-uncommitted", cval.str, cval.len) ? WT_ISO_READ_UNCOMMITTED :
+                                                                    WT_ISO_READ_COMMITTED;
+    }
+    WT_RET_NOTFOUND_OK(ret);
+
+    return (0);
}
/*
* __wt_txn_release --
- * Release the resources associated with the current transaction.
+ * Release the resources associated with the current transaction.
*/
void
__wt_txn_release(WT_SESSION_IMPL *session)
{
-    WT_TXN *txn;
-    WT_TXN_GLOBAL *txn_global;
-
-    txn = &session->txn;
-    txn_global = &S2C(session)->txn_global;
-
-    WT_ASSERT(session, txn->mod_count == 0);
-    txn->notify = NULL;
-
-    /* Clear the transaction's ID from the global table. */
-    if (WT_SESSION_IS_CHECKPOINT(session)) {
-        WT_ASSERT(session,
-            WT_SESSION_TXN_STATE(session)->id == WT_TXN_NONE);
-        txn->id = txn_global->checkpoint_state.id = WT_TXN_NONE;
-
-        /*
-         * Be extra careful to cleanup everything for checkpoints: once
-         * the global checkpoint ID is cleared, we can no longer tell
-         * if this session is doing a checkpoint.
-         */
-        txn_global->checkpoint_id = 0;
-    } else if (F_ISSET(txn, WT_TXN_HAS_ID)) {
-        /*
-         * If transaction is prepared, this would have been done in
-         * prepare.
-         */
-        if (!F_ISSET(txn, WT_TXN_PREPARE))
-            __txn_remove_from_global_table(session);
-        txn->id = WT_TXN_NONE;
-    }
-
-    __wt_txn_clear_durable_timestamp(session);
-
-    /* Free the scratch buffer allocated for logging. */
-    __wt_logrec_free(session, &txn->logrec);
-
-    /* Discard any memory from the session's stash that we can. */
-    WT_ASSERT(session, __wt_session_gen(session, WT_GEN_SPLIT) == 0);
-    __wt_stash_discard(session);
-
-    /*
-     * Reset the transaction state to not running and release the snapshot.
-     */
-    __wt_txn_release_snapshot(session);
-    txn->isolation = session->isolation;
-
-    txn->rollback_reason = NULL;
-
-    /*
-     * Ensure the transaction flags are cleared on exit
-     *
-     * Purposely do NOT clear the commit and durable timestamps on release.
-     * Other readers may still find these transactions in the durable queue
-     * and will need to see those timestamps.
-     */
-    txn->flags = 0;
-    txn->prepare_timestamp = WT_TS_NONE;
+    WT_TXN *txn;
+    WT_TXN_GLOBAL *txn_global;
+
+    txn = &session->txn;
+    txn_global = &S2C(session)->txn_global;
+
+    WT_ASSERT(session, txn->mod_count == 0);
+    txn->notify = NULL;
+
+    /* Clear the transaction's ID from the global table. */
+    if (WT_SESSION_IS_CHECKPOINT(session)) {
+        WT_ASSERT(session, WT_SESSION_TXN_STATE(session)->id == WT_TXN_NONE);
+        txn->id = txn_global->checkpoint_state.id = WT_TXN_NONE;
+
+        /*
+         * Be extra careful to cleanup everything for checkpoints: once the global checkpoint ID is
+         * cleared, we can no longer tell if this session is doing a checkpoint.
+         */
+        txn_global->checkpoint_id = 0;
+    } else if (F_ISSET(txn, WT_TXN_HAS_ID)) {
+        /*
+         * If transaction is prepared, this would have been done in prepare.
+         */
+        if (!F_ISSET(txn, WT_TXN_PREPARE))
+            __txn_remove_from_global_table(session);
+        /* Prepare already cleared the ID from the global table; only verify that here. */
+        else
+            WT_ASSERT(session, WT_SESSION_TXN_STATE(session)->id == WT_TXN_NONE);
+        txn->id = WT_TXN_NONE;
+    }
+
+    __wt_txn_clear_durable_timestamp(session);
+
+    /* Free the scratch buffer allocated for logging. */
+    __wt_logrec_free(session, &txn->logrec);
+
+    /* Discard any memory from the session's stash that we can. */
+    WT_ASSERT(session, __wt_session_gen(session, WT_GEN_SPLIT) == 0);
+    __wt_stash_discard(session);
+
+    /*
+     * Reset the transaction state to not running and release the snapshot.
+     */
+    __wt_txn_release_snapshot(session);
+    txn->isolation = session->isolation;
+
+    txn->rollback_reason = NULL;
+
+    /*
+     * Ensure the transaction flags are cleared on exit
+     *
+     * Purposely do NOT clear the commit and durable timestamps on release.
+     * Other readers may still find these transactions in the durable queue
+     * and will need to see those timestamps.
+     */
+    txn->flags = 0;
+    txn->prepare_timestamp = WT_TS_NONE;
}
/*
* __txn_commit_timestamps_assert --
- * Validate that timestamps provided to commit are legal.
+ * Validate that timestamps provided to commit are legal.
*/
static inline int
__txn_commit_timestamps_assert(WT_SESSION_IMPL *session)
{
-    WT_CURSOR *cursor;
-    WT_DECL_RET;
-    WT_TXN *txn;
-    WT_TXN_OP *op;
-    WT_UPDATE *upd;
-    wt_timestamp_t durable_op_timestamp, op_timestamp, prev_op_timestamp;
-    u_int i;
-    const char *open_cursor_cfg[] = {
-        WT_CONFIG_BASE(session, WT_SESSION_open_cursor), NULL };
-    bool op_zero_ts, upd_zero_ts;
-
-    txn = &session->txn;
-    cursor = NULL;
-    durable_op_timestamp = prev_op_timestamp = WT_TS_NONE;
-
-    /*
-     * Debugging checks on timestamps, if user requested them.
-     */
-    if (F_ISSET(txn, WT_TXN_TS_COMMIT_ALWAYS) &&
-        !F_ISSET(txn, WT_TXN_HAS_TS_COMMIT) &&
-        txn->mod_count != 0)
-        WT_RET_MSG(session, EINVAL, "commit_timestamp required and "
-            "none set on this transaction");
-    if (F_ISSET(txn, WT_TXN_TS_COMMIT_NEVER) &&
-        F_ISSET(txn, WT_TXN_HAS_TS_COMMIT) &&
-        txn->mod_count != 0)
-        WT_RET_MSG(session, EINVAL, "no commit_timestamp required and "
-            "timestamp set on this transaction");
-    if (F_ISSET(txn, WT_TXN_TS_DURABLE_ALWAYS) &&
-        !F_ISSET(txn, WT_TXN_HAS_TS_DURABLE) &&
-        txn->mod_count != 0)
-        WT_RET_MSG(session, EINVAL, "durable_timestamp required and "
-            "none set on this transaction");
-    if (F_ISSET(txn, WT_TXN_TS_DURABLE_NEVER) &&
-        F_ISSET(txn, WT_TXN_HAS_TS_DURABLE) &&
-        txn->mod_count != 0)
-        WT_RET_MSG(session, EINVAL, "no durable_timestamp required and "
-            "durable timestamp set on this transaction");
-
-    /*
-     * If we're not doing any key consistency checking, we're done.
-     */
-    if (!F_ISSET(txn, WT_TXN_TS_COMMIT_KEYS | WT_TXN_TS_DURABLE_KEYS))
-        return (0);
-
-    /*
-     * Error on any valid update structures for the same key that
-     * are at a later timestamp or use timestamps inconsistently.
-     */
-    for (i = 0, op = txn->mod; i < txn->mod_count; i++, op++)
-        if (op->type == WT_TXN_OP_BASIC_COL ||
-            op->type == WT_TXN_OP_BASIC_ROW) {
-            /*
-             * Search for prepared updates, so that they will be
-             * restored, if moved to lookaside.
-             */
-            if (F_ISSET(txn, WT_TXN_PREPARE)) {
-                WT_RET(__wt_open_cursor(session,
-                    op->btree->dhandle->name, NULL,
-                    open_cursor_cfg, &cursor));
-                F_CLR(txn, WT_TXN_PREPARE);
-                if (op->type == WT_TXN_OP_BASIC_ROW)
-                    __wt_cursor_set_raw_key(
-                        cursor, &op->u.op_row.key);
-                else
-                    ((WT_CURSOR_BTREE*)cursor)->iface.recno
-                        = op->u.op_col.recno;
-                F_SET(txn, WT_TXN_PREPARE);
-                WT_WITH_BTREE(session, op->btree,
-                    ret = __wt_btcur_search_uncommitted(
-                    (WT_CURSOR_BTREE *)cursor, &upd));
-                if (ret != 0)
-                    WT_RET_MSG(session, EINVAL,
-                        "prepared update restore failed");
-            } else
-                upd = op->u.op_upd;
-
-            WT_ASSERT(session, upd != NULL);
-            op_timestamp = upd->start_ts;
-
-            /*
-             * Skip over any aborted update structures, internally
-             * created update structures or ones from our own
-             * transaction.
-             */
-            while (upd != NULL && (upd->txnid == WT_TXN_ABORTED ||
-                upd->txnid == WT_TXN_NONE || upd->txnid == txn->id))
-                upd = upd->next;
-
-            /*
-             * Check the timestamp on this update with the
-             * first valid update in the chain. They're in
-             * most recent order.
-             */
-            if (upd != NULL) {
-                prev_op_timestamp = upd->start_ts;
-                durable_op_timestamp = upd->durable_ts;
-            }
-
-            /*
-             * We no longer need to access the update structure so
-             * it's safe to release our reference to the page.
-             */
-            if (cursor != NULL) {
-                WT_ASSERT(
-                    session, F_ISSET(txn, WT_TXN_PREPARE));
-                WT_RET(cursor->close(cursor));
-                cursor = NULL;
-            }
-
-            if (upd == NULL)
-                continue;
-            /*
-             * Check for consistent per-key timestamp usage.
-             * If timestamps are or are not used originally then
-             * they should be used the same way always. For this
-             * transaction, timestamps are in use anytime the
-             * commit timestamp is set.
-             * Check timestamps are used in order.
-             */
-            op_zero_ts = !F_ISSET(txn, WT_TXN_HAS_TS_COMMIT);
-            upd_zero_ts = prev_op_timestamp == WT_TS_NONE;
-            if (op_zero_ts != upd_zero_ts)
-                WT_RET_MSG(session, EINVAL,
-                    "per-key timestamps used inconsistently");
-            /*
-             * If we aren't using timestamps for this transaction
-             * then we are done checking. Don't check the timestamp
-             * because the one in the transaction is not cleared.
-             */
-            if (op_zero_ts)
-                continue;
-
-            /*
-             * Only if the update structure doesn't have a timestamp
-             * then use the one in the transaction structure.
-             */
-            if (op_timestamp == WT_TS_NONE)
-                op_timestamp = txn->commit_timestamp;
-            if (F_ISSET(txn, WT_TXN_TS_COMMIT_KEYS) &&
-                op_timestamp < prev_op_timestamp)
-                WT_RET_MSG(session, EINVAL,
-                    "out of order commit timestamps");
-            if (F_ISSET(txn, WT_TXN_TS_DURABLE_KEYS) &&
-                txn->durable_timestamp < durable_op_timestamp)
-                WT_RET_MSG(session, EINVAL,
-                    "out of order durable timestamps");
-        }
-    return (0);
+    WT_CURSOR *cursor;
+    WT_DECL_RET;
+    WT_TXN *txn;
+    WT_TXN_OP *op;
+    WT_UPDATE *upd;
+    wt_timestamp_t durable_op_timestamp, op_timestamp, prev_op_timestamp;
+    u_int i;
+    /* Cursor configuration used only when re-locating prepared updates in the loop below. */
+    const char *open_cursor_cfg[] = {WT_CONFIG_BASE(session, WT_SESSION_open_cursor), NULL};
+    bool op_zero_ts, upd_zero_ts;
+
+    txn = &session->txn;
+    cursor = NULL;
+    durable_op_timestamp = prev_op_timestamp = WT_TS_NONE;
+
+    /*
+     * Debugging checks on timestamps, if user requested them.
+     */
+    if (F_ISSET(txn, WT_TXN_TS_COMMIT_ALWAYS) && !F_ISSET(txn, WT_TXN_HAS_TS_COMMIT) &&
+      txn->mod_count != 0)
+        WT_RET_MSG(session, EINVAL,
+          "commit_timestamp required and "
+          "none set on this transaction");
+    if (F_ISSET(txn, WT_TXN_TS_COMMIT_NEVER) && F_ISSET(txn, WT_TXN_HAS_TS_COMMIT) &&
+      txn->mod_count != 0)
+        WT_RET_MSG(session, EINVAL,
+          "no commit_timestamp required and "
+          "timestamp set on this transaction");
+    if (F_ISSET(txn, WT_TXN_TS_DURABLE_ALWAYS) && !F_ISSET(txn, WT_TXN_HAS_TS_DURABLE) &&
+      txn->mod_count != 0)
+        WT_RET_MSG(session, EINVAL,
+          "durable_timestamp required and "
+          "none set on this transaction");
+    if (F_ISSET(txn, WT_TXN_TS_DURABLE_NEVER) && F_ISSET(txn, WT_TXN_HAS_TS_DURABLE) &&
+      txn->mod_count != 0)
+        WT_RET_MSG(session, EINVAL,
+          "no durable_timestamp required and "
+          "durable timestamp set on this transaction");
+
+    /*
+     * If we're not doing any key consistency checking, we're done.
+     */
+    if (!F_ISSET(txn, WT_TXN_TS_COMMIT_KEYS | WT_TXN_TS_DURABLE_KEYS))
+        return (0);
+
+    /*
+     * Error on any valid update structures for the same key that are at a later timestamp or use
+     * timestamps inconsistently.
+     */
+    for (i = 0, op = txn->mod; i < txn->mod_count; i++, op++)
+        if (op->type == WT_TXN_OP_BASIC_COL || op->type == WT_TXN_OP_BASIC_ROW) {
+            /*
+             * Search for prepared updates, so that they will be restored, if moved to lookaside.
+             */
+            if (F_ISSET(txn, WT_TXN_PREPARE)) {
+                WT_RET(__wt_open_cursor(
+                  session, op->btree->dhandle->name, NULL, open_cursor_cfg, &cursor));
+                F_CLR(txn, WT_TXN_PREPARE);
+                if (op->type == WT_TXN_OP_BASIC_ROW)
+                    __wt_cursor_set_raw_key(cursor, &op->u.op_row.key);
+                else
+                    ((WT_CURSOR_BTREE *)cursor)->iface.recno = op->u.op_col.recno;
+                F_SET(txn, WT_TXN_PREPARE);
+                WT_WITH_BTREE(session, op->btree,
+                  ret = __wt_btcur_search_uncommitted((WT_CURSOR_BTREE *)cursor, &upd));
+                if (ret != 0)
+                    WT_RET_MSG(session, EINVAL, "prepared update restore failed");
+            } else
+                upd = op->u.op_upd;
+
+            WT_ASSERT(session, upd != NULL);
+            /* Taken from the head of the chain before our own updates are skipped below. */
+            op_timestamp = upd->start_ts;
+
+            /*
+             * Skip over any aborted update structures, internally created update structures or ones
+             * from our own transaction.
+             */
+            while (upd != NULL &&
+              (upd->txnid == WT_TXN_ABORTED || upd->txnid == WT_TXN_NONE || upd->txnid == txn->id))
+                upd = upd->next;
+
+            /*
+             * Check the timestamp on this update with the first valid update in the chain. They're
+             * in most recent order.
+             */
+            if (upd != NULL) {
+                prev_op_timestamp = upd->start_ts;
+                durable_op_timestamp = upd->durable_ts;
+            }
+
+            /*
+             * We no longer need to access the update structure so it's safe to release our
+             * reference to the page.
+             */
+            if (cursor != NULL) {
+                WT_ASSERT(session, F_ISSET(txn, WT_TXN_PREPARE));
+                WT_RET(cursor->close(cursor));
+                cursor = NULL;
+            }
+
+            if (upd == NULL)
+                continue;
+            /*
+             * Check for consistent per-key timestamp usage. If timestamps are or are not used
+             * originally then they should be used the same way always. For this transaction,
+             * timestamps are in use anytime the commit timestamp is set. Check timestamps are used
+             * in order.
+             */
+            op_zero_ts = !F_ISSET(txn, WT_TXN_HAS_TS_COMMIT);
+            upd_zero_ts = prev_op_timestamp == WT_TS_NONE;
+            if (op_zero_ts != upd_zero_ts)
+                WT_RET_MSG(session, EINVAL, "per-key timestamps used inconsistently");
+            /*
+             * If we aren't using timestamps for this transaction then we are done checking. Don't
+             * check the timestamp because the one in the transaction is not cleared.
+             */
+            if (op_zero_ts)
+                continue;
+
+            /*
+             * Only if the update structure doesn't have a timestamp then use the one in the
+             * transaction structure.
+             */
+            if (op_timestamp == WT_TS_NONE)
+                op_timestamp = txn->commit_timestamp;
+            if (F_ISSET(txn, WT_TXN_TS_COMMIT_KEYS) && op_timestamp < prev_op_timestamp)
+                WT_RET_MSG(session, EINVAL, "out of order commit timestamps");
+            if (F_ISSET(txn, WT_TXN_TS_DURABLE_KEYS) &&
+              txn->durable_timestamp < durable_op_timestamp)
+                WT_RET_MSG(session, EINVAL, "out of order durable timestamps");
+        }
+    return (0);
}
/*
* __wt_txn_commit --
- * Commit the current transaction.
+ * Commit the current transaction.
*/
int
__wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[])
{
-    WT_CONFIG_ITEM cval;
-    WT_CONNECTION_IMPL *conn;
-    WT_DECL_RET;
-    WT_TXN *txn;
-    WT_TXN_GLOBAL *txn_global;
-    WT_TXN_OP *op;
-    WT_UPDATE *upd;
-    wt_timestamp_t candidate_durable_timestamp, prev_durable_timestamp;
-    int64_t resolved_update_count, visited_update_count;
-    uint32_t fileid;
-    u_int i;
-    bool locked, prepare, readonly, skip_update_assert, update_durable_ts;
-
-    txn = &session->txn;
-    conn = S2C(session);
-    txn_global = &conn->txn_global;
-    locked = skip_update_assert = false;
-    resolved_update_count = visited_update_count = 0;
-
-    WT_ASSERT(session, F_ISSET(txn, WT_TXN_RUNNING));
-    WT_ASSERT(session, !F_ISSET(txn, WT_TXN_ERROR) ||
-        txn->mod_count == 0);
-
-    readonly = txn->mod_count == 0;
-
-    prepare = F_ISSET(txn, WT_TXN_PREPARE);
-
-    /*
-     * Clear the prepared round up flag if the transaction is not prepared.
-     * There is no rounding up to do in that case.
-     */
-    if (!prepare)
-        F_CLR(txn, WT_TXN_TS_ROUND_PREPARED);
-
-    /* Set the commit and the durable timestamps. */
-    WT_ERR(__wt_txn_set_timestamp(session, cfg));
-
-    if (prepare) {
-        if (!F_ISSET(txn, WT_TXN_HAS_TS_COMMIT))
-            WT_ERR_MSG(session, EINVAL,
-                "commit_timestamp is required for a prepared "
-                "transaction");
-
-        if (!F_ISSET(txn, WT_TXN_HAS_TS_DURABLE))
-            WT_ERR_MSG(session, EINVAL,
-                "durable_timestamp is required for a prepared "
-                "transaction");
-
-        WT_ASSERT(session,
-            txn->prepare_timestamp <= txn->commit_timestamp);
-    } else {
-        if (F_ISSET(txn, WT_TXN_HAS_TS_PREPARE))
-            WT_ERR_MSG(session, EINVAL,
-                "prepare timestamp is set for non-prepared "
-                "transaction");
-
-        if (F_ISSET(txn, WT_TXN_HAS_TS_DURABLE))
-            WT_ERR_MSG(session, EINVAL,
-                "durable_timestamp should not be specified for "
-                "non-prepared transaction");
-    }
-
-    if (F_ISSET(txn, WT_TXN_HAS_TS_COMMIT))
-        WT_ASSERT(session,
-            txn->commit_timestamp <= txn->durable_timestamp);
-
-    WT_ERR(__txn_commit_timestamps_assert(session));
-
-    /*
-     * The default sync setting is inherited from the connection, but can
-     * be overridden by an explicit "sync" setting for this transaction.
-     */
-    WT_ERR(__wt_config_gets_def(session, cfg, "sync", 0, &cval));
-
-    /*
-     * If the user chose the default setting, check whether sync is enabled
-     * for this transaction (either inherited or via begin_transaction).
-     * If sync is disabled, clear the field to avoid the log write being
-     * flushed.
-     *
-     * Otherwise check for specific settings. We don't need to check for
-     * "on" because that is the default inherited from the connection. If
-     * the user set anything in begin_transaction, we only override with an
-     * explicit setting.
-     */
-    if (cval.len == 0) {
-        if (!FLD_ISSET(txn->txn_logsync, WT_LOG_SYNC_ENABLED) &&
-            !F_ISSET(txn, WT_TXN_SYNC_SET))
-            txn->txn_logsync = 0;
-    } else {
-        /*
-         * If the caller already set sync on begin_transaction then
-         * they should not be using sync on commit_transaction.
-         * Flag that as an error.
-         */
-        if (F_ISSET(txn, WT_TXN_SYNC_SET))
-            WT_ERR_MSG(session, EINVAL,
-                "Sync already set during begin_transaction");
-        if (WT_STRING_MATCH("background", cval.str, cval.len))
-            txn->txn_logsync = WT_LOG_BACKGROUND;
-        else if (WT_STRING_MATCH("off", cval.str, cval.len))
-            txn->txn_logsync = 0;
-        /*
-         * We don't need to check for "on" here because that is the
-         * default to inherit from the connection setting.
-         */
-    }
-
-    /* Commit notification. */
-    if (txn->notify != NULL)
-        WT_ERR(txn->notify->notify(txn->notify,
-            (WT_SESSION *)session, txn->id, 1));
-
-    /*
-     * We are about to release the snapshot: copy values into any
-     * positioned cursors so they don't point to updates that could be
-     * freed once we don't have a snapshot.
-     * If this transaction is prepared, then copying values would have been
-     * done during prepare.
-     */
-    if (session->ncursors > 0 && !prepare) {
-        WT_DIAGNOSTIC_YIELD;
-        WT_ERR(__wt_session_copy_values(session));
-    }
-
-    /* If we are logging, write a commit log record. */
-    if (txn->logrec != NULL &&
-        FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED) &&
-        !F_ISSET(session, WT_SESSION_NO_LOGGING)) {
-        /*
-         * We are about to block on I/O writing the log.
-         * Release our snapshot in case it is keeping data pinned.
-         * This is particularly important for checkpoints.
-         */
-        __wt_txn_release_snapshot(session);
-        /*
-         * We hold the visibility lock for reading from the time
-         * we write our log record until the time we release our
-         * transaction so that the LSN any checkpoint gets will
-         * always reflect visible data.
-         */
-        __wt_readlock(session, &txn_global->visibility_rwlock);
-        locked = true;
-        WT_ERR(__wt_txn_log_commit(session, cfg));
-    }
-
-    /* Note: we're going to commit: nothing can fail after this point. */
-
-    /* Process and free updates. */
-    for (i = 0, op = txn->mod; i < txn->mod_count; i++, op++) {
-        fileid = op->btree->id;
-        switch (op->type) {
-        case WT_TXN_OP_NONE:
-            break;
-        case WT_TXN_OP_BASIC_COL:
-        case WT_TXN_OP_BASIC_ROW:
-        case WT_TXN_OP_INMEM_COL:
-        case WT_TXN_OP_INMEM_ROW:
-            upd = op->u.op_upd;
-
-            /*
-             * Need to resolve indirect references of transaction
-             * operation, in case of prepared transaction.
-             */
-            if (!prepare) {
-                /*
-                 * Switch reserved operations to abort to
-                 * simplify obsolete update list truncation.
-                 */
-                if (upd->type == WT_UPDATE_RESERVE) {
-                    upd->txnid = WT_TXN_ABORTED;
-                    break;
-                }
-
-                /*
-                 * Writes to the lookaside file can be evicted
-                 * as soon as they commit.
-                 */
-                if (conn->cache->las_fileid != 0 &&
-                    fileid == conn->cache->las_fileid) {
-                    upd->txnid = WT_TXN_NONE;
-                    break;
-                }
-
-                __wt_txn_op_set_timestamp(session, op);
-            } else {
-                visited_update_count++;
-                /*
-                 * If we have set the key repeated flag
-                 * we can skip resolving prepared updates as
-                 * it would have happened on a previous
-                 * modification in this txn.
-                 */
-                if (!F_ISSET(op, WT_TXN_OP_KEY_REPEATED)) {
-                    skip_update_assert =
-                        skip_update_assert ||
-                        F_ISSET(op, WT_TXN_OP_KEY_RESERVED);
-                    WT_ERR(__wt_txn_resolve_prepared_op(
-                        session, op, true,
-                        &resolved_update_count));
-                }
-
-                /*
-                 * We should resolve at least one or more
-                 * updates each time we call
-                 * __wt_txn_resolve_prepared_op, as such
-                 * resolved update count should never be less
-                 * than visited update count.
-                 */
-                WT_ASSERT(session,
-                    resolved_update_count >=
-                    visited_update_count);
-            }
-
-            break;
-        case WT_TXN_OP_REF_DELETE:
-            __wt_txn_op_set_timestamp(session, op);
-            break;
-        case WT_TXN_OP_TRUNCATE_COL:
-        case WT_TXN_OP_TRUNCATE_ROW:
-            /* Other operations don't need timestamps. */
-            break;
-        }
-
-        __wt_txn_op_free(session, op);
-    }
-    WT_ASSERT(session, skip_update_assert ||
-        resolved_update_count == visited_update_count);
-    WT_STAT_CONN_INCRV(session, txn_prepared_updates_resolved,
-        resolved_update_count);
-
-    txn->mod_count = 0;
-
-    /*
-     * If durable is set, we'll try to update the global durable timestamp
-     * with that value. If durable isn't set, durable is implied to be the
-     * the same as commit so we'll use that instead.
-     */
-    candidate_durable_timestamp = WT_TS_NONE;
-    if (F_ISSET(txn, WT_TXN_HAS_TS_DURABLE))
-        candidate_durable_timestamp = txn->durable_timestamp;
-    else if (F_ISSET(txn, WT_TXN_HAS_TS_COMMIT))
-        candidate_durable_timestamp = txn->commit_timestamp;
-
-    __wt_txn_release(session);
-    if (locked)
-        __wt_readunlock(session, &txn_global->visibility_rwlock);
-
-    /*
-     * If we have made some updates visible, start a new commit generation:
-     * any cached snapshots have to be refreshed.
-     */
-    if (!readonly)
-        WT_IGNORE_RET(__wt_gen_next(session, WT_GEN_COMMIT));
-
-    /* First check if we've made something durable in the future. */
-    update_durable_ts = false;
-    prev_durable_timestamp = WT_TS_NONE;
-    if (candidate_durable_timestamp != WT_TS_NONE) {
-        prev_durable_timestamp = txn_global->durable_timestamp;
-        update_durable_ts =
-            candidate_durable_timestamp > prev_durable_timestamp;
-    }
-
-    /*
-     * If it looks like we'll need to move the global durable timestamp,
-     * attempt atomic cas and re-check.
-     */
-    if (update_durable_ts)
-        while (candidate_durable_timestamp > prev_durable_timestamp) {
-            if (__wt_atomic_cas64(&txn_global->durable_timestamp,
-                prev_durable_timestamp,
-                candidate_durable_timestamp)) {
-                txn_global->has_durable_timestamp = true;
-                break;
-            }
-            prev_durable_timestamp = txn_global->durable_timestamp;
-        }
-
-    /*
-     * We're between transactions, if we need to block for eviction, it's
-     * a good time to do so. Note that we must ignore any error return
-     * because the user's data is committed.
-     */
-    if (!readonly)
-        WT_IGNORE_RET(
-            __wt_cache_eviction_check(session, false, false, NULL));
-    return (0);
+    WT_CONFIG_ITEM cval;
+    WT_CONNECTION_IMPL *conn;
+    WT_DECL_RET;
+    WT_TXN *txn;
+    WT_TXN_GLOBAL *txn_global;
+    WT_TXN_OP *op;
+    WT_UPDATE *upd;
+    wt_timestamp_t candidate_durable_timestamp, prev_durable_timestamp;
+    int64_t resolved_update_count, visited_update_count;
+    uint32_t fileid;
+    u_int i;
+    bool locked, prepare, readonly, skip_update_assert, update_durable_ts;
+
+    txn = &session->txn;
+    conn = S2C(session);
+    txn_global = &conn->txn_global;
+    locked = skip_update_assert = false;
+    resolved_update_count = visited_update_count = 0;
+
+    WT_ASSERT(session, F_ISSET(txn, WT_TXN_RUNNING));
+    WT_ASSERT(session, !F_ISSET(txn, WT_TXN_ERROR) || txn->mod_count == 0);
+
+    readonly = txn->mod_count == 0;
+
+    prepare = F_ISSET(txn, WT_TXN_PREPARE);
+
+    /*
+     * Clear the prepared round up flag if the transaction is not prepared. There is no rounding up
+     * to do in that case.
+     */
+    if (!prepare)
+        F_CLR(txn, WT_TXN_TS_ROUND_PREPARED);
+
+    /* Set the commit and the durable timestamps. */
+    WT_ERR(__wt_txn_set_timestamp(session, cfg));
+
+    if (prepare) {
+        if (!F_ISSET(txn, WT_TXN_HAS_TS_COMMIT))
+            WT_ERR_MSG(session, EINVAL,
+              "commit_timestamp is required for a prepared "
+              "transaction");
+
+        if (!F_ISSET(txn, WT_TXN_HAS_TS_DURABLE))
+            WT_ERR_MSG(session, EINVAL,
+              "durable_timestamp is required for a prepared "
+              "transaction");
+
+        WT_ASSERT(session, txn->prepare_timestamp <= txn->commit_timestamp);
+    } else {
+        if (F_ISSET(txn, WT_TXN_HAS_TS_PREPARE))
+            WT_ERR_MSG(session, EINVAL,
+              "prepare timestamp is set for non-prepared "
+              "transaction");
+
+        if (F_ISSET(txn, WT_TXN_HAS_TS_DURABLE))
+            WT_ERR_MSG(session, EINVAL,
+              "durable_timestamp should not be specified for "
+              "non-prepared transaction");
+    }
+
+    if (F_ISSET(txn, WT_TXN_HAS_TS_COMMIT))
+        WT_ASSERT(session, txn->commit_timestamp <= txn->durable_timestamp);
+
+    WT_ERR(__txn_commit_timestamps_assert(session));
+
+    /*
+     * The default sync setting is inherited from the connection, but can be overridden by an
+     * explicit "sync" setting for this transaction.
+     */
+    WT_ERR(__wt_config_gets_def(session, cfg, "sync", 0, &cval));
+
+    /*
+     * If the user chose the default setting, check whether sync is enabled
+     * for this transaction (either inherited or via begin_transaction).
+     * If sync is disabled, clear the field to avoid the log write being
+     * flushed.
+     *
+     * Otherwise check for specific settings. We don't need to check for
+     * "on" because that is the default inherited from the connection. If
+     * the user set anything in begin_transaction, we only override with an
+     * explicit setting.
+     */
+    if (cval.len == 0) {
+        if (!FLD_ISSET(txn->txn_logsync, WT_LOG_SYNC_ENABLED) && !F_ISSET(txn, WT_TXN_SYNC_SET))
+            txn->txn_logsync = 0;
+    } else {
+        /*
+         * If the caller already set sync on begin_transaction then they should not be using sync on
+         * commit_transaction. Flag that as an error.
+         */
+        if (F_ISSET(txn, WT_TXN_SYNC_SET))
+            WT_ERR_MSG(session, EINVAL, "Sync already set during begin_transaction");
+        if (WT_STRING_MATCH("background", cval.str, cval.len))
+            txn->txn_logsync = WT_LOG_BACKGROUND;
+        else if (WT_STRING_MATCH("off", cval.str, cval.len))
+            txn->txn_logsync = 0;
+        /*
+         * We don't need to check for "on" here because that is the default to inherit from the
+         * connection setting.
+         */
+    }
+
+    /* Commit notification. */
+    if (txn->notify != NULL)
+        WT_ERR(txn->notify->notify(txn->notify, (WT_SESSION *)session, txn->id, 1));
+
+    /*
+     * We are about to release the snapshot: copy values into any positioned cursors so they don't
+     * point to updates that could be freed once we don't have a snapshot. If this transaction is
+     * prepared, then copying values would have been done during prepare.
+     */
+    if (session->ncursors > 0 && !prepare) {
+        WT_DIAGNOSTIC_YIELD;
+        WT_ERR(__wt_session_copy_values(session));
+    }
+
+    /* If we are logging, write a commit log record. */
+    if (txn->logrec != NULL && FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED) &&
+      !F_ISSET(session, WT_SESSION_NO_LOGGING)) {
+        /*
+         * We are about to block on I/O writing the log. Release our snapshot in case it is keeping
+         * data pinned. This is particularly important for checkpoints.
+         */
+        __wt_txn_release_snapshot(session);
+        /*
+         * We hold the visibility lock for reading from the time we write our log record until the
+         * time we release our transaction so that the LSN any checkpoint gets will always reflect
+         * visible data.
+         */
+        __wt_readlock(session, &txn_global->visibility_rwlock);
+        locked = true;
+        WT_ERR(__wt_txn_log_commit(session, cfg));
+    }
+
+    /* Note: we're going to commit: nothing can fail after this point. */
+
+    /* Process and free updates. */
+    for (i = 0, op = txn->mod; i < txn->mod_count; i++, op++) {
+        fileid = op->btree->id;
+        switch (op->type) {
+        case WT_TXN_OP_NONE:
+            break;
+        case WT_TXN_OP_BASIC_COL:
+        case WT_TXN_OP_BASIC_ROW:
+        case WT_TXN_OP_INMEM_COL:
+        case WT_TXN_OP_INMEM_ROW:
+            upd = op->u.op_upd;
+
+            /*
+             * Need to resolve indirect references of transaction operation, in case of prepared
+             * transaction.
+             */
+            if (!prepare) {
+                /*
+                 * Switch reserved operations to abort to simplify obsolete update list truncation.
+                 */
+                if (upd->type == WT_UPDATE_RESERVE) {
+                    upd->txnid = WT_TXN_ABORTED;
+                    break;
+                }
+
+                /*
+                 * Writes to the lookaside file can be evicted as soon as they commit.
+                 */
+                if (conn->cache->las_fileid != 0 && fileid == conn->cache->las_fileid) {
+                    upd->txnid = WT_TXN_NONE;
+                    break;
+                }
+
+                __wt_txn_op_set_timestamp(session, op);
+            } else {
+                visited_update_count++;
+                /*
+                 * If we have set the key repeated flag we can skip resolving prepared updates as it
+                 * would have happened on a previous modification in this txn.
+                 */
+                if (!F_ISSET(op, WT_TXN_OP_KEY_REPEATED)) {
+                    skip_update_assert = skip_update_assert || F_ISSET(op, WT_TXN_OP_KEY_RESERVED);
+                    WT_ERR(__wt_txn_resolve_prepared_op(session, op, true, &resolved_update_count));
+                }
+
+                /*
+                 * We should resolve at least one or more
+                 * updates each time we call
+                 * __wt_txn_resolve_prepared_op, as such
+                 * resolved update count should never be less
+                 * than visited update count.
+                 */
+                WT_ASSERT(session, resolved_update_count >= visited_update_count);
+            }
+
+            break;
+        case WT_TXN_OP_REF_DELETE:
+            __wt_txn_op_set_timestamp(session, op);
+            break;
+        case WT_TXN_OP_TRUNCATE_COL:
+        case WT_TXN_OP_TRUNCATE_ROW:
+            /* Other operations don't need timestamps. */
+            break;
+        }
+
+        __wt_txn_op_free(session, op);
+    }
+    /*
+     * NOTE(review): unlike the plain assert it replaces, WT_ERR_ASSERT presumably can fail and jump
+     * to err in non-diagnostic builds -- that happens after the "nothing can fail after this point"
+     * marker above, so confirm that rolling back here is safe.
+     */
+    WT_ERR_ASSERT(session, skip_update_assert || resolved_update_count == visited_update_count,
+      EINVAL, "Number of resolved prepared updates: %" PRId64
+              " does not match"
+              " number visited: %" PRId64,
+      resolved_update_count, visited_update_count);
+    WT_STAT_CONN_INCRV(session, txn_prepared_updates_resolved, resolved_update_count);
+
+    txn->mod_count = 0;
+
+    /*
+     * If durable is set, we'll try to update the global durable timestamp with that value. If
+     * durable isn't set, durable is implied to be the same as commit so we'll use that instead.
+     */
+    candidate_durable_timestamp = WT_TS_NONE;
+    if (F_ISSET(txn, WT_TXN_HAS_TS_DURABLE))
+        candidate_durable_timestamp = txn->durable_timestamp;
+    else if (F_ISSET(txn, WT_TXN_HAS_TS_COMMIT))
+        candidate_durable_timestamp = txn->commit_timestamp;
+
+    __wt_txn_release(session);
+    if (locked)
+        __wt_readunlock(session, &txn_global->visibility_rwlock);
+
+    /*
+     * If we have made some updates visible, start a new commit generation: any cached snapshots
+     * have to be refreshed.
+     */
+    if (!readonly)
+        WT_IGNORE_RET(__wt_gen_next(session, WT_GEN_COMMIT));
+
+    /* First check if we've made something durable in the future. */
+    update_durable_ts = false;
+    prev_durable_timestamp = WT_TS_NONE;
+    if (candidate_durable_timestamp != WT_TS_NONE) {
+        prev_durable_timestamp = txn_global->durable_timestamp;
+        update_durable_ts = candidate_durable_timestamp > prev_durable_timestamp;
+    }
+
+    /*
+     * If it looks like we'll need to move the global durable timestamp, attempt atomic cas and
+     * re-check.
+     */
+    if (update_durable_ts)
+        while (candidate_durable_timestamp > prev_durable_timestamp) {
+            if (__wt_atomic_cas64(&txn_global->durable_timestamp, prev_durable_timestamp,
+                  candidate_durable_timestamp)) {
+                txn_global->has_durable_timestamp = true;
+                break;
+            }
+            prev_durable_timestamp = txn_global->durable_timestamp;
+        }
+
+    /*
+     * We're between transactions, if we need to block for eviction, it's a good time to do so. Note
+     * that we must ignore any error return because the user's data is committed.
+     */
+    if (!readonly)
+        WT_IGNORE_RET(__wt_cache_eviction_check(session, false, false, NULL));
+    return (0);
err:
-    /*
-     * If anything went wrong, roll back.
-     *
-     * !!!
-     * Nothing can fail after this point.
-     */
-    if (locked)
-        __wt_readunlock(session, &txn_global->visibility_rwlock);
-    WT_TRET(__wt_txn_rollback(session, cfg));
-    return (ret);
+    /*
+     * If anything went wrong, roll back.
+     *
+     * !!!
+     * Nothing can fail after this point.
+     */
+    if (locked)
+        __wt_readunlock(session, &txn_global->visibility_rwlock);
+    WT_TRET(__wt_txn_rollback(session, cfg));
+    return (ret);
}
/*
* __wt_txn_prepare --
- * Prepare the current transaction.
+ * Prepare the current transaction.
*/
int
__wt_txn_prepare(WT_SESSION_IMPL *session, const char *cfg[])
{
- WT_TXN *txn;
- WT_TXN_OP *op;
- WT_UPDATE *upd;
- u_int i;
-
- txn = &session->txn;
-
- WT_ASSERT(session, F_ISSET(txn, WT_TXN_RUNNING));
- WT_ASSERT(session, !F_ISSET(txn, WT_TXN_ERROR) || txn->mod_count == 0);
- /*
- * A transaction should not have updated any of the logged tables,
- * if debug mode logging is not turned on.
- */
- if (!FLD_ISSET(S2C(session)->log_flags, WT_CONN_LOG_DEBUG_MODE))
- WT_ASSERT(session, txn->logrec == NULL);
-
- /* Set the prepare timestamp. */
- WT_RET(__wt_txn_set_timestamp(session, cfg));
-
- if (!F_ISSET(txn, WT_TXN_HAS_TS_PREPARE))
- WT_RET_MSG(session, EINVAL, "prepare timestamp is not set");
-
- /*
- * We are about to release the snapshot: copy values into any
- * positioned cursors so they don't point to updates that could be
- * freed once we don't have a snapshot.
- */
- if (session->ncursors > 0) {
- WT_DIAGNOSTIC_YIELD;
- WT_RET(__wt_session_copy_values(session));
- }
-
- /*
- * Prepare updates, traverse the modification array in reverse order
- * so that we visit the update chain in newest to oldest order
- * allowing us to set the key repeated flag with reserved updates in
- * the chain.
- */
- for (i = txn->mod_count; i > 0; i--) {
- op = &txn->mod[i - 1];
- /* Assert it's not an update to the lookaside file. */
- WT_ASSERT(session, S2C(session)->cache->las_fileid == 0 ||
- !F_ISSET(op->btree, WT_BTREE_LOOKASIDE));
-
- /* Metadata updates should never be prepared. */
- WT_ASSERT(session, !WT_IS_METADATA(op->btree->dhandle));
- if (WT_IS_METADATA(op->btree->dhandle))
- continue;
-
- upd = op->u.op_upd;
-
- switch (op->type) {
- case WT_TXN_OP_NONE:
- break;
- case WT_TXN_OP_BASIC_COL:
- case WT_TXN_OP_BASIC_ROW:
- case WT_TXN_OP_INMEM_COL:
- case WT_TXN_OP_INMEM_ROW:
- /*
- * Switch reserved operation to abort to simplify
- * obsolete update list truncation. The object free
- * function clears the operation type so we don't
- * try to visit this update again: it can be evicted.
- */
- if (upd->type == WT_UPDATE_RESERVE) {
- upd->txnid = WT_TXN_ABORTED;
- __wt_txn_op_free(session, op);
- break;
- }
-
- /* Set prepare timestamp. */
- upd->start_ts = txn->prepare_timestamp;
-
- WT_PUBLISH(upd->prepare_state, WT_PREPARE_INPROGRESS);
- op->u.op_upd = NULL;
- WT_STAT_CONN_INCR(session, txn_prepared_updates_count);
- /*
- * Set the key repeated flag which tells us that we've
- * got multiple updates to the same key by the same txn.
- * This is later used in txn commit.
- *
- * When we see a reserved update we set the
- * WT_UPDATE_RESERVED flag instead. We do this as we
- * cannot know if our current update should specify the
- * key repeated flag as we don't want to traverse the
- * entire update chain to find out. i.e. if there is
- * an update with our txnid after the reserved update
- * we should set key repeated, but if there isn't we
- * shouldn't.
- */
- if (upd->next != NULL &&
- upd->txnid == upd->next->txnid) {
- if (upd->next->type == WT_UPDATE_RESERVE)
- F_SET(op, WT_TXN_OP_KEY_RESERVED);
- else
- F_SET(op, WT_TXN_OP_KEY_REPEATED);
- }
- break;
- case WT_TXN_OP_REF_DELETE:
- __wt_txn_op_apply_prepare_state(
- session, op->u.ref, false);
- break;
- case WT_TXN_OP_TRUNCATE_COL:
- case WT_TXN_OP_TRUNCATE_ROW:
- /* Other operations don't need timestamps. */
- break;
- }
- }
-
- /* Set transaction state to prepare. */
- F_SET(&session->txn, WT_TXN_PREPARE);
-
- /* Release our snapshot in case it is keeping data pinned. */
- __wt_txn_release_snapshot(session);
-
- /*
- * Clear the transaction's ID from the global table, to facilitate
- * prepared data visibility, but not from local transaction structure.
- */
- if (F_ISSET(txn, WT_TXN_HAS_ID))
- __txn_remove_from_global_table(session);
-
- return (0);
+ WT_TXN *txn;
+ WT_TXN_OP *op;
+ WT_UPDATE *upd;
+ u_int i;
+
+ txn = &session->txn;
+
+ WT_ASSERT(session, F_ISSET(txn, WT_TXN_RUNNING));
+ WT_ASSERT(session, !F_ISSET(txn, WT_TXN_ERROR) || txn->mod_count == 0);
+ /*
+ * A transaction should not have updated any of the logged tables, if debug mode logging is not
+ * turned on.
+ */
+ if (!FLD_ISSET(S2C(session)->log_flags, WT_CONN_LOG_DEBUG_MODE))
+ WT_RET_ASSERT(session, txn->logrec == NULL, EINVAL,
+ "A transaction should not have been assigned a log"
+ " record if WT_CONN_LOG_DEBUG mode is not enabled");
+
+ /* Set the prepare timestamp. */
+ WT_RET(__wt_txn_set_timestamp(session, cfg));
+
+ if (!F_ISSET(txn, WT_TXN_HAS_TS_PREPARE))
+ WT_RET_MSG(session, EINVAL, "prepare timestamp is not set");
+
+ /*
+ * We are about to release the snapshot: copy values into any positioned cursors so they don't
+ * point to updates that could be freed once we don't have a snapshot.
+ */
+ if (session->ncursors > 0) {
+ WT_DIAGNOSTIC_YIELD;
+ WT_RET(__wt_session_copy_values(session));
+ }
+
+ /*
+ * Prepare updates, traverse the modification array in reverse order so that we visit the update
+ * chain in newest to oldest order allowing us to set the key repeated flag with reserved
+ * updates in the chain.
+ */
+ for (i = txn->mod_count; i > 0; i--) {
+ op = &txn->mod[i - 1];
+ /* Assert it's not an update to the lookaside file. */
+ WT_ASSERT(
+ session, S2C(session)->cache->las_fileid == 0 || !F_ISSET(op->btree, WT_BTREE_LOOKASIDE));
+
+ /* Metadata updates should never be prepared. */
+ WT_ASSERT(session, !WT_IS_METADATA(op->btree->dhandle));
+ if (WT_IS_METADATA(op->btree->dhandle))
+ continue;
+
+ upd = op->u.op_upd;
+
+ switch (op->type) {
+ case WT_TXN_OP_NONE:
+ break;
+ case WT_TXN_OP_BASIC_COL:
+ case WT_TXN_OP_BASIC_ROW:
+ case WT_TXN_OP_INMEM_COL:
+ case WT_TXN_OP_INMEM_ROW:
+ /*
+ * Switch reserved operation to abort to simplify obsolete update list truncation. The
+ * object free function clears the operation type so we don't try to visit this update
+ * again: it can be evicted.
+ */
+ if (upd->type == WT_UPDATE_RESERVE) {
+ upd->txnid = WT_TXN_ABORTED;
+ __wt_txn_op_free(session, op);
+ break;
+ }
+
+ /* Set prepare timestamp. */
+ upd->start_ts = txn->prepare_timestamp;
+
+ WT_PUBLISH(upd->prepare_state, WT_PREPARE_INPROGRESS);
+ op->u.op_upd = NULL;
+ WT_STAT_CONN_INCR(session, txn_prepared_updates_count);
+ /*
+ * Set the key repeated flag which tells us that we've
+ * got multiple updates to the same key by the same txn.
+ * This is later used in txn commit.
+ *
+ * When we see a reserved update we set the
+ * WT_UPDATE_RESERVED flag instead. We do this as we
+ * cannot know if our current update should specify the
+ * key repeated flag as we don't want to traverse the
+ * entire update chain to find out. i.e. if there is
+ * an update with our txnid after the reserved update
+ * we should set key repeated, but if there isn't we
+ * shouldn't.
+ */
+ if (upd->next != NULL && upd->txnid == upd->next->txnid) {
+ if (upd->next->type == WT_UPDATE_RESERVE)
+ F_SET(op, WT_TXN_OP_KEY_RESERVED);
+ else
+ F_SET(op, WT_TXN_OP_KEY_REPEATED);
+ }
+ break;
+ case WT_TXN_OP_REF_DELETE:
+ __wt_txn_op_apply_prepare_state(session, op->u.ref, false);
+ break;
+ case WT_TXN_OP_TRUNCATE_COL:
+ case WT_TXN_OP_TRUNCATE_ROW:
+ /* Other operations don't need timestamps. */
+ break;
+ }
+ }
+
+ /* Set transaction state to prepare. */
+ F_SET(&session->txn, WT_TXN_PREPARE);
+
+ /* Release our snapshot in case it is keeping data pinned. */
+ __wt_txn_release_snapshot(session);
+
+ /*
+ * Clear the transaction's ID from the global table, to facilitate prepared data visibility, but
+ * not from local transaction structure.
+ */
+ if (F_ISSET(txn, WT_TXN_HAS_ID))
+ __txn_remove_from_global_table(session);
+
+ return (0);
}
/*
* __wt_txn_rollback --
- * Roll back the current transaction.
+ * Roll back the current transaction.
*/
int
__wt_txn_rollback(WT_SESSION_IMPL *session, const char *cfg[])
{
- WT_DECL_RET;
- WT_TXN *txn;
- WT_TXN_OP *op;
- WT_UPDATE *upd;
- int64_t resolved_update_count, visited_update_count;
- u_int i;
- bool readonly, skip_update_assert;
-
- WT_UNUSED(cfg);
- resolved_update_count = visited_update_count = 0;
- txn = &session->txn;
- readonly = txn->mod_count == 0;
- skip_update_assert = false;
- WT_ASSERT(session, F_ISSET(txn, WT_TXN_RUNNING));
-
- /* Rollback notification. */
- if (txn->notify != NULL)
- WT_TRET(txn->notify->notify(txn->notify, (WT_SESSION *)session,
- txn->id, 0));
-
- /* Rollback updates. */
- for (i = 0, op = txn->mod; i < txn->mod_count; i++, op++) {
- /* Assert it's not an update to the lookaside file. */
- WT_ASSERT(session, S2C(session)->cache->las_fileid == 0 ||
- !F_ISSET(op->btree, WT_BTREE_LOOKASIDE));
-
- /* Metadata updates should never be rolled back. */
- WT_ASSERT(session, !WT_IS_METADATA(op->btree->dhandle));
- if (WT_IS_METADATA(op->btree->dhandle))
- continue;
-
- upd = op->u.op_upd;
-
- switch (op->type) {
- case WT_TXN_OP_NONE:
- break;
- case WT_TXN_OP_BASIC_COL:
- case WT_TXN_OP_BASIC_ROW:
- case WT_TXN_OP_INMEM_COL:
- case WT_TXN_OP_INMEM_ROW:
- /*
- * Need to resolve indirect references of transaction
- * operation, in case of prepared transaction.
- */
- if (F_ISSET(txn, WT_TXN_PREPARE)) {
- visited_update_count++;
- /*
- * If we have set the key repeated flag
- * we can skip resolving prepared updates as
- * it would have happened on a previous
- * modification in this txn.
- */
- if (!F_ISSET(op, WT_TXN_OP_KEY_REPEATED)) {
- skip_update_assert =
- skip_update_assert ||
- F_ISSET(op, WT_TXN_OP_KEY_RESERVED);
- WT_RET(__wt_txn_resolve_prepared_op(
- session, op, false,
- &resolved_update_count));
- }
- /*
- * We should resolve at least one or more
- * updates each time we call
- * __wt_txn_resolve_prepared_op, as such
- * resolved update count should never be less
- * than visited update count.
- */
- WT_ASSERT(session,
- resolved_update_count >=
- visited_update_count);
- } else {
- WT_ASSERT(session, upd->txnid == txn->id ||
- upd->txnid == WT_TXN_ABORTED);
- upd->txnid = WT_TXN_ABORTED;
- }
- break;
- case WT_TXN_OP_REF_DELETE:
- WT_TRET(__wt_delete_page_rollback(session, op->u.ref));
- break;
- case WT_TXN_OP_TRUNCATE_COL:
- case WT_TXN_OP_TRUNCATE_ROW:
- /*
- * Nothing to do: these operations are only logged for
- * recovery. The in-memory changes will be rolled back
- * with a combination of WT_TXN_OP_REF_DELETE and
- * WT_TXN_OP_INMEM operations.
- */
- break;
- }
-
- __wt_txn_op_free(session, op);
- }
- WT_ASSERT(session, skip_update_assert ||
- resolved_update_count == visited_update_count);
- WT_STAT_CONN_INCRV(session, txn_prepared_updates_resolved,
- resolved_update_count);
-
- txn->mod_count = 0;
-
- __wt_txn_release(session);
- /*
- * We're between transactions, if we need to block for eviction, it's
- * a good time to do so. Note that we must ignore any error return
- * because the user's data is committed.
- */
- if (!readonly)
- WT_IGNORE_RET(
- __wt_cache_eviction_check(session, false, false, NULL));
- return (ret);
+ WT_DECL_RET;
+ WT_TXN *txn;
+ WT_TXN_OP *op;
+ WT_UPDATE *upd;
+ int64_t resolved_update_count, visited_update_count;
+ u_int i;
+ bool readonly, skip_update_assert;
+
+ WT_UNUSED(cfg);
+ resolved_update_count = visited_update_count = 0;
+ txn = &session->txn;
+ readonly = txn->mod_count == 0;
+ skip_update_assert = false;
+ WT_ASSERT(session, F_ISSET(txn, WT_TXN_RUNNING));
+
+ /* Rollback notification. */
+ if (txn->notify != NULL)
+ WT_TRET(txn->notify->notify(txn->notify, (WT_SESSION *)session, txn->id, 0));
+
+ /* Rollback updates. */
+ for (i = 0, op = txn->mod; i < txn->mod_count; i++, op++) {
+ /* Assert it's not an update to the lookaside file. */
+ WT_ASSERT(
+ session, S2C(session)->cache->las_fileid == 0 || !F_ISSET(op->btree, WT_BTREE_LOOKASIDE));
+
+ /* Metadata updates should never be rolled back. */
+ WT_ASSERT(session, !WT_IS_METADATA(op->btree->dhandle));
+ if (WT_IS_METADATA(op->btree->dhandle))
+ continue;
+
+ upd = op->u.op_upd;
+
+ switch (op->type) {
+ case WT_TXN_OP_NONE:
+ break;
+ case WT_TXN_OP_BASIC_COL:
+ case WT_TXN_OP_BASIC_ROW:
+ case WT_TXN_OP_INMEM_COL:
+ case WT_TXN_OP_INMEM_ROW:
+ /*
+ * Need to resolve indirect references of transaction operation, in case of prepared
+ * transaction.
+ */
+ if (F_ISSET(txn, WT_TXN_PREPARE)) {
+ visited_update_count++;
+ /*
+ * If we have set the key repeated flag we can skip resolving prepared updates as it
+ * would have happened on a previous modification in this txn.
+ */
+ if (!F_ISSET(op, WT_TXN_OP_KEY_REPEATED)) {
+ skip_update_assert = skip_update_assert || F_ISSET(op, WT_TXN_OP_KEY_RESERVED);
+ WT_RET(
+ __wt_txn_resolve_prepared_op(session, op, false, &resolved_update_count));
+ }
+ /*
+ * We should resolve at least one or more
+ * updates each time we call
+ * __wt_txn_resolve_prepared_op, as such
+ * resolved update count should never be less
+ * than visited update count.
+ */
+ WT_ASSERT(session, resolved_update_count >= visited_update_count);
+ } else {
+ WT_ASSERT(session, upd->txnid == txn->id || upd->txnid == WT_TXN_ABORTED);
+ upd->txnid = WT_TXN_ABORTED;
+ }
+ break;
+ case WT_TXN_OP_REF_DELETE:
+ WT_TRET(__wt_delete_page_rollback(session, op->u.ref));
+ break;
+ case WT_TXN_OP_TRUNCATE_COL:
+ case WT_TXN_OP_TRUNCATE_ROW:
+ /*
+ * Nothing to do: these operations are only logged for recovery. The in-memory changes
+ * will be rolled back with a combination of WT_TXN_OP_REF_DELETE and WT_TXN_OP_INMEM
+ * operations.
+ */
+ break;
+ }
+
+ __wt_txn_op_free(session, op);
+ }
+ WT_RET_ASSERT(session, skip_update_assert || resolved_update_count == visited_update_count,
+ EINVAL, "Number of resolved prepared updates: %" PRId64
+ " does not match"
+ " number visited: %" PRId64,
+ resolved_update_count, visited_update_count);
+ WT_STAT_CONN_INCRV(session, txn_prepared_updates_resolved, resolved_update_count);
+
+ txn->mod_count = 0;
+
+ __wt_txn_release(session);
+ /*
+ * We're between transactions, if we need to block for eviction, it's a good time to do so. Note
+ * that we must ignore any error return because the user's data is committed.
+ */
+ if (!readonly)
+ WT_IGNORE_RET(__wt_cache_eviction_check(session, false, false, NULL));
+ return (ret);
}
/*
* __wt_txn_rollback_required --
- * Prepare to log a reason if the user attempts to use the transaction to
- * do anything other than rollback.
+ * Prepare to log a reason if the user attempts to use the transaction to do anything other than
+ * rollback.
*/
int
__wt_txn_rollback_required(WT_SESSION_IMPL *session, const char *reason)
{
- session->txn.rollback_reason = reason;
- return (WT_ROLLBACK);
+ session->txn.rollback_reason = reason;
+ return (WT_ROLLBACK);
}
/*
* __wt_txn_init --
- * Initialize a session's transaction data.
+ * Initialize a session's transaction data.
*/
int
__wt_txn_init(WT_SESSION_IMPL *session, WT_SESSION_IMPL *session_ret)
{
- WT_TXN *txn;
+ WT_TXN *txn;
- txn = &session_ret->txn;
- txn->id = WT_TXN_NONE;
+ txn = &session_ret->txn;
+ txn->id = WT_TXN_NONE;
- WT_RET(__wt_calloc_def(session,
- S2C(session_ret)->session_size, &txn->snapshot));
+ WT_RET(__wt_calloc_def(session, S2C(session_ret)->session_size, &txn->snapshot));
#ifdef HAVE_DIAGNOSTIC
- if (S2C(session_ret)->txn_global.states != NULL) {
- WT_TXN_STATE *txn_state;
- txn_state = WT_SESSION_TXN_STATE(session_ret);
- WT_ASSERT(session, txn_state->pinned_id == WT_TXN_NONE);
- }
+ if (S2C(session_ret)->txn_global.states != NULL) {
+ WT_TXN_STATE *txn_state;
+ txn_state = WT_SESSION_TXN_STATE(session_ret);
+ WT_ASSERT(session, txn_state->pinned_id == WT_TXN_NONE);
+ }
#endif
- /*
- * Take care to clean these out in case we are reusing the transaction
- * for eviction.
- */
- txn->mod = NULL;
+ /*
+ * Take care to clean these out in case we are reusing the transaction for eviction.
+ */
+ txn->mod = NULL;
- txn->isolation = session_ret->isolation;
- return (0);
+ txn->isolation = session_ret->isolation;
+ return (0);
}
/*
* __wt_txn_stats_update --
- * Update the transaction statistics for return to the application.
+ * Update the transaction statistics for return to the application.
*/
void
__wt_txn_stats_update(WT_SESSION_IMPL *session)
{
- WT_CONNECTION_IMPL *conn;
- WT_CONNECTION_STATS **stats;
- WT_TXN_GLOBAL *txn_global;
- wt_timestamp_t checkpoint_timestamp;
- wt_timestamp_t durable_timestamp;
- wt_timestamp_t oldest_active_read_timestamp;
- wt_timestamp_t pinned_timestamp;
- uint64_t checkpoint_pinned, snapshot_pinned;
-
- conn = S2C(session);
- txn_global = &conn->txn_global;
- stats = conn->stats;
- checkpoint_pinned = txn_global->checkpoint_state.pinned_id;
- snapshot_pinned = txn_global->nsnap_oldest_id;
-
- WT_STAT_SET(session, stats, txn_pinned_range,
- txn_global->current - txn_global->oldest_id);
-
- checkpoint_timestamp = txn_global->checkpoint_timestamp;
- durable_timestamp = txn_global->durable_timestamp;
- pinned_timestamp = txn_global->pinned_timestamp;
- if (checkpoint_timestamp != WT_TS_NONE &&
- checkpoint_timestamp < pinned_timestamp)
- pinned_timestamp = checkpoint_timestamp;
- WT_STAT_SET(session, stats, txn_pinned_timestamp,
- durable_timestamp - pinned_timestamp);
- WT_STAT_SET(session, stats, txn_pinned_timestamp_checkpoint,
- durable_timestamp - checkpoint_timestamp);
- WT_STAT_SET(session, stats, txn_pinned_timestamp_oldest,
- durable_timestamp - txn_global->oldest_timestamp);
-
- if (__wt_txn_get_pinned_timestamp(
- session, &oldest_active_read_timestamp, 0) == 0) {
- WT_STAT_SET(session, stats,
- txn_timestamp_oldest_active_read,
- oldest_active_read_timestamp);
- WT_STAT_SET(session, stats,
- txn_pinned_timestamp_reader,
- durable_timestamp - oldest_active_read_timestamp);
- } else {
- WT_STAT_SET(session,
- stats, txn_timestamp_oldest_active_read, 0);
- WT_STAT_SET(session,
- stats, txn_pinned_timestamp_reader, 0);
- }
-
- WT_STAT_SET(session, stats, txn_pinned_snapshot_range,
- snapshot_pinned == WT_TXN_NONE ?
- 0 : txn_global->current - snapshot_pinned);
-
- WT_STAT_SET(session, stats, txn_pinned_checkpoint_range,
- checkpoint_pinned == WT_TXN_NONE ?
- 0 : txn_global->current - checkpoint_pinned);
-
- WT_STAT_SET(
- session, stats, txn_checkpoint_time_max, conn->ckpt_time_max);
- WT_STAT_SET(
- session, stats, txn_checkpoint_time_min, conn->ckpt_time_min);
- WT_STAT_SET(
- session, stats, txn_checkpoint_time_recent, conn->ckpt_time_recent);
- WT_STAT_SET(
- session, stats, txn_checkpoint_time_total, conn->ckpt_time_total);
- WT_STAT_SET(session,
- stats, txn_durable_queue_len, txn_global->durable_timestampq_len);
- WT_STAT_SET(session,
- stats, txn_read_queue_len, txn_global->read_timestampq_len);
+ WT_CONNECTION_IMPL *conn;
+ WT_CONNECTION_STATS **stats;
+ WT_TXN_GLOBAL *txn_global;
+ wt_timestamp_t checkpoint_timestamp;
+ wt_timestamp_t durable_timestamp;
+ wt_timestamp_t oldest_active_read_timestamp;
+ wt_timestamp_t pinned_timestamp;
+ uint64_t checkpoint_pinned, snapshot_pinned;
+
+ conn = S2C(session);
+ txn_global = &conn->txn_global;
+ stats = conn->stats;
+ checkpoint_pinned = txn_global->checkpoint_state.pinned_id;
+ snapshot_pinned = txn_global->nsnap_oldest_id;
+
+ WT_STAT_SET(session, stats, txn_pinned_range, txn_global->current - txn_global->oldest_id);
+
+ checkpoint_timestamp = txn_global->checkpoint_timestamp;
+ durable_timestamp = txn_global->durable_timestamp;
+ pinned_timestamp = txn_global->pinned_timestamp;
+ if (checkpoint_timestamp != WT_TS_NONE && checkpoint_timestamp < pinned_timestamp)
+ pinned_timestamp = checkpoint_timestamp;
+ WT_STAT_SET(session, stats, txn_pinned_timestamp, durable_timestamp - pinned_timestamp);
+ WT_STAT_SET(
+ session, stats, txn_pinned_timestamp_checkpoint, durable_timestamp - checkpoint_timestamp);
+ WT_STAT_SET(session, stats, txn_pinned_timestamp_oldest,
+ durable_timestamp - txn_global->oldest_timestamp);
+
+ if (__wt_txn_get_pinned_timestamp(session, &oldest_active_read_timestamp, 0) == 0) {
+ WT_STAT_SET(session, stats, txn_timestamp_oldest_active_read, oldest_active_read_timestamp);
+ WT_STAT_SET(session, stats, txn_pinned_timestamp_reader,
+ durable_timestamp - oldest_active_read_timestamp);
+ } else {
+ WT_STAT_SET(session, stats, txn_timestamp_oldest_active_read, 0);
+ WT_STAT_SET(session, stats, txn_pinned_timestamp_reader, 0);
+ }
+
+ WT_STAT_SET(session, stats, txn_pinned_snapshot_range,
+ snapshot_pinned == WT_TXN_NONE ? 0 : txn_global->current - snapshot_pinned);
+
+ WT_STAT_SET(session, stats, txn_pinned_checkpoint_range,
+ checkpoint_pinned == WT_TXN_NONE ? 0 : txn_global->current - checkpoint_pinned);
+
+ WT_STAT_SET(session, stats, txn_checkpoint_time_max, conn->ckpt_time_max);
+ WT_STAT_SET(session, stats, txn_checkpoint_time_min, conn->ckpt_time_min);
+ WT_STAT_SET(session, stats, txn_checkpoint_time_recent, conn->ckpt_time_recent);
+ WT_STAT_SET(session, stats, txn_checkpoint_time_total, conn->ckpt_time_total);
+ WT_STAT_SET(session, stats, txn_durable_queue_len, txn_global->durable_timestampq_len);
+ WT_STAT_SET(session, stats, txn_read_queue_len, txn_global->read_timestampq_len);
}
/*
* __wt_txn_release_resources --
- * Release resources for a session's transaction data.
+ * Release resources for a session's transaction data.
*/
void
__wt_txn_release_resources(WT_SESSION_IMPL *session)
{
- WT_TXN *txn;
+ WT_TXN *txn;
- txn = &session->txn;
+ txn = &session->txn;
- WT_ASSERT(session, txn->mod_count == 0);
- __wt_free(session, txn->mod);
- txn->mod_alloc = 0;
- txn->mod_count = 0;
+ WT_ASSERT(session, txn->mod_count == 0);
+ __wt_free(session, txn->mod);
+ txn->mod_alloc = 0;
+ txn->mod_count = 0;
}
/*
* __wt_txn_destroy --
- * Destroy a session's transaction data.
+ * Destroy a session's transaction data.
*/
void
__wt_txn_destroy(WT_SESSION_IMPL *session)
{
- __wt_txn_release_resources(session);
- __wt_free(session, session->txn.snapshot);
+ __wt_txn_release_resources(session);
+ __wt_free(session, session->txn.snapshot);
}
/*
* __wt_txn_global_init --
- * Initialize the global transaction state.
+ * Initialize the global transaction state.
*/
int
__wt_txn_global_init(WT_SESSION_IMPL *session, const char *cfg[])
{
- WT_CONNECTION_IMPL *conn;
- WT_TXN_GLOBAL *txn_global;
- WT_TXN_STATE *s;
- u_int i;
+ WT_CONNECTION_IMPL *conn;
+ WT_TXN_GLOBAL *txn_global;
+ WT_TXN_STATE *s;
+ u_int i;
- WT_UNUSED(cfg);
- conn = S2C(session);
+ WT_UNUSED(cfg);
+ conn = S2C(session);
- txn_global = &conn->txn_global;
- txn_global->current = txn_global->last_running =
- txn_global->metadata_pinned = txn_global->oldest_id = WT_TXN_FIRST;
+ txn_global = &conn->txn_global;
+ txn_global->current = txn_global->last_running = txn_global->metadata_pinned =
+ txn_global->oldest_id = WT_TXN_FIRST;
- WT_RET(__wt_spin_init(
- session, &txn_global->id_lock, "transaction id lock"));
- WT_RWLOCK_INIT_TRACKED(session, &txn_global->rwlock, txn_global);
- WT_RET(__wt_rwlock_init(session, &txn_global->visibility_rwlock));
+ WT_RET(__wt_spin_init(session, &txn_global->id_lock, "transaction id lock"));
+ WT_RWLOCK_INIT_TRACKED(session, &txn_global->rwlock, txn_global);
+ WT_RET(__wt_rwlock_init(session, &txn_global->visibility_rwlock));
- WT_RWLOCK_INIT_TRACKED(session,
- &txn_global->durable_timestamp_rwlock, durable_timestamp);
- TAILQ_INIT(&txn_global->durable_timestamph);
+ WT_RWLOCK_INIT_TRACKED(session, &txn_global->durable_timestamp_rwlock, durable_timestamp);
+ TAILQ_INIT(&txn_global->durable_timestamph);
- WT_RWLOCK_INIT_TRACKED(session,
- &txn_global->read_timestamp_rwlock, read_timestamp);
- TAILQ_INIT(&txn_global->read_timestamph);
+ WT_RWLOCK_INIT_TRACKED(session, &txn_global->read_timestamp_rwlock, read_timestamp);
+ TAILQ_INIT(&txn_global->read_timestamph);
- WT_RET(__wt_rwlock_init(session, &txn_global->nsnap_rwlock));
- txn_global->nsnap_oldest_id = WT_TXN_NONE;
- TAILQ_INIT(&txn_global->nsnaph);
+ WT_RET(__wt_rwlock_init(session, &txn_global->nsnap_rwlock));
+ txn_global->nsnap_oldest_id = WT_TXN_NONE;
+ TAILQ_INIT(&txn_global->nsnaph);
- WT_RET(__wt_calloc_def(
- session, conn->session_size, &txn_global->states));
+ WT_RET(__wt_calloc_def(session, conn->session_size, &txn_global->states));
- for (i = 0, s = txn_global->states; i < conn->session_size; i++, s++)
- s->id = s->metadata_pinned = s->pinned_id = WT_TXN_NONE;
+ for (i = 0, s = txn_global->states; i < conn->session_size; i++, s++)
+ s->id = s->metadata_pinned = s->pinned_id = WT_TXN_NONE;
- return (0);
+ return (0);
}
/*
* __wt_txn_global_destroy --
- * Destroy the global transaction state.
+ * Destroy the global transaction state.
*/
void
__wt_txn_global_destroy(WT_SESSION_IMPL *session)
{
- WT_CONNECTION_IMPL *conn;
- WT_TXN_GLOBAL *txn_global;
-
- conn = S2C(session);
- txn_global = &conn->txn_global;
-
- if (txn_global == NULL)
- return;
-
- __wt_spin_destroy(session, &txn_global->id_lock);
- __wt_rwlock_destroy(session, &txn_global->rwlock);
- __wt_rwlock_destroy(session, &txn_global->durable_timestamp_rwlock);
- __wt_rwlock_destroy(session, &txn_global->read_timestamp_rwlock);
- __wt_rwlock_destroy(session, &txn_global->nsnap_rwlock);
- __wt_rwlock_destroy(session, &txn_global->visibility_rwlock);
- __wt_free(session, txn_global->states);
+ WT_CONNECTION_IMPL *conn;
+ WT_TXN_GLOBAL *txn_global;
+
+ conn = S2C(session);
+ txn_global = &conn->txn_global;
+
+ if (txn_global == NULL)
+ return;
+
+ __wt_spin_destroy(session, &txn_global->id_lock);
+ __wt_rwlock_destroy(session, &txn_global->rwlock);
+ __wt_rwlock_destroy(session, &txn_global->durable_timestamp_rwlock);
+ __wt_rwlock_destroy(session, &txn_global->read_timestamp_rwlock);
+ __wt_rwlock_destroy(session, &txn_global->nsnap_rwlock);
+ __wt_rwlock_destroy(session, &txn_global->visibility_rwlock);
+ __wt_free(session, txn_global->states);
}
/*
* __wt_txn_activity_drain --
- * Wait for transactions to quiesce.
+ * Wait for transactions to quiesce.
*/
int
__wt_txn_activity_drain(WT_SESSION_IMPL *session)
{
- bool txn_active;
-
- /*
- * It's possible that the eviction server is in the middle of a long
- * operation, with a transaction ID pinned. In that case, we will loop
- * here until the transaction ID is released, when the oldest
- * transaction ID will catch up with the current ID.
- */
- for (;;) {
- WT_RET(__wt_txn_activity_check(session, &txn_active));
- if (!txn_active)
- break;
-
- WT_STAT_CONN_INCR(session, txn_release_blocked);
- __wt_yield();
- }
-
- return (0);
+ bool txn_active;
+
+ /*
+ * It's possible that the eviction server is in the middle of a long operation, with a
+ * transaction ID pinned. In that case, we will loop here until the transaction ID is released,
+ * when the oldest transaction ID will catch up with the current ID.
+ */
+ for (;;) {
+ WT_RET(__wt_txn_activity_check(session, &txn_active));
+ if (!txn_active)
+ break;
+
+ WT_STAT_CONN_INCR(session, txn_release_blocked);
+ __wt_yield();
+ }
+
+ return (0);
}
/*
* __wt_txn_global_shutdown --
- * Shut down the global transaction state.
+ * Shut down the global transaction state.
*/
void
__wt_txn_global_shutdown(WT_SESSION_IMPL *session)
{
- /*
- * All application transactions have completed, ignore the pinned
- * timestamp so that updates can be evicted from the cache during
- * connection close.
- *
- * Note that we are relying on a special case in __wt_txn_visible_all
- * that returns true during close when there is no pinned timestamp
- * set.
- */
- S2C(session)->txn_global.has_pinned_timestamp = false;
+ /*
+ * All application transactions have completed, ignore the pinned
+ * timestamp so that updates can be evicted from the cache during
+ * connection close.
+ *
+ * Note that we are relying on a special case in __wt_txn_visible_all
+ * that returns true during close when there is no pinned timestamp
+ * set.
+ */
+ S2C(session)->txn_global.has_pinned_timestamp = false;
}
/*
* __wt_verbose_dump_txn_one --
- * Output diagnostic information about a transaction structure.
+ * Output diagnostic information about a transaction structure.
*/
int
__wt_verbose_dump_txn_one(WT_SESSION_IMPL *session, WT_TXN *txn)
{
- const char *iso_tag;
- char ts_string[5][WT_TS_INT_STRING_SIZE];
-
- WT_NOT_READ(iso_tag, "INVALID");
- switch (txn->isolation) {
- case WT_ISO_READ_COMMITTED:
- iso_tag = "WT_ISO_READ_COMMITTED";
- break;
- case WT_ISO_READ_UNCOMMITTED:
- iso_tag = "WT_ISO_READ_UNCOMMITTED";
- break;
- case WT_ISO_SNAPSHOT:
- iso_tag = "WT_ISO_SNAPSHOT";
- break;
- }
- WT_RET(__wt_msg(session,
- "transaction id: %" PRIu64
- ", mod count: %u"
- ", snap min: %" PRIu64
- ", snap max: %" PRIu64
- ", snapshot count: %u"
- ", commit_timestamp: %s"
- ", durable_timestamp: %s"
- ", first_commit_timestamp: %s"
- ", prepare_timestamp: %s"
- ", read_timestamp: %s"
- ", checkpoint LSN: [%" PRIu32 "][%" PRIu32 "]"
- ", full checkpoint: %s"
- ", rollback reason: %s"
- ", flags: 0x%08" PRIx32
- ", isolation: %s",
- txn->id,
- txn->mod_count,
- txn->snap_min,
- txn->snap_max,
- txn->snapshot_count,
- __wt_timestamp_to_string(txn->commit_timestamp, ts_string[0]),
- __wt_timestamp_to_string(txn->durable_timestamp, ts_string[1]),
- __wt_timestamp_to_string(txn->first_commit_timestamp, ts_string[2]),
- __wt_timestamp_to_string(txn->prepare_timestamp, ts_string[3]),
- __wt_timestamp_to_string(txn->read_timestamp, ts_string[4]),
- txn->ckpt_lsn.l.file, txn->ckpt_lsn.l.offset,
- txn->full_ckpt ? "true" : "false",
- txn->rollback_reason == NULL ? "" : txn->rollback_reason,
- txn->flags,
- iso_tag));
- return (0);
+ char ts_string[5][WT_TS_INT_STRING_SIZE];
+ const char *iso_tag;
+
+ WT_NOT_READ(iso_tag, "INVALID");
+ switch (txn->isolation) {
+ case WT_ISO_READ_COMMITTED:
+ iso_tag = "WT_ISO_READ_COMMITTED";
+ break;
+ case WT_ISO_READ_UNCOMMITTED:
+ iso_tag = "WT_ISO_READ_UNCOMMITTED";
+ break;
+ case WT_ISO_SNAPSHOT:
+ iso_tag = "WT_ISO_SNAPSHOT";
+ break;
+ }
+ WT_RET(__wt_msg(session, "transaction id: %" PRIu64 ", mod count: %u"
+ ", snap min: %" PRIu64 ", snap max: %" PRIu64 ", snapshot count: %u"
+ ", commit_timestamp: %s"
+ ", durable_timestamp: %s"
+ ", first_commit_timestamp: %s"
+ ", prepare_timestamp: %s"
+ ", read_timestamp: %s"
+ ", checkpoint LSN: [%" PRIu32 "][%" PRIu32 "]"
+ ", full checkpoint: %s"
+ ", rollback reason: %s"
+ ", flags: 0x%08" PRIx32 ", isolation: %s",
+ txn->id, txn->mod_count, txn->snap_min, txn->snap_max, txn->snapshot_count,
+ __wt_timestamp_to_string(txn->commit_timestamp, ts_string[0]),
+ __wt_timestamp_to_string(txn->durable_timestamp, ts_string[1]),
+ __wt_timestamp_to_string(txn->first_commit_timestamp, ts_string[2]),
+ __wt_timestamp_to_string(txn->prepare_timestamp, ts_string[3]),
+ __wt_timestamp_to_string(txn->read_timestamp, ts_string[4]), txn->ckpt_lsn.l.file,
+ txn->ckpt_lsn.l.offset, txn->full_ckpt ? "true" : "false",
+ txn->rollback_reason == NULL ? "" : txn->rollback_reason, txn->flags, iso_tag));
+ return (0);
}
/*
* __wt_verbose_dump_txn --
- * Output diagnostic information about the global transaction state.
+ * Output diagnostic information about the global transaction state.
*/
int
__wt_verbose_dump_txn(WT_SESSION_IMPL *session)
{
- WT_CONNECTION_IMPL *conn;
- WT_SESSION_IMPL *sess;
- WT_TXN_GLOBAL *txn_global;
- WT_TXN_STATE *s;
- uint64_t id;
- uint32_t i, session_cnt;
- char ts_string[WT_TS_INT_STRING_SIZE];
-
- conn = S2C(session);
- txn_global = &conn->txn_global;
-
- WT_RET(__wt_msg(session, "%s", WT_DIVIDER));
- WT_RET(__wt_msg(session, "transaction state dump"));
-
- WT_RET(__wt_msg(session, "current ID: %" PRIu64, txn_global->current));
- WT_RET(__wt_msg(session,
- "last running ID: %" PRIu64, txn_global->last_running));
- WT_RET(__wt_msg(session,
- "metadata_pinned ID: %" PRIu64, txn_global->metadata_pinned));
- WT_RET(__wt_msg(session, "oldest ID: %" PRIu64, txn_global->oldest_id));
-
- WT_RET(__wt_msg(session, "durable timestamp: %s",
- __wt_timestamp_to_string(
- txn_global->durable_timestamp, ts_string)));
- WT_RET(__wt_msg(session, "oldest timestamp: %s",
- __wt_timestamp_to_string(txn_global->oldest_timestamp, ts_string)));
- WT_RET(__wt_msg(session, "pinned timestamp: %s",
- __wt_timestamp_to_string(txn_global->pinned_timestamp, ts_string)));
- WT_RET(__wt_msg(session, "stable timestamp: %s",
- __wt_timestamp_to_string(txn_global->stable_timestamp, ts_string)));
- WT_RET(__wt_msg(session, "has_durable_timestamp: %s",
- txn_global->has_durable_timestamp ? "yes" : "no"));
- WT_RET(__wt_msg(session, "has_oldest_timestamp: %s",
- txn_global->has_oldest_timestamp ? "yes" : "no"));
- WT_RET(__wt_msg(session, "has_pinned_timestamp: %s",
- txn_global->has_pinned_timestamp ? "yes" : "no"));
- WT_RET(__wt_msg(session, "has_stable_timestamp: %s",
- txn_global->has_stable_timestamp ? "yes" : "no"));
- WT_RET(__wt_msg(session, "oldest_is_pinned: %s",
- txn_global->oldest_is_pinned ? "yes" : "no"));
- WT_RET(__wt_msg(session, "stable_is_pinned: %s",
- txn_global->stable_is_pinned ? "yes" : "no"));
-
- WT_RET(__wt_msg(session, "checkpoint running: %s",
- txn_global->checkpoint_running ? "yes" : "no"));
- WT_RET(__wt_msg(session, "checkpoint generation: %" PRIu64,
- __wt_gen(session, WT_GEN_CHECKPOINT)));
- WT_RET(__wt_msg(session, "checkpoint pinned ID: %" PRIu64,
- txn_global->checkpoint_state.pinned_id));
- WT_RET(__wt_msg(session, "checkpoint txn ID: %" PRIu64,
- txn_global->checkpoint_state.id));
-
- WT_RET(__wt_msg(session,
- "oldest named snapshot ID: %" PRIu64, txn_global->nsnap_oldest_id));
-
- WT_ORDERED_READ(session_cnt, conn->session_cnt);
- WT_RET(__wt_msg(session, "session count: %" PRIu32, session_cnt));
- WT_RET(__wt_msg(session, "Transaction state of active sessions:"));
-
- /*
- * Walk each session transaction state and dump information. Accessing
- * the content of session handles is not thread safe, so some
- * information may change while traversing if other threads are active
- * at the same time, which is OK since this is diagnostic code.
- */
- for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) {
- /* Skip sessions with no active transaction */
- if ((id = s->id) == WT_TXN_NONE && s->pinned_id == WT_TXN_NONE)
- continue;
- sess = &conn->sessions[i];
- WT_RET(__wt_msg(session,
- "ID: %" PRIu64
- ", pinned ID: %" PRIu64
- ", metadata pinned ID: %" PRIu64
- ", name: %s",
- id, s->pinned_id, s->metadata_pinned,
- sess->name == NULL ?
- "EMPTY" : sess->name));
- WT_RET(__wt_verbose_dump_txn_one(session, &sess->txn));
- }
-
- return (0);
+ WT_CONNECTION_IMPL *conn;
+ WT_SESSION_IMPL *sess;
+ WT_TXN_GLOBAL *txn_global;
+ WT_TXN_STATE *s;
+ uint64_t id;
+ uint32_t i, session_cnt;
+ char ts_string[WT_TS_INT_STRING_SIZE];
+
+ conn = S2C(session);
+ txn_global = &conn->txn_global;
+
+ WT_RET(__wt_msg(session, "%s", WT_DIVIDER));
+ WT_RET(__wt_msg(session, "transaction state dump"));
+
+ WT_RET(__wt_msg(session, "current ID: %" PRIu64, txn_global->current));
+ WT_RET(__wt_msg(session, "last running ID: %" PRIu64, txn_global->last_running));
+ WT_RET(__wt_msg(session, "metadata_pinned ID: %" PRIu64, txn_global->metadata_pinned));
+ WT_RET(__wt_msg(session, "oldest ID: %" PRIu64, txn_global->oldest_id));
+
+ WT_RET(__wt_msg(session, "durable timestamp: %s",
+ __wt_timestamp_to_string(txn_global->durable_timestamp, ts_string)));
+ WT_RET(__wt_msg(session, "oldest timestamp: %s",
+ __wt_timestamp_to_string(txn_global->oldest_timestamp, ts_string)));
+ WT_RET(__wt_msg(session, "pinned timestamp: %s",
+ __wt_timestamp_to_string(txn_global->pinned_timestamp, ts_string)));
+ WT_RET(__wt_msg(session, "stable timestamp: %s",
+ __wt_timestamp_to_string(txn_global->stable_timestamp, ts_string)));
+ WT_RET(__wt_msg(
+ session, "has_durable_timestamp: %s", txn_global->has_durable_timestamp ? "yes" : "no"));
+ WT_RET(__wt_msg(
+ session, "has_oldest_timestamp: %s", txn_global->has_oldest_timestamp ? "yes" : "no"));
+ WT_RET(__wt_msg(
+ session, "has_pinned_timestamp: %s", txn_global->has_pinned_timestamp ? "yes" : "no"));
+ WT_RET(__wt_msg(
+ session, "has_stable_timestamp: %s", txn_global->has_stable_timestamp ? "yes" : "no"));
+ WT_RET(__wt_msg(session, "oldest_is_pinned: %s", txn_global->oldest_is_pinned ? "yes" : "no"));
+ WT_RET(__wt_msg(session, "stable_is_pinned: %s", txn_global->stable_is_pinned ? "yes" : "no"));
+
+ WT_RET(
+ __wt_msg(session, "checkpoint running: %s", txn_global->checkpoint_running ? "yes" : "no"));
+ WT_RET(
+ __wt_msg(session, "checkpoint generation: %" PRIu64, __wt_gen(session, WT_GEN_CHECKPOINT)));
+ WT_RET(
+ __wt_msg(session, "checkpoint pinned ID: %" PRIu64, txn_global->checkpoint_state.pinned_id));
+ WT_RET(__wt_msg(session, "checkpoint txn ID: %" PRIu64, txn_global->checkpoint_state.id));
+
+ WT_RET(__wt_msg(session, "oldest named snapshot ID: %" PRIu64, txn_global->nsnap_oldest_id));
+
+ WT_ORDERED_READ(session_cnt, conn->session_cnt);
+ WT_RET(__wt_msg(session, "session count: %" PRIu32, session_cnt));
+ WT_RET(__wt_msg(session, "Transaction state of active sessions:"));
+
+ /*
+ * Walk each session transaction state and dump information. Accessing the content of session
+ * handles is not thread safe, so some information may change while traversing if other threads
+ * are active at the same time, which is OK since this is diagnostic code.
+ */
+ for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) {
+ /* Skip sessions with no active transaction */
+ if ((id = s->id) == WT_TXN_NONE && s->pinned_id == WT_TXN_NONE)
+ continue;
+ sess = &conn->sessions[i];
+ WT_RET(__wt_msg(session,
+ "ID: %" PRIu64 ", pinned ID: %" PRIu64 ", metadata pinned ID: %" PRIu64 ", name: %s", id,
+ s->pinned_id, s->metadata_pinned, sess->name == NULL ? "EMPTY" : sess->name));
+ WT_RET(__wt_verbose_dump_txn_one(session, &sess->txn));
+ }
+
+ return (0);
}
diff --git a/src/third_party/wiredtiger/src/txn/txn_ckpt.c b/src/third_party/wiredtiger/src/txn/txn_ckpt.c
index ba3f4520e37..072406a25cc 100644
--- a/src/third_party/wiredtiger/src/txn/txn_ckpt.c
+++ b/src/third_party/wiredtiger/src/txn/txn_ckpt.c
@@ -9,1966 +9,1839 @@
#include "wt_internal.h"
static void __checkpoint_timing_stress(WT_SESSION_IMPL *);
-static int __checkpoint_lock_dirty_tree(
- WT_SESSION_IMPL *, bool, bool, bool, const char *[]);
+static int __checkpoint_lock_dirty_tree(WT_SESSION_IMPL *, bool, bool, bool, const char *[]);
static int __checkpoint_mark_skip(WT_SESSION_IMPL *, WT_CKPT *, bool);
static int __checkpoint_presync(WT_SESSION_IMPL *, const char *[]);
static int __checkpoint_tree_helper(WT_SESSION_IMPL *, const char *[]);
/*
* __checkpoint_name_ok --
- * Complain if the checkpoint name isn't acceptable.
+ * Complain if the checkpoint name isn't acceptable.
*/
static int
__checkpoint_name_ok(WT_SESSION_IMPL *session, const char *name, size_t len)
{
- /* Check for characters we don't want to see in a metadata file. */
- WT_RET(__wt_name_check(session, name, len));
-
- /*
- * The internal checkpoint name is special, applications aren't allowed
- * to use it. Be aggressive and disallow any matching prefix, it makes
- * things easier when checking in other places.
- */
- if (len < strlen(WT_CHECKPOINT))
- return (0);
- if (!WT_PREFIX_MATCH(name, WT_CHECKPOINT))
- return (0);
-
- WT_RET_MSG(session, EINVAL,
- "the checkpoint name \"%s\" is reserved", WT_CHECKPOINT);
+ /* Check for characters we don't want to see in a metadata file. */
+ WT_RET(__wt_name_check(session, name, len));
+
+ /*
+ * The internal checkpoint name is special, applications aren't allowed to use it. Be aggressive
+ * and disallow any matching prefix, it makes things easier when checking in other places.
+ */
+ if (len < strlen(WT_CHECKPOINT))
+ return (0);
+ if (!WT_PREFIX_MATCH(name, WT_CHECKPOINT))
+ return (0);
+
+ WT_RET_MSG(session, EINVAL, "the checkpoint name \"%s\" is reserved", WT_CHECKPOINT);
}
/*
* __checkpoint_name_check --
- * Check for an attempt to name a checkpoint that includes anything
- * other than a file object.
+ * Check for an attempt to name a checkpoint that includes anything other than a file object.
*/
static int
__checkpoint_name_check(WT_SESSION_IMPL *session, const char *uri)
{
- WT_CURSOR *cursor;
- WT_DECL_RET;
- const char *fail;
-
- cursor = NULL;
- fail = NULL;
-
- /*
- * This function exists as a place for this comment: named checkpoints
- * are only supported on file objects, and not on LSM trees. If a target
- * list is configured for the checkpoint, this function is called with
- * each target list entry; check the entry to make sure it's backed by
- * a file. If no target list is configured, confirm the metadata file
- * contains no non-file objects. Skip any internal system objects. We
- * don't want spurious error messages, other code will skip over them
- * and the user has no control over their existence.
- */
- if (uri == NULL) {
- WT_RET(__wt_metadata_cursor(session, &cursor));
- while ((ret = cursor->next(cursor)) == 0) {
- WT_ERR(cursor->get_key(cursor, &uri));
- if (!WT_PREFIX_MATCH(uri, "colgroup:") &&
- !WT_PREFIX_MATCH(uri, "file:") &&
- !WT_PREFIX_MATCH(uri, "index:") &&
- !WT_PREFIX_MATCH(uri, WT_SYSTEM_PREFIX) &&
- !WT_PREFIX_MATCH(uri, "table:")) {
- fail = uri;
- break;
- }
- }
- WT_ERR_NOTFOUND_OK(ret);
- } else
- if (!WT_PREFIX_MATCH(uri, "colgroup:") &&
- !WT_PREFIX_MATCH(uri, "file:") &&
- !WT_PREFIX_MATCH(uri, "index:") &&
- !WT_PREFIX_MATCH(uri, "table:"))
- fail = uri;
-
- if (fail != NULL)
- WT_ERR_MSG(session, EINVAL,
- "%s object does not support named checkpoints", fail);
-
-err: WT_TRET(__wt_metadata_cursor_release(session, &cursor));
- return (ret);
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+ const char *fail;
+
+ cursor = NULL;
+ fail = NULL;
+
+ /*
+ * This function exists as a place for this comment: named checkpoints are only supported on
+ * file objects, and not on LSM trees. If a target list is configured for the checkpoint, this
+ * function is called with each target list entry; check the entry to make sure it's backed by a
+ * file. If no target list is configured, confirm the metadata file contains no non-file
+ * objects. Skip any internal system objects. We don't want spurious error messages, other code
+ * will skip over them and the user has no control over their existence.
+ */
+ if (uri == NULL) {
+ WT_RET(__wt_metadata_cursor(session, &cursor));
+ while ((ret = cursor->next(cursor)) == 0) {
+ WT_ERR(cursor->get_key(cursor, &uri));
+ if (!WT_PREFIX_MATCH(uri, "colgroup:") && !WT_PREFIX_MATCH(uri, "file:") &&
+ !WT_PREFIX_MATCH(uri, "index:") && !WT_PREFIX_MATCH(uri, WT_SYSTEM_PREFIX) &&
+ !WT_PREFIX_MATCH(uri, "table:")) {
+ fail = uri;
+ break;
+ }
+ }
+ WT_ERR_NOTFOUND_OK(ret);
+ } else if (!WT_PREFIX_MATCH(uri, "colgroup:") && !WT_PREFIX_MATCH(uri, "file:") &&
+ !WT_PREFIX_MATCH(uri, "index:") && !WT_PREFIX_MATCH(uri, "table:"))
+ fail = uri;
+
+ if (fail != NULL)
+ WT_ERR_MSG(session, EINVAL, "%s object does not support named checkpoints", fail);
+
+err:
+ WT_TRET(__wt_metadata_cursor_release(session, &cursor));
+ return (ret);
}
/*
* __checkpoint_update_generation --
- * Update the checkpoint generation of the current tree.
- *
- * This indicates that the tree will not be visited again by the current
- * checkpoint.
+ * Update the checkpoint generation of the current tree. This indicates that the tree will not
+ * be visited again by the current checkpoint.
*/
static void
__checkpoint_update_generation(WT_SESSION_IMPL *session)
{
- WT_BTREE *btree;
+ WT_BTREE *btree;
- btree = S2BT(session);
+ btree = S2BT(session);
- /*
- * Updates to the metadata are made by the checkpoint transaction, so
- * the metadata tree's checkpoint generation should never be updated.
- */
- if (WT_IS_METADATA(session->dhandle))
- return;
+ /*
+ * Updates to the metadata are made by the checkpoint transaction, so the metadata tree's
+ * checkpoint generation should never be updated.
+ */
+ if (WT_IS_METADATA(session->dhandle))
+ return;
- WT_PUBLISH(btree->checkpoint_gen, __wt_gen(session, WT_GEN_CHECKPOINT));
- WT_STAT_DATA_SET(session,
- btree_checkpoint_generation, btree->checkpoint_gen);
+ WT_PUBLISH(btree->checkpoint_gen, __wt_gen(session, WT_GEN_CHECKPOINT));
+ WT_STAT_DATA_SET(session, btree_checkpoint_generation, btree->checkpoint_gen);
}
/*
* __checkpoint_apply_all --
- * Apply an operation to all files involved in a checkpoint.
+ * Apply an operation to all files involved in a checkpoint.
*/
static int
-__checkpoint_apply_all(WT_SESSION_IMPL *session, const char *cfg[],
- int (*op)(WT_SESSION_IMPL *, const char *[]))
+__checkpoint_apply_all(
+ WT_SESSION_IMPL *session, const char *cfg[], int (*op)(WT_SESSION_IMPL *, const char *[]))
{
- WT_CONFIG targetconf;
- WT_CONFIG_ITEM cval, k, v;
- WT_DECL_ITEM(tmp);
- WT_DECL_RET;
- bool ckpt_closed, named, target_list;
-
- target_list = false;
-
- /* Flag if this is a named checkpoint, and check if the name is OK. */
- WT_RET(__wt_config_gets(session, cfg, "name", &cval));
- named = cval.len != 0;
- if (named)
- WT_RET(__checkpoint_name_ok(session, cval.str, cval.len));
-
- /* Step through the targets and optionally operate on each one. */
- WT_ERR(__wt_config_gets(session, cfg, "target", &cval));
- __wt_config_subinit(session, &targetconf, &cval);
- while ((ret = __wt_config_next(&targetconf, &k, &v)) == 0) {
- if (!target_list) {
- WT_ERR(__wt_scr_alloc(session, 512, &tmp));
- target_list = true;
- }
-
- if (v.len != 0)
- WT_ERR_MSG(session, EINVAL,
- "invalid checkpoint target %.*s: URIs may require "
- "quoting",
- (int)cval.len, (char *)cval.str);
-
- /* Some objects don't support named checkpoints. */
- if (named)
- WT_ERR(__checkpoint_name_check(session, k.str));
-
- if (op == NULL)
- continue;
- WT_ERR(__wt_buf_fmt(session, tmp, "%.*s", (int)k.len, k.str));
- if ((ret = __wt_schema_worker(
- session, tmp->data, op, NULL, cfg, 0)) != 0)
- WT_ERR_MSG(session, ret, "%s", (const char *)tmp->data);
- }
- WT_ERR_NOTFOUND_OK(ret);
-
- if (!target_list && named)
- /* Some objects don't support named checkpoints. */
- WT_ERR(__checkpoint_name_check(session, NULL));
-
- if (!target_list && op != NULL) {
- /*
- * If the checkpoint is named or we're dropping checkpoints, we
- * checkpoint both open and closed files; else, only checkpoint
- * open files.
- *
- * XXX
- * We don't optimize unnamed checkpoints of a list of targets,
- * we open the targets and checkpoint them even if they are
- * quiescent and don't need a checkpoint, believing applications
- * unlikely to checkpoint a list of closed targets.
- */
- ckpt_closed = named;
- if (!ckpt_closed) {
- WT_ERR(__wt_config_gets(session, cfg, "drop", &cval));
- ckpt_closed = cval.len != 0;
- }
- WT_ERR(ckpt_closed ?
- __wt_meta_apply_all(session, op, NULL, cfg) :
- __wt_conn_btree_apply(session, NULL, op, NULL, cfg));
- }
-
-err: __wt_scr_free(session, &tmp);
- return (ret);
+ WT_CONFIG targetconf;
+ WT_CONFIG_ITEM cval, k, v;
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+ bool ckpt_closed, named, target_list;
+
+ target_list = false;
+
+ /* Flag if this is a named checkpoint, and check if the name is OK. */
+ WT_RET(__wt_config_gets(session, cfg, "name", &cval));
+ named = cval.len != 0;
+ if (named)
+ WT_RET(__checkpoint_name_ok(session, cval.str, cval.len));
+
+ /* Step through the targets and optionally operate on each one. */
+ WT_ERR(__wt_config_gets(session, cfg, "target", &cval));
+ __wt_config_subinit(session, &targetconf, &cval);
+ while ((ret = __wt_config_next(&targetconf, &k, &v)) == 0) {
+ if (!target_list) {
+ WT_ERR(__wt_scr_alloc(session, 512, &tmp));
+ target_list = true;
+ }
+
+ if (v.len != 0)
+ WT_ERR_MSG(session, EINVAL,
+ "invalid checkpoint target %.*s: URIs may require "
+ "quoting",
+ (int)cval.len, (char *)cval.str);
+
+ /* Some objects don't support named checkpoints. */
+ if (named)
+ WT_ERR(__checkpoint_name_check(session, k.str));
+
+ if (op == NULL)
+ continue;
+ WT_ERR(__wt_buf_fmt(session, tmp, "%.*s", (int)k.len, k.str));
+ if ((ret = __wt_schema_worker(session, tmp->data, op, NULL, cfg, 0)) != 0)
+ WT_ERR_MSG(session, ret, "%s", (const char *)tmp->data);
+ }
+ WT_ERR_NOTFOUND_OK(ret);
+
+ if (!target_list && named)
+ /* Some objects don't support named checkpoints. */
+ WT_ERR(__checkpoint_name_check(session, NULL));
+
+ if (!target_list && op != NULL) {
+ /*
+ * If the checkpoint is named or we're dropping checkpoints, we
+ * checkpoint both open and closed files; else, only checkpoint
+ * open files.
+ *
+ * XXX
+ * We don't optimize unnamed checkpoints of a list of targets,
+ * we open the targets and checkpoint them even if they are
+ * quiescent and don't need a checkpoint, believing applications
+ * unlikely to checkpoint a list of closed targets.
+ */
+ ckpt_closed = named;
+ if (!ckpt_closed) {
+ WT_ERR(__wt_config_gets(session, cfg, "drop", &cval));
+ ckpt_closed = cval.len != 0;
+ }
+ WT_ERR(ckpt_closed ? __wt_meta_apply_all(session, op, NULL, cfg) :
+ __wt_conn_btree_apply(session, NULL, op, NULL, cfg));
+ }
+
+err:
+ __wt_scr_free(session, &tmp);
+ return (ret);
}
/*
* __checkpoint_apply --
- * Apply an operation to all handles locked for a checkpoint.
+ * Apply an operation to all handles locked for a checkpoint.
*/
static int
-__checkpoint_apply(WT_SESSION_IMPL *session, const char *cfg[],
- int (*op)(WT_SESSION_IMPL *, const char *[]))
+__checkpoint_apply(
+ WT_SESSION_IMPL *session, const char *cfg[], int (*op)(WT_SESSION_IMPL *, const char *[]))
{
- WT_DECL_RET;
- u_int i;
-
- /* If we have already locked the handles, apply the operation. */
- for (i = 0; i < session->ckpt_handle_next; ++i) {
- if (session->ckpt_handle[i] == NULL)
- continue;
- WT_WITH_DHANDLE(session, session->ckpt_handle[i],
- ret = (*op)(session, cfg));
- WT_RET(ret);
- }
-
- return (0);
+ WT_DECL_RET;
+ u_int i;
+
+ /* If we have already locked the handles, apply the operation. */
+ for (i = 0; i < session->ckpt_handle_next; ++i) {
+ if (session->ckpt_handle[i] == NULL)
+ continue;
+ WT_WITH_DHANDLE(session, session->ckpt_handle[i], ret = (*op)(session, cfg));
+ WT_RET(ret);
+ }
+
+ return (0);
}
/*
* __checkpoint_data_source --
- * Checkpoint all data sources.
+ * Checkpoint all data sources.
*/
static int
__checkpoint_data_source(WT_SESSION_IMPL *session, const char *cfg[])
{
- WT_DATA_SOURCE *dsrc;
- WT_NAMED_DATA_SOURCE *ndsrc;
-
- /*
- * A place-holder, to support data sources: we assume calling the
- * underlying data-source session checkpoint function is sufficient to
- * checkpoint all objects in the data source, open or closed, and we
- * don't attempt to optimize the checkpoint of individual targets.
- * Those assumptions are not necessarily going to be true for all
- * data sources.
- *
- * It's not difficult to support data-source checkpoints of individual
- * targets (__wt_schema_worker is the underlying function that will do
- * the work, and it's already written to support data-sources, although
- * we'd probably need to pass the URI of the object to the data source
- * checkpoint function which we don't currently do). However, doing a
- * full data checkpoint is trickier: currently, the connection code is
- * written to ignore all objects other than "file:", and that code will
- * require significant changes to work with data sources.
- */
- TAILQ_FOREACH(ndsrc, &S2C(session)->dsrcqh, q) {
- dsrc = ndsrc->dsrc;
- if (dsrc->checkpoint != NULL)
- WT_RET(dsrc->checkpoint(dsrc,
- (WT_SESSION *)session, (WT_CONFIG_ARG *)cfg));
- }
- return (0);
+ WT_DATA_SOURCE *dsrc;
+ WT_NAMED_DATA_SOURCE *ndsrc;
+
+ /*
+ * A place-holder, to support data sources: we assume calling the
+ * underlying data-source session checkpoint function is sufficient to
+ * checkpoint all objects in the data source, open or closed, and we
+ * don't attempt to optimize the checkpoint of individual targets.
+ * Those assumptions are not necessarily going to be true for all
+ * data sources.
+ *
+ * It's not difficult to support data-source checkpoints of individual
+ * targets (__wt_schema_worker is the underlying function that will do
+ * the work, and it's already written to support data-sources, although
+ * we'd probably need to pass the URI of the object to the data source
+ * checkpoint function which we don't currently do). However, doing a
+ * full data checkpoint is trickier: currently, the connection code is
+ * written to ignore all objects other than "file:", and that code will
+ * require significant changes to work with data sources.
+ */
+ TAILQ_FOREACH (ndsrc, &S2C(session)->dsrcqh, q) {
+ dsrc = ndsrc->dsrc;
+ if (dsrc->checkpoint != NULL)
+ WT_RET(dsrc->checkpoint(dsrc, (WT_SESSION *)session, (WT_CONFIG_ARG *)cfg));
+ }
+ return (0);
}
/*
* __wt_checkpoint_get_handles --
- * Get a list of handles to flush.
+ * Get a list of handles to flush.
*/
int
__wt_checkpoint_get_handles(WT_SESSION_IMPL *session, const char *cfg[])
{
- WT_BTREE *btree;
- WT_CONFIG_ITEM cval;
- WT_DECL_RET;
- const char *name;
- bool force;
-
- /* Find out if we have to force a checkpoint. */
- WT_RET(__wt_config_gets_def(session, cfg, "force", 0, &cval));
- force = cval.val != 0;
- if (!force) {
- WT_RET(__wt_config_gets_def(session, cfg, "name", 0, &cval));
- force = cval.len != 0;
- }
-
- /* Should not be called with anything other than a live btree handle. */
- WT_ASSERT(session, session->dhandle->type == WT_DHANDLE_TYPE_BTREE &&
- session->dhandle->checkpoint == NULL);
-
- btree = S2BT(session);
-
- /* Skip files that are never involved in a checkpoint. */
- if (F_ISSET(btree, WT_BTREE_NO_CHECKPOINT))
- return (0);
-
- /*
- * We may have raced between starting the checkpoint transaction and
- * some operation completing on the handle that updated the metadata
- * (e.g., closing a bulk load cursor). All such operations either have
- * exclusive access to the handle or hold the schema lock. We are now
- * holding the schema lock and have an open btree handle, so if we
- * can't update the metadata, then there has been some state change
- * invisible to the checkpoint transaction.
- */
- if (!WT_IS_METADATA(session->dhandle)) {
- WT_CURSOR *meta_cursor;
-
- WT_ASSERT(session, !F_ISSET(&session->txn, WT_TXN_ERROR));
- WT_RET(__wt_metadata_cursor(session, &meta_cursor));
- meta_cursor->set_key(meta_cursor, session->dhandle->name);
- ret = __wt_curfile_insert_check(meta_cursor);
- if (ret == WT_ROLLBACK) {
- /*
- * If create or drop or any schema operation of a table
- * is with in an user transaction then checkpoint can
- * see the dhandle before the commit, which will lead
- * to the rollback error. We will ignore this dhandle as
- * part of this checkpoint by returning from here.
- */
- WT_TRET(__wt_metadata_cursor_release(session,
- &meta_cursor));
- return (0);
- }
- WT_TRET(__wt_metadata_cursor_release(session, &meta_cursor));
- WT_RET(ret);
- }
-
- /*
- * Decide whether the tree needs to be included in the checkpoint and
- * if so, acquire the necessary locks.
- */
- WT_SAVE_DHANDLE(session, ret = __checkpoint_lock_dirty_tree(
- session, true, force, true, cfg));
- WT_RET(ret);
- if (F_ISSET(btree, WT_BTREE_SKIP_CKPT)) {
- WT_ASSERT(session, btree->ckpt == NULL);
- __checkpoint_update_generation(session);
- return (0);
- }
-
- /*
- * Make sure there is space for the new entry: do this before getting
- * the handle to avoid cleanup if we can't allocate the memory.
- */
- WT_RET(__wt_realloc_def(session, &session->ckpt_handle_allocated,
- session->ckpt_handle_next + 1, &session->ckpt_handle));
-
- /*
- * The current tree will be included: get it again because the handle
- * we have is only valid for the duration of this function.
- */
- name = session->dhandle->name;
- session->dhandle = NULL;
-
- if ((ret = __wt_session_get_dhandle(session, name, NULL, NULL, 0)) != 0)
- return (ret == EBUSY ? 0 : ret);
-
- /*
- * Save the current eviction walk setting: checkpoint can interfere
- * with eviction and we don't want to unfairly penalize (or promote)
- * eviction in trees due to checkpoints.
- */
- btree->evict_walk_saved = btree->evict_walk_period;
-
- session->ckpt_handle[session->ckpt_handle_next++] = session->dhandle;
- return (0);
+ WT_BTREE *btree;
+ WT_CONFIG_ITEM cval;
+ WT_DECL_RET;
+ const char *name;
+ bool force;
+
+ /* Find out if we have to force a checkpoint. */
+ WT_RET(__wt_config_gets_def(session, cfg, "force", 0, &cval));
+ force = cval.val != 0;
+ if (!force) {
+ WT_RET(__wt_config_gets_def(session, cfg, "name", 0, &cval));
+ force = cval.len != 0;
+ }
+
+ /* Should not be called with anything other than a live btree handle. */
+ WT_ASSERT(session,
+ session->dhandle->type == WT_DHANDLE_TYPE_BTREE && session->dhandle->checkpoint == NULL);
+
+ btree = S2BT(session);
+
+ /* Skip files that are never involved in a checkpoint. */
+ if (F_ISSET(btree, WT_BTREE_NO_CHECKPOINT))
+ return (0);
+
+ /*
+ * We may have raced between starting the checkpoint transaction and
+ * some operation completing on the handle that updated the metadata
+ * (e.g., closing a bulk load cursor). All such operations either have
+ * exclusive access to the handle or hold the schema lock. We are now
+ * holding the schema lock and have an open btree handle, so if we
+ * can't update the metadata, then there has been some state change
+ * invisible to the checkpoint transaction.
+ */
+ if (!WT_IS_METADATA(session->dhandle)) {
+ WT_CURSOR *meta_cursor;
+
+ WT_ASSERT(session, !F_ISSET(&session->txn, WT_TXN_ERROR));
+ WT_RET(__wt_metadata_cursor(session, &meta_cursor));
+ meta_cursor->set_key(meta_cursor, session->dhandle->name);
+ ret = __wt_curfile_insert_check(meta_cursor);
+ if (ret == WT_ROLLBACK) {
+ /*
+ * If create or drop or any schema operation of a table is with in an user transaction
+ * then checkpoint can see the dhandle before the commit, which will lead to the
+ * rollback error. We will ignore this dhandle as part of this checkpoint by returning
+ * from here.
+ */
+ WT_TRET(__wt_metadata_cursor_release(session, &meta_cursor));
+ return (0);
+ }
+ WT_TRET(__wt_metadata_cursor_release(session, &meta_cursor));
+ WT_RET(ret);
+ }
+
+ /*
+ * Decide whether the tree needs to be included in the checkpoint and if so, acquire the
+ * necessary locks.
+ */
+ WT_SAVE_DHANDLE(session, ret = __checkpoint_lock_dirty_tree(session, true, force, true, cfg));
+ WT_RET(ret);
+ if (F_ISSET(btree, WT_BTREE_SKIP_CKPT)) {
+ WT_ASSERT(session, btree->ckpt == NULL);
+ __checkpoint_update_generation(session);
+ return (0);
+ }
+
+ /*
+ * Make sure there is space for the new entry: do this before getting the handle to avoid
+ * cleanup if we can't allocate the memory.
+ */
+ WT_RET(__wt_realloc_def(session, &session->ckpt_handle_allocated, session->ckpt_handle_next + 1,
+ &session->ckpt_handle));
+
+ /*
+ * The current tree will be included: get it again because the handle we have is only valid for
+ * the duration of this function.
+ */
+ name = session->dhandle->name;
+ session->dhandle = NULL;
+
+ if ((ret = __wt_session_get_dhandle(session, name, NULL, NULL, 0)) != 0)
+ return (ret == EBUSY ? 0 : ret);
+
+ /*
+ * Save the current eviction walk setting: checkpoint can interfere with eviction and we don't
+ * want to unfairly penalize (or promote) eviction in trees due to checkpoints.
+ */
+ btree->evict_walk_saved = btree->evict_walk_period;
+
+ session->ckpt_handle[session->ckpt_handle_next++] = session->dhandle;
+ return (0);
}
/*
* __checkpoint_reduce_dirty_cache --
- * Release clean trees from the list cached for checkpoints.
+ * Release clean trees from the list cached for checkpoints.
*/
static void
__checkpoint_reduce_dirty_cache(WT_SESSION_IMPL *session)
{
- WT_CACHE *cache;
- WT_CONNECTION_IMPL *conn;
- double current_dirty, prev_dirty;
- uint64_t bytes_written_start, bytes_written_total;
- uint64_t cache_size, max_write;
- uint64_t time_start, time_stop;
- uint64_t total_ms;
-
- conn = S2C(session);
- cache = conn->cache;
-
- /*
- * Give up if scrubbing is disabled, including when checkpointing with
- * a timestamp on close (we can't evict dirty pages in that case, so
- * scrubbing cannot help).
- */
- if (F_ISSET(conn, WT_CONN_CLOSING_TIMESTAMP) ||
- cache->eviction_checkpoint_target < DBL_EPSILON)
- return;
-
- time_start = __wt_clock(session);
- bytes_written_start = cache->bytes_written;
-
- /*
- * If the cache size is zero or very small, we're done. The cache
- * size can briefly become zero if we're transitioning to a shared
- * cache via reconfigure. This avoids potential divide by zero.
- */
- if ((cache_size = conn->cache_size) < 10 * WT_MEGABYTE)
- return;
-
- current_dirty =
- (100.0 * __wt_cache_dirty_leaf_inuse(cache)) / cache_size;
- if (current_dirty <= cache->eviction_checkpoint_target)
- return;
-
- /* Stop if we write as much dirty data as is currently in cache. */
- max_write = __wt_cache_dirty_leaf_inuse(cache);
-
- /* Set the dirty trigger to the target value. */
- cache->eviction_scrub_target = cache->eviction_checkpoint_target;
- WT_STAT_CONN_SET(session, txn_checkpoint_scrub_target, 0);
-
- /* Wait while the dirty level is going down. */
- for (;;) {
- __wt_sleep(0, 100 * WT_THOUSAND);
-
- prev_dirty = current_dirty;
- current_dirty =
- (100.0 * __wt_cache_dirty_leaf_inuse(cache)) / cache_size;
- if (current_dirty <= cache->eviction_checkpoint_target ||
- current_dirty >= prev_dirty)
- break;
-
- /*
- * Don't scrub when the lookaside table is in use: scrubbing is
- * counter-productive in that case.
- */
- if (F_ISSET(cache, WT_CACHE_EVICT_LOOKASIDE))
- break;
-
- /*
- * We haven't reached the current target.
- *
- * Don't wait indefinitely: there might be dirty pages
- * that can't be evicted. If we can't meet the target,
- * give up and start the checkpoint for real.
- */
- bytes_written_total =
- cache->bytes_written - bytes_written_start;
- if (bytes_written_total > max_write)
- break;
- }
-
- time_stop = __wt_clock(session);
- total_ms = WT_CLOCKDIFF_MS(time_stop, time_start);
- WT_STAT_CONN_SET(session, txn_checkpoint_scrub_time, total_ms);
+ WT_CACHE *cache;
+ WT_CONNECTION_IMPL *conn;
+ double current_dirty, prev_dirty;
+ uint64_t bytes_written_start, bytes_written_total;
+ uint64_t cache_size, max_write;
+ uint64_t time_start, time_stop;
+ uint64_t total_ms;
+
+ conn = S2C(session);
+ cache = conn->cache;
+
+ /*
+ * Give up if scrubbing is disabled, including when checkpointing with a timestamp on close (we
+ * can't evict dirty pages in that case, so scrubbing cannot help).
+ */
+ if (F_ISSET(conn, WT_CONN_CLOSING_TIMESTAMP) || cache->eviction_checkpoint_target < DBL_EPSILON)
+ return;
+
+ time_start = __wt_clock(session);
+ bytes_written_start = cache->bytes_written;
+
+ /*
+ * If the cache size is zero or very small, we're done. The cache size can briefly become zero
+ * if we're transitioning to a shared cache via reconfigure. This avoids potential divide by
+ * zero.
+ */
+ if ((cache_size = conn->cache_size) < 10 * WT_MEGABYTE)
+ return;
+
+ current_dirty = (100.0 * __wt_cache_dirty_leaf_inuse(cache)) / cache_size;
+ if (current_dirty <= cache->eviction_checkpoint_target)
+ return;
+
+ /* Stop if we write as much dirty data as is currently in cache. */
+ max_write = __wt_cache_dirty_leaf_inuse(cache);
+
+ /* Set the dirty trigger to the target value. */
+ cache->eviction_scrub_target = cache->eviction_checkpoint_target;
+ WT_STAT_CONN_SET(session, txn_checkpoint_scrub_target, 0);
+
+ /* Wait while the dirty level is going down. */
+ for (;;) {
+ __wt_sleep(0, 100 * WT_THOUSAND);
+
+ prev_dirty = current_dirty;
+ current_dirty = (100.0 * __wt_cache_dirty_leaf_inuse(cache)) / cache_size;
+ if (current_dirty <= cache->eviction_checkpoint_target || current_dirty >= prev_dirty)
+ break;
+
+ /*
+ * Don't scrub when the lookaside table is in use: scrubbing is counter-productive in that
+ * case.
+ */
+ if (F_ISSET(cache, WT_CACHE_EVICT_LOOKASIDE))
+ break;
+
+ /*
+ * We haven't reached the current target.
+ *
+ * Don't wait indefinitely: there might be dirty pages
+ * that can't be evicted. If we can't meet the target,
+ * give up and start the checkpoint for real.
+ */
+ bytes_written_total = cache->bytes_written - bytes_written_start;
+ if (bytes_written_total > max_write)
+ break;
+ }
+
+ time_stop = __wt_clock(session);
+ total_ms = WT_CLOCKDIFF_MS(time_stop, time_start);
+ WT_STAT_CONN_SET(session, txn_checkpoint_scrub_time, total_ms);
}
/*
* __wt_checkpoint_progress --
- * Output a checkpoint progress message.
+ * Output a checkpoint progress message.
*/
void
__wt_checkpoint_progress(WT_SESSION_IMPL *session, bool closing)
{
- struct timespec cur_time;
- WT_CONNECTION_IMPL *conn;
- uint64_t time_diff;
-
- conn = S2C(session);
- __wt_epoch(session, &cur_time);
-
- /* Time since the full database checkpoint started */
- time_diff = WT_TIMEDIFF_SEC(cur_time,
- conn->ckpt_timer_start);
-
- if (closing || (time_diff / WT_PROGRESS_MSG_PERIOD) >
- conn->ckpt_progress_msg_count) {
- __wt_verbose(session, WT_VERB_CHECKPOINT_PROGRESS,
- "Checkpoint %s for %" PRIu64
- " seconds and wrote: %" PRIu64 " pages (%" PRIu64 " MB)",
- closing ? "ran" : "has been running",
- time_diff, conn->ckpt_write_pages,
- conn->ckpt_write_bytes / WT_MEGABYTE);
- conn->ckpt_progress_msg_count++;
- }
+ struct timespec cur_time;
+ WT_CONNECTION_IMPL *conn;
+ uint64_t time_diff;
+
+ conn = S2C(session);
+ __wt_epoch(session, &cur_time);
+
+ /* Time since the full database checkpoint started */
+ time_diff = WT_TIMEDIFF_SEC(cur_time, conn->ckpt_timer_start);
+
+ if (closing || (time_diff / WT_PROGRESS_MSG_PERIOD) > conn->ckpt_progress_msg_count) {
+ __wt_verbose(session, WT_VERB_CHECKPOINT_PROGRESS,
+ "Checkpoint %s for %" PRIu64 " seconds and wrote: %" PRIu64 " pages (%" PRIu64 " MB)",
+ closing ? "ran" : "has been running", time_diff, conn->ckpt_write_pages,
+ conn->ckpt_write_bytes / WT_MEGABYTE);
+ conn->ckpt_progress_msg_count++;
+ }
}
/*
* __checkpoint_stats --
- * Update checkpoint timer stats.
+ * Update checkpoint timer stats.
*/
static void
__checkpoint_stats(WT_SESSION_IMPL *session)
{
- struct timespec stop;
- WT_CONNECTION_IMPL *conn;
- uint64_t msec;
+ struct timespec stop;
+ WT_CONNECTION_IMPL *conn;
+ uint64_t msec;
- conn = S2C(session);
+ conn = S2C(session);
- /* Output a verbose progress message for long running checkpoints */
- if (conn->ckpt_progress_msg_count > 0)
- __wt_checkpoint_progress(session, true);
+ /* Output a verbose progress message for long running checkpoints */
+ if (conn->ckpt_progress_msg_count > 0)
+ __wt_checkpoint_progress(session, true);
- __wt_epoch(session, &stop);
- msec = WT_TIMEDIFF_MS(stop, conn->ckpt_timer_scrub_end);
+ __wt_epoch(session, &stop);
+ msec = WT_TIMEDIFF_MS(stop, conn->ckpt_timer_scrub_end);
- if (msec > conn->ckpt_time_max)
- conn->ckpt_time_max = msec;
- if (conn->ckpt_time_min == 0 || msec < conn->ckpt_time_min)
- conn->ckpt_time_min = msec;
- conn->ckpt_time_recent = msec;
- conn->ckpt_time_total += msec;
+ if (msec > conn->ckpt_time_max)
+ conn->ckpt_time_max = msec;
+ if (conn->ckpt_time_min == 0 || msec < conn->ckpt_time_min)
+ conn->ckpt_time_min = msec;
+ conn->ckpt_time_recent = msec;
+ conn->ckpt_time_total += msec;
}
/*
* __checkpoint_verbose_track --
- * Output a verbose message with timing information
+ * Output a verbose message with timing information
*/
static void
__checkpoint_verbose_track(WT_SESSION_IMPL *session, const char *msg)
{
- struct timespec stop;
- WT_CONNECTION_IMPL *conn;
- uint64_t msec;
+ struct timespec stop;
+ WT_CONNECTION_IMPL *conn;
+ uint64_t msec;
- if (!WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT))
- return;
+ if (!WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT))
+ return;
- conn = S2C(session);
- __wt_epoch(session, &stop);
-
- /* Get time diff in milliseconds. */
- msec = WT_TIMEDIFF_MS(stop, conn->ckpt_timer_start);
- __wt_verbose(session,
- WT_VERB_CHECKPOINT, "time: %" PRIu64 " ms, gen: %" PRIu64
- ": Full database checkpoint %s",
- msec, __wt_gen(session, WT_GEN_CHECKPOINT), msg);
+ conn = S2C(session);
+ __wt_epoch(session, &stop);
+ /* Get time diff in milliseconds. */
+ msec = WT_TIMEDIFF_MS(stop, conn->ckpt_timer_start);
+ __wt_verbose(session, WT_VERB_CHECKPOINT,
+ "time: %" PRIu64 " ms, gen: %" PRIu64 ": Full database checkpoint %s", msec,
+ __wt_gen(session, WT_GEN_CHECKPOINT), msg);
}
/*
* __checkpoint_fail_reset --
- * Reset fields when a failure occurs.
+ * Reset fields when a failure occurs.
*/
static void
__checkpoint_fail_reset(WT_SESSION_IMPL *session)
{
- WT_BTREE *btree;
+ WT_BTREE *btree;
- btree = S2BT(session);
- btree->modified = true;
- __wt_meta_ckptlist_free(session, &btree->ckpt);
+ btree = S2BT(session);
+ btree->modified = true;
+ __wt_meta_ckptlist_free(session, &btree->ckpt);
}
/*
* __checkpoint_prepare --
- * Start the transaction for a checkpoint and gather handles.
+ * Start the transaction for a checkpoint and gather handles.
*/
static int
-__checkpoint_prepare(
- WT_SESSION_IMPL *session, bool *trackingp, const char *cfg[])
+__checkpoint_prepare(WT_SESSION_IMPL *session, bool *trackingp, const char *cfg[])
{
- WT_CONFIG_ITEM cval;
- WT_CONNECTION_IMPL *conn;
- WT_DECL_RET;
- WT_TXN *txn;
- WT_TXN_GLOBAL *txn_global;
- WT_TXN_STATE *txn_state;
- const char *txn_cfg[] = { WT_CONFIG_BASE(session,
- WT_SESSION_begin_transaction), "isolation=snapshot", NULL };
- bool use_timestamp;
-
- conn = S2C(session);
- txn = &session->txn;
- txn_global = &conn->txn_global;
- txn_state = WT_SESSION_TXN_STATE(session);
-
- WT_RET(__wt_config_gets(session, cfg, "use_timestamp", &cval));
- use_timestamp = (cval.val != 0);
-
- /*
- * Start a snapshot transaction for the checkpoint.
- *
- * Note: we don't go through the public API calls because they have
- * side effects on cursors, which applications can hold open across
- * calls to checkpoint.
- */
- WT_RET(__wt_txn_begin(session, txn_cfg));
-
- WT_DIAGNOSTIC_YIELD;
-
- /* Ensure a transaction ID is allocated prior to sharing it globally */
- WT_RET(__wt_txn_id_check(session));
-
- /* Keep track of handles acquired for locking. */
- WT_RET(__wt_meta_track_on(session));
- *trackingp = true;
-
- /*
- * Mark the connection as clean. If some data gets modified after
- * generating checkpoint transaction id, connection will be reset to
- * dirty when reconciliation marks the btree dirty on encountering the
- * dirty page.
- */
- conn->modified = false;
-
- /*
- * Save the checkpoint session ID.
- *
- * We never do checkpoints in the default session (with id zero).
- */
- WT_ASSERT(session, session->id != 0 && txn_global->checkpoint_id == 0);
- txn_global->checkpoint_id = session->id;
-
- /*
- * Remove the checkpoint transaction from the global table.
- *
- * This allows ordinary visibility checks to move forward because
- * checkpoints often take a long time and only write to the metadata.
- */
- __wt_writelock(session, &txn_global->rwlock);
- txn_global->checkpoint_state = *txn_state;
- txn_global->checkpoint_state.pinned_id = txn->snap_min;
-
- /*
- * Sanity check that the oldest ID hasn't moved on before we have
- * cleared our entry.
- */
- WT_ASSERT(session,
- WT_TXNID_LE(txn_global->oldest_id, txn_state->id) &&
- WT_TXNID_LE(txn_global->oldest_id, txn_state->pinned_id));
-
- /*
- * Clear our entry from the global transaction session table. Any
- * operation that needs to know about the ID for this checkpoint will
- * consider the checkpoint ID in the global structure. Most operations
- * can safely ignore the checkpoint ID (see the visible all check for
- * details).
- */
- txn_state->id = txn_state->pinned_id =
- txn_state->metadata_pinned = WT_TXN_NONE;
-
- /*
- * Set the checkpoint transaction's timestamp, if requested.
- *
- * We rely on having the global transaction data locked so the oldest
- * timestamp can't move past the stable timestamp.
- */
- WT_ASSERT(session, !F_ISSET(txn,
- WT_TXN_HAS_TS_COMMIT | WT_TXN_HAS_TS_READ |
- WT_TXN_TS_PUBLISHED | WT_TXN_PUBLIC_TS_READ));
-
- if (use_timestamp) {
- /*
- * If the user wants timestamps then set the metadata
- * checkpoint timestamp based on whether or not a stable
- * timestamp is actually in use. Only set it when we're not
- * running recovery because recovery doesn't set the recovery
- * timestamp until its checkpoint is complete.
- */
- if (txn_global->has_stable_timestamp) {
- txn->read_timestamp = txn_global->stable_timestamp;
- txn_global->checkpoint_timestamp = txn->read_timestamp;
- F_SET(txn, WT_TXN_HAS_TS_READ);
- if (!F_ISSET(conn, WT_CONN_RECOVERING))
- txn_global->meta_ckpt_timestamp =
- txn->read_timestamp;
- } else if (!F_ISSET(conn, WT_CONN_RECOVERING))
- txn_global->meta_ckpt_timestamp =
- txn_global->recovery_timestamp;
- } else if (!F_ISSET(conn, WT_CONN_RECOVERING))
- txn_global->meta_ckpt_timestamp = 0;
-
- __wt_writeunlock(session, &txn_global->rwlock);
-
- if (F_ISSET(txn, WT_TXN_HAS_TS_READ)) {
- __wt_verbose_timestamp(session, txn->read_timestamp,
- "Checkpoint requested at stable timestamp");
-
- /*
- * The snapshot we established when the transaction started may
- * be too early to match the timestamp we just read.
- *
- * Get a new one.
- */
- __wt_txn_get_snapshot(session);
- }
-
- /*
- * Get a list of handles we want to flush; for named checkpoints this
- * may pull closed objects into the session cache.
- *
- * First, gather all handles, then start the checkpoint transaction,
- * then release any clean handles.
- */
- WT_ASSERT(session, session->ckpt_handle_next == 0);
- WT_WITH_TABLE_READ_LOCK(session, ret =
- __checkpoint_apply_all(session, cfg, __wt_checkpoint_get_handles));
- return (ret);
+ WT_CONFIG_ITEM cval;
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_TXN *txn;
+ WT_TXN_GLOBAL *txn_global;
+ WT_TXN_STATE *txn_state;
+ const char *txn_cfg[] = {
+ WT_CONFIG_BASE(session, WT_SESSION_begin_transaction), "isolation=snapshot", NULL};
+ bool use_timestamp;
+
+ conn = S2C(session);
+ txn = &session->txn;
+ txn_global = &conn->txn_global;
+ txn_state = WT_SESSION_TXN_STATE(session);
+
+ WT_RET(__wt_config_gets(session, cfg, "use_timestamp", &cval));
+ use_timestamp = (cval.val != 0);
+
+ /*
+ * Start a snapshot transaction for the checkpoint.
+ *
+ * Note: we don't go through the public API calls because they have
+ * side effects on cursors, which applications can hold open across
+ * calls to checkpoint.
+ */
+ WT_RET(__wt_txn_begin(session, txn_cfg));
+
+ WT_DIAGNOSTIC_YIELD;
+
+ /* Ensure a transaction ID is allocated prior to sharing it globally */
+ WT_RET(__wt_txn_id_check(session));
+
+ /* Keep track of handles acquired for locking. */
+ WT_RET(__wt_meta_track_on(session));
+ *trackingp = true;
+
+ /*
+ * Mark the connection as clean. If some data gets modified after generating checkpoint
+ * transaction id, connection will be reset to dirty when reconciliation marks the btree dirty
+ * on encountering the dirty page.
+ */
+ conn->modified = false;
+
+ /*
+ * Save the checkpoint session ID.
+ *
+ * We never do checkpoints in the default session (with id zero).
+ */
+ WT_ASSERT(session, session->id != 0 && txn_global->checkpoint_id == 0);
+ txn_global->checkpoint_id = session->id;
+
+ /*
+ * Remove the checkpoint transaction from the global table.
+ *
+ * This allows ordinary visibility checks to move forward because
+ * checkpoints often take a long time and only write to the metadata.
+ */
+ __wt_writelock(session, &txn_global->rwlock);
+ txn_global->checkpoint_state = *txn_state;
+ txn_global->checkpoint_state.pinned_id = txn->snap_min;
+
+ /*
+ * Sanity check that the oldest ID hasn't moved on before we have cleared our entry.
+ */
+ WT_ASSERT(session, WT_TXNID_LE(txn_global->oldest_id, txn_state->id) &&
+ WT_TXNID_LE(txn_global->oldest_id, txn_state->pinned_id));
+
+ /*
+ * Clear our entry from the global transaction session table. Any operation that needs to know
+ * about the ID for this checkpoint will consider the checkpoint ID in the global structure.
+ * Most operations can safely ignore the checkpoint ID (see the visible all check for details).
+ */
+ txn_state->id = txn_state->pinned_id = txn_state->metadata_pinned = WT_TXN_NONE;
+
+ /*
+ * Set the checkpoint transaction's timestamp, if requested.
+ *
+ * We rely on having the global transaction data locked so the oldest
+ * timestamp can't move past the stable timestamp.
+ */
+ WT_ASSERT(session, !F_ISSET(txn, WT_TXN_HAS_TS_COMMIT | WT_TXN_HAS_TS_READ |
+ WT_TXN_TS_PUBLISHED | WT_TXN_PUBLIC_TS_READ));
+
+ if (use_timestamp) {
+ /*
+ * If the user wants timestamps then set the metadata checkpoint timestamp based on whether
+ * or not a stable timestamp is actually in use. Only set it when we're not running recovery
+ * because recovery doesn't set the recovery timestamp until its checkpoint is complete.
+ */
+ if (txn_global->has_stable_timestamp) {
+ txn->read_timestamp = txn_global->stable_timestamp;
+ txn_global->checkpoint_timestamp = txn->read_timestamp;
+ F_SET(txn, WT_TXN_HAS_TS_READ);
+ if (!F_ISSET(conn, WT_CONN_RECOVERING))
+ txn_global->meta_ckpt_timestamp = txn->read_timestamp;
+ } else if (!F_ISSET(conn, WT_CONN_RECOVERING))
+ txn_global->meta_ckpt_timestamp = txn_global->recovery_timestamp;
+ } else if (!F_ISSET(conn, WT_CONN_RECOVERING))
+ txn_global->meta_ckpt_timestamp = 0;
+
+ __wt_writeunlock(session, &txn_global->rwlock);
+
+ if (F_ISSET(txn, WT_TXN_HAS_TS_READ)) {
+ __wt_verbose_timestamp(
+ session, txn->read_timestamp, "Checkpoint requested at stable timestamp");
+
+ /*
+ * The snapshot we established when the transaction started may
+ * be too early to match the timestamp we just read.
+ *
+ * Get a new one.
+ */
+ __wt_txn_get_snapshot(session);
+ }
+
+ /*
+ * Get a list of handles we want to flush; for named checkpoints this
+ * may pull closed objects into the session cache.
+ *
+ * First, gather all handles, then start the checkpoint transaction,
+ * then release any clean handles.
+ */
+ WT_ASSERT(session, session->ckpt_handle_next == 0);
+ WT_WITH_TABLE_READ_LOCK(
+ session, ret = __checkpoint_apply_all(session, cfg, __wt_checkpoint_get_handles));
+ return (ret);
}
/*
* __txn_checkpoint_can_skip --
- * Determine whether it's safe to skip taking a checkpoint.
+ * Determine whether it's safe to skip taking a checkpoint.
*/
static int
-__txn_checkpoint_can_skip(WT_SESSION_IMPL *session,
- const char *cfg[], bool *fullp, bool *use_timestampp, bool *can_skipp)
+__txn_checkpoint_can_skip(
+ WT_SESSION_IMPL *session, const char *cfg[], bool *fullp, bool *use_timestampp, bool *can_skipp)
{
- WT_CONFIG targetconf;
- WT_CONFIG_ITEM cval, k, v;
- WT_CONNECTION_IMPL *conn;
- WT_TXN_GLOBAL *txn_global;
- bool full, use_timestamp;
-
- /*
- * Default to not skipping - also initialize the other output
- * parameters - even though they will always be initialized unless
- * there is an error and callers need to ignore the results on error.
- */
- *can_skipp = *fullp = *use_timestampp = false;
-
- conn = S2C(session);
- txn_global = &conn->txn_global;
-
- /*
- * This function also parses out some configuration options and hands
- * them back to the caller - make sure it does that parsing regardless
- * of the result.
- *
- * Determine if this is going to be a full checkpoint, that is a
- * checkpoint that applies to all data tables in a database.
- */
- WT_RET(__wt_config_gets(session, cfg, "target", &cval));
- __wt_config_subinit(session, &targetconf, &cval);
- *fullp = full = __wt_config_next(&targetconf, &k, &v) != 0;
-
- WT_RET(__wt_config_gets(session, cfg, "use_timestamp", &cval));
- *use_timestampp = use_timestamp = cval.val != 0;
-
- /* Never skip non-full checkpoints */
- if (!full)
- return (0);
-
- /* Never skip if force is configured. */
- WT_RET(__wt_config_gets_def(session, cfg, "force", 0, &cval));
- if (cval.val != 0)
- return (0);
-
- /* Never skip named checkpoints. */
- WT_RET(__wt_config_gets(session, cfg, "name", &cval));
- if (cval.len != 0)
- return (0);
-
- /*
- * It isn't currently safe to skip timestamp checkpoints - see WT-4958.
- * We should fix this so we can skip timestamp checkpoints if they
- * don't have new content.
- */
- if (use_timestamp)
- return (0);
-
- /*
- * Skip checkpointing the database if nothing has been dirtied since
- * the last checkpoint. That said there can be short instances when a
- * btree gets marked dirty and the connection is yet to be. We might
- * skip a checkpoint in that short instance, which is okay because by
- * the next time we get to checkpoint, the connection would have been
- * marked dirty and hence the checkpoint will not be skipped again.
- */
- if (!conn->modified) {
- *can_skipp = true;
- return (0);
- }
-
- /*
- * If the checkpoint is using timestamps, and the stable timestamp
- * hasn't been updated since the last checkpoint there is nothing
- * more that could be written.
- */
- if (use_timestamp && txn_global->has_stable_timestamp &&
- txn_global->last_ckpt_timestamp != WT_TS_NONE &&
- txn_global->last_ckpt_timestamp == txn_global->stable_timestamp) {
- *can_skipp = true;
- return (0);
- }
-
- return (0);
+ WT_CONFIG targetconf;
+ WT_CONFIG_ITEM cval, k, v;
+ WT_CONNECTION_IMPL *conn;
+ WT_TXN_GLOBAL *txn_global;
+ bool full, use_timestamp;
+
+ /*
+ * Default to not skipping - also initialize the other output parameters - even though they will
+ * always be initialized unless there is an error and callers need to ignore the results on
+ * error.
+ */
+ *can_skipp = *fullp = *use_timestampp = false;
+
+ conn = S2C(session);
+ txn_global = &conn->txn_global;
+
+ /*
+ * This function also parses out some configuration options and hands
+ * them back to the caller - make sure it does that parsing regardless
+ * of the result.
+ *
+ * Determine if this is going to be a full checkpoint, that is a
+ * checkpoint that applies to all data tables in a database.
+ */
+ WT_RET(__wt_config_gets(session, cfg, "target", &cval));
+ __wt_config_subinit(session, &targetconf, &cval);
+ *fullp = full = __wt_config_next(&targetconf, &k, &v) != 0;
+
+ WT_RET(__wt_config_gets(session, cfg, "use_timestamp", &cval));
+ *use_timestampp = use_timestamp = cval.val != 0;
+
+ /* Never skip non-full checkpoints */
+ if (!full)
+ return (0);
+
+ /* Never skip if force is configured. */
+ WT_RET(__wt_config_gets_def(session, cfg, "force", 0, &cval));
+ if (cval.val != 0)
+ return (0);
+
+ /* Never skip named checkpoints. */
+ WT_RET(__wt_config_gets(session, cfg, "name", &cval));
+ if (cval.len != 0)
+ return (0);
+
+ /*
+ * It isn't currently safe to skip timestamp checkpoints - see WT-4958. We should fix this so we
+ * can skip timestamp checkpoints if they don't have new content.
+ */
+ if (use_timestamp)
+ return (0);
+
+ /*
+ * Skip checkpointing the database if nothing has been dirtied since the last checkpoint. That
+ * said there can be short instances when a btree gets marked dirty and the connection is yet to
+ * be. We might skip a checkpoint in that short instance, which is okay because by the next time
+ * we get to checkpoint, the connection would have been marked dirty and hence the checkpoint
+ * will not be skipped again.
+ */
+ if (!conn->modified) {
+ *can_skipp = true;
+ return (0);
+ }
+
+ /*
+ * If the checkpoint is using timestamps, and the stable timestamp hasn't been updated since the
+ * last checkpoint there is nothing more that could be written.
+ */
+ if (use_timestamp && txn_global->has_stable_timestamp &&
+ txn_global->last_ckpt_timestamp != WT_TS_NONE &&
+ txn_global->last_ckpt_timestamp == txn_global->stable_timestamp) {
+ *can_skipp = true;
+ return (0);
+ }
+
+ return (0);
}
/*
* __txn_checkpoint --
- * Checkpoint a database or a list of objects in the database.
+ * Checkpoint a database or a list of objects in the database.
*/
static int
__txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
{
- WT_CACHE *cache;
- WT_CONNECTION_IMPL *conn;
- WT_DECL_RET;
- WT_TXN *txn;
- WT_TXN_GLOBAL *txn_global;
- WT_TXN_ISOLATION saved_isolation;
- wt_timestamp_t ckpt_tmp_ts;
- uint64_t fsync_duration_usecs, generation, time_start, time_stop;
- u_int i;
- bool can_skip, failed, full, idle, logging, tracking, use_timestamp;
- void *saved_meta_next;
-
- conn = S2C(session);
- cache = conn->cache;
- txn = &session->txn;
- txn_global = &conn->txn_global;
- saved_isolation = session->isolation;
- full = idle = logging = tracking = use_timestamp = false;
-
- /* Avoid doing work if possible. */
- WT_RET(__txn_checkpoint_can_skip(session,
- cfg, &full, &use_timestamp, &can_skip));
- if (can_skip) {
- WT_STAT_CONN_INCR(session, txn_checkpoint_skipped);
- return (0);
- }
-
- /*
- * Do a pass over the configuration arguments and figure out what kind
- * of checkpoint this is.
- */
- WT_RET(__checkpoint_apply_all(session, cfg, NULL));
-
- logging = FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED);
-
- /* Reset the maximum page size seen by eviction. */
- conn->cache->evict_max_page_size = 0;
-
- /* Initialize the verbose tracking timer */
- __wt_epoch(session, &conn->ckpt_timer_start);
-
- /* Initialize the checkpoint progress tracking data */
- conn->ckpt_progress_msg_count = 0;
- conn->ckpt_write_bytes = 0;
- conn->ckpt_write_pages = 0;
-
- /*
- * Update the global oldest ID so we do all possible cleanup.
- *
- * This is particularly important for compact, so that all dirty pages
- * can be fully written.
- */
- WT_ERR(__wt_txn_update_oldest(
- session, WT_TXN_OLDEST_STRICT | WT_TXN_OLDEST_WAIT));
-
- /* Flush data-sources before we start the checkpoint. */
- WT_ERR(__checkpoint_data_source(session, cfg));
-
- /*
- * Try to reduce the amount of dirty data in cache so there is less
- * work do during the critical section of the checkpoint.
- */
- __checkpoint_reduce_dirty_cache(session);
-
- /* Tell logging that we are about to start a database checkpoint. */
- if (full && logging)
- WT_ERR(__wt_txn_checkpoint_log(
- session, full, WT_TXN_LOG_CKPT_PREPARE, NULL));
-
- __checkpoint_verbose_track(session, "starting transaction");
-
- if (full)
- __wt_epoch(session, &conn->ckpt_timer_scrub_end);
-
- /*
- * Start the checkpoint for real.
- *
- * Bump the global checkpoint generation, used to figure out whether
- * checkpoint has visited a tree. Use an atomic increment even though
- * we are single-threaded because readers of the checkpoint generation
- * don't hold the checkpoint lock.
- *
- * We do need to update it before clearing the checkpoint's entry out
- * of the transaction table, or a thread evicting in a tree could
- * ignore the checkpoint's transaction.
- */
- generation = __wt_gen_next(session, WT_GEN_CHECKPOINT);
- WT_STAT_CONN_SET(session, txn_checkpoint_generation, generation);
-
- /*
- * We want to skip checkpointing clean handles whenever possible. That
- * is, when the checkpoint is not named or forced. However, we need to
- * take care about ordering with respect to the checkpoint transaction.
- *
- * We can't skip clean handles before starting the transaction or the
- * checkpoint can miss updates in trees that become dirty as the
- * checkpoint is starting. If we wait until the transaction has
- * started before locking a handle, there could be a metadata-changing
- * operation in between (e.g., salvage) that will cause a write
- * conflict when the checkpoint goes to write the metadata.
- *
- * Hold the schema lock while starting the transaction and gathering
- * handles so the set we get is complete and correct.
- */
- WT_WITH_SCHEMA_LOCK(session,
- ret = __checkpoint_prepare(session, &tracking, cfg));
- WT_ERR(ret);
-
- WT_ASSERT(session, txn->isolation == WT_ISO_SNAPSHOT);
-
- /*
- * Unblock updates -- we can figure out that any updates to clean pages
- * after this point are too new to be written in the checkpoint.
- */
- cache->eviction_scrub_target = 0.0;
- WT_STAT_CONN_SET(session, txn_checkpoint_scrub_target, 0);
-
- /* Tell logging that we have started a database checkpoint. */
- if (full && logging)
- WT_ERR(__wt_txn_checkpoint_log(
- session, full, WT_TXN_LOG_CKPT_START, NULL));
-
- __checkpoint_timing_stress(session);
-
- WT_ERR(__checkpoint_apply(session, cfg, __checkpoint_tree_helper));
-
- /*
- * Clear the dhandle so the visibility check doesn't get confused about
- * the snap min. Don't bother restoring the handle since it doesn't
- * make sense to carry a handle across a checkpoint.
- */
- session->dhandle = NULL;
-
- /*
- * Record the timestamp from the transaction if we were successful.
- * Store it in a temp variable now because it will be invalidated during
- * commit but we don't want to set it until we know the checkpoint
- * is successful. We have to set the system information before we
- * release the snapshot.
- */
- ckpt_tmp_ts = 0;
- if (full) {
- WT_ERR(__wt_meta_sysinfo_set(session));
- ckpt_tmp_ts = txn->read_timestamp;
- }
-
- /* Release the snapshot so we aren't pinning updates in cache. */
- __wt_txn_release_snapshot(session);
-
- /* Mark all trees as open for business (particularly eviction). */
- WT_ERR(__checkpoint_apply(session, cfg, __checkpoint_presync));
- __wt_evict_server_wake(session);
-
- __checkpoint_verbose_track(session, "committing transaction");
-
- /*
- * Checkpoints have to hit disk (it would be reasonable to configure for
- * lazy checkpoints, but we don't support them yet).
- */
- time_start = __wt_clock(session);
- WT_ERR(__checkpoint_apply(session, cfg, __wt_checkpoint_sync));
- time_stop = __wt_clock(session);
- fsync_duration_usecs = WT_CLOCKDIFF_US(time_stop, time_start);
- WT_STAT_CONN_INCR(session, txn_checkpoint_fsync_post);
- WT_STAT_CONN_SET(session,
- txn_checkpoint_fsync_post_duration, fsync_duration_usecs);
-
- __checkpoint_verbose_track(session, "sync completed");
-
- /*
- * Commit the transaction now that we are sure that all files in the
- * checkpoint have been flushed to disk. It's OK to commit before
- * checkpointing the metadata since we know that all files in the
- * checkpoint are now in a consistent state.
- */
- WT_ERR(__wt_txn_commit(session, NULL));
-
- /*
- * Ensure that the metadata changes are durable before the checkpoint
- * is resolved. Do this by either checkpointing the metadata or syncing
- * the log file.
- * Recovery relies on the checkpoint LSN in the metadata only being
- * updated by full checkpoints so only checkpoint the metadata for
- * full or non-logged checkpoints.
- *
- * This is very similar to __wt_meta_track_off, ideally they would be
- * merged.
- */
- if (full || !logging) {
- session->isolation = txn->isolation = WT_ISO_READ_UNCOMMITTED;
- /* Disable metadata tracking during the metadata checkpoint. */
- saved_meta_next = session->meta_track_next;
- session->meta_track_next = NULL;
- WT_WITH_DHANDLE(session, WT_SESSION_META_DHANDLE(session),
- WT_WITH_METADATA_LOCK(session,
- ret = __wt_checkpoint(session, cfg)));
- session->meta_track_next = saved_meta_next;
- WT_ERR(ret);
-
- WT_WITH_DHANDLE(session,
- WT_SESSION_META_DHANDLE(session),
- ret = __wt_checkpoint_sync(session, NULL));
- WT_ERR(ret);
-
- __checkpoint_verbose_track(session, "metadata sync completed");
- } else
- WT_WITH_DHANDLE(session,
- WT_SESSION_META_DHANDLE(session),
- ret = __wt_txn_checkpoint_log(
- session, false, WT_TXN_LOG_CKPT_SYNC, NULL));
-
- /*
- * Now that the metadata is stable, re-open the metadata file for
- * regular eviction by clearing the checkpoint_pinned flag.
- */
- txn_global->checkpoint_state.pinned_id = WT_TXN_NONE;
-
- if (full) {
- __checkpoint_stats(session);
-
- /*
- * If timestamps were used to define the content of the
- * checkpoint update the saved last checkpoint timestamp,
- * otherwise leave it alone. If a checkpoint is taken without
- * timestamps, it's likely a bug, but we don't want to clear
- * the saved last checkpoint timestamp regardless.
- */
- if (use_timestamp)
- conn->txn_global.last_ckpt_timestamp = ckpt_tmp_ts;
- }
+ WT_CACHE *cache;
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_TXN *txn;
+ WT_TXN_GLOBAL *txn_global;
+ WT_TXN_ISOLATION saved_isolation;
+ wt_timestamp_t ckpt_tmp_ts;
+ uint64_t fsync_duration_usecs, generation, time_start, time_stop;
+ u_int i;
+ bool can_skip, failed, full, idle, logging, tracking, use_timestamp;
+ void *saved_meta_next;
+
+ conn = S2C(session);
+ cache = conn->cache;
+ txn = &session->txn;
+ txn_global = &conn->txn_global;
+ saved_isolation = session->isolation;
+ full = idle = logging = tracking = use_timestamp = false;
+
+ /* Avoid doing work if possible. */
+ WT_RET(__txn_checkpoint_can_skip(session, cfg, &full, &use_timestamp, &can_skip));
+ if (can_skip) {
+ WT_STAT_CONN_INCR(session, txn_checkpoint_skipped);
+ return (0);
+ }
+
+ /*
+ * Do a pass over the configuration arguments and figure out what kind of checkpoint this is.
+ */
+ WT_RET(__checkpoint_apply_all(session, cfg, NULL));
+
+ logging = FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED);
+
+ /* Reset the maximum page size seen by eviction. */
+ conn->cache->evict_max_page_size = 0;
+
+ /* Initialize the verbose tracking timer */
+ __wt_epoch(session, &conn->ckpt_timer_start);
+
+ /* Initialize the checkpoint progress tracking data */
+ conn->ckpt_progress_msg_count = 0;
+ conn->ckpt_write_bytes = 0;
+ conn->ckpt_write_pages = 0;
+
+ /*
+ * Update the global oldest ID so we do all possible cleanup.
+ *
+ * This is particularly important for compact, so that all dirty pages
+ * can be fully written.
+ */
+ WT_ERR(__wt_txn_update_oldest(session, WT_TXN_OLDEST_STRICT | WT_TXN_OLDEST_WAIT));
+
+ /* Flush data-sources before we start the checkpoint. */
+ WT_ERR(__checkpoint_data_source(session, cfg));
+
+ /*
+ * Try to reduce the amount of dirty data in cache so there is less work do during the critical
+ * section of the checkpoint.
+ */
+ __checkpoint_reduce_dirty_cache(session);
+
+ /* Tell logging that we are about to start a database checkpoint. */
+ if (full && logging)
+ WT_ERR(__wt_txn_checkpoint_log(session, full, WT_TXN_LOG_CKPT_PREPARE, NULL));
+
+ __checkpoint_verbose_track(session, "starting transaction");
+
+ if (full)
+ __wt_epoch(session, &conn->ckpt_timer_scrub_end);
+
+ /*
+ * Start the checkpoint for real.
+ *
+ * Bump the global checkpoint generation, used to figure out whether
+ * checkpoint has visited a tree. Use an atomic increment even though
+ * we are single-threaded because readers of the checkpoint generation
+ * don't hold the checkpoint lock.
+ *
+ * We do need to update it before clearing the checkpoint's entry out
+ * of the transaction table, or a thread evicting in a tree could
+ * ignore the checkpoint's transaction.
+ */
+ generation = __wt_gen_next(session, WT_GEN_CHECKPOINT);
+ WT_STAT_CONN_SET(session, txn_checkpoint_generation, generation);
+
+ /*
+ * We want to skip checkpointing clean handles whenever possible. That
+ * is, when the checkpoint is not named or forced. However, we need to
+ * take care about ordering with respect to the checkpoint transaction.
+ *
+ * We can't skip clean handles before starting the transaction or the
+ * checkpoint can miss updates in trees that become dirty as the
+ * checkpoint is starting. If we wait until the transaction has
+ * started before locking a handle, there could be a metadata-changing
+ * operation in between (e.g., salvage) that will cause a write
+ * conflict when the checkpoint goes to write the metadata.
+ *
+ * Hold the schema lock while starting the transaction and gathering
+ * handles so the set we get is complete and correct.
+ */
+ WT_WITH_SCHEMA_LOCK(session, ret = __checkpoint_prepare(session, &tracking, cfg));
+ WT_ERR(ret);
+
+ WT_ASSERT(session, txn->isolation == WT_ISO_SNAPSHOT);
+
+ /*
+ * Unblock updates -- we can figure out that any updates to clean pages after this point are too
+ * new to be written in the checkpoint.
+ */
+ cache->eviction_scrub_target = 0.0;
+ WT_STAT_CONN_SET(session, txn_checkpoint_scrub_target, 0);
+
+ /* Tell logging that we have started a database checkpoint. */
+ if (full && logging)
+ WT_ERR(__wt_txn_checkpoint_log(session, full, WT_TXN_LOG_CKPT_START, NULL));
+
+ __checkpoint_timing_stress(session);
+
+ WT_ERR(__checkpoint_apply(session, cfg, __checkpoint_tree_helper));
+
+ /*
+ * Clear the dhandle so the visibility check doesn't get confused about the snap min. Don't
+ * bother restoring the handle since it doesn't make sense to carry a handle across a
+ * checkpoint.
+ */
+ session->dhandle = NULL;
+
+ /*
+ * Record the timestamp from the transaction if we were successful. Store it in a temp variable
+ * now because it will be invalidated during commit but we don't want to set it until we know
+ * the checkpoint is successful. We have to set the system information before we release the
+ * snapshot.
+ */
+ ckpt_tmp_ts = 0;
+ if (full) {
+ WT_ERR(__wt_meta_sysinfo_set(session));
+ ckpt_tmp_ts = txn->read_timestamp;
+ }
+
+ /* Release the snapshot so we aren't pinning updates in cache. */
+ __wt_txn_release_snapshot(session);
+
+ /* Mark all trees as open for business (particularly eviction). */
+ WT_ERR(__checkpoint_apply(session, cfg, __checkpoint_presync));
+ __wt_evict_server_wake(session);
+
+ __checkpoint_verbose_track(session, "committing transaction");
+
+ /*
+ * Checkpoints have to hit disk (it would be reasonable to configure for lazy checkpoints, but
+ * we don't support them yet).
+ */
+ time_start = __wt_clock(session);
+ WT_ERR(__checkpoint_apply(session, cfg, __wt_checkpoint_sync));
+ time_stop = __wt_clock(session);
+ fsync_duration_usecs = WT_CLOCKDIFF_US(time_stop, time_start);
+ WT_STAT_CONN_INCR(session, txn_checkpoint_fsync_post);
+ WT_STAT_CONN_SET(session, txn_checkpoint_fsync_post_duration, fsync_duration_usecs);
+
+ __checkpoint_verbose_track(session, "sync completed");
+
+ /*
+ * Commit the transaction now that we are sure that all files in the checkpoint have been
+ * flushed to disk. It's OK to commit before checkpointing the metadata since we know that all
+ * files in the checkpoint are now in a consistent state.
+ */
+ WT_ERR(__wt_txn_commit(session, NULL));
+
+ /*
+ * Ensure that the metadata changes are durable before the checkpoint
+ * is resolved. Do this by either checkpointing the metadata or syncing
+ * the log file.
+ * Recovery relies on the checkpoint LSN in the metadata only being
+ * updated by full checkpoints so only checkpoint the metadata for
+ * full or non-logged checkpoints.
+ *
+ * This is very similar to __wt_meta_track_off, ideally they would be
+ * merged.
+ */
+ if (full || !logging) {
+ session->isolation = txn->isolation = WT_ISO_READ_UNCOMMITTED;
+ /* Disable metadata tracking during the metadata checkpoint. */
+ saved_meta_next = session->meta_track_next;
+ session->meta_track_next = NULL;
+ WT_WITH_DHANDLE(session, WT_SESSION_META_DHANDLE(session),
+ WT_WITH_METADATA_LOCK(session, ret = __wt_checkpoint(session, cfg)));
+ session->meta_track_next = saved_meta_next;
+ WT_ERR(ret);
+
+ WT_WITH_DHANDLE(
+ session, WT_SESSION_META_DHANDLE(session), ret = __wt_checkpoint_sync(session, NULL));
+ WT_ERR(ret);
+
+ __checkpoint_verbose_track(session, "metadata sync completed");
+ } else
+ WT_WITH_DHANDLE(session, WT_SESSION_META_DHANDLE(session),
+ ret = __wt_txn_checkpoint_log(session, false, WT_TXN_LOG_CKPT_SYNC, NULL));
+
+ /*
+ * Now that the metadata is stable, re-open the metadata file for regular eviction by clearing
+ * the checkpoint_pinned flag.
+ */
+ txn_global->checkpoint_state.pinned_id = WT_TXN_NONE;
+
+ if (full) {
+ __checkpoint_stats(session);
+
+ /*
+ * If timestamps were used to define the content of the checkpoint update the saved last
+ * checkpoint timestamp, otherwise leave it alone. If a checkpoint is taken without
+ * timestamps, it's likely a bug, but we don't want to clear the saved last checkpoint
+ * timestamp regardless.
+ */
+ if (use_timestamp)
+ conn->txn_global.last_ckpt_timestamp = ckpt_tmp_ts;
+ }
err:
- /*
- * Reset the timer so that next checkpoint tracks the progress only if
- * configured.
- */
- conn->ckpt_timer_start.tv_sec = 0;
-
- /*
- * XXX
- * Rolling back the changes here is problematic.
- *
- * If we unroll here, we need a way to roll back changes to the avail
- * list for each tree that was successfully synced before the error
- * occurred. Otherwise, the next time we try this operation, we will
- * try to free an old checkpoint again.
- *
- * OTOH, if we commit the changes after a failure, we have partially
- * overwritten the checkpoint, so what ends up on disk is not
- * consistent.
- */
- failed = ret != 0;
- if (failed)
- conn->modified = true;
-
- session->isolation = txn->isolation = WT_ISO_READ_UNCOMMITTED;
- if (tracking)
- WT_TRET(__wt_meta_track_off(session, false, failed));
-
- cache->eviction_scrub_target = 0.0;
- WT_STAT_CONN_SET(session, txn_checkpoint_scrub_target, 0);
-
- if (F_ISSET(txn, WT_TXN_RUNNING)) {
- /*
- * Clear the dhandle so the visibility check doesn't get
- * confused about the snap min. Don't bother restoring the
- * handle since it doesn't make sense to carry a handle across
- * a checkpoint.
- */
- session->dhandle = NULL;
- WT_TRET(__wt_txn_rollback(session, NULL));
- }
-
- /*
- * Tell logging that we have finished a database checkpoint. Do not
- * write a log record if the database was idle.
- */
- if (full && logging) {
- if (ret == 0 &&
- F_ISSET(((WT_CURSOR_BTREE *)
- session->meta_cursor)->btree, WT_BTREE_SKIP_CKPT))
- idle = true;
- WT_TRET(__wt_txn_checkpoint_log(session, full,
- (ret == 0 && !idle) ?
- WT_TXN_LOG_CKPT_STOP : WT_TXN_LOG_CKPT_CLEANUP, NULL));
- }
-
- for (i = 0; i < session->ckpt_handle_next; ++i) {
- if (session->ckpt_handle[i] == NULL)
- continue;
- /*
- * If the operation failed, mark all trees dirty so they are
- * included if a future checkpoint can succeed.
- */
- if (failed)
- WT_WITH_DHANDLE(session, session->ckpt_handle[i],
- __checkpoint_fail_reset(session));
- WT_WITH_DHANDLE(session, session->ckpt_handle[i],
- WT_TRET(__wt_session_release_dhandle(session)));
- }
-
- __wt_free(session, session->ckpt_handle);
- session->ckpt_handle_allocated = session->ckpt_handle_next = 0;
-
- session->isolation = txn->isolation = saved_isolation;
- return (ret);
+ /*
+ * Reset the timer so that next checkpoint tracks the progress only if configured.
+ */
+ conn->ckpt_timer_start.tv_sec = 0;
+
+ /*
+ * XXX
+ * Rolling back the changes here is problematic.
+ *
+ * If we unroll here, we need a way to roll back changes to the avail
+ * list for each tree that was successfully synced before the error
+ * occurred. Otherwise, the next time we try this operation, we will
+ * try to free an old checkpoint again.
+ *
+ * OTOH, if we commit the changes after a failure, we have partially
+ * overwritten the checkpoint, so what ends up on disk is not
+ * consistent.
+ */
+ failed = ret != 0;
+ if (failed)
+ conn->modified = true;
+
+ session->isolation = txn->isolation = WT_ISO_READ_UNCOMMITTED;
+ if (tracking)
+ WT_TRET(__wt_meta_track_off(session, false, failed));
+
+ cache->eviction_scrub_target = 0.0;
+ WT_STAT_CONN_SET(session, txn_checkpoint_scrub_target, 0);
+
+ if (F_ISSET(txn, WT_TXN_RUNNING)) {
+ /*
+ * Clear the dhandle so the visibility check doesn't get confused about the snap min. Don't
+ * bother restoring the handle since it doesn't make sense to carry a handle across a
+ * checkpoint.
+ */
+ session->dhandle = NULL;
+ WT_TRET(__wt_txn_rollback(session, NULL));
+ }
+
+ /*
+ * Tell logging that we have finished a database checkpoint. Do not write a log record if the
+ * database was idle.
+ */
+ if (full && logging) {
+ if (ret == 0 &&
+ F_ISSET(((WT_CURSOR_BTREE *)session->meta_cursor)->btree, WT_BTREE_SKIP_CKPT))
+ idle = true;
+ WT_TRET(__wt_txn_checkpoint_log(session, full,
+ (ret == 0 && !idle) ? WT_TXN_LOG_CKPT_STOP : WT_TXN_LOG_CKPT_CLEANUP, NULL));
+ }
+
+ for (i = 0; i < session->ckpt_handle_next; ++i) {
+ if (session->ckpt_handle[i] == NULL)
+ continue;
+ /*
+ * If the operation failed, mark all trees dirty so they are included if a future checkpoint
+ * can succeed.
+ */
+ if (failed)
+ WT_WITH_DHANDLE(session, session->ckpt_handle[i], __checkpoint_fail_reset(session));
+ WT_WITH_DHANDLE(
+ session, session->ckpt_handle[i], WT_TRET(__wt_session_release_dhandle(session)));
+ }
+
+ __wt_free(session, session->ckpt_handle);
+ session->ckpt_handle_allocated = session->ckpt_handle_next = 0;
+
+ session->isolation = txn->isolation = saved_isolation;
+ return (ret);
}
/*
* __txn_checkpoint_wrapper --
- * Checkpoint wrapper.
+ * Checkpoint wrapper.
*/
static int
__txn_checkpoint_wrapper(WT_SESSION_IMPL *session, const char *cfg[])
{
- WT_DECL_RET;
- WT_TXN_GLOBAL *txn_global;
+ WT_DECL_RET;
+ WT_TXN_GLOBAL *txn_global;
- txn_global = &S2C(session)->txn_global;
+ txn_global = &S2C(session)->txn_global;
- WT_STAT_CONN_SET(session, txn_checkpoint_running, 1);
- txn_global->checkpoint_running = true;
+ WT_STAT_CONN_SET(session, txn_checkpoint_running, 1);
+ txn_global->checkpoint_running = true;
- ret = __txn_checkpoint(session, cfg);
+ ret = __txn_checkpoint(session, cfg);
- WT_STAT_CONN_SET(session, txn_checkpoint_running, 0);
- txn_global->checkpoint_running = false;
+ WT_STAT_CONN_SET(session, txn_checkpoint_running, 0);
+ txn_global->checkpoint_running = false;
- return (ret);
+ return (ret);
}
/*
* __wt_txn_checkpoint --
- * Checkpoint a database or a list of objects in the database.
+ * Checkpoint a database or a list of objects in the database.
*/
int
__wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[], bool waiting)
{
- WT_DECL_RET;
- uint32_t orig_flags;
-
- /*
- * Reset open cursors. Do this explicitly, even though it will happen
- * implicitly in the call to begin_transaction for the checkpoint, the
- * checkpoint code will acquire the schema lock before we do that, and
- * some implementation of WT_CURSOR::reset might need the schema lock.
- */
- WT_RET(__wt_session_reset_cursors(session, false));
-
- /* Ensure the metadata table is open before taking any locks. */
- WT_RET(__wt_metadata_cursor(session, NULL));
-
- /*
- * Don't highjack the session checkpoint thread for eviction.
- *
- * Application threads are not generally available for potentially slow
- * operations, but checkpoint does enough I/O it may be called upon to
- * perform slow operations for the block manager.
- *
- * Application checkpoints wait until the checkpoint lock is available,
- * compaction checkpoints don't.
- *
- * Checkpoints should always use a separate session for lookaside
- * updates, otherwise those updates are pinned until the checkpoint
- * commits. Also, there are unfortunate interactions between the
- * special rules for lookaside eviction and the special handling of the
- * checkpoint transaction.
- */
+ WT_DECL_RET;
+ uint32_t orig_flags;
+
+ /*
+ * Reset open cursors. Do this explicitly, even though it will happen implicitly in the call to
+ * begin_transaction for the checkpoint, the checkpoint code will acquire the schema lock before
+ * we do that, and some implementation of WT_CURSOR::reset might need the schema lock.
+ */
+ WT_RET(__wt_session_reset_cursors(session, false));
+
+ /* Ensure the metadata table is open before taking any locks. */
+ WT_RET(__wt_metadata_cursor(session, NULL));
+
+/*
+ * Don't highjack the session checkpoint thread for eviction.
+ *
+ * Application threads are not generally available for potentially slow
+ * operations, but checkpoint does enough I/O it may be called upon to
+ * perform slow operations for the block manager.
+ *
+ * Application checkpoints wait until the checkpoint lock is available,
+ * compaction checkpoints don't.
+ *
+ * Checkpoints should always use a separate session for lookaside
+ * updates, otherwise those updates are pinned until the checkpoint
+ * commits. Also, there are unfortunate interactions between the
+ * special rules for lookaside eviction and the special handling of the
+ * checkpoint transaction.
+ */
#undef WT_CHECKPOINT_SESSION_FLAGS
-#define WT_CHECKPOINT_SESSION_FLAGS \
- (WT_SESSION_CAN_WAIT | WT_SESSION_IGNORE_CACHE_SIZE)
+#define WT_CHECKPOINT_SESSION_FLAGS (WT_SESSION_CAN_WAIT | WT_SESSION_IGNORE_CACHE_SIZE)
#undef WT_CHECKPOINT_SESSION_FLAGS_OFF
-#define WT_CHECKPOINT_SESSION_FLAGS_OFF \
- (WT_SESSION_LOOKASIDE_CURSOR)
- orig_flags = F_MASK(session,
- WT_CHECKPOINT_SESSION_FLAGS | WT_CHECKPOINT_SESSION_FLAGS_OFF);
- F_SET(session, WT_CHECKPOINT_SESSION_FLAGS);
- F_CLR(session, WT_CHECKPOINT_SESSION_FLAGS_OFF);
-
- /*
- * Only one checkpoint can be active at a time, and checkpoints must run
- * in the same order as they update the metadata. It's probably a bad
- * idea to run checkpoints out of multiple threads, but as compaction
- * calls checkpoint directly, it can be tough to avoid. Serialize here
- * to ensure we don't get into trouble.
- */
- if (waiting)
- WT_WITH_CHECKPOINT_LOCK(session,
- ret = __txn_checkpoint_wrapper(session, cfg));
- else
- WT_WITH_CHECKPOINT_LOCK_NOWAIT(session, ret,
- ret = __txn_checkpoint_wrapper(session, cfg));
-
- F_CLR(session, WT_CHECKPOINT_SESSION_FLAGS);
- F_SET(session, orig_flags);
-
- return (ret);
+#define WT_CHECKPOINT_SESSION_FLAGS_OFF (WT_SESSION_LOOKASIDE_CURSOR)
+ orig_flags = F_MASK(session, WT_CHECKPOINT_SESSION_FLAGS | WT_CHECKPOINT_SESSION_FLAGS_OFF);
+ F_SET(session, WT_CHECKPOINT_SESSION_FLAGS);
+ F_CLR(session, WT_CHECKPOINT_SESSION_FLAGS_OFF);
+
+ /*
+ * Only one checkpoint can be active at a time, and checkpoints must run in the same order as
+ * they update the metadata. It's probably a bad idea to run checkpoints out of multiple
+ * threads, but as compaction calls checkpoint directly, it can be tough to avoid. Serialize
+ * here to ensure we don't get into trouble.
+ */
+ if (waiting)
+ WT_WITH_CHECKPOINT_LOCK(session, ret = __txn_checkpoint_wrapper(session, cfg));
+ else
+ WT_WITH_CHECKPOINT_LOCK_NOWAIT(session, ret, ret = __txn_checkpoint_wrapper(session, cfg));
+
+ F_CLR(session, WT_CHECKPOINT_SESSION_FLAGS);
+ F_SET(session, orig_flags);
+
+ return (ret);
}
/*
* __drop --
- * Drop all checkpoints with a specific name.
+ * Drop all checkpoints with a specific name.
*/
static void
__drop(WT_CKPT *ckptbase, const char *name, size_t len)
{
- WT_CKPT *ckpt;
-
- /*
- * If we're dropping internal checkpoints, match to the '.' separating
- * the checkpoint name from the generational number, and take all that
- * we can find. Applications aren't allowed to use any variant of this
- * name, so the test is still pretty simple, if the leading bytes match,
- * it's one we want to drop.
- */
- if (strncmp(WT_CHECKPOINT, name, len) == 0) {
- WT_CKPT_FOREACH(ckptbase, ckpt)
- if (WT_PREFIX_MATCH(ckpt->name, WT_CHECKPOINT))
- F_SET(ckpt, WT_CKPT_DELETE);
- } else
- WT_CKPT_FOREACH(ckptbase, ckpt)
- if (WT_STRING_MATCH(ckpt->name, name, len))
- F_SET(ckpt, WT_CKPT_DELETE);
+ WT_CKPT *ckpt;
+
+ /*
+ * If we're dropping internal checkpoints, match to the '.' separating the checkpoint name from
+ * the generational number, and take all that we can find. Applications aren't allowed to use
+ * any variant of this name, so the test is still pretty simple, if the leading bytes match,
+ * it's one we want to drop.
+ */
+ if (strncmp(WT_CHECKPOINT, name, len) == 0) {
+ WT_CKPT_FOREACH (ckptbase, ckpt)
+ if (WT_PREFIX_MATCH(ckpt->name, WT_CHECKPOINT))
+ F_SET(ckpt, WT_CKPT_DELETE);
+ } else
+ WT_CKPT_FOREACH (ckptbase, ckpt)
+ if (WT_STRING_MATCH(ckpt->name, name, len))
+ F_SET(ckpt, WT_CKPT_DELETE);
}
/*
* __drop_from --
- * Drop all checkpoints after, and including, the named checkpoint.
+ * Drop all checkpoints after, and including, the named checkpoint.
*/
static void
__drop_from(WT_CKPT *ckptbase, const char *name, size_t len)
{
- WT_CKPT *ckpt;
- bool matched;
-
- /*
- * There's a special case -- if the name is "all", then we delete all
- * of the checkpoints.
- */
- if (WT_STRING_MATCH("all", name, len)) {
- WT_CKPT_FOREACH(ckptbase, ckpt)
- F_SET(ckpt, WT_CKPT_DELETE);
- return;
- }
-
- /*
- * We use the first checkpoint we can find, that is, if there are two
- * checkpoints with the same name in the list, we'll delete from the
- * first match to the end.
- */
- matched = false;
- WT_CKPT_FOREACH(ckptbase, ckpt) {
- if (!matched && !WT_STRING_MATCH(ckpt->name, name, len))
- continue;
-
- matched = true;
- F_SET(ckpt, WT_CKPT_DELETE);
- }
+ WT_CKPT *ckpt;
+ bool matched;
+
+ /*
+ * There's a special case -- if the name is "all", then we delete all of the checkpoints.
+ */
+ if (WT_STRING_MATCH("all", name, len)) {
+ WT_CKPT_FOREACH (ckptbase, ckpt)
+ F_SET(ckpt, WT_CKPT_DELETE);
+ return;
+ }
+
+ /*
+ * We use the first checkpoint we can find, that is, if there are two checkpoints with the same
+ * name in the list, we'll delete from the first match to the end.
+ */
+ matched = false;
+ WT_CKPT_FOREACH (ckptbase, ckpt) {
+ if (!matched && !WT_STRING_MATCH(ckpt->name, name, len))
+ continue;
+
+ matched = true;
+ F_SET(ckpt, WT_CKPT_DELETE);
+ }
}
/*
* __drop_to --
- * Drop all checkpoints before, and including, the named checkpoint.
+ * Drop all checkpoints before, and including, the named checkpoint.
*/
static void
__drop_to(WT_CKPT *ckptbase, const char *name, size_t len)
{
- WT_CKPT *ckpt, *mark;
-
- /*
- * We use the last checkpoint we can find, that is, if there are two
- * checkpoints with the same name in the list, we'll delete from the
- * beginning to the second match, not the first.
- */
- mark = NULL;
- WT_CKPT_FOREACH(ckptbase, ckpt)
- if (WT_STRING_MATCH(ckpt->name, name, len))
- mark = ckpt;
-
- if (mark == NULL)
- return;
-
- WT_CKPT_FOREACH(ckptbase, ckpt) {
- F_SET(ckpt, WT_CKPT_DELETE);
-
- if (ckpt == mark)
- break;
- }
+ WT_CKPT *ckpt, *mark;
+
+ /*
+ * We use the last checkpoint we can find, that is, if there are two checkpoints with the same
+ * name in the list, we'll delete from the beginning to the second match, not the first.
+ */
+ mark = NULL;
+ WT_CKPT_FOREACH (ckptbase, ckpt)
+ if (WT_STRING_MATCH(ckpt->name, name, len))
+ mark = ckpt;
+
+ if (mark == NULL)
+ return;
+
+ WT_CKPT_FOREACH (ckptbase, ckpt) {
+ F_SET(ckpt, WT_CKPT_DELETE);
+
+ if (ckpt == mark)
+ break;
+ }
}
/*
* __checkpoint_lock_dirty_tree_int --
- * Helper for __checkpoint_lock_dirty_tree. Intended to be called while
- * holding the hot backup lock.
+ * Helper for __checkpoint_lock_dirty_tree. Intended to be called while holding the hot backup
+ * lock.
*/
static int
-__checkpoint_lock_dirty_tree_int(
- WT_SESSION_IMPL *session, bool is_checkpoint,
- bool force, WT_BTREE *btree, WT_CKPT *ckpt, WT_CKPT *ckptbase)
+__checkpoint_lock_dirty_tree_int(WT_SESSION_IMPL *session, bool is_checkpoint, bool force,
+ WT_BTREE *btree, WT_CKPT *ckpt, WT_CKPT *ckptbase)
{
- WT_CONNECTION_IMPL *conn;
- WT_DECL_RET;
-
- WT_UNUSED(is_checkpoint);
- conn = S2C(session);
-
- /*
- * We can't delete checkpoints if a backup cursor is open. WiredTiger
- * checkpoints are uniquely named and it's OK to have multiple of them
- * in the system: clear the delete flag for them, and otherwise fail.
- * Hold the lock until we're done (blocking hot backups from starting),
- * we don't want to race with a future hot backup.
- */
- if (conn->hot_backup)
- WT_CKPT_FOREACH(ckptbase, ckpt) {
- if (!F_ISSET(ckpt, WT_CKPT_DELETE))
- continue;
- if (WT_PREFIX_MATCH(ckpt->name, WT_CHECKPOINT)) {
- F_CLR(ckpt, WT_CKPT_DELETE);
- continue;
- }
- WT_RET_MSG(session, EBUSY,
- "checkpoint %s blocked by hot backup: it would"
- "delete an existing checkpoint, and checkpoints "
- "cannot be deleted during a hot backup",
- ckpt->name);
- }
- /*
- * Mark old checkpoints that are being deleted and figure out which
- * trees we can skip in this checkpoint.
- */
- WT_RET(__checkpoint_mark_skip(session, ckptbase, force));
- if (F_ISSET(btree, WT_BTREE_SKIP_CKPT))
- return (0);
- /*
- * Lock the checkpoints that will be deleted.
- *
- * Checkpoints are only locked when tracking is enabled, which covers
- * checkpoint and drop operations, but not close. The reasoning is
- * there should be no access to a checkpoint during close, because any
- * thread accessing a checkpoint will also have the current file handle
- * open.
- */
- if (WT_META_TRACKING(session))
- WT_CKPT_FOREACH(ckptbase, ckpt) {
- if (!F_ISSET(ckpt, WT_CKPT_DELETE))
- continue;
- /*
- * We can't delete checkpoints referenced by a cursor.
- * WiredTiger checkpoints are uniquely named and it's
- * OK to have multiple in the system: clear the delete
- * flag for them, and otherwise fail.
- */
- ret = __wt_session_lock_checkpoint(session, ckpt->name);
- if (ret == 0)
- continue;
- if (ret == EBUSY &&
- WT_PREFIX_MATCH(ckpt->name, WT_CHECKPOINT)) {
- F_CLR(ckpt, WT_CKPT_DELETE);
- continue;
- }
- WT_RET_MSG(session, ret,
- "checkpoints cannot be dropped when in-use");
- }
- /*
- * There are special trees: those being bulk-loaded, salvaged, upgraded
- * or verified during the checkpoint. They should never be part of a
- * checkpoint: we will fail to lock them because the operations have
- * exclusive access to the handles. Named checkpoints will fail in that
- * case, ordinary checkpoints skip files that cannot be opened normally.
- */
- WT_ASSERT(session,
- !is_checkpoint || !F_ISSET(btree, WT_BTREE_SPECIAL_FLAGS));
-
- return (0);
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+
+ WT_UNUSED(is_checkpoint);
+ conn = S2C(session);
+
+ /*
+ * We can't delete checkpoints if a backup cursor is open. WiredTiger checkpoints are uniquely
+ * named and it's OK to have multiple of them in the system: clear the delete flag for them, and
+ * otherwise fail. Hold the lock until we're done (blocking hot backups from starting), we don't
+ * want to race with a future hot backup.
+ */
+ if (conn->hot_backup)
+ WT_CKPT_FOREACH (ckptbase, ckpt) {
+ if (!F_ISSET(ckpt, WT_CKPT_DELETE))
+ continue;
+ if (WT_PREFIX_MATCH(ckpt->name, WT_CHECKPOINT)) {
+ F_CLR(ckpt, WT_CKPT_DELETE);
+ continue;
+ }
+ WT_RET_MSG(session, EBUSY,
+            "checkpoint %s blocked by hot backup: it would "
+ "delete an existing checkpoint, and checkpoints "
+ "cannot be deleted during a hot backup",
+ ckpt->name);
+ }
+ /*
+ * Mark old checkpoints that are being deleted and figure out which trees we can skip in this
+ * checkpoint.
+ */
+ WT_RET(__checkpoint_mark_skip(session, ckptbase, force));
+ if (F_ISSET(btree, WT_BTREE_SKIP_CKPT))
+ return (0);
+ /*
+ * Lock the checkpoints that will be deleted.
+ *
+ * Checkpoints are only locked when tracking is enabled, which covers
+ * checkpoint and drop operations, but not close. The reasoning is
+ * there should be no access to a checkpoint during close, because any
+ * thread accessing a checkpoint will also have the current file handle
+ * open.
+ */
+ if (WT_META_TRACKING(session))
+ WT_CKPT_FOREACH (ckptbase, ckpt) {
+ if (!F_ISSET(ckpt, WT_CKPT_DELETE))
+ continue;
+ /*
+ * We can't delete checkpoints referenced by a cursor. WiredTiger checkpoints are
+ * uniquely named and it's OK to have multiple in the system: clear the delete flag for
+ * them, and otherwise fail.
+ */
+ ret = __wt_session_lock_checkpoint(session, ckpt->name);
+ if (ret == 0)
+ continue;
+ if (ret == EBUSY && WT_PREFIX_MATCH(ckpt->name, WT_CHECKPOINT)) {
+ F_CLR(ckpt, WT_CKPT_DELETE);
+ continue;
+ }
+ WT_RET_MSG(session, ret, "checkpoints cannot be dropped when in-use");
+ }
+ /*
+ * There are special trees: those being bulk-loaded, salvaged, upgraded or verified during the
+ * checkpoint. They should never be part of a checkpoint: we will fail to lock them because the
+ * operations have exclusive access to the handles. Named checkpoints will fail in that case,
+ * ordinary checkpoints skip files that cannot be opened normally.
+ */
+ WT_ASSERT(session, !is_checkpoint || !F_ISSET(btree, WT_BTREE_SPECIAL_FLAGS));
+
+ return (0);
}
/*
* __checkpoint_lock_dirty_tree --
- * Decide whether the tree needs to be included in the checkpoint and if
- * so, acquire the necessary locks.
+ * Decide whether the tree needs to be included in the checkpoint and if so, acquire the
+ * necessary locks.
*/
static int
-__checkpoint_lock_dirty_tree(WT_SESSION_IMPL *session,
- bool is_checkpoint, bool force, bool need_tracking, const char *cfg[])
+__checkpoint_lock_dirty_tree(
+ WT_SESSION_IMPL *session, bool is_checkpoint, bool force, bool need_tracking, const char *cfg[])
{
- WT_BTREE *btree;
- WT_CKPT *ckpt, *ckptbase;
- WT_CONFIG dropconf;
- WT_CONFIG_ITEM cval, k, v;
- WT_DATA_HANDLE *dhandle;
- WT_DECL_RET;
- const char *name;
- char *name_alloc;
-
- btree = S2BT(session);
- ckpt = ckptbase = NULL;
- dhandle = session->dhandle;
- name_alloc = NULL;
-
- /* Only referenced in diagnostic builds. */
- WT_UNUSED(is_checkpoint);
-
- /*
- * Only referenced in diagnostic builds and gcc 5.1 isn't satisfied
- * with wrapping the entire assert condition in the unused macro.
- */
- WT_UNUSED(need_tracking);
-
- /*
- * Most callers need meta tracking to be on here, otherwise it is
- * possible for this checkpoint to cleanup handles that are still in
- * use. The exceptions are:
- * - Checkpointing the metadata handle itself.
- * - On connection close when we know there can't be any races.
- */
- WT_ASSERT(session, !need_tracking ||
- WT_IS_METADATA(dhandle) || WT_META_TRACKING(session));
-
- /* Get the list of checkpoints for this file. */
- WT_RET(__wt_meta_ckptlist_get(session, dhandle->name, true, &ckptbase));
-
- /* This may be a named checkpoint, check the configuration. */
- cval.len = 0;
- if (cfg != NULL)
- WT_ERR(__wt_config_gets(session, cfg, "name", &cval));
- if (cval.len == 0)
- name = WT_CHECKPOINT;
- else {
- WT_ERR(__checkpoint_name_ok(session, cval.str, cval.len));
- WT_ERR(__wt_strndup(session, cval.str, cval.len, &name_alloc));
- name = name_alloc;
- }
-
- /* We may be dropping specific checkpoints, check the configuration. */
- if (cfg != NULL) {
- cval.len = 0;
- WT_ERR(__wt_config_gets(session, cfg, "drop", &cval));
- if (cval.len != 0) {
- __wt_config_subinit(session, &dropconf, &cval);
- while ((ret =
- __wt_config_next(&dropconf, &k, &v)) == 0) {
- /* Disallow unsafe checkpoint names. */
- if (v.len == 0)
- WT_ERR(__checkpoint_name_ok(
- session, k.str, k.len));
- else
- WT_ERR(__checkpoint_name_ok(
- session, v.str, v.len));
-
- if (v.len == 0)
- __drop(ckptbase, k.str, k.len);
- else if (WT_STRING_MATCH("from", k.str, k.len))
- __drop_from(ckptbase, v.str, v.len);
- else if (WT_STRING_MATCH("to", k.str, k.len))
- __drop_to(ckptbase, v.str, v.len);
- else
- WT_ERR_MSG(session, EINVAL,
- "unexpected value for checkpoint "
- "key: %.*s",
- (int)k.len, k.str);
- }
- WT_ERR_NOTFOUND_OK(ret);
- }
- }
-
- /* Drop checkpoints with the same name as the one we're taking. */
- __drop(ckptbase, name, strlen(name));
-
- /* Set the name of the new entry at the end of the list. */
- WT_CKPT_FOREACH(ckptbase, ckpt)
- ;
- WT_ERR(__wt_strdup(session, name, &ckpt->name));
-
- /*
- * There is some interaction between backups and checkpoints. Perform
- * all backup related operations that the checkpoint needs now, while
- * holding the hot backup read lock.
- */
- WT_WITH_HOTBACKUP_READ_LOCK_UNCOND(session,
- ret = __checkpoint_lock_dirty_tree_int(
- session, is_checkpoint, force, btree, ckpt, ckptbase));
- WT_ERR(ret);
- if (F_ISSET(btree, WT_BTREE_SKIP_CKPT))
- goto err;
-
- WT_ASSERT(session, btree->ckpt == NULL &&
- !F_ISSET(btree, WT_BTREE_SKIP_CKPT));
- btree->ckpt = ckptbase;
-
- if (0) {
+ WT_BTREE *btree;
+ WT_CKPT *ckpt, *ckptbase;
+ WT_CONFIG dropconf;
+ WT_CONFIG_ITEM cval, k, v;
+ WT_DATA_HANDLE *dhandle;
+ WT_DECL_RET;
+ char *name_alloc;
+ const char *name;
+
+ btree = S2BT(session);
+ ckpt = ckptbase = NULL;
+ dhandle = session->dhandle;
+ name_alloc = NULL;
+
+ /* Only referenced in diagnostic builds. */
+ WT_UNUSED(is_checkpoint);
+
+ /*
+ * Only referenced in diagnostic builds and gcc 5.1 isn't satisfied with wrapping the entire
+ * assert condition in the unused macro.
+ */
+ WT_UNUSED(need_tracking);
+
+ /*
+ * Most callers need meta tracking to be on here, otherwise it is
+ * possible for this checkpoint to cleanup handles that are still in
+ * use. The exceptions are:
+ * - Checkpointing the metadata handle itself.
+ * - On connection close when we know there can't be any races.
+ */
+ WT_ASSERT(session, !need_tracking || WT_IS_METADATA(dhandle) || WT_META_TRACKING(session));
+
+ /* Get the list of checkpoints for this file. */
+ WT_RET(__wt_meta_ckptlist_get(session, dhandle->name, true, &ckptbase));
+
+ /* This may be a named checkpoint, check the configuration. */
+ cval.len = 0;
+ if (cfg != NULL)
+ WT_ERR(__wt_config_gets(session, cfg, "name", &cval));
+ if (cval.len == 0)
+ name = WT_CHECKPOINT;
+ else {
+ WT_ERR(__checkpoint_name_ok(session, cval.str, cval.len));
+ WT_ERR(__wt_strndup(session, cval.str, cval.len, &name_alloc));
+ name = name_alloc;
+ }
+
+ /* We may be dropping specific checkpoints, check the configuration. */
+ if (cfg != NULL) {
+ cval.len = 0;
+ WT_ERR(__wt_config_gets(session, cfg, "drop", &cval));
+ if (cval.len != 0) {
+ __wt_config_subinit(session, &dropconf, &cval);
+ while ((ret = __wt_config_next(&dropconf, &k, &v)) == 0) {
+ /* Disallow unsafe checkpoint names. */
+ if (v.len == 0)
+ WT_ERR(__checkpoint_name_ok(session, k.str, k.len));
+ else
+ WT_ERR(__checkpoint_name_ok(session, v.str, v.len));
+
+ if (v.len == 0)
+ __drop(ckptbase, k.str, k.len);
+ else if (WT_STRING_MATCH("from", k.str, k.len))
+ __drop_from(ckptbase, v.str, v.len);
+ else if (WT_STRING_MATCH("to", k.str, k.len))
+ __drop_to(ckptbase, v.str, v.len);
+ else
+ WT_ERR_MSG(session, EINVAL,
+ "unexpected value for checkpoint "
+ "key: %.*s",
+ (int)k.len, k.str);
+ }
+ WT_ERR_NOTFOUND_OK(ret);
+ }
+ }
+
+ /* Drop checkpoints with the same name as the one we're taking. */
+ __drop(ckptbase, name, strlen(name));
+
+ /* Set the name of the new entry at the end of the list. */
+ WT_CKPT_FOREACH (ckptbase, ckpt)
+ ;
+ WT_ERR(__wt_strdup(session, name, &ckpt->name));
+
+ /*
+ * There is some interaction between backups and checkpoints. Perform all backup related
+ * operations that the checkpoint needs now, while holding the hot backup read lock.
+ */
+ WT_WITH_HOTBACKUP_READ_LOCK_UNCOND(session,
+ ret = __checkpoint_lock_dirty_tree_int(session, is_checkpoint, force, btree, ckpt, ckptbase));
+ WT_ERR(ret);
+ if (F_ISSET(btree, WT_BTREE_SKIP_CKPT))
+ goto err;
+
+ WT_ASSERT(session, btree->ckpt == NULL && !F_ISSET(btree, WT_BTREE_SKIP_CKPT));
+ btree->ckpt = ckptbase;
+
+ if (0) {
err:
- __wt_meta_ckptlist_free(session, &ckptbase);
- }
- __wt_free(session, name_alloc);
+ __wt_meta_ckptlist_free(session, &ckptbase);
+ }
+ __wt_free(session, name_alloc);
- return (ret);
+ return (ret);
}
/*
* __checkpoint_mark_skip --
- * Figure out whether the checkpoint can be skipped for a tree.
+ * Figure out whether the checkpoint can be skipped for a tree.
*/
static int
-__checkpoint_mark_skip(
- WT_SESSION_IMPL *session, WT_CKPT *ckptbase, bool force)
+__checkpoint_mark_skip(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, bool force)
{
- WT_BTREE *btree;
- WT_CKPT *ckpt;
- int deleted;
- const char *name;
-
- btree = S2BT(session);
-
- /*
- * Check for clean objects not requiring a checkpoint.
- *
- * If we're closing a handle, and the object is clean, we can skip the
- * checkpoint, whatever checkpoints we have are sufficient. (We might
- * not have any checkpoints if the object was never modified, and that's
- * OK: the object creation code doesn't mark the tree modified so we can
- * skip newly created trees here.)
- *
- * If the application repeatedly checkpoints an object (imagine hourly
- * checkpoints using the same explicit or internal name), there's no
- * reason to repeat the checkpoint for clean objects. The test is if
- * the only checkpoint we're deleting is the last one in the list and
- * it has the same name as the checkpoint we're about to take, skip the
- * work. (We can't skip checkpoints that delete more than the last
- * checkpoint because deleting those checkpoints might free up space in
- * the file.) This means an application toggling between two (or more)
- * checkpoint names will repeatedly take empty checkpoints, but that's
- * not likely enough to make detection worthwhile.
- *
- * Checkpoint read-only objects otherwise: the application must be able
- * to open the checkpoint in a cursor after taking any checkpoint, which
- * means it must exist.
- */
- F_CLR(btree, WT_BTREE_SKIP_CKPT);
- if (!btree->modified && !force) {
- deleted = 0;
- WT_CKPT_FOREACH(ckptbase, ckpt)
- if (F_ISSET(ckpt, WT_CKPT_DELETE))
- ++deleted;
-
- /*
- * Complicated test: if the tree is clean and last two
- * checkpoints have the same name (correcting for internal
- * checkpoint names with their generational suffix numbers), we
- * can skip the checkpoint, there's nothing to do. The
- * exception is if we're deleting two or more checkpoints: then
- * we may save space.
- */
- name = (ckpt - 1)->name;
- if (ckpt > ckptbase + 1 && deleted < 2 &&
- (strcmp(name, (ckpt - 2)->name) == 0 ||
- (WT_PREFIX_MATCH(name, WT_CHECKPOINT) &&
- WT_PREFIX_MATCH((ckpt - 2)->name, WT_CHECKPOINT)))) {
- F_SET(btree, WT_BTREE_SKIP_CKPT);
- return (0);
- }
- }
-
- return (0);
+ WT_BTREE *btree;
+ WT_CKPT *ckpt;
+ int deleted;
+ const char *name;
+
+ btree = S2BT(session);
+
+ /*
+ * Check for clean objects not requiring a checkpoint.
+ *
+ * If we're closing a handle, and the object is clean, we can skip the
+ * checkpoint, whatever checkpoints we have are sufficient. (We might
+ * not have any checkpoints if the object was never modified, and that's
+ * OK: the object creation code doesn't mark the tree modified so we can
+ * skip newly created trees here.)
+ *
+ * If the application repeatedly checkpoints an object (imagine hourly
+ * checkpoints using the same explicit or internal name), there's no
+ * reason to repeat the checkpoint for clean objects. The test is if
+ * the only checkpoint we're deleting is the last one in the list and
+ * it has the same name as the checkpoint we're about to take, skip the
+ * work. (We can't skip checkpoints that delete more than the last
+ * checkpoint because deleting those checkpoints might free up space in
+ * the file.) This means an application toggling between two (or more)
+ * checkpoint names will repeatedly take empty checkpoints, but that's
+ * not likely enough to make detection worthwhile.
+ *
+ * Checkpoint read-only objects otherwise: the application must be able
+ * to open the checkpoint in a cursor after taking any checkpoint, which
+ * means it must exist.
+ */
+ F_CLR(btree, WT_BTREE_SKIP_CKPT);
+ if (!btree->modified && !force) {
+ deleted = 0;
+ WT_CKPT_FOREACH (ckptbase, ckpt)
+ if (F_ISSET(ckpt, WT_CKPT_DELETE))
+ ++deleted;
+
+ /*
+ * Complicated test: if the tree is clean and last two checkpoints have the same name
+ * (correcting for internal checkpoint names with their generational suffix numbers), we can
+ * skip the checkpoint, there's nothing to do. The exception is if we're deleting two or
+ * more checkpoints: then we may save space.
+ */
+ name = (ckpt - 1)->name;
+ if (ckpt > ckptbase + 1 && deleted < 2 &&
+ (strcmp(name, (ckpt - 2)->name) == 0 ||
+ (WT_PREFIX_MATCH(name, WT_CHECKPOINT) &&
+ WT_PREFIX_MATCH((ckpt - 2)->name, WT_CHECKPOINT)))) {
+ F_SET(btree, WT_BTREE_SKIP_CKPT);
+ return (0);
+ }
+ }
+
+ return (0);
}
/*
* __wt_checkpoint_tree_reconcile_update --
- * Update a checkpoint based on reconciliation results.
+ * Update a checkpoint based on reconciliation results.
*/
void
-__wt_checkpoint_tree_reconcile_update(
- WT_SESSION_IMPL *session, wt_timestamp_t newest_durable_ts,
- wt_timestamp_t oldest_start_ts, uint64_t oldest_start_txn,
- wt_timestamp_t newest_stop_ts, uint64_t newest_stop_txn)
+__wt_checkpoint_tree_reconcile_update(WT_SESSION_IMPL *session, wt_timestamp_t newest_durable_ts,
+ wt_timestamp_t oldest_start_ts, uint64_t oldest_start_txn, wt_timestamp_t newest_stop_ts,
+ uint64_t newest_stop_txn)
{
- WT_BTREE *btree;
- WT_CKPT *ckpt, *ckptbase;
-
- btree = S2BT(session);
-
- /*
- * Reconciliation just wrote a checkpoint, everything has been written.
- * Update the checkpoint with reconciliation information. The reason
- * for this function is the reconciliation code just passes through the
- * btree structure's checkpoint array, it doesn't know any more.
- */
- ckptbase = btree->ckpt;
- WT_CKPT_FOREACH(ckptbase, ckpt)
- if (F_ISSET(ckpt, WT_CKPT_ADD)) {
- ckpt->write_gen = btree->write_gen;
- ckpt->newest_durable_ts = newest_durable_ts;
- ckpt->oldest_start_ts = oldest_start_ts;
- ckpt->oldest_start_txn = oldest_start_txn;
- ckpt->newest_stop_ts = newest_stop_ts;
- ckpt->newest_stop_txn = newest_stop_txn;
- }
+ WT_BTREE *btree;
+ WT_CKPT *ckpt, *ckptbase;
+
+ btree = S2BT(session);
+
+ /*
+ * Reconciliation just wrote a checkpoint, everything has been written. Update the checkpoint
+ * with reconciliation information. The reason for this function is the reconciliation code just
+ * passes through the btree structure's checkpoint array, it doesn't know any more.
+ */
+ ckptbase = btree->ckpt;
+ WT_CKPT_FOREACH (ckptbase, ckpt)
+ if (F_ISSET(ckpt, WT_CKPT_ADD)) {
+ ckpt->write_gen = btree->write_gen;
+ ckpt->newest_durable_ts = newest_durable_ts;
+ ckpt->oldest_start_ts = oldest_start_ts;
+ ckpt->oldest_start_txn = oldest_start_txn;
+ ckpt->newest_stop_ts = newest_stop_ts;
+ ckpt->newest_stop_txn = newest_stop_txn;
+ }
}
/*
* __checkpoint_tree --
- * Checkpoint a single tree.
- * Assumes all necessary locks have been acquired by the caller.
+ * Checkpoint a single tree. Assumes all necessary locks have been acquired by the caller.
*/
static int
-__checkpoint_tree(
- WT_SESSION_IMPL *session, bool is_checkpoint, const char *cfg[])
+__checkpoint_tree(WT_SESSION_IMPL *session, bool is_checkpoint, const char *cfg[])
{
- WT_BM *bm;
- WT_BTREE *btree;
- WT_CONNECTION_IMPL *conn;
- WT_DATA_HANDLE *dhandle;
- WT_DECL_RET;
- WT_LSN ckptlsn;
- bool fake_ckpt, resolve_bm;
-
- WT_UNUSED(cfg);
-
- btree = S2BT(session);
- bm = btree->bm;
- conn = S2C(session);
- dhandle = session->dhandle;
- fake_ckpt = resolve_bm = false;
-
- /*
- * Set the checkpoint LSN to the maximum LSN so that if logging is
- * disabled, recovery will never roll old changes forward over the
- * non-logged changes in this checkpoint. If logging is enabled, a
- * real checkpoint LSN will be assigned for this checkpoint and
- * overwrite this.
- */
- WT_MAX_LSN(&ckptlsn);
-
- /*
- * If an object has never been used (in other words, if it could become
- * a bulk-loaded file), then we must fake the checkpoint. This is good
- * because we don't write physical checkpoint blocks for just-created
- * files, but it's not just a good idea. The reason is because deleting
- * a physical checkpoint requires writing the file, and fake checkpoints
- * can't write the file. If you (1) create a physical checkpoint for an
- * empty file which writes blocks, (2) start bulk-loading records into
- * the file, (3) during the bulk-load perform another checkpoint with
- * the same name; in order to keep from having two checkpoints with the
- * same name you would have to use the bulk-load's fake checkpoint to
- * delete a physical checkpoint, and that will end in tears.
- */
- if (is_checkpoint && btree->original) {
- __wt_checkpoint_tree_reconcile_update(session,
- WT_TS_NONE, WT_TS_NONE, WT_TXN_NONE, WT_TS_MAX, WT_TXN_MAX);
-
- fake_ckpt = true;
- goto fake;
- }
-
- /*
- * Mark the root page dirty to ensure something gets written. (If the
- * tree is modified, we must write the root page anyway, this doesn't
- * add additional writes to the process. If the tree is not modified,
- * we have to dirty the root page to ensure something gets written.)
- * This is really about paranoia: if the tree modification value gets
- * out of sync with the set of dirty pages (modify is set, but there
- * are no dirty pages), we perform a checkpoint without any writes, no
- * checkpoint is created, and then things get bad.
- * While marking the root page as dirty, we do not want to dirty the
- * btree because we are marking the btree as clean just after this call.
- * Also, marking the btree dirty at this stage will unnecessarily mark
- * the connection as dirty causing checkpoint-skip code to fail.
- */
- WT_ERR(__wt_page_modify_init(session, btree->root.page));
- __wt_page_only_modify_set(session, btree->root.page);
-
- /*
- * Clear the tree's modified flag; any changes before we clear the flag
- * are guaranteed to be part of this checkpoint (unless reconciliation
- * skips updates for transactional reasons), and changes subsequent to
- * the checkpoint start, which might not be included, will re-set the
- * modified flag. The "unless reconciliation skips updates" problem is
- * handled in the reconciliation code: if reconciliation skips updates,
- * it sets the modified flag itself.
- */
- btree->modified = false;
- WT_FULL_BARRIER();
-
- /* Tell logging that a file checkpoint is starting. */
- if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED))
- WT_ERR(__wt_txn_checkpoint_log(
- session, false, WT_TXN_LOG_CKPT_START, &ckptlsn));
-
- /* Tell the block manager that a file checkpoint is starting. */
- WT_ERR(bm->checkpoint_start(bm, session));
- resolve_bm = true;
-
- /* Flush the file from the cache, creating the checkpoint. */
- if (is_checkpoint)
- WT_ERR(__wt_sync_file(session, WT_SYNC_CHECKPOINT));
- else
- WT_ERR(__wt_evict_file(session, WT_SYNC_CLOSE));
+ WT_BM *bm;
+ WT_BTREE *btree;
+ WT_CONNECTION_IMPL *conn;
+ WT_DATA_HANDLE *dhandle;
+ WT_DECL_RET;
+ WT_LSN ckptlsn;
+ bool fake_ckpt, resolve_bm;
+
+ WT_UNUSED(cfg);
+
+ btree = S2BT(session);
+ bm = btree->bm;
+ conn = S2C(session);
+ dhandle = session->dhandle;
+ fake_ckpt = resolve_bm = false;
+
+ /*
+ * Set the checkpoint LSN to the maximum LSN so that if logging is disabled, recovery will never
+ * roll old changes forward over the non-logged changes in this checkpoint. If logging is
+ * enabled, a real checkpoint LSN will be assigned for this checkpoint and overwrite this.
+ */
+ WT_MAX_LSN(&ckptlsn);
+
+ /*
+ * If an object has never been used (in other words, if it could become a bulk-loaded file),
+ * then we must fake the checkpoint. This is good because we don't write physical checkpoint
+ * blocks for just-created files, but it's not just a good idea. The reason is because deleting
+ * a physical checkpoint requires writing the file, and fake checkpoints can't write the file.
+ * If you (1) create a physical checkpoint for an empty file which writes blocks, (2) start
+ * bulk-loading records into the file, (3) during the bulk-load perform another checkpoint with
+ * the same name; in order to keep from having two checkpoints with the same name you would have
+ * to use the bulk-load's fake checkpoint to delete a physical checkpoint, and that will end in
+ * tears.
+ */
+ if (is_checkpoint && btree->original) {
+ __wt_checkpoint_tree_reconcile_update(
+ session, WT_TS_NONE, WT_TS_NONE, WT_TXN_NONE, WT_TS_MAX, WT_TXN_MAX);
+
+ fake_ckpt = true;
+ goto fake;
+ }
+
+ /*
+ * Mark the root page dirty to ensure something gets written. (If the tree is modified, we must
+ * write the root page anyway, this doesn't add additional writes to the process. If the tree is
+ * not modified, we have to dirty the root page to ensure something gets written.) This is
+ * really about paranoia: if the tree modification value gets out of sync with the set of dirty
+ * pages (modify is set, but there are no dirty pages), we perform a checkpoint without any
+ * writes, no checkpoint is created, and then things get bad. While marking the root page as
+ * dirty, we do not want to dirty the btree because we are marking the btree as clean just after
+ * this call. Also, marking the btree dirty at this stage will unnecessarily mark the connection
+ * as dirty causing checkpoint-skip code to fail.
+ */
+ WT_ERR(__wt_page_modify_init(session, btree->root.page));
+ __wt_page_only_modify_set(session, btree->root.page);
+
+ /*
+ * Clear the tree's modified flag; any changes before we clear the flag are guaranteed to be
+ * part of this checkpoint (unless reconciliation skips updates for transactional reasons), and
+ * changes subsequent to the checkpoint start, which might not be included, will re-set the
+ * modified flag. The "unless reconciliation skips updates" problem is handled in the
+ * reconciliation code: if reconciliation skips updates, it sets the modified flag itself.
+ */
+ btree->modified = false;
+ WT_FULL_BARRIER();
+
+ /* Tell logging that a file checkpoint is starting. */
+ if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED))
+ WT_ERR(__wt_txn_checkpoint_log(session, false, WT_TXN_LOG_CKPT_START, &ckptlsn));
+
+ /* Tell the block manager that a file checkpoint is starting. */
+ WT_ERR(bm->checkpoint_start(bm, session));
+ resolve_bm = true;
+
+ /* Flush the file from the cache, creating the checkpoint. */
+ if (is_checkpoint)
+ WT_ERR(__wt_sync_file(session, WT_SYNC_CHECKPOINT));
+ else
+ WT_ERR(__wt_evict_file(session, WT_SYNC_CLOSE));
fake:
- /*
- * If we're faking a checkpoint and logging is enabled, recovery should
- * roll forward any changes made between now and the next checkpoint,
- * so set the checkpoint LSN to the beginning of time.
- */
- if (fake_ckpt && FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED))
- WT_INIT_LSN(&ckptlsn);
-
- /*
- * Update the object's metadata.
- *
- * If the object is the metadata, the call to __wt_meta_ckptlist_set
- * will update the turtle file and swap the new one into place. We
- * need to make sure the metadata is on disk before the turtle file is
- * updated.
- *
- * If we are doing a checkpoint in a file without a transaction (e.g.,
- * closing a dirty tree before an exclusive operation like verify),
- * the metadata update will be auto-committed. In that case, we need to
- * sync the file here or we could roll forward the metadata in
- * recovery and open a checkpoint that isn't yet durable.
- */
- if (WT_IS_METADATA(dhandle) ||
- !F_ISSET(&session->txn, WT_TXN_RUNNING))
- WT_ERR(__wt_checkpoint_sync(session, NULL));
-
- WT_ERR(__wt_meta_ckptlist_set(
- session, dhandle->name, btree->ckpt, &ckptlsn));
-
- /*
- * If we wrote a checkpoint (rather than faking one), we have to resolve
- * it. Normally, tracking is enabled and resolution deferred until
- * transaction end. The exception is if the handle is being discarded,
- * in which case the handle will be gone by the time we try to apply or
- * unroll the meta tracking event.
- */
- if (!fake_ckpt) {
- resolve_bm = false;
- if (WT_META_TRACKING(session) && is_checkpoint)
- WT_ERR(__wt_meta_track_checkpoint(session));
- else
- WT_ERR(bm->checkpoint_resolve(bm, session, false));
- }
-
- /* Tell logging that the checkpoint is complete. */
- if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED))
- WT_ERR(__wt_txn_checkpoint_log(
- session, false, WT_TXN_LOG_CKPT_STOP, NULL));
+ /*
+ * If we're faking a checkpoint and logging is enabled, recovery should roll forward any changes
+ * made between now and the next checkpoint, so set the checkpoint LSN to the beginning of time.
+ */
+ if (fake_ckpt && FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED))
+ WT_INIT_LSN(&ckptlsn);
+
+ /*
+ * Update the object's metadata.
+ *
+ * If the object is the metadata, the call to __wt_meta_ckptlist_set
+ * will update the turtle file and swap the new one into place. We
+ * need to make sure the metadata is on disk before the turtle file is
+ * updated.
+ *
+ * If we are doing a checkpoint in a file without a transaction (e.g.,
+ * closing a dirty tree before an exclusive operation like verify),
+ * the metadata update will be auto-committed. In that case, we need to
+ * sync the file here or we could roll forward the metadata in
+ * recovery and open a checkpoint that isn't yet durable.
+ */
+ if (WT_IS_METADATA(dhandle) || !F_ISSET(&session->txn, WT_TXN_RUNNING))
+ WT_ERR(__wt_checkpoint_sync(session, NULL));
+
+ WT_ERR(__wt_meta_ckptlist_set(session, dhandle->name, btree->ckpt, &ckptlsn));
+
+ /*
+ * If we wrote a checkpoint (rather than faking one), we have to resolve it. Normally, tracking
+ * is enabled and resolution deferred until transaction end. The exception is if the handle is
+ * being discarded, in which case the handle will be gone by the time we try to apply or unroll
+ * the meta tracking event.
+ */
+ if (!fake_ckpt) {
+ resolve_bm = false;
+ if (WT_META_TRACKING(session) && is_checkpoint)
+ WT_ERR(__wt_meta_track_checkpoint(session));
+ else
+ WT_ERR(bm->checkpoint_resolve(bm, session, false));
+ }
+
+ /* Tell logging that the checkpoint is complete. */
+ if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED))
+ WT_ERR(__wt_txn_checkpoint_log(session, false, WT_TXN_LOG_CKPT_STOP, NULL));
err:
- /* Resolved the checkpoint for the block manager in the error path. */
- if (resolve_bm)
- WT_TRET(bm->checkpoint_resolve(bm, session, ret != 0));
-
- /*
- * If the checkpoint didn't complete successfully, make sure the
- * tree is marked dirty.
- */
- if (ret != 0) {
- btree->modified = true;
- conn->modified = true;
- }
-
- __wt_meta_ckptlist_free(session, &btree->ckpt);
-
- return (ret);
+    /* Resolve the checkpoint for the block manager on the error path. */
+ if (resolve_bm)
+ WT_TRET(bm->checkpoint_resolve(bm, session, ret != 0));
+
+ /*
+ * If the checkpoint didn't complete successfully, make sure the tree is marked dirty.
+ */
+ if (ret != 0) {
+ btree->modified = true;
+ conn->modified = true;
+ }
+
+ __wt_meta_ckptlist_free(session, &btree->ckpt);
+
+ return (ret);
}
/*
* __checkpoint_presync --
- * Visit all handles after the checkpoint writes are complete and before
- * syncing. At this point, all trees should be completely open for
- * business.
+ * Visit all handles after the checkpoint writes are complete and before syncing. At this point,
+ * all trees should be completely open for business.
*/
static int
__checkpoint_presync(WT_SESSION_IMPL *session, const char *cfg[])
{
- WT_BTREE *btree;
+ WT_BTREE *btree;
- WT_UNUSED(cfg);
+ WT_UNUSED(cfg);
- btree = S2BT(session);
- WT_ASSERT(session,
- btree->checkpoint_gen == __wt_gen(session, WT_GEN_CHECKPOINT));
- btree->evict_walk_period = btree->evict_walk_saved;
- return (0);
+ btree = S2BT(session);
+ WT_ASSERT(session, btree->checkpoint_gen == __wt_gen(session, WT_GEN_CHECKPOINT));
+ btree->evict_walk_period = btree->evict_walk_saved;
+ return (0);
}
/*
* __checkpoint_tree_helper --
- * Checkpoint a tree (suitable for use in *_apply functions).
+ * Checkpoint a tree (suitable for use in *_apply functions).
*/
static int
__checkpoint_tree_helper(WT_SESSION_IMPL *session, const char *cfg[])
{
- WT_BTREE *btree;
- WT_DECL_RET;
- WT_TXN *txn;
- bool with_timestamp;
-
- btree = S2BT(session);
- txn = &session->txn;
-
- /* Are we using a read timestamp for this checkpoint transaction? */
- with_timestamp = F_ISSET(txn, WT_TXN_HAS_TS_READ);
-
- /*
- * For tables with immediate durability (indicated by having logging
- * enabled), ignore any read timestamp configured for the checkpoint.
- */
- if (__wt_btree_immediately_durable(session))
- F_CLR(txn, WT_TXN_HAS_TS_READ);
-
- ret = __checkpoint_tree(session, true, cfg);
-
- /* Restore the use of the timestamp for other tables. */
- if (with_timestamp)
- F_SET(txn, WT_TXN_HAS_TS_READ);
-
- /*
- * Whatever happened, we aren't visiting this tree again in this
- * checkpoint. Don't keep updates pinned any longer.
- */
- __checkpoint_update_generation(session);
-
- /*
- * In case this tree was being skipped by the eviction server
- * during the checkpoint, restore the previous state.
- */
- btree->evict_walk_period = btree->evict_walk_saved;
-
- /*
- * Wake the eviction server, in case application threads have
- * stalled while the eviction server decided it couldn't make
- * progress. Without this, application threads will be stalled
- * until the eviction server next wakes.
- */
- __wt_evict_server_wake(session);
-
- return (ret);
+ WT_BTREE *btree;
+ WT_DECL_RET;
+ WT_TXN *txn;
+ bool with_timestamp;
+
+ btree = S2BT(session);
+ txn = &session->txn;
+
+ /* Are we using a read timestamp for this checkpoint transaction? */
+ with_timestamp = F_ISSET(txn, WT_TXN_HAS_TS_READ);
+
+ /*
+ * For tables with immediate durability (indicated by having logging enabled), ignore any read
+ * timestamp configured for the checkpoint.
+ */
+ if (__wt_btree_immediately_durable(session))
+ F_CLR(txn, WT_TXN_HAS_TS_READ);
+
+ ret = __checkpoint_tree(session, true, cfg);
+
+ /* Restore the use of the timestamp for other tables. */
+ if (with_timestamp)
+ F_SET(txn, WT_TXN_HAS_TS_READ);
+
+ /*
+ * Whatever happened, we aren't visiting this tree again in this checkpoint. Don't keep updates
+ * pinned any longer.
+ */
+ __checkpoint_update_generation(session);
+
+ /*
+ * In case this tree was being skipped by the eviction server during the checkpoint, restore the
+ * previous state.
+ */
+ btree->evict_walk_period = btree->evict_walk_saved;
+
+ /*
+ * Wake the eviction server, in case application threads have stalled while the eviction server
+ * decided it couldn't make progress. Without this, application threads will be stalled until
+ * the eviction server next wakes.
+ */
+ __wt_evict_server_wake(session);
+
+ return (ret);
}
/*
* __wt_checkpoint --
- * Checkpoint a file.
+ * Checkpoint a file.
*/
int
__wt_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
{
- WT_CONFIG_ITEM cval;
- WT_DECL_RET;
- bool force;
-
- /* Should not be called with a checkpoint handle. */
- WT_ASSERT(session, session->dhandle->checkpoint == NULL);
-
- /* We must hold the metadata lock if checkpointing the metadata. */
- WT_ASSERT(session, !WT_IS_METADATA(session->dhandle) ||
- F_ISSET(session, WT_SESSION_LOCKED_METADATA));
-
- WT_RET(__wt_config_gets_def(session, cfg, "force", 0, &cval));
- force = cval.val != 0;
- WT_SAVE_DHANDLE(session, ret = __checkpoint_lock_dirty_tree(
- session, true, force, true, cfg));
- WT_RET(ret);
- if (F_ISSET(S2BT(session), WT_BTREE_SKIP_CKPT))
- return (0);
- return (__checkpoint_tree(session, true, cfg));
+ WT_CONFIG_ITEM cval;
+ WT_DECL_RET;
+ bool force;
+
+ /* Should not be called with a checkpoint handle. */
+ WT_ASSERT(session, session->dhandle->checkpoint == NULL);
+
+ /* We must hold the metadata lock if checkpointing the metadata. */
+ WT_ASSERT(
+ session, !WT_IS_METADATA(session->dhandle) || F_ISSET(session, WT_SESSION_LOCKED_METADATA));
+
+ WT_RET(__wt_config_gets_def(session, cfg, "force", 0, &cval));
+ force = cval.val != 0;
+ WT_SAVE_DHANDLE(session, ret = __checkpoint_lock_dirty_tree(session, true, force, true, cfg));
+ WT_RET(ret);
+ if (F_ISSET(S2BT(session), WT_BTREE_SKIP_CKPT))
+ return (0);
+ return (__checkpoint_tree(session, true, cfg));
}
/*
* __wt_checkpoint_sync --
- * Sync a file that has been checkpointed, and wait for the result.
+ * Sync a file that has been checkpointed, and wait for the result.
*/
int
__wt_checkpoint_sync(WT_SESSION_IMPL *session, const char *cfg[])
{
- WT_BM *bm;
+ WT_BM *bm;
- WT_UNUSED(cfg);
+ WT_UNUSED(cfg);
- bm = S2BT(session)->bm;
+ bm = S2BT(session)->bm;
- /* Should not be called with a checkpoint handle. */
- WT_ASSERT(session, session->dhandle->checkpoint == NULL);
+ /* Should not be called with a checkpoint handle. */
+ WT_ASSERT(session, session->dhandle->checkpoint == NULL);
- /* Unnecessary if checkpoint_sync has been configured "off". */
- if (!F_ISSET(S2C(session), WT_CONN_CKPT_SYNC))
- return (0);
+ /* Unnecessary if checkpoint_sync has been configured "off". */
+ if (!F_ISSET(S2C(session), WT_CONN_CKPT_SYNC))
+ return (0);
- return (bm->sync(bm, session, true));
+ return (bm->sync(bm, session, true));
}
/*
* __wt_checkpoint_close --
- * Checkpoint a single file as part of closing the handle.
+ * Checkpoint a single file as part of closing the handle.
*/
int
__wt_checkpoint_close(WT_SESSION_IMPL *session, bool final)
{
- WT_BTREE *btree;
- WT_DECL_RET;
- bool bulk, need_tracking;
-
- btree = S2BT(session);
- bulk = F_ISSET(btree, WT_BTREE_BULK);
-
- /*
- * We've done the final checkpoint before the final close, subsequent
- * writes to normal objects are wasted effort. Discard the objects to
- * validate exit accounting.
- */
- if (final && !WT_IS_METADATA(session->dhandle))
- return (__wt_evict_file(session, WT_SYNC_DISCARD));
-
- /*
- * If closing an unmodified file, check that no update is required
- * for active readers.
- */
- if (!btree->modified && !bulk) {
- WT_RET(__wt_txn_update_oldest(
- session, WT_TXN_OLDEST_STRICT | WT_TXN_OLDEST_WAIT));
- return (__wt_txn_visible_all(session, btree->rec_max_txn,
- btree->rec_max_timestamp) ?
- __wt_evict_file(session, WT_SYNC_DISCARD) : EBUSY);
- }
-
- /*
- * Don't flush data from trees when there is a stable timestamp set:
- * that can lead to files that are inconsistent on disk after a crash.
- */
- if (btree->modified && !bulk &&
- S2C(session)->txn_global.has_stable_timestamp &&
- !__wt_btree_immediately_durable(session))
- return (__wt_set_return(session, EBUSY));
-
- /*
- * Turn on metadata tracking if:
- * - The session is not already doing metadata tracking.
- * - The file was not bulk loaded.
- * - The close is not during connection close.
- */
- need_tracking = !WT_META_TRACKING(session) && !bulk && !final;
-
- if (need_tracking)
- WT_RET(__wt_meta_track_on(session));
-
- WT_SAVE_DHANDLE(session, ret = __checkpoint_lock_dirty_tree(
- session, false, false, need_tracking, NULL));
- WT_ASSERT(session, ret == 0);
- if (ret == 0 && !F_ISSET(btree, WT_BTREE_SKIP_CKPT))
- ret = __checkpoint_tree(session, false, NULL);
-
- if (need_tracking)
- WT_TRET(__wt_meta_track_off(session, true, ret != 0));
-
- return (ret);
+ WT_BTREE *btree;
+ WT_DECL_RET;
+ bool bulk, need_tracking;
+
+ btree = S2BT(session);
+ bulk = F_ISSET(btree, WT_BTREE_BULK);
+
+ /*
+ * We've done the final checkpoint before the final close, subsequent writes to normal objects
+ * are wasted effort. Discard the objects to validate exit accounting.
+ */
+ if (final && !WT_IS_METADATA(session->dhandle))
+ return (__wt_evict_file(session, WT_SYNC_DISCARD));
+
+ /*
+ * If closing an unmodified file, check that no update is required for active readers.
+ */
+ if (!btree->modified && !bulk) {
+ WT_RET(__wt_txn_update_oldest(session, WT_TXN_OLDEST_STRICT | WT_TXN_OLDEST_WAIT));
+ return (__wt_txn_visible_all(session, btree->rec_max_txn, btree->rec_max_timestamp) ?
+ __wt_evict_file(session, WT_SYNC_DISCARD) :
+ EBUSY);
+ }
+
+ /*
+ * Don't flush data from trees when there is a stable timestamp set: that can lead to files that
+ * are inconsistent on disk after a crash.
+ */
+ if (btree->modified && !bulk && S2C(session)->txn_global.has_stable_timestamp &&
+ !__wt_btree_immediately_durable(session))
+ return (__wt_set_return(session, EBUSY));
+
+ /*
+ * Turn on metadata tracking if:
+ * - The session is not already doing metadata tracking.
+ * - The file was not bulk loaded.
+ * - The close is not during connection close.
+ */
+ need_tracking = !WT_META_TRACKING(session) && !bulk && !final;
+
+ if (need_tracking)
+ WT_RET(__wt_meta_track_on(session));
+
+ WT_SAVE_DHANDLE(
+ session, ret = __checkpoint_lock_dirty_tree(session, false, false, need_tracking, NULL));
+ WT_ASSERT(session, ret == 0);
+ if (ret == 0 && !F_ISSET(btree, WT_BTREE_SKIP_CKPT))
+ ret = __checkpoint_tree(session, false, NULL);
+
+ if (need_tracking)
+ WT_TRET(__wt_meta_track_off(session, true, ret != 0));
+
+ return (ret);
}
/*
* __checkpoint_timing_stress --
- * Optionally add a 10 second delay to a checkpoint to simulate a long
- * running checkpoint for debug purposes. The reason for this option is
- * finding operations that can block while waiting for a checkpoint to
- * complete.
+ * Optionally add a 10 second delay to a checkpoint to simulate a long running checkpoint for
+ * debug purposes. The reason for this option is finding operations that can block while waiting
+ * for a checkpoint to complete.
*/
static void
__checkpoint_timing_stress(WT_SESSION_IMPL *session)
{
- WT_CONNECTION_IMPL *conn;
-
- conn = S2C(session);
-
- /*
- * We only want to sleep if the flag is set and the checkpoint comes
- * from the API, so check if the session used is either of the two
- * sessions set aside for internal checkpoints.
- */
- if (conn->ckpt_session != session &&
- conn->meta_ckpt_session != session &&
- FLD_ISSET(conn->timing_stress_flags,
- WT_TIMING_STRESS_CHECKPOINT_SLOW))
- __wt_sleep(10, 0);
+ WT_CONNECTION_IMPL *conn;
+
+ conn = S2C(session);
+
+ /*
+ * We only want to sleep if the flag is set and the checkpoint comes from the API, so check if
+ * the session used is either of the two sessions set aside for internal checkpoints.
+ */
+ if (conn->ckpt_session != session && conn->meta_ckpt_session != session &&
+ FLD_ISSET(conn->timing_stress_flags, WT_TIMING_STRESS_CHECKPOINT_SLOW))
+ __wt_sleep(10, 0);
}
diff --git a/src/third_party/wiredtiger/src/txn/txn_ext.c b/src/third_party/wiredtiger/src/txn/txn_ext.c
index 1f42ab5eb43..43d9c380eb5 100644
--- a/src/third_party/wiredtiger/src/txn/txn_ext.c
+++ b/src/third_party/wiredtiger/src/txn/txn_ext.c
@@ -10,97 +10,90 @@
/*
* __wt_ext_transaction_id --
- * Return the session's transaction ID.
+ * Return the session's transaction ID.
*/
uint64_t
__wt_ext_transaction_id(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session)
{
- WT_SESSION_IMPL *session;
+ WT_SESSION_IMPL *session;
- (void)wt_api; /* Unused parameters */
- session = (WT_SESSION_IMPL *)wt_session;
- /* Ignore failures: the only case is running out of transaction IDs. */
- WT_IGNORE_RET(__wt_txn_id_check(session));
- return (session->txn.id);
+ (void)wt_api; /* Unused parameters */
+ session = (WT_SESSION_IMPL *)wt_session;
+ /* Ignore failures: the only case is running out of transaction IDs. */
+ WT_IGNORE_RET(__wt_txn_id_check(session));
+ return (session->txn.id);
}
/*
* __wt_ext_transaction_isolation_level --
- * Return if the current transaction's isolation level.
+ * Return if the current transaction's isolation level.
*/
int
-__wt_ext_transaction_isolation_level(
- WT_EXTENSION_API *wt_api, WT_SESSION *wt_session)
+__wt_ext_transaction_isolation_level(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session)
{
- WT_SESSION_IMPL *session;
- WT_TXN *txn;
+ WT_SESSION_IMPL *session;
+ WT_TXN *txn;
- (void)wt_api; /* Unused parameters */
+ (void)wt_api; /* Unused parameters */
- session = (WT_SESSION_IMPL *)wt_session;
- txn = &session->txn;
+ session = (WT_SESSION_IMPL *)wt_session;
+ txn = &session->txn;
- if (txn->isolation == WT_ISO_READ_COMMITTED)
- return (WT_TXN_ISO_READ_COMMITTED);
- if (txn->isolation == WT_ISO_READ_UNCOMMITTED)
- return (WT_TXN_ISO_READ_UNCOMMITTED);
- return (WT_TXN_ISO_SNAPSHOT);
+ if (txn->isolation == WT_ISO_READ_COMMITTED)
+ return (WT_TXN_ISO_READ_COMMITTED);
+ if (txn->isolation == WT_ISO_READ_UNCOMMITTED)
+ return (WT_TXN_ISO_READ_UNCOMMITTED);
+ return (WT_TXN_ISO_SNAPSHOT);
}
/*
* __wt_ext_transaction_notify --
- * Request notification of transaction resolution.
+ * Request notification of transaction resolution.
*/
int
-__wt_ext_transaction_notify(
- WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, WT_TXN_NOTIFY *notify)
+__wt_ext_transaction_notify(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, WT_TXN_NOTIFY *notify)
{
- WT_SESSION_IMPL *session;
- WT_TXN *txn;
+ WT_SESSION_IMPL *session;
+ WT_TXN *txn;
- (void)wt_api; /* Unused parameters */
+ (void)wt_api; /* Unused parameters */
- session = (WT_SESSION_IMPL *)wt_session;
- txn = &session->txn;
+ session = (WT_SESSION_IMPL *)wt_session;
+ txn = &session->txn;
- /*
- * XXX
- * For now, a single slot for notifications: I'm not bothering with
- * more than one because more than one data-source in a transaction
- * doesn't work anyway.
- */
- if (txn->notify == notify)
- return (0);
- if (txn->notify != NULL)
- WT_RET_MSG(
- session, WT_ERROR, "transaction notify already scheduled");
+ /*
+ * XXX For now, a single slot for notifications: I'm not bothering with more than one because
+ * more than one data-source in a transaction doesn't work anyway.
+ */
+ if (txn->notify == notify)
+ return (0);
+ if (txn->notify != NULL)
+ WT_RET_MSG(session, WT_ERROR, "transaction notify already scheduled");
- txn->notify = notify;
+ txn->notify = notify;
- return (0);
+ return (0);
}
/*
* __wt_ext_transaction_oldest --
- * Return the oldest transaction ID not yet visible to a running
- * transaction.
+ * Return the oldest transaction ID not yet visible to a running transaction.
*/
uint64_t
__wt_ext_transaction_oldest(WT_EXTENSION_API *wt_api)
{
- return (((WT_CONNECTION_IMPL *)wt_api->conn)->txn_global.oldest_id);
+ return (((WT_CONNECTION_IMPL *)wt_api->conn)->txn_global.oldest_id);
}
/*
* __wt_ext_transaction_visible --
- * Return if the current transaction can see the given transaction ID.
+ * Return if the current transaction can see the given transaction ID.
*/
int
__wt_ext_transaction_visible(
- WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, uint64_t transaction_id)
+ WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, uint64_t transaction_id)
{
- (void)wt_api; /* Unused parameters */
+ (void)wt_api; /* Unused parameters */
- return (__wt_txn_visible(
- (WT_SESSION_IMPL *)wt_session, transaction_id, WT_TS_NONE));
+ return (__wt_txn_visible((WT_SESSION_IMPL *)wt_session, transaction_id, WT_TS_NONE));
}
diff --git a/src/third_party/wiredtiger/src/txn/txn_log.c b/src/third_party/wiredtiger/src/txn/txn_log.c
index 07a1b1152cb..f74f0d45562 100644
--- a/src/third_party/wiredtiger/src/txn/txn_log.c
+++ b/src/third_party/wiredtiger/src/txn/txn_log.c
@@ -11,769 +11,724 @@
#ifdef HAVE_DIAGNOSTIC
/*
* __txn_op_log_row_key_check --
- * Confirm the cursor references the correct key.
+ * Confirm the cursor references the correct key.
*/
static void
__txn_op_log_row_key_check(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
{
- WT_CURSOR *cursor;
- WT_ITEM key;
- WT_PAGE *page;
- WT_ROW *rip;
- int cmp;
-
- cursor = &cbt->iface;
- WT_ASSERT(session, F_ISSET(cursor, WT_CURSTD_KEY_SET));
-
- memset(&key, 0, sizeof(key));
-
- /*
- * We used to take the row-store logging key from the page referenced by
- * the cursor, then switched to taking it from the cursor itself. Check
- * they are the same.
- *
- * If the cursor references a WT_INSERT item, take the key from there,
- * else take the key from the original page.
- */
- if (cbt->ins == NULL) {
- session = (WT_SESSION_IMPL *)cbt->iface.session;
- page = cbt->ref->page;
- WT_ASSERT(session, cbt->slot < page->entries);
- rip = &page->pg_row[cbt->slot];
- WT_ASSERT(session,
- __wt_row_leaf_key(session, page, rip, &key, false) == 0);
- } else {
- key.data = WT_INSERT_KEY(cbt->ins);
- key.size = WT_INSERT_KEY_SIZE(cbt->ins);
- }
-
- WT_ASSERT(session, __wt_compare(
- session, cbt->btree->collator, &key, &cursor->key, &cmp) == 0);
- WT_ASSERT(session, cmp == 0);
-
- __wt_buf_free(session, &key);
+ WT_CURSOR *cursor;
+ WT_ITEM key;
+ WT_PAGE *page;
+ WT_ROW *rip;
+ int cmp;
+
+ cursor = &cbt->iface;
+ WT_ASSERT(session, F_ISSET(cursor, WT_CURSTD_KEY_SET));
+
+ memset(&key, 0, sizeof(key));
+
+ /*
+ * We used to take the row-store logging key from the page referenced by
+ * the cursor, then switched to taking it from the cursor itself. Check
+ * they are the same.
+ *
+ * If the cursor references a WT_INSERT item, take the key from there,
+ * else take the key from the original page.
+ */
+ if (cbt->ins == NULL) {
+ session = (WT_SESSION_IMPL *)cbt->iface.session;
+ page = cbt->ref->page;
+ WT_ASSERT(session, cbt->slot < page->entries);
+ rip = &page->pg_row[cbt->slot];
+ WT_ASSERT(session, __wt_row_leaf_key(session, page, rip, &key, false) == 0);
+ } else {
+ key.data = WT_INSERT_KEY(cbt->ins);
+ key.size = WT_INSERT_KEY_SIZE(cbt->ins);
+ }
+
+ WT_ASSERT(session, __wt_compare(session, cbt->btree->collator, &key, &cursor->key, &cmp) == 0);
+ WT_ASSERT(session, cmp == 0);
+
+ __wt_buf_free(session, &key);
}
#endif
/*
* __txn_op_log --
- * Log an operation for the current transaction.
+ * Log an operation for the current transaction.
*/
static int
-__txn_op_log(WT_SESSION_IMPL *session, WT_ITEM *logrec,
- WT_TXN_OP *op, WT_CURSOR_BTREE *cbt, uint32_t fileid)
+__txn_op_log(
+ WT_SESSION_IMPL *session, WT_ITEM *logrec, WT_TXN_OP *op, WT_CURSOR_BTREE *cbt, uint32_t fileid)
{
- WT_CURSOR *cursor;
- WT_ITEM value;
- WT_UPDATE *upd;
- uint64_t recno;
-
- cursor = &cbt->iface;
- upd = op->u.op_upd;
- value.data = upd->data;
- value.size = upd->size;
-
- /*
- * Log the row- or column-store insert, modify, remove or update. Our
- * caller doesn't log reserve operations, we shouldn't see them here.
- */
- if (cbt->btree->type == BTREE_ROW) {
+ WT_CURSOR *cursor;
+ WT_ITEM value;
+ WT_UPDATE *upd;
+ uint64_t recno;
+
+ cursor = &cbt->iface;
+ upd = op->u.op_upd;
+ value.data = upd->data;
+ value.size = upd->size;
+
+ /*
+ * Log the row- or column-store insert, modify, remove or update. Our caller doesn't log reserve
+ * operations, we shouldn't see them here.
+ */
+ if (cbt->btree->type == BTREE_ROW) {
#ifdef HAVE_DIAGNOSTIC
- __txn_op_log_row_key_check(session, cbt);
+ __txn_op_log_row_key_check(session, cbt);
#endif
- switch (upd->type) {
- case WT_UPDATE_MODIFY:
- WT_RET(__wt_logop_row_modify_pack(
- session, logrec, fileid, &cursor->key, &value));
- break;
- case WT_UPDATE_STANDARD:
- WT_RET(__wt_logop_row_put_pack(
- session, logrec, fileid, &cursor->key, &value));
- break;
- case WT_UPDATE_TOMBSTONE:
- WT_RET(__wt_logop_row_remove_pack(
- session, logrec, fileid, &cursor->key));
- break;
- default:
- return (__wt_illegal_value(session, upd->type));
- }
- } else {
- recno = WT_INSERT_RECNO(cbt->ins);
- WT_ASSERT(session, recno != WT_RECNO_OOB);
-
- switch (upd->type) {
- case WT_UPDATE_MODIFY:
- WT_RET(__wt_logop_col_modify_pack(
- session, logrec, fileid, recno, &value));
- break;
- case WT_UPDATE_STANDARD:
- WT_RET(__wt_logop_col_put_pack(
- session, logrec, fileid, recno, &value));
- break;
- case WT_UPDATE_TOMBSTONE:
- WT_RET(__wt_logop_col_remove_pack(
- session, logrec, fileid, recno));
- break;
- default:
- return (__wt_illegal_value(session, upd->type));
- }
- }
-
- return (0);
+ switch (upd->type) {
+ case WT_UPDATE_MODIFY:
+ WT_RET(__wt_logop_row_modify_pack(session, logrec, fileid, &cursor->key, &value));
+ break;
+ case WT_UPDATE_STANDARD:
+ WT_RET(__wt_logop_row_put_pack(session, logrec, fileid, &cursor->key, &value));
+ break;
+ case WT_UPDATE_TOMBSTONE:
+ WT_RET(__wt_logop_row_remove_pack(session, logrec, fileid, &cursor->key));
+ break;
+ default:
+ return (__wt_illegal_value(session, upd->type));
+ }
+ } else {
+ recno = WT_INSERT_RECNO(cbt->ins);
+ WT_ASSERT(session, recno != WT_RECNO_OOB);
+
+ switch (upd->type) {
+ case WT_UPDATE_MODIFY:
+ WT_RET(__wt_logop_col_modify_pack(session, logrec, fileid, recno, &value));
+ break;
+ case WT_UPDATE_STANDARD:
+ WT_RET(__wt_logop_col_put_pack(session, logrec, fileid, recno, &value));
+ break;
+ case WT_UPDATE_TOMBSTONE:
+ WT_RET(__wt_logop_col_remove_pack(session, logrec, fileid, recno));
+ break;
+ default:
+ return (__wt_illegal_value(session, upd->type));
+ }
+ }
+
+ return (0);
}
/*
* __txn_oplist_printlog --
- * Print a list of operations from a log record.
+ * Print a list of operations from a log record.
*/
static int
-__txn_oplist_printlog(WT_SESSION_IMPL *session,
- const uint8_t **pp, const uint8_t *end, WT_TXN_PRINTLOG_ARGS *args)
+__txn_oplist_printlog(
+ WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, WT_TXN_PRINTLOG_ARGS *args)
{
- bool firstrecord;
+ bool firstrecord;
- firstrecord = true;
- WT_RET(__wt_fprintf(session, args->fs, " \"ops\": [\n"));
+ firstrecord = true;
+ WT_RET(__wt_fprintf(session, args->fs, " \"ops\": [\n"));
- /* The logging subsystem zero-pads records. */
- while (*pp < end && **pp) {
- if (!firstrecord)
- WT_RET(__wt_fprintf(session, args->fs, ",\n"));
- WT_RET(__wt_fprintf(session, args->fs, " {"));
+ /* The logging subsystem zero-pads records. */
+ while (*pp < end && **pp) {
+ if (!firstrecord)
+ WT_RET(__wt_fprintf(session, args->fs, ",\n"));
+ WT_RET(__wt_fprintf(session, args->fs, " {"));
- firstrecord = false;
+ firstrecord = false;
- WT_RET(__wt_txn_op_printlog(session, pp, end, args));
- WT_RET(__wt_fprintf(session, args->fs, "\n }"));
- }
+ WT_RET(__wt_txn_op_printlog(session, pp, end, args));
+ WT_RET(__wt_fprintf(session, args->fs, "\n }"));
+ }
- WT_RET(__wt_fprintf(session, args->fs, "\n ]\n"));
+ WT_RET(__wt_fprintf(session, args->fs, "\n ]\n"));
- return (0);
+ return (0);
}
/*
* __wt_txn_op_free --
- * Free memory associated with a transactional operation.
+ * Free memory associated with a transactional operation.
*/
void
__wt_txn_op_free(WT_SESSION_IMPL *session, WT_TXN_OP *op)
{
- switch (op->type) {
- case WT_TXN_OP_NONE:
- /*
- * The free function can be called more than once: when there's
- * no operation, a free is unnecessary or has already been done.
- */
- return;
- case WT_TXN_OP_BASIC_COL:
- case WT_TXN_OP_INMEM_COL:
- case WT_TXN_OP_REF_DELETE:
- case WT_TXN_OP_TRUNCATE_COL:
- break;
-
- case WT_TXN_OP_BASIC_ROW:
- case WT_TXN_OP_INMEM_ROW:
- __wt_buf_free(session, &op->u.op_row.key);
- break;
-
- case WT_TXN_OP_TRUNCATE_ROW:
- __wt_buf_free(session, &op->u.truncate_row.start);
- __wt_buf_free(session, &op->u.truncate_row.stop);
- break;
- }
-
- (void)__wt_atomic_subi32(&op->btree->dhandle->session_inuse, 1);
-
- op->type = WT_TXN_OP_NONE;
- op->flags = 0;
+ switch (op->type) {
+ case WT_TXN_OP_NONE:
+ /*
+ * The free function can be called more than once: when there's no operation, a free is
+ * unnecessary or has already been done.
+ */
+ return;
+ case WT_TXN_OP_BASIC_COL:
+ case WT_TXN_OP_INMEM_COL:
+ case WT_TXN_OP_REF_DELETE:
+ case WT_TXN_OP_TRUNCATE_COL:
+ break;
+
+ case WT_TXN_OP_BASIC_ROW:
+ case WT_TXN_OP_INMEM_ROW:
+ __wt_buf_free(session, &op->u.op_row.key);
+ break;
+
+ case WT_TXN_OP_TRUNCATE_ROW:
+ __wt_buf_free(session, &op->u.truncate_row.start);
+ __wt_buf_free(session, &op->u.truncate_row.stop);
+ break;
+ }
+
+ (void)__wt_atomic_subi32(&op->btree->dhandle->session_inuse, 1);
+
+ op->type = WT_TXN_OP_NONE;
+ op->flags = 0;
}
/*
* __txn_logrec_init --
- * Allocate and initialize a buffer for a transaction's log records.
+ * Allocate and initialize a buffer for a transaction's log records.
*/
static int
__txn_logrec_init(WT_SESSION_IMPL *session)
{
- WT_DECL_ITEM(logrec);
- WT_DECL_RET;
- WT_TXN *txn;
- size_t header_size;
- uint32_t rectype;
- const char *fmt;
-
- txn = &session->txn;
- rectype = WT_LOGREC_COMMIT;
- fmt = WT_UNCHECKED_STRING(Iq);
-
- if (txn->logrec != NULL)
- return (0);
-
- /*
- * The only way we should ever get in here without a txn id is if we
- * are recording diagnostic information. In that case, allocate an id.
- */
- if (FLD_ISSET(S2C(session)->log_flags, WT_CONN_LOG_DEBUG_MODE) &&
- txn->id == WT_TXN_NONE)
- WT_RET(__wt_txn_id_check(session));
- else
- WT_ASSERT(session, txn->id != WT_TXN_NONE);
-
- WT_RET(__wt_struct_size(session, &header_size, fmt, rectype, txn->id));
- WT_RET(__wt_logrec_alloc(session, header_size, &logrec));
-
- WT_ERR(__wt_struct_pack(session,
- (uint8_t *)logrec->data + logrec->size, header_size,
- fmt, rectype, txn->id));
- logrec->size += (uint32_t)header_size;
- txn->logrec = logrec;
-
- if (0) {
-err: __wt_logrec_free(session, &logrec);
- }
- return (ret);
+ WT_DECL_ITEM(logrec);
+ WT_DECL_RET;
+ WT_TXN *txn;
+ size_t header_size;
+ uint32_t rectype;
+ const char *fmt;
+
+ txn = &session->txn;
+ rectype = WT_LOGREC_COMMIT;
+ fmt = WT_UNCHECKED_STRING(Iq);
+
+ if (txn->logrec != NULL) {
+ WT_ASSERT(session, F_ISSET(txn, WT_TXN_HAS_ID));
+ return (0);
+ }
+
+ /*
+ * The only way we should ever get in here without a txn id is if we are recording diagnostic
+ * information. In that case, allocate an id.
+ */
+ if (FLD_ISSET(S2C(session)->log_flags, WT_CONN_LOG_DEBUG_MODE) && txn->id == WT_TXN_NONE)
+ WT_RET(__wt_txn_id_check(session));
+ else
+ WT_ASSERT(session, txn->id != WT_TXN_NONE);
+
+ WT_RET(__wt_struct_size(session, &header_size, fmt, rectype, txn->id));
+ WT_RET(__wt_logrec_alloc(session, header_size, &logrec));
+
+ WT_ERR(__wt_struct_pack(
+ session, (uint8_t *)logrec->data + logrec->size, header_size, fmt, rectype, txn->id));
+ logrec->size += (uint32_t)header_size;
+ txn->logrec = logrec;
+
+ if (0) {
+err:
+ __wt_logrec_free(session, &logrec);
+ }
+ return (ret);
}
/*
* __wt_txn_log_op --
- * Write the last logged operation into the in-memory buffer.
+ * Write the last logged operation into the in-memory buffer.
*/
int
__wt_txn_log_op(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
{
- WT_CONNECTION_IMPL *conn;
- WT_DECL_RET;
- WT_ITEM *logrec;
- WT_TXN *txn;
- WT_TXN_OP *op;
-
- uint32_t fileid;
-
- conn = S2C(session);
- txn = &session->txn;
-
- if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED) ||
- F_ISSET(session, WT_SESSION_NO_LOGGING) ||
- (F_ISSET(S2BT(session), WT_BTREE_NO_LOGGING) &&
- !FLD_ISSET(conn->log_flags, WT_CONN_LOG_DEBUG_MODE)))
- return (0);
-
- /* We'd better have a transaction. */
- WT_ASSERT(session,
- F_ISSET(txn, WT_TXN_RUNNING) && F_ISSET(txn, WT_TXN_HAS_ID));
-
- WT_ASSERT(session, txn->mod_count > 0);
- op = txn->mod + txn->mod_count - 1;
- fileid = op->btree->id;
-
- /*
- * If this operation is diagnostic only, set the ignore bit on the
- * fileid so that recovery can skip it.
- */
- if (F_ISSET(S2BT(session), WT_BTREE_NO_LOGGING) &&
- FLD_ISSET(conn->log_flags, WT_CONN_LOG_DEBUG_MODE))
- FLD_SET(fileid, WT_LOGOP_IGNORE);
-
- WT_RET(__txn_logrec_init(session));
- logrec = txn->logrec;
-
- switch (op->type) {
- case WT_TXN_OP_NONE:
- case WT_TXN_OP_INMEM_COL:
- case WT_TXN_OP_INMEM_ROW:
- case WT_TXN_OP_REF_DELETE:
- /* Nothing to log, we're done. */
- break;
- case WT_TXN_OP_BASIC_COL:
- case WT_TXN_OP_BASIC_ROW:
- ret = __txn_op_log(session, logrec, op, cbt, fileid);
- break;
- case WT_TXN_OP_TRUNCATE_COL:
- ret = __wt_logop_col_truncate_pack(session, logrec, fileid,
- op->u.truncate_col.start, op->u.truncate_col.stop);
- break;
- case WT_TXN_OP_TRUNCATE_ROW:
- ret = __wt_logop_row_truncate_pack(session, logrec, fileid,
- &op->u.truncate_row.start, &op->u.truncate_row.stop,
- (uint32_t)op->u.truncate_row.mode);
- break;
- }
- return (ret);
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_ITEM *logrec;
+ WT_TXN *txn;
+ WT_TXN_OP *op;
+
+ uint32_t fileid;
+
+ conn = S2C(session);
+ txn = &session->txn;
+
+ if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED) ||
+ F_ISSET(session, WT_SESSION_NO_LOGGING) ||
+ (F_ISSET(S2BT(session), WT_BTREE_NO_LOGGING) &&
+ !FLD_ISSET(conn->log_flags, WT_CONN_LOG_DEBUG_MODE)))
+ return (0);
+
+ /* We'd better have a transaction. */
+ WT_ASSERT(session, F_ISSET(txn, WT_TXN_RUNNING) && F_ISSET(txn, WT_TXN_HAS_ID));
+
+ WT_ASSERT(session, txn->mod_count > 0);
+ op = txn->mod + txn->mod_count - 1;
+ fileid = op->btree->id;
+
+ /*
+ * If this operation is diagnostic only, set the ignore bit on the fileid so that recovery can
+ * skip it.
+ */
+ if (F_ISSET(S2BT(session), WT_BTREE_NO_LOGGING) &&
+ FLD_ISSET(conn->log_flags, WT_CONN_LOG_DEBUG_MODE))
+ FLD_SET(fileid, WT_LOGOP_IGNORE);
+
+ WT_RET(__txn_logrec_init(session));
+ logrec = txn->logrec;
+
+ switch (op->type) {
+ case WT_TXN_OP_NONE:
+ case WT_TXN_OP_INMEM_COL:
+ case WT_TXN_OP_INMEM_ROW:
+ case WT_TXN_OP_REF_DELETE:
+ /* Nothing to log, we're done. */
+ break;
+ case WT_TXN_OP_BASIC_COL:
+ case WT_TXN_OP_BASIC_ROW:
+ ret = __txn_op_log(session, logrec, op, cbt, fileid);
+ break;
+ case WT_TXN_OP_TRUNCATE_COL:
+ ret = __wt_logop_col_truncate_pack(
+ session, logrec, fileid, op->u.truncate_col.start, op->u.truncate_col.stop);
+ break;
+ case WT_TXN_OP_TRUNCATE_ROW:
+ ret = __wt_logop_row_truncate_pack(session, logrec, fileid, &op->u.truncate_row.start,
+ &op->u.truncate_row.stop, (uint32_t)op->u.truncate_row.mode);
+ break;
+ }
+ return (ret);
}
/*
* __wt_txn_log_commit --
- * Write the operations of a transaction to the log at commit time.
+ * Write the operations of a transaction to the log at commit time.
*/
int
__wt_txn_log_commit(WT_SESSION_IMPL *session, const char *cfg[])
{
- WT_TXN *txn;
-
- WT_UNUSED(cfg);
- txn = &session->txn;
- /*
- * If there are no log records there is nothing to do.
- */
- if (txn->logrec == NULL)
- return (0);
-
- /* Write updates to the log. */
- return (__wt_log_write(session, txn->logrec, NULL, txn->txn_logsync));
+ WT_TXN *txn;
+
+ WT_UNUSED(cfg);
+ txn = &session->txn;
+ /*
+ * If there are no log records there is nothing to do.
+ */
+ if (txn->logrec == NULL)
+ return (0);
+
+ /* Write updates to the log. */
+ return (__wt_log_write(session, txn->logrec, NULL, txn->txn_logsync));
}
/*
* __txn_log_file_sync --
- * Write a log record for a file sync.
+ * Write a log record for a file sync.
*/
static int
__txn_log_file_sync(WT_SESSION_IMPL *session, uint32_t flags, WT_LSN *lsnp)
{
- WT_BTREE *btree;
- WT_DECL_ITEM(logrec);
- WT_DECL_RET;
- size_t header_size;
- uint32_t rectype, start;
- const char *fmt;
- bool need_sync;
-
- btree = S2BT(session);
- rectype = WT_LOGREC_FILE_SYNC;
- start = LF_ISSET(WT_TXN_LOG_CKPT_START) ? 1 : 0;
- fmt = WT_UNCHECKED_STRING(III);
- need_sync = LF_ISSET(WT_TXN_LOG_CKPT_SYNC);
-
- WT_RET(__wt_struct_size(
- session, &header_size, fmt, rectype, btree->id, start));
- WT_RET(__wt_logrec_alloc(session, header_size, &logrec));
-
- WT_ERR(__wt_struct_pack(session,
- (uint8_t *)logrec->data + logrec->size, header_size,
- fmt, rectype, btree->id, start));
- logrec->size += (uint32_t)header_size;
-
- WT_ERR(__wt_log_write(
- session, logrec, lsnp, need_sync ? WT_LOG_FSYNC : 0));
-err: __wt_logrec_free(session, &logrec);
- return (ret);
+ WT_BTREE *btree;
+ WT_DECL_ITEM(logrec);
+ WT_DECL_RET;
+ size_t header_size;
+ uint32_t rectype, start;
+ const char *fmt;
+ bool need_sync;
+
+ btree = S2BT(session);
+ rectype = WT_LOGREC_FILE_SYNC;
+ start = LF_ISSET(WT_TXN_LOG_CKPT_START) ? 1 : 0;
+ fmt = WT_UNCHECKED_STRING(III);
+ need_sync = LF_ISSET(WT_TXN_LOG_CKPT_SYNC);
+
+ WT_RET(__wt_struct_size(session, &header_size, fmt, rectype, btree->id, start));
+ WT_RET(__wt_logrec_alloc(session, header_size, &logrec));
+
+ WT_ERR(__wt_struct_pack(session, (uint8_t *)logrec->data + logrec->size, header_size, fmt,
+ rectype, btree->id, start));
+ logrec->size += (uint32_t)header_size;
+
+ WT_ERR(__wt_log_write(session, logrec, lsnp, need_sync ? WT_LOG_FSYNC : 0));
+err:
+ __wt_logrec_free(session, &logrec);
+ return (ret);
}
/*
* __wt_txn_checkpoint_logread --
- * Read a log record for a checkpoint operation.
+ * Read a log record for a checkpoint operation.
*/
int
-__wt_txn_checkpoint_logread(WT_SESSION_IMPL *session,
- const uint8_t **pp, const uint8_t *end, WT_LSN *ckpt_lsn)
+__wt_txn_checkpoint_logread(
+ WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, WT_LSN *ckpt_lsn)
{
- WT_DECL_RET;
- WT_ITEM ckpt_snapshot_unused;
- uint32_t ckpt_file, ckpt_offset;
- u_int ckpt_nsnapshot_unused;
- const char *fmt;
-
- fmt = WT_UNCHECKED_STRING(IIIu);
-
- if ((ret = __wt_struct_unpack(session, *pp, WT_PTRDIFF(end, *pp), fmt,
- &ckpt_file, &ckpt_offset,
- &ckpt_nsnapshot_unused, &ckpt_snapshot_unused)) != 0)
- WT_RET_MSG(session,
- ret, "txn_checkpoint_logread: unpack failure");
- WT_SET_LSN(ckpt_lsn, ckpt_file, ckpt_offset);
- *pp = end;
- return (0);
+ WT_DECL_RET;
+ WT_ITEM ckpt_snapshot_unused;
+ uint32_t ckpt_file, ckpt_offset;
+ u_int ckpt_nsnapshot_unused;
+ const char *fmt;
+
+ fmt = WT_UNCHECKED_STRING(IIIu);
+
+ if ((ret = __wt_struct_unpack(session, *pp, WT_PTRDIFF(end, *pp), fmt, &ckpt_file, &ckpt_offset,
+ &ckpt_nsnapshot_unused, &ckpt_snapshot_unused)) != 0)
+ WT_RET_MSG(session, ret, "txn_checkpoint_logread: unpack failure");
+ WT_SET_LSN(ckpt_lsn, ckpt_file, ckpt_offset);
+ *pp = end;
+ return (0);
}
/*
* __wt_txn_ts_log --
- * Write a log record recording timestamps in the transaction.
+ * Write a log record recording timestamps in the transaction.
*/
int
__wt_txn_ts_log(WT_SESSION_IMPL *session)
{
- struct timespec t;
- WT_CONNECTION_IMPL *conn;
- WT_ITEM *logrec;
- WT_TXN *txn;
- wt_timestamp_t commit, durable, first, prepare, read;
-
- conn = S2C(session);
- txn = &session->txn;
-
- if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED) ||
- F_ISSET(session, WT_SESSION_NO_LOGGING) ||
- !FLD_ISSET(conn->log_flags, WT_CONN_LOG_DEBUG_MODE))
- return (0);
-
- /* We'd better have a transaction running. */
- WT_ASSERT(session, F_ISSET(txn, WT_TXN_RUNNING));
-
- WT_RET(__txn_logrec_init(session));
- logrec = txn->logrec;
- commit = durable = first = prepare = read = WT_TS_NONE;
- if (F_ISSET(txn, WT_TXN_HAS_TS_COMMIT)) {
- commit = txn->commit_timestamp;
- first = txn->first_commit_timestamp;
- }
- if (F_ISSET(txn, WT_TXN_HAS_TS_DURABLE))
- durable = txn->durable_timestamp;
- if (F_ISSET(txn, WT_TXN_HAS_TS_PREPARE))
- prepare = txn->prepare_timestamp;
- if (F_ISSET(txn, WT_TXN_HAS_TS_READ))
- read = txn->read_timestamp;
-
- __wt_epoch(session, &t);
- return (__wt_logop_txn_timestamp_pack(session, logrec,
- (uint64_t)t.tv_sec, (uint64_t)t.tv_nsec,
- commit, durable, first, prepare, read));
+ struct timespec t;
+ WT_CONNECTION_IMPL *conn;
+ WT_ITEM *logrec;
+ WT_TXN *txn;
+ wt_timestamp_t commit, durable, first, prepare, read;
+
+ conn = S2C(session);
+ txn = &session->txn;
+
+ if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED) ||
+ F_ISSET(session, WT_SESSION_NO_LOGGING) ||
+ !FLD_ISSET(conn->log_flags, WT_CONN_LOG_DEBUG_MODE))
+ return (0);
+
+ /*
+ * There is a rare usage case of a prepared transaction that has no modifications, but then
+ * commits and sets timestamps. If an empty transaction has been prepared, don't bother writing
+ * a timestamp operation record.
+ */
+ if (F_ISSET(txn, WT_TXN_PREPARE) && txn->mod_count == 0)
+ return (0);
+
+ /* We'd better have a transaction running. */
+ WT_ASSERT(session, F_ISSET(txn, WT_TXN_RUNNING));
+
+ WT_RET(__txn_logrec_init(session));
+ logrec = txn->logrec;
+ commit = durable = first = prepare = read = WT_TS_NONE;
+ if (F_ISSET(txn, WT_TXN_HAS_TS_COMMIT)) {
+ commit = txn->commit_timestamp;
+ first = txn->first_commit_timestamp;
+ }
+ if (F_ISSET(txn, WT_TXN_HAS_TS_DURABLE))
+ durable = txn->durable_timestamp;
+ if (F_ISSET(txn, WT_TXN_HAS_TS_PREPARE))
+ prepare = txn->prepare_timestamp;
+ if (F_ISSET(txn, WT_TXN_HAS_TS_READ))
+ read = txn->read_timestamp;
+
+ __wt_epoch(session, &t);
+ return (__wt_logop_txn_timestamp_pack(session, logrec, (uint64_t)t.tv_sec, (uint64_t)t.tv_nsec,
+ commit, durable, first, prepare, read));
}
/*
* __wt_txn_checkpoint_log --
- * Write a log record for a checkpoint operation.
+ * Write a log record for a checkpoint operation.
*/
int
-__wt_txn_checkpoint_log(
- WT_SESSION_IMPL *session, bool full, uint32_t flags, WT_LSN *lsnp)
+__wt_txn_checkpoint_log(WT_SESSION_IMPL *session, bool full, uint32_t flags, WT_LSN *lsnp)
{
- WT_CONNECTION_IMPL *conn;
- WT_DECL_ITEM(logrec);
- WT_DECL_RET;
- WT_ITEM *ckpt_snapshot, empty;
- WT_LSN *ckpt_lsn;
- WT_TXN *txn;
- WT_TXN_GLOBAL *txn_global;
- size_t recsize;
- uint32_t i, rectype;
- uint8_t *end, *p;
- const char *fmt;
-
- conn = S2C(session);
- txn_global = &conn->txn_global;
- txn = &session->txn;
- ckpt_lsn = &txn->ckpt_lsn;
-
- /*
- * If this is a file sync, log it unless there is a full checkpoint in
- * progress.
- */
- if (!full) {
- if (txn->full_ckpt) {
- if (lsnp != NULL)
- *lsnp = *ckpt_lsn;
- return (0);
- }
- return (__txn_log_file_sync(session, flags, lsnp));
- }
-
- switch (flags) {
- case WT_TXN_LOG_CKPT_PREPARE:
- txn->full_ckpt = true;
-
- if (conn->compat_major >= WT_LOG_V2_MAJOR) {
- /*
- * Write the system log record containing a checkpoint
- * start operation.
- */
- rectype = WT_LOGREC_SYSTEM;
- fmt = WT_UNCHECKED_STRING(I);
- WT_ERR(__wt_struct_size(
- session, &recsize, fmt, rectype));
- WT_ERR(__wt_logrec_alloc(session, recsize, &logrec));
-
- WT_ERR(__wt_struct_pack(session,
- (uint8_t *)logrec->data + logrec->size, recsize,
- fmt, rectype));
- logrec->size += (uint32_t)recsize;
- WT_ERR(__wt_logop_checkpoint_start_pack(
- session, logrec));
- WT_ERR(__wt_log_write(session, logrec, ckpt_lsn, 0));
- } else {
- WT_ERR(__wt_log_printf(session,
- "CHECKPOINT: Starting record"));
- WT_ERR(__wt_log_flush_lsn(session, ckpt_lsn, true));
- }
-
- /*
- * We take and immediately release the visibility lock.
- * Acquiring the write lock guarantees that any transaction
- * that has written to the log has also made its transaction
- * visible at this time.
- */
- __wt_writelock(session, &txn_global->visibility_rwlock);
- __wt_writeunlock(session, &txn_global->visibility_rwlock);
-
- /*
- * We need to make sure that the log records in the checkpoint
- * LSN are on disk. In particular to make sure that the
- * current log file exists.
- */
- WT_ERR(__wt_log_force_sync(session, ckpt_lsn));
- break;
- case WT_TXN_LOG_CKPT_START:
- /* Take a copy of the transaction snapshot. */
- txn->ckpt_nsnapshot = txn->snapshot_count;
- recsize = (size_t)txn->ckpt_nsnapshot * WT_INTPACK64_MAXSIZE;
- WT_ERR(__wt_scr_alloc(session, recsize, &txn->ckpt_snapshot));
- p = txn->ckpt_snapshot->mem;
- end = p + recsize;
- for (i = 0; i < txn->snapshot_count; i++)
- WT_ERR(__wt_vpack_uint(
- &p, WT_PTRDIFF(end, p), txn->snapshot[i]));
- break;
- case WT_TXN_LOG_CKPT_STOP:
- /*
- * During a clean connection close, we get here without the
- * prepare or start steps. In that case, log the current LSN
- * as the checkpoint LSN.
- */
- if (!txn->full_ckpt) {
- txn->ckpt_nsnapshot = 0;
- WT_CLEAR(empty);
- ckpt_snapshot = &empty;
- WT_ERR(__wt_log_flush_lsn(session, ckpt_lsn, true));
- } else
- ckpt_snapshot = txn->ckpt_snapshot;
-
- /* Write the checkpoint log record. */
- rectype = WT_LOGREC_CHECKPOINT;
- fmt = WT_UNCHECKED_STRING(IIIIu);
- WT_ERR(__wt_struct_size(session, &recsize,
- fmt, rectype, ckpt_lsn->l.file, ckpt_lsn->l.offset,
- txn->ckpt_nsnapshot, ckpt_snapshot));
- WT_ERR(__wt_logrec_alloc(session, recsize, &logrec));
-
- WT_ERR(__wt_struct_pack(session,
- (uint8_t *)logrec->data + logrec->size, recsize,
- fmt, rectype, ckpt_lsn->l.file, ckpt_lsn->l.offset,
- txn->ckpt_nsnapshot, ckpt_snapshot));
- logrec->size += (uint32_t)recsize;
- WT_ERR(__wt_log_write(session, logrec, lsnp,
- F_ISSET(conn, WT_CONN_CKPT_SYNC) ?
- WT_LOG_FSYNC : 0));
-
- /*
- * If this full checkpoint completed successfully and there is
- * no hot backup in progress and this is not an unclean
- * recovery, tell the logging subsystem the checkpoint LSN so
- * that it can archive. Do not update the logging checkpoint
- * LSN if this is during a clean connection close, only during
- * a full checkpoint. A clean close may not update any
- * metadata LSN and we do not want to archive in that case.
- */
- if (!conn->hot_backup &&
- (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_RECOVER_DIRTY) ||
- FLD_ISSET(conn->log_flags, WT_CONN_LOG_FORCE_DOWNGRADE)) &&
- txn->full_ckpt)
- __wt_log_ckpt(session, ckpt_lsn);
-
- /* FALLTHROUGH */
- case WT_TXN_LOG_CKPT_CLEANUP:
- /* Cleanup any allocated resources */
- WT_INIT_LSN(ckpt_lsn);
- txn->ckpt_nsnapshot = 0;
- __wt_scr_free(session, &txn->ckpt_snapshot);
- txn->full_ckpt = false;
- break;
- default:
- WT_ERR(__wt_illegal_value(session, flags));
- }
-
-err: __wt_logrec_free(session, &logrec);
- return (ret);
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_ITEM(logrec);
+ WT_DECL_RET;
+ WT_ITEM *ckpt_snapshot, empty;
+ WT_LSN *ckpt_lsn;
+ WT_TXN *txn;
+ WT_TXN_GLOBAL *txn_global;
+ size_t recsize;
+ uint32_t i, rectype;
+ uint8_t *end, *p;
+ const char *fmt;
+
+ conn = S2C(session);
+ txn_global = &conn->txn_global;
+ txn = &session->txn;
+ ckpt_lsn = &txn->ckpt_lsn;
+
+ /*
+ * If this is a file sync, log it unless there is a full checkpoint in progress.
+ */
+ if (!full) {
+ if (txn->full_ckpt) {
+ if (lsnp != NULL)
+ *lsnp = *ckpt_lsn;
+ return (0);
+ }
+ return (__txn_log_file_sync(session, flags, lsnp));
+ }
+
+ switch (flags) {
+ case WT_TXN_LOG_CKPT_PREPARE:
+ txn->full_ckpt = true;
+
+ if (conn->compat_major >= WT_LOG_V2_MAJOR) {
+ /*
+ * Write the system log record containing a checkpoint start operation.
+ */
+ rectype = WT_LOGREC_SYSTEM;
+ fmt = WT_UNCHECKED_STRING(I);
+ WT_ERR(__wt_struct_size(session, &recsize, fmt, rectype));
+ WT_ERR(__wt_logrec_alloc(session, recsize, &logrec));
+
+ WT_ERR(__wt_struct_pack(
+ session, (uint8_t *)logrec->data + logrec->size, recsize, fmt, rectype));
+ logrec->size += (uint32_t)recsize;
+ WT_ERR(__wt_logop_checkpoint_start_pack(session, logrec));
+ WT_ERR(__wt_log_write(session, logrec, ckpt_lsn, 0));
+ } else {
+ WT_ERR(__wt_log_printf(session, "CHECKPOINT: Starting record"));
+ WT_ERR(__wt_log_flush_lsn(session, ckpt_lsn, true));
+ }
+
+ /*
+ * We take and immediately release the visibility lock. Acquiring the write lock guarantees
+ * that any transaction that has written to the log has also made its transaction visible at
+ * this time.
+ */
+ __wt_writelock(session, &txn_global->visibility_rwlock);
+ __wt_writeunlock(session, &txn_global->visibility_rwlock);
+
+ /*
+ * We need to make sure that the log records in the checkpoint LSN are on disk. In
+ * particular to make sure that the current log file exists.
+ */
+ WT_ERR(__wt_log_force_sync(session, ckpt_lsn));
+ break;
+ case WT_TXN_LOG_CKPT_START:
+ /* Take a copy of the transaction snapshot. */
+ txn->ckpt_nsnapshot = txn->snapshot_count;
+ recsize = (size_t)txn->ckpt_nsnapshot * WT_INTPACK64_MAXSIZE;
+ WT_ERR(__wt_scr_alloc(session, recsize, &txn->ckpt_snapshot));
+ p = txn->ckpt_snapshot->mem;
+ end = p + recsize;
+ for (i = 0; i < txn->snapshot_count; i++)
+ WT_ERR(__wt_vpack_uint(&p, WT_PTRDIFF(end, p), txn->snapshot[i]));
+ break;
+ case WT_TXN_LOG_CKPT_STOP:
+ /*
+ * During a clean connection close, we get here without the prepare or start steps. In that
+ * case, log the current LSN as the checkpoint LSN.
+ */
+ if (!txn->full_ckpt) {
+ txn->ckpt_nsnapshot = 0;
+ WT_CLEAR(empty);
+ ckpt_snapshot = &empty;
+ WT_ERR(__wt_log_flush_lsn(session, ckpt_lsn, true));
+ } else
+ ckpt_snapshot = txn->ckpt_snapshot;
+
+ /* Write the checkpoint log record. */
+ rectype = WT_LOGREC_CHECKPOINT;
+ fmt = WT_UNCHECKED_STRING(IIIIu);
+ WT_ERR(__wt_struct_size(session, &recsize, fmt, rectype, ckpt_lsn->l.file,
+ ckpt_lsn->l.offset, txn->ckpt_nsnapshot, ckpt_snapshot));
+ WT_ERR(__wt_logrec_alloc(session, recsize, &logrec));
+
+ WT_ERR(__wt_struct_pack(session, (uint8_t *)logrec->data + logrec->size, recsize, fmt,
+ rectype, ckpt_lsn->l.file, ckpt_lsn->l.offset, txn->ckpt_nsnapshot, ckpt_snapshot));
+ logrec->size += (uint32_t)recsize;
+ WT_ERR(__wt_log_write(
+ session, logrec, lsnp, F_ISSET(conn, WT_CONN_CKPT_SYNC) ? WT_LOG_FSYNC : 0));
+
+ /*
+ * If this full checkpoint completed successfully and there is no hot backup in progress and
+ * this is not an unclean recovery, tell the logging subsystem the checkpoint LSN so that it
+ * can archive. Do not update the logging checkpoint LSN if this is during a clean
+ * connection close, only during a full checkpoint. A clean close may not update any
+ * metadata LSN and we do not want to archive in that case.
+ */
+ if (!conn->hot_backup && (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_RECOVER_DIRTY) ||
+ FLD_ISSET(conn->log_flags, WT_CONN_LOG_FORCE_DOWNGRADE)) &&
+ txn->full_ckpt)
+ __wt_log_ckpt(session, ckpt_lsn);
+
+ /* FALLTHROUGH */
+ case WT_TXN_LOG_CKPT_CLEANUP:
+ /* Cleanup any allocated resources */
+ WT_INIT_LSN(ckpt_lsn);
+ txn->ckpt_nsnapshot = 0;
+ __wt_scr_free(session, &txn->ckpt_snapshot);
+ txn->full_ckpt = false;
+ break;
+ default:
+ WT_ERR(__wt_illegal_value(session, flags));
+ }
+
+err:
+ __wt_logrec_free(session, &logrec);
+ return (ret);
}
/*
* __wt_txn_truncate_log --
- * Begin truncating a range of a file.
+ * Begin truncating a range of a file.
*/
int
-__wt_txn_truncate_log(
- WT_SESSION_IMPL *session, WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop)
+__wt_txn_truncate_log(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop)
{
- WT_BTREE *btree;
- WT_ITEM *item;
- WT_TXN_OP *op;
-
- btree = S2BT(session);
-
- WT_RET(__txn_next_op(session, &op));
-
- if (btree->type == BTREE_ROW) {
- op->type = WT_TXN_OP_TRUNCATE_ROW;
- op->u.truncate_row.mode = WT_TXN_TRUNC_ALL;
- WT_CLEAR(op->u.truncate_row.start);
- WT_CLEAR(op->u.truncate_row.stop);
- if (start != NULL) {
- op->u.truncate_row.mode = WT_TXN_TRUNC_START;
- item = &op->u.truncate_row.start;
- WT_RET(__wt_cursor_get_raw_key(&start->iface, item));
- WT_RET(__wt_buf_set(
- session, item, item->data, item->size));
- }
- if (stop != NULL) {
- op->u.truncate_row.mode =
- (op->u.truncate_row.mode == WT_TXN_TRUNC_ALL) ?
- WT_TXN_TRUNC_STOP : WT_TXN_TRUNC_BOTH;
- item = &op->u.truncate_row.stop;
- WT_RET(__wt_cursor_get_raw_key(&stop->iface, item));
- WT_RET(__wt_buf_set(
- session, item, item->data, item->size));
- }
- } else {
- op->type = WT_TXN_OP_TRUNCATE_COL;
- op->u.truncate_col.start =
- (start == NULL) ? WT_RECNO_OOB : start->recno;
- op->u.truncate_col.stop =
- (stop == NULL) ? WT_RECNO_OOB : stop->recno;
- }
-
- /* Write that operation into the in-memory log. */
- WT_RET(__wt_txn_log_op(session, NULL));
-
- WT_ASSERT(session, !F_ISSET(session, WT_SESSION_LOGGING_INMEM));
- F_SET(session, WT_SESSION_LOGGING_INMEM);
- return (0);
+ WT_BTREE *btree;
+ WT_ITEM *item;
+ WT_TXN_OP *op;
+
+ btree = S2BT(session);
+
+ WT_RET(__txn_next_op(session, &op));
+
+ if (btree->type == BTREE_ROW) {
+ op->type = WT_TXN_OP_TRUNCATE_ROW;
+ op->u.truncate_row.mode = WT_TXN_TRUNC_ALL;
+ WT_CLEAR(op->u.truncate_row.start);
+ WT_CLEAR(op->u.truncate_row.stop);
+ if (start != NULL) {
+ op->u.truncate_row.mode = WT_TXN_TRUNC_START;
+ item = &op->u.truncate_row.start;
+ WT_RET(__wt_cursor_get_raw_key(&start->iface, item));
+ WT_RET(__wt_buf_set(session, item, item->data, item->size));
+ }
+ if (stop != NULL) {
+ op->u.truncate_row.mode =
+ (op->u.truncate_row.mode == WT_TXN_TRUNC_ALL) ? WT_TXN_TRUNC_STOP : WT_TXN_TRUNC_BOTH;
+ item = &op->u.truncate_row.stop;
+ WT_RET(__wt_cursor_get_raw_key(&stop->iface, item));
+ WT_RET(__wt_buf_set(session, item, item->data, item->size));
+ }
+ } else {
+ op->type = WT_TXN_OP_TRUNCATE_COL;
+ op->u.truncate_col.start = (start == NULL) ? WT_RECNO_OOB : start->recno;
+ op->u.truncate_col.stop = (stop == NULL) ? WT_RECNO_OOB : stop->recno;
+ }
+
+ /* Write that operation into the in-memory log. */
+ WT_RET(__wt_txn_log_op(session, NULL));
+
+ WT_ASSERT(session, !F_ISSET(session, WT_SESSION_LOGGING_INMEM));
+ F_SET(session, WT_SESSION_LOGGING_INMEM);
+ return (0);
}
/*
* __wt_txn_truncate_end --
- * Finish truncating a range of a file.
+ * Finish truncating a range of a file.
*/
void
__wt_txn_truncate_end(WT_SESSION_IMPL *session)
{
- F_CLR(session, WT_SESSION_LOGGING_INMEM);
+ F_CLR(session, WT_SESSION_LOGGING_INMEM);
}
/*
* __txn_printlog --
- * Print a log record in a human-readable format.
+ * Print a log record in a human-readable format.
*/
static int
-__txn_printlog(WT_SESSION_IMPL *session,
- WT_ITEM *rawrec, WT_LSN *lsnp, WT_LSN *next_lsnp,
- void *cookie, int firstrecord)
+__txn_printlog(WT_SESSION_IMPL *session, WT_ITEM *rawrec, WT_LSN *lsnp, WT_LSN *next_lsnp,
+ void *cookie, int firstrecord)
{
- WT_LOG_RECORD *logrec;
- WT_TXN_PRINTLOG_ARGS *args;
- uint64_t txnid;
- uint32_t fileid, lsnfile, lsnoffset, rectype;
- int32_t start;
- const uint8_t *end, *p;
- const char *msg;
- bool compressed;
-
- WT_UNUSED(next_lsnp);
- args = cookie;
-
- p = WT_LOG_SKIP_HEADER(rawrec->data);
- end = (const uint8_t *)rawrec->data + rawrec->size;
- logrec = (WT_LOG_RECORD *)rawrec->data;
- compressed = F_ISSET(logrec, WT_LOG_RECORD_COMPRESSED);
-
- /* First, peek at the log record type. */
- WT_RET(__wt_logrec_read(session, &p, end, &rectype));
-
- if (!firstrecord)
- WT_RET(__wt_fprintf(session, args->fs, ",\n"));
-
- WT_RET(__wt_fprintf(session, args->fs,
- " { \"lsn\" : [%" PRIu32 ",%" PRIu32 "],\n",
- lsnp->l.file, lsnp->l.offset));
- WT_RET(__wt_fprintf(session, args->fs,
- " \"hdr_flags\" : \"%s\",\n", compressed ? "compressed" : ""));
- WT_RET(__wt_fprintf(session, args->fs,
- " \"rec_len\" : %" PRIu32 ",\n", logrec->len));
- WT_RET(__wt_fprintf(session, args->fs,
- " \"mem_len\" : %" PRIu32 ",\n",
- compressed ? logrec->mem_len : logrec->len));
-
- switch (rectype) {
- case WT_LOGREC_CHECKPOINT:
- WT_RET(__wt_struct_unpack(session, p, WT_PTRDIFF(end, p),
- WT_UNCHECKED_STRING(II), &lsnfile, &lsnoffset));
- WT_RET(__wt_fprintf(session, args->fs,
- " \"type\" : \"checkpoint\",\n"));
- WT_RET(__wt_fprintf(session, args->fs,
- " \"ckpt_lsn\" : [%" PRIu32 ",%" PRIu32 "]\n",
- lsnfile, lsnoffset));
- break;
-
- case WT_LOGREC_COMMIT:
- WT_RET(__wt_vunpack_uint(&p, WT_PTRDIFF(end, p), &txnid));
- WT_RET(__wt_fprintf(session, args->fs,
- " \"type\" : \"commit\",\n"));
- WT_RET(__wt_fprintf(session, args->fs,
- " \"txnid\" : %" PRIu64 ",\n", txnid));
- WT_RET(__txn_oplist_printlog(session, &p, end, args));
- break;
-
- case WT_LOGREC_FILE_SYNC:
- WT_RET(__wt_struct_unpack(session, p, WT_PTRDIFF(end, p),
- WT_UNCHECKED_STRING(Ii), &fileid, &start));
- WT_RET(__wt_fprintf(session, args->fs,
- " \"type\" : \"file_sync\",\n"));
- WT_RET(__wt_fprintf(session, args->fs,
- " \"fileid\" : %" PRIu32 ",\n", fileid));
- WT_RET(__wt_fprintf(session, args->fs,
- " \"start\" : %" PRId32 "\n", start));
- break;
-
- case WT_LOGREC_MESSAGE:
- WT_RET(__wt_struct_unpack(session, p, WT_PTRDIFF(end, p),
- WT_UNCHECKED_STRING(S), &msg));
- WT_RET(__wt_fprintf(session, args->fs,
- " \"type\" : \"message\",\n"));
- WT_RET(__wt_fprintf(session, args->fs,
- " \"message\" : \"%s\"\n", msg));
- break;
-
- case WT_LOGREC_SYSTEM:
- WT_RET(__wt_struct_unpack(session, p, WT_PTRDIFF(end, p),
- WT_UNCHECKED_STRING(II), &lsnfile, &lsnoffset));
- WT_RET(__wt_fprintf(session, args->fs,
- " \"type\" : \"system\",\n"));
- WT_RET(__txn_oplist_printlog(session, &p, end, args));
- break;
- }
-
- WT_RET(__wt_fprintf(session, args->fs, " }"));
-
- return (0);
+ WT_LOG_RECORD *logrec;
+ WT_TXN_PRINTLOG_ARGS *args;
+ uint64_t txnid;
+ uint32_t fileid, lsnfile, lsnoffset, rectype;
+ int32_t start;
+ const uint8_t *end, *p;
+ const char *msg;
+ bool compressed;
+
+ WT_UNUSED(next_lsnp);
+ args = cookie;
+
+ p = WT_LOG_SKIP_HEADER(rawrec->data);
+ end = (const uint8_t *)rawrec->data + rawrec->size;
+ logrec = (WT_LOG_RECORD *)rawrec->data;
+ compressed = F_ISSET(logrec, WT_LOG_RECORD_COMPRESSED);
+
+ /* First, peek at the log record type. */
+ WT_RET(__wt_logrec_read(session, &p, end, &rectype));
+
+ if (!firstrecord)
+ WT_RET(__wt_fprintf(session, args->fs, ",\n"));
+
+ WT_RET(__wt_fprintf(session, args->fs, " { \"lsn\" : [%" PRIu32 ",%" PRIu32 "],\n",
+ lsnp->l.file, lsnp->l.offset));
+ WT_RET(__wt_fprintf(
+ session, args->fs, " \"hdr_flags\" : \"%s\",\n", compressed ? "compressed" : ""));
+ WT_RET(__wt_fprintf(session, args->fs, " \"rec_len\" : %" PRIu32 ",\n", logrec->len));
+ WT_RET(__wt_fprintf(session, args->fs, " \"mem_len\" : %" PRIu32 ",\n",
+ compressed ? logrec->mem_len : logrec->len));
+
+ switch (rectype) {
+ case WT_LOGREC_CHECKPOINT:
+ WT_RET(__wt_struct_unpack(
+ session, p, WT_PTRDIFF(end, p), WT_UNCHECKED_STRING(II), &lsnfile, &lsnoffset));
+ WT_RET(__wt_fprintf(session, args->fs, " \"type\" : \"checkpoint\",\n"));
+ WT_RET(__wt_fprintf(
+ session, args->fs, " \"ckpt_lsn\" : [%" PRIu32 ",%" PRIu32 "]\n", lsnfile, lsnoffset));
+ break;
+
+ case WT_LOGREC_COMMIT:
+ WT_RET(__wt_vunpack_uint(&p, WT_PTRDIFF(end, p), &txnid));
+ WT_RET(__wt_fprintf(session, args->fs, " \"type\" : \"commit\",\n"));
+ WT_RET(__wt_fprintf(session, args->fs, " \"txnid\" : %" PRIu64 ",\n", txnid));
+ WT_RET(__txn_oplist_printlog(session, &p, end, args));
+ break;
+
+ case WT_LOGREC_FILE_SYNC:
+ WT_RET(__wt_struct_unpack(
+ session, p, WT_PTRDIFF(end, p), WT_UNCHECKED_STRING(Ii), &fileid, &start));
+ WT_RET(__wt_fprintf(session, args->fs, " \"type\" : \"file_sync\",\n"));
+ WT_RET(__wt_fprintf(session, args->fs, " \"fileid\" : %" PRIu32 ",\n", fileid));
+ WT_RET(__wt_fprintf(session, args->fs, " \"start\" : %" PRId32 "\n", start));
+ break;
+
+ case WT_LOGREC_MESSAGE:
+ WT_RET(__wt_struct_unpack(session, p, WT_PTRDIFF(end, p), WT_UNCHECKED_STRING(S), &msg));
+ WT_RET(__wt_fprintf(session, args->fs, " \"type\" : \"message\",\n"));
+ WT_RET(__wt_fprintf(session, args->fs, " \"message\" : \"%s\"\n", msg));
+ break;
+
+ case WT_LOGREC_SYSTEM:
+ WT_RET(__wt_struct_unpack(
+ session, p, WT_PTRDIFF(end, p), WT_UNCHECKED_STRING(II), &lsnfile, &lsnoffset));
+ WT_RET(__wt_fprintf(session, args->fs, " \"type\" : \"system\",\n"));
+ WT_RET(__txn_oplist_printlog(session, &p, end, args));
+ break;
+ }
+
+ WT_RET(__wt_fprintf(session, args->fs, " }"));
+
+ return (0);
}
/*
* __wt_txn_printlog --
- * Print the log in a human-readable format.
+ * Print the log in a human-readable format.
*/
int
__wt_txn_printlog(WT_SESSION *wt_session, const char *ofile, uint32_t flags)
- WT_GCC_FUNC_ATTRIBUTE((visibility("default")))
+ WT_GCC_FUNC_ATTRIBUTE((visibility("default")))
{
- WT_DECL_RET;
- WT_FSTREAM *fs;
- WT_SESSION_IMPL *session;
- WT_TXN_PRINTLOG_ARGS args;
-
- session = (WT_SESSION_IMPL *)wt_session;
- if (ofile == NULL)
- fs = WT_STDOUT(session);
- else
- WT_RET(__wt_fopen(session, ofile,
- WT_FS_OPEN_CREATE | WT_FS_OPEN_FIXED,
- WT_STREAM_WRITE, &fs));
-
- WT_ERR(__wt_fprintf(session, fs, "[\n"));
- args.fs = fs;
- args.flags = flags;
- WT_ERR(__wt_log_scan(
- session, NULL, WT_LOGSCAN_FIRST, __txn_printlog, &args));
- ret = __wt_fprintf(session, fs, "\n]\n");
-
-err: if (ofile != NULL)
- WT_TRET(__wt_fclose(session, &fs));
-
- return (ret);
+ WT_DECL_RET;
+ WT_FSTREAM *fs;
+ WT_SESSION_IMPL *session;
+ WT_TXN_PRINTLOG_ARGS args;
+
+ session = (WT_SESSION_IMPL *)wt_session;
+ if (ofile == NULL)
+ fs = WT_STDOUT(session);
+ else
+ WT_RET(
+ __wt_fopen(session, ofile, WT_FS_OPEN_CREATE | WT_FS_OPEN_FIXED, WT_STREAM_WRITE, &fs));
+
+ WT_ERR(__wt_fprintf(session, fs, "[\n"));
+ args.fs = fs;
+ args.flags = flags;
+ WT_ERR(__wt_log_scan(session, NULL, WT_LOGSCAN_FIRST, __txn_printlog, &args));
+ ret = __wt_fprintf(session, fs, "\n]\n");
+
+err:
+ if (ofile != NULL)
+ WT_TRET(__wt_fclose(session, &fs));
+
+ return (ret);
}
diff --git a/src/third_party/wiredtiger/src/txn/txn_nsnap.c b/src/third_party/wiredtiger/src/txn/txn_nsnap.c
index 533c67b70b0..f652e23c87d 100644
--- a/src/third_party/wiredtiger/src/txn/txn_nsnap.c
+++ b/src/third_party/wiredtiger/src/txn/txn_nsnap.c
@@ -10,420 +10,397 @@
/*
* __nsnap_destroy --
- * Destroy a named snapshot structure.
+ * Destroy a named snapshot structure.
*/
static void
__nsnap_destroy(WT_SESSION_IMPL *session, WT_NAMED_SNAPSHOT *nsnap)
{
- __wt_free(session, nsnap->name);
- __wt_free(session, nsnap->snapshot);
- __wt_free(session, nsnap);
+ __wt_free(session, nsnap->name);
+ __wt_free(session, nsnap->snapshot);
+ __wt_free(session, nsnap);
}
/*
* __nsnap_drop_one --
- * Drop a single named snapshot. The named snapshot lock must be held
- * write locked.
+ * Drop a single named snapshot. The named snapshot lock must be held write locked.
*/
static int
__nsnap_drop_one(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *name)
{
- WT_NAMED_SNAPSHOT *found;
- WT_TXN_GLOBAL *txn_global;
-
- txn_global = &S2C(session)->txn_global;
-
- TAILQ_FOREACH(found, &txn_global->nsnaph, q)
- if (WT_STRING_MATCH(found->name, name->str, name->len))
- break;
-
- if (found == NULL)
- return (WT_NOTFOUND);
-
- /* Bump the global ID if we are removing the first entry */
- if (found == TAILQ_FIRST(&txn_global->nsnaph)) {
- WT_ASSERT(session, !__wt_txn_visible_all(
- session, txn_global->nsnap_oldest_id, WT_TS_NONE));
- txn_global->nsnap_oldest_id = (TAILQ_NEXT(found, q) != NULL) ?
- TAILQ_NEXT(found, q)->pinned_id : WT_TXN_NONE;
- WT_DIAGNOSTIC_YIELD;
- WT_ASSERT(session, txn_global->nsnap_oldest_id == WT_TXN_NONE ||
- !__wt_txn_visible_all(
- session, txn_global->nsnap_oldest_id, WT_TS_NONE));
- }
- TAILQ_REMOVE(&txn_global->nsnaph, found, q);
- __nsnap_destroy(session, found);
- WT_STAT_CONN_INCR(session, txn_snapshots_dropped);
-
- return (0);
+ WT_NAMED_SNAPSHOT *found;
+ WT_TXN_GLOBAL *txn_global;
+
+ txn_global = &S2C(session)->txn_global;
+
+ TAILQ_FOREACH (found, &txn_global->nsnaph, q)
+ if (WT_STRING_MATCH(found->name, name->str, name->len))
+ break;
+
+ if (found == NULL)
+ return (WT_NOTFOUND);
+
+ /* Bump the global ID if we are removing the first entry */
+ if (found == TAILQ_FIRST(&txn_global->nsnaph)) {
+ WT_ASSERT(session, !__wt_txn_visible_all(session, txn_global->nsnap_oldest_id, WT_TS_NONE));
+ txn_global->nsnap_oldest_id =
+ (TAILQ_NEXT(found, q) != NULL) ? TAILQ_NEXT(found, q)->pinned_id : WT_TXN_NONE;
+ WT_DIAGNOSTIC_YIELD;
+ WT_ASSERT(session, txn_global->nsnap_oldest_id == WT_TXN_NONE ||
+ !__wt_txn_visible_all(session, txn_global->nsnap_oldest_id, WT_TS_NONE));
+ }
+ TAILQ_REMOVE(&txn_global->nsnaph, found, q);
+ __nsnap_destroy(session, found);
+ WT_STAT_CONN_INCR(session, txn_snapshots_dropped);
+
+ return (0);
}
/*
* __nsnap_drop_to --
- * Drop named snapshots, if the name is NULL all snapshots will be
- * dropped. The named snapshot lock must be held write locked.
+ * Drop named snapshots; if the name is NULL, all snapshots will be dropped. The named snapshot
+ * lock must be held write locked.
*/
static int
__nsnap_drop_to(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *name, bool inclusive)
{
- WT_NAMED_SNAPSHOT *last, *nsnap, *prev;
- WT_TXN_GLOBAL *txn_global;
- uint64_t new_nsnap_oldest;
-
- last = nsnap = prev = NULL;
- txn_global = &S2C(session)->txn_global;
-
- if (TAILQ_EMPTY(&txn_global->nsnaph)) {
- if (name == NULL)
- return (0);
- /*
- * Dropping specific snapshots when there aren't any it's an
- * error.
- */
- WT_RET_MSG(session, EINVAL,
- "Named snapshot '%.*s' for drop not found",
- (int)name->len, name->str);
- }
-
- /*
- * The new ID will be none if we are removing all named snapshots
- * which is the default behavior of this loop.
- */
- new_nsnap_oldest = WT_TXN_NONE;
- if (name != NULL) {
- TAILQ_FOREACH(last, &txn_global->nsnaph, q) {
- if (WT_STRING_MATCH(last->name, name->str, name->len))
- break;
- prev = last;
- }
- if (last == NULL)
- WT_RET_MSG(session, EINVAL,
- "Named snapshot '%.*s' for drop not found",
- (int)name->len, name->str);
-
- if (!inclusive) {
- /* We are done if a drop before points to the head */
- if (prev == 0)
- return (0);
- last = prev;
- }
-
- if (TAILQ_NEXT(last, q) != NULL)
- new_nsnap_oldest = TAILQ_NEXT(last, q)->pinned_id;
- }
-
- do {
- nsnap = TAILQ_FIRST(&txn_global->nsnaph);
- WT_ASSERT(session, nsnap != NULL);
- TAILQ_REMOVE(&txn_global->nsnaph, nsnap, q);
- __nsnap_destroy(session, nsnap);
- WT_STAT_CONN_INCR(session, txn_snapshots_dropped);
- /* Last will be NULL in the all case so it will never match */
- } while (nsnap != last && !TAILQ_EMPTY(&txn_global->nsnaph));
-
- /* Now that the queue of named snapshots is updated, update the ID */
- WT_ASSERT(session, !__wt_txn_visible_all(
- session, txn_global->nsnap_oldest_id, WT_TS_NONE) &&
- (new_nsnap_oldest == WT_TXN_NONE ||
- WT_TXNID_LE(txn_global->nsnap_oldest_id, new_nsnap_oldest)));
- txn_global->nsnap_oldest_id = new_nsnap_oldest;
- WT_DIAGNOSTIC_YIELD;
- WT_ASSERT(session,
- new_nsnap_oldest == WT_TXN_NONE ||
- !__wt_txn_visible_all(session, new_nsnap_oldest, WT_TS_NONE));
-
- return (0);
+ WT_NAMED_SNAPSHOT *last, *nsnap, *prev;
+ WT_TXN_GLOBAL *txn_global;
+ uint64_t new_nsnap_oldest;
+
+ last = nsnap = prev = NULL;
+ txn_global = &S2C(session)->txn_global;
+
+ if (TAILQ_EMPTY(&txn_global->nsnaph)) {
+ if (name == NULL)
+ return (0);
+ /*
+ * Dropping specific snapshots when there aren't any is an error.
+ */
+ WT_RET_MSG(
+ session, EINVAL, "Named snapshot '%.*s' for drop not found", (int)name->len, name->str);
+ }
+
+ /*
+ * The new ID will be none if we are removing all named snapshots which is the default behavior
+ * of this loop.
+ */
+ new_nsnap_oldest = WT_TXN_NONE;
+ if (name != NULL) {
+ TAILQ_FOREACH (last, &txn_global->nsnaph, q) {
+ if (WT_STRING_MATCH(last->name, name->str, name->len))
+ break;
+ prev = last;
+ }
+ if (last == NULL)
+ WT_RET_MSG(session, EINVAL, "Named snapshot '%.*s' for drop not found", (int)name->len,
+ name->str);
+
+ if (!inclusive) {
+ /* We are done if a drop before points to the head */
+ if (prev == 0)
+ return (0);
+ last = prev;
+ }
+
+ if (TAILQ_NEXT(last, q) != NULL)
+ new_nsnap_oldest = TAILQ_NEXT(last, q)->pinned_id;
+ }
+
+ do {
+ nsnap = TAILQ_FIRST(&txn_global->nsnaph);
+ WT_ASSERT(session, nsnap != NULL);
+ TAILQ_REMOVE(&txn_global->nsnaph, nsnap, q);
+ __nsnap_destroy(session, nsnap);
+ WT_STAT_CONN_INCR(session, txn_snapshots_dropped);
+ /* Last will be NULL in the all case so it will never match */
+ } while (nsnap != last && !TAILQ_EMPTY(&txn_global->nsnaph));
+
+ /* Now that the queue of named snapshots is updated, update the ID */
+ WT_ASSERT(session, !__wt_txn_visible_all(session, txn_global->nsnap_oldest_id, WT_TS_NONE) &&
+ (new_nsnap_oldest == WT_TXN_NONE ||
+ WT_TXNID_LE(txn_global->nsnap_oldest_id, new_nsnap_oldest)));
+ txn_global->nsnap_oldest_id = new_nsnap_oldest;
+ WT_DIAGNOSTIC_YIELD;
+ WT_ASSERT(session, new_nsnap_oldest == WT_TXN_NONE ||
+ !__wt_txn_visible_all(session, new_nsnap_oldest, WT_TS_NONE));
+
+ return (0);
}
/*
* __wt_txn_named_snapshot_begin --
- * Begin an named in-memory snapshot.
+ * Begin a named in-memory snapshot.
*/
int
__wt_txn_named_snapshot_begin(WT_SESSION_IMPL *session, const char *cfg[])
{
- WT_CONFIG_ITEM cval;
- WT_DECL_RET;
- WT_NAMED_SNAPSHOT *nsnap, *nsnap_new;
- WT_TXN *txn;
- WT_TXN_GLOBAL *txn_global;
- const char *txn_cfg[] =
- { WT_CONFIG_BASE(session, WT_SESSION_begin_transaction),
- "isolation=snapshot", NULL };
- bool include_updates, started_txn;
-
- started_txn = false;
- nsnap_new = NULL;
- txn_global = &S2C(session)->txn_global;
- txn = &session->txn;
-
- WT_RET(__wt_config_gets_def(session, cfg, "include_updates", 0, &cval));
- include_updates = cval.val != 0;
-
- WT_RET(__wt_config_gets_def(session, cfg, "name", 0, &cval));
- WT_ASSERT(session, cval.len != 0);
-
- if (!F_ISSET(txn, WT_TXN_RUNNING)) {
- if (include_updates)
- WT_RET_MSG(session, EINVAL, "A transaction must be "
- "running to include updates in a named snapshot");
-
- WT_RET(__wt_txn_begin(session, txn_cfg));
- started_txn = true;
- }
- if (!include_updates)
- F_SET(txn, WT_TXN_READONLY);
-
- /* Save a copy of the transaction's snapshot. */
- WT_ERR(__wt_calloc_one(session, &nsnap_new));
- nsnap = nsnap_new;
- WT_ERR(__wt_strndup(session, cval.str, cval.len, &nsnap->name));
-
- /*
- * To include updates from a writing transaction, make sure a
- * transaction ID has been allocated.
- */
- if (include_updates) {
- WT_ERR(__wt_txn_id_check(session));
- WT_ASSERT(session, txn->id != WT_TXN_NONE);
- nsnap->id = txn->id;
- } else
- nsnap->id = WT_TXN_NONE;
- nsnap->pinned_id = WT_SESSION_TXN_STATE(session)->pinned_id;
- nsnap->snap_min = txn->snap_min;
- nsnap->snap_max = txn->snap_max;
- if (txn->snapshot_count > 0) {
- WT_ERR(__wt_calloc_def(
- session, txn->snapshot_count, &nsnap->snapshot));
- memcpy(nsnap->snapshot, txn->snapshot,
- txn->snapshot_count * sizeof(*nsnap->snapshot));
- }
- nsnap->snapshot_count = txn->snapshot_count;
-
- /* Update the list. */
-
- /*
- * The semantic is that a new snapshot with the same name as an
- * existing snapshot will replace the old one.
- */
- WT_ERR_NOTFOUND_OK(__nsnap_drop_one(session, &cval));
-
- if (TAILQ_EMPTY(&txn_global->nsnaph)) {
- WT_ASSERT(session, txn_global->nsnap_oldest_id == WT_TXN_NONE &&
- !__wt_txn_visible_all(
- session, nsnap_new->pinned_id, WT_TS_NONE));
- __wt_readlock(session, &txn_global->rwlock);
- txn_global->nsnap_oldest_id = nsnap_new->pinned_id;
- __wt_readunlock(session, &txn_global->rwlock);
- }
- TAILQ_INSERT_TAIL(&txn_global->nsnaph, nsnap_new, q);
- WT_STAT_CONN_INCR(session, txn_snapshots_created);
- nsnap_new = NULL;
-
-err: if (started_txn) {
+ WT_CONFIG_ITEM cval;
+ WT_DECL_RET;
+ WT_NAMED_SNAPSHOT *nsnap, *nsnap_new;
+ WT_TXN *txn;
+ WT_TXN_GLOBAL *txn_global;
+ const char *txn_cfg[] = {
+ WT_CONFIG_BASE(session, WT_SESSION_begin_transaction), "isolation=snapshot", NULL};
+ bool include_updates, started_txn;
+
+ started_txn = false;
+ nsnap_new = NULL;
+ txn_global = &S2C(session)->txn_global;
+ txn = &session->txn;
+
+ WT_RET(__wt_config_gets_def(session, cfg, "include_updates", 0, &cval));
+ include_updates = cval.val != 0;
+
+ WT_RET(__wt_config_gets_def(session, cfg, "name", 0, &cval));
+ WT_ASSERT(session, cval.len != 0);
+
+ if (!F_ISSET(txn, WT_TXN_RUNNING)) {
+ if (include_updates)
+ WT_RET_MSG(session, EINVAL,
+ "A transaction must be "
+ "running to include updates in a named snapshot");
+
+ WT_RET(__wt_txn_begin(session, txn_cfg));
+ started_txn = true;
+ }
+ if (!include_updates)
+ F_SET(txn, WT_TXN_READONLY);
+
+ /* Save a copy of the transaction's snapshot. */
+ WT_ERR(__wt_calloc_one(session, &nsnap_new));
+ nsnap = nsnap_new;
+ WT_ERR(__wt_strndup(session, cval.str, cval.len, &nsnap->name));
+
+ /*
+ * To include updates from a writing transaction, make sure a transaction ID has been allocated.
+ */
+ if (include_updates) {
+ WT_ERR(__wt_txn_id_check(session));
+ WT_ASSERT(session, txn->id != WT_TXN_NONE);
+ nsnap->id = txn->id;
+ } else
+ nsnap->id = WT_TXN_NONE;
+ nsnap->pinned_id = WT_SESSION_TXN_STATE(session)->pinned_id;
+ nsnap->snap_min = txn->snap_min;
+ nsnap->snap_max = txn->snap_max;
+ if (txn->snapshot_count > 0) {
+ WT_ERR(__wt_calloc_def(session, txn->snapshot_count, &nsnap->snapshot));
+ memcpy(nsnap->snapshot, txn->snapshot, txn->snapshot_count * sizeof(*nsnap->snapshot));
+ }
+ nsnap->snapshot_count = txn->snapshot_count;
+
+ /* Update the list. */
+
+ /*
+ * The semantic is that a new snapshot with the same name as an existing snapshot will replace
+ * the old one.
+ */
+ WT_ERR_NOTFOUND_OK(__nsnap_drop_one(session, &cval));
+
+ if (TAILQ_EMPTY(&txn_global->nsnaph)) {
+ WT_ASSERT(session, txn_global->nsnap_oldest_id == WT_TXN_NONE &&
+ !__wt_txn_visible_all(session, nsnap_new->pinned_id, WT_TS_NONE));
+ __wt_readlock(session, &txn_global->rwlock);
+ txn_global->nsnap_oldest_id = nsnap_new->pinned_id;
+ __wt_readunlock(session, &txn_global->rwlock);
+ }
+ TAILQ_INSERT_TAIL(&txn_global->nsnaph, nsnap_new, q);
+ WT_STAT_CONN_INCR(session, txn_snapshots_created);
+ nsnap_new = NULL;
+
+err:
+ if (started_txn) {
#ifdef HAVE_DIAGNOSTIC
- uint64_t pinned_id = WT_SESSION_TXN_STATE(session)->pinned_id;
+ uint64_t pinned_id = WT_SESSION_TXN_STATE(session)->pinned_id;
#endif
- WT_TRET(__wt_txn_rollback(session, NULL));
- WT_DIAGNOSTIC_YIELD;
- WT_ASSERT(session,
- !__wt_txn_visible_all(session, pinned_id, WT_TS_NONE));
- }
+ WT_TRET(__wt_txn_rollback(session, NULL));
+ WT_DIAGNOSTIC_YIELD;
+ WT_ASSERT(session, !__wt_txn_visible_all(session, pinned_id, WT_TS_NONE));
+ }
- if (nsnap_new != NULL)
- __nsnap_destroy(session, nsnap_new);
+ if (nsnap_new != NULL)
+ __nsnap_destroy(session, nsnap_new);
- return (ret);
+ return (ret);
}
/*
* __wt_txn_named_snapshot_drop --
- * Drop named snapshots
+ * Drop named snapshots
*/
int
__wt_txn_named_snapshot_drop(WT_SESSION_IMPL *session, const char *cfg[])
{
- WT_CONFIG objectconf;
- WT_CONFIG_ITEM all_config, before_config, k, names_config, to_config, v;
- WT_DECL_RET;
-
- WT_RET(__wt_config_gets_def(session, cfg, "drop.all", 0, &all_config));
- WT_RET(__wt_config_gets_def(
- session, cfg, "drop.names", 0, &names_config));
- WT_RET(__wt_config_gets_def(session, cfg, "drop.to", 0, &to_config));
- WT_RET(__wt_config_gets_def(
- session, cfg, "drop.before", 0, &before_config));
-
- if (all_config.val != 0)
- WT_RET(__nsnap_drop_to(session, NULL, true));
- else if (before_config.len != 0)
- WT_RET(__nsnap_drop_to(session, &before_config, false));
- else if (to_config.len != 0)
- WT_RET(__nsnap_drop_to(session, &to_config, true));
-
- /* We are done if there are no named drops */
-
- if (names_config.len != 0) {
- __wt_config_subinit(session, &objectconf, &names_config);
- while ((ret = __wt_config_next(&objectconf, &k, &v)) == 0) {
- ret = __nsnap_drop_one(session, &k);
- if (ret != 0)
- WT_RET_MSG(session, EINVAL,
- "Named snapshot '%.*s' for drop not found",
- (int)k.len, k.str);
- }
- if (ret == WT_NOTFOUND)
- ret = 0;
- }
-
- return (ret);
+ WT_CONFIG objectconf;
+ WT_CONFIG_ITEM all_config, before_config, k, names_config, to_config, v;
+ WT_DECL_RET;
+
+ WT_RET(__wt_config_gets_def(session, cfg, "drop.all", 0, &all_config));
+ WT_RET(__wt_config_gets_def(session, cfg, "drop.names", 0, &names_config));
+ WT_RET(__wt_config_gets_def(session, cfg, "drop.to", 0, &to_config));
+ WT_RET(__wt_config_gets_def(session, cfg, "drop.before", 0, &before_config));
+
+ if (all_config.val != 0)
+ WT_RET(__nsnap_drop_to(session, NULL, true));
+ else if (before_config.len != 0)
+ WT_RET(__nsnap_drop_to(session, &before_config, false));
+ else if (to_config.len != 0)
+ WT_RET(__nsnap_drop_to(session, &to_config, true));
+
+ /* We are done if there are no named drops */
+
+ if (names_config.len != 0) {
+ __wt_config_subinit(session, &objectconf, &names_config);
+ while ((ret = __wt_config_next(&objectconf, &k, &v)) == 0) {
+ ret = __nsnap_drop_one(session, &k);
+ if (ret != 0)
+ WT_RET_MSG(
+ session, EINVAL, "Named snapshot '%.*s' for drop not found", (int)k.len, k.str);
+ }
+ if (ret == WT_NOTFOUND)
+ ret = 0;
+ }
+
+ return (ret);
}
/*
* __wt_txn_named_snapshot_get --
- * Lookup a named snapshot for a transaction.
+ * Lookup a named snapshot for a transaction.
*/
int
__wt_txn_named_snapshot_get(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *nameval)
{
- WT_NAMED_SNAPSHOT *nsnap;
- WT_TXN *txn;
- WT_TXN_GLOBAL *txn_global;
- WT_TXN_STATE *txn_state;
-
- txn = &session->txn;
- txn_global = &S2C(session)->txn_global;
- txn_state = WT_SESSION_TXN_STATE(session);
-
- txn->isolation = WT_ISO_SNAPSHOT;
- if (session->ncursors > 0)
- WT_RET(__wt_session_copy_values(session));
-
- __wt_readlock(session, &txn_global->nsnap_rwlock);
- TAILQ_FOREACH(nsnap, &txn_global->nsnaph, q)
- if (WT_STRING_MATCH(nsnap->name, nameval->str, nameval->len)) {
- /*
- * Acquire the scan lock so the oldest ID can't move
- * forward without seeing our pinned ID.
- */
- __wt_readlock(session, &txn_global->rwlock);
- txn_state->pinned_id = nsnap->pinned_id;
- __wt_readunlock(session, &txn_global->rwlock);
-
- WT_ASSERT(session, !__wt_txn_visible_all(
- session, txn_state->pinned_id, WT_TS_NONE) &&
- txn_global->nsnap_oldest_id != WT_TXN_NONE &&
- WT_TXNID_LE(txn_global->nsnap_oldest_id,
- txn_state->pinned_id));
- txn->snap_min = nsnap->snap_min;
- txn->snap_max = nsnap->snap_max;
- if ((txn->snapshot_count = nsnap->snapshot_count) != 0)
- memcpy(txn->snapshot, nsnap->snapshot,
- nsnap->snapshot_count *
- sizeof(*nsnap->snapshot));
- if (nsnap->id != WT_TXN_NONE) {
- WT_ASSERT(session, txn->id == WT_TXN_NONE);
- txn->id = nsnap->id;
- F_SET(txn, WT_TXN_READONLY);
- }
- F_SET(txn, WT_TXN_HAS_SNAPSHOT);
- break;
- }
- __wt_readunlock(session, &txn_global->nsnap_rwlock);
-
- if (nsnap == NULL)
- WT_RET_MSG(session, EINVAL,
- "Named snapshot '%.*s' not found",
- (int)nameval->len, nameval->str);
-
- /* Flag that this transaction is opened on a named snapshot */
- F_SET(txn, WT_TXN_NAMED_SNAPSHOT);
-
- return (0);
+ WT_NAMED_SNAPSHOT *nsnap;
+ WT_TXN *txn;
+ WT_TXN_GLOBAL *txn_global;
+ WT_TXN_STATE *txn_state;
+
+ txn = &session->txn;
+ txn_global = &S2C(session)->txn_global;
+ txn_state = WT_SESSION_TXN_STATE(session);
+
+ txn->isolation = WT_ISO_SNAPSHOT;
+ if (session->ncursors > 0)
+ WT_RET(__wt_session_copy_values(session));
+
+ __wt_readlock(session, &txn_global->nsnap_rwlock);
+ TAILQ_FOREACH (nsnap, &txn_global->nsnaph, q)
+ if (WT_STRING_MATCH(nsnap->name, nameval->str, nameval->len)) {
+ /*
+ * Acquire the scan lock so the oldest ID can't move forward without seeing our pinned
+ * ID.
+ */
+ __wt_readlock(session, &txn_global->rwlock);
+ txn_state->pinned_id = nsnap->pinned_id;
+ __wt_readunlock(session, &txn_global->rwlock);
+
+ WT_ASSERT(session, !__wt_txn_visible_all(session, txn_state->pinned_id, WT_TS_NONE) &&
+ txn_global->nsnap_oldest_id != WT_TXN_NONE &&
+ WT_TXNID_LE(txn_global->nsnap_oldest_id, txn_state->pinned_id));
+ txn->snap_min = nsnap->snap_min;
+ txn->snap_max = nsnap->snap_max;
+ if ((txn->snapshot_count = nsnap->snapshot_count) != 0)
+ memcpy(
+ txn->snapshot, nsnap->snapshot, nsnap->snapshot_count * sizeof(*nsnap->snapshot));
+ if (nsnap->id != WT_TXN_NONE) {
+ WT_ASSERT(session, txn->id == WT_TXN_NONE);
+ txn->id = nsnap->id;
+ F_SET(txn, WT_TXN_READONLY);
+ }
+ F_SET(txn, WT_TXN_HAS_SNAPSHOT);
+ break;
+ }
+ __wt_readunlock(session, &txn_global->nsnap_rwlock);
+
+ if (nsnap == NULL)
+ WT_RET_MSG(
+ session, EINVAL, "Named snapshot '%.*s' not found", (int)nameval->len, nameval->str);
+
+ /* Flag that this transaction is opened on a named snapshot */
+ F_SET(txn, WT_TXN_NAMED_SNAPSHOT);
+
+ return (0);
}
/*
* __wt_txn_named_snapshot_config --
- * Check the configuration for a named snapshot
+ * Check the configuration for a named snapshot
*/
int
-__wt_txn_named_snapshot_config(WT_SESSION_IMPL *session,
- const char *cfg[], bool *has_create, bool *has_drops)
+__wt_txn_named_snapshot_config(
+ WT_SESSION_IMPL *session, const char *cfg[], bool *has_create, bool *has_drops)
{
- WT_CONFIG_ITEM all_config, before_config, names_config, to_config;
- WT_CONFIG_ITEM cval;
- WT_TXN *txn;
-
- txn = &session->txn;
- *has_create = *has_drops = false;
-
- /* Verify that the name is legal. */
- WT_RET(__wt_config_gets_def(session, cfg, "name", 0, &cval));
- if (cval.len != 0) {
- if (WT_STRING_MATCH("all", cval.str, cval.len))
- WT_RET_MSG(session, EINVAL,
- "Can't create snapshot with reserved \"all\" name");
-
- WT_RET(__wt_name_check(session, cval.str, cval.len));
-
- if (F_ISSET(txn, WT_TXN_RUNNING) &&
- txn->isolation != WT_ISO_SNAPSHOT)
- WT_RET_MSG(session, EINVAL,
- "Can't create a named snapshot from a running "
- "transaction that isn't snapshot isolation");
- else if (F_ISSET(txn, WT_TXN_RUNNING) && txn->mod_count != 0)
- WT_RET_MSG(session, EINVAL,
- "Can't create a named snapshot from a running "
- "transaction that has made updates");
- *has_create = true;
- }
-
- /* Verify that the drop configuration is sane. */
- WT_RET(__wt_config_gets_def(session, cfg, "drop.all", 0, &all_config));
- WT_RET(__wt_config_gets_def(
- session, cfg, "drop.names", 0, &names_config));
- WT_RET(__wt_config_gets_def(session, cfg, "drop.to", 0, &to_config));
- WT_RET(__wt_config_gets_def(
- session, cfg, "drop.before", 0, &before_config));
-
- /* Avoid more work if no drops are configured. */
- if (all_config.val != 0 || names_config.len != 0 ||
- before_config.len != 0 || to_config.len != 0) {
- if (before_config.len != 0 && to_config.len != 0)
- WT_RET_MSG(session, EINVAL,
- "Illegal configuration; named snapshot drop can't "
- "specify both before and to options");
- if (all_config.val != 0 && (names_config.len != 0 ||
- to_config.len != 0 || before_config.len != 0))
- WT_RET_MSG(session, EINVAL,
- "Illegal configuration; named snapshot drop can't "
- "specify all and any other options");
- *has_drops = true;
- }
-
- if (!*has_create && !*has_drops)
- WT_RET_MSG(session, EINVAL,
- "WT_SESSION::snapshot API called without any drop or "
- "name option");
-
- return (0);
+ WT_CONFIG_ITEM all_config, before_config, names_config, to_config;
+ WT_CONFIG_ITEM cval;
+ WT_TXN *txn;
+
+ txn = &session->txn;
+ *has_create = *has_drops = false;
+
+ /* Verify that the name is legal. */
+ WT_RET(__wt_config_gets_def(session, cfg, "name", 0, &cval));
+ if (cval.len != 0) {
+ if (WT_STRING_MATCH("all", cval.str, cval.len))
+ WT_RET_MSG(session, EINVAL, "Can't create snapshot with reserved \"all\" name");
+
+ WT_RET(__wt_name_check(session, cval.str, cval.len));
+
+ if (F_ISSET(txn, WT_TXN_RUNNING) && txn->isolation != WT_ISO_SNAPSHOT)
+ WT_RET_MSG(session, EINVAL,
+ "Can't create a named snapshot from a running "
+ "transaction that isn't snapshot isolation");
+ else if (F_ISSET(txn, WT_TXN_RUNNING) && txn->mod_count != 0)
+ WT_RET_MSG(session, EINVAL,
+ "Can't create a named snapshot from a running "
+ "transaction that has made updates");
+ *has_create = true;
+ }
+
+ /* Verify that the drop configuration is sane. */
+ WT_RET(__wt_config_gets_def(session, cfg, "drop.all", 0, &all_config));
+ WT_RET(__wt_config_gets_def(session, cfg, "drop.names", 0, &names_config));
+ WT_RET(__wt_config_gets_def(session, cfg, "drop.to", 0, &to_config));
+ WT_RET(__wt_config_gets_def(session, cfg, "drop.before", 0, &before_config));
+
+ /* Avoid more work if no drops are configured. */
+ if (all_config.val != 0 || names_config.len != 0 || before_config.len != 0 ||
+ to_config.len != 0) {
+ if (before_config.len != 0 && to_config.len != 0)
+ WT_RET_MSG(session, EINVAL,
+ "Illegal configuration; named snapshot drop can't "
+ "specify both before and to options");
+ if (all_config.val != 0 &&
+ (names_config.len != 0 || to_config.len != 0 || before_config.len != 0))
+ WT_RET_MSG(session, EINVAL,
+ "Illegal configuration; named snapshot drop can't "
+ "specify all and any other options");
+ *has_drops = true;
+ }
+
+ if (!*has_create && !*has_drops)
+ WT_RET_MSG(session, EINVAL,
+ "WT_SESSION::snapshot API called without any drop or "
+ "name option");
+
+ return (0);
}
/*
* __wt_txn_named_snapshot_destroy --
- * Destroy all named snapshots on connection close
+ * Destroy all named snapshots on connection close
*/
void
__wt_txn_named_snapshot_destroy(WT_SESSION_IMPL *session)
{
- WT_NAMED_SNAPSHOT *nsnap;
- WT_TXN_GLOBAL *txn_global;
+ WT_NAMED_SNAPSHOT *nsnap;
+ WT_TXN_GLOBAL *txn_global;
- txn_global = &S2C(session)->txn_global;
- txn_global->nsnap_oldest_id = WT_TXN_NONE;
+ txn_global = &S2C(session)->txn_global;
+ txn_global->nsnap_oldest_id = WT_TXN_NONE;
- while ((nsnap = TAILQ_FIRST(&txn_global->nsnaph)) != NULL) {
- TAILQ_REMOVE(&txn_global->nsnaph, nsnap, q);
- __nsnap_destroy(session, nsnap);
- }
+ while ((nsnap = TAILQ_FIRST(&txn_global->nsnaph)) != NULL) {
+ TAILQ_REMOVE(&txn_global->nsnaph, nsnap, q);
+ __nsnap_destroy(session, nsnap);
+ }
}
diff --git a/src/third_party/wiredtiger/src/txn/txn_recover.c b/src/third_party/wiredtiger/src/txn/txn_recover.c
index 504b2c0e8b4..17e0b61c904 100644
--- a/src/third_party/wiredtiger/src/txn/txn_recover.c
+++ b/src/third_party/wiredtiger/src/txn/txn_recover.c
@@ -10,776 +10,717 @@
/* State maintained during recovery. */
typedef struct {
- const char *uri; /* File URI. */
- WT_CURSOR *c; /* Cursor used for recovery. */
- WT_LSN ckpt_lsn; /* File's checkpoint LSN. */
+ const char *uri; /* File URI. */
+ WT_CURSOR *c; /* Cursor used for recovery. */
+ WT_LSN ckpt_lsn; /* File's checkpoint LSN. */
} WT_RECOVERY_FILE;
typedef struct {
- WT_SESSION_IMPL *session;
-
- /* Files from the metadata, indexed by file ID. */
- WT_RECOVERY_FILE *files;
- size_t file_alloc; /* Allocated size of files array. */
- u_int max_fileid; /* Maximum file ID seen. */
- u_int nfiles; /* Number of files in the metadata. */
-
- WT_LSN ckpt_lsn; /* Start LSN for main recovery loop. */
- WT_LSN max_ckpt_lsn; /* Maximum checkpoint LSN seen. */
- WT_LSN max_rec_lsn; /* Maximum recovery LSN seen. */
-
- bool missing; /* Were there missing files? */
- bool metadata_only; /*
- * Set during the first recovery pass,
- * when only the metadata is recovered.
- */
+ WT_SESSION_IMPL *session;
+
+ /* Files from the metadata, indexed by file ID. */
+ WT_RECOVERY_FILE *files;
+ size_t file_alloc; /* Allocated size of files array. */
+ u_int max_fileid; /* Maximum file ID seen. */
+ u_int nfiles; /* Number of files in the metadata. */
+
+ WT_LSN ckpt_lsn; /* Start LSN for main recovery loop. */
+ WT_LSN max_ckpt_lsn; /* Maximum checkpoint LSN seen. */
+ WT_LSN max_rec_lsn; /* Maximum recovery LSN seen. */
+
+ bool missing; /* Were there missing files? */
+ bool metadata_only; /*
+ * Set during the first recovery pass,
+ * when only the metadata is recovered.
+ */
} WT_RECOVERY;
/*
* __recovery_cursor --
- * Get a cursor for a recovery operation.
+ * Get a cursor for a recovery operation.
*/
static int
-__recovery_cursor(WT_SESSION_IMPL *session, WT_RECOVERY *r,
- WT_LSN *lsnp, u_int id, bool duplicate, WT_CURSOR **cp)
+__recovery_cursor(
+ WT_SESSION_IMPL *session, WT_RECOVERY *r, WT_LSN *lsnp, u_int id, bool duplicate, WT_CURSOR **cp)
{
- WT_CURSOR *c;
- bool metadata_op;
- const char *cfg[] = { WT_CONFIG_BASE(
- session, WT_SESSION_open_cursor), "overwrite", NULL };
-
- c = NULL;
-
- /*
- * File ids with the bit set to ignore this operation are skipped.
- */
- if (WT_LOGOP_IS_IGNORED(id))
- return (0);
- /*
- * Metadata operations have an id of 0. Match operations based
- * on the id and the current pass of recovery for metadata.
- *
- * Only apply operations in the correct metadata phase, and if the LSN
- * is more recent than the last checkpoint. If there is no entry for a
- * file, assume it was dropped or missing after a hot backup.
- */
- metadata_op = id == WT_METAFILE_ID;
- if (r->metadata_only != metadata_op)
- ;
- else if (id >= r->nfiles || r->files[id].uri == NULL) {
- /* If a file is missing, output a verbose message once. */
- if (!r->missing)
- __wt_verbose(session, WT_VERB_RECOVERY,
- "No file found with ID %u (max %u)",
- id, r->nfiles);
- r->missing = true;
- } else if (__wt_log_cmp(lsnp, &r->files[id].ckpt_lsn) >= 0) {
- /*
- * We're going to apply the operation. Get the cursor, opening
- * one if none is cached.
- */
- if ((c = r->files[id].c) == NULL) {
- WT_RET(__wt_open_cursor(
- session, r->files[id].uri, NULL, cfg, &c));
- r->files[id].c = c;
- }
- }
-
- if (duplicate && c != NULL)
- WT_RET(__wt_open_cursor(
- session, r->files[id].uri, NULL, cfg, &c));
-
- *cp = c;
- return (0);
+ WT_CURSOR *c;
+ const char *cfg[] = {WT_CONFIG_BASE(session, WT_SESSION_open_cursor), "overwrite", NULL};
+ bool metadata_op;
+
+ c = NULL;
+
+ /*
+ * File ids with the bit set to ignore this operation are skipped.
+ */
+ if (WT_LOGOP_IS_IGNORED(id))
+ return (0);
+ /*
+ * Metadata operations have an id of 0. Match operations based
+ * on the id and the current pass of recovery for metadata.
+ *
+ * Only apply operations in the correct metadata phase, and if the LSN
+ * is more recent than the last checkpoint. If there is no entry for a
+ * file, assume it was dropped or missing after a hot backup.
+ */
+ metadata_op = id == WT_METAFILE_ID;
+ if (r->metadata_only != metadata_op)
+ ;
+ else if (id >= r->nfiles || r->files[id].uri == NULL) {
+ /* If a file is missing, output a verbose message once. */
+ if (!r->missing)
+ __wt_verbose(
+ session, WT_VERB_RECOVERY, "No file found with ID %u (max %u)", id, r->nfiles);
+ r->missing = true;
+ } else if (__wt_log_cmp(lsnp, &r->files[id].ckpt_lsn) >= 0) {
+ /*
+ * We're going to apply the operation. Get the cursor, opening one if none is cached.
+ */
+ if ((c = r->files[id].c) == NULL) {
+ WT_RET(__wt_open_cursor(session, r->files[id].uri, NULL, cfg, &c));
+ r->files[id].c = c;
+ }
+ }
+
+ if (duplicate && c != NULL)
+ WT_RET(__wt_open_cursor(session, r->files[id].uri, NULL, cfg, &c));
+
+ *cp = c;
+ return (0);
}
/*
* Helper to a cursor if this operation is to be applied during recovery.
*/
-#define GET_RECOVERY_CURSOR(session, r, lsnp, fileid, cp) \
- ret = __recovery_cursor(session, r, lsnp, fileid, false, cp); \
- __wt_verbose(session, WT_VERB_RECOVERY, \
- "%s op %" PRIu32 " to file %" PRIu32 " at LSN %" PRIu32 \
- "/%" PRIu32, \
- ret != 0 ? "Error" : \
- cursor == NULL ? "Skipping" : "Applying", \
- optype, fileid, (lsnp)->l.file, (lsnp)->l.offset); \
- WT_ERR(ret); \
- if (cursor == NULL) \
- break
+#define GET_RECOVERY_CURSOR(session, r, lsnp, fileid, cp) \
+ ret = __recovery_cursor(session, r, lsnp, fileid, false, cp); \
+ __wt_verbose(session, WT_VERB_RECOVERY, \
+ "%s op %" PRIu32 " to file %" PRIu32 " at LSN %" PRIu32 "/%" PRIu32, \
+ ret != 0 ? "Error" : cursor == NULL ? "Skipping" : "Applying", optype, fileid, \
+ (lsnp)->l.file, (lsnp)->l.offset); \
+ WT_ERR(ret); \
+ if (cursor == NULL) \
+ break
/*
* __txn_op_apply --
- * Apply a transactional operation during recovery.
+ * Apply a transactional operation during recovery.
*/
static int
-__txn_op_apply(
- WT_RECOVERY *r, WT_LSN *lsnp, const uint8_t **pp, const uint8_t *end)
+__txn_op_apply(WT_RECOVERY *r, WT_LSN *lsnp, const uint8_t **pp, const uint8_t *end)
{
- WT_CURSOR *cursor, *start, *stop;
- WT_DECL_RET;
- WT_ITEM key, start_key, stop_key, value;
- WT_SESSION_IMPL *session;
- wt_timestamp_t commit, durable, first, prepare, read;
- uint64_t recno, start_recno, stop_recno, t_nsec, t_sec;
- uint32_t fileid, mode, optype, opsize;
-
- session = r->session;
- cursor = NULL;
-
- /* Peek at the size and the type. */
- WT_ERR(__wt_logop_read(session, pp, end, &optype, &opsize));
- end = *pp + opsize;
-
- /*
- * If it is an operation type that should be ignored, we're done.
- * Note that file ids within known operations also use the same
- * macros to indicate that operation should be ignored.
- */
- if (WT_LOGOP_IS_IGNORED(optype)) {
- *pp += opsize;
- goto done;
- }
-
- switch (optype) {
- case WT_LOGOP_COL_MODIFY:
- WT_ERR(__wt_logop_col_modify_unpack(session, pp, end,
- &fileid, &recno, &value));
- GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor);
- cursor->set_key(cursor, recno);
- if ((ret = cursor->search(cursor)) != 0)
- WT_ERR_NOTFOUND_OK(ret);
- else {
- /*
- * Build/insert a complete value during recovery rather
- * than using cursor modify to create a partial update
- * (for no particular reason than simplicity).
- */
- WT_ERR(__wt_modify_apply(cursor, value.data));
- WT_ERR(cursor->insert(cursor));
- }
- break;
-
- case WT_LOGOP_COL_PUT:
- WT_ERR(__wt_logop_col_put_unpack(session, pp, end,
- &fileid, &recno, &value));
- GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor);
- cursor->set_key(cursor, recno);
- __wt_cursor_set_raw_value(cursor, &value);
- WT_ERR(cursor->insert(cursor));
- break;
-
- case WT_LOGOP_COL_REMOVE:
- WT_ERR(__wt_logop_col_remove_unpack(session, pp, end,
- &fileid, &recno));
- GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor);
- cursor->set_key(cursor, recno);
- WT_ERR(cursor->remove(cursor));
- break;
-
- case WT_LOGOP_COL_TRUNCATE:
- WT_ERR(__wt_logop_col_truncate_unpack(session, pp, end,
- &fileid, &start_recno, &stop_recno));
- GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor);
-
- /* Set up the cursors. */
- if (start_recno == WT_RECNO_OOB) {
- start = NULL;
- stop = cursor;
- } else if (stop_recno == WT_RECNO_OOB) {
- start = cursor;
- stop = NULL;
- } else {
- start = cursor;
- WT_ERR(__recovery_cursor(
- session, r, lsnp, fileid, true, &stop));
- }
-
- /* Set the keys. */
- if (start != NULL)
- start->set_key(start, start_recno);
- if (stop != NULL)
- stop->set_key(stop, stop_recno);
-
- WT_TRET(session->iface.truncate(&session->iface, NULL,
- start, stop, NULL));
- /* If we opened a duplicate cursor, close it now. */
- if (stop != NULL && stop != cursor)
- WT_TRET(stop->close(stop));
- WT_ERR(ret);
- break;
-
- case WT_LOGOP_ROW_MODIFY:
- WT_ERR(__wt_logop_row_modify_unpack(session, pp, end,
- &fileid, &key, &value));
- GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor);
- __wt_cursor_set_raw_key(cursor, &key);
- if ((ret = cursor->search(cursor)) != 0)
- WT_ERR_NOTFOUND_OK(ret);
- else {
- /*
- * Build/insert a complete value during recovery rather
- * than using cursor modify to create a partial update
- * (for no particular reason than simplicity).
- */
- WT_ERR(__wt_modify_apply(cursor, value.data));
- WT_ERR(cursor->insert(cursor));
- }
- break;
-
- case WT_LOGOP_ROW_PUT:
- WT_ERR(__wt_logop_row_put_unpack(session, pp, end,
- &fileid, &key, &value));
- GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor);
- __wt_cursor_set_raw_key(cursor, &key);
- __wt_cursor_set_raw_value(cursor, &value);
- WT_ERR(cursor->insert(cursor));
- break;
-
- case WT_LOGOP_ROW_REMOVE:
- WT_ERR(__wt_logop_row_remove_unpack(session, pp, end,
- &fileid, &key));
- GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor);
- __wt_cursor_set_raw_key(cursor, &key);
- WT_ERR(cursor->remove(cursor));
- break;
-
- case WT_LOGOP_ROW_TRUNCATE:
- WT_ERR(__wt_logop_row_truncate_unpack(session, pp, end,
- &fileid, &start_key, &stop_key, &mode));
- GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor);
- /* Set up the cursors. */
- start = stop = NULL;
- switch (mode) {
- case WT_TXN_TRUNC_ALL:
- /* Both cursors stay NULL. */
- break;
- case WT_TXN_TRUNC_BOTH:
- start = cursor;
- WT_ERR(__recovery_cursor(
- session, r, lsnp, fileid, true, &stop));
- break;
- case WT_TXN_TRUNC_START:
- start = cursor;
- break;
- case WT_TXN_TRUNC_STOP:
- stop = cursor;
- break;
- default:
- WT_ERR(__wt_illegal_value(session, mode));
- }
-
- /* Set the keys. */
- if (start != NULL)
- __wt_cursor_set_raw_key(start, &start_key);
- if (stop != NULL)
- __wt_cursor_set_raw_key(stop, &stop_key);
-
- WT_TRET(session->iface.truncate(&session->iface, NULL,
- start, stop, NULL));
- /* If we opened a duplicate cursor, close it now. */
- if (stop != NULL && stop != cursor)
- WT_TRET(stop->close(stop));
- WT_ERR(ret);
- break;
- case WT_LOGOP_TXN_TIMESTAMP:
- /*
- * Timestamp records are informational only. We have to
- * unpack it to properly move forward in the log record
- * to the next operation, but otherwise ignore.
- */
- WT_ERR(__wt_logop_txn_timestamp_unpack(session, pp, end, &t_sec,
- &t_nsec, &commit, &durable, &first, &prepare, &read));
- break;
- default:
- WT_ERR(__wt_illegal_value(session, optype));
- }
+ WT_CURSOR *cursor, *start, *stop;
+ WT_DECL_RET;
+ WT_ITEM key, start_key, stop_key, value;
+ WT_SESSION_IMPL *session;
+ wt_timestamp_t commit, durable, first, prepare, read;
+ uint64_t recno, start_recno, stop_recno, t_nsec, t_sec;
+ uint32_t fileid, mode, optype, opsize;
+
+ session = r->session;
+ cursor = NULL;
+
+ /* Peek at the size and the type. */
+ WT_ERR(__wt_logop_read(session, pp, end, &optype, &opsize));
+ end = *pp + opsize;
+
+ /*
+ * If it is an operation type that should be ignored, we're done. Note that file ids within
+ * known operations also use the same macros to indicate that operation should be ignored.
+ */
+ if (WT_LOGOP_IS_IGNORED(optype)) {
+ *pp += opsize;
+ goto done;
+ }
+
+ switch (optype) {
+ case WT_LOGOP_COL_MODIFY:
+ WT_ERR(__wt_logop_col_modify_unpack(session, pp, end, &fileid, &recno, &value));
+ GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor);
+ cursor->set_key(cursor, recno);
+ if ((ret = cursor->search(cursor)) != 0)
+ WT_ERR_NOTFOUND_OK(ret);
+ else {
+ /*
+ * Build/insert a complete value during recovery rather
+ * than using cursor modify to create a partial update
+ * (for no particular reason than simplicity).
+ */
+ WT_ERR(__wt_modify_apply(cursor, value.data));
+ WT_ERR(cursor->insert(cursor));
+ }
+ break;
+
+ case WT_LOGOP_COL_PUT:
+ WT_ERR(__wt_logop_col_put_unpack(session, pp, end, &fileid, &recno, &value));
+ GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor);
+ cursor->set_key(cursor, recno);
+ __wt_cursor_set_raw_value(cursor, &value);
+ WT_ERR(cursor->insert(cursor));
+ break;
+
+ case WT_LOGOP_COL_REMOVE:
+ WT_ERR(__wt_logop_col_remove_unpack(session, pp, end, &fileid, &recno));
+ GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor);
+ cursor->set_key(cursor, recno);
+ WT_ERR(cursor->remove(cursor));
+ break;
+
+ case WT_LOGOP_COL_TRUNCATE:
+ WT_ERR(
+ __wt_logop_col_truncate_unpack(session, pp, end, &fileid, &start_recno, &stop_recno));
+ GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor);
+
+ /* Set up the cursors. */
+ if (start_recno == WT_RECNO_OOB) {
+ start = NULL;
+ stop = cursor;
+ } else if (stop_recno == WT_RECNO_OOB) {
+ start = cursor;
+ stop = NULL;
+ } else {
+ start = cursor;
+ WT_ERR(__recovery_cursor(session, r, lsnp, fileid, true, &stop));
+ }
+
+ /* Set the keys. */
+ if (start != NULL)
+ start->set_key(start, start_recno);
+ if (stop != NULL)
+ stop->set_key(stop, stop_recno);
+
+ WT_TRET(session->iface.truncate(&session->iface, NULL, start, stop, NULL));
+ /* If we opened a duplicate cursor, close it now. */
+ if (stop != NULL && stop != cursor)
+ WT_TRET(stop->close(stop));
+ WT_ERR(ret);
+ break;
+
+ case WT_LOGOP_ROW_MODIFY:
+ WT_ERR(__wt_logop_row_modify_unpack(session, pp, end, &fileid, &key, &value));
+ GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor);
+ __wt_cursor_set_raw_key(cursor, &key);
+ if ((ret = cursor->search(cursor)) != 0)
+ WT_ERR_NOTFOUND_OK(ret);
+ else {
+ /*
+ * Build/insert a complete value during recovery rather
+ * than using cursor modify to create a partial update
+ * (for no particular reason than simplicity).
+ */
+ WT_ERR(__wt_modify_apply(cursor, value.data));
+ WT_ERR(cursor->insert(cursor));
+ }
+ break;
+
+ case WT_LOGOP_ROW_PUT:
+ WT_ERR(__wt_logop_row_put_unpack(session, pp, end, &fileid, &key, &value));
+ GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor);
+ __wt_cursor_set_raw_key(cursor, &key);
+ __wt_cursor_set_raw_value(cursor, &value);
+ WT_ERR(cursor->insert(cursor));
+ break;
+
+ case WT_LOGOP_ROW_REMOVE:
+ WT_ERR(__wt_logop_row_remove_unpack(session, pp, end, &fileid, &key));
+ GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor);
+ __wt_cursor_set_raw_key(cursor, &key);
+ WT_ERR(cursor->remove(cursor));
+ break;
+
+ case WT_LOGOP_ROW_TRUNCATE:
+ WT_ERR(
+ __wt_logop_row_truncate_unpack(session, pp, end, &fileid, &start_key, &stop_key, &mode));
+ GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor);
+ /* Set up the cursors. */
+ start = stop = NULL;
+ switch (mode) {
+ case WT_TXN_TRUNC_ALL:
+ /* Both cursors stay NULL. */
+ break;
+ case WT_TXN_TRUNC_BOTH:
+ start = cursor;
+ WT_ERR(__recovery_cursor(session, r, lsnp, fileid, true, &stop));
+ break;
+ case WT_TXN_TRUNC_START:
+ start = cursor;
+ break;
+ case WT_TXN_TRUNC_STOP:
+ stop = cursor;
+ break;
+ default:
+ WT_ERR(__wt_illegal_value(session, mode));
+ }
+
+ /* Set the keys. */
+ if (start != NULL)
+ __wt_cursor_set_raw_key(start, &start_key);
+ if (stop != NULL)
+ __wt_cursor_set_raw_key(stop, &stop_key);
+
+ WT_TRET(session->iface.truncate(&session->iface, NULL, start, stop, NULL));
+ /* If we opened a duplicate cursor, close it now. */
+ if (stop != NULL && stop != cursor)
+ WT_TRET(stop->close(stop));
+ WT_ERR(ret);
+ break;
+ case WT_LOGOP_TXN_TIMESTAMP:
+ /*
+ * Timestamp records are informational only. We have to unpack it to properly move forward
+ * in the log record to the next operation, but otherwise ignore.
+ */
+ WT_ERR(__wt_logop_txn_timestamp_unpack(
+ session, pp, end, &t_sec, &t_nsec, &commit, &durable, &first, &prepare, &read));
+ break;
+ default:
+ WT_ERR(__wt_illegal_value(session, optype));
+ }
done:
- /* Reset the cursor so it doesn't block eviction. */
- if (cursor != NULL)
- WT_ERR(cursor->reset(cursor));
- return (0);
-
-err: __wt_err(session, ret,
- "operation apply failed during recovery: operation type %"
- PRIu32 " at LSN %" PRIu32 "/%" PRIu32,
- optype, lsnp->l.file, lsnp->l.offset);
- return (ret);
+ /* Reset the cursor so it doesn't block eviction. */
+ if (cursor != NULL)
+ WT_ERR(cursor->reset(cursor));
+ return (0);
+
+err:
+ __wt_err(session, ret, "operation apply failed during recovery: operation type %" PRIu32
+ " at LSN %" PRIu32 "/%" PRIu32,
+ optype, lsnp->l.file, lsnp->l.offset);
+ return (ret);
}
/*
* __txn_commit_apply --
- * Apply a commit record during recovery.
+ * Apply a commit record during recovery.
*/
static int
-__txn_commit_apply(
- WT_RECOVERY *r, WT_LSN *lsnp, const uint8_t **pp, const uint8_t *end)
+__txn_commit_apply(WT_RECOVERY *r, WT_LSN *lsnp, const uint8_t **pp, const uint8_t *end)
{
- /* The logging subsystem zero-pads records. */
- while (*pp < end && **pp)
- WT_RET(__txn_op_apply(r, lsnp, pp, end));
+ /* The logging subsystem zero-pads records. */
+ while (*pp < end && **pp)
+ WT_RET(__txn_op_apply(r, lsnp, pp, end));
- return (0);
+ return (0);
}
/*
* __txn_log_recover --
- * Roll the log forward to recover committed changes.
+ * Roll the log forward to recover committed changes.
*/
static int
-__txn_log_recover(WT_SESSION_IMPL *session,
- WT_ITEM *logrec, WT_LSN *lsnp, WT_LSN *next_lsnp,
- void *cookie, int firstrecord)
+__txn_log_recover(WT_SESSION_IMPL *session, WT_ITEM *logrec, WT_LSN *lsnp, WT_LSN *next_lsnp,
+ void *cookie, int firstrecord)
{
- WT_DECL_RET;
- WT_RECOVERY *r;
- uint64_t txnid_unused;
- uint32_t rectype;
- const uint8_t *end, *p;
-
- r = cookie;
- p = WT_LOG_SKIP_HEADER(logrec->data);
- end = (const uint8_t *)logrec->data + logrec->size;
- WT_UNUSED(firstrecord);
-
- /* First, peek at the log record type. */
- WT_RET(__wt_logrec_read(session, &p, end, &rectype));
-
- /*
- * Record the highest LSN we process during the metadata phase.
- * If not the metadata phase, then stop at that LSN.
- */
- if (r->metadata_only)
- r->max_rec_lsn = *next_lsnp;
- else if (__wt_log_cmp(lsnp, &r->max_rec_lsn) >= 0)
- return (0);
-
- switch (rectype) {
- case WT_LOGREC_CHECKPOINT:
- if (r->metadata_only)
- WT_RET(__wt_txn_checkpoint_logread(
- session, &p, end, &r->ckpt_lsn));
- break;
-
- case WT_LOGREC_COMMIT:
- if ((ret = __wt_vunpack_uint(
- &p, WT_PTRDIFF(end, p), &txnid_unused)) != 0)
- WT_RET_MSG(
- session, ret, "txn_log_recover: unpack failure");
- WT_RET(__txn_commit_apply(r, lsnp, &p, end));
- break;
- }
-
- return (0);
+ WT_DECL_RET;
+ WT_RECOVERY *r;
+ uint64_t txnid_unused;
+ uint32_t rectype;
+ const uint8_t *end, *p;
+
+ r = cookie;
+ p = WT_LOG_SKIP_HEADER(logrec->data);
+ end = (const uint8_t *)logrec->data + logrec->size;
+ WT_UNUSED(firstrecord);
+
+ /* First, peek at the log record type. */
+ WT_RET(__wt_logrec_read(session, &p, end, &rectype));
+
+ /*
+ * Record the highest LSN we process during the metadata phase. If not the metadata phase, then
+ * stop at that LSN.
+ */
+ if (r->metadata_only)
+ r->max_rec_lsn = *next_lsnp;
+ else if (__wt_log_cmp(lsnp, &r->max_rec_lsn) >= 0)
+ return (0);
+
+ switch (rectype) {
+ case WT_LOGREC_CHECKPOINT:
+ if (r->metadata_only)
+ WT_RET(__wt_txn_checkpoint_logread(session, &p, end, &r->ckpt_lsn));
+ break;
+
+ case WT_LOGREC_COMMIT:
+ if ((ret = __wt_vunpack_uint(&p, WT_PTRDIFF(end, p), &txnid_unused)) != 0)
+ WT_RET_MSG(session, ret, "txn_log_recover: unpack failure");
+ WT_RET(__txn_commit_apply(r, lsnp, &p, end));
+ break;
+ }
+
+ return (0);
}
/*
* __recovery_set_checkpoint_timestamp --
- * Set the checkpoint timestamp as retrieved from the metadata file.
+ * Set the checkpoint timestamp as retrieved from the metadata file.
*/
static int
__recovery_set_checkpoint_timestamp(WT_RECOVERY *r)
{
- WT_CONFIG_ITEM cval;
- WT_CONNECTION_IMPL *conn;
- WT_DECL_RET;
- WT_SESSION_IMPL *session;
- wt_timestamp_t ckpt_timestamp;
- char ts_string[WT_TS_INT_STRING_SIZE], *sys_config;
-
- sys_config = NULL;
-
- session = r->session;
- conn = S2C(session);
- /*
- * Read the system checkpoint information from the metadata file and
- * save the stable timestamp of the last checkpoint for later query.
- * This gets saved in the connection.
- */
- ckpt_timestamp = 0;
-
- /* Search in the metadata for the system information. */
- WT_ERR_NOTFOUND_OK(
- __wt_metadata_search(session, WT_SYSTEM_CKPT_URI, &sys_config));
- if (sys_config != NULL) {
- WT_CLEAR(cval);
- WT_ERR_NOTFOUND_OK(__wt_config_getones(
- session, sys_config, "checkpoint_timestamp", &cval));
- if (cval.len != 0) {
- __wt_verbose(session, WT_VERB_RECOVERY,
- "Recovery timestamp %.*s",
- (int)cval.len, cval.str);
- WT_ERR(__wt_txn_parse_timestamp_raw(session,
- "recovery", &ckpt_timestamp, &cval));
- }
- }
-
- /*
- * Set the recovery checkpoint timestamp and the metadata checkpoint
- * timestamp so that the checkpoint after recovery writes the correct
- * value into the metadata.
- */
- conn->txn_global.meta_ckpt_timestamp =
- conn->txn_global.recovery_timestamp = ckpt_timestamp;
-
- __wt_verbose(session,
- WT_VERB_RECOVERY | WT_VERB_RECOVERY_PROGRESS,
- "Set global recovery timestamp: %s",
- __wt_timestamp_to_string(
- conn->txn_global.recovery_timestamp, ts_string));
-
-err: __wt_free(session, sys_config);
- return (ret);
+ WT_CONFIG_ITEM cval;
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ wt_timestamp_t ckpt_timestamp;
+ char ts_string[WT_TS_INT_STRING_SIZE], *sys_config;
+
+ sys_config = NULL;
+
+ session = r->session;
+ conn = S2C(session);
+ /*
+ * Read the system checkpoint information from the metadata file and save the stable timestamp
+ * of the last checkpoint for later query. This gets saved in the connection.
+ */
+ ckpt_timestamp = 0;
+
+ /* Search in the metadata for the system information. */
+ WT_ERR_NOTFOUND_OK(__wt_metadata_search(session, WT_SYSTEM_CKPT_URI, &sys_config));
+ if (sys_config != NULL) {
+ WT_CLEAR(cval);
+ WT_ERR_NOTFOUND_OK(__wt_config_getones(session, sys_config, "checkpoint_timestamp", &cval));
+ if (cval.len != 0) {
+ __wt_verbose(
+ session, WT_VERB_RECOVERY, "Recovery timestamp %.*s", (int)cval.len, cval.str);
+ WT_ERR(__wt_txn_parse_timestamp_raw(session, "recovery", &ckpt_timestamp, &cval));
+ }
+ }
+
+ /*
+ * Set the recovery checkpoint timestamp and the metadata checkpoint timestamp so that the
+ * checkpoint after recovery writes the correct value into the metadata.
+ */
+ conn->txn_global.meta_ckpt_timestamp = conn->txn_global.recovery_timestamp = ckpt_timestamp;
+
+ __wt_verbose(session, WT_VERB_RECOVERY | WT_VERB_RECOVERY_PROGRESS,
+ "Set global recovery timestamp: %s",
+ __wt_timestamp_to_string(conn->txn_global.recovery_timestamp, ts_string));
+
+err:
+ __wt_free(session, sys_config);
+ return (ret);
}
/*
* __recovery_setup_file --
- * Set up the recovery slot for a file.
+ * Set up the recovery slot for a file.
*/
static int
__recovery_setup_file(WT_RECOVERY *r, const char *uri, const char *config)
{
- WT_CONFIG_ITEM cval;
- WT_LSN lsn;
- uint32_t fileid, lsnfile, lsnoffset;
-
- WT_RET(__wt_config_getones(r->session, config, "id", &cval));
- fileid = (uint32_t)cval.val;
-
- /* Track the largest file ID we have seen. */
- if (fileid > r->max_fileid)
- r->max_fileid = fileid;
-
- if (r->nfiles <= fileid) {
- WT_RET(__wt_realloc_def(
- r->session, &r->file_alloc, fileid + 1, &r->files));
- r->nfiles = fileid + 1;
- }
-
- if (r->files[fileid].uri != NULL)
- WT_PANIC_RET(r->session, WT_PANIC,
- "metadata corruption: files %s and %s have the same "
- "file ID %u",
- uri, r->files[fileid].uri, fileid);
- WT_RET(__wt_strdup(r->session, uri, &r->files[fileid].uri));
- WT_RET(
- __wt_config_getones(r->session, config, "checkpoint_lsn", &cval));
- /* If there is checkpoint logged for the file, apply everything. */
- if (cval.type != WT_CONFIG_ITEM_STRUCT)
- WT_INIT_LSN(&lsn);
- /* NOLINTNEXTLINE(cert-err34-c) */
- else if (sscanf(cval.str,
- "(%" SCNu32 ",%" SCNu32 ")", &lsnfile, &lsnoffset) == 2)
- WT_SET_LSN(&lsn, lsnfile, lsnoffset);
- else
- WT_RET_MSG(r->session, EINVAL,
- "Failed to parse checkpoint LSN '%.*s'",
- (int)cval.len, cval.str);
- r->files[fileid].ckpt_lsn = lsn;
-
- __wt_verbose(r->session, WT_VERB_RECOVERY,
- "Recovering %s with id %" PRIu32 " @ (%" PRIu32 ", %" PRIu32 ")",
- uri, fileid, lsn.l.file, lsn.l.offset);
-
- if ((!WT_IS_MAX_LSN(&lsn) && !WT_IS_INIT_LSN(&lsn)) &&
- (WT_IS_MAX_LSN(&r->max_ckpt_lsn) ||
- __wt_log_cmp(&lsn, &r->max_ckpt_lsn) > 0))
- r->max_ckpt_lsn = lsn;
-
- return (0);
+ WT_CONFIG_ITEM cval;
+ WT_LSN lsn;
+ uint32_t fileid, lsnfile, lsnoffset;
+
+ WT_RET(__wt_config_getones(r->session, config, "id", &cval));
+ fileid = (uint32_t)cval.val;
+
+ /* Track the largest file ID we have seen. */
+ if (fileid > r->max_fileid)
+ r->max_fileid = fileid;
+
+ if (r->nfiles <= fileid) {
+ WT_RET(__wt_realloc_def(r->session, &r->file_alloc, fileid + 1, &r->files));
+ r->nfiles = fileid + 1;
+ }
+
+ if (r->files[fileid].uri != NULL)
+ WT_PANIC_RET(r->session, WT_PANIC,
+ "metadata corruption: files %s and %s have the same "
+ "file ID %u",
+ uri, r->files[fileid].uri, fileid);
+ WT_RET(__wt_strdup(r->session, uri, &r->files[fileid].uri));
+ WT_RET(__wt_config_getones(r->session, config, "checkpoint_lsn", &cval));
+ /* If there is checkpoint logged for the file, apply everything. */
+ if (cval.type != WT_CONFIG_ITEM_STRUCT)
+ WT_INIT_LSN(&lsn);
+ /* NOLINTNEXTLINE(cert-err34-c) */
+ else if (sscanf(cval.str, "(%" SCNu32 ",%" SCNu32 ")", &lsnfile, &lsnoffset) == 2)
+ WT_SET_LSN(&lsn, lsnfile, lsnoffset);
+ else
+ WT_RET_MSG(
+ r->session, EINVAL, "Failed to parse checkpoint LSN '%.*s'", (int)cval.len, cval.str);
+ r->files[fileid].ckpt_lsn = lsn;
+
+ __wt_verbose(r->session, WT_VERB_RECOVERY,
+ "Recovering %s with id %" PRIu32 " @ (%" PRIu32 ", %" PRIu32 ")", uri, fileid, lsn.l.file,
+ lsn.l.offset);
+
+ if ((!WT_IS_MAX_LSN(&lsn) && !WT_IS_INIT_LSN(&lsn)) &&
+ (WT_IS_MAX_LSN(&r->max_ckpt_lsn) || __wt_log_cmp(&lsn, &r->max_ckpt_lsn) > 0))
+ r->max_ckpt_lsn = lsn;
+
+ return (0);
}
/*
* __recovery_free --
- * Free the recovery state.
+ * Free the recovery state.
*/
static int
__recovery_free(WT_RECOVERY *r)
{
- WT_CURSOR *c;
- WT_DECL_RET;
- WT_SESSION_IMPL *session;
- u_int i;
-
- session = r->session;
- for (i = 0; i < r->nfiles; i++) {
- __wt_free(session, r->files[i].uri);
- if ((c = r->files[i].c) != NULL)
- WT_TRET(c->close(c));
- }
-
- __wt_free(session, r->files);
- return (ret);
+ WT_CURSOR *c;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ u_int i;
+
+ session = r->session;
+ for (i = 0; i < r->nfiles; i++) {
+ __wt_free(session, r->files[i].uri);
+ if ((c = r->files[i].c) != NULL)
+ WT_TRET(c->close(c));
+ }
+
+ __wt_free(session, r->files);
+ return (ret);
}
/*
* __recovery_file_scan --
- * Scan the files referenced from the metadata and gather information
- * about them for recovery.
+ * Scan the files referenced from the metadata and gather information about them for recovery.
*/
static int
__recovery_file_scan(WT_RECOVERY *r)
{
- WT_CURSOR *c;
- WT_DECL_RET;
- int cmp;
- const char *uri, *config;
-
- /* Scan through all files in the metadata. */
- c = r->files[0].c;
- c->set_key(c, "file:");
- if ((ret = c->search_near(c, &cmp)) != 0) {
- /* Is the metadata empty? */
- WT_RET_NOTFOUND_OK(ret);
- return (0);
- }
- if (cmp < 0)
- WT_RET_NOTFOUND_OK(c->next(c));
- for (; ret == 0; ret = c->next(c)) {
- WT_RET(c->get_key(c, &uri));
- if (!WT_PREFIX_MATCH(uri, "file:"))
- break;
- WT_RET(c->get_value(c, &config));
- WT_RET(__recovery_setup_file(r, uri, config));
- }
- WT_RET_NOTFOUND_OK(ret);
- return (0);
+ WT_CURSOR *c;
+ WT_DECL_RET;
+ int cmp;
+ const char *uri, *config;
+
+ /* Scan through all files in the metadata. */
+ c = r->files[0].c;
+ c->set_key(c, "file:");
+ if ((ret = c->search_near(c, &cmp)) != 0) {
+ /* Is the metadata empty? */
+ WT_RET_NOTFOUND_OK(ret);
+ return (0);
+ }
+ if (cmp < 0)
+ WT_RET_NOTFOUND_OK(c->next(c));
+ for (; ret == 0; ret = c->next(c)) {
+ WT_RET(c->get_key(c, &uri));
+ if (!WT_PREFIX_MATCH(uri, "file:"))
+ break;
+ WT_RET(c->get_value(c, &config));
+ WT_RET(__recovery_setup_file(r, uri, config));
+ }
+ WT_RET_NOTFOUND_OK(ret);
+ return (0);
}
/*
* __wt_txn_recover --
- * Run recovery.
+ * Run recovery.
*/
int
__wt_txn_recover(WT_SESSION_IMPL *session)
{
- WT_CONNECTION_IMPL *conn;
- WT_CURSOR *metac;
- WT_DECL_RET;
- WT_RECOVERY r;
- WT_RECOVERY_FILE *metafile;
- char *config;
- bool do_checkpoint, eviction_started, needs_rec, was_backup;
-
- conn = S2C(session);
- WT_CLEAR(r);
- WT_INIT_LSN(&r.ckpt_lsn);
- config = NULL;
- do_checkpoint = true;
- eviction_started = false;
- was_backup = F_ISSET(conn, WT_CONN_WAS_BACKUP);
-
- /* We need a real session for recovery. */
- WT_RET(__wt_open_internal_session(conn, "txn-recover",
- false, WT_SESSION_NO_LOGGING, &session));
- r.session = session;
- WT_MAX_LSN(&r.max_ckpt_lsn);
- WT_MAX_LSN(&r.max_rec_lsn);
- conn->txn_global.recovery_timestamp =
- conn->txn_global.meta_ckpt_timestamp = 0;
-
- F_SET(conn, WT_CONN_RECOVERING);
- WT_ERR(__wt_metadata_search(session, WT_METAFILE_URI, &config));
- WT_ERR(__recovery_setup_file(&r, WT_METAFILE_URI, config));
- WT_ERR(__wt_metadata_cursor_open(session, NULL, &metac));
- metafile = &r.files[WT_METAFILE_ID];
- metafile->c = metac;
-
- /*
- * If no log was found (including if logging is disabled), or if the
- * last checkpoint was done with logging disabled, recovery should not
- * run. Scan the metadata to figure out the largest file ID.
- */
- if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_EXISTED) ||
- WT_IS_MAX_LSN(&metafile->ckpt_lsn)) {
- /*
- * Detect if we're going from logging disabled to enabled.
- * We need to know this to verify LSNs and start at the correct
- * log file later. If someone ran with logging, then disabled
- * it and removed all the log files and then turned logging back
- * on, we have to start logging in the log file number that is
- * larger than any checkpoint LSN we have from the earlier time.
- */
- WT_ERR(__recovery_file_scan(&r));
- /*
- * The array can be re-allocated in recovery_file_scan. Reset
- * our pointer after scanning all the files.
- */
- metafile = &r.files[WT_METAFILE_ID];
- conn->next_file_id = r.max_fileid;
-
- if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED) &&
- WT_IS_MAX_LSN(&metafile->ckpt_lsn) &&
- !WT_IS_MAX_LSN(&r.max_ckpt_lsn))
- WT_ERR(__wt_log_reset(session, r.max_ckpt_lsn.l.file));
- else
- do_checkpoint = false;
- goto done;
- }
-
- /*
- * First, do a pass through the log to recover the metadata, and
- * establish the last checkpoint LSN. Skip this when opening a hot
- * backup: we already have the correct metadata in that case.
- *
- * If we're running with salvage and we hit an error, we ignore it
- * and continue. In salvage we want to recover whatever part of the
- * data we can from the last checkpoint up until whatever problem we
- * detect in the log file. In salvage, we ignore errors from scanning
- * the log so recovery can continue. Other errors remain errors.
- */
- if (!was_backup) {
- r.metadata_only = true;
- /*
- * If this is a read-only connection, check if the checkpoint
- * LSN in the metadata file is up to date, indicating a clean
- * shutdown.
- */
- if (F_ISSET(conn, WT_CONN_READONLY)) {
- WT_ERR(__wt_log_needs_recovery(
- session, &metafile->ckpt_lsn, &needs_rec));
- if (needs_rec)
- WT_ERR_MSG(session, WT_RUN_RECOVERY,
- "Read-only database needs recovery");
- }
- if (WT_IS_INIT_LSN(&metafile->ckpt_lsn))
- ret = __wt_log_scan(session,
- NULL, WT_LOGSCAN_FIRST, __txn_log_recover, &r);
- else {
- /*
- * Start at the last checkpoint LSN referenced in the
- * metadata. If we see the end of a checkpoint while
- * scanning, we will change the full scan to start from
- * there.
- */
- r.ckpt_lsn = metafile->ckpt_lsn;
- ret = __wt_log_scan(session,
- &metafile->ckpt_lsn, WT_LOGSCAN_RECOVER_METADATA,
- __txn_log_recover, &r);
- }
- if (F_ISSET(conn, WT_CONN_SALVAGE))
- ret = 0;
- /*
- * If log scan couldn't find a file we expected to be around,
- * this indicates a corruption of some sort.
- */
- if (ret == ENOENT) {
- F_SET(conn, WT_CONN_DATA_CORRUPTION);
- ret = WT_ERROR;
- }
-
- WT_ERR(ret);
- }
-
- /* Scan the metadata to find the live files and their IDs. */
- WT_ERR(__recovery_file_scan(&r));
- /*
- * Clear this out. We no longer need it and it could have been
- * re-allocated when scanning the files.
- */
- WT_NOT_READ(metafile, NULL);
-
- /*
- * We no longer need the metadata cursor: close it to avoid pinning any
- * resources that could block eviction during recovery.
- */
- r.files[0].c = NULL;
- WT_ERR(metac->close(metac));
-
- /*
- * Now, recover all the files apart from the metadata.
- * Pass WT_LOGSCAN_RECOVER so that old logs get truncated.
- */
- r.metadata_only = false;
- __wt_verbose(session, WT_VERB_RECOVERY | WT_VERB_RECOVERY_PROGRESS,
- "Main recovery loop: starting at %" PRIu32 "/%" PRIu32
- " to %" PRIu32 "/%" PRIu32, r.ckpt_lsn.l.file, r.ckpt_lsn.l.offset,
- r.max_rec_lsn.l.file, r.max_rec_lsn.l.offset);
- WT_ERR(__wt_log_needs_recovery(session, &r.ckpt_lsn, &needs_rec));
- /*
- * Check if the database was shut down cleanly. If not
- * return an error if the user does not want automatic
- * recovery.
- */
- if (needs_rec &&
- (FLD_ISSET(conn->log_flags, WT_CONN_LOG_RECOVER_ERR) ||
- F_ISSET(conn, WT_CONN_READONLY))) {
- if (F_ISSET(conn, WT_CONN_READONLY))
- WT_ERR_MSG(session, WT_RUN_RECOVERY,
- "Read-only database needs recovery");
- WT_ERR_MSG(session, WT_RUN_RECOVERY, "Database needs recovery");
- }
-
- if (F_ISSET(conn, WT_CONN_READONLY)) {
- do_checkpoint = false;
- goto done;
- }
-
- /*
- * Recovery can touch more data than fits in cache, so it relies on
- * regular eviction to manage paging. Start eviction threads for
- * recovery without LAS cursors.
- */
- WT_ERR(__wt_evict_create(session));
- eviction_started = true;
-
- /*
- * Always run recovery even if it was a clean shutdown only if
- * this is not a read-only connection.
- * We can consider skipping it in the future.
- */
- if (needs_rec)
- FLD_SET(conn->log_flags, WT_CONN_LOG_RECOVER_DIRTY);
- if (WT_IS_INIT_LSN(&r.ckpt_lsn))
- ret = __wt_log_scan(session, NULL,
- WT_LOGSCAN_FIRST | WT_LOGSCAN_RECOVER,
- __txn_log_recover, &r);
- else
- ret = __wt_log_scan(session, &r.ckpt_lsn,
- WT_LOGSCAN_RECOVER, __txn_log_recover, &r);
- if (F_ISSET(conn, WT_CONN_SALVAGE))
- ret = 0;
- WT_ERR(ret);
-
- conn->next_file_id = r.max_fileid;
-
-done: WT_ERR(__recovery_set_checkpoint_timestamp(&r));
- if (do_checkpoint)
- /*
- * Forcibly log a checkpoint so the next open is fast and keep
- * the metadata up to date with the checkpoint LSN and
- * archiving.
- */
- WT_ERR(session->iface.checkpoint(&session->iface, "force=1"));
-
- /*
- * If we're downgrading and have newer log files, force an archive,
- * no matter what the archive setting is.
- */
- if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_FORCE_DOWNGRADE))
- WT_ERR(__wt_log_truncate_files(session, NULL, true));
- FLD_SET(conn->log_flags, WT_CONN_LOG_RECOVER_DONE);
-
-err: WT_TRET(__recovery_free(&r));
- __wt_free(session, config);
- FLD_CLR(conn->log_flags, WT_CONN_LOG_RECOVER_DIRTY);
-
- if (ret != 0) {
- FLD_SET(conn->log_flags, WT_CONN_LOG_RECOVER_FAILED);
- __wt_err(session, ret, "Recovery failed");
- }
-
- /*
- * Destroy the eviction threads that were started in support of
- * recovery. They will be restarted once the lookaside table is
- * created.
- */
- if (eviction_started)
- WT_TRET(__wt_evict_destroy(session));
-
- WT_TRET(session->iface.close(&session->iface, NULL));
- F_CLR(conn, WT_CONN_RECOVERING);
-
- return (ret);
+ WT_CONNECTION_IMPL *conn;
+ WT_CURSOR *metac;
+ WT_DECL_RET;
+ WT_RECOVERY r;
+ WT_RECOVERY_FILE *metafile;
+ char *config;
+ bool do_checkpoint, eviction_started, needs_rec, was_backup;
+
+ conn = S2C(session);
+ WT_CLEAR(r);
+ WT_INIT_LSN(&r.ckpt_lsn);
+ config = NULL;
+ do_checkpoint = true;
+ eviction_started = false;
+ was_backup = F_ISSET(conn, WT_CONN_WAS_BACKUP);
+
+ /* We need a real session for recovery. */
+ WT_RET(__wt_open_internal_session(conn, "txn-recover", false, WT_SESSION_NO_LOGGING, &session));
+ r.session = session;
+ WT_MAX_LSN(&r.max_ckpt_lsn);
+ WT_MAX_LSN(&r.max_rec_lsn);
+ conn->txn_global.recovery_timestamp = conn->txn_global.meta_ckpt_timestamp = 0;
+
+ F_SET(conn, WT_CONN_RECOVERING);
+ WT_ERR(__wt_metadata_search(session, WT_METAFILE_URI, &config));
+ WT_ERR(__recovery_setup_file(&r, WT_METAFILE_URI, config));
+ WT_ERR(__wt_metadata_cursor_open(session, NULL, &metac));
+ metafile = &r.files[WT_METAFILE_ID];
+ metafile->c = metac;
+
+ /*
+ * If no log was found (including if logging is disabled), or if the last checkpoint was done
+ * with logging disabled, recovery should not run. Scan the metadata to figure out the largest
+ * file ID.
+ */
+ if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_EXISTED) || WT_IS_MAX_LSN(&metafile->ckpt_lsn)) {
+ /*
+ * Detect if we're going from logging disabled to enabled. We need to know this to verify
+ * LSNs and start at the correct log file later. If someone ran with logging, then disabled
+ * it and removed all the log files and then turned logging back on, we have to start
+ * logging in the log file number that is larger than any checkpoint LSN we have from the
+ * earlier time.
+ */
+ WT_ERR(__recovery_file_scan(&r));
+ /*
+ * The array can be re-allocated in recovery_file_scan. Reset our pointer after scanning all
+ * the files.
+ */
+ metafile = &r.files[WT_METAFILE_ID];
+ conn->next_file_id = r.max_fileid;
+
+ if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED) && WT_IS_MAX_LSN(&metafile->ckpt_lsn) &&
+ !WT_IS_MAX_LSN(&r.max_ckpt_lsn))
+ WT_ERR(__wt_log_reset(session, r.max_ckpt_lsn.l.file));
+ else
+ do_checkpoint = false;
+ goto done;
+ }
+
+ /*
+ * First, do a pass through the log to recover the metadata, and
+ * establish the last checkpoint LSN. Skip this when opening a hot
+ * backup: we already have the correct metadata in that case.
+ *
+ * If we're running with salvage and we hit an error, we ignore it
+ * and continue. In salvage we want to recover whatever part of the
+ * data we can from the last checkpoint up until whatever problem we
+ * detect in the log file. In salvage, we ignore errors from scanning
+ * the log so recovery can continue. Other errors remain errors.
+ */
+ if (!was_backup) {
+ r.metadata_only = true;
+ /*
+ * If this is a read-only connection, check if the checkpoint LSN in the metadata file is up
+ * to date, indicating a clean shutdown.
+ */
+ if (F_ISSET(conn, WT_CONN_READONLY)) {
+ WT_ERR(__wt_log_needs_recovery(session, &metafile->ckpt_lsn, &needs_rec));
+ if (needs_rec)
+ WT_ERR_MSG(session, WT_RUN_RECOVERY, "Read-only database needs recovery");
+ }
+ if (WT_IS_INIT_LSN(&metafile->ckpt_lsn))
+ ret = __wt_log_scan(session, NULL, WT_LOGSCAN_FIRST, __txn_log_recover, &r);
+ else {
+ /*
+ * Start at the last checkpoint LSN referenced in the metadata. If we see the end of a
+ * checkpoint while scanning, we will change the full scan to start from there.
+ */
+ r.ckpt_lsn = metafile->ckpt_lsn;
+ ret = __wt_log_scan(
+ session, &metafile->ckpt_lsn, WT_LOGSCAN_RECOVER_METADATA, __txn_log_recover, &r);
+ }
+ if (F_ISSET(conn, WT_CONN_SALVAGE))
+ ret = 0;
+ /*
+ * If log scan couldn't find a file we expected to be around, this indicates a corruption of
+ * some sort.
+ */
+ if (ret == ENOENT) {
+ F_SET(conn, WT_CONN_DATA_CORRUPTION);
+ ret = WT_ERROR;
+ }
+
+ WT_ERR(ret);
+ }
+
+ /* Scan the metadata to find the live files and their IDs. */
+ WT_ERR(__recovery_file_scan(&r));
+ /*
+ * Clear this out. We no longer need it and it could have been re-allocated when scanning the
+ * files.
+ */
+ WT_NOT_READ(metafile, NULL);
+
+ /*
+ * We no longer need the metadata cursor: close it to avoid pinning any resources that could
+ * block eviction during recovery.
+ */
+ r.files[0].c = NULL;
+ WT_ERR(metac->close(metac));
+
+ /*
+ * Now, recover all the files apart from the metadata. Pass WT_LOGSCAN_RECOVER so that old logs
+ * get truncated.
+ */
+ r.metadata_only = false;
+ __wt_verbose(session, WT_VERB_RECOVERY | WT_VERB_RECOVERY_PROGRESS,
+ "Main recovery loop: starting at %" PRIu32 "/%" PRIu32 " to %" PRIu32 "/%" PRIu32,
+ r.ckpt_lsn.l.file, r.ckpt_lsn.l.offset, r.max_rec_lsn.l.file, r.max_rec_lsn.l.offset);
+ WT_ERR(__wt_log_needs_recovery(session, &r.ckpt_lsn, &needs_rec));
+ /*
+ * Check if the database was shut down cleanly. If not return an error if the user does not want
+ * automatic recovery.
+ */
+ if (needs_rec &&
+ (FLD_ISSET(conn->log_flags, WT_CONN_LOG_RECOVER_ERR) || F_ISSET(conn, WT_CONN_READONLY))) {
+ if (F_ISSET(conn, WT_CONN_READONLY))
+ WT_ERR_MSG(session, WT_RUN_RECOVERY, "Read-only database needs recovery");
+ WT_ERR_MSG(session, WT_RUN_RECOVERY, "Database needs recovery");
+ }
+
+ if (F_ISSET(conn, WT_CONN_READONLY)) {
+ do_checkpoint = false;
+ goto done;
+ }
+
+ /*
+ * Recovery can touch more data than fits in cache, so it relies on regular eviction to manage
+ * paging. Start eviction threads for recovery without LAS cursors.
+ */
+ WT_ERR(__wt_evict_create(session));
+ eviction_started = true;
+
+ /*
+ * Always run recovery even if it was a clean shutdown only if this is not a read-only
+ * connection. We can consider skipping it in the future.
+ */
+ if (needs_rec)
+ FLD_SET(conn->log_flags, WT_CONN_LOG_RECOVER_DIRTY);
+ if (WT_IS_INIT_LSN(&r.ckpt_lsn))
+ ret = __wt_log_scan(
+ session, NULL, WT_LOGSCAN_FIRST | WT_LOGSCAN_RECOVER, __txn_log_recover, &r);
+ else
+ ret = __wt_log_scan(session, &r.ckpt_lsn, WT_LOGSCAN_RECOVER, __txn_log_recover, &r);
+ if (F_ISSET(conn, WT_CONN_SALVAGE))
+ ret = 0;
+ WT_ERR(ret);
+
+ conn->next_file_id = r.max_fileid;
+
+done:
+ WT_ERR(__recovery_set_checkpoint_timestamp(&r));
+ if (do_checkpoint)
+ /*
+ * Forcibly log a checkpoint so the next open is fast and keep the metadata up to date with
+ * the checkpoint LSN and archiving.
+ */
+ WT_ERR(session->iface.checkpoint(&session->iface, "force=1"));
+
+ /*
+ * If we're downgrading and have newer log files, force an archive, no matter what the archive
+ * setting is.
+ */
+ if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_FORCE_DOWNGRADE))
+ WT_ERR(__wt_log_truncate_files(session, NULL, true));
+ FLD_SET(conn->log_flags, WT_CONN_LOG_RECOVER_DONE);
+
+err:
+ WT_TRET(__recovery_free(&r));
+ __wt_free(session, config);
+ FLD_CLR(conn->log_flags, WT_CONN_LOG_RECOVER_DIRTY);
+
+ if (ret != 0) {
+ FLD_SET(conn->log_flags, WT_CONN_LOG_RECOVER_FAILED);
+ __wt_err(session, ret, "Recovery failed");
+ }
+
+ /*
+ * Destroy the eviction threads that were started in support of recovery. They will be restarted
+ * once the lookaside table is created.
+ */
+ if (eviction_started)
+ WT_TRET(__wt_evict_destroy(session));
+
+ WT_TRET(session->iface.close(&session->iface, NULL));
+ F_CLR(conn, WT_CONN_RECOVERING);
+
+ return (ret);
}
diff --git a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c
index 01dad40f85f..97c83c47414 100644
--- a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c
+++ b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c
@@ -10,556 +10,524 @@
/*
* __txn_rollback_to_stable_lookaside_fixup --
- * Remove any updates that need to be rolled back from the lookaside file.
+ * Remove any updates that need to be rolled back from the lookaside file.
*/
static int
__txn_rollback_to_stable_lookaside_fixup(WT_SESSION_IMPL *session)
{
- WT_CONNECTION_IMPL *conn;
- WT_CURSOR *cursor;
- WT_DECL_RET;
- WT_ITEM las_key, las_value;
- WT_TXN_GLOBAL *txn_global;
- wt_timestamp_t durable_timestamp, las_timestamp, rollback_timestamp;
- uint64_t las_counter, las_pageid, las_total, las_txnid;
- uint32_t las_id, session_flags;
- uint8_t prepare_state, upd_type;
-
- conn = S2C(session);
- cursor = NULL;
- las_total = 0;
- session_flags = 0; /* [-Werror=maybe-uninitialized] */
-
- /*
- * Copy the stable timestamp, otherwise we'd need to lock it each time
- * it's accessed. Even though the stable timestamp isn't supposed to be
- * updated while rolling back, accessing it without a lock would
- * violate protocol.
- */
- txn_global = &conn->txn_global;
- WT_ORDERED_READ(rollback_timestamp, txn_global->stable_timestamp);
-
- __wt_las_cursor(session, &cursor, &session_flags);
-
- /* Discard pages we read as soon as we're done with them. */
- F_SET(session, WT_SESSION_READ_WONT_NEED);
-
- /* Walk the file. */
- __wt_writelock(session, &conn->cache->las_sweepwalk_lock);
- while ((ret = cursor->next(cursor)) == 0) {
- ++las_total;
- WT_ERR(cursor->get_key(cursor,
- &las_pageid, &las_id, &las_counter, &las_key));
-
- /* Check the file ID so we can skip durable tables */
- if (las_id >= conn->stable_rollback_maxfile)
- WT_PANIC_RET(session, EINVAL, "file ID %" PRIu32
- " in lookaside table larger than max %" PRIu32,
- las_id, conn->stable_rollback_maxfile);
- if (__bit_test(conn->stable_rollback_bitstring, las_id))
- continue;
-
- WT_ERR(cursor->get_value(
- cursor, &las_txnid, &las_timestamp,
- &durable_timestamp, &prepare_state, &upd_type, &las_value));
-
- /*
- * Entries with no timestamp will have a timestamp of zero,
- * which will fail the following check and cause them to never
- * be removed.
- */
- if (rollback_timestamp < durable_timestamp) {
- WT_ERR(cursor->remove(cursor));
- WT_STAT_CONN_INCR(session, txn_rollback_las_removed);
- --las_total;
- }
- }
- WT_ERR_NOTFOUND_OK(ret);
-err: if (ret == 0) {
- conn->cache->las_insert_count = las_total;
- conn->cache->las_remove_count = 0;
- }
- __wt_writeunlock(session, &conn->cache->las_sweepwalk_lock);
- WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags));
-
- F_CLR(session, WT_SESSION_READ_WONT_NEED);
-
- return (ret);
+ WT_CONNECTION_IMPL *conn;
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+ WT_ITEM las_key, las_value;
+ WT_TXN_GLOBAL *txn_global;
+ wt_timestamp_t durable_timestamp, las_timestamp, rollback_timestamp;
+ uint64_t las_counter, las_pageid, las_total, las_txnid;
+ uint32_t las_id, session_flags;
+ uint8_t prepare_state, upd_type;
+
+ conn = S2C(session);
+ cursor = NULL;
+ las_total = 0;
+ session_flags = 0; /* [-Werror=maybe-uninitialized] */
+
+ /*
+ * Copy the stable timestamp, otherwise we'd need to lock it each time it's accessed. Even
+ * though the stable timestamp isn't supposed to be updated while rolling back, accessing it
+ * without a lock would violate protocol.
+ */
+ txn_global = &conn->txn_global;
+ WT_ORDERED_READ(rollback_timestamp, txn_global->stable_timestamp);
+
+ __wt_las_cursor(session, &cursor, &session_flags);
+
+ /* Discard pages we read as soon as we're done with them. */
+ F_SET(session, WT_SESSION_READ_WONT_NEED);
+
+ /* Walk the file. */
+ __wt_writelock(session, &conn->cache->las_sweepwalk_lock);
+ while ((ret = cursor->next(cursor)) == 0) {
+ ++las_total;
+ WT_ERR(cursor->get_key(cursor, &las_pageid, &las_id, &las_counter, &las_key));
+
+ /* Check the file ID so we can skip durable tables */
+ if (las_id >= conn->stable_rollback_maxfile)
+ WT_PANIC_RET(session, EINVAL,
+ "file ID %" PRIu32 " in lookaside table larger than max %" PRIu32, las_id,
+ conn->stable_rollback_maxfile);
+ if (__bit_test(conn->stable_rollback_bitstring, las_id))
+ continue;
+
+ WT_ERR(cursor->get_value(cursor, &las_txnid, &las_timestamp, &durable_timestamp,
+ &prepare_state, &upd_type, &las_value));
+
+ /*
+ * Entries with no timestamp will have a timestamp of zero, which will fail the following
+ * check and cause them to never be removed.
+ */
+ if (rollback_timestamp < durable_timestamp) {
+ WT_ERR(cursor->remove(cursor));
+ WT_STAT_CONN_INCR(session, txn_rollback_las_removed);
+ --las_total;
+ }
+ }
+ WT_ERR_NOTFOUND_OK(ret);
+err:
+ if (ret == 0) {
+ conn->cache->las_insert_count = las_total;
+ conn->cache->las_remove_count = 0;
+ }
+ __wt_writeunlock(session, &conn->cache->las_sweepwalk_lock);
+ WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags));
+
+ F_CLR(session, WT_SESSION_READ_WONT_NEED);
+
+ return (ret);
}
/*
* __txn_abort_newer_update --
- * Abort updates in an update change with timestamps newer than the
- * rollback timestamp.
+ * Abort updates in an update change with timestamps newer than the rollback timestamp.
*/
static void
-__txn_abort_newer_update(WT_SESSION_IMPL *session,
- WT_UPDATE *first_upd, wt_timestamp_t rollback_timestamp)
+__txn_abort_newer_update(
+ WT_SESSION_IMPL *session, WT_UPDATE *first_upd, wt_timestamp_t rollback_timestamp)
{
- WT_UPDATE *upd;
-
- for (upd = first_upd; upd != NULL; upd = upd->next) {
- /*
- * Updates with no timestamp will have a timestamp of zero and
- * will never be rolled back. If the table is configured for
- * strict timestamp checking, assert that all more recent
- * updates were also rolled back.
- */
- if (upd->txnid == WT_TXN_ABORTED ||
- upd->start_ts == WT_TS_NONE) {
- if (upd == first_upd)
- first_upd = upd->next;
- } else if (rollback_timestamp < upd->durable_ts) {
- /*
- * If any updates are aborted, all newer updates
- * better be aborted as well.
- *
- * Timestamp ordering relies on the validations at
- * the time of commit. Thus if the table is not
- * configured for key consistency check, the
- * the timestamps could be out of order here.
- */
- WT_ASSERT(session,
- !FLD_ISSET(S2BT(session)->assert_flags,
- WT_ASSERT_COMMIT_TS_KEYS) ||
- upd == first_upd);
- first_upd = upd->next;
-
- upd->txnid = WT_TXN_ABORTED;
- WT_STAT_CONN_INCR(session, txn_rollback_upd_aborted);
- upd->durable_ts = upd->start_ts = WT_TS_NONE;
- }
- }
+ WT_UPDATE *upd;
+
+ for (upd = first_upd; upd != NULL; upd = upd->next) {
+ /*
+ * Updates with no timestamp will have a timestamp of zero and will never be rolled back. If
+ * the table is configured for strict timestamp checking, assert that all more recent
+ * updates were also rolled back.
+ */
+ if (upd->txnid == WT_TXN_ABORTED || upd->start_ts == WT_TS_NONE) {
+ if (upd == first_upd)
+ first_upd = upd->next;
+ } else if (rollback_timestamp < upd->durable_ts) {
+ /*
+ * If any updates are aborted, all newer updates
+ * better be aborted as well.
+ *
+ * Timestamp ordering relies on the validations at
+ * the time of commit. Thus if the table is not
+ * configured for key consistency check, the
+ * the timestamps could be out of order here.
+ */
+ WT_ASSERT(session, !FLD_ISSET(S2BT(session)->assert_flags, WT_ASSERT_COMMIT_TS_KEYS) ||
+ upd == first_upd);
+ first_upd = upd->next;
+
+ upd->txnid = WT_TXN_ABORTED;
+ WT_STAT_CONN_INCR(session, txn_rollback_upd_aborted);
+ upd->durable_ts = upd->start_ts = WT_TS_NONE;
+ }
+ }
}
/*
* __txn_abort_newer_insert --
- * Apply the update abort check to each entry in an insert skip list
+ * Apply the update abort check to each entry in an insert skip list
*/
static void
-__txn_abort_newer_insert(WT_SESSION_IMPL *session,
- WT_INSERT_HEAD *head, wt_timestamp_t rollback_timestamp)
+__txn_abort_newer_insert(
+ WT_SESSION_IMPL *session, WT_INSERT_HEAD *head, wt_timestamp_t rollback_timestamp)
{
- WT_INSERT *ins;
+ WT_INSERT *ins;
- WT_SKIP_FOREACH(ins, head)
- __txn_abort_newer_update(session, ins->upd, rollback_timestamp);
+ WT_SKIP_FOREACH (ins, head)
+ __txn_abort_newer_update(session, ins->upd, rollback_timestamp);
}
/*
* __txn_abort_newer_col_var --
- * Abort updates on a variable length col leaf page with timestamps newer
- * than the rollback timestamp.
+ * Abort updates on a variable length col leaf page with timestamps newer than the rollback
+ * timestamp.
*/
static void
__txn_abort_newer_col_var(
- WT_SESSION_IMPL *session, WT_PAGE *page, wt_timestamp_t rollback_timestamp)
+ WT_SESSION_IMPL *session, WT_PAGE *page, wt_timestamp_t rollback_timestamp)
{
- WT_COL *cip;
- WT_INSERT_HEAD *ins;
- uint32_t i;
-
- /* Review the changes to the original on-page data items */
- WT_COL_FOREACH(page, cip, i)
- if ((ins = WT_COL_UPDATE(page, cip)) != NULL)
- __txn_abort_newer_insert(session,
- ins, rollback_timestamp);
-
- /* Review the append list */
- if ((ins = WT_COL_APPEND(page)) != NULL)
- __txn_abort_newer_insert(session, ins, rollback_timestamp);
+ WT_COL *cip;
+ WT_INSERT_HEAD *ins;
+ uint32_t i;
+
+ /* Review the changes to the original on-page data items */
+ WT_COL_FOREACH (page, cip, i)
+ if ((ins = WT_COL_UPDATE(page, cip)) != NULL)
+ __txn_abort_newer_insert(session, ins, rollback_timestamp);
+
+ /* Review the append list */
+ if ((ins = WT_COL_APPEND(page)) != NULL)
+ __txn_abort_newer_insert(session, ins, rollback_timestamp);
}
/*
* __txn_abort_newer_col_fix --
- * Abort updates on a fixed length col leaf page with timestamps newer than
- * the rollback timestamp.
+ * Abort updates on a fixed length col leaf page with timestamps newer than the rollback
+ * timestamp.
*/
static void
__txn_abort_newer_col_fix(
- WT_SESSION_IMPL *session, WT_PAGE *page, wt_timestamp_t rollback_timestamp)
+ WT_SESSION_IMPL *session, WT_PAGE *page, wt_timestamp_t rollback_timestamp)
{
- WT_INSERT_HEAD *ins;
+ WT_INSERT_HEAD *ins;
- /* Review the changes to the original on-page data items */
- if ((ins = WT_COL_UPDATE_SINGLE(page)) != NULL)
- __txn_abort_newer_insert(session, ins, rollback_timestamp);
+ /* Review the changes to the original on-page data items */
+ if ((ins = WT_COL_UPDATE_SINGLE(page)) != NULL)
+ __txn_abort_newer_insert(session, ins, rollback_timestamp);
- /* Review the append list */
- if ((ins = WT_COL_APPEND(page)) != NULL)
- __txn_abort_newer_insert(session, ins, rollback_timestamp);
+ /* Review the append list */
+ if ((ins = WT_COL_APPEND(page)) != NULL)
+ __txn_abort_newer_insert(session, ins, rollback_timestamp);
}
/*
* __txn_abort_newer_row_leaf --
- * Abort updates on a row leaf page with timestamps newer than the
- * rollback timestamp.
+ * Abort updates on a row leaf page with timestamps newer than the rollback timestamp.
*/
static void
__txn_abort_newer_row_leaf(
- WT_SESSION_IMPL *session, WT_PAGE *page, wt_timestamp_t rollback_timestamp)
+ WT_SESSION_IMPL *session, WT_PAGE *page, wt_timestamp_t rollback_timestamp)
{
- WT_INSERT_HEAD *insert;
- WT_ROW *rip;
- WT_UPDATE *upd;
- uint32_t i;
-
- /*
- * Review the insert list for keys before the first entry on the disk
- * page.
- */
- if ((insert = WT_ROW_INSERT_SMALLEST(page)) != NULL)
- __txn_abort_newer_insert(session, insert, rollback_timestamp);
-
- /*
- * Review updates that belong to keys that are on the disk image,
- * as well as for keys inserted since the page was read from disk.
- */
- WT_ROW_FOREACH(page, rip, i) {
- if ((upd = WT_ROW_UPDATE(page, rip)) != NULL)
- __txn_abort_newer_update(
- session, upd, rollback_timestamp);
-
- if ((insert = WT_ROW_INSERT(page, rip)) != NULL)
- __txn_abort_newer_insert(
- session, insert, rollback_timestamp);
- }
+ WT_INSERT_HEAD *insert;
+ WT_ROW *rip;
+ WT_UPDATE *upd;
+ uint32_t i;
+
+ /*
+ * Review the insert list for keys before the first entry on the disk page.
+ */
+ if ((insert = WT_ROW_INSERT_SMALLEST(page)) != NULL)
+ __txn_abort_newer_insert(session, insert, rollback_timestamp);
+
+ /*
+ * Review updates that belong to keys that are on the disk image, as well as for keys inserted
+ * since the page was read from disk.
+ */
+ WT_ROW_FOREACH (page, rip, i) {
+ if ((upd = WT_ROW_UPDATE(page, rip)) != NULL)
+ __txn_abort_newer_update(session, upd, rollback_timestamp);
+
+ if ((insert = WT_ROW_INSERT(page, rip)) != NULL)
+ __txn_abort_newer_insert(session, insert, rollback_timestamp);
+ }
}
/*
* __txn_abort_newer_updates --
- * Abort updates on this page newer than the timestamp.
+ * Abort updates on this page newer than the timestamp.
*/
static int
-__txn_abort_newer_updates(
- WT_SESSION_IMPL *session, WT_REF *ref, wt_timestamp_t rollback_timestamp)
+__txn_abort_newer_updates(WT_SESSION_IMPL *session, WT_REF *ref, wt_timestamp_t rollback_timestamp)
{
- WT_DECL_RET;
- WT_PAGE *page;
- uint32_t read_flags;
- bool local_read;
-
- /*
- * If we created a page image with updates the need to be rolled back,
- * read the history into cache now and make sure the page is marked
- * dirty. Otherwise, the history we need could be swept from the
- * lookaside table before the page is read because the lookaside sweep
- * code has no way to tell that the page image is invalid.
- *
- * So, if there is lookaside history for a page, first check if the
- * history needs to be rolled back make sure that history is loaded
- * into cache. That is, if skew_newest is true, so the disk image
- * potentially contained unstable updates, and the history is more
- * recent than the rollback timestamp.
- *
- * Also, we have separately discarded any lookaside history more recent
- * than the rollback timestamp. For page_las structures in cache,
- * reset any future timestamps back to the rollback timestamp. This
- * allows those structures to be discarded once the rollback timestamp
- * is stable (crucially for tests, they can be discarded if the
- * connection is closed right after a rollback_to_stable call).
- */
- local_read = false;
- read_flags = WT_READ_WONT_NEED;
- if (ref->page_las != NULL) {
- if (ref->page_las->skew_newest && rollback_timestamp <
- ref->page_las->unstable_durable_timestamp) {
- /*
- * Make sure we get back a page with history, not a
- * limbo page.
- */
- WT_ASSERT(session,
- !F_ISSET(&session->txn, WT_TXN_HAS_SNAPSHOT));
- WT_RET(__wt_page_in(session, ref, read_flags));
- WT_ASSERT(session, ref->state != WT_REF_LIMBO &&
- ref->page != NULL &&
- __wt_page_is_modified(ref->page));
- local_read = true;
- }
- if (ref->page_las->max_timestamp > rollback_timestamp)
- ref->page_las->max_timestamp = rollback_timestamp;
- if (ref->page_las->unstable_durable_timestamp >
- rollback_timestamp)
- ref->page_las->unstable_durable_timestamp =
- rollback_timestamp;
- if (ref->page_las->unstable_timestamp > rollback_timestamp)
- ref->page_las->unstable_timestamp = rollback_timestamp;
- }
-
- /* Review deleted page saved to the ref */
- if (ref->page_del != NULL &&
- rollback_timestamp < ref->page_del->durable_timestamp)
- WT_ERR(__wt_delete_page_rollback(session, ref));
-
- /*
- * If we have a ref with no page, or the page is clean, there is
- * nothing to roll back.
- *
- * This check for a clean page is partly an optimization (checkpoint
- * only marks pages clean when they have no unwritten updates so
- * there's no point visiting them again), but also covers a corner case
- * of a checkpoint with use_timestamp=false. Such a checkpoint
- * effectively moves the stable timestamp forward, because changes that
- * are written in the checkpoint cannot be reliably rolled back. The
- * actual stable timestamp doesn't change, though, so if we try to roll
- * back clean pages the in-memory tree can get out of sync with the
- * on-disk tree.
- */
- if ((page = ref->page) == NULL || !__wt_page_is_modified(page))
- goto err;
-
- switch (page->type) {
- case WT_PAGE_COL_FIX:
- __txn_abort_newer_col_fix(session, page, rollback_timestamp);
- break;
- case WT_PAGE_COL_VAR:
- __txn_abort_newer_col_var(session, page, rollback_timestamp);
- break;
- case WT_PAGE_COL_INT:
- case WT_PAGE_ROW_INT:
- /*
- * There is nothing to do for internal pages, since we aren't
- * rolling back far enough to potentially include reconciled
- * changes - and thus won't need to roll back structure
- * changes on internal pages.
- */
- break;
- case WT_PAGE_ROW_LEAF:
- __txn_abort_newer_row_leaf(session, page, rollback_timestamp);
- break;
- default:
- WT_ERR(__wt_illegal_value(session, page->type));
- }
-
-err: if (local_read)
- WT_TRET(__wt_page_release(session, ref, read_flags));
- return (ret);
+ WT_DECL_RET;
+ WT_PAGE *page;
+ uint32_t read_flags;
+ bool local_read;
+
+ /*
+     * If we created a page image with updates that need to be rolled back,
+ * read the history into cache now and make sure the page is marked
+ * dirty. Otherwise, the history we need could be swept from the
+ * lookaside table before the page is read because the lookaside sweep
+ * code has no way to tell that the page image is invalid.
+ *
+ * So, if there is lookaside history for a page, first check if the
+     * history needs to be rolled back and make sure that history is loaded
+ * into cache. That is, if skew_newest is true, so the disk image
+ * potentially contained unstable updates, and the history is more
+ * recent than the rollback timestamp.
+ *
+ * Also, we have separately discarded any lookaside history more recent
+ * than the rollback timestamp. For page_las structures in cache,
+ * reset any future timestamps back to the rollback timestamp. This
+ * allows those structures to be discarded once the rollback timestamp
+ * is stable (crucially for tests, they can be discarded if the
+ * connection is closed right after a rollback_to_stable call).
+ */
+ local_read = false;
+ read_flags = WT_READ_WONT_NEED;
+ if (ref->page_las != NULL) {
+ if (ref->page_las->skew_newest &&
+ rollback_timestamp < ref->page_las->unstable_durable_timestamp) {
+ /*
+ * Make sure we get back a page with history, not a limbo page.
+ */
+ WT_ASSERT(session, !F_ISSET(&session->txn, WT_TXN_HAS_SNAPSHOT));
+ WT_RET(__wt_page_in(session, ref, read_flags));
+ WT_ASSERT(session,
+ ref->state != WT_REF_LIMBO && ref->page != NULL && __wt_page_is_modified(ref->page));
+ local_read = true;
+ }
+ if (ref->page_las->max_timestamp > rollback_timestamp)
+ ref->page_las->max_timestamp = rollback_timestamp;
+ if (ref->page_las->unstable_durable_timestamp > rollback_timestamp)
+ ref->page_las->unstable_durable_timestamp = rollback_timestamp;
+ if (ref->page_las->unstable_timestamp > rollback_timestamp)
+ ref->page_las->unstable_timestamp = rollback_timestamp;
+ }
+
+ /* Review deleted page saved to the ref */
+ if (ref->page_del != NULL && rollback_timestamp < ref->page_del->durable_timestamp)
+ WT_ERR(__wt_delete_page_rollback(session, ref));
+
+ /*
+ * If we have a ref with no page, or the page is clean, there is
+ * nothing to roll back.
+ *
+ * This check for a clean page is partly an optimization (checkpoint
+ * only marks pages clean when they have no unwritten updates so
+ * there's no point visiting them again), but also covers a corner case
+ * of a checkpoint with use_timestamp=false. Such a checkpoint
+ * effectively moves the stable timestamp forward, because changes that
+ * are written in the checkpoint cannot be reliably rolled back. The
+ * actual stable timestamp doesn't change, though, so if we try to roll
+ * back clean pages the in-memory tree can get out of sync with the
+ * on-disk tree.
+ */
+ if ((page = ref->page) == NULL || !__wt_page_is_modified(page))
+ goto err;
+
+ switch (page->type) {
+ case WT_PAGE_COL_FIX:
+ __txn_abort_newer_col_fix(session, page, rollback_timestamp);
+ break;
+ case WT_PAGE_COL_VAR:
+ __txn_abort_newer_col_var(session, page, rollback_timestamp);
+ break;
+ case WT_PAGE_COL_INT:
+ case WT_PAGE_ROW_INT:
+ /*
+ * There is nothing to do for internal pages, since we aren't rolling back far enough to
+ * potentially include reconciled changes - and thus won't need to roll back structure
+ * changes on internal pages.
+ */
+ break;
+ case WT_PAGE_ROW_LEAF:
+ __txn_abort_newer_row_leaf(session, page, rollback_timestamp);
+ break;
+ default:
+ WT_ERR(__wt_illegal_value(session, page->type));
+ }
+
+err:
+ if (local_read)
+ WT_TRET(__wt_page_release(session, ref, read_flags));
+ return (ret);
}
/*
* __txn_rollback_to_stable_btree_walk --
- * Called for each open handle - choose to either skip or wipe the commits
+ * Called for each open handle - choose to either skip or wipe the commits
*/
static int
-__txn_rollback_to_stable_btree_walk(
- WT_SESSION_IMPL *session, wt_timestamp_t rollback_timestamp)
+__txn_rollback_to_stable_btree_walk(WT_SESSION_IMPL *session, wt_timestamp_t rollback_timestamp)
{
- WT_DECL_RET;
- WT_REF *child_ref, *ref;
-
- /* Walk the tree, marking commits aborted where appropriate. */
- ref = NULL;
- while ((ret = __wt_tree_walk(session, &ref,
- WT_READ_CACHE | WT_READ_NO_EVICT | WT_READ_WONT_NEED)) == 0 &&
- ref != NULL) {
- if (WT_PAGE_IS_INTERNAL(ref->page)) {
- WT_INTL_FOREACH_BEGIN(session, ref->page, child_ref) {
- WT_RET(__txn_abort_newer_updates(
- session, child_ref, rollback_timestamp));
- } WT_INTL_FOREACH_END;
- } else
- WT_RET(__txn_abort_newer_updates(
- session, ref, rollback_timestamp));
- }
- return (ret);
+ WT_DECL_RET;
+ WT_REF *child_ref, *ref;
+
+ /* Walk the tree, marking commits aborted where appropriate. */
+ ref = NULL;
+ while ((ret = __wt_tree_walk(
+ session, &ref, WT_READ_CACHE | WT_READ_NO_EVICT | WT_READ_WONT_NEED)) == 0 &&
+ ref != NULL) {
+ if (WT_PAGE_IS_INTERNAL(ref->page)) {
+ WT_INTL_FOREACH_BEGIN (session, ref->page, child_ref) {
+ WT_RET(__txn_abort_newer_updates(session, child_ref, rollback_timestamp));
+ }
+ WT_INTL_FOREACH_END;
+ } else
+ WT_RET(__txn_abort_newer_updates(session, ref, rollback_timestamp));
+ }
+ return (ret);
}
/*
* __txn_rollback_eviction_drain --
- * Wait for eviction to drain from a tree.
+ * Wait for eviction to drain from a tree.
*/
static int
__txn_rollback_eviction_drain(WT_SESSION_IMPL *session, const char *cfg[])
{
- WT_UNUSED(cfg);
+ WT_UNUSED(cfg);
- WT_RET(__wt_evict_file_exclusive_on(session));
- __wt_evict_file_exclusive_off(session);
- return (0);
+ WT_RET(__wt_evict_file_exclusive_on(session));
+ __wt_evict_file_exclusive_off(session);
+ return (0);
}
/*
* __txn_rollback_to_stable_btree --
- * Called for each open handle - choose to either skip or wipe the commits
+ * Called for each open handle - choose to either skip or wipe the commits
*/
static int
__txn_rollback_to_stable_btree(WT_SESSION_IMPL *session, const char *cfg[])
{
- WT_BTREE *btree;
- WT_CONNECTION_IMPL *conn;
- WT_DECL_RET;
- WT_TXN_GLOBAL *txn_global;
- wt_timestamp_t rollback_timestamp;
-
- WT_UNUSED(cfg);
-
- btree = S2BT(session);
- conn = S2C(session);
- txn_global = &conn->txn_global;
-
- /*
- * Immediately durable files don't get their commits wiped. This case
- * mostly exists to support the semantic required for the oplog in
- * MongoDB - updates that have been made to the oplog should not be
- * aborted. It also wouldn't be safe to roll back updates for any
- * table that had it's records logged, since those updates would be
- * recovered after a crash making them inconsistent.
- */
- if (__wt_btree_immediately_durable(session)) {
- /*
- * Add the btree ID to the bitstring, so we can exclude any
- * lookaside entries for this btree.
- */
- if (btree->id >= conn->stable_rollback_maxfile)
- WT_PANIC_RET(session, EINVAL, "btree file ID %" PRIu32
- " larger than max %" PRIu32,
- btree->id, conn->stable_rollback_maxfile);
- __bit_set(conn->stable_rollback_bitstring, btree->id);
- return (0);
- }
-
- /* There is never anything to do for checkpoint handles */
- if (session->dhandle->checkpoint != NULL)
- return (0);
-
- /* There is nothing to do on an empty tree. */
- if (btree->root.page == NULL)
- return (0);
-
- /*
- * Copy the stable timestamp, otherwise we'd need to lock it each time
- * it's accessed. Even though the stable timestamp isn't supposed to be
- * updated while rolling back, accessing it without a lock would
- * violate protocol.
- */
- WT_ORDERED_READ(rollback_timestamp, txn_global->stable_timestamp);
-
- /*
- * Ensure the eviction server is out of the file - we don't
- * want it messing with us. This step shouldn't be required, but
- * it simplifies some of the reasoning about what state trees can
- * be in.
- */
- WT_RET(__wt_evict_file_exclusive_on(session));
- WT_WITH_PAGE_INDEX(session, ret =
- __txn_rollback_to_stable_btree_walk(session, rollback_timestamp));
- __wt_evict_file_exclusive_off(session);
-
- return (ret);
+ WT_BTREE *btree;
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_TXN_GLOBAL *txn_global;
+ wt_timestamp_t rollback_timestamp;
+
+ WT_UNUSED(cfg);
+
+ btree = S2BT(session);
+ conn = S2C(session);
+ txn_global = &conn->txn_global;
+
+ /*
+ * Immediately durable files don't get their commits wiped. This case mostly exists to support
+ * the semantic required for the oplog in MongoDB - updates that have been made to the oplog
+ * should not be aborted. It also wouldn't be safe to roll back updates for any table that had
+ * it's records logged, since those updates would be recovered after a crash making them
+ * inconsistent.
+ */
+ if (__wt_btree_immediately_durable(session)) {
+ /*
+ * Add the btree ID to the bitstring, so we can exclude any lookaside entries for this
+ * btree.
+ */
+ if (btree->id >= conn->stable_rollback_maxfile)
+ WT_PANIC_RET(session, EINVAL, "btree file ID %" PRIu32 " larger than max %" PRIu32,
+ btree->id, conn->stable_rollback_maxfile);
+ __bit_set(conn->stable_rollback_bitstring, btree->id);
+ return (0);
+ }
+
+ /* There is never anything to do for checkpoint handles */
+ if (session->dhandle->checkpoint != NULL)
+ return (0);
+
+ /* There is nothing to do on an empty tree. */
+ if (btree->root.page == NULL)
+ return (0);
+
+ /*
+ * Copy the stable timestamp, otherwise we'd need to lock it each time it's accessed. Even
+ * though the stable timestamp isn't supposed to be updated while rolling back, accessing it
+ * without a lock would violate protocol.
+ */
+ WT_ORDERED_READ(rollback_timestamp, txn_global->stable_timestamp);
+
+ /*
+ * Ensure the eviction server is out of the file - we don't want it messing with us. This step
+ * shouldn't be required, but it simplifies some of the reasoning about what state trees can be
+ * in.
+ */
+ WT_RET(__wt_evict_file_exclusive_on(session));
+ WT_WITH_PAGE_INDEX(
+ session, ret = __txn_rollback_to_stable_btree_walk(session, rollback_timestamp));
+ __wt_evict_file_exclusive_off(session);
+
+ return (ret);
}
/*
* __txn_rollback_to_stable_check --
- * Ensure the rollback request is reasonable.
+ * Ensure the rollback request is reasonable.
*/
static int
__txn_rollback_to_stable_check(WT_SESSION_IMPL *session)
{
- WT_CONNECTION_IMPL *conn;
- WT_DECL_RET;
- WT_TXN_GLOBAL *txn_global;
- bool txn_active;
-
- conn = S2C(session);
- txn_global = &conn->txn_global;
- if (!txn_global->has_stable_timestamp)
- WT_RET_MSG(session, EINVAL,
- "rollback_to_stable requires a stable timestamp");
-
- /*
- * Help the user comply with the requirement that there are no
- * concurrent operations. Protect against spurious conflicts with the
- * sweep server: we exclude it from running concurrent with rolling
- * back the lookaside contents.
- */
- __wt_writelock(session, &conn->cache->las_sweepwalk_lock);
- ret = __wt_txn_activity_check(session, &txn_active);
- __wt_writeunlock(session, &conn->cache->las_sweepwalk_lock);
-
- if (ret == 0 && txn_active)
- WT_RET_MSG(session, EINVAL,
- "rollback_to_stable illegal with active transactions");
-
- return (ret);
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_TXN_GLOBAL *txn_global;
+ bool txn_active;
+
+ conn = S2C(session);
+ txn_global = &conn->txn_global;
+ if (!txn_global->has_stable_timestamp)
+ WT_RET_MSG(session, EINVAL, "rollback_to_stable requires a stable timestamp");
+
+ /*
+ * Help the user comply with the requirement that there are no concurrent operations. Protect
+ * against spurious conflicts with the sweep server: we exclude it from running concurrent with
+ * rolling back the lookaside contents.
+ */
+ __wt_writelock(session, &conn->cache->las_sweepwalk_lock);
+ ret = __wt_txn_activity_check(session, &txn_active);
+#ifdef HAVE_DIAGNOSTIC
+ if (txn_active)
+ WT_TRET(__wt_verbose_dump_txn(session));
+#endif
+ __wt_writeunlock(session, &conn->cache->las_sweepwalk_lock);
+
+ if (ret == 0 && txn_active)
+ WT_RET_MSG(session, EINVAL, "rollback_to_stable illegal with active transactions");
+
+ return (ret);
}
/*
* __txn_rollback_to_stable --
- * Rollback all in-memory state related to timestamps more recent than
- * the passed in timestamp.
+ * Rollback all in-memory state related to timestamps more recent than the passed in timestamp.
*/
static int
__txn_rollback_to_stable(WT_SESSION_IMPL *session, const char *cfg[])
{
- WT_CONNECTION_IMPL *conn;
- WT_DECL_RET;
-
- conn = S2C(session);
-
- WT_STAT_CONN_INCR(session, txn_rollback_to_stable);
- /*
- * Mark that a rollback operation is in progress and wait for eviction
- * to drain. This is necessary because lookaside eviction uses
- * transactions and causes the check for a quiescent system to fail.
- *
- * Configuring lookaside eviction off isn't atomic, safe because the
- * flag is only otherwise set when closing down the database. Assert
- * to avoid confusion in the future.
- */
- WT_ASSERT(session, !F_ISSET(conn, WT_CONN_EVICTION_NO_LOOKASIDE));
- F_SET(conn, WT_CONN_EVICTION_NO_LOOKASIDE);
-
- WT_ERR(__wt_conn_btree_apply(session,
- NULL, __txn_rollback_eviction_drain, NULL, cfg));
-
- WT_ERR(__txn_rollback_to_stable_check(session));
-
- F_CLR(conn, WT_CONN_EVICTION_NO_LOOKASIDE);
-
- /*
- * Allocate a non-durable btree bitstring. We increment the global
- * value before using it, so the current value is already in use, and
- * hence we need to add one here.
- */
- conn->stable_rollback_maxfile = conn->next_file_id + 1;
- WT_ERR(__bit_alloc(session,
- conn->stable_rollback_maxfile, &conn->stable_rollback_bitstring));
- WT_ERR(__wt_conn_btree_apply(session,
- NULL, __txn_rollback_to_stable_btree, NULL, cfg));
-
- /*
- * Clear any offending content from the lookaside file. This must be
- * done after the in-memory application, since the process of walking
- * trees in cache populates a list that is used to check which
- * lookaside records should be removed.
- */
- if (!F_ISSET(conn, WT_CONN_IN_MEMORY))
- WT_ERR(__txn_rollback_to_stable_lookaside_fixup(session));
-
-err: F_CLR(conn, WT_CONN_EVICTION_NO_LOOKASIDE);
- __wt_free(session, conn->stable_rollback_bitstring);
- return (ret);
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+
+ conn = S2C(session);
+
+ WT_STAT_CONN_INCR(session, txn_rollback_to_stable);
+ /*
+ * Mark that a rollback operation is in progress and wait for eviction
+ * to drain. This is necessary because lookaside eviction uses
+ * transactions and causes the check for a quiescent system to fail.
+ *
+ * Configuring lookaside eviction off isn't atomic, safe because the
+ * flag is only otherwise set when closing down the database. Assert
+ * to avoid confusion in the future.
+ */
+ WT_ASSERT(session, !F_ISSET(conn, WT_CONN_EVICTION_NO_LOOKASIDE));
+ F_SET(conn, WT_CONN_EVICTION_NO_LOOKASIDE);
+
+ WT_ERR(__wt_conn_btree_apply(session, NULL, __txn_rollback_eviction_drain, NULL, cfg));
+
+ WT_ERR(__txn_rollback_to_stable_check(session));
+
+ F_CLR(conn, WT_CONN_EVICTION_NO_LOOKASIDE);
+
+ /*
+ * Allocate a non-durable btree bitstring. We increment the global value before using it, so the
+ * current value is already in use, and hence we need to add one here.
+ */
+ conn->stable_rollback_maxfile = conn->next_file_id + 1;
+ WT_ERR(__bit_alloc(session, conn->stable_rollback_maxfile, &conn->stable_rollback_bitstring));
+ WT_ERR(__wt_conn_btree_apply(session, NULL, __txn_rollback_to_stable_btree, NULL, cfg));
+
+ /*
+ * Clear any offending content from the lookaside file. This must be done after the in-memory
+ * application, since the process of walking trees in cache populates a list that is used to
+ * check which lookaside records should be removed.
+ */
+ if (!F_ISSET(conn, WT_CONN_IN_MEMORY))
+ WT_ERR(__txn_rollback_to_stable_lookaside_fixup(session));
+
+err:
+ F_CLR(conn, WT_CONN_EVICTION_NO_LOOKASIDE);
+ __wt_free(session, conn->stable_rollback_bitstring);
+ return (ret);
}
/*
* __wt_txn_rollback_to_stable --
- * Rollback all in-memory state related to timestamps more recent than
- * the passed in timestamp.
+ * Rollback all in-memory state related to timestamps more recent than the passed in timestamp.
*/
int
__wt_txn_rollback_to_stable(WT_SESSION_IMPL *session, const char *cfg[])
{
- WT_DECL_RET;
-
- /*
- * Don't use the connection's default session: we are working on data
- * handles and (a) don't want to cache all of them forever, plus (b)
- * can't guarantee that no other method will be called concurrently.
- */
- WT_RET(__wt_open_internal_session(S2C(session),
- "txn rollback_to_stable", true, 0, &session));
- ret = __txn_rollback_to_stable(session, cfg);
- WT_TRET(session->iface.close(&session->iface, NULL));
-
- return (ret);
+ WT_DECL_RET;
+
+ /*
+ * Don't use the connection's default session: we are working on data handles and (a) don't want
+ * to cache all of them forever, plus (b) can't guarantee that no other method will be called
+ * concurrently.
+ */
+ WT_RET(__wt_open_internal_session(S2C(session), "txn rollback_to_stable", true, 0, &session));
+ ret = __txn_rollback_to_stable(session, cfg);
+ WT_TRET(session->iface.close(&session->iface, NULL));
+
+ return (ret);
}
diff --git a/src/third_party/wiredtiger/src/txn/txn_timestamp.c b/src/third_party/wiredtiger/src/txn/txn_timestamp.c
index 84b9c290641..2d9291ebbce 100644
--- a/src/third_party/wiredtiger/src/txn/txn_timestamp.c
+++ b/src/third_party/wiredtiger/src/txn/txn_timestamp.c
@@ -10,1397 +10,1285 @@
/*
* __wt_timestamp_to_string --
- * Convert a timestamp to the MongoDB string representation.
+ * Convert a timestamp to the MongoDB string representation.
*/
char *
__wt_timestamp_to_string(wt_timestamp_t ts, char *ts_string)
{
- WT_IGNORE_RET(__wt_snprintf(ts_string, WT_TS_INT_STRING_SIZE,
- "(%" PRIu32 ",%" PRIu32 ")",
- (uint32_t)((ts >> 32) & 0xffffffff), (uint32_t)(ts & 0xffffffff)));
- return (ts_string);
+ WT_IGNORE_RET(__wt_snprintf(ts_string, WT_TS_INT_STRING_SIZE, "(%" PRIu32 ",%" PRIu32 ")",
+ (uint32_t)((ts >> 32) & 0xffffffff), (uint32_t)(ts & 0xffffffff)));
+ return (ts_string);
}
/*
* __wt_timestamp_to_hex_string --
- * Convert a timestamp to hex string representation.
+ * Convert a timestamp to hex string representation.
*/
void
__wt_timestamp_to_hex_string(wt_timestamp_t ts, char *hex_timestamp)
{
- char *p, v;
-
- if (ts == 0) {
- hex_timestamp[0] = '0';
- hex_timestamp[1] = '\0';
- return;
- }
- if (ts == WT_TS_MAX) {
-#define WT_TS_MAX_HEX_STRING "ffffffffffffffff"
- (void)memcpy(hex_timestamp,
- WT_TS_MAX_HEX_STRING, strlen(WT_TS_MAX_HEX_STRING) + 1);
- return;
- }
-
- for (p = hex_timestamp; ts != 0; ts >>= 4)
- *p++ = (char)__wt_hex((u_char)(ts & 0x0f));
- *p = '\0';
-
- /* Reverse the string. */
- for (--p; p > hex_timestamp;) {
- v = *p;
- *p-- = *hex_timestamp;
- *hex_timestamp++ = v;
- }
+ char *p, v;
+
+ if (ts == 0) {
+ hex_timestamp[0] = '0';
+ hex_timestamp[1] = '\0';
+ return;
+ }
+ if (ts == WT_TS_MAX) {
+#define WT_TS_MAX_HEX_STRING "ffffffffffffffff"
+ (void)memcpy(hex_timestamp, WT_TS_MAX_HEX_STRING, strlen(WT_TS_MAX_HEX_STRING) + 1);
+ return;
+ }
+
+ for (p = hex_timestamp; ts != 0; ts >>= 4)
+ *p++ = (char)__wt_hex((u_char)(ts & 0x0f));
+ *p = '\0';
+
+ /* Reverse the string. */
+ for (--p; p > hex_timestamp;) {
+ v = *p;
+ *p-- = *hex_timestamp;
+ *hex_timestamp++ = v;
+ }
}
/*
* __wt_verbose_timestamp --
- * Output a verbose message along with the specified timestamp.
+ * Output a verbose message along with the specified timestamp.
*/
void
-__wt_verbose_timestamp(
- WT_SESSION_IMPL *session, wt_timestamp_t ts, const char *msg)
+__wt_verbose_timestamp(WT_SESSION_IMPL *session, wt_timestamp_t ts, const char *msg)
{
- char ts_string[WT_TS_INT_STRING_SIZE];
+ char ts_string[WT_TS_INT_STRING_SIZE];
- __wt_verbose(session,
- WT_VERB_TIMESTAMP, "Timestamp %s: %s",
- __wt_timestamp_to_string(ts, ts_string), msg);
+ __wt_verbose(
+ session, WT_VERB_TIMESTAMP, "Timestamp %s: %s", __wt_timestamp_to_string(ts, ts_string), msg);
}
/*
* __wt_txn_parse_timestamp_raw --
- * Decodes and sets a timestamp. Don't do any checking.
+ * Decodes and sets a timestamp. Don't do any checking.
*/
int
-__wt_txn_parse_timestamp_raw(WT_SESSION_IMPL *session, const char *name,
- wt_timestamp_t *timestamp, WT_CONFIG_ITEM *cval)
+__wt_txn_parse_timestamp_raw(
+ WT_SESSION_IMPL *session, const char *name, wt_timestamp_t *timestamp, WT_CONFIG_ITEM *cval)
{
- static const int8_t hextable[] = {
- -1, -1, -1, -1, -1, -1, -1, -1,
- -1, -1, -1, -1, -1, -1, -1, -1,
- -1, -1, -1, -1, -1, -1, -1, -1,
- -1, -1, -1, -1, -1, -1, -1, -1,
- -1, -1, -1, -1, -1, -1, -1, -1,
- -1, -1, -1, -1, -1, -1, -1, -1,
- 0, 1, 2, 3, 4, 5, 6, 7,
- 8, 9, -1, -1, -1, -1, -1, -1,
- -1, 10, 11, 12, 13, 14, 15, -1,
- -1, -1, -1, -1, -1, -1, -1, -1,
- -1, -1, -1, -1, -1, -1, -1, -1,
- -1, -1, -1, -1, -1, -1, -1, -1,
- -1, 10, 11, 12, 13, 14, 15, -1
- };
- wt_timestamp_t ts;
- size_t len;
- int hex_val;
- const char *hex_itr;
-
- *timestamp = 0;
-
- if (cval->len == 0)
- return (0);
-
- /* Protect against unexpectedly long hex strings. */
- if (cval->len > 2 * sizeof(wt_timestamp_t))
- WT_RET_MSG(session, EINVAL,
- "%s timestamp too long '%.*s'",
- name, (int)cval->len, cval->str);
-
- for (ts = 0, hex_itr = cval->str, len = cval->len; len > 0; --len) {
- if ((size_t)*hex_itr < WT_ELEMENTS(hextable))
- hex_val = hextable[(size_t)*hex_itr++];
- else
- hex_val = -1;
- if (hex_val < 0)
- WT_RET_MSG(session, EINVAL,
- "Failed to parse %s timestamp '%.*s'",
- name, (int)cval->len, cval->str);
- ts = (ts << 4) | (uint64_t)hex_val;
- }
- *timestamp = ts;
-
- return (0);
+ static const int8_t hextable[] = {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1,
+ -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 10, 11, 12, 13, 14, 15, -1};
+ wt_timestamp_t ts;
+ size_t len;
+ int hex_val;
+ const char *hex_itr;
+
+ *timestamp = 0;
+
+ if (cval->len == 0)
+ return (0);
+
+ /* Protect against unexpectedly long hex strings. */
+ if (cval->len > 2 * sizeof(wt_timestamp_t))
+ WT_RET_MSG(
+ session, EINVAL, "%s timestamp too long '%.*s'", name, (int)cval->len, cval->str);
+
+ for (ts = 0, hex_itr = cval->str, len = cval->len; len > 0; --len) {
+ if ((size_t)*hex_itr < WT_ELEMENTS(hextable))
+ hex_val = hextable[(size_t)*hex_itr++];
+ else
+ hex_val = -1;
+ if (hex_val < 0)
+ WT_RET_MSG(session, EINVAL, "Failed to parse %s timestamp '%.*s'", name, (int)cval->len,
+ cval->str);
+ ts = (ts << 4) | (uint64_t)hex_val;
+ }
+ *timestamp = ts;
+
+ return (0);
}
/*
* __wt_txn_parse_timestamp --
- * Decodes and sets a timestamp checking it is non-zero.
+ * Decodes and sets a timestamp checking it is non-zero.
*/
int
-__wt_txn_parse_timestamp(WT_SESSION_IMPL *session, const char *name,
- wt_timestamp_t *timestamp, WT_CONFIG_ITEM *cval)
+__wt_txn_parse_timestamp(
+ WT_SESSION_IMPL *session, const char *name, wt_timestamp_t *timestamp, WT_CONFIG_ITEM *cval)
{
- WT_RET(__wt_txn_parse_timestamp_raw(session, name, timestamp, cval));
- if (cval->len != 0 && *timestamp == WT_TS_NONE)
- WT_RET_MSG(session, EINVAL,
- "Failed to parse %s timestamp '%.*s': zero not permitted",
- name, (int)cval->len, cval->str);
+ WT_RET(__wt_txn_parse_timestamp_raw(session, name, timestamp, cval));
+ if (cval->len != 0 && *timestamp == WT_TS_NONE)
+ WT_RET_MSG(session, EINVAL, "Failed to parse %s timestamp '%.*s': zero not permitted", name,
+ (int)cval->len, cval->str);
- return (0);
+ return (0);
}
/*
* __txn_get_read_timestamp --
- * Get the read timestamp from the transaction. Additionally
- * return bool to specify whether the transaction has set
- * clear read queue flag.
+ * Get the read timestamp from the transaction. Additionally return bool to specify whether the
+ * transaction has set clear read queue flag.
*/
static bool
-__txn_get_read_timestamp(
- WT_TXN *txn, wt_timestamp_t *read_timestampp)
+__txn_get_read_timestamp(WT_TXN *txn, wt_timestamp_t *read_timestampp)
{
- WT_ORDERED_READ(*read_timestampp, txn->read_timestamp);
- return (!txn->clear_read_q);
+ WT_ORDERED_READ(*read_timestampp, txn->read_timestamp);
+ return (!txn->clear_read_q);
}
/*
* __wt_txn_get_pinned_timestamp --
- * Calculate the current pinned timestamp.
+ * Calculate the current pinned timestamp.
*/
int
-__wt_txn_get_pinned_timestamp(
- WT_SESSION_IMPL *session, wt_timestamp_t *tsp, uint32_t flags)
+__wt_txn_get_pinned_timestamp(WT_SESSION_IMPL *session, wt_timestamp_t *tsp, uint32_t flags)
{
- WT_CONNECTION_IMPL *conn;
- WT_TXN *txn;
- WT_TXN_GLOBAL *txn_global;
- wt_timestamp_t tmp_read_ts, tmp_ts;
- bool include_oldest, txn_has_write_lock;
-
- conn = S2C(session);
- txn_global = &conn->txn_global;
- include_oldest = LF_ISSET(WT_TXN_TS_INCLUDE_OLDEST);
- txn_has_write_lock = LF_ISSET(WT_TXN_TS_ALREADY_LOCKED);
-
- if (include_oldest && !txn_global->has_oldest_timestamp)
- return (WT_NOTFOUND);
-
- if (!txn_has_write_lock)
- __wt_readlock(session, &txn_global->rwlock);
-
- tmp_ts = include_oldest ? txn_global->oldest_timestamp : 0;
-
- /* Check for a running checkpoint */
- if (LF_ISSET(WT_TXN_TS_INCLUDE_CKPT) &&
- txn_global->checkpoint_timestamp != WT_TS_NONE &&
- (tmp_ts == 0 || txn_global->checkpoint_timestamp < tmp_ts))
- tmp_ts = txn_global->checkpoint_timestamp;
- if (!txn_has_write_lock)
- __wt_readunlock(session, &txn_global->rwlock);
-
- /* Look for the oldest ordinary reader. */
- __wt_readlock(session, &txn_global->read_timestamp_rwlock);
- TAILQ_FOREACH(txn, &txn_global->read_timestamph, read_timestampq) {
- /*
- * Skip any transactions on the queue that are not active.
- * Copy out value of read timestamp to prevent possible
- * race where a transaction resets its read timestamp while
- * we traverse the queue.
- */
- if (!__txn_get_read_timestamp(txn, &tmp_read_ts))
- continue;
- /*
- * A zero timestamp is possible here only when the oldest
- * timestamp is not accounted for.
- */
- if (tmp_ts == 0 || tmp_read_ts < tmp_ts)
- tmp_ts = tmp_read_ts;
- /*
- * We break on the first active txn on the list.
- */
- break;
- }
- __wt_readunlock(session, &txn_global->read_timestamp_rwlock);
-
- if (!include_oldest && tmp_ts == 0)
- return (WT_NOTFOUND);
- *tsp = tmp_ts;
-
- return (0);
+ WT_CONNECTION_IMPL *conn;
+ WT_TXN *txn;
+ WT_TXN_GLOBAL *txn_global;
+ wt_timestamp_t tmp_read_ts, tmp_ts;
+ bool include_oldest, txn_has_write_lock;
+
+ conn = S2C(session);
+ txn_global = &conn->txn_global;
+ include_oldest = LF_ISSET(WT_TXN_TS_INCLUDE_OLDEST);
+ txn_has_write_lock = LF_ISSET(WT_TXN_TS_ALREADY_LOCKED);
+
+ if (include_oldest && !txn_global->has_oldest_timestamp)
+ return (WT_NOTFOUND);
+
+ if (!txn_has_write_lock)
+ __wt_readlock(session, &txn_global->rwlock);
+
+ tmp_ts = include_oldest ? txn_global->oldest_timestamp : 0;
+
+ /* Check for a running checkpoint */
+ if (LF_ISSET(WT_TXN_TS_INCLUDE_CKPT) && txn_global->checkpoint_timestamp != WT_TS_NONE &&
+ (tmp_ts == 0 || txn_global->checkpoint_timestamp < tmp_ts))
+ tmp_ts = txn_global->checkpoint_timestamp;
+ if (!txn_has_write_lock)
+ __wt_readunlock(session, &txn_global->rwlock);
+
+ /* Look for the oldest ordinary reader. */
+ __wt_readlock(session, &txn_global->read_timestamp_rwlock);
+ TAILQ_FOREACH (txn, &txn_global->read_timestamph, read_timestampq) {
+ /*
+ * Skip any transactions on the queue that are not active. Copy out value of read timestamp
+ * to prevent possible race where a transaction resets its read timestamp while we traverse
+ * the queue.
+ */
+ if (!__txn_get_read_timestamp(txn, &tmp_read_ts))
+ continue;
+ /*
+ * A zero timestamp is possible here only when the oldest timestamp is not accounted for.
+ */
+ if (tmp_ts == 0 || tmp_read_ts < tmp_ts)
+ tmp_ts = tmp_read_ts;
+ /*
+ * We break on the first active txn on the list.
+ */
+ break;
+ }
+ __wt_readunlock(session, &txn_global->read_timestamp_rwlock);
+
+ if (!include_oldest && tmp_ts == 0)
+ return (WT_NOTFOUND);
+ *tsp = tmp_ts;
+
+ return (0);
}
/*
* __txn_get_published_timestamp --
- * Get the current durable timestamp for a given transaction. If there is
- * an explicit durable timestamp, this function will return the commit
- * timestamp since this is implied. If there is neither a commit nor a
- * durable timestamp, this function will return 0.
+ * Get the current durable timestamp for a given transaction. If there is an explicit durable
+ * timestamp, this function will return the commit timestamp since this is implied. If there is
+ * neither a commit nor a durable timestamp, this function will return 0.
*/
static inline wt_timestamp_t
__txn_get_published_timestamp(WT_SESSION_IMPL *session, WT_TXN *txn)
{
- wt_timestamp_t ts;
-
- /*
- * Any checking of bit flags in this logic is invalid. __wt_txn_release
- * may have already been called on this transaction which will set the
- * flags member to 0. So we need to deduce which timestamp to use purely
- * by inspecting the timestamp members which we deliberately preserve
- * for reader threads such as ourselves.
- *
- * In the non-prepared case, the first commit will either be less than
- * the commit (in the case of multiple commits) in which case we should
- * return the first commit. Or it will be equal to the commit (in the
- * case of a single commit) and we can return durable (which is mirrored
- * from the commit timestamp).
- *
- * In the prepared case, the first commit will always be equal to the
- * commit so we'll return durable.
- */
- if (txn->commit_timestamp != txn->first_commit_timestamp)
- ts = txn->first_commit_timestamp;
- else
- ts = txn->durable_timestamp;
-
- WT_ASSERT(session, ts != WT_TS_NONE);
- return (ts);
+ wt_timestamp_t ts;
+
+ /*
+ * Any checking of bit flags in this logic is invalid. __wt_txn_release
+ * may have already been called on this transaction which will set the
+ * flags member to 0. So we need to deduce which timestamp to use purely
+ * by inspecting the timestamp members which we deliberately preserve
+ * for reader threads such as ourselves.
+ *
+ * In the non-prepared case, the first commit will either be less than
+ * the commit (in the case of multiple commits) in which case we should
+ * return the first commit. Or it will be equal to the commit (in the
+ * case of a single commit) and we can return durable (which is mirrored
+ * from the commit timestamp).
+ *
+ * In the prepared case, the first commit will always be equal to the
+ * commit so we'll return durable.
+ */
+ if (txn->commit_timestamp != txn->first_commit_timestamp)
+ ts = txn->first_commit_timestamp;
+ else
+ ts = txn->durable_timestamp;
+
+ WT_ASSERT(session, ts != WT_TS_NONE);
+ return (ts);
}
/*
* __txn_global_query_timestamp --
- * Query a timestamp on the global transaction.
+ * Query a timestamp on the global transaction.
*/
static int
-__txn_global_query_timestamp(
- WT_SESSION_IMPL *session, wt_timestamp_t *tsp, const char *cfg[])
+__txn_global_query_timestamp(WT_SESSION_IMPL *session, wt_timestamp_t *tsp, const char *cfg[])
{
- WT_CONFIG_ITEM cval;
- WT_CONNECTION_IMPL *conn;
- WT_TXN *txn;
- WT_TXN_GLOBAL *txn_global;
- wt_timestamp_t ts, tmpts;
-
- conn = S2C(session);
- txn_global = &conn->txn_global;
-
- WT_STAT_CONN_INCR(session, txn_query_ts);
- WT_RET(__wt_config_gets(session, cfg, "get", &cval));
- if (WT_STRING_MATCH("all_committed", cval.str, cval.len) ||
- WT_STRING_MATCH("all_durable", cval.str, cval.len)) {
- if (!txn_global->has_durable_timestamp)
- return (WT_NOTFOUND);
- ts = txn_global->durable_timestamp;
- WT_ASSERT(session, ts != WT_TS_NONE);
-
- /*
- * Skip straight to the commit queue if no running transactions
- * have an explicit durable timestamp.
- */
- if (TAILQ_EMPTY(&txn_global->durable_timestamph))
- goto done;
- /*
- * Compare with the least recently durable transaction.
- */
- __wt_readlock(session, &txn_global->durable_timestamp_rwlock);
- TAILQ_FOREACH(txn, &txn_global->durable_timestamph,
- durable_timestampq) {
- if (txn->clear_durable_q)
- continue;
-
- tmpts = __txn_get_published_timestamp(session, txn) - 1;
- if (tmpts < ts)
- ts = tmpts;
- break;
- }
- __wt_readunlock(session, &txn_global->durable_timestamp_rwlock);
-
- /*
- * If a transaction is committing with a durable timestamp of 1,
- * we could return zero here, which is unexpected. Fail instead.
- */
- if (ts == WT_TS_NONE)
- return (WT_NOTFOUND);
- } else if (WT_STRING_MATCH("last_checkpoint", cval.str, cval.len))
- /* Read-only value forever. No lock needed. */
- ts = txn_global->last_ckpt_timestamp;
- else if (WT_STRING_MATCH("oldest", cval.str, cval.len)) {
- if (!txn_global->has_oldest_timestamp)
- return (WT_NOTFOUND);
- ts = txn_global->oldest_timestamp;
- } else if (WT_STRING_MATCH("oldest_reader", cval.str, cval.len))
- WT_RET(__wt_txn_get_pinned_timestamp(
- session, &ts, WT_TXN_TS_INCLUDE_CKPT));
- else if (WT_STRING_MATCH("pinned", cval.str, cval.len))
- WT_RET(__wt_txn_get_pinned_timestamp(session, &ts,
- WT_TXN_TS_INCLUDE_CKPT | WT_TXN_TS_INCLUDE_OLDEST));
- else if (WT_STRING_MATCH("recovery", cval.str, cval.len))
- /* Read-only value forever. No lock needed. */
- ts = txn_global->recovery_timestamp;
- else if (WT_STRING_MATCH("stable", cval.str, cval.len)) {
- if (!txn_global->has_stable_timestamp)
- return (WT_NOTFOUND);
- ts = txn_global->stable_timestamp;
- } else
- WT_RET_MSG(session, EINVAL,
- "unknown timestamp query %.*s", (int)cval.len, cval.str);
-
-done: *tsp = ts;
- return (0);
+ WT_CONFIG_ITEM cval;
+ WT_CONNECTION_IMPL *conn;
+ WT_TXN *txn;
+ WT_TXN_GLOBAL *txn_global;
+ wt_timestamp_t ts, tmpts;
+
+ conn = S2C(session);
+ txn_global = &conn->txn_global;
+
+ WT_STAT_CONN_INCR(session, txn_query_ts);
+ WT_RET(__wt_config_gets(session, cfg, "get", &cval));
+ if (WT_STRING_MATCH("all_committed", cval.str, cval.len) ||
+ WT_STRING_MATCH("all_durable", cval.str, cval.len)) {
+ if (!txn_global->has_durable_timestamp)
+ return (WT_NOTFOUND);
+ ts = txn_global->durable_timestamp;
+ WT_ASSERT(session, ts != WT_TS_NONE);
+
+ /*
+ * Skip straight to the commit queue if no running transactions have an explicit durable
+ * timestamp.
+ */
+ if (TAILQ_EMPTY(&txn_global->durable_timestamph))
+ goto done;
+ /*
+ * Compare with the least recently durable transaction.
+ */
+ __wt_readlock(session, &txn_global->durable_timestamp_rwlock);
+ TAILQ_FOREACH (txn, &txn_global->durable_timestamph, durable_timestampq) {
+ if (txn->clear_durable_q)
+ continue;
+
+ tmpts = __txn_get_published_timestamp(session, txn) - 1;
+ if (tmpts < ts)
+ ts = tmpts;
+ break;
+ }
+ __wt_readunlock(session, &txn_global->durable_timestamp_rwlock);
+
+ /*
+ * If a transaction is committing with a durable timestamp of 1, we could return zero here,
+ * which is unexpected. Fail instead.
+ */
+ if (ts == WT_TS_NONE)
+ return (WT_NOTFOUND);
+ } else if (WT_STRING_MATCH("last_checkpoint", cval.str, cval.len))
+ /* Read-only value forever. No lock needed. */
+ ts = txn_global->last_ckpt_timestamp;
+ else if (WT_STRING_MATCH("oldest", cval.str, cval.len)) {
+ if (!txn_global->has_oldest_timestamp)
+ return (WT_NOTFOUND);
+ ts = txn_global->oldest_timestamp;
+ } else if (WT_STRING_MATCH("oldest_reader", cval.str, cval.len))
+ WT_RET(__wt_txn_get_pinned_timestamp(session, &ts, WT_TXN_TS_INCLUDE_CKPT));
+ else if (WT_STRING_MATCH("pinned", cval.str, cval.len))
+ WT_RET(__wt_txn_get_pinned_timestamp(
+ session, &ts, WT_TXN_TS_INCLUDE_CKPT | WT_TXN_TS_INCLUDE_OLDEST));
+ else if (WT_STRING_MATCH("recovery", cval.str, cval.len))
+ /* Read-only value forever. No lock needed. */
+ ts = txn_global->recovery_timestamp;
+ else if (WT_STRING_MATCH("stable", cval.str, cval.len)) {
+ if (!txn_global->has_stable_timestamp)
+ return (WT_NOTFOUND);
+ ts = txn_global->stable_timestamp;
+ } else
+ WT_RET_MSG(session, EINVAL, "unknown timestamp query %.*s", (int)cval.len, cval.str);
+
+done:
+ *tsp = ts;
+ return (0);
}
/*
* __txn_query_timestamp --
- * Query a timestamp within this session's transaction.
+ * Query a timestamp within this session's transaction.
*/
static int
-__txn_query_timestamp(
- WT_SESSION_IMPL *session, wt_timestamp_t *tsp, const char *cfg[])
+__txn_query_timestamp(WT_SESSION_IMPL *session, wt_timestamp_t *tsp, const char *cfg[])
{
- WT_CONFIG_ITEM cval;
- WT_TXN *txn;
-
- txn = &session->txn;
-
- WT_STAT_CONN_INCR(session, session_query_ts);
- if (!F_ISSET(txn, WT_TXN_RUNNING))
- return (WT_NOTFOUND);
-
- WT_RET(__wt_config_gets(session, cfg, "get", &cval));
- if (WT_STRING_MATCH("commit", cval.str, cval.len))
- *tsp = txn->commit_timestamp;
- else if (WT_STRING_MATCH("first_commit", cval.str, cval.len))
- *tsp = txn->first_commit_timestamp;
- else if (WT_STRING_MATCH("prepare", cval.str, cval.len))
- *tsp = txn->prepare_timestamp;
- else if (WT_STRING_MATCH("read", cval.str, cval.len))
- *tsp = txn->read_timestamp;
- else
- WT_RET_MSG(session, EINVAL,
- "unknown timestamp query %.*s", (int)cval.len, cval.str);
-
- return (0);
+ WT_CONFIG_ITEM cval;
+ WT_TXN *txn;
+
+ txn = &session->txn;
+
+ WT_STAT_CONN_INCR(session, session_query_ts);
+ if (!F_ISSET(txn, WT_TXN_RUNNING))
+ return (WT_NOTFOUND);
+
+ WT_RET(__wt_config_gets(session, cfg, "get", &cval));
+ if (WT_STRING_MATCH("commit", cval.str, cval.len))
+ *tsp = txn->commit_timestamp;
+ else if (WT_STRING_MATCH("first_commit", cval.str, cval.len))
+ *tsp = txn->first_commit_timestamp;
+ else if (WT_STRING_MATCH("prepare", cval.str, cval.len))
+ *tsp = txn->prepare_timestamp;
+ else if (WT_STRING_MATCH("read", cval.str, cval.len))
+ *tsp = txn->read_timestamp;
+ else
+ WT_RET_MSG(session, EINVAL, "unknown timestamp query %.*s", (int)cval.len, cval.str);
+
+ return (0);
}
/*
* __wt_txn_query_timestamp --
- * Query a timestamp. The caller may query the global transaction or the
- * session's transaction.
+ * Query a timestamp. The caller may query the global transaction or the session's transaction.
*/
int
-__wt_txn_query_timestamp(WT_SESSION_IMPL *session,
- char *hex_timestamp, const char *cfg[], bool global_txn)
+__wt_txn_query_timestamp(
+ WT_SESSION_IMPL *session, char *hex_timestamp, const char *cfg[], bool global_txn)
{
- wt_timestamp_t ts;
+ wt_timestamp_t ts;
- if (global_txn)
- WT_RET(__txn_global_query_timestamp(session, &ts, cfg));
- else
- WT_RET(__txn_query_timestamp(session, &ts, cfg));
+ if (global_txn)
+ WT_RET(__txn_global_query_timestamp(session, &ts, cfg));
+ else
+ WT_RET(__txn_query_timestamp(session, &ts, cfg));
- __wt_timestamp_to_hex_string(ts, hex_timestamp);
- return (0);
+ __wt_timestamp_to_hex_string(ts, hex_timestamp);
+ return (0);
}
/*
* __wt_txn_update_pinned_timestamp --
- * Update the pinned timestamp (the oldest timestamp that has to be
- * maintained for current or future readers).
+ * Update the pinned timestamp (the oldest timestamp that has to be maintained for current or
+ * future readers).
*/
int
__wt_txn_update_pinned_timestamp(WT_SESSION_IMPL *session, bool force)
{
- WT_DECL_RET;
- WT_TXN_GLOBAL *txn_global;
- wt_timestamp_t last_pinned_timestamp, pinned_timestamp;
-
- txn_global = &S2C(session)->txn_global;
-
- /* Skip locking and scanning when the oldest timestamp is pinned. */
- if (txn_global->oldest_is_pinned)
- return (0);
-
- /* Scan to find the global pinned timestamp. */
- if ((ret = __wt_txn_get_pinned_timestamp(
- session, &pinned_timestamp, WT_TXN_TS_INCLUDE_OLDEST)) != 0)
- return (ret == WT_NOTFOUND ? 0 : ret);
-
- if (txn_global->has_pinned_timestamp && !force) {
- last_pinned_timestamp = txn_global->pinned_timestamp;
-
- if (pinned_timestamp <= last_pinned_timestamp)
- return (0);
- }
-
- __wt_writelock(session, &txn_global->rwlock);
- /*
- * Scan the global pinned timestamp again, it's possible that it got
- * changed after the previous scan.
- */
- if ((ret = __wt_txn_get_pinned_timestamp(session, &pinned_timestamp,
- WT_TXN_TS_ALREADY_LOCKED | WT_TXN_TS_INCLUDE_OLDEST)) != 0) {
- __wt_writeunlock(session, &txn_global->rwlock);
- return (ret == WT_NOTFOUND ? 0 : ret);
- }
-
- if (!txn_global->has_pinned_timestamp || force ||
- txn_global->pinned_timestamp < pinned_timestamp) {
- txn_global->pinned_timestamp = pinned_timestamp;
- txn_global->has_pinned_timestamp = true;
- txn_global->oldest_is_pinned =
- txn_global->pinned_timestamp ==
- txn_global->oldest_timestamp;
- txn_global->stable_is_pinned =
- txn_global->pinned_timestamp ==
- txn_global->stable_timestamp;
- __wt_verbose_timestamp(session,
- pinned_timestamp, "Updated pinned timestamp");
- }
- __wt_writeunlock(session, &txn_global->rwlock);
-
- return (0);
+ WT_DECL_RET;
+ WT_TXN_GLOBAL *txn_global;
+ wt_timestamp_t last_pinned_timestamp, pinned_timestamp;
+
+ txn_global = &S2C(session)->txn_global;
+
+ /* Skip locking and scanning when the oldest timestamp is pinned. */
+ if (txn_global->oldest_is_pinned)
+ return (0);
+
+ /* Scan to find the global pinned timestamp. */
+ if ((ret = __wt_txn_get_pinned_timestamp(
+ session, &pinned_timestamp, WT_TXN_TS_INCLUDE_OLDEST)) != 0)
+ return (ret == WT_NOTFOUND ? 0 : ret);
+
+ if (txn_global->has_pinned_timestamp && !force) {
+ last_pinned_timestamp = txn_global->pinned_timestamp;
+
+ if (pinned_timestamp <= last_pinned_timestamp)
+ return (0);
+ }
+
+ __wt_writelock(session, &txn_global->rwlock);
+ /*
+ * Scan the global pinned timestamp again, it's possible that it got changed after the previous
+ * scan.
+ */
+ if ((ret = __wt_txn_get_pinned_timestamp(
+ session, &pinned_timestamp, WT_TXN_TS_ALREADY_LOCKED | WT_TXN_TS_INCLUDE_OLDEST)) != 0) {
+ __wt_writeunlock(session, &txn_global->rwlock);
+ return (ret == WT_NOTFOUND ? 0 : ret);
+ }
+
+ if (!txn_global->has_pinned_timestamp || force ||
+ txn_global->pinned_timestamp < pinned_timestamp) {
+ txn_global->pinned_timestamp = pinned_timestamp;
+ txn_global->has_pinned_timestamp = true;
+ txn_global->oldest_is_pinned = txn_global->pinned_timestamp == txn_global->oldest_timestamp;
+ txn_global->stable_is_pinned = txn_global->pinned_timestamp == txn_global->stable_timestamp;
+ __wt_verbose_timestamp(session, pinned_timestamp, "Updated pinned timestamp");
+ }
+ __wt_writeunlock(session, &txn_global->rwlock);
+
+ return (0);
}
/*
* __wt_txn_global_set_timestamp --
- * Set a global transaction timestamp.
+ * Set a global transaction timestamp.
*/
int
__wt_txn_global_set_timestamp(WT_SESSION_IMPL *session, const char *cfg[])
{
- WT_CONFIG_ITEM cval;
- WT_CONFIG_ITEM durable_cval, oldest_cval, stable_cval;
- WT_TXN_GLOBAL *txn_global;
- wt_timestamp_t durable_ts, oldest_ts, stable_ts;
- wt_timestamp_t last_oldest_ts, last_stable_ts;
- char ts_string[2][WT_TS_INT_STRING_SIZE];
- bool force, has_durable, has_oldest, has_stable;
-
- txn_global = &S2C(session)->txn_global;
-
- WT_STAT_CONN_INCR(session, txn_set_ts);
-
- /*
- * TODO: When we remove all_committed, we need to remove this too.
- * For now, we're temporarily aliasing the global commit timestamp to
- * the global durable timestamp.
- */
- WT_RET(__wt_config_gets_def(session,
- cfg, "commit_timestamp", 0, &durable_cval));
- has_durable = durable_cval.len != 0;
- if (has_durable)
- WT_STAT_CONN_INCR(session, txn_set_ts_durable);
-
- if (!has_durable) {
- WT_RET(__wt_config_gets_def(session,
- cfg, "durable_timestamp", 0, &durable_cval));
- has_durable = durable_cval.len != 0;
- if (has_durable)
- WT_STAT_CONN_INCR(session, txn_set_ts_durable);
- }
-
- WT_RET(__wt_config_gets_def(session,
- cfg, "oldest_timestamp", 0, &oldest_cval));
- has_oldest = oldest_cval.len != 0;
- if (has_oldest)
- WT_STAT_CONN_INCR(session, txn_set_ts_oldest);
-
- WT_RET(__wt_config_gets_def(session,
- cfg, "stable_timestamp", 0, &stable_cval));
- has_stable = stable_cval.len != 0;
- if (has_stable)
- WT_STAT_CONN_INCR(session, txn_set_ts_stable);
-
- /* If no timestamp was supplied, there's nothing to do. */
- if (!has_durable && !has_oldest && !has_stable)
- return (0);
-
- /*
- * Parsing will initialize the timestamp to zero even if
- * it is not configured.
- */
- WT_RET(__wt_txn_parse_timestamp(
- session, "durable", &durable_ts, &durable_cval));
- WT_RET(__wt_txn_parse_timestamp(
- session, "oldest", &oldest_ts, &oldest_cval));
- WT_RET(__wt_txn_parse_timestamp(
- session, "stable", &stable_ts, &stable_cval));
-
- WT_RET(__wt_config_gets_def(session,
- cfg, "force", 0, &cval));
- force = cval.val != 0;
-
- if (force)
- goto set;
-
- __wt_readlock(session, &txn_global->rwlock);
-
- last_oldest_ts = txn_global->oldest_timestamp;
- last_stable_ts = txn_global->stable_timestamp;
-
- /*
- * First do error checking on the timestamp values. The
- * oldest timestamp must always be less than or equal to
- * the stable timestamp. If we're only setting one
- * then compare against the system timestamp. If we're
- * setting both then compare the passed in values.
- */
- if (!has_durable && txn_global->has_durable_timestamp)
- durable_ts = txn_global->durable_timestamp;
- if (!has_oldest && txn_global->has_oldest_timestamp)
- oldest_ts = last_oldest_ts;
- if (!has_stable && txn_global->has_stable_timestamp)
- stable_ts = last_stable_ts;
-
- /*
- * If a durable timestamp was supplied, check that it is no older than
- * either the stable timestamp or the oldest timestamp.
- */
- if (has_durable && (has_oldest ||
- txn_global->has_oldest_timestamp) && oldest_ts > durable_ts) {
- __wt_readunlock(session, &txn_global->rwlock);
- WT_RET_MSG(session, EINVAL,
- "set_timestamp: oldest timestamp %s must not be later than "
- "durable timestamp %s",
- __wt_timestamp_to_string(oldest_ts, ts_string[0]),
- __wt_timestamp_to_string(durable_ts, ts_string[1]));
- }
-
- if (has_durable && (has_stable ||
- txn_global->has_stable_timestamp) && stable_ts > durable_ts) {
- __wt_readunlock(session, &txn_global->rwlock);
- WT_RET_MSG(session, EINVAL,
- "set_timestamp: stable timestamp %s must not be later than "
- "durable timestamp %s",
- __wt_timestamp_to_string(stable_ts, ts_string[0]),
- __wt_timestamp_to_string(durable_ts, ts_string[1]));
- }
-
- /*
- * The oldest and stable timestamps must always satisfy the condition
- * that oldest <= stable.
- */
- if ((has_oldest || has_stable) &&
- (has_oldest || txn_global->has_oldest_timestamp) &&
- (has_stable ||
- txn_global->has_stable_timestamp) && oldest_ts > stable_ts) {
- __wt_readunlock(session, &txn_global->rwlock);
- WT_RET_MSG(session, EINVAL,
- "set_timestamp: oldest timestamp %s must not be later than "
- "stable timestamp %s",
- __wt_timestamp_to_string(oldest_ts, ts_string[0]),
- __wt_timestamp_to_string(stable_ts, ts_string[1]));
- }
-
- __wt_readunlock(session, &txn_global->rwlock);
-
- /* Check if we are actually updating anything. */
- if (has_oldest &&
- txn_global->has_oldest_timestamp && oldest_ts <= last_oldest_ts)
- has_oldest = false;
-
- if (has_stable &&
- txn_global->has_stable_timestamp && stable_ts <= last_stable_ts)
- has_stable = false;
-
- if (!has_durable && !has_oldest && !has_stable)
- return (0);
-
-set: __wt_writelock(session, &txn_global->rwlock);
- /*
- * This method can be called from multiple threads, check that we are
- * moving the global timestamps forwards.
- *
- * The exception is the durable timestamp, where the application can
- * move it backwards (in fact, it only really makes sense to explicitly
- * move it backwards because it otherwise tracks the largest
- * durable_timestamp so it moves forward whenever transactions are
- * assigned timestamps).
- */
- if (has_durable) {
- txn_global->durable_timestamp = durable_ts;
- txn_global->has_durable_timestamp = true;
- WT_STAT_CONN_INCR(session, txn_set_ts_durable_upd);
- __wt_verbose_timestamp(session, durable_ts,
- "Updated global durable timestamp");
- }
-
- if (has_oldest && (!txn_global->has_oldest_timestamp || force ||
- oldest_ts > txn_global->oldest_timestamp)) {
- txn_global->oldest_timestamp = oldest_ts;
- WT_STAT_CONN_INCR(session, txn_set_ts_oldest_upd);
- txn_global->has_oldest_timestamp = true;
- txn_global->oldest_is_pinned = false;
- __wt_verbose_timestamp(session, oldest_ts,
- "Updated global oldest timestamp");
- }
-
- if (has_stable && (!txn_global->has_stable_timestamp || force ||
- stable_ts > txn_global->stable_timestamp)) {
- txn_global->stable_timestamp = stable_ts;
- WT_STAT_CONN_INCR(session, txn_set_ts_stable_upd);
- txn_global->has_stable_timestamp = true;
- txn_global->stable_is_pinned = false;
- __wt_verbose_timestamp(session, stable_ts,
- "Updated global stable timestamp");
- }
- __wt_writeunlock(session, &txn_global->rwlock);
-
- if (has_oldest || has_stable)
- WT_RET(__wt_txn_update_pinned_timestamp(session, force));
-
- return (0);
+ WT_CONFIG_ITEM cval;
+ WT_CONFIG_ITEM durable_cval, oldest_cval, stable_cval;
+ WT_TXN_GLOBAL *txn_global;
+ wt_timestamp_t durable_ts, oldest_ts, stable_ts;
+ wt_timestamp_t last_oldest_ts, last_stable_ts;
+ char ts_string[2][WT_TS_INT_STRING_SIZE];
+ bool force, has_durable, has_oldest, has_stable;
+
+ txn_global = &S2C(session)->txn_global;
+
+ WT_STAT_CONN_INCR(session, txn_set_ts);
+
+ /*
+ * TODO: When we remove all_committed, we need to remove this too. For now, we're temporarily
+ * aliasing the global commit timestamp to the global durable timestamp.
+ */
+ WT_RET(__wt_config_gets_def(session, cfg, "commit_timestamp", 0, &durable_cval));
+ has_durable = durable_cval.len != 0;
+ if (has_durable)
+ WT_STAT_CONN_INCR(session, txn_set_ts_durable);
+
+ if (!has_durable) {
+ WT_RET(__wt_config_gets_def(session, cfg, "durable_timestamp", 0, &durable_cval));
+ has_durable = durable_cval.len != 0;
+ if (has_durable)
+ WT_STAT_CONN_INCR(session, txn_set_ts_durable);
+ }
+
+ WT_RET(__wt_config_gets_def(session, cfg, "oldest_timestamp", 0, &oldest_cval));
+ has_oldest = oldest_cval.len != 0;
+ if (has_oldest)
+ WT_STAT_CONN_INCR(session, txn_set_ts_oldest);
+
+ WT_RET(__wt_config_gets_def(session, cfg, "stable_timestamp", 0, &stable_cval));
+ has_stable = stable_cval.len != 0;
+ if (has_stable)
+ WT_STAT_CONN_INCR(session, txn_set_ts_stable);
+
+ /* If no timestamp was supplied, there's nothing to do. */
+ if (!has_durable && !has_oldest && !has_stable)
+ return (0);
+
+ /*
+ * Parsing will initialize the timestamp to zero even if it is not configured.
+ */
+ WT_RET(__wt_txn_parse_timestamp(session, "durable", &durable_ts, &durable_cval));
+ WT_RET(__wt_txn_parse_timestamp(session, "oldest", &oldest_ts, &oldest_cval));
+ WT_RET(__wt_txn_parse_timestamp(session, "stable", &stable_ts, &stable_cval));
+
+ WT_RET(__wt_config_gets_def(session, cfg, "force", 0, &cval));
+ force = cval.val != 0;
+
+ if (force)
+ goto set;
+
+ __wt_readlock(session, &txn_global->rwlock);
+
+ last_oldest_ts = txn_global->oldest_timestamp;
+ last_stable_ts = txn_global->stable_timestamp;
+
+ /*
+ * First do error checking on the timestamp values. The oldest timestamp must always be less
+ * than or equal to the stable timestamp. If we're only setting one then compare against the
+ * system timestamp. If we're setting both then compare the passed in values.
+ */
+ if (!has_durable && txn_global->has_durable_timestamp)
+ durable_ts = txn_global->durable_timestamp;
+ if (!has_oldest && txn_global->has_oldest_timestamp)
+ oldest_ts = last_oldest_ts;
+ if (!has_stable && txn_global->has_stable_timestamp)
+ stable_ts = last_stable_ts;
+
+ /*
+ * If a durable timestamp was supplied, check that it is no older than either the stable
+ * timestamp or the oldest timestamp.
+ */
+ if (has_durable && (has_oldest || txn_global->has_oldest_timestamp) && oldest_ts > durable_ts) {
+ __wt_readunlock(session, &txn_global->rwlock);
+ WT_RET_MSG(session, EINVAL,
+ "set_timestamp: oldest timestamp %s must not be later than "
+ "durable timestamp %s",
+ __wt_timestamp_to_string(oldest_ts, ts_string[0]),
+ __wt_timestamp_to_string(durable_ts, ts_string[1]));
+ }
+
+ if (has_durable && (has_stable || txn_global->has_stable_timestamp) && stable_ts > durable_ts) {
+ __wt_readunlock(session, &txn_global->rwlock);
+ WT_RET_MSG(session, EINVAL,
+ "set_timestamp: stable timestamp %s must not be later than "
+ "durable timestamp %s",
+ __wt_timestamp_to_string(stable_ts, ts_string[0]),
+ __wt_timestamp_to_string(durable_ts, ts_string[1]));
+ }
+
+ /*
+ * The oldest and stable timestamps must always satisfy the condition that oldest <= stable.
+ */
+ if ((has_oldest || has_stable) && (has_oldest || txn_global->has_oldest_timestamp) &&
+ (has_stable || txn_global->has_stable_timestamp) && oldest_ts > stable_ts) {
+ __wt_readunlock(session, &txn_global->rwlock);
+ WT_RET_MSG(session, EINVAL,
+ "set_timestamp: oldest timestamp %s must not be later than "
+ "stable timestamp %s",
+ __wt_timestamp_to_string(oldest_ts, ts_string[0]),
+ __wt_timestamp_to_string(stable_ts, ts_string[1]));
+ }
+
+ __wt_readunlock(session, &txn_global->rwlock);
+
+ /* Check if we are actually updating anything. */
+ if (has_oldest && txn_global->has_oldest_timestamp && oldest_ts <= last_oldest_ts)
+ has_oldest = false;
+
+ if (has_stable && txn_global->has_stable_timestamp && stable_ts <= last_stable_ts)
+ has_stable = false;
+
+ if (!has_durable && !has_oldest && !has_stable)
+ return (0);
+
+set:
+ __wt_writelock(session, &txn_global->rwlock);
+ /*
+ * This method can be called from multiple threads, check that we are
+ * moving the global timestamps forwards.
+ *
+ * The exception is the durable timestamp, where the application can
+ * move it backwards (in fact, it only really makes sense to explicitly
+ * move it backwards because it otherwise tracks the largest
+ * durable_timestamp so it moves forward whenever transactions are
+ * assigned timestamps).
+ */
+ if (has_durable) {
+ txn_global->durable_timestamp = durable_ts;
+ txn_global->has_durable_timestamp = true;
+ WT_STAT_CONN_INCR(session, txn_set_ts_durable_upd);
+ __wt_verbose_timestamp(session, durable_ts, "Updated global durable timestamp");
+ }
+
+ if (has_oldest &&
+ (!txn_global->has_oldest_timestamp || force || oldest_ts > txn_global->oldest_timestamp)) {
+ txn_global->oldest_timestamp = oldest_ts;
+ WT_STAT_CONN_INCR(session, txn_set_ts_oldest_upd);
+ txn_global->has_oldest_timestamp = true;
+ txn_global->oldest_is_pinned = false;
+ __wt_verbose_timestamp(session, oldest_ts, "Updated global oldest timestamp");
+ }
+
+ if (has_stable &&
+ (!txn_global->has_stable_timestamp || force || stable_ts > txn_global->stable_timestamp)) {
+ txn_global->stable_timestamp = stable_ts;
+ WT_STAT_CONN_INCR(session, txn_set_ts_stable_upd);
+ txn_global->has_stable_timestamp = true;
+ txn_global->stable_is_pinned = false;
+ __wt_verbose_timestamp(session, stable_ts, "Updated global stable timestamp");
+ }
+ __wt_writeunlock(session, &txn_global->rwlock);
+
+ if (has_oldest || has_stable)
+ WT_RET(__wt_txn_update_pinned_timestamp(session, force));
+
+ return (0);
}
/*
* __txn_assert_after_reads --
- * Assert that commit and prepare timestamps are greater than the latest
- * active read timestamp, if any.
+ * Assert that commit and prepare timestamps are greater than the latest active read timestamp,
+ * if any.
*/
static int
__txn_assert_after_reads(
- WT_SESSION_IMPL *session, const char *op, wt_timestamp_t ts, WT_TXN **prevp)
+ WT_SESSION_IMPL *session, const char *op, wt_timestamp_t ts, WT_TXN **prevp)
{
#ifdef HAVE_DIAGNOSTIC
- WT_TXN *prev, *txn = &session->txn;
- WT_TXN_GLOBAL *txn_global = &S2C(session)->txn_global;
- wt_timestamp_t tmp_timestamp;
- char ts_string[2][WT_TS_INT_STRING_SIZE];
-
- __wt_readlock(session, &txn_global->read_timestamp_rwlock);
- prev = TAILQ_LAST(
- &txn_global->read_timestamph, __wt_txn_rts_qh);
- while (prev != NULL) {
- /*
- * Skip self and non-active transactions. Copy out value of
- * read timestamp to prevent possible race where a transaction
- * resets its read timestamp while we traverse the queue.
- */
- if (!__txn_get_read_timestamp(prev, &tmp_timestamp) ||
- prev == txn) {
- prev = TAILQ_PREV(
- prev, __wt_txn_rts_qh, read_timestampq);
- continue;
- }
-
- if (tmp_timestamp >= ts) {
- __wt_readunlock(session,
- &txn_global->read_timestamp_rwlock);
- WT_RET_MSG(session, EINVAL,
- "%s timestamp %s must be greater than the "
- "latest active read timestamp %s ",
- op,
- __wt_timestamp_to_string(ts, ts_string[0]),
- __wt_timestamp_to_string(
- tmp_timestamp, ts_string[1]));
- }
- break;
- }
-
- __wt_readunlock(session, &txn_global->read_timestamp_rwlock);
-
- if (prevp != NULL)
- *prevp = prev;
+ WT_TXN *prev, *txn = &session->txn;
+ WT_TXN_GLOBAL *txn_global = &S2C(session)->txn_global;
+ wt_timestamp_t tmp_timestamp;
+ char ts_string[2][WT_TS_INT_STRING_SIZE];
+
+ __wt_readlock(session, &txn_global->read_timestamp_rwlock);
+ prev = TAILQ_LAST(&txn_global->read_timestamph, __wt_txn_rts_qh);
+ while (prev != NULL) {
+ /*
+ * Skip self and non-active transactions. Copy out value of read timestamp to prevent
+ * possible race where a transaction resets its read timestamp while we traverse the queue.
+ */
+ if (!__txn_get_read_timestamp(prev, &tmp_timestamp) || prev == txn) {
+ prev = TAILQ_PREV(prev, __wt_txn_rts_qh, read_timestampq);
+ continue;
+ }
+
+ if (tmp_timestamp >= ts) {
+ __wt_readunlock(session, &txn_global->read_timestamp_rwlock);
+ WT_RET_MSG(session, EINVAL,
+ "%s timestamp %s must be greater than the "
+ "latest active read timestamp %s ",
+ op, __wt_timestamp_to_string(ts, ts_string[0]),
+ __wt_timestamp_to_string(tmp_timestamp, ts_string[1]));
+ }
+ break;
+ }
+
+ __wt_readunlock(session, &txn_global->read_timestamp_rwlock);
+
+ if (prevp != NULL)
+ *prevp = prev;
#else
- WT_UNUSED(session);
- WT_UNUSED(op);
- WT_UNUSED(ts);
- WT_UNUSED(prevp);
+ WT_UNUSED(session);
+ WT_UNUSED(op);
+ WT_UNUSED(ts);
+ WT_UNUSED(prevp);
#endif
- return (0);
+ return (0);
}
/*
* __wt_txn_set_commit_timestamp --
- * Validate the commit timestamp of a transaction.
- * If the commit timestamp is less than the oldest timestamp and
- * transaction is configured to roundup timestamps of a prepared
- * transaction, then we will roundup the commit timestamp to the prepare
- * timestamp of the transaction.
+ * Validate the commit timestamp of a transaction. If the commit timestamp is less than the
+ * oldest timestamp and transaction is configured to roundup timestamps of a prepared
+ * transaction, then we will roundup the commit timestamp to the prepare timestamp of the
+ * transaction.
*/
int
-__wt_txn_set_commit_timestamp(
- WT_SESSION_IMPL *session, wt_timestamp_t commit_ts)
+__wt_txn_set_commit_timestamp(WT_SESSION_IMPL *session, wt_timestamp_t commit_ts)
{
- WT_TXN *txn = &session->txn;
- WT_TXN_GLOBAL *txn_global = &S2C(session)->txn_global;
- wt_timestamp_t oldest_ts, stable_ts;
- char ts_string[2][WT_TS_INT_STRING_SIZE];
- bool has_oldest_ts, has_stable_ts;
-
- /* Added this redundant initialization to circumvent build failure. */
- oldest_ts = stable_ts = WT_TS_NONE;
-
- if (txn->isolation != WT_ISO_SNAPSHOT)
- WT_RET_MSG(session, EINVAL, "setting a commit_timestamp"
- " requires a transaction running at snapshot"
- " isolation");
-
- /*
- * Compare against the oldest and the stable timestamp. Return an error
- * if the given timestamp is less than oldest and/or stable timestamp.
- */
- has_oldest_ts = txn_global->has_oldest_timestamp;
- if (has_oldest_ts)
- oldest_ts = txn_global->oldest_timestamp;
- has_stable_ts = txn_global->has_stable_timestamp;
- if (has_stable_ts)
- stable_ts = txn_global->stable_timestamp;
-
- if (!F_ISSET(txn, WT_TXN_HAS_TS_PREPARE)) {
- /*
- * For a non-prepared transactions the commit timestamp should
- * not be less than the stable timestamp.
- */
- if (has_oldest_ts && commit_ts < oldest_ts)
- WT_RET_MSG(session, EINVAL,
- "commit timestamp %s is less than the oldest "
- "timestamp %s",
- __wt_timestamp_to_string(commit_ts, ts_string[0]),
- __wt_timestamp_to_string(oldest_ts, ts_string[1]));
-
- if (has_stable_ts && commit_ts < stable_ts)
- WT_RET_MSG(session, EINVAL,
- "commit timestamp %s is less than the stable "
- "timestamp %s",
- __wt_timestamp_to_string(commit_ts, ts_string[0]),
- __wt_timestamp_to_string(stable_ts, ts_string[1]));
-
- /*
- * Compare against the commit timestamp of the current
- * transaction. Return an error if the given timestamp is
- * older than the first commit timestamp.
- */
- if (F_ISSET(txn, WT_TXN_HAS_TS_COMMIT) &&
- commit_ts < txn->first_commit_timestamp)
- WT_RET_MSG(session, EINVAL,
- "commit timestamp %s older than the first "
- "commit timestamp %s for this transaction",
- __wt_timestamp_to_string(commit_ts, ts_string[0]),
- __wt_timestamp_to_string(
- txn->first_commit_timestamp, ts_string[1]));
-
- /*
- * FIXME:
- * WT-4779 disabled to buy time to understand a test failure.
- * WT_RET(__txn_assert_after_reads(
- * session, "commit", commit_ts, NULL));
- */
- } else {
- /*
- * For a prepared transaction, the commit timestamp should not
- * be less than the prepare timestamp.
- */
- if (txn->prepare_timestamp > commit_ts) {
- if (!F_ISSET(txn, WT_TXN_TS_ROUND_PREPARED))
- WT_RET_MSG(session, EINVAL,
- "commit timestamp %s is less than the "
- "prepare timestamp %s for this transaction",
- __wt_timestamp_to_string(
- commit_ts, ts_string[0]),
- __wt_timestamp_to_string(
- txn->prepare_timestamp, ts_string[1]));
- commit_ts = txn->prepare_timestamp;
- }
- }
-
- WT_ASSERT(session, !F_ISSET(txn, WT_TXN_HAS_TS_DURABLE) ||
- txn->durable_timestamp == txn->commit_timestamp);
- txn->commit_timestamp = commit_ts;
- /*
- * First time copy the commit timestamp to the first commit timestamp.
- */
- if (!F_ISSET(txn, WT_TXN_HAS_TS_COMMIT))
- txn->first_commit_timestamp = commit_ts;
-
- /*
- * Only mirror the commit timestamp if there isn't already an explicit
- * durable timestamp. This might happen if we set a commit timestamp,
- * set a durable timestamp and then subsequently set the commit
- * timestamp again.
- */
- if (!F_ISSET(txn, WT_TXN_HAS_TS_DURABLE))
- txn->durable_timestamp = commit_ts;
-
- F_SET(txn, WT_TXN_HAS_TS_COMMIT);
- return (0);
+ WT_TXN *txn = &session->txn;
+ WT_TXN_GLOBAL *txn_global = &S2C(session)->txn_global;
+ wt_timestamp_t oldest_ts, stable_ts;
+ char ts_string[2][WT_TS_INT_STRING_SIZE];
+ bool has_oldest_ts, has_stable_ts;
+
+ /* Added this redundant initialization to circumvent build failure. */
+ oldest_ts = stable_ts = WT_TS_NONE;
+
+ if (txn->isolation != WT_ISO_SNAPSHOT)
+ WT_RET_MSG(session, EINVAL,
+ "setting a commit_timestamp"
+ " requires a transaction running at snapshot"
+ " isolation");
+
+ /*
+ * Compare against the oldest and the stable timestamp. Return an error if the given timestamp
+ * is less than oldest and/or stable timestamp.
+ */
+ has_oldest_ts = txn_global->has_oldest_timestamp;
+ if (has_oldest_ts)
+ oldest_ts = txn_global->oldest_timestamp;
+ has_stable_ts = txn_global->has_stable_timestamp;
+ if (has_stable_ts)
+ stable_ts = txn_global->stable_timestamp;
+
+ if (!F_ISSET(txn, WT_TXN_HAS_TS_PREPARE)) {
+ /*
+ * For a non-prepared transactions the commit timestamp should not be less than the stable
+ * timestamp.
+ */
+ if (has_oldest_ts && commit_ts < oldest_ts)
+ WT_RET_MSG(session, EINVAL,
+ "commit timestamp %s is less than the oldest "
+ "timestamp %s",
+ __wt_timestamp_to_string(commit_ts, ts_string[0]),
+ __wt_timestamp_to_string(oldest_ts, ts_string[1]));
+
+ if (has_stable_ts && commit_ts < stable_ts)
+ WT_RET_MSG(session, EINVAL,
+ "commit timestamp %s is less than the stable "
+ "timestamp %s",
+ __wt_timestamp_to_string(commit_ts, ts_string[0]),
+ __wt_timestamp_to_string(stable_ts, ts_string[1]));
+
+ /*
+ * Compare against the commit timestamp of the current transaction. Return an error if the
+ * given timestamp is older than the first commit timestamp.
+ */
+ if (F_ISSET(txn, WT_TXN_HAS_TS_COMMIT) && commit_ts < txn->first_commit_timestamp)
+ WT_RET_MSG(session, EINVAL,
+ "commit timestamp %s older than the first "
+ "commit timestamp %s for this transaction",
+ __wt_timestamp_to_string(commit_ts, ts_string[0]),
+ __wt_timestamp_to_string(txn->first_commit_timestamp, ts_string[1]));
+
+ /*
+ * FIXME:
+ * WT-4779 disabled to buy time to understand a test failure.
+ * WT_RET(__txn_assert_after_reads(
+ * session, "commit", commit_ts, NULL));
+ */
+ } else {
+ /*
+ * For a prepared transaction, the commit timestamp should not be less than the prepare
+ * timestamp.
+ */
+ if (txn->prepare_timestamp > commit_ts) {
+ if (!F_ISSET(txn, WT_TXN_TS_ROUND_PREPARED))
+ WT_RET_MSG(session, EINVAL,
+ "commit timestamp %s is less than the "
+ "prepare timestamp %s for this transaction",
+ __wt_timestamp_to_string(commit_ts, ts_string[0]),
+ __wt_timestamp_to_string(txn->prepare_timestamp, ts_string[1]));
+ commit_ts = txn->prepare_timestamp;
+ }
+ }
+
+ WT_ASSERT(session,
+ !F_ISSET(txn, WT_TXN_HAS_TS_DURABLE) || txn->durable_timestamp == txn->commit_timestamp);
+ txn->commit_timestamp = commit_ts;
+ /*
+ * First time copy the commit timestamp to the first commit timestamp.
+ */
+ if (!F_ISSET(txn, WT_TXN_HAS_TS_COMMIT))
+ txn->first_commit_timestamp = commit_ts;
+
+ /*
+ * Only mirror the commit timestamp if there isn't already an explicit durable timestamp. This
+ * might happen if we set a commit timestamp, set a durable timestamp and then subsequently set
+ * the commit timestamp again.
+ */
+ if (!F_ISSET(txn, WT_TXN_HAS_TS_DURABLE))
+ txn->durable_timestamp = commit_ts;
+
+ F_SET(txn, WT_TXN_HAS_TS_COMMIT);
+ return (0);
}
/*
* __wt_txn_set_durable_timestamp --
- * Validate the durable timestamp of a transaction.
+ * Validate the durable timestamp of a transaction.
*/
int
-__wt_txn_set_durable_timestamp(
- WT_SESSION_IMPL *session, wt_timestamp_t durable_ts)
+__wt_txn_set_durable_timestamp(WT_SESSION_IMPL *session, wt_timestamp_t durable_ts)
{
- WT_TXN *txn = &session->txn;
- WT_TXN_GLOBAL *txn_global = &S2C(session)->txn_global;
- wt_timestamp_t oldest_ts, stable_ts;
- char ts_string[2][WT_TS_INT_STRING_SIZE];
- bool has_oldest_ts, has_stable_ts;
-
- /* Added this redundant initialization to circumvent build failure. */
- oldest_ts = stable_ts = 0;
-
- if (!F_ISSET(txn, WT_TXN_PREPARE))
- WT_RET_MSG(session, EINVAL,
- "durable timestamp should not be specified for "
- "non-prepared transaction");
-
- if (!F_ISSET(txn, WT_TXN_HAS_TS_COMMIT))
- WT_RET_MSG(session, EINVAL,
- "commit timestamp is needed before the durable timestamp");
-
- /*
- * Compare against the oldest and the stable timestamp. Return an error
- * if the given timestamp is less than oldest and/or stable timestamp.
- */
- has_oldest_ts = txn_global->has_oldest_timestamp;
- if (has_oldest_ts)
- oldest_ts = txn_global->oldest_timestamp;
- has_stable_ts = txn_global->has_stable_timestamp;
- if (has_stable_ts)
- stable_ts = txn_global->stable_timestamp;
-
- /*
- * For a non-prepared transactions the commit timestamp should
- * not be less than the stable timestamp.
- */
- if (has_oldest_ts && durable_ts < oldest_ts)
- WT_RET_MSG(session, EINVAL,
- "durable timestamp %s is less than the oldest timestamp %s",
- __wt_timestamp_to_string(durable_ts, ts_string[0]),
- __wt_timestamp_to_string(oldest_ts, ts_string[1]));
-
- if (has_stable_ts && durable_ts < stable_ts)
- WT_RET_MSG(session, EINVAL,
- "durable timestamp %s is less than the stable timestamp %s",
- __wt_timestamp_to_string(durable_ts, ts_string[0]),
- __wt_timestamp_to_string(stable_ts, ts_string[1]));
-
- /* Check if the durable timestamp is less than the commit timestamp. */
- if (durable_ts < txn->commit_timestamp)
- WT_RET_MSG(session, EINVAL,
- "durable timestamp %s is less than the commit timestamp %s "
- "for this transaction",
- __wt_timestamp_to_string(durable_ts, ts_string[0]),
- __wt_timestamp_to_string(
- txn->commit_timestamp, ts_string[1]));
-
- txn->durable_timestamp = durable_ts;
- F_SET(txn, WT_TXN_HAS_TS_DURABLE);
-
- return (0);
+ WT_TXN *txn = &session->txn;
+ WT_TXN_GLOBAL *txn_global = &S2C(session)->txn_global;
+ wt_timestamp_t oldest_ts, stable_ts;
+ char ts_string[2][WT_TS_INT_STRING_SIZE];
+ bool has_oldest_ts, has_stable_ts;
+
+ /* Added this redundant initialization to circumvent build failure. */
+ oldest_ts = stable_ts = 0;
+
+ if (!F_ISSET(txn, WT_TXN_PREPARE))
+ WT_RET_MSG(session, EINVAL,
+ "durable timestamp should not be specified for "
+ "non-prepared transaction");
+
+ if (!F_ISSET(txn, WT_TXN_HAS_TS_COMMIT))
+ WT_RET_MSG(session, EINVAL, "commit timestamp is needed before the durable timestamp");
+
+ /*
+ * Compare against the oldest and the stable timestamp. Return an error if the given timestamp
+ * is less than oldest and/or stable timestamp.
+ */
+ has_oldest_ts = txn_global->has_oldest_timestamp;
+ if (has_oldest_ts)
+ oldest_ts = txn_global->oldest_timestamp;
+ has_stable_ts = txn_global->has_stable_timestamp;
+ if (has_stable_ts)
+ stable_ts = txn_global->stable_timestamp;
+
+ /*
+ * For a non-prepared transactions the commit timestamp should not be less than the stable
+ * timestamp.
+ */
+ if (has_oldest_ts && durable_ts < oldest_ts)
+ WT_RET_MSG(session, EINVAL, "durable timestamp %s is less than the oldest timestamp %s",
+ __wt_timestamp_to_string(durable_ts, ts_string[0]),
+ __wt_timestamp_to_string(oldest_ts, ts_string[1]));
+
+ if (has_stable_ts && durable_ts < stable_ts)
+ WT_RET_MSG(session, EINVAL, "durable timestamp %s is less than the stable timestamp %s",
+ __wt_timestamp_to_string(durable_ts, ts_string[0]),
+ __wt_timestamp_to_string(stable_ts, ts_string[1]));
+
+ /* Check if the durable timestamp is less than the commit timestamp. */
+ if (durable_ts < txn->commit_timestamp)
+ WT_RET_MSG(session, EINVAL,
+ "durable timestamp %s is less than the commit timestamp %s "
+ "for this transaction",
+ __wt_timestamp_to_string(durable_ts, ts_string[0]),
+ __wt_timestamp_to_string(txn->commit_timestamp, ts_string[1]));
+
+ txn->durable_timestamp = durable_ts;
+ F_SET(txn, WT_TXN_HAS_TS_DURABLE);
+
+ return (0);
}
/*
* __wt_txn_set_prepare_timestamp --
- * Validate and set the prepare timestamp of a transaction.
+ * Validate and set the prepare timestamp of a transaction.
*/
int
-__wt_txn_set_prepare_timestamp(
- WT_SESSION_IMPL *session, wt_timestamp_t prepare_ts)
+__wt_txn_set_prepare_timestamp(WT_SESSION_IMPL *session, wt_timestamp_t prepare_ts)
{
- WT_TXN *prev, *txn = &session->txn;
- WT_TXN_GLOBAL *txn_global = &S2C(session)->txn_global;
- wt_timestamp_t oldest_ts;
- char ts_string[2][WT_TS_INT_STRING_SIZE];
-
- WT_RET(__wt_txn_context_prepare_check(session));
-
- if (F_ISSET(txn, WT_TXN_HAS_TS_PREPARE))
- WT_RET_MSG(session, EINVAL, "prepare timestamp is already set");
-
- if (F_ISSET(txn, WT_TXN_HAS_TS_COMMIT))
- WT_RET_MSG(session, EINVAL, "commit timestamp "
- "should not have been set before the prepare timestamp");
-
- WT_RET(__txn_assert_after_reads(session, "prepare", prepare_ts, &prev));
-
- /*
- * Check whether the prepare timestamp is less than the oldest
- * timestamp.
- */
- oldest_ts = txn_global->oldest_timestamp;
- if (prepare_ts < oldest_ts) {
- /*
- * Check whether the prepare timestamp needs to be rounded up to
- * the oldest timestamp.
- */
- if (F_ISSET(txn, WT_TXN_TS_ROUND_PREPARED)) {
- /*
- * Check that there are no active readers. That would
- * be a violation of preconditions for rounding
- * timestamps of prepared transactions.
- */
- WT_ASSERT(session, prev == NULL);
-
- __wt_verbose(session, WT_VERB_TIMESTAMP,
- "prepare timestamp %s rounded to oldest "
- "timestamp %s",
- __wt_timestamp_to_string(prepare_ts, ts_string[0]),
- __wt_timestamp_to_string(oldest_ts, ts_string[1]));
-
- prepare_ts = oldest_ts;
- } else
- WT_RET_MSG(session, EINVAL,
- "prepare timestamp %s is older than the oldest "
- "timestamp %s",
- __wt_timestamp_to_string(prepare_ts, ts_string[0]),
- __wt_timestamp_to_string(oldest_ts, ts_string[1]));
- }
- txn->prepare_timestamp = prepare_ts;
- F_SET(txn, WT_TXN_HAS_TS_PREPARE);
-
- return (0);
+ WT_TXN *prev, *txn = &session->txn;
+ WT_TXN_GLOBAL *txn_global = &S2C(session)->txn_global;
+ wt_timestamp_t oldest_ts;
+ char ts_string[2][WT_TS_INT_STRING_SIZE];
+
+ WT_RET(__wt_txn_context_prepare_check(session));
+
+ if (F_ISSET(txn, WT_TXN_HAS_TS_PREPARE))
+ WT_RET_MSG(session, EINVAL, "prepare timestamp is already set");
+
+ if (F_ISSET(txn, WT_TXN_HAS_TS_COMMIT))
+ WT_RET_MSG(session, EINVAL,
+ "commit timestamp "
+ "should not have been set before the prepare timestamp");
+
+ WT_RET(__txn_assert_after_reads(session, "prepare", prepare_ts, &prev));
+
+ /*
+ * Check whether the prepare timestamp is less than the oldest timestamp.
+ */
+ oldest_ts = txn_global->oldest_timestamp;
+ if (prepare_ts < oldest_ts) {
+ /*
+ * Check whether the prepare timestamp needs to be rounded up to the oldest timestamp.
+ */
+ if (F_ISSET(txn, WT_TXN_TS_ROUND_PREPARED)) {
+ /*
+ * Check that there are no active readers. That would be a violation of preconditions
+ * for rounding timestamps of prepared transactions.
+ */
+ WT_ASSERT(session, prev == NULL);
+
+ __wt_verbose(session, WT_VERB_TIMESTAMP,
+ "prepare timestamp %s rounded to oldest "
+ "timestamp %s",
+ __wt_timestamp_to_string(prepare_ts, ts_string[0]),
+ __wt_timestamp_to_string(oldest_ts, ts_string[1]));
+
+ prepare_ts = oldest_ts;
+ } else
+ WT_RET_MSG(session, EINVAL,
+ "prepare timestamp %s is older than the oldest "
+ "timestamp %s",
+ __wt_timestamp_to_string(prepare_ts, ts_string[0]),
+ __wt_timestamp_to_string(oldest_ts, ts_string[1]));
+ }
+ txn->prepare_timestamp = prepare_ts;
+ F_SET(txn, WT_TXN_HAS_TS_PREPARE);
+
+ return (0);
}
/*
* __wt_txn_set_read_timestamp --
- * Parse a request to set a transaction's read_timestamp.
+ * Parse a request to set a transaction's read_timestamp.
*/
int
-__wt_txn_set_read_timestamp(
- WT_SESSION_IMPL *session, wt_timestamp_t read_ts)
+__wt_txn_set_read_timestamp(WT_SESSION_IMPL *session, wt_timestamp_t read_ts)
{
- WT_TXN *txn = &session->txn;
- WT_TXN_GLOBAL *txn_global = &S2C(session)->txn_global;
- wt_timestamp_t ts_oldest;
- char ts_string[2][WT_TS_INT_STRING_SIZE];
- bool did_roundup_to_oldest;
-
- WT_RET(__wt_txn_context_prepare_check(session));
-
- /* Read timestamps imply / require snapshot isolation. */
- if (!F_ISSET(txn, WT_TXN_RUNNING))
- txn->isolation = WT_ISO_SNAPSHOT;
- else if (txn->isolation != WT_ISO_SNAPSHOT)
- WT_RET_MSG(session, EINVAL, "setting a read_timestamp"
- " requires a transaction running at snapshot"
- " isolation");
-
- /* Read timestamps can't change once set. */
- if (F_ISSET(txn, WT_TXN_HAS_TS_READ))
- WT_RET_MSG(session, EINVAL, "a read_timestamp"
- " may only be set once per transaction");
-
- /*
- * This code is not using the timestamp validate function to
- * avoid a race between checking and setting transaction
- * timestamp.
- */
- __wt_readlock(session, &txn_global->rwlock);
- ts_oldest = txn_global->oldest_timestamp;
- did_roundup_to_oldest = false;
- if (read_ts < ts_oldest) {
- /*
- * If given read timestamp is earlier than oldest
- * timestamp then round the read timestamp to
- * oldest timestamp.
- */
- if (F_ISSET(txn, WT_TXN_TS_ROUND_READ)) {
- txn->read_timestamp = ts_oldest;
- did_roundup_to_oldest = true;
- } else {
- __wt_readunlock(session, &txn_global->rwlock);
-
- /*
- * In some cases, MongoDB sets a read timestamp older
- * than the oldest timestamp, relying on WiredTiger's
- * concurrency to detect and fail the set. In other
- * cases it's a bug and MongoDB wants error context to
- * make it easier to find those problems. Don't output
- * an error message because that logs a MongoDB error,
- * use an informational message to provide the context
- * instead.
- */
- WT_RET(__wt_msg(session, "read timestamp "
- "%s less than the oldest timestamp %s",
- __wt_timestamp_to_string(read_ts, ts_string[0]),
- __wt_timestamp_to_string(ts_oldest, ts_string[1])));
- return (EINVAL);
- }
- } else
- txn->read_timestamp = read_ts;
-
- __wt_txn_publish_read_timestamp(session);
- __wt_readunlock(session, &txn_global->rwlock);
-
- /*
- * This message is generated here to reduce the span of critical
- * section.
- */
- if (did_roundup_to_oldest)
- __wt_verbose(session, WT_VERB_TIMESTAMP, "read "
- "timestamp %s : rounded to oldest timestamp %s",
- __wt_timestamp_to_string(read_ts, ts_string[0]),
- __wt_timestamp_to_string(ts_oldest, ts_string[1]));
-
- /*
- * If we already have a snapshot, it may be too early to match
- * the timestamp (including the one we just read, if rounding
- * to oldest). Get a new one.
- */
- if (F_ISSET(txn, WT_TXN_RUNNING))
- __wt_txn_get_snapshot(session);
-
- return (0);
+ WT_TXN *txn = &session->txn;
+ WT_TXN_GLOBAL *txn_global = &S2C(session)->txn_global;
+ wt_timestamp_t ts_oldest;
+ char ts_string[2][WT_TS_INT_STRING_SIZE];
+ bool did_roundup_to_oldest;
+
+ WT_RET(__wt_txn_context_prepare_check(session));
+
+ /* Read timestamps imply / require snapshot isolation. */
+ if (!F_ISSET(txn, WT_TXN_RUNNING))
+ txn->isolation = WT_ISO_SNAPSHOT;
+ else if (txn->isolation != WT_ISO_SNAPSHOT)
+ WT_RET_MSG(session, EINVAL,
+ "setting a read_timestamp"
+ " requires a transaction running at snapshot"
+ " isolation");
+
+ /* Read timestamps can't change once set. */
+ if (F_ISSET(txn, WT_TXN_HAS_TS_READ))
+ WT_RET_MSG(session, EINVAL,
+ "a read_timestamp"
+ " may only be set once per transaction");
+
+ /*
+ * This code is not using the timestamp validate function to avoid a race between checking and
+ * setting transaction timestamp.
+ */
+ __wt_readlock(session, &txn_global->rwlock);
+ ts_oldest = txn_global->oldest_timestamp;
+ did_roundup_to_oldest = false;
+ if (read_ts < ts_oldest) {
+ /*
+ * If given read timestamp is earlier than oldest timestamp then round the read timestamp to
+ * oldest timestamp.
+ */
+ if (F_ISSET(txn, WT_TXN_TS_ROUND_READ)) {
+ txn->read_timestamp = ts_oldest;
+ did_roundup_to_oldest = true;
+ } else {
+ __wt_readunlock(session, &txn_global->rwlock);
+
+ /*
+ * In some cases, MongoDB sets a read timestamp older than the oldest timestamp, relying
+ * on WiredTiger's concurrency to detect and fail the set. In other cases it's a bug and
+ * MongoDB wants error context to make it easier to find those problems. Don't output an
+ * error message because that logs a MongoDB error, use an informational message to
+ * provide the context instead.
+ */
+ WT_RET(__wt_msg(session,
+ "read timestamp "
+ "%s less than the oldest timestamp %s",
+ __wt_timestamp_to_string(read_ts, ts_string[0]),
+ __wt_timestamp_to_string(ts_oldest, ts_string[1])));
+ return (EINVAL);
+ }
+ } else
+ txn->read_timestamp = read_ts;
+
+ __wt_txn_publish_read_timestamp(session);
+ __wt_readunlock(session, &txn_global->rwlock);
+
+ /*
+ * This message is generated here to reduce the span of critical section.
+ */
+ if (did_roundup_to_oldest)
+ __wt_verbose(session, WT_VERB_TIMESTAMP,
+ "read "
+ "timestamp %s : rounded to oldest timestamp %s",
+ __wt_timestamp_to_string(read_ts, ts_string[0]),
+ __wt_timestamp_to_string(ts_oldest, ts_string[1]));
+
+ /*
+ * If we already have a snapshot, it may be too early to match the timestamp (including the one
+ * we just read, if rounding to oldest). Get a new one.
+ */
+ if (F_ISSET(txn, WT_TXN_RUNNING))
+ __wt_txn_get_snapshot(session);
+
+ return (0);
}
/*
* __wt_txn_set_timestamp --
- * Parse a request to set a timestamp in a transaction.
+ * Parse a request to set a timestamp in a transaction.
*/
int
__wt_txn_set_timestamp(WT_SESSION_IMPL *session, const char *cfg[])
{
- WT_CONFIG_ITEM cval;
- WT_DECL_RET;
- wt_timestamp_t ts;
- bool set_ts;
-
- set_ts = false;
- WT_TRET(__wt_txn_context_check(session, true));
-
- /* Look for a commit timestamp. */
- ret = __wt_config_gets_def(session, cfg, "commit_timestamp", 0, &cval);
- WT_RET_NOTFOUND_OK(ret);
- if (ret == 0 && cval.len != 0) {
- WT_RET(__wt_txn_parse_timestamp(session, "commit", &ts, &cval));
- WT_RET(__wt_txn_set_commit_timestamp(session, ts));
- set_ts = true;
- }
-
- /*
- * Look for a durable timestamp. Durable timestamp should be set only
- * after setting the commit timestamp.
- */
- ret = __wt_config_gets_def(
- session, cfg, "durable_timestamp", 0, &cval);
- WT_RET_NOTFOUND_OK(ret);
- if (ret == 0 && cval.len != 0) {
- WT_RET(__wt_txn_parse_timestamp(
- session, "durable", &ts, &cval));
- WT_RET(__wt_txn_set_durable_timestamp(session, ts));
- }
-
- __wt_txn_publish_timestamp(session);
-
- /* Look for a read timestamp. */
- WT_RET(__wt_config_gets_def(session, cfg, "read_timestamp", 0, &cval));
- if (ret == 0 && cval.len != 0) {
- WT_RET(__wt_txn_parse_timestamp(session, "read", &ts, &cval));
- set_ts = true;
- WT_RET(__wt_txn_set_read_timestamp(session, ts));
- }
-
- /* Look for a prepare timestamp. */
- WT_RET(__wt_config_gets_def(session,
- cfg, "prepare_timestamp", 0, &cval));
- if (ret == 0 && cval.len != 0) {
- WT_RET(__wt_txn_parse_timestamp(
- session, "prepare", &ts, &cval));
- WT_RET(__wt_txn_set_prepare_timestamp(session, ts));
- }
- if (set_ts)
- WT_RET(__wt_txn_ts_log(session));
-
- return (0);
+ WT_CONFIG_ITEM cval;
+ WT_DECL_RET;
+ wt_timestamp_t ts;
+ bool set_ts;
+
+ set_ts = false;
+ WT_TRET(__wt_txn_context_check(session, true));
+
+ /* Look for a commit timestamp. */
+ ret = __wt_config_gets_def(session, cfg, "commit_timestamp", 0, &cval);
+ WT_RET_NOTFOUND_OK(ret);
+ if (ret == 0 && cval.len != 0) {
+ WT_RET(__wt_txn_parse_timestamp(session, "commit", &ts, &cval));
+ WT_RET(__wt_txn_set_commit_timestamp(session, ts));
+ set_ts = true;
+ }
+
+ /*
+ * Look for a durable timestamp. Durable timestamp should be set only after setting the commit
+ * timestamp.
+ */
+ ret = __wt_config_gets_def(session, cfg, "durable_timestamp", 0, &cval);
+ WT_RET_NOTFOUND_OK(ret);
+ if (ret == 0 && cval.len != 0) {
+ WT_RET(__wt_txn_parse_timestamp(session, "durable", &ts, &cval));
+ WT_RET(__wt_txn_set_durable_timestamp(session, ts));
+ }
+
+ __wt_txn_publish_timestamp(session);
+
+ /* Look for a read timestamp. */
+ WT_RET(__wt_config_gets_def(session, cfg, "read_timestamp", 0, &cval));
+ if (ret == 0 && cval.len != 0) {
+ WT_RET(__wt_txn_parse_timestamp(session, "read", &ts, &cval));
+ set_ts = true;
+ WT_RET(__wt_txn_set_read_timestamp(session, ts));
+ }
+
+ /* Look for a prepare timestamp. */
+ WT_RET(__wt_config_gets_def(session, cfg, "prepare_timestamp", 0, &cval));
+ if (ret == 0 && cval.len != 0) {
+ WT_RET(__wt_txn_parse_timestamp(session, "prepare", &ts, &cval));
+ WT_RET(__wt_txn_set_prepare_timestamp(session, ts));
+ }
+ if (set_ts)
+ WT_RET(__wt_txn_ts_log(session));
+
+ return (0);
}
/*
* __wt_txn_publish_timestamp --
- * Publish a transaction's timestamp to the durable queue.
+ * Publish a transaction's timestamp to the durable queue.
*/
void
__wt_txn_publish_timestamp(WT_SESSION_IMPL *session)
{
- WT_TXN *qtxn, *txn, *txn_tmp;
- WT_TXN_GLOBAL *txn_global;
- wt_timestamp_t ts;
- uint64_t walked;
-
- txn = &session->txn;
- txn_global = &S2C(session)->txn_global;
-
- if (F_ISSET(txn, WT_TXN_TS_PUBLISHED))
- return;
-
- if (F_ISSET(txn, WT_TXN_HAS_TS_DURABLE))
- ts = txn->durable_timestamp;
- else if (F_ISSET(txn, WT_TXN_HAS_TS_COMMIT)) {
- /*
- * If we know for a fact that this is a prepared transaction and
- * we only have a commit timestamp, don't add to the durable
- * queue. If we poll all_durable after setting the commit
- * timestamp of a prepared transaction, that prepared
- * transaction should NOT be visible.
- */
- if (F_ISSET(txn, WT_TXN_PREPARE))
- return;
- ts = txn->commit_timestamp;
- } else
- return;
-
- __wt_writelock(session, &txn_global->durable_timestamp_rwlock);
- /*
- * If our transaction is on the queue remove it first. The timestamp
- * may move earlier so we otherwise might not remove ourselves before
- * finding where to insert ourselves (which would result in a list
- * loop) and we don't want to walk more of the list than needed.
- */
- if (txn->clear_durable_q) {
- TAILQ_REMOVE(&txn_global->durable_timestamph,
- txn, durable_timestampq);
- WT_PUBLISH(txn->clear_durable_q, false);
- --txn_global->durable_timestampq_len;
- }
- /*
- * Walk the list to look for where to insert our own transaction
- * and remove any transactions that are not active. We stop when
- * we get to the location where we want to insert.
- */
- if (TAILQ_EMPTY(&txn_global->durable_timestamph)) {
- TAILQ_INSERT_HEAD(
- &txn_global->durable_timestamph, txn, durable_timestampq);
- WT_STAT_CONN_INCR(session, txn_durable_queue_empty);
- } else {
- /* Walk from the start, removing cleared entries. */
- walked = 0;
- TAILQ_FOREACH_SAFE(qtxn, &txn_global->durable_timestamph,
- durable_timestampq, txn_tmp) {
- ++walked;
- /*
- * Stop on the first entry that we cannot clear.
- */
- if (!qtxn->clear_durable_q)
- break;
-
- TAILQ_REMOVE(&txn_global->durable_timestamph,
- qtxn, durable_timestampq);
- WT_PUBLISH(qtxn->clear_durable_q, false);
- --txn_global->durable_timestampq_len;
- }
-
- /*
- * Now walk backwards from the end to find the correct position
- * for the insert.
- */
- qtxn = TAILQ_LAST(
- &txn_global->durable_timestamph, __wt_txn_dts_qh);
- while (qtxn != NULL &&
- __txn_get_published_timestamp(session, qtxn) > ts) {
- ++walked;
- qtxn = TAILQ_PREV(
- qtxn, __wt_txn_dts_qh, durable_timestampq);
- }
- if (qtxn == NULL) {
- TAILQ_INSERT_HEAD(&txn_global->durable_timestamph,
- txn, durable_timestampq);
- WT_STAT_CONN_INCR(session, txn_durable_queue_head);
- } else
- TAILQ_INSERT_AFTER(&txn_global->durable_timestamph,
- qtxn, txn, durable_timestampq);
- WT_STAT_CONN_INCRV(session, txn_durable_queue_walked, walked);
- }
- ++txn_global->durable_timestampq_len;
- WT_STAT_CONN_INCR(session, txn_durable_queue_inserts);
- txn->clear_durable_q = false;
- F_SET(txn, WT_TXN_TS_PUBLISHED);
- __wt_writeunlock(session, &txn_global->durable_timestamp_rwlock);
+ WT_TXN *qtxn, *txn, *txn_tmp;
+ WT_TXN_GLOBAL *txn_global;
+ wt_timestamp_t ts;
+ uint64_t walked;
+
+ txn = &session->txn;
+ txn_global = &S2C(session)->txn_global;
+
+ if (F_ISSET(txn, WT_TXN_TS_PUBLISHED))
+ return;
+
+ if (F_ISSET(txn, WT_TXN_HAS_TS_DURABLE))
+ ts = txn->durable_timestamp;
+ else if (F_ISSET(txn, WT_TXN_HAS_TS_COMMIT)) {
+ /*
+ * If we know for a fact that this is a prepared transaction and we only have a commit
+ * timestamp, don't add to the durable queue. If we poll all_durable after setting the
+ * commit timestamp of a prepared transaction, that prepared transaction should NOT be
+ * visible.
+ */
+ if (F_ISSET(txn, WT_TXN_PREPARE))
+ return;
+ ts = txn->commit_timestamp;
+ } else
+ return;
+
+ __wt_writelock(session, &txn_global->durable_timestamp_rwlock);
+ /*
+ * If our transaction is on the queue remove it first. The timestamp may move earlier so we
+ * otherwise might not remove ourselves before finding where to insert ourselves (which would
+ * result in a list loop) and we don't want to walk more of the list than needed.
+ */
+ if (txn->clear_durable_q) {
+ TAILQ_REMOVE(&txn_global->durable_timestamph, txn, durable_timestampq);
+ WT_PUBLISH(txn->clear_durable_q, false);
+ --txn_global->durable_timestampq_len;
+ }
+ /*
+ * Walk the list to look for where to insert our own transaction and remove any transactions
+ * that are not active. We stop when we get to the location where we want to insert.
+ */
+ if (TAILQ_EMPTY(&txn_global->durable_timestamph)) {
+ TAILQ_INSERT_HEAD(&txn_global->durable_timestamph, txn, durable_timestampq);
+ WT_STAT_CONN_INCR(session, txn_durable_queue_empty);
+ } else {
+ /* Walk from the start, removing cleared entries. */
+ walked = 0;
+ TAILQ_FOREACH_SAFE(qtxn, &txn_global->durable_timestamph, durable_timestampq, txn_tmp)
+ {
+ ++walked;
+ /*
+ * Stop on the first entry that we cannot clear.
+ */
+ if (!qtxn->clear_durable_q)
+ break;
+
+ TAILQ_REMOVE(&txn_global->durable_timestamph, qtxn, durable_timestampq);
+ WT_PUBLISH(qtxn->clear_durable_q, false);
+ --txn_global->durable_timestampq_len;
+ }
+
+ /*
+ * Now walk backwards from the end to find the correct position for the insert.
+ */
+ qtxn = TAILQ_LAST(&txn_global->durable_timestamph, __wt_txn_dts_qh);
+ while (qtxn != NULL && __txn_get_published_timestamp(session, qtxn) > ts) {
+ ++walked;
+ qtxn = TAILQ_PREV(qtxn, __wt_txn_dts_qh, durable_timestampq);
+ }
+ if (qtxn == NULL) {
+ TAILQ_INSERT_HEAD(&txn_global->durable_timestamph, txn, durable_timestampq);
+ WT_STAT_CONN_INCR(session, txn_durable_queue_head);
+ } else
+ TAILQ_INSERT_AFTER(&txn_global->durable_timestamph, qtxn, txn, durable_timestampq);
+ WT_STAT_CONN_INCRV(session, txn_durable_queue_walked, walked);
+ }
+ ++txn_global->durable_timestampq_len;
+ WT_STAT_CONN_INCR(session, txn_durable_queue_inserts);
+ txn->clear_durable_q = false;
+ F_SET(txn, WT_TXN_TS_PUBLISHED);
+ __wt_writeunlock(session, &txn_global->durable_timestamp_rwlock);
}
/*
* __wt_txn_clear_durable_timestamp --
- * Clear a transaction's published durable timestamp.
+ * Clear a transaction's published durable timestamp.
*/
void
__wt_txn_clear_durable_timestamp(WT_SESSION_IMPL *session)
{
- WT_TXN *txn;
- uint32_t flags;
-
- txn = &session->txn;
-
- if (!F_ISSET(txn, WT_TXN_TS_PUBLISHED))
- return;
- flags = txn->flags;
- LF_CLR(WT_TXN_TS_PUBLISHED);
-
- /*
- * Notify other threads that our transaction is inactive and can be
- * cleaned up safely from the durable timestamp queue whenever the next
- * thread walks the queue. We do not need to remove it now.
- */
- WT_PUBLISH(txn->clear_durable_q, true);
- WT_PUBLISH(txn->flags, flags);
+ WT_TXN *txn;
+ uint32_t flags;
+
+ txn = &session->txn;
+
+ if (!F_ISSET(txn, WT_TXN_TS_PUBLISHED))
+ return;
+ flags = txn->flags;
+ LF_CLR(WT_TXN_TS_PUBLISHED);
+
+ /*
+ * Notify other threads that our transaction is inactive and can be cleaned up safely from the
+ * durable timestamp queue whenever the next thread walks the queue. We do not need to remove it
+ * now.
+ */
+ WT_PUBLISH(txn->clear_durable_q, true);
+ WT_PUBLISH(txn->flags, flags);
}
/*
* __wt_txn_publish_read_timestamp --
- * Publish a transaction's read timestamp.
+ * Publish a transaction's read timestamp.
*/
void
__wt_txn_publish_read_timestamp(WT_SESSION_IMPL *session)
{
- WT_TXN *qtxn, *txn, *txn_tmp;
- WT_TXN_GLOBAL *txn_global;
- wt_timestamp_t tmp_timestamp;
- uint64_t walked;
-
- txn = &session->txn;
- txn_global = &S2C(session)->txn_global;
-
- if (F_ISSET(txn, WT_TXN_PUBLIC_TS_READ))
- return;
-
- __wt_writelock(session, &txn_global->read_timestamp_rwlock);
- /*
- * If our transaction is on the queue remove it first. The timestamp
- * may move earlier so we otherwise might not remove ourselves before
- * finding where to insert ourselves (which would result in a list
- * loop) and we don't want to walk more of the list than needed.
- */
- if (txn->clear_read_q) {
- TAILQ_REMOVE(&txn_global->read_timestamph,
- txn, read_timestampq);
- WT_PUBLISH(txn->clear_read_q, false);
- --txn_global->read_timestampq_len;
- }
- /*
- * Walk the list to look for where to insert our own transaction
- * and remove any transactions that are not active. We stop when
- * we get to the location where we want to insert.
- */
- if (TAILQ_EMPTY(&txn_global->read_timestamph)) {
- TAILQ_INSERT_HEAD(
- &txn_global->read_timestamph, txn, read_timestampq);
- WT_STAT_CONN_INCR(session, txn_read_queue_empty);
- } else {
- /* Walk from the start, removing cleared entries. */
- walked = 0;
- TAILQ_FOREACH_SAFE(qtxn, &txn_global->read_timestamph,
- read_timestampq, txn_tmp) {
- ++walked;
- if (!qtxn->clear_read_q)
- break;
-
- TAILQ_REMOVE(&txn_global->read_timestamph,
- qtxn, read_timestampq);
- WT_PUBLISH(qtxn->clear_read_q, false);
- --txn_global->read_timestampq_len;
- }
-
- /*
- * Now walk backwards from the end to find the correct position
- * for the insert.
- */
- qtxn = TAILQ_LAST(
- &txn_global->read_timestamph, __wt_txn_rts_qh);
- while (qtxn != NULL) {
- if (!__txn_get_read_timestamp(qtxn, &tmp_timestamp) ||
- tmp_timestamp > txn->read_timestamp) {
- ++walked;
- qtxn = TAILQ_PREV(qtxn,
- __wt_txn_rts_qh, read_timestampq);
- } else
- break;
- }
- if (qtxn == NULL) {
- TAILQ_INSERT_HEAD(&txn_global->read_timestamph,
- txn, read_timestampq);
- WT_STAT_CONN_INCR(session, txn_read_queue_head);
- } else
- TAILQ_INSERT_AFTER(&txn_global->read_timestamph,
- qtxn, txn, read_timestampq);
- WT_STAT_CONN_INCRV(session, txn_read_queue_walked, walked);
- }
- /*
- * We do not set the read timestamp here. It has been set in the caller
- * because special processing for round to oldest.
- */
- ++txn_global->read_timestampq_len;
- WT_STAT_CONN_INCR(session, txn_read_queue_inserts);
- txn->clear_read_q = false;
- F_SET(txn, WT_TXN_HAS_TS_READ | WT_TXN_PUBLIC_TS_READ);
- __wt_writeunlock(session, &txn_global->read_timestamp_rwlock);
+ WT_TXN *qtxn, *txn, *txn_tmp;
+ WT_TXN_GLOBAL *txn_global;
+ wt_timestamp_t tmp_timestamp;
+ uint64_t walked;
+
+ txn = &session->txn;
+ txn_global = &S2C(session)->txn_global;
+
+ if (F_ISSET(txn, WT_TXN_PUBLIC_TS_READ))
+ return;
+
+ __wt_writelock(session, &txn_global->read_timestamp_rwlock);
+ /*
+ * If our transaction is on the queue remove it first. The timestamp may move earlier so we
+ * otherwise might not remove ourselves before finding where to insert ourselves (which would
+ * result in a list loop) and we don't want to walk more of the list than needed.
+ */
+ if (txn->clear_read_q) {
+ TAILQ_REMOVE(&txn_global->read_timestamph, txn, read_timestampq);
+ WT_PUBLISH(txn->clear_read_q, false);
+ --txn_global->read_timestampq_len;
+ }
+ /*
+ * Walk the list to look for where to insert our own transaction and remove any transactions
+ * that are not active. We stop when we get to the location where we want to insert.
+ */
+ if (TAILQ_EMPTY(&txn_global->read_timestamph)) {
+ TAILQ_INSERT_HEAD(&txn_global->read_timestamph, txn, read_timestampq);
+ WT_STAT_CONN_INCR(session, txn_read_queue_empty);
+ } else {
+ /* Walk from the start, removing cleared entries. */
+ walked = 0;
+ TAILQ_FOREACH_SAFE(qtxn, &txn_global->read_timestamph, read_timestampq, txn_tmp)
+ {
+ ++walked;
+ if (!qtxn->clear_read_q)
+ break;
+
+ TAILQ_REMOVE(&txn_global->read_timestamph, qtxn, read_timestampq);
+ WT_PUBLISH(qtxn->clear_read_q, false);
+ --txn_global->read_timestampq_len;
+ }
+
+ /*
+ * Now walk backwards from the end to find the correct position for the insert.
+ */
+ qtxn = TAILQ_LAST(&txn_global->read_timestamph, __wt_txn_rts_qh);
+ while (qtxn != NULL) {
+ if (!__txn_get_read_timestamp(qtxn, &tmp_timestamp) ||
+ tmp_timestamp > txn->read_timestamp) {
+ ++walked;
+ qtxn = TAILQ_PREV(qtxn, __wt_txn_rts_qh, read_timestampq);
+ } else
+ break;
+ }
+ if (qtxn == NULL) {
+ TAILQ_INSERT_HEAD(&txn_global->read_timestamph, txn, read_timestampq);
+ WT_STAT_CONN_INCR(session, txn_read_queue_head);
+ } else
+ TAILQ_INSERT_AFTER(&txn_global->read_timestamph, qtxn, txn, read_timestampq);
+ WT_STAT_CONN_INCRV(session, txn_read_queue_walked, walked);
+ }
+ /*
+ * We do not set the read timestamp here. It has been set in the caller because special
+ * processing for round to oldest.
+ */
+ ++txn_global->read_timestampq_len;
+ WT_STAT_CONN_INCR(session, txn_read_queue_inserts);
+ txn->clear_read_q = false;
+ F_SET(txn, WT_TXN_HAS_TS_READ | WT_TXN_PUBLIC_TS_READ);
+ __wt_writeunlock(session, &txn_global->read_timestamp_rwlock);
}
/*
* __wt_txn_clear_read_timestamp --
- * Clear a transaction's published read timestamp.
+ * Clear a transaction's published read timestamp.
*/
void
__wt_txn_clear_read_timestamp(WT_SESSION_IMPL *session)
{
- WT_TXN *txn;
- uint32_t flags;
+ WT_TXN *txn;
+ uint32_t flags;
- txn = &session->txn;
+ txn = &session->txn;
- if (!F_ISSET(txn, WT_TXN_PUBLIC_TS_READ)) {
- txn->read_timestamp = WT_TS_NONE;
- return;
- }
+ if (!F_ISSET(txn, WT_TXN_PUBLIC_TS_READ)) {
+ txn->read_timestamp = WT_TS_NONE;
+ return;
+ }
#ifdef HAVE_DIAGNOSTIC
- {
- WT_TXN_GLOBAL *txn_global;
- wt_timestamp_t pinned_ts;
-
- txn_global = &S2C(session)->txn_global;
- pinned_ts = txn_global->pinned_timestamp;
- WT_ASSERT(session, txn->read_timestamp >= pinned_ts);
- }
+ {
+ WT_TXN_GLOBAL *txn_global;
+ wt_timestamp_t pinned_ts;
+
+ txn_global = &S2C(session)->txn_global;
+ pinned_ts = txn_global->pinned_timestamp;
+ WT_ASSERT(session, txn->read_timestamp >= pinned_ts);
+ }
#endif
- flags = txn->flags;
- LF_CLR(WT_TXN_PUBLIC_TS_READ);
-
- /*
- * Notify other threads that our transaction is inactive and can be
- * cleaned up safely from the read timestamp queue whenever the
- * next thread walks the queue. We do not need to remove it now.
- */
- WT_PUBLISH(txn->clear_read_q, true);
- WT_PUBLISH(txn->flags, flags);
- txn->read_timestamp = WT_TS_NONE;
+ flags = txn->flags;
+ LF_CLR(WT_TXN_PUBLIC_TS_READ);
+
+ /*
+ * Notify other threads that our transaction is inactive and can be cleaned up safely from the
+ * read timestamp queue whenever the next thread walks the queue. We do not need to remove it
+ * now.
+ */
+ WT_PUBLISH(txn->clear_read_q, true);
+ WT_PUBLISH(txn->flags, flags);
+ txn->read_timestamp = WT_TS_NONE;
}
/*
* __wt_txn_clear_timestamp_queues --
- * We're about to clear the session and overwrite the txn structure.
- * Remove ourselves from the commit timestamp queue and the read
- * timestamp queue if we're on either of them.
+ * We're about to clear the session and overwrite the txn structure. Remove ourselves from the
+ * commit timestamp queue and the read timestamp queue if we're on either of them.
*/
void
__wt_txn_clear_timestamp_queues(WT_SESSION_IMPL *session)
{
- WT_TXN *txn;
- WT_TXN_GLOBAL *txn_global;
-
- txn = &session->txn;
- txn_global = &S2C(session)->txn_global;
-
- if (!txn->clear_durable_q && !txn->clear_read_q)
- return;
-
- if (txn->clear_durable_q) {
- __wt_writelock(session, &txn_global->durable_timestamp_rwlock);
- /*
- * Recheck after acquiring the lock.
- */
- if (txn->clear_durable_q) {
- TAILQ_REMOVE(&txn_global->durable_timestamph,
- txn, durable_timestampq);
- --txn_global->durable_timestampq_len;
- txn->clear_durable_q = false;
- }
- __wt_writeunlock(
- session, &txn_global->durable_timestamp_rwlock);
- }
- if (txn->clear_read_q) {
- __wt_writelock(session, &txn_global->read_timestamp_rwlock);
- /*
- * Recheck after acquiring the lock.
- */
- if (txn->clear_read_q) {
- TAILQ_REMOVE(
- &txn_global->read_timestamph, txn, read_timestampq);
- --txn_global->read_timestampq_len;
- txn->clear_read_q = false;
- }
- __wt_writeunlock(session, &txn_global->read_timestamp_rwlock);
- }
+ WT_TXN *txn;
+ WT_TXN_GLOBAL *txn_global;
+
+ txn = &session->txn;
+ txn_global = &S2C(session)->txn_global;
+
+ if (!txn->clear_durable_q && !txn->clear_read_q)
+ return;
+
+ if (txn->clear_durable_q) {
+ __wt_writelock(session, &txn_global->durable_timestamp_rwlock);
+ /*
+ * Recheck after acquiring the lock.
+ */
+ if (txn->clear_durable_q) {
+ TAILQ_REMOVE(&txn_global->durable_timestamph, txn, durable_timestampq);
+ --txn_global->durable_timestampq_len;
+ txn->clear_durable_q = false;
+ }
+ __wt_writeunlock(session, &txn_global->durable_timestamp_rwlock);
+ }
+ if (txn->clear_read_q) {
+ __wt_writelock(session, &txn_global->read_timestamp_rwlock);
+ /*
+ * Recheck after acquiring the lock.
+ */
+ if (txn->clear_read_q) {
+ TAILQ_REMOVE(&txn_global->read_timestamph, txn, read_timestampq);
+ --txn_global->read_timestampq_len;
+ txn->clear_read_q = false;
+ }
+ __wt_writeunlock(session, &txn_global->read_timestamp_rwlock);
+ }
}