diff options
Diffstat (limited to 'src/third_party/wiredtiger/src/txn')
-rw-r--r-- | src/third_party/wiredtiger/src/txn/txn.c | 2899 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/txn/txn_ckpt.c | 3249 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/txn/txn_ext.c | 91 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/txn/txn_log.c | 1247 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/txn/txn_nsnap.c | 679 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/txn/txn_recover.c | 1329 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c | 872 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/txn/txn_timestamp.c | 2298 |
8 files changed, 6059 insertions, 6605 deletions
diff --git a/src/third_party/wiredtiger/src/txn/txn.c b/src/third_party/wiredtiger/src/txn/txn.c index b3085080956..66a5330258b 100644 --- a/src/third_party/wiredtiger/src/txn/txn.c +++ b/src/third_party/wiredtiger/src/txn/txn.c @@ -10,1772 +10,1631 @@ /* * __snapsort_partition -- - * Custom quick sort partitioning for snapshots. + * Custom quick sort partitioning for snapshots. */ static uint32_t __snapsort_partition(uint64_t *array, uint32_t f, uint32_t l, uint64_t pivot) { - uint32_t i, j; - - i = f - 1; - j = l + 1; - for (;;) { - while (pivot < array[--j]) - ; - while (array[++i] < pivot) - ; - if (i < j) { - uint64_t tmp = array[i]; - array[i] = array[j]; - array[j] = tmp; - } else - return (j); - } + uint32_t i, j; + + i = f - 1; + j = l + 1; + for (;;) { + while (pivot < array[--j]) + ; + while (array[++i] < pivot) + ; + if (i < j) { + uint64_t tmp = array[i]; + array[i] = array[j]; + array[j] = tmp; + } else + return (j); + } } /* * __snapsort_impl -- - * Custom quick sort implementation for snapshots. + * Custom quick sort implementation for snapshots. */ static void __snapsort_impl(uint64_t *array, uint32_t f, uint32_t l) { - while (f + 16 < l) { - uint64_t v1 = array[f], v2 = array[l], v3 = array[(f + l)/2]; - uint64_t median = v1 < v2 ? - (v3 < v1 ? v1 : WT_MIN(v2, v3)) : - (v3 < v2 ? v2 : WT_MIN(v1, v3)); - uint32_t m = __snapsort_partition(array, f, l, median); - __snapsort_impl(array, f, m); - f = m + 1; - } + while (f + 16 < l) { + uint64_t v1 = array[f], v2 = array[l], v3 = array[(f + l) / 2]; + uint64_t median = + v1 < v2 ? (v3 < v1 ? v1 : WT_MIN(v2, v3)) : (v3 < v2 ? v2 : WT_MIN(v1, v3)); + uint32_t m = __snapsort_partition(array, f, l, median); + __snapsort_impl(array, f, m); + f = m + 1; + } } /* * __snapsort -- - * Sort an array of transaction IDs. + * Sort an array of transaction IDs. 
*/ static void __snapsort(uint64_t *array, uint32_t size) { - __snapsort_impl(array, 0, size - 1); - WT_INSERTION_SORT(array, size, uint64_t, WT_TXNID_LT); + __snapsort_impl(array, 0, size - 1); + WT_INSERTION_SORT(array, size, uint64_t, WT_TXNID_LT); } /* * __txn_remove_from_global_table -- - * Remove the transaction id from the global transaction table. + * Remove the transaction id from the global transaction table. */ static inline void __txn_remove_from_global_table(WT_SESSION_IMPL *session) { #ifdef HAVE_DIAGNOSTIC - WT_TXN *txn; - WT_TXN_GLOBAL *txn_global; - WT_TXN_STATE *txn_state; + WT_TXN *txn; + WT_TXN_GLOBAL *txn_global; + WT_TXN_STATE *txn_state; - txn = &session->txn; - txn_global = &S2C(session)->txn_global; - txn_state = WT_SESSION_TXN_STATE(session); + txn = &session->txn; + txn_global = &S2C(session)->txn_global; + txn_state = WT_SESSION_TXN_STATE(session); - WT_ASSERT(session, !WT_TXNID_LT(txn->id, txn_global->last_running)); - WT_ASSERT(session, - txn->id != WT_TXN_NONE && txn_state->id != WT_TXN_NONE); + WT_ASSERT(session, !WT_TXNID_LT(txn->id, txn_global->last_running)); + WT_ASSERT(session, txn->id != WT_TXN_NONE && txn_state->id != WT_TXN_NONE); #else - WT_TXN_STATE *txn_state; + WT_TXN_STATE *txn_state; - txn_state = WT_SESSION_TXN_STATE(session); + txn_state = WT_SESSION_TXN_STATE(session); #endif - WT_PUBLISH(txn_state->id, WT_TXN_NONE); + WT_PUBLISH(txn_state->id, WT_TXN_NONE); } /* * __txn_sort_snapshot -- - * Sort a snapshot for faster searching and set the min/max bounds. + * Sort a snapshot for faster searching and set the min/max bounds. */ static void __txn_sort_snapshot(WT_SESSION_IMPL *session, uint32_t n, uint64_t snap_max) { - WT_TXN *txn; + WT_TXN *txn; - txn = &session->txn; + txn = &session->txn; - if (n > 1) - __snapsort(txn->snapshot, n); + if (n > 1) + __snapsort(txn->snapshot, n); - txn->snapshot_count = n; - txn->snap_max = snap_max; - txn->snap_min = (n > 0 && WT_TXNID_LE(txn->snapshot[0], snap_max)) ? 
- txn->snapshot[0] : snap_max; - F_SET(txn, WT_TXN_HAS_SNAPSHOT); - WT_ASSERT(session, n == 0 || txn->snap_min != WT_TXN_NONE); + txn->snapshot_count = n; + txn->snap_max = snap_max; + txn->snap_min = + (n > 0 && WT_TXNID_LE(txn->snapshot[0], snap_max)) ? txn->snapshot[0] : snap_max; + F_SET(txn, WT_TXN_HAS_SNAPSHOT); + WT_ASSERT(session, n == 0 || txn->snap_min != WT_TXN_NONE); } /* * __wt_txn_release_snapshot -- - * Release the snapshot in the current transaction. + * Release the snapshot in the current transaction. */ void __wt_txn_release_snapshot(WT_SESSION_IMPL *session) { - WT_TXN *txn; - WT_TXN_GLOBAL *txn_global; - WT_TXN_STATE *txn_state; + WT_TXN *txn; + WT_TXN_GLOBAL *txn_global; + WT_TXN_STATE *txn_state; - txn = &session->txn; - txn_global = &S2C(session)->txn_global; - txn_state = WT_SESSION_TXN_STATE(session); + txn = &session->txn; + txn_global = &S2C(session)->txn_global; + txn_state = WT_SESSION_TXN_STATE(session); - WT_ASSERT(session, - txn_state->pinned_id == WT_TXN_NONE || - session->txn.isolation == WT_ISO_READ_UNCOMMITTED || - !__wt_txn_visible_all(session, txn_state->pinned_id, WT_TS_NONE)); + WT_ASSERT(session, txn_state->pinned_id == WT_TXN_NONE || + session->txn.isolation == WT_ISO_READ_UNCOMMITTED || + !__wt_txn_visible_all(session, txn_state->pinned_id, WT_TS_NONE)); - txn_state->metadata_pinned = txn_state->pinned_id = WT_TXN_NONE; - F_CLR(txn, WT_TXN_HAS_SNAPSHOT); + txn_state->metadata_pinned = txn_state->pinned_id = WT_TXN_NONE; + F_CLR(txn, WT_TXN_HAS_SNAPSHOT); - /* Clear a checkpoint's pinned ID. */ - if (WT_SESSION_IS_CHECKPOINT(session)) { - txn_global->checkpoint_state.pinned_id = WT_TXN_NONE; - txn_global->checkpoint_timestamp = 0; - } + /* Clear a checkpoint's pinned ID. 
*/ + if (WT_SESSION_IS_CHECKPOINT(session)) { + txn_global->checkpoint_state.pinned_id = WT_TXN_NONE; + txn_global->checkpoint_timestamp = 0; + } - __wt_txn_clear_read_timestamp(session); + __wt_txn_clear_read_timestamp(session); } /* * __wt_txn_get_snapshot -- - * Allocate a snapshot. + * Allocate a snapshot. */ void __wt_txn_get_snapshot(WT_SESSION_IMPL *session) { - WT_CONNECTION_IMPL *conn; - WT_TXN *txn; - WT_TXN_GLOBAL *txn_global; - WT_TXN_STATE *s, *txn_state; - uint64_t commit_gen, current_id, id, prev_oldest_id, pinned_id; - uint32_t i, n, session_cnt; - - conn = S2C(session); - txn = &session->txn; - txn_global = &conn->txn_global; - txn_state = WT_SESSION_TXN_STATE(session); - n = 0; - - /* Fast path if we already have the current snapshot. */ - if ((commit_gen = __wt_session_gen(session, WT_GEN_COMMIT)) != 0) { - if (F_ISSET(txn, WT_TXN_HAS_SNAPSHOT) && - commit_gen == __wt_gen(session, WT_GEN_COMMIT)) - return; - __wt_session_gen_leave(session, WT_GEN_COMMIT); - } - __wt_session_gen_enter(session, WT_GEN_COMMIT); - - /* We're going to scan the table: wait for the lock. */ - __wt_readlock(session, &txn_global->rwlock); - - current_id = pinned_id = txn_global->current; - prev_oldest_id = txn_global->oldest_id; - - /* - * Include the checkpoint transaction, if one is running: we should - * ignore any uncommitted changes the checkpoint has written to the - * metadata. We don't have to keep the checkpoint's changes pinned so - * don't including it in the published pinned ID. - */ - if ((id = txn_global->checkpoint_state.id) != WT_TXN_NONE) { - txn->snapshot[n++] = id; - txn_state->metadata_pinned = id; - } - - /* For pure read-only workloads, avoid scanning. */ - if (prev_oldest_id == current_id) { - txn_state->pinned_id = current_id; - /* Check that the oldest ID has not moved in the meantime. */ - WT_ASSERT(session, prev_oldest_id == txn_global->oldest_id); - goto done; - } - - /* Walk the array of concurrent transactions. 
*/ - WT_ORDERED_READ(session_cnt, conn->session_cnt); - for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) { - /* - * Build our snapshot of any concurrent transaction IDs. - * - * Ignore: - * - Our own ID: we always read our own updates. - * - The ID if it is older than the oldest ID we saw. This - * can happen if we race with a thread that is allocating - * an ID -- the ID will not be used because the thread will - * keep spinning until it gets a valid one. - * - The ID if it is higher than the current ID we saw. This - * can happen if the transaction is already finished. In - * this case, we ignore this transaction because it would - * not be visible to the current snapshot. - */ - while (s != txn_state && - (id = s->id) != WT_TXN_NONE && - WT_TXNID_LE(prev_oldest_id, id) && - WT_TXNID_LT(id, current_id)) { - /* - * If the transaction is still allocating its ID, then - * we spin here until it gets its valid ID. - */ - WT_READ_BARRIER(); - if (!s->is_allocating) { - /* - * There is still a chance that fetched ID is - * not valid after ID allocation, so we check - * again here. The read of transaction ID - * should be carefully ordered: we want to - * re-read ID from transaction state after this - * transaction completes ID allocation. - */ - WT_READ_BARRIER(); - if (id == s->id) { - txn->snapshot[n++] = id; - if (WT_TXNID_LT(id, pinned_id)) - pinned_id = id; - break; - } - } - WT_PAUSE(); - } - } - - /* - * If we got a new snapshot, update the published pinned ID for this - * session. 
- */ - WT_ASSERT(session, WT_TXNID_LE(prev_oldest_id, pinned_id)); - WT_ASSERT(session, prev_oldest_id == txn_global->oldest_id); - txn_state->pinned_id = pinned_id; - -done: __wt_readunlock(session, &txn_global->rwlock); - __txn_sort_snapshot(session, n, current_id); + WT_CONNECTION_IMPL *conn; + WT_TXN *txn; + WT_TXN_GLOBAL *txn_global; + WT_TXN_STATE *s, *txn_state; + uint64_t commit_gen, current_id, id, prev_oldest_id, pinned_id; + uint32_t i, n, session_cnt; + + conn = S2C(session); + txn = &session->txn; + txn_global = &conn->txn_global; + txn_state = WT_SESSION_TXN_STATE(session); + n = 0; + + /* Fast path if we already have the current snapshot. */ + if ((commit_gen = __wt_session_gen(session, WT_GEN_COMMIT)) != 0) { + if (F_ISSET(txn, WT_TXN_HAS_SNAPSHOT) && commit_gen == __wt_gen(session, WT_GEN_COMMIT)) + return; + __wt_session_gen_leave(session, WT_GEN_COMMIT); + } + __wt_session_gen_enter(session, WT_GEN_COMMIT); + + /* We're going to scan the table: wait for the lock. */ + __wt_readlock(session, &txn_global->rwlock); + + current_id = pinned_id = txn_global->current; + prev_oldest_id = txn_global->oldest_id; + + /* + * Include the checkpoint transaction, if one is running: we should ignore any uncommitted + * changes the checkpoint has written to the metadata. We don't have to keep the checkpoint's + * changes pinned so don't including it in the published pinned ID. + */ + if ((id = txn_global->checkpoint_state.id) != WT_TXN_NONE) { + txn->snapshot[n++] = id; + txn_state->metadata_pinned = id; + } + + /* For pure read-only workloads, avoid scanning. */ + if (prev_oldest_id == current_id) { + txn_state->pinned_id = current_id; + /* Check that the oldest ID has not moved in the meantime. */ + WT_ASSERT(session, prev_oldest_id == txn_global->oldest_id); + goto done; + } + + /* Walk the array of concurrent transactions. 
*/ + WT_ORDERED_READ(session_cnt, conn->session_cnt); + for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) { + /* + * Build our snapshot of any concurrent transaction IDs. + * + * Ignore: + * - Our own ID: we always read our own updates. + * - The ID if it is older than the oldest ID we saw. This + * can happen if we race with a thread that is allocating + * an ID -- the ID will not be used because the thread will + * keep spinning until it gets a valid one. + * - The ID if it is higher than the current ID we saw. This + * can happen if the transaction is already finished. In + * this case, we ignore this transaction because it would + * not be visible to the current snapshot. + */ + while (s != txn_state && (id = s->id) != WT_TXN_NONE && WT_TXNID_LE(prev_oldest_id, id) && + WT_TXNID_LT(id, current_id)) { + /* + * If the transaction is still allocating its ID, then we spin here until it gets its + * valid ID. + */ + WT_READ_BARRIER(); + if (!s->is_allocating) { + /* + * There is still a chance that fetched ID is not valid after ID allocation, so we + * check again here. The read of transaction ID should be carefully ordered: we want + * to re-read ID from transaction state after this transaction completes ID + * allocation. + */ + WT_READ_BARRIER(); + if (id == s->id) { + txn->snapshot[n++] = id; + if (WT_TXNID_LT(id, pinned_id)) + pinned_id = id; + break; + } + } + WT_PAUSE(); + } + } + + /* + * If we got a new snapshot, update the published pinned ID for this session. + */ + WT_ASSERT(session, WT_TXNID_LE(prev_oldest_id, pinned_id)); + WT_ASSERT(session, prev_oldest_id == txn_global->oldest_id); + txn_state->pinned_id = pinned_id; + +done: + __wt_readunlock(session, &txn_global->rwlock); + __txn_sort_snapshot(session, n, current_id); } /* * __txn_oldest_scan -- - * Sweep the running transactions to calculate the oldest ID required. + * Sweep the running transactions to calculate the oldest ID required. 
*/ static void -__txn_oldest_scan(WT_SESSION_IMPL *session, - uint64_t *oldest_idp, uint64_t *last_runningp, uint64_t *metadata_pinnedp, - WT_SESSION_IMPL **oldest_sessionp) +__txn_oldest_scan(WT_SESSION_IMPL *session, uint64_t *oldest_idp, uint64_t *last_runningp, + uint64_t *metadata_pinnedp, WT_SESSION_IMPL **oldest_sessionp) { - WT_CONNECTION_IMPL *conn; - WT_SESSION_IMPL *oldest_session; - WT_TXN_GLOBAL *txn_global; - WT_TXN_STATE *s; - uint64_t id, last_running, metadata_pinned, oldest_id, prev_oldest_id; - uint32_t i, session_cnt; - - conn = S2C(session); - txn_global = &conn->txn_global; - oldest_session = NULL; - - /* The oldest ID cannot change while we are holding the scan lock. */ - prev_oldest_id = txn_global->oldest_id; - last_running = oldest_id = txn_global->current; - if ((metadata_pinned = txn_global->checkpoint_state.id) == WT_TXN_NONE) - metadata_pinned = oldest_id; - - /* Walk the array of concurrent transactions. */ - WT_ORDERED_READ(session_cnt, conn->session_cnt); - for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) { - /* Update the last running transaction ID. */ - while ((id = s->id) != WT_TXN_NONE && - WT_TXNID_LE(prev_oldest_id, id) && - WT_TXNID_LT(id, last_running)) { - /* - * If the transaction is still allocating its ID, then - * we spin here until it gets its valid ID. - */ - WT_READ_BARRIER(); - if (!s->is_allocating) { - /* - * There is still a chance that fetched ID is - * not valid after ID allocation, so we check - * again here. The read of transaction ID - * should be carefully ordered: we want to - * re-read ID from transaction state after this - * transaction completes ID allocation. - */ - WT_READ_BARRIER(); - if (id == s->id) { - last_running = id; - break; - } - } - WT_PAUSE(); - } - - /* Update the metadata pinned ID. */ - if ((id = s->metadata_pinned) != WT_TXN_NONE && - WT_TXNID_LT(id, metadata_pinned)) - metadata_pinned = id; - - /* - * !!! 
- * Note: Don't ignore pinned ID values older than the previous - * oldest ID. Read-uncommitted operations publish pinned ID - * values without acquiring the scan lock to protect the global - * table. See the comment in __wt_txn_cursor_op for more - * details. - */ - if ((id = s->pinned_id) != WT_TXN_NONE && - WT_TXNID_LT(id, oldest_id)) { - oldest_id = id; - oldest_session = &conn->sessions[i]; - } - } - - if (WT_TXNID_LT(last_running, oldest_id)) - oldest_id = last_running; - - /* The oldest ID can't move past any named snapshots. */ - if ((id = txn_global->nsnap_oldest_id) != WT_TXN_NONE && - WT_TXNID_LT(id, oldest_id)) - oldest_id = id; - - /* The metadata pinned ID can't move past the oldest ID. */ - if (WT_TXNID_LT(oldest_id, metadata_pinned)) - metadata_pinned = oldest_id; - - *last_runningp = last_running; - *metadata_pinnedp = metadata_pinned; - *oldest_idp = oldest_id; - *oldest_sessionp = oldest_session; + WT_CONNECTION_IMPL *conn; + WT_SESSION_IMPL *oldest_session; + WT_TXN_GLOBAL *txn_global; + WT_TXN_STATE *s; + uint64_t id, last_running, metadata_pinned, oldest_id, prev_oldest_id; + uint32_t i, session_cnt; + + conn = S2C(session); + txn_global = &conn->txn_global; + oldest_session = NULL; + + /* The oldest ID cannot change while we are holding the scan lock. */ + prev_oldest_id = txn_global->oldest_id; + last_running = oldest_id = txn_global->current; + if ((metadata_pinned = txn_global->checkpoint_state.id) == WT_TXN_NONE) + metadata_pinned = oldest_id; + + /* Walk the array of concurrent transactions. */ + WT_ORDERED_READ(session_cnt, conn->session_cnt); + for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) { + /* Update the last running transaction ID. */ + while ((id = s->id) != WT_TXN_NONE && WT_TXNID_LE(prev_oldest_id, id) && + WT_TXNID_LT(id, last_running)) { + /* + * If the transaction is still allocating its ID, then we spin here until it gets its + * valid ID. 
+ */ + WT_READ_BARRIER(); + if (!s->is_allocating) { + /* + * There is still a chance that fetched ID is not valid after ID allocation, so we + * check again here. The read of transaction ID should be carefully ordered: we want + * to re-read ID from transaction state after this transaction completes ID + * allocation. + */ + WT_READ_BARRIER(); + if (id == s->id) { + last_running = id; + break; + } + } + WT_PAUSE(); + } + + /* Update the metadata pinned ID. */ + if ((id = s->metadata_pinned) != WT_TXN_NONE && WT_TXNID_LT(id, metadata_pinned)) + metadata_pinned = id; + + /* + * !!! + * Note: Don't ignore pinned ID values older than the previous + * oldest ID. Read-uncommitted operations publish pinned ID + * values without acquiring the scan lock to protect the global + * table. See the comment in __wt_txn_cursor_op for more + * details. + */ + if ((id = s->pinned_id) != WT_TXN_NONE && WT_TXNID_LT(id, oldest_id)) { + oldest_id = id; + oldest_session = &conn->sessions[i]; + } + } + + if (WT_TXNID_LT(last_running, oldest_id)) + oldest_id = last_running; + + /* The oldest ID can't move past any named snapshots. */ + if ((id = txn_global->nsnap_oldest_id) != WT_TXN_NONE && WT_TXNID_LT(id, oldest_id)) + oldest_id = id; + + /* The metadata pinned ID can't move past the oldest ID. */ + if (WT_TXNID_LT(oldest_id, metadata_pinned)) + metadata_pinned = oldest_id; + + *last_runningp = last_running; + *metadata_pinnedp = metadata_pinned; + *oldest_idp = oldest_id; + *oldest_sessionp = oldest_session; } /* * __wt_txn_update_oldest -- - * Sweep the running transactions to update the oldest ID required. + * Sweep the running transactions to update the oldest ID required. 
*/ int __wt_txn_update_oldest(WT_SESSION_IMPL *session, uint32_t flags) { - WT_CONNECTION_IMPL *conn; - WT_DECL_RET; - WT_SESSION_IMPL *oldest_session; - WT_TXN_GLOBAL *txn_global; - uint64_t current_id, last_running, metadata_pinned, oldest_id; - uint64_t prev_last_running, prev_metadata_pinned, prev_oldest_id; - bool strict, wait; - - conn = S2C(session); - txn_global = &conn->txn_global; - strict = LF_ISSET(WT_TXN_OLDEST_STRICT); - wait = LF_ISSET(WT_TXN_OLDEST_WAIT); - - current_id = last_running = metadata_pinned = txn_global->current; - prev_last_running = txn_global->last_running; - prev_metadata_pinned = txn_global->metadata_pinned; - prev_oldest_id = txn_global->oldest_id; - - /* Try to move the pinned timestamp forward. */ - if (strict) - WT_RET(__wt_txn_update_pinned_timestamp(session, false)); - - /* - * For pure read-only workloads, or if the update isn't forced and the - * oldest ID isn't too far behind, avoid scanning. - */ - if ((prev_oldest_id == current_id && - prev_metadata_pinned == current_id) || - (!strict && WT_TXNID_LT(current_id, prev_oldest_id + 100))) - return (0); - - /* First do a read-only scan. */ - if (wait) - __wt_readlock(session, &txn_global->rwlock); - else if ((ret = - __wt_try_readlock(session, &txn_global->rwlock)) != 0) - return (ret == EBUSY ? 0 : ret); - __txn_oldest_scan(session, - &oldest_id, &last_running, &metadata_pinned, &oldest_session); - __wt_readunlock(session, &txn_global->rwlock); - - /* - * If the state hasn't changed (or hasn't moved far enough for - * non-forced updates), give up. - */ - if ((oldest_id == prev_oldest_id || - (!strict && WT_TXNID_LT(oldest_id, prev_oldest_id + 100))) && - ((last_running == prev_last_running) || - (!strict && WT_TXNID_LT(last_running, prev_last_running + 100))) && - metadata_pinned == prev_metadata_pinned) - return (0); - - /* It looks like an update is necessary, wait for exclusive access. 
*/ - if (wait) - __wt_writelock(session, &txn_global->rwlock); - else if ((ret = - __wt_try_writelock(session, &txn_global->rwlock)) != 0) - return (ret == EBUSY ? 0 : ret); - - /* - * If the oldest ID has been updated while we waited, don't bother - * scanning. - */ - if (WT_TXNID_LE(oldest_id, txn_global->oldest_id) && - WT_TXNID_LE(last_running, txn_global->last_running) && - WT_TXNID_LE(metadata_pinned, txn_global->metadata_pinned)) - goto done; - - /* - * Re-scan now that we have exclusive access. This is necessary because - * threads get transaction snapshots with read locks, and we have to be - * sure that there isn't a thread that has got a snapshot locally but - * not yet published its snap_min. - */ - __txn_oldest_scan(session, - &oldest_id, &last_running, &metadata_pinned, &oldest_session); + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + WT_SESSION_IMPL *oldest_session; + WT_TXN_GLOBAL *txn_global; + uint64_t current_id, last_running, metadata_pinned, oldest_id; + uint64_t prev_last_running, prev_metadata_pinned, prev_oldest_id; + bool strict, wait; + + conn = S2C(session); + txn_global = &conn->txn_global; + strict = LF_ISSET(WT_TXN_OLDEST_STRICT); + wait = LF_ISSET(WT_TXN_OLDEST_WAIT); + + current_id = last_running = metadata_pinned = txn_global->current; + prev_last_running = txn_global->last_running; + prev_metadata_pinned = txn_global->metadata_pinned; + prev_oldest_id = txn_global->oldest_id; + + /* Try to move the pinned timestamp forward. */ + if (strict) + WT_RET(__wt_txn_update_pinned_timestamp(session, false)); + + /* + * For pure read-only workloads, or if the update isn't forced and the oldest ID isn't too far + * behind, avoid scanning. + */ + if ((prev_oldest_id == current_id && prev_metadata_pinned == current_id) || + (!strict && WT_TXNID_LT(current_id, prev_oldest_id + 100))) + return (0); + + /* First do a read-only scan. 
*/ + if (wait) + __wt_readlock(session, &txn_global->rwlock); + else if ((ret = __wt_try_readlock(session, &txn_global->rwlock)) != 0) + return (ret == EBUSY ? 0 : ret); + __txn_oldest_scan(session, &oldest_id, &last_running, &metadata_pinned, &oldest_session); + __wt_readunlock(session, &txn_global->rwlock); + + /* + * If the state hasn't changed (or hasn't moved far enough for non-forced updates), give up. + */ + if ((oldest_id == prev_oldest_id || + (!strict && WT_TXNID_LT(oldest_id, prev_oldest_id + 100))) && + ((last_running == prev_last_running) || + (!strict && WT_TXNID_LT(last_running, prev_last_running + 100))) && + metadata_pinned == prev_metadata_pinned) + return (0); + + /* It looks like an update is necessary, wait for exclusive access. */ + if (wait) + __wt_writelock(session, &txn_global->rwlock); + else if ((ret = __wt_try_writelock(session, &txn_global->rwlock)) != 0) + return (ret == EBUSY ? 0 : ret); + + /* + * If the oldest ID has been updated while we waited, don't bother scanning. + */ + if (WT_TXNID_LE(oldest_id, txn_global->oldest_id) && + WT_TXNID_LE(last_running, txn_global->last_running) && + WT_TXNID_LE(metadata_pinned, txn_global->metadata_pinned)) + goto done; + + /* + * Re-scan now that we have exclusive access. This is necessary because threads get transaction + * snapshots with read locks, and we have to be sure that there isn't a thread that has got a + * snapshot locally but not yet published its snap_min. + */ + __txn_oldest_scan(session, &oldest_id, &last_running, &metadata_pinned, &oldest_session); #ifdef HAVE_DIAGNOSTIC - { - /* - * Make sure the ID doesn't move past any named snapshots. - * - * Don't include the read/assignment in the assert statement. Coverity - * complains if there are assignments only done in diagnostic builds, - * and when the read is from a volatile. 
- */ - uint64_t id = txn_global->nsnap_oldest_id; - WT_ASSERT(session, - id == WT_TXN_NONE || !WT_TXNID_LT(id, oldest_id)); - } + { + /* + * Make sure the ID doesn't move past any named snapshots. + * + * Don't include the read/assignment in the assert statement. Coverity + * complains if there are assignments only done in diagnostic builds, + * and when the read is from a volatile. + */ + uint64_t id = txn_global->nsnap_oldest_id; + WT_ASSERT(session, id == WT_TXN_NONE || !WT_TXNID_LT(id, oldest_id)); + } #endif - /* Update the public IDs. */ - if (WT_TXNID_LT(txn_global->metadata_pinned, metadata_pinned)) - txn_global->metadata_pinned = metadata_pinned; - if (WT_TXNID_LT(txn_global->oldest_id, oldest_id)) - txn_global->oldest_id = oldest_id; - if (WT_TXNID_LT(txn_global->last_running, last_running)) { - txn_global->last_running = last_running; - - /* Output a verbose message about long-running transactions, - * but only when some progress is being made. */ - if (WT_VERBOSE_ISSET(session, WT_VERB_TRANSACTION) && - current_id - oldest_id > 10000 && oldest_session != NULL) { - __wt_verbose(session, WT_VERB_TRANSACTION, - "old snapshot %" PRIu64 - " pinned in session %" PRIu32 " [%s]" - " with snap_min %" PRIu64, - oldest_id, oldest_session->id, - oldest_session->lastop, - oldest_session->txn.snap_min); - } - } - -done: __wt_writeunlock(session, &txn_global->rwlock); - return (ret); + /* Update the public IDs. */ + if (WT_TXNID_LT(txn_global->metadata_pinned, metadata_pinned)) + txn_global->metadata_pinned = metadata_pinned; + if (WT_TXNID_LT(txn_global->oldest_id, oldest_id)) + txn_global->oldest_id = oldest_id; + if (WT_TXNID_LT(txn_global->last_running, last_running)) { + txn_global->last_running = last_running; + + /* Output a verbose message about long-running transactions, + * but only when some progress is being made. 
*/ + if (WT_VERBOSE_ISSET(session, WT_VERB_TRANSACTION) && current_id - oldest_id > 10000 && + oldest_session != NULL) { + __wt_verbose(session, WT_VERB_TRANSACTION, + "old snapshot %" PRIu64 " pinned in session %" PRIu32 + " [%s]" + " with snap_min %" PRIu64, + oldest_id, oldest_session->id, oldest_session->lastop, oldest_session->txn.snap_min); + } + } + +done: + __wt_writeunlock(session, &txn_global->rwlock); + return (ret); } /* * __wt_txn_config -- - * Configure a transaction. + * Configure a transaction. */ int __wt_txn_config(WT_SESSION_IMPL *session, const char *cfg[]) { - WT_CONFIG_ITEM cval; - WT_TXN *txn; - wt_timestamp_t read_ts; - - txn = &session->txn; - - WT_RET(__wt_config_gets_def(session, cfg, "isolation", 0, &cval)); - if (cval.len != 0) - txn->isolation = - WT_STRING_MATCH("snapshot", cval.str, cval.len) ? - WT_ISO_SNAPSHOT : - WT_STRING_MATCH("read-committed", cval.str, cval.len) ? - WT_ISO_READ_COMMITTED : WT_ISO_READ_UNCOMMITTED; - - /* - * The default sync setting is inherited from the connection, but can - * be overridden by an explicit "sync" setting for this transaction. - * - * We want to distinguish between inheriting implicitly and explicitly. - */ - F_CLR(txn, WT_TXN_SYNC_SET); - WT_RET(__wt_config_gets_def( - session, cfg, "sync", (int)UINT_MAX, &cval)); - if (cval.val == 0 || cval.val == 1) - /* - * This is an explicit setting of sync. Set the flag so - * that we know not to overwrite it in commit_transaction. - */ - F_SET(txn, WT_TXN_SYNC_SET); - - /* - * If sync is turned off explicitly, clear the transaction's sync field. - */ - if (cval.val == 0) - txn->txn_logsync = 0; - - WT_RET(__wt_config_gets_def(session, cfg, "snapshot", 0, &cval)); - if (cval.len > 0) - /* - * The layering here isn't ideal - the named snapshot get - * function does both validation and setup. Otherwise we'd - * need to walk the list of named snapshots twice during - * transaction open. 
- */ - WT_RET(__wt_txn_named_snapshot_get(session, &cval)); - - /* Check if prepared updates should be ignored during reads. */ - WT_RET(__wt_config_gets_def(session, cfg, "ignore_prepare", 0, &cval)); - if (cval.len > 0 && - WT_STRING_MATCH("force", cval.str, cval.len)) - F_SET(txn, WT_TXN_IGNORE_PREPARE); - else if (cval.val) - F_SET(txn, WT_TXN_IGNORE_PREPARE | WT_TXN_READONLY); - - /* - * Check if the prepare timestamp and the commit timestamp of a - * prepared transaction need to be rounded up. - */ - WT_RET(__wt_config_gets_def( - session, cfg, "roundup_timestamps.prepared", 0, &cval)); - if (cval.val) - F_SET(txn, WT_TXN_TS_ROUND_PREPARED); - - /* Check if read timestamp needs to be rounded up. */ - WT_RET(__wt_config_gets_def( - session, cfg, "roundup_timestamps.read", 0, &cval)); - if (cval.val) - F_SET(txn, WT_TXN_TS_ROUND_READ); - - WT_RET(__wt_config_gets_def(session, cfg, "read_timestamp", 0, &cval)); - if (cval.len != 0) { - WT_RET(__wt_txn_parse_timestamp( - session, "read", &read_ts, &cval)); - WT_RET(__wt_txn_set_read_timestamp(session, read_ts)); - } - - return (0); + WT_CONFIG_ITEM cval; + WT_TXN *txn; + wt_timestamp_t read_ts; + + txn = &session->txn; + + WT_RET(__wt_config_gets_def(session, cfg, "isolation", 0, &cval)); + if (cval.len != 0) + txn->isolation = WT_STRING_MATCH("snapshot", cval.str, cval.len) ? + WT_ISO_SNAPSHOT : + WT_STRING_MATCH("read-committed", cval.str, cval.len) ? WT_ISO_READ_COMMITTED : + WT_ISO_READ_UNCOMMITTED; + + /* + * The default sync setting is inherited from the connection, but can + * be overridden by an explicit "sync" setting for this transaction. + * + * We want to distinguish between inheriting implicitly and explicitly. + */ + F_CLR(txn, WT_TXN_SYNC_SET); + WT_RET(__wt_config_gets_def(session, cfg, "sync", (int)UINT_MAX, &cval)); + if (cval.val == 0 || cval.val == 1) + /* + * This is an explicit setting of sync. Set the flag so that we know not to overwrite it in + * commit_transaction. 
+ */ + F_SET(txn, WT_TXN_SYNC_SET); + + /* + * If sync is turned off explicitly, clear the transaction's sync field. + */ + if (cval.val == 0) + txn->txn_logsync = 0; + + WT_RET(__wt_config_gets_def(session, cfg, "snapshot", 0, &cval)); + if (cval.len > 0) + /* + * The layering here isn't ideal - the named snapshot get function does both validation and + * setup. Otherwise we'd need to walk the list of named snapshots twice during transaction + * open. + */ + WT_RET(__wt_txn_named_snapshot_get(session, &cval)); + + /* Check if prepared updates should be ignored during reads. */ + WT_RET(__wt_config_gets_def(session, cfg, "ignore_prepare", 0, &cval)); + if (cval.len > 0 && WT_STRING_MATCH("force", cval.str, cval.len)) + F_SET(txn, WT_TXN_IGNORE_PREPARE); + else if (cval.val) + F_SET(txn, WT_TXN_IGNORE_PREPARE | WT_TXN_READONLY); + + /* + * Check if the prepare timestamp and the commit timestamp of a prepared transaction need to be + * rounded up. + */ + WT_RET(__wt_config_gets_def(session, cfg, "roundup_timestamps.prepared", 0, &cval)); + if (cval.val) + F_SET(txn, WT_TXN_TS_ROUND_PREPARED); + + /* Check if read timestamp needs to be rounded up. */ + WT_RET(__wt_config_gets_def(session, cfg, "roundup_timestamps.read", 0, &cval)); + if (cval.val) + F_SET(txn, WT_TXN_TS_ROUND_READ); + + WT_RET(__wt_config_gets_def(session, cfg, "read_timestamp", 0, &cval)); + if (cval.len != 0) { + WT_RET(__wt_txn_parse_timestamp(session, "read", &read_ts, &cval)); + WT_RET(__wt_txn_set_read_timestamp(session, read_ts)); + } + + return (0); } /* * __wt_txn_reconfigure -- - * WT_SESSION::reconfigure for transactions. + * WT_SESSION::reconfigure for transactions. 
*/ int __wt_txn_reconfigure(WT_SESSION_IMPL *session, const char *config) { - WT_CONFIG_ITEM cval; - WT_DECL_RET; - WT_TXN *txn; - - txn = &session->txn; - - ret = __wt_config_getones(session, config, "isolation", &cval); - if (ret == 0 && cval.len != 0) { - session->isolation = txn->isolation = - WT_STRING_MATCH("snapshot", cval.str, cval.len) ? - WT_ISO_SNAPSHOT : - WT_STRING_MATCH("read-uncommitted", cval.str, cval.len) ? - WT_ISO_READ_UNCOMMITTED : WT_ISO_READ_COMMITTED; - } - WT_RET_NOTFOUND_OK(ret); - - return (0); + WT_CONFIG_ITEM cval; + WT_DECL_RET; + WT_TXN *txn; + + txn = &session->txn; + + ret = __wt_config_getones(session, config, "isolation", &cval); + if (ret == 0 && cval.len != 0) { + session->isolation = txn->isolation = WT_STRING_MATCH("snapshot", cval.str, cval.len) ? + WT_ISO_SNAPSHOT : + WT_STRING_MATCH("read-uncommitted", cval.str, cval.len) ? WT_ISO_READ_UNCOMMITTED : + WT_ISO_READ_COMMITTED; + } + WT_RET_NOTFOUND_OK(ret); + + return (0); } /* * __wt_txn_release -- - * Release the resources associated with the current transaction. + * Release the resources associated with the current transaction. */ void __wt_txn_release(WT_SESSION_IMPL *session) { - WT_TXN *txn; - WT_TXN_GLOBAL *txn_global; - - txn = &session->txn; - txn_global = &S2C(session)->txn_global; - - WT_ASSERT(session, txn->mod_count == 0); - txn->notify = NULL; - - /* Clear the transaction's ID from the global table. */ - if (WT_SESSION_IS_CHECKPOINT(session)) { - WT_ASSERT(session, - WT_SESSION_TXN_STATE(session)->id == WT_TXN_NONE); - txn->id = txn_global->checkpoint_state.id = WT_TXN_NONE; - - /* - * Be extra careful to cleanup everything for checkpoints: once - * the global checkpoint ID is cleared, we can no longer tell - * if this session is doing a checkpoint. - */ - txn_global->checkpoint_id = 0; - } else if (F_ISSET(txn, WT_TXN_HAS_ID)) { - /* - * If transaction is prepared, this would have been done in - * prepare. 
- */ - if (!F_ISSET(txn, WT_TXN_PREPARE)) - __txn_remove_from_global_table(session); - txn->id = WT_TXN_NONE; - } - - __wt_txn_clear_durable_timestamp(session); - - /* Free the scratch buffer allocated for logging. */ - __wt_logrec_free(session, &txn->logrec); - - /* Discard any memory from the session's stash that we can. */ - WT_ASSERT(session, __wt_session_gen(session, WT_GEN_SPLIT) == 0); - __wt_stash_discard(session); - - /* - * Reset the transaction state to not running and release the snapshot. - */ - __wt_txn_release_snapshot(session); - txn->isolation = session->isolation; - - txn->rollback_reason = NULL; - - /* - * Ensure the transaction flags are cleared on exit - * - * Purposely do NOT clear the commit and durable timestamps on release. - * Other readers may still find these transactions in the durable queue - * and will need to see those timestamps. - */ - txn->flags = 0; - txn->prepare_timestamp = WT_TS_NONE; + WT_TXN *txn; + WT_TXN_GLOBAL *txn_global; + + txn = &session->txn; + txn_global = &S2C(session)->txn_global; + + WT_ASSERT(session, txn->mod_count == 0); + txn->notify = NULL; + + /* Clear the transaction's ID from the global table. */ + if (WT_SESSION_IS_CHECKPOINT(session)) { + WT_ASSERT(session, WT_SESSION_TXN_STATE(session)->id == WT_TXN_NONE); + txn->id = txn_global->checkpoint_state.id = WT_TXN_NONE; + + /* + * Be extra careful to cleanup everything for checkpoints: once the global checkpoint ID is + * cleared, we can no longer tell if this session is doing a checkpoint. + */ + txn_global->checkpoint_id = 0; + } else if (F_ISSET(txn, WT_TXN_HAS_ID)) { + /* + * If transaction is prepared, this would have been done in prepare. + */ + if (!F_ISSET(txn, WT_TXN_PREPARE)) + __txn_remove_from_global_table(session); + else + WT_ASSERT(session, WT_SESSION_TXN_STATE(session)->id == WT_TXN_NONE); + txn->id = WT_TXN_NONE; + } + + __wt_txn_clear_durable_timestamp(session); + + /* Free the scratch buffer allocated for logging. 
*/ + __wt_logrec_free(session, &txn->logrec); + + /* Discard any memory from the session's stash that we can. */ + WT_ASSERT(session, __wt_session_gen(session, WT_GEN_SPLIT) == 0); + __wt_stash_discard(session); + + /* + * Reset the transaction state to not running and release the snapshot. + */ + __wt_txn_release_snapshot(session); + txn->isolation = session->isolation; + + txn->rollback_reason = NULL; + + /* + * Ensure the transaction flags are cleared on exit + * + * Purposely do NOT clear the commit and durable timestamps on release. + * Other readers may still find these transactions in the durable queue + * and will need to see those timestamps. + */ + txn->flags = 0; + txn->prepare_timestamp = WT_TS_NONE; } /* * __txn_commit_timestamps_assert -- - * Validate that timestamps provided to commit are legal. + * Validate that timestamps provided to commit are legal. */ static inline int __txn_commit_timestamps_assert(WT_SESSION_IMPL *session) { - WT_CURSOR *cursor; - WT_DECL_RET; - WT_TXN *txn; - WT_TXN_OP *op; - WT_UPDATE *upd; - wt_timestamp_t durable_op_timestamp, op_timestamp, prev_op_timestamp; - u_int i; - const char *open_cursor_cfg[] = { - WT_CONFIG_BASE(session, WT_SESSION_open_cursor), NULL }; - bool op_zero_ts, upd_zero_ts; - - txn = &session->txn; - cursor = NULL; - durable_op_timestamp = prev_op_timestamp = WT_TS_NONE; - - /* - * Debugging checks on timestamps, if user requested them. 
- */ - if (F_ISSET(txn, WT_TXN_TS_COMMIT_ALWAYS) && - !F_ISSET(txn, WT_TXN_HAS_TS_COMMIT) && - txn->mod_count != 0) - WT_RET_MSG(session, EINVAL, "commit_timestamp required and " - "none set on this transaction"); - if (F_ISSET(txn, WT_TXN_TS_COMMIT_NEVER) && - F_ISSET(txn, WT_TXN_HAS_TS_COMMIT) && - txn->mod_count != 0) - WT_RET_MSG(session, EINVAL, "no commit_timestamp required and " - "timestamp set on this transaction"); - if (F_ISSET(txn, WT_TXN_TS_DURABLE_ALWAYS) && - !F_ISSET(txn, WT_TXN_HAS_TS_DURABLE) && - txn->mod_count != 0) - WT_RET_MSG(session, EINVAL, "durable_timestamp required and " - "none set on this transaction"); - if (F_ISSET(txn, WT_TXN_TS_DURABLE_NEVER) && - F_ISSET(txn, WT_TXN_HAS_TS_DURABLE) && - txn->mod_count != 0) - WT_RET_MSG(session, EINVAL, "no durable_timestamp required and " - "durable timestamp set on this transaction"); - - /* - * If we're not doing any key consistency checking, we're done. - */ - if (!F_ISSET(txn, WT_TXN_TS_COMMIT_KEYS | WT_TXN_TS_DURABLE_KEYS)) - return (0); - - /* - * Error on any valid update structures for the same key that - * are at a later timestamp or use timestamps inconsistently. - */ - for (i = 0, op = txn->mod; i < txn->mod_count; i++, op++) - if (op->type == WT_TXN_OP_BASIC_COL || - op->type == WT_TXN_OP_BASIC_ROW) { - /* - * Search for prepared updates, so that they will be - * restored, if moved to lookaside. 
- */ - if (F_ISSET(txn, WT_TXN_PREPARE)) { - WT_RET(__wt_open_cursor(session, - op->btree->dhandle->name, NULL, - open_cursor_cfg, &cursor)); - F_CLR(txn, WT_TXN_PREPARE); - if (op->type == WT_TXN_OP_BASIC_ROW) - __wt_cursor_set_raw_key( - cursor, &op->u.op_row.key); - else - ((WT_CURSOR_BTREE*)cursor)->iface.recno - = op->u.op_col.recno; - F_SET(txn, WT_TXN_PREPARE); - WT_WITH_BTREE(session, op->btree, - ret = __wt_btcur_search_uncommitted( - (WT_CURSOR_BTREE *)cursor, &upd)); - if (ret != 0) - WT_RET_MSG(session, EINVAL, - "prepared update restore failed"); - } else - upd = op->u.op_upd; - - WT_ASSERT(session, upd != NULL); - op_timestamp = upd->start_ts; - - /* - * Skip over any aborted update structures, internally - * created update structures or ones from our own - * transaction. - */ - while (upd != NULL && (upd->txnid == WT_TXN_ABORTED || - upd->txnid == WT_TXN_NONE || upd->txnid == txn->id)) - upd = upd->next; - - /* - * Check the timestamp on this update with the - * first valid update in the chain. They're in - * most recent order. - */ - if (upd != NULL) { - prev_op_timestamp = upd->start_ts; - durable_op_timestamp = upd->durable_ts; - } - - /* - * We no longer need to access the update structure so - * it's safe to release our reference to the page. - */ - if (cursor != NULL) { - WT_ASSERT( - session, F_ISSET(txn, WT_TXN_PREPARE)); - WT_RET(cursor->close(cursor)); - cursor = NULL; - } - - if (upd == NULL) - continue; - /* - * Check for consistent per-key timestamp usage. - * If timestamps are or are not used originally then - * they should be used the same way always. For this - * transaction, timestamps are in use anytime the - * commit timestamp is set. - * Check timestamps are used in order. 
- */ - op_zero_ts = !F_ISSET(txn, WT_TXN_HAS_TS_COMMIT); - upd_zero_ts = prev_op_timestamp == WT_TS_NONE; - if (op_zero_ts != upd_zero_ts) - WT_RET_MSG(session, EINVAL, - "per-key timestamps used inconsistently"); - /* - * If we aren't using timestamps for this transaction - * then we are done checking. Don't check the timestamp - * because the one in the transaction is not cleared. - */ - if (op_zero_ts) - continue; - - /* - * Only if the update structure doesn't have a timestamp - * then use the one in the transaction structure. - */ - if (op_timestamp == WT_TS_NONE) - op_timestamp = txn->commit_timestamp; - if (F_ISSET(txn, WT_TXN_TS_COMMIT_KEYS) && - op_timestamp < prev_op_timestamp) - WT_RET_MSG(session, EINVAL, - "out of order commit timestamps"); - if (F_ISSET(txn, WT_TXN_TS_DURABLE_KEYS) && - txn->durable_timestamp < durable_op_timestamp) - WT_RET_MSG(session, EINVAL, - "out of order durable timestamps"); - } - return (0); + WT_CURSOR *cursor; + WT_DECL_RET; + WT_TXN *txn; + WT_TXN_OP *op; + WT_UPDATE *upd; + wt_timestamp_t durable_op_timestamp, op_timestamp, prev_op_timestamp; + u_int i; + const char *open_cursor_cfg[] = {WT_CONFIG_BASE(session, WT_SESSION_open_cursor), NULL}; + bool op_zero_ts, upd_zero_ts; + + txn = &session->txn; + cursor = NULL; + durable_op_timestamp = prev_op_timestamp = WT_TS_NONE; + + /* + * Debugging checks on timestamps, if user requested them. 
+ */ + if (F_ISSET(txn, WT_TXN_TS_COMMIT_ALWAYS) && !F_ISSET(txn, WT_TXN_HAS_TS_COMMIT) && + txn->mod_count != 0) + WT_RET_MSG(session, EINVAL, + "commit_timestamp required and " + "none set on this transaction"); + if (F_ISSET(txn, WT_TXN_TS_COMMIT_NEVER) && F_ISSET(txn, WT_TXN_HAS_TS_COMMIT) && + txn->mod_count != 0) + WT_RET_MSG(session, EINVAL, + "no commit_timestamp required and " + "timestamp set on this transaction"); + if (F_ISSET(txn, WT_TXN_TS_DURABLE_ALWAYS) && !F_ISSET(txn, WT_TXN_HAS_TS_DURABLE) && + txn->mod_count != 0) + WT_RET_MSG(session, EINVAL, + "durable_timestamp required and " + "none set on this transaction"); + if (F_ISSET(txn, WT_TXN_TS_DURABLE_NEVER) && F_ISSET(txn, WT_TXN_HAS_TS_DURABLE) && + txn->mod_count != 0) + WT_RET_MSG(session, EINVAL, + "no durable_timestamp required and " + "durable timestamp set on this transaction"); + + /* + * If we're not doing any key consistency checking, we're done. + */ + if (!F_ISSET(txn, WT_TXN_TS_COMMIT_KEYS | WT_TXN_TS_DURABLE_KEYS)) + return (0); + + /* + * Error on any valid update structures for the same key that are at a later timestamp or use + * timestamps inconsistently. + */ + for (i = 0, op = txn->mod; i < txn->mod_count; i++, op++) + if (op->type == WT_TXN_OP_BASIC_COL || op->type == WT_TXN_OP_BASIC_ROW) { + /* + * Search for prepared updates, so that they will be restored, if moved to lookaside. 
+ */ + if (F_ISSET(txn, WT_TXN_PREPARE)) { + WT_RET(__wt_open_cursor( + session, op->btree->dhandle->name, NULL, open_cursor_cfg, &cursor)); + F_CLR(txn, WT_TXN_PREPARE); + if (op->type == WT_TXN_OP_BASIC_ROW) + __wt_cursor_set_raw_key(cursor, &op->u.op_row.key); + else + ((WT_CURSOR_BTREE *)cursor)->iface.recno = op->u.op_col.recno; + F_SET(txn, WT_TXN_PREPARE); + WT_WITH_BTREE(session, op->btree, + ret = __wt_btcur_search_uncommitted((WT_CURSOR_BTREE *)cursor, &upd)); + if (ret != 0) + WT_RET_MSG(session, EINVAL, "prepared update restore failed"); + } else + upd = op->u.op_upd; + + WT_ASSERT(session, upd != NULL); + op_timestamp = upd->start_ts; + + /* + * Skip over any aborted update structures, internally created update structures or ones + * from our own transaction. + */ + while (upd != NULL && + (upd->txnid == WT_TXN_ABORTED || upd->txnid == WT_TXN_NONE || upd->txnid == txn->id)) + upd = upd->next; + + /* + * Check the timestamp on this update with the first valid update in the chain. They're + * in most recent order. + */ + if (upd != NULL) { + prev_op_timestamp = upd->start_ts; + durable_op_timestamp = upd->durable_ts; + } + + /* + * We no longer need to access the update structure so it's safe to release our + * reference to the page. + */ + if (cursor != NULL) { + WT_ASSERT(session, F_ISSET(txn, WT_TXN_PREPARE)); + WT_RET(cursor->close(cursor)); + cursor = NULL; + } + + if (upd == NULL) + continue; + /* + * Check for consistent per-key timestamp usage. If timestamps are or are not used + * originally then they should be used the same way always. For this transaction, + * timestamps are in use anytime the commit timestamp is set. Check timestamps are used + * in order. 
+ */ + op_zero_ts = !F_ISSET(txn, WT_TXN_HAS_TS_COMMIT); + upd_zero_ts = prev_op_timestamp == WT_TS_NONE; + if (op_zero_ts != upd_zero_ts) + WT_RET_MSG(session, EINVAL, "per-key timestamps used inconsistently"); + /* + * If we aren't using timestamps for this transaction then we are done checking. Don't + * check the timestamp because the one in the transaction is not cleared. + */ + if (op_zero_ts) + continue; + + /* + * Only if the update structure doesn't have a timestamp then use the one in the + * transaction structure. + */ + if (op_timestamp == WT_TS_NONE) + op_timestamp = txn->commit_timestamp; + if (F_ISSET(txn, WT_TXN_TS_COMMIT_KEYS) && op_timestamp < prev_op_timestamp) + WT_RET_MSG(session, EINVAL, "out of order commit timestamps"); + if (F_ISSET(txn, WT_TXN_TS_DURABLE_KEYS) && + txn->durable_timestamp < durable_op_timestamp) + WT_RET_MSG(session, EINVAL, "out of order durable timestamps"); + } + return (0); } /* * __wt_txn_commit -- - * Commit the current transaction. + * Commit the current transaction. */ int __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[]) { - WT_CONFIG_ITEM cval; - WT_CONNECTION_IMPL *conn; - WT_DECL_RET; - WT_TXN *txn; - WT_TXN_GLOBAL *txn_global; - WT_TXN_OP *op; - WT_UPDATE *upd; - wt_timestamp_t candidate_durable_timestamp, prev_durable_timestamp; - int64_t resolved_update_count, visited_update_count; - uint32_t fileid; - u_int i; - bool locked, prepare, readonly, skip_update_assert, update_durable_ts; - - txn = &session->txn; - conn = S2C(session); - txn_global = &conn->txn_global; - locked = skip_update_assert = false; - resolved_update_count = visited_update_count = 0; - - WT_ASSERT(session, F_ISSET(txn, WT_TXN_RUNNING)); - WT_ASSERT(session, !F_ISSET(txn, WT_TXN_ERROR) || - txn->mod_count == 0); - - readonly = txn->mod_count == 0; - - prepare = F_ISSET(txn, WT_TXN_PREPARE); - - /* - * Clear the prepared round up flag if the transaction is not prepared. - * There is no rounding up to do in that case. 
- */ - if (!prepare) - F_CLR(txn, WT_TXN_TS_ROUND_PREPARED); - - /* Set the commit and the durable timestamps. */ - WT_ERR(__wt_txn_set_timestamp(session, cfg)); - - if (prepare) { - if (!F_ISSET(txn, WT_TXN_HAS_TS_COMMIT)) - WT_ERR_MSG(session, EINVAL, - "commit_timestamp is required for a prepared " - "transaction"); - - if (!F_ISSET(txn, WT_TXN_HAS_TS_DURABLE)) - WT_ERR_MSG(session, EINVAL, - "durable_timestamp is required for a prepared " - "transaction"); - - WT_ASSERT(session, - txn->prepare_timestamp <= txn->commit_timestamp); - } else { - if (F_ISSET(txn, WT_TXN_HAS_TS_PREPARE)) - WT_ERR_MSG(session, EINVAL, - "prepare timestamp is set for non-prepared " - "transaction"); - - if (F_ISSET(txn, WT_TXN_HAS_TS_DURABLE)) - WT_ERR_MSG(session, EINVAL, - "durable_timestamp should not be specified for " - "non-prepared transaction"); - } - - if (F_ISSET(txn, WT_TXN_HAS_TS_COMMIT)) - WT_ASSERT(session, - txn->commit_timestamp <= txn->durable_timestamp); - - WT_ERR(__txn_commit_timestamps_assert(session)); - - /* - * The default sync setting is inherited from the connection, but can - * be overridden by an explicit "sync" setting for this transaction. - */ - WT_ERR(__wt_config_gets_def(session, cfg, "sync", 0, &cval)); - - /* - * If the user chose the default setting, check whether sync is enabled - * for this transaction (either inherited or via begin_transaction). - * If sync is disabled, clear the field to avoid the log write being - * flushed. - * - * Otherwise check for specific settings. We don't need to check for - * "on" because that is the default inherited from the connection. If - * the user set anything in begin_transaction, we only override with an - * explicit setting. - */ - if (cval.len == 0) { - if (!FLD_ISSET(txn->txn_logsync, WT_LOG_SYNC_ENABLED) && - !F_ISSET(txn, WT_TXN_SYNC_SET)) - txn->txn_logsync = 0; - } else { - /* - * If the caller already set sync on begin_transaction then - * they should not be using sync on commit_transaction. 
- * Flag that as an error. - */ - if (F_ISSET(txn, WT_TXN_SYNC_SET)) - WT_ERR_MSG(session, EINVAL, - "Sync already set during begin_transaction"); - if (WT_STRING_MATCH("background", cval.str, cval.len)) - txn->txn_logsync = WT_LOG_BACKGROUND; - else if (WT_STRING_MATCH("off", cval.str, cval.len)) - txn->txn_logsync = 0; - /* - * We don't need to check for "on" here because that is the - * default to inherit from the connection setting. - */ - } - - /* Commit notification. */ - if (txn->notify != NULL) - WT_ERR(txn->notify->notify(txn->notify, - (WT_SESSION *)session, txn->id, 1)); - - /* - * We are about to release the snapshot: copy values into any - * positioned cursors so they don't point to updates that could be - * freed once we don't have a snapshot. - * If this transaction is prepared, then copying values would have been - * done during prepare. - */ - if (session->ncursors > 0 && !prepare) { - WT_DIAGNOSTIC_YIELD; - WT_ERR(__wt_session_copy_values(session)); - } - - /* If we are logging, write a commit log record. */ - if (txn->logrec != NULL && - FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED) && - !F_ISSET(session, WT_SESSION_NO_LOGGING)) { - /* - * We are about to block on I/O writing the log. - * Release our snapshot in case it is keeping data pinned. - * This is particularly important for checkpoints. - */ - __wt_txn_release_snapshot(session); - /* - * We hold the visibility lock for reading from the time - * we write our log record until the time we release our - * transaction so that the LSN any checkpoint gets will - * always reflect visible data. - */ - __wt_readlock(session, &txn_global->visibility_rwlock); - locked = true; - WT_ERR(__wt_txn_log_commit(session, cfg)); - } - - /* Note: we're going to commit: nothing can fail after this point. */ - - /* Process and free updates. 
*/ - for (i = 0, op = txn->mod; i < txn->mod_count; i++, op++) { - fileid = op->btree->id; - switch (op->type) { - case WT_TXN_OP_NONE: - break; - case WT_TXN_OP_BASIC_COL: - case WT_TXN_OP_BASIC_ROW: - case WT_TXN_OP_INMEM_COL: - case WT_TXN_OP_INMEM_ROW: - upd = op->u.op_upd; - - /* - * Need to resolve indirect references of transaction - * operation, in case of prepared transaction. - */ - if (!prepare) { - /* - * Switch reserved operations to abort to - * simplify obsolete update list truncation. - */ - if (upd->type == WT_UPDATE_RESERVE) { - upd->txnid = WT_TXN_ABORTED; - break; - } - - /* - * Writes to the lookaside file can be evicted - * as soon as they commit. - */ - if (conn->cache->las_fileid != 0 && - fileid == conn->cache->las_fileid) { - upd->txnid = WT_TXN_NONE; - break; - } - - __wt_txn_op_set_timestamp(session, op); - } else { - visited_update_count++; - /* - * If we have set the key repeated flag - * we can skip resolving prepared updates as - * it would have happened on a previous - * modification in this txn. - */ - if (!F_ISSET(op, WT_TXN_OP_KEY_REPEATED)) { - skip_update_assert = - skip_update_assert || - F_ISSET(op, WT_TXN_OP_KEY_RESERVED); - WT_ERR(__wt_txn_resolve_prepared_op( - session, op, true, - &resolved_update_count)); - } - - /* - * We should resolve at least one or more - * updates each time we call - * __wt_txn_resolve_prepared_op, as such - * resolved update count should never be less - * than visited update count. - */ - WT_ASSERT(session, - resolved_update_count >= - visited_update_count); - } - - break; - case WT_TXN_OP_REF_DELETE: - __wt_txn_op_set_timestamp(session, op); - break; - case WT_TXN_OP_TRUNCATE_COL: - case WT_TXN_OP_TRUNCATE_ROW: - /* Other operations don't need timestamps. 
*/ - break; - } - - __wt_txn_op_free(session, op); - } - WT_ASSERT(session, skip_update_assert || - resolved_update_count == visited_update_count); - WT_STAT_CONN_INCRV(session, txn_prepared_updates_resolved, - resolved_update_count); - - txn->mod_count = 0; - - /* - * If durable is set, we'll try to update the global durable timestamp - * with that value. If durable isn't set, durable is implied to be the - * the same as commit so we'll use that instead. - */ - candidate_durable_timestamp = WT_TS_NONE; - if (F_ISSET(txn, WT_TXN_HAS_TS_DURABLE)) - candidate_durable_timestamp = txn->durable_timestamp; - else if (F_ISSET(txn, WT_TXN_HAS_TS_COMMIT)) - candidate_durable_timestamp = txn->commit_timestamp; - - __wt_txn_release(session); - if (locked) - __wt_readunlock(session, &txn_global->visibility_rwlock); - - /* - * If we have made some updates visible, start a new commit generation: - * any cached snapshots have to be refreshed. - */ - if (!readonly) - WT_IGNORE_RET(__wt_gen_next(session, WT_GEN_COMMIT)); - - /* First check if we've made something durable in the future. */ - update_durable_ts = false; - prev_durable_timestamp = WT_TS_NONE; - if (candidate_durable_timestamp != WT_TS_NONE) { - prev_durable_timestamp = txn_global->durable_timestamp; - update_durable_ts = - candidate_durable_timestamp > prev_durable_timestamp; - } - - /* - * If it looks like we'll need to move the global durable timestamp, - * attempt atomic cas and re-check. - */ - if (update_durable_ts) - while (candidate_durable_timestamp > prev_durable_timestamp) { - if (__wt_atomic_cas64(&txn_global->durable_timestamp, - prev_durable_timestamp, - candidate_durable_timestamp)) { - txn_global->has_durable_timestamp = true; - break; - } - prev_durable_timestamp = txn_global->durable_timestamp; - } - - /* - * We're between transactions, if we need to block for eviction, it's - * a good time to do so. Note that we must ignore any error return - * because the user's data is committed. 
- */ - if (!readonly) - WT_IGNORE_RET( - __wt_cache_eviction_check(session, false, false, NULL)); - return (0); + WT_CONFIG_ITEM cval; + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + WT_TXN *txn; + WT_TXN_GLOBAL *txn_global; + WT_TXN_OP *op; + WT_UPDATE *upd; + wt_timestamp_t candidate_durable_timestamp, prev_durable_timestamp; + int64_t resolved_update_count, visited_update_count; + uint32_t fileid; + u_int i; + bool locked, prepare, readonly, skip_update_assert, update_durable_ts; + + txn = &session->txn; + conn = S2C(session); + txn_global = &conn->txn_global; + locked = skip_update_assert = false; + resolved_update_count = visited_update_count = 0; + + WT_ASSERT(session, F_ISSET(txn, WT_TXN_RUNNING)); + WT_ASSERT(session, !F_ISSET(txn, WT_TXN_ERROR) || txn->mod_count == 0); + + readonly = txn->mod_count == 0; + + prepare = F_ISSET(txn, WT_TXN_PREPARE); + + /* + * Clear the prepared round up flag if the transaction is not prepared. There is no rounding up + * to do in that case. + */ + if (!prepare) + F_CLR(txn, WT_TXN_TS_ROUND_PREPARED); + + /* Set the commit and the durable timestamps. 
*/ + WT_ERR(__wt_txn_set_timestamp(session, cfg)); + + if (prepare) { + if (!F_ISSET(txn, WT_TXN_HAS_TS_COMMIT)) + WT_ERR_MSG(session, EINVAL, + "commit_timestamp is required for a prepared " + "transaction"); + + if (!F_ISSET(txn, WT_TXN_HAS_TS_DURABLE)) + WT_ERR_MSG(session, EINVAL, + "durable_timestamp is required for a prepared " + "transaction"); + + WT_ASSERT(session, txn->prepare_timestamp <= txn->commit_timestamp); + } else { + if (F_ISSET(txn, WT_TXN_HAS_TS_PREPARE)) + WT_ERR_MSG(session, EINVAL, + "prepare timestamp is set for non-prepared " + "transaction"); + + if (F_ISSET(txn, WT_TXN_HAS_TS_DURABLE)) + WT_ERR_MSG(session, EINVAL, + "durable_timestamp should not be specified for " + "non-prepared transaction"); + } + + if (F_ISSET(txn, WT_TXN_HAS_TS_COMMIT)) + WT_ASSERT(session, txn->commit_timestamp <= txn->durable_timestamp); + + WT_ERR(__txn_commit_timestamps_assert(session)); + + /* + * The default sync setting is inherited from the connection, but can be overridden by an + * explicit "sync" setting for this transaction. + */ + WT_ERR(__wt_config_gets_def(session, cfg, "sync", 0, &cval)); + + /* + * If the user chose the default setting, check whether sync is enabled + * for this transaction (either inherited or via begin_transaction). + * If sync is disabled, clear the field to avoid the log write being + * flushed. + * + * Otherwise check for specific settings. We don't need to check for + * "on" because that is the default inherited from the connection. If + * the user set anything in begin_transaction, we only override with an + * explicit setting. + */ + if (cval.len == 0) { + if (!FLD_ISSET(txn->txn_logsync, WT_LOG_SYNC_ENABLED) && !F_ISSET(txn, WT_TXN_SYNC_SET)) + txn->txn_logsync = 0; + } else { + /* + * If the caller already set sync on begin_transaction then they should not be using sync on + * commit_transaction. Flag that as an error. 
+ */ + if (F_ISSET(txn, WT_TXN_SYNC_SET)) + WT_ERR_MSG(session, EINVAL, "Sync already set during begin_transaction"); + if (WT_STRING_MATCH("background", cval.str, cval.len)) + txn->txn_logsync = WT_LOG_BACKGROUND; + else if (WT_STRING_MATCH("off", cval.str, cval.len)) + txn->txn_logsync = 0; + /* + * We don't need to check for "on" here because that is the default to inherit from the + * connection setting. + */ + } + + /* Commit notification. */ + if (txn->notify != NULL) + WT_ERR(txn->notify->notify(txn->notify, (WT_SESSION *)session, txn->id, 1)); + + /* + * We are about to release the snapshot: copy values into any positioned cursors so they don't + * point to updates that could be freed once we don't have a snapshot. If this transaction is + * prepared, then copying values would have been done during prepare. + */ + if (session->ncursors > 0 && !prepare) { + WT_DIAGNOSTIC_YIELD; + WT_ERR(__wt_session_copy_values(session)); + } + + /* If we are logging, write a commit log record. */ + if (txn->logrec != NULL && FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED) && + !F_ISSET(session, WT_SESSION_NO_LOGGING)) { + /* + * We are about to block on I/O writing the log. Release our snapshot in case it is keeping + * data pinned. This is particularly important for checkpoints. + */ + __wt_txn_release_snapshot(session); + /* + * We hold the visibility lock for reading from the time we write our log record until the + * time we release our transaction so that the LSN any checkpoint gets will always reflect + * visible data. + */ + __wt_readlock(session, &txn_global->visibility_rwlock); + locked = true; + WT_ERR(__wt_txn_log_commit(session, cfg)); + } + + /* Note: we're going to commit: nothing can fail after this point. */ + + /* Process and free updates. 
*/ + for (i = 0, op = txn->mod; i < txn->mod_count; i++, op++) { + fileid = op->btree->id; + switch (op->type) { + case WT_TXN_OP_NONE: + break; + case WT_TXN_OP_BASIC_COL: + case WT_TXN_OP_BASIC_ROW: + case WT_TXN_OP_INMEM_COL: + case WT_TXN_OP_INMEM_ROW: + upd = op->u.op_upd; + + /* + * Need to resolve indirect references of transaction operation, in case of prepared + * transaction. + */ + if (!prepare) { + /* + * Switch reserved operations to abort to simplify obsolete update list truncation. + */ + if (upd->type == WT_UPDATE_RESERVE) { + upd->txnid = WT_TXN_ABORTED; + break; + } + + /* + * Writes to the lookaside file can be evicted as soon as they commit. + */ + if (conn->cache->las_fileid != 0 && fileid == conn->cache->las_fileid) { + upd->txnid = WT_TXN_NONE; + break; + } + + __wt_txn_op_set_timestamp(session, op); + } else { + visited_update_count++; + /* + * If we have set the key repeated flag we can skip resolving prepared updates as it + * would have happened on a previous modification in this txn. + */ + if (!F_ISSET(op, WT_TXN_OP_KEY_REPEATED)) { + skip_update_assert = skip_update_assert || F_ISSET(op, WT_TXN_OP_KEY_RESERVED); + WT_ERR(__wt_txn_resolve_prepared_op(session, op, true, &resolved_update_count)); + } + + /* + * We should resolve at least one or more + * updates each time we call + * __wt_txn_resolve_prepared_op, as such + * resolved update count should never be less + * than visited update count. + */ + WT_ASSERT(session, resolved_update_count >= visited_update_count); + } + + break; + case WT_TXN_OP_REF_DELETE: + __wt_txn_op_set_timestamp(session, op); + break; + case WT_TXN_OP_TRUNCATE_COL: + case WT_TXN_OP_TRUNCATE_ROW: + /* Other operations don't need timestamps. 
*/ + break; + } + + __wt_txn_op_free(session, op); + } + WT_ERR_ASSERT(session, skip_update_assert || resolved_update_count == visited_update_count, + EINVAL, "Number of resolved prepared updates: %" PRId64 + " does not match" + " number visited: %" PRId64, + resolved_update_count, visited_update_count); + WT_STAT_CONN_INCRV(session, txn_prepared_updates_resolved, resolved_update_count); + + txn->mod_count = 0; + + /* + * If durable is set, we'll try to update the global durable timestamp with that value. If + * durable isn't set, durable is implied to be the same as commit so we'll use that instead. + */ + candidate_durable_timestamp = WT_TS_NONE; + if (F_ISSET(txn, WT_TXN_HAS_TS_DURABLE)) + candidate_durable_timestamp = txn->durable_timestamp; + else if (F_ISSET(txn, WT_TXN_HAS_TS_COMMIT)) + candidate_durable_timestamp = txn->commit_timestamp; + + __wt_txn_release(session); + if (locked) + __wt_readunlock(session, &txn_global->visibility_rwlock); + + /* + * If we have made some updates visible, start a new commit generation: any cached snapshots + * have to be refreshed. + */ + if (!readonly) + WT_IGNORE_RET(__wt_gen_next(session, WT_GEN_COMMIT)); + + /* First check if we've made something durable in the future. */ + update_durable_ts = false; + prev_durable_timestamp = WT_TS_NONE; + if (candidate_durable_timestamp != WT_TS_NONE) { + prev_durable_timestamp = txn_global->durable_timestamp; + update_durable_ts = candidate_durable_timestamp > prev_durable_timestamp; + } + + /* + * If it looks like we'll need to move the global durable timestamp, attempt atomic cas and + * re-check. 
+ */ + if (update_durable_ts) + while (candidate_durable_timestamp > prev_durable_timestamp) { + if (__wt_atomic_cas64(&txn_global->durable_timestamp, prev_durable_timestamp, + candidate_durable_timestamp)) { + txn_global->has_durable_timestamp = true; + break; + } + prev_durable_timestamp = txn_global->durable_timestamp; + } + + /* + * We're between transactions, if we need to block for eviction, it's a good time to do so. Note + * that we must ignore any error return because the user's data is committed. + */ + if (!readonly) + WT_IGNORE_RET(__wt_cache_eviction_check(session, false, false, NULL)); + return (0); err: - /* - * If anything went wrong, roll back. - * - * !!! - * Nothing can fail after this point. - */ - if (locked) - __wt_readunlock(session, &txn_global->visibility_rwlock); - WT_TRET(__wt_txn_rollback(session, cfg)); - return (ret); + /* + * If anything went wrong, roll back. + * + * !!! + * Nothing can fail after this point. + */ + if (locked) + __wt_readunlock(session, &txn_global->visibility_rwlock); + WT_TRET(__wt_txn_rollback(session, cfg)); + return (ret); } /* * __wt_txn_prepare -- - * Prepare the current transaction. + * Prepare the current transaction. */ int __wt_txn_prepare(WT_SESSION_IMPL *session, const char *cfg[]) { - WT_TXN *txn; - WT_TXN_OP *op; - WT_UPDATE *upd; - u_int i; - - txn = &session->txn; - - WT_ASSERT(session, F_ISSET(txn, WT_TXN_RUNNING)); - WT_ASSERT(session, !F_ISSET(txn, WT_TXN_ERROR) || txn->mod_count == 0); - /* - * A transaction should not have updated any of the logged tables, - * if debug mode logging is not turned on. - */ - if (!FLD_ISSET(S2C(session)->log_flags, WT_CONN_LOG_DEBUG_MODE)) - WT_ASSERT(session, txn->logrec == NULL); - - /* Set the prepare timestamp. 
*/ - WT_RET(__wt_txn_set_timestamp(session, cfg)); - - if (!F_ISSET(txn, WT_TXN_HAS_TS_PREPARE)) - WT_RET_MSG(session, EINVAL, "prepare timestamp is not set"); - - /* - * We are about to release the snapshot: copy values into any - * positioned cursors so they don't point to updates that could be - * freed once we don't have a snapshot. - */ - if (session->ncursors > 0) { - WT_DIAGNOSTIC_YIELD; - WT_RET(__wt_session_copy_values(session)); - } - - /* - * Prepare updates, traverse the modification array in reverse order - * so that we visit the update chain in newest to oldest order - * allowing us to set the key repeated flag with reserved updates in - * the chain. - */ - for (i = txn->mod_count; i > 0; i--) { - op = &txn->mod[i - 1]; - /* Assert it's not an update to the lookaside file. */ - WT_ASSERT(session, S2C(session)->cache->las_fileid == 0 || - !F_ISSET(op->btree, WT_BTREE_LOOKASIDE)); - - /* Metadata updates should never be prepared. */ - WT_ASSERT(session, !WT_IS_METADATA(op->btree->dhandle)); - if (WT_IS_METADATA(op->btree->dhandle)) - continue; - - upd = op->u.op_upd; - - switch (op->type) { - case WT_TXN_OP_NONE: - break; - case WT_TXN_OP_BASIC_COL: - case WT_TXN_OP_BASIC_ROW: - case WT_TXN_OP_INMEM_COL: - case WT_TXN_OP_INMEM_ROW: - /* - * Switch reserved operation to abort to simplify - * obsolete update list truncation. The object free - * function clears the operation type so we don't - * try to visit this update again: it can be evicted. - */ - if (upd->type == WT_UPDATE_RESERVE) { - upd->txnid = WT_TXN_ABORTED; - __wt_txn_op_free(session, op); - break; - } - - /* Set prepare timestamp. */ - upd->start_ts = txn->prepare_timestamp; - - WT_PUBLISH(upd->prepare_state, WT_PREPARE_INPROGRESS); - op->u.op_upd = NULL; - WT_STAT_CONN_INCR(session, txn_prepared_updates_count); - /* - * Set the key repeated flag which tells us that we've - * got multiple updates to the same key by the same txn. - * This is later used in txn commit. 
- * - * When we see a reserved update we set the - * WT_UPDATE_RESERVED flag instead. We do this as we - * cannot know if our current update should specify the - * key repeated flag as we don't want to traverse the - * entire update chain to find out. i.e. if there is - * an update with our txnid after the reserved update - * we should set key repeated, but if there isn't we - * shouldn't. - */ - if (upd->next != NULL && - upd->txnid == upd->next->txnid) { - if (upd->next->type == WT_UPDATE_RESERVE) - F_SET(op, WT_TXN_OP_KEY_RESERVED); - else - F_SET(op, WT_TXN_OP_KEY_REPEATED); - } - break; - case WT_TXN_OP_REF_DELETE: - __wt_txn_op_apply_prepare_state( - session, op->u.ref, false); - break; - case WT_TXN_OP_TRUNCATE_COL: - case WT_TXN_OP_TRUNCATE_ROW: - /* Other operations don't need timestamps. */ - break; - } - } - - /* Set transaction state to prepare. */ - F_SET(&session->txn, WT_TXN_PREPARE); - - /* Release our snapshot in case it is keeping data pinned. */ - __wt_txn_release_snapshot(session); - - /* - * Clear the transaction's ID from the global table, to facilitate - * prepared data visibility, but not from local transaction structure. - */ - if (F_ISSET(txn, WT_TXN_HAS_ID)) - __txn_remove_from_global_table(session); - - return (0); + WT_TXN *txn; + WT_TXN_OP *op; + WT_UPDATE *upd; + u_int i; + + txn = &session->txn; + + WT_ASSERT(session, F_ISSET(txn, WT_TXN_RUNNING)); + WT_ASSERT(session, !F_ISSET(txn, WT_TXN_ERROR) || txn->mod_count == 0); + /* + * A transaction should not have updated any of the logged tables, if debug mode logging is not + * turned on. + */ + if (!FLD_ISSET(S2C(session)->log_flags, WT_CONN_LOG_DEBUG_MODE)) + WT_RET_ASSERT(session, txn->logrec == NULL, EINVAL, + "A transaction should not have been assigned a log" + " record if WT_CONN_LOG_DEBUG mode is not enabled"); + + /* Set the prepare timestamp. 
*/ + WT_RET(__wt_txn_set_timestamp(session, cfg)); + + if (!F_ISSET(txn, WT_TXN_HAS_TS_PREPARE)) + WT_RET_MSG(session, EINVAL, "prepare timestamp is not set"); + + /* + * We are about to release the snapshot: copy values into any positioned cursors so they don't + * point to updates that could be freed once we don't have a snapshot. + */ + if (session->ncursors > 0) { + WT_DIAGNOSTIC_YIELD; + WT_RET(__wt_session_copy_values(session)); + } + + /* + * Prepare updates, traverse the modification array in reverse order so that we visit the update + * chain in newest to oldest order allowing us to set the key repeated flag with reserved + * updates in the chain. + */ + for (i = txn->mod_count; i > 0; i--) { + op = &txn->mod[i - 1]; + /* Assert it's not an update to the lookaside file. */ + WT_ASSERT( + session, S2C(session)->cache->las_fileid == 0 || !F_ISSET(op->btree, WT_BTREE_LOOKASIDE)); + + /* Metadata updates should never be prepared. */ + WT_ASSERT(session, !WT_IS_METADATA(op->btree->dhandle)); + if (WT_IS_METADATA(op->btree->dhandle)) + continue; + + upd = op->u.op_upd; + + switch (op->type) { + case WT_TXN_OP_NONE: + break; + case WT_TXN_OP_BASIC_COL: + case WT_TXN_OP_BASIC_ROW: + case WT_TXN_OP_INMEM_COL: + case WT_TXN_OP_INMEM_ROW: + /* + * Switch reserved operation to abort to simplify obsolete update list truncation. The + * object free function clears the operation type so we don't try to visit this update + * again: it can be evicted. + */ + if (upd->type == WT_UPDATE_RESERVE) { + upd->txnid = WT_TXN_ABORTED; + __wt_txn_op_free(session, op); + break; + } + + /* Set prepare timestamp. */ + upd->start_ts = txn->prepare_timestamp; + + WT_PUBLISH(upd->prepare_state, WT_PREPARE_INPROGRESS); + op->u.op_upd = NULL; + WT_STAT_CONN_INCR(session, txn_prepared_updates_count); + /* + * Set the key repeated flag which tells us that we've + * got multiple updates to the same key by the same txn. + * This is later used in txn commit. 
+ * + * When we see a reserved update we set the + * WT_UPDATE_RESERVED flag instead. We do this as we + * cannot know if our current update should specify the + * key repeated flag as we don't want to traverse the + * entire update chain to find out. i.e. if there is + * an update with our txnid after the reserved update + * we should set key repeated, but if there isn't we + * shouldn't. + */ + if (upd->next != NULL && upd->txnid == upd->next->txnid) { + if (upd->next->type == WT_UPDATE_RESERVE) + F_SET(op, WT_TXN_OP_KEY_RESERVED); + else + F_SET(op, WT_TXN_OP_KEY_REPEATED); + } + break; + case WT_TXN_OP_REF_DELETE: + __wt_txn_op_apply_prepare_state(session, op->u.ref, false); + break; + case WT_TXN_OP_TRUNCATE_COL: + case WT_TXN_OP_TRUNCATE_ROW: + /* Other operations don't need timestamps. */ + break; + } + } + + /* Set transaction state to prepare. */ + F_SET(&session->txn, WT_TXN_PREPARE); + + /* Release our snapshot in case it is keeping data pinned. */ + __wt_txn_release_snapshot(session); + + /* + * Clear the transaction's ID from the global table, to facilitate prepared data visibility, but + * not from local transaction structure. + */ + if (F_ISSET(txn, WT_TXN_HAS_ID)) + __txn_remove_from_global_table(session); + + return (0); } /* * __wt_txn_rollback -- - * Roll back the current transaction. + * Roll back the current transaction. */ int __wt_txn_rollback(WT_SESSION_IMPL *session, const char *cfg[]) { - WT_DECL_RET; - WT_TXN *txn; - WT_TXN_OP *op; - WT_UPDATE *upd; - int64_t resolved_update_count, visited_update_count; - u_int i; - bool readonly, skip_update_assert; - - WT_UNUSED(cfg); - resolved_update_count = visited_update_count = 0; - txn = &session->txn; - readonly = txn->mod_count == 0; - skip_update_assert = false; - WT_ASSERT(session, F_ISSET(txn, WT_TXN_RUNNING)); - - /* Rollback notification. */ - if (txn->notify != NULL) - WT_TRET(txn->notify->notify(txn->notify, (WT_SESSION *)session, - txn->id, 0)); - - /* Rollback updates. 
*/ - for (i = 0, op = txn->mod; i < txn->mod_count; i++, op++) { - /* Assert it's not an update to the lookaside file. */ - WT_ASSERT(session, S2C(session)->cache->las_fileid == 0 || - !F_ISSET(op->btree, WT_BTREE_LOOKASIDE)); - - /* Metadata updates should never be rolled back. */ - WT_ASSERT(session, !WT_IS_METADATA(op->btree->dhandle)); - if (WT_IS_METADATA(op->btree->dhandle)) - continue; - - upd = op->u.op_upd; - - switch (op->type) { - case WT_TXN_OP_NONE: - break; - case WT_TXN_OP_BASIC_COL: - case WT_TXN_OP_BASIC_ROW: - case WT_TXN_OP_INMEM_COL: - case WT_TXN_OP_INMEM_ROW: - /* - * Need to resolve indirect references of transaction - * operation, in case of prepared transaction. - */ - if (F_ISSET(txn, WT_TXN_PREPARE)) { - visited_update_count++; - /* - * If we have set the key repeated flag - * we can skip resolving prepared updates as - * it would have happened on a previous - * modification in this txn. - */ - if (!F_ISSET(op, WT_TXN_OP_KEY_REPEATED)) { - skip_update_assert = - skip_update_assert || - F_ISSET(op, WT_TXN_OP_KEY_RESERVED); - WT_RET(__wt_txn_resolve_prepared_op( - session, op, false, - &resolved_update_count)); - } - /* - * We should resolve at least one or more - * updates each time we call - * __wt_txn_resolve_prepared_op, as such - * resolved update count should never be less - * than visited update count. - */ - WT_ASSERT(session, - resolved_update_count >= - visited_update_count); - } else { - WT_ASSERT(session, upd->txnid == txn->id || - upd->txnid == WT_TXN_ABORTED); - upd->txnid = WT_TXN_ABORTED; - } - break; - case WT_TXN_OP_REF_DELETE: - WT_TRET(__wt_delete_page_rollback(session, op->u.ref)); - break; - case WT_TXN_OP_TRUNCATE_COL: - case WT_TXN_OP_TRUNCATE_ROW: - /* - * Nothing to do: these operations are only logged for - * recovery. The in-memory changes will be rolled back - * with a combination of WT_TXN_OP_REF_DELETE and - * WT_TXN_OP_INMEM operations. 
- */ - break; - } - - __wt_txn_op_free(session, op); - } - WT_ASSERT(session, skip_update_assert || - resolved_update_count == visited_update_count); - WT_STAT_CONN_INCRV(session, txn_prepared_updates_resolved, - resolved_update_count); - - txn->mod_count = 0; - - __wt_txn_release(session); - /* - * We're between transactions, if we need to block for eviction, it's - * a good time to do so. Note that we must ignore any error return - * because the user's data is committed. - */ - if (!readonly) - WT_IGNORE_RET( - __wt_cache_eviction_check(session, false, false, NULL)); - return (ret); + WT_DECL_RET; + WT_TXN *txn; + WT_TXN_OP *op; + WT_UPDATE *upd; + int64_t resolved_update_count, visited_update_count; + u_int i; + bool readonly, skip_update_assert; + + WT_UNUSED(cfg); + resolved_update_count = visited_update_count = 0; + txn = &session->txn; + readonly = txn->mod_count == 0; + skip_update_assert = false; + WT_ASSERT(session, F_ISSET(txn, WT_TXN_RUNNING)); + + /* Rollback notification. */ + if (txn->notify != NULL) + WT_TRET(txn->notify->notify(txn->notify, (WT_SESSION *)session, txn->id, 0)); + + /* Rollback updates. */ + for (i = 0, op = txn->mod; i < txn->mod_count; i++, op++) { + /* Assert it's not an update to the lookaside file. */ + WT_ASSERT( + session, S2C(session)->cache->las_fileid == 0 || !F_ISSET(op->btree, WT_BTREE_LOOKASIDE)); + + /* Metadata updates should never be rolled back. */ + WT_ASSERT(session, !WT_IS_METADATA(op->btree->dhandle)); + if (WT_IS_METADATA(op->btree->dhandle)) + continue; + + upd = op->u.op_upd; + + switch (op->type) { + case WT_TXN_OP_NONE: + break; + case WT_TXN_OP_BASIC_COL: + case WT_TXN_OP_BASIC_ROW: + case WT_TXN_OP_INMEM_COL: + case WT_TXN_OP_INMEM_ROW: + /* + * Need to resolve indirect references of transaction operation, in case of prepared + * transaction. 
+ */ + if (F_ISSET(txn, WT_TXN_PREPARE)) { + visited_update_count++; + /* + * If we have set the key repeated flag we can skip resolving prepared updates as it + * would have happened on a previous modification in this txn. + */ + if (!F_ISSET(op, WT_TXN_OP_KEY_REPEATED)) { + skip_update_assert = skip_update_assert || F_ISSET(op, WT_TXN_OP_KEY_RESERVED); + WT_RET( + __wt_txn_resolve_prepared_op(session, op, false, &resolved_update_count)); + } + /* + * We should resolve at least one or more + * updates each time we call + * __wt_txn_resolve_prepared_op, as such + * resolved update count should never be less + * than visited update count. + */ + WT_ASSERT(session, resolved_update_count >= visited_update_count); + } else { + WT_ASSERT(session, upd->txnid == txn->id || upd->txnid == WT_TXN_ABORTED); + upd->txnid = WT_TXN_ABORTED; + } + break; + case WT_TXN_OP_REF_DELETE: + WT_TRET(__wt_delete_page_rollback(session, op->u.ref)); + break; + case WT_TXN_OP_TRUNCATE_COL: + case WT_TXN_OP_TRUNCATE_ROW: + /* + * Nothing to do: these operations are only logged for recovery. The in-memory changes + * will be rolled back with a combination of WT_TXN_OP_REF_DELETE and WT_TXN_OP_INMEM + * operations. + */ + break; + } + + __wt_txn_op_free(session, op); + } + WT_RET_ASSERT(session, skip_update_assert || resolved_update_count == visited_update_count, + EINVAL, "Number of resolved prepared updates: %" PRId64 + " does not match" + " number visited: %" PRId64, + resolved_update_count, visited_update_count); + WT_STAT_CONN_INCRV(session, txn_prepared_updates_resolved, resolved_update_count); + + txn->mod_count = 0; + + __wt_txn_release(session); + /* + * We're between transactions, if we need to block for eviction, it's a good time to do so. Note + * that we must ignore any error return because the user's data is committed. 
+ */ + if (!readonly) + WT_IGNORE_RET(__wt_cache_eviction_check(session, false, false, NULL)); + return (ret); } /* * __wt_txn_rollback_required -- - * Prepare to log a reason if the user attempts to use the transaction to - * do anything other than rollback. + * Prepare to log a reason if the user attempts to use the transaction to do anything other than + * rollback. */ int __wt_txn_rollback_required(WT_SESSION_IMPL *session, const char *reason) { - session->txn.rollback_reason = reason; - return (WT_ROLLBACK); + session->txn.rollback_reason = reason; + return (WT_ROLLBACK); } /* * __wt_txn_init -- - * Initialize a session's transaction data. + * Initialize a session's transaction data. */ int __wt_txn_init(WT_SESSION_IMPL *session, WT_SESSION_IMPL *session_ret) { - WT_TXN *txn; + WT_TXN *txn; - txn = &session_ret->txn; - txn->id = WT_TXN_NONE; + txn = &session_ret->txn; + txn->id = WT_TXN_NONE; - WT_RET(__wt_calloc_def(session, - S2C(session_ret)->session_size, &txn->snapshot)); + WT_RET(__wt_calloc_def(session, S2C(session_ret)->session_size, &txn->snapshot)); #ifdef HAVE_DIAGNOSTIC - if (S2C(session_ret)->txn_global.states != NULL) { - WT_TXN_STATE *txn_state; - txn_state = WT_SESSION_TXN_STATE(session_ret); - WT_ASSERT(session, txn_state->pinned_id == WT_TXN_NONE); - } + if (S2C(session_ret)->txn_global.states != NULL) { + WT_TXN_STATE *txn_state; + txn_state = WT_SESSION_TXN_STATE(session_ret); + WT_ASSERT(session, txn_state->pinned_id == WT_TXN_NONE); + } #endif - /* - * Take care to clean these out in case we are reusing the transaction - * for eviction. - */ - txn->mod = NULL; + /* + * Take care to clean these out in case we are reusing the transaction for eviction. + */ + txn->mod = NULL; - txn->isolation = session_ret->isolation; - return (0); + txn->isolation = session_ret->isolation; + return (0); } /* * __wt_txn_stats_update -- - * Update the transaction statistics for return to the application. 
+ * Update the transaction statistics for return to the application. */ void __wt_txn_stats_update(WT_SESSION_IMPL *session) { - WT_CONNECTION_IMPL *conn; - WT_CONNECTION_STATS **stats; - WT_TXN_GLOBAL *txn_global; - wt_timestamp_t checkpoint_timestamp; - wt_timestamp_t durable_timestamp; - wt_timestamp_t oldest_active_read_timestamp; - wt_timestamp_t pinned_timestamp; - uint64_t checkpoint_pinned, snapshot_pinned; - - conn = S2C(session); - txn_global = &conn->txn_global; - stats = conn->stats; - checkpoint_pinned = txn_global->checkpoint_state.pinned_id; - snapshot_pinned = txn_global->nsnap_oldest_id; - - WT_STAT_SET(session, stats, txn_pinned_range, - txn_global->current - txn_global->oldest_id); - - checkpoint_timestamp = txn_global->checkpoint_timestamp; - durable_timestamp = txn_global->durable_timestamp; - pinned_timestamp = txn_global->pinned_timestamp; - if (checkpoint_timestamp != WT_TS_NONE && - checkpoint_timestamp < pinned_timestamp) - pinned_timestamp = checkpoint_timestamp; - WT_STAT_SET(session, stats, txn_pinned_timestamp, - durable_timestamp - pinned_timestamp); - WT_STAT_SET(session, stats, txn_pinned_timestamp_checkpoint, - durable_timestamp - checkpoint_timestamp); - WT_STAT_SET(session, stats, txn_pinned_timestamp_oldest, - durable_timestamp - txn_global->oldest_timestamp); - - if (__wt_txn_get_pinned_timestamp( - session, &oldest_active_read_timestamp, 0) == 0) { - WT_STAT_SET(session, stats, - txn_timestamp_oldest_active_read, - oldest_active_read_timestamp); - WT_STAT_SET(session, stats, - txn_pinned_timestamp_reader, - durable_timestamp - oldest_active_read_timestamp); - } else { - WT_STAT_SET(session, - stats, txn_timestamp_oldest_active_read, 0); - WT_STAT_SET(session, - stats, txn_pinned_timestamp_reader, 0); - } - - WT_STAT_SET(session, stats, txn_pinned_snapshot_range, - snapshot_pinned == WT_TXN_NONE ? 
- 0 : txn_global->current - snapshot_pinned); - - WT_STAT_SET(session, stats, txn_pinned_checkpoint_range, - checkpoint_pinned == WT_TXN_NONE ? - 0 : txn_global->current - checkpoint_pinned); - - WT_STAT_SET( - session, stats, txn_checkpoint_time_max, conn->ckpt_time_max); - WT_STAT_SET( - session, stats, txn_checkpoint_time_min, conn->ckpt_time_min); - WT_STAT_SET( - session, stats, txn_checkpoint_time_recent, conn->ckpt_time_recent); - WT_STAT_SET( - session, stats, txn_checkpoint_time_total, conn->ckpt_time_total); - WT_STAT_SET(session, - stats, txn_durable_queue_len, txn_global->durable_timestampq_len); - WT_STAT_SET(session, - stats, txn_read_queue_len, txn_global->read_timestampq_len); + WT_CONNECTION_IMPL *conn; + WT_CONNECTION_STATS **stats; + WT_TXN_GLOBAL *txn_global; + wt_timestamp_t checkpoint_timestamp; + wt_timestamp_t durable_timestamp; + wt_timestamp_t oldest_active_read_timestamp; + wt_timestamp_t pinned_timestamp; + uint64_t checkpoint_pinned, snapshot_pinned; + + conn = S2C(session); + txn_global = &conn->txn_global; + stats = conn->stats; + checkpoint_pinned = txn_global->checkpoint_state.pinned_id; + snapshot_pinned = txn_global->nsnap_oldest_id; + + WT_STAT_SET(session, stats, txn_pinned_range, txn_global->current - txn_global->oldest_id); + + checkpoint_timestamp = txn_global->checkpoint_timestamp; + durable_timestamp = txn_global->durable_timestamp; + pinned_timestamp = txn_global->pinned_timestamp; + if (checkpoint_timestamp != WT_TS_NONE && checkpoint_timestamp < pinned_timestamp) + pinned_timestamp = checkpoint_timestamp; + WT_STAT_SET(session, stats, txn_pinned_timestamp, durable_timestamp - pinned_timestamp); + WT_STAT_SET( + session, stats, txn_pinned_timestamp_checkpoint, durable_timestamp - checkpoint_timestamp); + WT_STAT_SET(session, stats, txn_pinned_timestamp_oldest, + durable_timestamp - txn_global->oldest_timestamp); + + if (__wt_txn_get_pinned_timestamp(session, &oldest_active_read_timestamp, 0) == 0) { + WT_STAT_SET(session, 
stats, txn_timestamp_oldest_active_read, oldest_active_read_timestamp); + WT_STAT_SET(session, stats, txn_pinned_timestamp_reader, + durable_timestamp - oldest_active_read_timestamp); + } else { + WT_STAT_SET(session, stats, txn_timestamp_oldest_active_read, 0); + WT_STAT_SET(session, stats, txn_pinned_timestamp_reader, 0); + } + + WT_STAT_SET(session, stats, txn_pinned_snapshot_range, + snapshot_pinned == WT_TXN_NONE ? 0 : txn_global->current - snapshot_pinned); + + WT_STAT_SET(session, stats, txn_pinned_checkpoint_range, + checkpoint_pinned == WT_TXN_NONE ? 0 : txn_global->current - checkpoint_pinned); + + WT_STAT_SET(session, stats, txn_checkpoint_time_max, conn->ckpt_time_max); + WT_STAT_SET(session, stats, txn_checkpoint_time_min, conn->ckpt_time_min); + WT_STAT_SET(session, stats, txn_checkpoint_time_recent, conn->ckpt_time_recent); + WT_STAT_SET(session, stats, txn_checkpoint_time_total, conn->ckpt_time_total); + WT_STAT_SET(session, stats, txn_durable_queue_len, txn_global->durable_timestampq_len); + WT_STAT_SET(session, stats, txn_read_queue_len, txn_global->read_timestampq_len); } /* * __wt_txn_release_resources -- - * Release resources for a session's transaction data. + * Release resources for a session's transaction data. */ void __wt_txn_release_resources(WT_SESSION_IMPL *session) { - WT_TXN *txn; + WT_TXN *txn; - txn = &session->txn; + txn = &session->txn; - WT_ASSERT(session, txn->mod_count == 0); - __wt_free(session, txn->mod); - txn->mod_alloc = 0; - txn->mod_count = 0; + WT_ASSERT(session, txn->mod_count == 0); + __wt_free(session, txn->mod); + txn->mod_alloc = 0; + txn->mod_count = 0; } /* * __wt_txn_destroy -- - * Destroy a session's transaction data. + * Destroy a session's transaction data. 
*/ void __wt_txn_destroy(WT_SESSION_IMPL *session) { - __wt_txn_release_resources(session); - __wt_free(session, session->txn.snapshot); + __wt_txn_release_resources(session); + __wt_free(session, session->txn.snapshot); } /* * __wt_txn_global_init -- - * Initialize the global transaction state. + * Initialize the global transaction state. */ int __wt_txn_global_init(WT_SESSION_IMPL *session, const char *cfg[]) { - WT_CONNECTION_IMPL *conn; - WT_TXN_GLOBAL *txn_global; - WT_TXN_STATE *s; - u_int i; + WT_CONNECTION_IMPL *conn; + WT_TXN_GLOBAL *txn_global; + WT_TXN_STATE *s; + u_int i; - WT_UNUSED(cfg); - conn = S2C(session); + WT_UNUSED(cfg); + conn = S2C(session); - txn_global = &conn->txn_global; - txn_global->current = txn_global->last_running = - txn_global->metadata_pinned = txn_global->oldest_id = WT_TXN_FIRST; + txn_global = &conn->txn_global; + txn_global->current = txn_global->last_running = txn_global->metadata_pinned = + txn_global->oldest_id = WT_TXN_FIRST; - WT_RET(__wt_spin_init( - session, &txn_global->id_lock, "transaction id lock")); - WT_RWLOCK_INIT_TRACKED(session, &txn_global->rwlock, txn_global); - WT_RET(__wt_rwlock_init(session, &txn_global->visibility_rwlock)); + WT_RET(__wt_spin_init(session, &txn_global->id_lock, "transaction id lock")); + WT_RWLOCK_INIT_TRACKED(session, &txn_global->rwlock, txn_global); + WT_RET(__wt_rwlock_init(session, &txn_global->visibility_rwlock)); - WT_RWLOCK_INIT_TRACKED(session, - &txn_global->durable_timestamp_rwlock, durable_timestamp); - TAILQ_INIT(&txn_global->durable_timestamph); + WT_RWLOCK_INIT_TRACKED(session, &txn_global->durable_timestamp_rwlock, durable_timestamp); + TAILQ_INIT(&txn_global->durable_timestamph); - WT_RWLOCK_INIT_TRACKED(session, - &txn_global->read_timestamp_rwlock, read_timestamp); - TAILQ_INIT(&txn_global->read_timestamph); + WT_RWLOCK_INIT_TRACKED(session, &txn_global->read_timestamp_rwlock, read_timestamp); + TAILQ_INIT(&txn_global->read_timestamph); - 
WT_RET(__wt_rwlock_init(session, &txn_global->nsnap_rwlock)); - txn_global->nsnap_oldest_id = WT_TXN_NONE; - TAILQ_INIT(&txn_global->nsnaph); + WT_RET(__wt_rwlock_init(session, &txn_global->nsnap_rwlock)); + txn_global->nsnap_oldest_id = WT_TXN_NONE; + TAILQ_INIT(&txn_global->nsnaph); - WT_RET(__wt_calloc_def( - session, conn->session_size, &txn_global->states)); + WT_RET(__wt_calloc_def(session, conn->session_size, &txn_global->states)); - for (i = 0, s = txn_global->states; i < conn->session_size; i++, s++) - s->id = s->metadata_pinned = s->pinned_id = WT_TXN_NONE; + for (i = 0, s = txn_global->states; i < conn->session_size; i++, s++) + s->id = s->metadata_pinned = s->pinned_id = WT_TXN_NONE; - return (0); + return (0); } /* * __wt_txn_global_destroy -- - * Destroy the global transaction state. + * Destroy the global transaction state. */ void __wt_txn_global_destroy(WT_SESSION_IMPL *session) { - WT_CONNECTION_IMPL *conn; - WT_TXN_GLOBAL *txn_global; - - conn = S2C(session); - txn_global = &conn->txn_global; - - if (txn_global == NULL) - return; - - __wt_spin_destroy(session, &txn_global->id_lock); - __wt_rwlock_destroy(session, &txn_global->rwlock); - __wt_rwlock_destroy(session, &txn_global->durable_timestamp_rwlock); - __wt_rwlock_destroy(session, &txn_global->read_timestamp_rwlock); - __wt_rwlock_destroy(session, &txn_global->nsnap_rwlock); - __wt_rwlock_destroy(session, &txn_global->visibility_rwlock); - __wt_free(session, txn_global->states); + WT_CONNECTION_IMPL *conn; + WT_TXN_GLOBAL *txn_global; + + conn = S2C(session); + txn_global = &conn->txn_global; + + if (txn_global == NULL) + return; + + __wt_spin_destroy(session, &txn_global->id_lock); + __wt_rwlock_destroy(session, &txn_global->rwlock); + __wt_rwlock_destroy(session, &txn_global->durable_timestamp_rwlock); + __wt_rwlock_destroy(session, &txn_global->read_timestamp_rwlock); + __wt_rwlock_destroy(session, &txn_global->nsnap_rwlock); + __wt_rwlock_destroy(session, &txn_global->visibility_rwlock); 
+ __wt_free(session, txn_global->states); } /* * __wt_txn_activity_drain -- - * Wait for transactions to quiesce. + * Wait for transactions to quiesce. */ int __wt_txn_activity_drain(WT_SESSION_IMPL *session) { - bool txn_active; - - /* - * It's possible that the eviction server is in the middle of a long - * operation, with a transaction ID pinned. In that case, we will loop - * here until the transaction ID is released, when the oldest - * transaction ID will catch up with the current ID. - */ - for (;;) { - WT_RET(__wt_txn_activity_check(session, &txn_active)); - if (!txn_active) - break; - - WT_STAT_CONN_INCR(session, txn_release_blocked); - __wt_yield(); - } - - return (0); + bool txn_active; + + /* + * It's possible that the eviction server is in the middle of a long operation, with a + * transaction ID pinned. In that case, we will loop here until the transaction ID is released, + * when the oldest transaction ID will catch up with the current ID. + */ + for (;;) { + WT_RET(__wt_txn_activity_check(session, &txn_active)); + if (!txn_active) + break; + + WT_STAT_CONN_INCR(session, txn_release_blocked); + __wt_yield(); + } + + return (0); } /* * __wt_txn_global_shutdown -- - * Shut down the global transaction state. + * Shut down the global transaction state. */ void __wt_txn_global_shutdown(WT_SESSION_IMPL *session) { - /* - * All application transactions have completed, ignore the pinned - * timestamp so that updates can be evicted from the cache during - * connection close. - * - * Note that we are relying on a special case in __wt_txn_visible_all - * that returns true during close when there is no pinned timestamp - * set. - */ - S2C(session)->txn_global.has_pinned_timestamp = false; + /* + * All application transactions have completed, ignore the pinned + * timestamp so that updates can be evicted from the cache during + * connection close. 
+ * + * Note that we are relying on a special case in __wt_txn_visible_all + * that returns true during close when there is no pinned timestamp + * set. + */ + S2C(session)->txn_global.has_pinned_timestamp = false; } /* * __wt_verbose_dump_txn_one -- - * Output diagnostic information about a transaction structure. + * Output diagnostic information about a transaction structure. */ int __wt_verbose_dump_txn_one(WT_SESSION_IMPL *session, WT_TXN *txn) { - const char *iso_tag; - char ts_string[5][WT_TS_INT_STRING_SIZE]; - - WT_NOT_READ(iso_tag, "INVALID"); - switch (txn->isolation) { - case WT_ISO_READ_COMMITTED: - iso_tag = "WT_ISO_READ_COMMITTED"; - break; - case WT_ISO_READ_UNCOMMITTED: - iso_tag = "WT_ISO_READ_UNCOMMITTED"; - break; - case WT_ISO_SNAPSHOT: - iso_tag = "WT_ISO_SNAPSHOT"; - break; - } - WT_RET(__wt_msg(session, - "transaction id: %" PRIu64 - ", mod count: %u" - ", snap min: %" PRIu64 - ", snap max: %" PRIu64 - ", snapshot count: %u" - ", commit_timestamp: %s" - ", durable_timestamp: %s" - ", first_commit_timestamp: %s" - ", prepare_timestamp: %s" - ", read_timestamp: %s" - ", checkpoint LSN: [%" PRIu32 "][%" PRIu32 "]" - ", full checkpoint: %s" - ", rollback reason: %s" - ", flags: 0x%08" PRIx32 - ", isolation: %s", - txn->id, - txn->mod_count, - txn->snap_min, - txn->snap_max, - txn->snapshot_count, - __wt_timestamp_to_string(txn->commit_timestamp, ts_string[0]), - __wt_timestamp_to_string(txn->durable_timestamp, ts_string[1]), - __wt_timestamp_to_string(txn->first_commit_timestamp, ts_string[2]), - __wt_timestamp_to_string(txn->prepare_timestamp, ts_string[3]), - __wt_timestamp_to_string(txn->read_timestamp, ts_string[4]), - txn->ckpt_lsn.l.file, txn->ckpt_lsn.l.offset, - txn->full_ckpt ? "true" : "false", - txn->rollback_reason == NULL ? 
"" : txn->rollback_reason, - txn->flags, - iso_tag)); - return (0); + char ts_string[5][WT_TS_INT_STRING_SIZE]; + const char *iso_tag; + + WT_NOT_READ(iso_tag, "INVALID"); + switch (txn->isolation) { + case WT_ISO_READ_COMMITTED: + iso_tag = "WT_ISO_READ_COMMITTED"; + break; + case WT_ISO_READ_UNCOMMITTED: + iso_tag = "WT_ISO_READ_UNCOMMITTED"; + break; + case WT_ISO_SNAPSHOT: + iso_tag = "WT_ISO_SNAPSHOT"; + break; + } + WT_RET(__wt_msg(session, "transaction id: %" PRIu64 ", mod count: %u" + ", snap min: %" PRIu64 ", snap max: %" PRIu64 ", snapshot count: %u" + ", commit_timestamp: %s" + ", durable_timestamp: %s" + ", first_commit_timestamp: %s" + ", prepare_timestamp: %s" + ", read_timestamp: %s" + ", checkpoint LSN: [%" PRIu32 "][%" PRIu32 "]" + ", full checkpoint: %s" + ", rollback reason: %s" + ", flags: 0x%08" PRIx32 ", isolation: %s", + txn->id, txn->mod_count, txn->snap_min, txn->snap_max, txn->snapshot_count, + __wt_timestamp_to_string(txn->commit_timestamp, ts_string[0]), + __wt_timestamp_to_string(txn->durable_timestamp, ts_string[1]), + __wt_timestamp_to_string(txn->first_commit_timestamp, ts_string[2]), + __wt_timestamp_to_string(txn->prepare_timestamp, ts_string[3]), + __wt_timestamp_to_string(txn->read_timestamp, ts_string[4]), txn->ckpt_lsn.l.file, + txn->ckpt_lsn.l.offset, txn->full_ckpt ? "true" : "false", + txn->rollback_reason == NULL ? "" : txn->rollback_reason, txn->flags, iso_tag)); + return (0); } /* * __wt_verbose_dump_txn -- - * Output diagnostic information about the global transaction state. + * Output diagnostic information about the global transaction state. 
*/ int __wt_verbose_dump_txn(WT_SESSION_IMPL *session) { - WT_CONNECTION_IMPL *conn; - WT_SESSION_IMPL *sess; - WT_TXN_GLOBAL *txn_global; - WT_TXN_STATE *s; - uint64_t id; - uint32_t i, session_cnt; - char ts_string[WT_TS_INT_STRING_SIZE]; - - conn = S2C(session); - txn_global = &conn->txn_global; - - WT_RET(__wt_msg(session, "%s", WT_DIVIDER)); - WT_RET(__wt_msg(session, "transaction state dump")); - - WT_RET(__wt_msg(session, "current ID: %" PRIu64, txn_global->current)); - WT_RET(__wt_msg(session, - "last running ID: %" PRIu64, txn_global->last_running)); - WT_RET(__wt_msg(session, - "metadata_pinned ID: %" PRIu64, txn_global->metadata_pinned)); - WT_RET(__wt_msg(session, "oldest ID: %" PRIu64, txn_global->oldest_id)); - - WT_RET(__wt_msg(session, "durable timestamp: %s", - __wt_timestamp_to_string( - txn_global->durable_timestamp, ts_string))); - WT_RET(__wt_msg(session, "oldest timestamp: %s", - __wt_timestamp_to_string(txn_global->oldest_timestamp, ts_string))); - WT_RET(__wt_msg(session, "pinned timestamp: %s", - __wt_timestamp_to_string(txn_global->pinned_timestamp, ts_string))); - WT_RET(__wt_msg(session, "stable timestamp: %s", - __wt_timestamp_to_string(txn_global->stable_timestamp, ts_string))); - WT_RET(__wt_msg(session, "has_durable_timestamp: %s", - txn_global->has_durable_timestamp ? "yes" : "no")); - WT_RET(__wt_msg(session, "has_oldest_timestamp: %s", - txn_global->has_oldest_timestamp ? "yes" : "no")); - WT_RET(__wt_msg(session, "has_pinned_timestamp: %s", - txn_global->has_pinned_timestamp ? "yes" : "no")); - WT_RET(__wt_msg(session, "has_stable_timestamp: %s", - txn_global->has_stable_timestamp ? "yes" : "no")); - WT_RET(__wt_msg(session, "oldest_is_pinned: %s", - txn_global->oldest_is_pinned ? "yes" : "no")); - WT_RET(__wt_msg(session, "stable_is_pinned: %s", - txn_global->stable_is_pinned ? "yes" : "no")); - - WT_RET(__wt_msg(session, "checkpoint running: %s", - txn_global->checkpoint_running ? 
"yes" : "no")); - WT_RET(__wt_msg(session, "checkpoint generation: %" PRIu64, - __wt_gen(session, WT_GEN_CHECKPOINT))); - WT_RET(__wt_msg(session, "checkpoint pinned ID: %" PRIu64, - txn_global->checkpoint_state.pinned_id)); - WT_RET(__wt_msg(session, "checkpoint txn ID: %" PRIu64, - txn_global->checkpoint_state.id)); - - WT_RET(__wt_msg(session, - "oldest named snapshot ID: %" PRIu64, txn_global->nsnap_oldest_id)); - - WT_ORDERED_READ(session_cnt, conn->session_cnt); - WT_RET(__wt_msg(session, "session count: %" PRIu32, session_cnt)); - WT_RET(__wt_msg(session, "Transaction state of active sessions:")); - - /* - * Walk each session transaction state and dump information. Accessing - * the content of session handles is not thread safe, so some - * information may change while traversing if other threads are active - * at the same time, which is OK since this is diagnostic code. - */ - for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) { - /* Skip sessions with no active transaction */ - if ((id = s->id) == WT_TXN_NONE && s->pinned_id == WT_TXN_NONE) - continue; - sess = &conn->sessions[i]; - WT_RET(__wt_msg(session, - "ID: %" PRIu64 - ", pinned ID: %" PRIu64 - ", metadata pinned ID: %" PRIu64 - ", name: %s", - id, s->pinned_id, s->metadata_pinned, - sess->name == NULL ? 
- "EMPTY" : sess->name)); - WT_RET(__wt_verbose_dump_txn_one(session, &sess->txn)); - } - - return (0); + WT_CONNECTION_IMPL *conn; + WT_SESSION_IMPL *sess; + WT_TXN_GLOBAL *txn_global; + WT_TXN_STATE *s; + uint64_t id; + uint32_t i, session_cnt; + char ts_string[WT_TS_INT_STRING_SIZE]; + + conn = S2C(session); + txn_global = &conn->txn_global; + + WT_RET(__wt_msg(session, "%s", WT_DIVIDER)); + WT_RET(__wt_msg(session, "transaction state dump")); + + WT_RET(__wt_msg(session, "current ID: %" PRIu64, txn_global->current)); + WT_RET(__wt_msg(session, "last running ID: %" PRIu64, txn_global->last_running)); + WT_RET(__wt_msg(session, "metadata_pinned ID: %" PRIu64, txn_global->metadata_pinned)); + WT_RET(__wt_msg(session, "oldest ID: %" PRIu64, txn_global->oldest_id)); + + WT_RET(__wt_msg(session, "durable timestamp: %s", + __wt_timestamp_to_string(txn_global->durable_timestamp, ts_string))); + WT_RET(__wt_msg(session, "oldest timestamp: %s", + __wt_timestamp_to_string(txn_global->oldest_timestamp, ts_string))); + WT_RET(__wt_msg(session, "pinned timestamp: %s", + __wt_timestamp_to_string(txn_global->pinned_timestamp, ts_string))); + WT_RET(__wt_msg(session, "stable timestamp: %s", + __wt_timestamp_to_string(txn_global->stable_timestamp, ts_string))); + WT_RET(__wt_msg( + session, "has_durable_timestamp: %s", txn_global->has_durable_timestamp ? "yes" : "no")); + WT_RET(__wt_msg( + session, "has_oldest_timestamp: %s", txn_global->has_oldest_timestamp ? "yes" : "no")); + WT_RET(__wt_msg( + session, "has_pinned_timestamp: %s", txn_global->has_pinned_timestamp ? "yes" : "no")); + WT_RET(__wt_msg( + session, "has_stable_timestamp: %s", txn_global->has_stable_timestamp ? "yes" : "no")); + WT_RET(__wt_msg(session, "oldest_is_pinned: %s", txn_global->oldest_is_pinned ? "yes" : "no")); + WT_RET(__wt_msg(session, "stable_is_pinned: %s", txn_global->stable_is_pinned ? "yes" : "no")); + + WT_RET( + __wt_msg(session, "checkpoint running: %s", txn_global->checkpoint_running ? 
"yes" : "no")); + WT_RET( + __wt_msg(session, "checkpoint generation: %" PRIu64, __wt_gen(session, WT_GEN_CHECKPOINT))); + WT_RET( + __wt_msg(session, "checkpoint pinned ID: %" PRIu64, txn_global->checkpoint_state.pinned_id)); + WT_RET(__wt_msg(session, "checkpoint txn ID: %" PRIu64, txn_global->checkpoint_state.id)); + + WT_RET(__wt_msg(session, "oldest named snapshot ID: %" PRIu64, txn_global->nsnap_oldest_id)); + + WT_ORDERED_READ(session_cnt, conn->session_cnt); + WT_RET(__wt_msg(session, "session count: %" PRIu32, session_cnt)); + WT_RET(__wt_msg(session, "Transaction state of active sessions:")); + + /* + * Walk each session transaction state and dump information. Accessing the content of session + * handles is not thread safe, so some information may change while traversing if other threads + * are active at the same time, which is OK since this is diagnostic code. + */ + for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) { + /* Skip sessions with no active transaction */ + if ((id = s->id) == WT_TXN_NONE && s->pinned_id == WT_TXN_NONE) + continue; + sess = &conn->sessions[i]; + WT_RET(__wt_msg(session, + "ID: %" PRIu64 ", pinned ID: %" PRIu64 ", metadata pinned ID: %" PRIu64 ", name: %s", id, + s->pinned_id, s->metadata_pinned, sess->name == NULL ? 
"EMPTY" : sess->name)); + WT_RET(__wt_verbose_dump_txn_one(session, &sess->txn)); + } + + return (0); } diff --git a/src/third_party/wiredtiger/src/txn/txn_ckpt.c b/src/third_party/wiredtiger/src/txn/txn_ckpt.c index ba3f4520e37..072406a25cc 100644 --- a/src/third_party/wiredtiger/src/txn/txn_ckpt.c +++ b/src/third_party/wiredtiger/src/txn/txn_ckpt.c @@ -9,1966 +9,1839 @@ #include "wt_internal.h" static void __checkpoint_timing_stress(WT_SESSION_IMPL *); -static int __checkpoint_lock_dirty_tree( - WT_SESSION_IMPL *, bool, bool, bool, const char *[]); +static int __checkpoint_lock_dirty_tree(WT_SESSION_IMPL *, bool, bool, bool, const char *[]); static int __checkpoint_mark_skip(WT_SESSION_IMPL *, WT_CKPT *, bool); static int __checkpoint_presync(WT_SESSION_IMPL *, const char *[]); static int __checkpoint_tree_helper(WT_SESSION_IMPL *, const char *[]); /* * __checkpoint_name_ok -- - * Complain if the checkpoint name isn't acceptable. + * Complain if the checkpoint name isn't acceptable. */ static int __checkpoint_name_ok(WT_SESSION_IMPL *session, const char *name, size_t len) { - /* Check for characters we don't want to see in a metadata file. */ - WT_RET(__wt_name_check(session, name, len)); - - /* - * The internal checkpoint name is special, applications aren't allowed - * to use it. Be aggressive and disallow any matching prefix, it makes - * things easier when checking in other places. - */ - if (len < strlen(WT_CHECKPOINT)) - return (0); - if (!WT_PREFIX_MATCH(name, WT_CHECKPOINT)) - return (0); - - WT_RET_MSG(session, EINVAL, - "the checkpoint name \"%s\" is reserved", WT_CHECKPOINT); + /* Check for characters we don't want to see in a metadata file. */ + WT_RET(__wt_name_check(session, name, len)); + + /* + * The internal checkpoint name is special, applications aren't allowed to use it. Be aggressive + * and disallow any matching prefix, it makes things easier when checking in other places. 
+ */ + if (len < strlen(WT_CHECKPOINT)) + return (0); + if (!WT_PREFIX_MATCH(name, WT_CHECKPOINT)) + return (0); + + WT_RET_MSG(session, EINVAL, "the checkpoint name \"%s\" is reserved", WT_CHECKPOINT); } /* * __checkpoint_name_check -- - * Check for an attempt to name a checkpoint that includes anything - * other than a file object. + * Check for an attempt to name a checkpoint that includes anything other than a file object. */ static int __checkpoint_name_check(WT_SESSION_IMPL *session, const char *uri) { - WT_CURSOR *cursor; - WT_DECL_RET; - const char *fail; - - cursor = NULL; - fail = NULL; - - /* - * This function exists as a place for this comment: named checkpoints - * are only supported on file objects, and not on LSM trees. If a target - * list is configured for the checkpoint, this function is called with - * each target list entry; check the entry to make sure it's backed by - * a file. If no target list is configured, confirm the metadata file - * contains no non-file objects. Skip any internal system objects. We - * don't want spurious error messages, other code will skip over them - * and the user has no control over their existence. 
- */ - if (uri == NULL) { - WT_RET(__wt_metadata_cursor(session, &cursor)); - while ((ret = cursor->next(cursor)) == 0) { - WT_ERR(cursor->get_key(cursor, &uri)); - if (!WT_PREFIX_MATCH(uri, "colgroup:") && - !WT_PREFIX_MATCH(uri, "file:") && - !WT_PREFIX_MATCH(uri, "index:") && - !WT_PREFIX_MATCH(uri, WT_SYSTEM_PREFIX) && - !WT_PREFIX_MATCH(uri, "table:")) { - fail = uri; - break; - } - } - WT_ERR_NOTFOUND_OK(ret); - } else - if (!WT_PREFIX_MATCH(uri, "colgroup:") && - !WT_PREFIX_MATCH(uri, "file:") && - !WT_PREFIX_MATCH(uri, "index:") && - !WT_PREFIX_MATCH(uri, "table:")) - fail = uri; - - if (fail != NULL) - WT_ERR_MSG(session, EINVAL, - "%s object does not support named checkpoints", fail); - -err: WT_TRET(__wt_metadata_cursor_release(session, &cursor)); - return (ret); + WT_CURSOR *cursor; + WT_DECL_RET; + const char *fail; + + cursor = NULL; + fail = NULL; + + /* + * This function exists as a place for this comment: named checkpoints are only supported on + * file objects, and not on LSM trees. If a target list is configured for the checkpoint, this + * function is called with each target list entry; check the entry to make sure it's backed by a + * file. If no target list is configured, confirm the metadata file contains no non-file + * objects. Skip any internal system objects. We don't want spurious error messages, other code + * will skip over them and the user has no control over their existence. 
+ */ + if (uri == NULL) { + WT_RET(__wt_metadata_cursor(session, &cursor)); + while ((ret = cursor->next(cursor)) == 0) { + WT_ERR(cursor->get_key(cursor, &uri)); + if (!WT_PREFIX_MATCH(uri, "colgroup:") && !WT_PREFIX_MATCH(uri, "file:") && + !WT_PREFIX_MATCH(uri, "index:") && !WT_PREFIX_MATCH(uri, WT_SYSTEM_PREFIX) && + !WT_PREFIX_MATCH(uri, "table:")) { + fail = uri; + break; + } + } + WT_ERR_NOTFOUND_OK(ret); + } else if (!WT_PREFIX_MATCH(uri, "colgroup:") && !WT_PREFIX_MATCH(uri, "file:") && + !WT_PREFIX_MATCH(uri, "index:") && !WT_PREFIX_MATCH(uri, "table:")) + fail = uri; + + if (fail != NULL) + WT_ERR_MSG(session, EINVAL, "%s object does not support named checkpoints", fail); + +err: + WT_TRET(__wt_metadata_cursor_release(session, &cursor)); + return (ret); } /* * __checkpoint_update_generation -- - * Update the checkpoint generation of the current tree. - * - * This indicates that the tree will not be visited again by the current - * checkpoint. + * Update the checkpoint generation of the current tree. This indicates that the tree will not + * be visited again by the current checkpoint. */ static void __checkpoint_update_generation(WT_SESSION_IMPL *session) { - WT_BTREE *btree; + WT_BTREE *btree; - btree = S2BT(session); + btree = S2BT(session); - /* - * Updates to the metadata are made by the checkpoint transaction, so - * the metadata tree's checkpoint generation should never be updated. - */ - if (WT_IS_METADATA(session->dhandle)) - return; + /* + * Updates to the metadata are made by the checkpoint transaction, so the metadata tree's + * checkpoint generation should never be updated. 
+ */ + if (WT_IS_METADATA(session->dhandle)) + return; - WT_PUBLISH(btree->checkpoint_gen, __wt_gen(session, WT_GEN_CHECKPOINT)); - WT_STAT_DATA_SET(session, - btree_checkpoint_generation, btree->checkpoint_gen); + WT_PUBLISH(btree->checkpoint_gen, __wt_gen(session, WT_GEN_CHECKPOINT)); + WT_STAT_DATA_SET(session, btree_checkpoint_generation, btree->checkpoint_gen); } /* * __checkpoint_apply_all -- - * Apply an operation to all files involved in a checkpoint. + * Apply an operation to all files involved in a checkpoint. */ static int -__checkpoint_apply_all(WT_SESSION_IMPL *session, const char *cfg[], - int (*op)(WT_SESSION_IMPL *, const char *[])) +__checkpoint_apply_all( + WT_SESSION_IMPL *session, const char *cfg[], int (*op)(WT_SESSION_IMPL *, const char *[])) { - WT_CONFIG targetconf; - WT_CONFIG_ITEM cval, k, v; - WT_DECL_ITEM(tmp); - WT_DECL_RET; - bool ckpt_closed, named, target_list; - - target_list = false; - - /* Flag if this is a named checkpoint, and check if the name is OK. */ - WT_RET(__wt_config_gets(session, cfg, "name", &cval)); - named = cval.len != 0; - if (named) - WT_RET(__checkpoint_name_ok(session, cval.str, cval.len)); - - /* Step through the targets and optionally operate on each one. */ - WT_ERR(__wt_config_gets(session, cfg, "target", &cval)); - __wt_config_subinit(session, &targetconf, &cval); - while ((ret = __wt_config_next(&targetconf, &k, &v)) == 0) { - if (!target_list) { - WT_ERR(__wt_scr_alloc(session, 512, &tmp)); - target_list = true; - } - - if (v.len != 0) - WT_ERR_MSG(session, EINVAL, - "invalid checkpoint target %.*s: URIs may require " - "quoting", - (int)cval.len, (char *)cval.str); - - /* Some objects don't support named checkpoints. 
*/ - if (named) - WT_ERR(__checkpoint_name_check(session, k.str)); - - if (op == NULL) - continue; - WT_ERR(__wt_buf_fmt(session, tmp, "%.*s", (int)k.len, k.str)); - if ((ret = __wt_schema_worker( - session, tmp->data, op, NULL, cfg, 0)) != 0) - WT_ERR_MSG(session, ret, "%s", (const char *)tmp->data); - } - WT_ERR_NOTFOUND_OK(ret); - - if (!target_list && named) - /* Some objects don't support named checkpoints. */ - WT_ERR(__checkpoint_name_check(session, NULL)); - - if (!target_list && op != NULL) { - /* - * If the checkpoint is named or we're dropping checkpoints, we - * checkpoint both open and closed files; else, only checkpoint - * open files. - * - * XXX - * We don't optimize unnamed checkpoints of a list of targets, - * we open the targets and checkpoint them even if they are - * quiescent and don't need a checkpoint, believing applications - * unlikely to checkpoint a list of closed targets. - */ - ckpt_closed = named; - if (!ckpt_closed) { - WT_ERR(__wt_config_gets(session, cfg, "drop", &cval)); - ckpt_closed = cval.len != 0; - } - WT_ERR(ckpt_closed ? - __wt_meta_apply_all(session, op, NULL, cfg) : - __wt_conn_btree_apply(session, NULL, op, NULL, cfg)); - } - -err: __wt_scr_free(session, &tmp); - return (ret); + WT_CONFIG targetconf; + WT_CONFIG_ITEM cval, k, v; + WT_DECL_ITEM(tmp); + WT_DECL_RET; + bool ckpt_closed, named, target_list; + + target_list = false; + + /* Flag if this is a named checkpoint, and check if the name is OK. */ + WT_RET(__wt_config_gets(session, cfg, "name", &cval)); + named = cval.len != 0; + if (named) + WT_RET(__checkpoint_name_ok(session, cval.str, cval.len)); + + /* Step through the targets and optionally operate on each one. 
*/ + WT_ERR(__wt_config_gets(session, cfg, "target", &cval)); + __wt_config_subinit(session, &targetconf, &cval); + while ((ret = __wt_config_next(&targetconf, &k, &v)) == 0) { + if (!target_list) { + WT_ERR(__wt_scr_alloc(session, 512, &tmp)); + target_list = true; + } + + if (v.len != 0) + WT_ERR_MSG(session, EINVAL, + "invalid checkpoint target %.*s: URIs may require " + "quoting", + (int)cval.len, (char *)cval.str); + + /* Some objects don't support named checkpoints. */ + if (named) + WT_ERR(__checkpoint_name_check(session, k.str)); + + if (op == NULL) + continue; + WT_ERR(__wt_buf_fmt(session, tmp, "%.*s", (int)k.len, k.str)); + if ((ret = __wt_schema_worker(session, tmp->data, op, NULL, cfg, 0)) != 0) + WT_ERR_MSG(session, ret, "%s", (const char *)tmp->data); + } + WT_ERR_NOTFOUND_OK(ret); + + if (!target_list && named) + /* Some objects don't support named checkpoints. */ + WT_ERR(__checkpoint_name_check(session, NULL)); + + if (!target_list && op != NULL) { + /* + * If the checkpoint is named or we're dropping checkpoints, we + * checkpoint both open and closed files; else, only checkpoint + * open files. + * + * XXX + * We don't optimize unnamed checkpoints of a list of targets, + * we open the targets and checkpoint them even if they are + * quiescent and don't need a checkpoint, believing applications + * unlikely to checkpoint a list of closed targets. + */ + ckpt_closed = named; + if (!ckpt_closed) { + WT_ERR(__wt_config_gets(session, cfg, "drop", &cval)); + ckpt_closed = cval.len != 0; + } + WT_ERR(ckpt_closed ? __wt_meta_apply_all(session, op, NULL, cfg) : + __wt_conn_btree_apply(session, NULL, op, NULL, cfg)); + } + +err: + __wt_scr_free(session, &tmp); + return (ret); } /* * __checkpoint_apply -- - * Apply an operation to all handles locked for a checkpoint. + * Apply an operation to all handles locked for a checkpoint. 
*/ static int -__checkpoint_apply(WT_SESSION_IMPL *session, const char *cfg[], - int (*op)(WT_SESSION_IMPL *, const char *[])) +__checkpoint_apply( + WT_SESSION_IMPL *session, const char *cfg[], int (*op)(WT_SESSION_IMPL *, const char *[])) { - WT_DECL_RET; - u_int i; - - /* If we have already locked the handles, apply the operation. */ - for (i = 0; i < session->ckpt_handle_next; ++i) { - if (session->ckpt_handle[i] == NULL) - continue; - WT_WITH_DHANDLE(session, session->ckpt_handle[i], - ret = (*op)(session, cfg)); - WT_RET(ret); - } - - return (0); + WT_DECL_RET; + u_int i; + + /* If we have already locked the handles, apply the operation. */ + for (i = 0; i < session->ckpt_handle_next; ++i) { + if (session->ckpt_handle[i] == NULL) + continue; + WT_WITH_DHANDLE(session, session->ckpt_handle[i], ret = (*op)(session, cfg)); + WT_RET(ret); + } + + return (0); } /* * __checkpoint_data_source -- - * Checkpoint all data sources. + * Checkpoint all data sources. */ static int __checkpoint_data_source(WT_SESSION_IMPL *session, const char *cfg[]) { - WT_DATA_SOURCE *dsrc; - WT_NAMED_DATA_SOURCE *ndsrc; - - /* - * A place-holder, to support data sources: we assume calling the - * underlying data-source session checkpoint function is sufficient to - * checkpoint all objects in the data source, open or closed, and we - * don't attempt to optimize the checkpoint of individual targets. - * Those assumptions are not necessarily going to be true for all - * data sources. - * - * It's not difficult to support data-source checkpoints of individual - * targets (__wt_schema_worker is the underlying function that will do - * the work, and it's already written to support data-sources, although - * we'd probably need to pass the URI of the object to the data source - * checkpoint function which we don't currently do). 
However, doing a - * full data checkpoint is trickier: currently, the connection code is - * written to ignore all objects other than "file:", and that code will - * require significant changes to work with data sources. - */ - TAILQ_FOREACH(ndsrc, &S2C(session)->dsrcqh, q) { - dsrc = ndsrc->dsrc; - if (dsrc->checkpoint != NULL) - WT_RET(dsrc->checkpoint(dsrc, - (WT_SESSION *)session, (WT_CONFIG_ARG *)cfg)); - } - return (0); + WT_DATA_SOURCE *dsrc; + WT_NAMED_DATA_SOURCE *ndsrc; + + /* + * A place-holder, to support data sources: we assume calling the + * underlying data-source session checkpoint function is sufficient to + * checkpoint all objects in the data source, open or closed, and we + * don't attempt to optimize the checkpoint of individual targets. + * Those assumptions are not necessarily going to be true for all + * data sources. + * + * It's not difficult to support data-source checkpoints of individual + * targets (__wt_schema_worker is the underlying function that will do + * the work, and it's already written to support data-sources, although + * we'd probably need to pass the URI of the object to the data source + * checkpoint function which we don't currently do). However, doing a + * full data checkpoint is trickier: currently, the connection code is + * written to ignore all objects other than "file:", and that code will + * require significant changes to work with data sources. + */ + TAILQ_FOREACH (ndsrc, &S2C(session)->dsrcqh, q) { + dsrc = ndsrc->dsrc; + if (dsrc->checkpoint != NULL) + WT_RET(dsrc->checkpoint(dsrc, (WT_SESSION *)session, (WT_CONFIG_ARG *)cfg)); + } + return (0); } /* * __wt_checkpoint_get_handles -- - * Get a list of handles to flush. + * Get a list of handles to flush. */ int __wt_checkpoint_get_handles(WT_SESSION_IMPL *session, const char *cfg[]) { - WT_BTREE *btree; - WT_CONFIG_ITEM cval; - WT_DECL_RET; - const char *name; - bool force; - - /* Find out if we have to force a checkpoint. 
*/ - WT_RET(__wt_config_gets_def(session, cfg, "force", 0, &cval)); - force = cval.val != 0; - if (!force) { - WT_RET(__wt_config_gets_def(session, cfg, "name", 0, &cval)); - force = cval.len != 0; - } - - /* Should not be called with anything other than a live btree handle. */ - WT_ASSERT(session, session->dhandle->type == WT_DHANDLE_TYPE_BTREE && - session->dhandle->checkpoint == NULL); - - btree = S2BT(session); - - /* Skip files that are never involved in a checkpoint. */ - if (F_ISSET(btree, WT_BTREE_NO_CHECKPOINT)) - return (0); - - /* - * We may have raced between starting the checkpoint transaction and - * some operation completing on the handle that updated the metadata - * (e.g., closing a bulk load cursor). All such operations either have - * exclusive access to the handle or hold the schema lock. We are now - * holding the schema lock and have an open btree handle, so if we - * can't update the metadata, then there has been some state change - * invisible to the checkpoint transaction. - */ - if (!WT_IS_METADATA(session->dhandle)) { - WT_CURSOR *meta_cursor; - - WT_ASSERT(session, !F_ISSET(&session->txn, WT_TXN_ERROR)); - WT_RET(__wt_metadata_cursor(session, &meta_cursor)); - meta_cursor->set_key(meta_cursor, session->dhandle->name); - ret = __wt_curfile_insert_check(meta_cursor); - if (ret == WT_ROLLBACK) { - /* - * If create or drop or any schema operation of a table - * is with in an user transaction then checkpoint can - * see the dhandle before the commit, which will lead - * to the rollback error. We will ignore this dhandle as - * part of this checkpoint by returning from here. - */ - WT_TRET(__wt_metadata_cursor_release(session, - &meta_cursor)); - return (0); - } - WT_TRET(__wt_metadata_cursor_release(session, &meta_cursor)); - WT_RET(ret); - } - - /* - * Decide whether the tree needs to be included in the checkpoint and - * if so, acquire the necessary locks. 
- */ - WT_SAVE_DHANDLE(session, ret = __checkpoint_lock_dirty_tree( - session, true, force, true, cfg)); - WT_RET(ret); - if (F_ISSET(btree, WT_BTREE_SKIP_CKPT)) { - WT_ASSERT(session, btree->ckpt == NULL); - __checkpoint_update_generation(session); - return (0); - } - - /* - * Make sure there is space for the new entry: do this before getting - * the handle to avoid cleanup if we can't allocate the memory. - */ - WT_RET(__wt_realloc_def(session, &session->ckpt_handle_allocated, - session->ckpt_handle_next + 1, &session->ckpt_handle)); - - /* - * The current tree will be included: get it again because the handle - * we have is only valid for the duration of this function. - */ - name = session->dhandle->name; - session->dhandle = NULL; - - if ((ret = __wt_session_get_dhandle(session, name, NULL, NULL, 0)) != 0) - return (ret == EBUSY ? 0 : ret); - - /* - * Save the current eviction walk setting: checkpoint can interfere - * with eviction and we don't want to unfairly penalize (or promote) - * eviction in trees due to checkpoints. - */ - btree->evict_walk_saved = btree->evict_walk_period; - - session->ckpt_handle[session->ckpt_handle_next++] = session->dhandle; - return (0); + WT_BTREE *btree; + WT_CONFIG_ITEM cval; + WT_DECL_RET; + const char *name; + bool force; + + /* Find out if we have to force a checkpoint. */ + WT_RET(__wt_config_gets_def(session, cfg, "force", 0, &cval)); + force = cval.val != 0; + if (!force) { + WT_RET(__wt_config_gets_def(session, cfg, "name", 0, &cval)); + force = cval.len != 0; + } + + /* Should not be called with anything other than a live btree handle. */ + WT_ASSERT(session, + session->dhandle->type == WT_DHANDLE_TYPE_BTREE && session->dhandle->checkpoint == NULL); + + btree = S2BT(session); + + /* Skip files that are never involved in a checkpoint. 
*/ + if (F_ISSET(btree, WT_BTREE_NO_CHECKPOINT)) + return (0); + + /* + * We may have raced between starting the checkpoint transaction and + * some operation completing on the handle that updated the metadata + * (e.g., closing a bulk load cursor). All such operations either have + * exclusive access to the handle or hold the schema lock. We are now + * holding the schema lock and have an open btree handle, so if we + * can't update the metadata, then there has been some state change + * invisible to the checkpoint transaction. + */ + if (!WT_IS_METADATA(session->dhandle)) { + WT_CURSOR *meta_cursor; + + WT_ASSERT(session, !F_ISSET(&session->txn, WT_TXN_ERROR)); + WT_RET(__wt_metadata_cursor(session, &meta_cursor)); + meta_cursor->set_key(meta_cursor, session->dhandle->name); + ret = __wt_curfile_insert_check(meta_cursor); + if (ret == WT_ROLLBACK) { + /* + * If create or drop or any schema operation of a table is with in an user transaction + * then checkpoint can see the dhandle before the commit, which will lead to the + * rollback error. We will ignore this dhandle as part of this checkpoint by returning + * from here. + */ + WT_TRET(__wt_metadata_cursor_release(session, &meta_cursor)); + return (0); + } + WT_TRET(__wt_metadata_cursor_release(session, &meta_cursor)); + WT_RET(ret); + } + + /* + * Decide whether the tree needs to be included in the checkpoint and if so, acquire the + * necessary locks. + */ + WT_SAVE_DHANDLE(session, ret = __checkpoint_lock_dirty_tree(session, true, force, true, cfg)); + WT_RET(ret); + if (F_ISSET(btree, WT_BTREE_SKIP_CKPT)) { + WT_ASSERT(session, btree->ckpt == NULL); + __checkpoint_update_generation(session); + return (0); + } + + /* + * Make sure there is space for the new entry: do this before getting the handle to avoid + * cleanup if we can't allocate the memory. 
+ */ + WT_RET(__wt_realloc_def(session, &session->ckpt_handle_allocated, session->ckpt_handle_next + 1, + &session->ckpt_handle)); + + /* + * The current tree will be included: get it again because the handle we have is only valid for + * the duration of this function. + */ + name = session->dhandle->name; + session->dhandle = NULL; + + if ((ret = __wt_session_get_dhandle(session, name, NULL, NULL, 0)) != 0) + return (ret == EBUSY ? 0 : ret); + + /* + * Save the current eviction walk setting: checkpoint can interfere with eviction and we don't + * want to unfairly penalize (or promote) eviction in trees due to checkpoints. + */ + btree->evict_walk_saved = btree->evict_walk_period; + + session->ckpt_handle[session->ckpt_handle_next++] = session->dhandle; + return (0); } /* * __checkpoint_reduce_dirty_cache -- - * Release clean trees from the list cached for checkpoints. + * Release clean trees from the list cached for checkpoints. */ static void __checkpoint_reduce_dirty_cache(WT_SESSION_IMPL *session) { - WT_CACHE *cache; - WT_CONNECTION_IMPL *conn; - double current_dirty, prev_dirty; - uint64_t bytes_written_start, bytes_written_total; - uint64_t cache_size, max_write; - uint64_t time_start, time_stop; - uint64_t total_ms; - - conn = S2C(session); - cache = conn->cache; - - /* - * Give up if scrubbing is disabled, including when checkpointing with - * a timestamp on close (we can't evict dirty pages in that case, so - * scrubbing cannot help). - */ - if (F_ISSET(conn, WT_CONN_CLOSING_TIMESTAMP) || - cache->eviction_checkpoint_target < DBL_EPSILON) - return; - - time_start = __wt_clock(session); - bytes_written_start = cache->bytes_written; - - /* - * If the cache size is zero or very small, we're done. The cache - * size can briefly become zero if we're transitioning to a shared - * cache via reconfigure. This avoids potential divide by zero. 
- */ - if ((cache_size = conn->cache_size) < 10 * WT_MEGABYTE) - return; - - current_dirty = - (100.0 * __wt_cache_dirty_leaf_inuse(cache)) / cache_size; - if (current_dirty <= cache->eviction_checkpoint_target) - return; - - /* Stop if we write as much dirty data as is currently in cache. */ - max_write = __wt_cache_dirty_leaf_inuse(cache); - - /* Set the dirty trigger to the target value. */ - cache->eviction_scrub_target = cache->eviction_checkpoint_target; - WT_STAT_CONN_SET(session, txn_checkpoint_scrub_target, 0); - - /* Wait while the dirty level is going down. */ - for (;;) { - __wt_sleep(0, 100 * WT_THOUSAND); - - prev_dirty = current_dirty; - current_dirty = - (100.0 * __wt_cache_dirty_leaf_inuse(cache)) / cache_size; - if (current_dirty <= cache->eviction_checkpoint_target || - current_dirty >= prev_dirty) - break; - - /* - * Don't scrub when the lookaside table is in use: scrubbing is - * counter-productive in that case. - */ - if (F_ISSET(cache, WT_CACHE_EVICT_LOOKASIDE)) - break; - - /* - * We haven't reached the current target. - * - * Don't wait indefinitely: there might be dirty pages - * that can't be evicted. If we can't meet the target, - * give up and start the checkpoint for real. - */ - bytes_written_total = - cache->bytes_written - bytes_written_start; - if (bytes_written_total > max_write) - break; - } - - time_stop = __wt_clock(session); - total_ms = WT_CLOCKDIFF_MS(time_stop, time_start); - WT_STAT_CONN_SET(session, txn_checkpoint_scrub_time, total_ms); + WT_CACHE *cache; + WT_CONNECTION_IMPL *conn; + double current_dirty, prev_dirty; + uint64_t bytes_written_start, bytes_written_total; + uint64_t cache_size, max_write; + uint64_t time_start, time_stop; + uint64_t total_ms; + + conn = S2C(session); + cache = conn->cache; + + /* + * Give up if scrubbing is disabled, including when checkpointing with a timestamp on close (we + * can't evict dirty pages in that case, so scrubbing cannot help). 
+ */ + if (F_ISSET(conn, WT_CONN_CLOSING_TIMESTAMP) || cache->eviction_checkpoint_target < DBL_EPSILON) + return; + + time_start = __wt_clock(session); + bytes_written_start = cache->bytes_written; + + /* + * If the cache size is zero or very small, we're done. The cache size can briefly become zero + * if we're transitioning to a shared cache via reconfigure. This avoids potential divide by + * zero. + */ + if ((cache_size = conn->cache_size) < 10 * WT_MEGABYTE) + return; + + current_dirty = (100.0 * __wt_cache_dirty_leaf_inuse(cache)) / cache_size; + if (current_dirty <= cache->eviction_checkpoint_target) + return; + + /* Stop if we write as much dirty data as is currently in cache. */ + max_write = __wt_cache_dirty_leaf_inuse(cache); + + /* Set the dirty trigger to the target value. */ + cache->eviction_scrub_target = cache->eviction_checkpoint_target; + WT_STAT_CONN_SET(session, txn_checkpoint_scrub_target, 0); + + /* Wait while the dirty level is going down. */ + for (;;) { + __wt_sleep(0, 100 * WT_THOUSAND); + + prev_dirty = current_dirty; + current_dirty = (100.0 * __wt_cache_dirty_leaf_inuse(cache)) / cache_size; + if (current_dirty <= cache->eviction_checkpoint_target || current_dirty >= prev_dirty) + break; + + /* + * Don't scrub when the lookaside table is in use: scrubbing is counter-productive in that + * case. + */ + if (F_ISSET(cache, WT_CACHE_EVICT_LOOKASIDE)) + break; + + /* + * We haven't reached the current target. + * + * Don't wait indefinitely: there might be dirty pages + * that can't be evicted. If we can't meet the target, + * give up and start the checkpoint for real. + */ + bytes_written_total = cache->bytes_written - bytes_written_start; + if (bytes_written_total > max_write) + break; + } + + time_stop = __wt_clock(session); + total_ms = WT_CLOCKDIFF_MS(time_stop, time_start); + WT_STAT_CONN_SET(session, txn_checkpoint_scrub_time, total_ms); } /* * __wt_checkpoint_progress -- - * Output a checkpoint progress message. 
+ * Output a checkpoint progress message. */ void __wt_checkpoint_progress(WT_SESSION_IMPL *session, bool closing) { - struct timespec cur_time; - WT_CONNECTION_IMPL *conn; - uint64_t time_diff; - - conn = S2C(session); - __wt_epoch(session, &cur_time); - - /* Time since the full database checkpoint started */ - time_diff = WT_TIMEDIFF_SEC(cur_time, - conn->ckpt_timer_start); - - if (closing || (time_diff / WT_PROGRESS_MSG_PERIOD) > - conn->ckpt_progress_msg_count) { - __wt_verbose(session, WT_VERB_CHECKPOINT_PROGRESS, - "Checkpoint %s for %" PRIu64 - " seconds and wrote: %" PRIu64 " pages (%" PRIu64 " MB)", - closing ? "ran" : "has been running", - time_diff, conn->ckpt_write_pages, - conn->ckpt_write_bytes / WT_MEGABYTE); - conn->ckpt_progress_msg_count++; - } + struct timespec cur_time; + WT_CONNECTION_IMPL *conn; + uint64_t time_diff; + + conn = S2C(session); + __wt_epoch(session, &cur_time); + + /* Time since the full database checkpoint started */ + time_diff = WT_TIMEDIFF_SEC(cur_time, conn->ckpt_timer_start); + + if (closing || (time_diff / WT_PROGRESS_MSG_PERIOD) > conn->ckpt_progress_msg_count) { + __wt_verbose(session, WT_VERB_CHECKPOINT_PROGRESS, + "Checkpoint %s for %" PRIu64 " seconds and wrote: %" PRIu64 " pages (%" PRIu64 " MB)", + closing ? "ran" : "has been running", time_diff, conn->ckpt_write_pages, + conn->ckpt_write_bytes / WT_MEGABYTE); + conn->ckpt_progress_msg_count++; + } } /* * __checkpoint_stats -- - * Update checkpoint timer stats. + * Update checkpoint timer stats. 
*/ static void __checkpoint_stats(WT_SESSION_IMPL *session) { - struct timespec stop; - WT_CONNECTION_IMPL *conn; - uint64_t msec; + struct timespec stop; + WT_CONNECTION_IMPL *conn; + uint64_t msec; - conn = S2C(session); + conn = S2C(session); - /* Output a verbose progress message for long running checkpoints */ - if (conn->ckpt_progress_msg_count > 0) - __wt_checkpoint_progress(session, true); + /* Output a verbose progress message for long running checkpoints */ + if (conn->ckpt_progress_msg_count > 0) + __wt_checkpoint_progress(session, true); - __wt_epoch(session, &stop); - msec = WT_TIMEDIFF_MS(stop, conn->ckpt_timer_scrub_end); + __wt_epoch(session, &stop); + msec = WT_TIMEDIFF_MS(stop, conn->ckpt_timer_scrub_end); - if (msec > conn->ckpt_time_max) - conn->ckpt_time_max = msec; - if (conn->ckpt_time_min == 0 || msec < conn->ckpt_time_min) - conn->ckpt_time_min = msec; - conn->ckpt_time_recent = msec; - conn->ckpt_time_total += msec; + if (msec > conn->ckpt_time_max) + conn->ckpt_time_max = msec; + if (conn->ckpt_time_min == 0 || msec < conn->ckpt_time_min) + conn->ckpt_time_min = msec; + conn->ckpt_time_recent = msec; + conn->ckpt_time_total += msec; } /* * __checkpoint_verbose_track -- - * Output a verbose message with timing information + * Output a verbose message with timing information */ static void __checkpoint_verbose_track(WT_SESSION_IMPL *session, const char *msg) { - struct timespec stop; - WT_CONNECTION_IMPL *conn; - uint64_t msec; + struct timespec stop; + WT_CONNECTION_IMPL *conn; + uint64_t msec; - if (!WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) - return; + if (!WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) + return; - conn = S2C(session); - __wt_epoch(session, &stop); - - /* Get time diff in milliseconds. 
*/ - msec = WT_TIMEDIFF_MS(stop, conn->ckpt_timer_start); - __wt_verbose(session, - WT_VERB_CHECKPOINT, "time: %" PRIu64 " ms, gen: %" PRIu64 - ": Full database checkpoint %s", - msec, __wt_gen(session, WT_GEN_CHECKPOINT), msg); + conn = S2C(session); + __wt_epoch(session, &stop); + /* Get time diff in milliseconds. */ + msec = WT_TIMEDIFF_MS(stop, conn->ckpt_timer_start); + __wt_verbose(session, WT_VERB_CHECKPOINT, + "time: %" PRIu64 " ms, gen: %" PRIu64 ": Full database checkpoint %s", msec, + __wt_gen(session, WT_GEN_CHECKPOINT), msg); } /* * __checkpoint_fail_reset -- - * Reset fields when a failure occurs. + * Reset fields when a failure occurs. */ static void __checkpoint_fail_reset(WT_SESSION_IMPL *session) { - WT_BTREE *btree; + WT_BTREE *btree; - btree = S2BT(session); - btree->modified = true; - __wt_meta_ckptlist_free(session, &btree->ckpt); + btree = S2BT(session); + btree->modified = true; + __wt_meta_ckptlist_free(session, &btree->ckpt); } /* * __checkpoint_prepare -- - * Start the transaction for a checkpoint and gather handles. + * Start the transaction for a checkpoint and gather handles. */ static int -__checkpoint_prepare( - WT_SESSION_IMPL *session, bool *trackingp, const char *cfg[]) +__checkpoint_prepare(WT_SESSION_IMPL *session, bool *trackingp, const char *cfg[]) { - WT_CONFIG_ITEM cval; - WT_CONNECTION_IMPL *conn; - WT_DECL_RET; - WT_TXN *txn; - WT_TXN_GLOBAL *txn_global; - WT_TXN_STATE *txn_state; - const char *txn_cfg[] = { WT_CONFIG_BASE(session, - WT_SESSION_begin_transaction), "isolation=snapshot", NULL }; - bool use_timestamp; - - conn = S2C(session); - txn = &session->txn; - txn_global = &conn->txn_global; - txn_state = WT_SESSION_TXN_STATE(session); - - WT_RET(__wt_config_gets(session, cfg, "use_timestamp", &cval)); - use_timestamp = (cval.val != 0); - - /* - * Start a snapshot transaction for the checkpoint. 
- * - * Note: we don't go through the public API calls because they have - * side effects on cursors, which applications can hold open across - * calls to checkpoint. - */ - WT_RET(__wt_txn_begin(session, txn_cfg)); - - WT_DIAGNOSTIC_YIELD; - - /* Ensure a transaction ID is allocated prior to sharing it globally */ - WT_RET(__wt_txn_id_check(session)); - - /* Keep track of handles acquired for locking. */ - WT_RET(__wt_meta_track_on(session)); - *trackingp = true; - - /* - * Mark the connection as clean. If some data gets modified after - * generating checkpoint transaction id, connection will be reset to - * dirty when reconciliation marks the btree dirty on encountering the - * dirty page. - */ - conn->modified = false; - - /* - * Save the checkpoint session ID. - * - * We never do checkpoints in the default session (with id zero). - */ - WT_ASSERT(session, session->id != 0 && txn_global->checkpoint_id == 0); - txn_global->checkpoint_id = session->id; - - /* - * Remove the checkpoint transaction from the global table. - * - * This allows ordinary visibility checks to move forward because - * checkpoints often take a long time and only write to the metadata. - */ - __wt_writelock(session, &txn_global->rwlock); - txn_global->checkpoint_state = *txn_state; - txn_global->checkpoint_state.pinned_id = txn->snap_min; - - /* - * Sanity check that the oldest ID hasn't moved on before we have - * cleared our entry. - */ - WT_ASSERT(session, - WT_TXNID_LE(txn_global->oldest_id, txn_state->id) && - WT_TXNID_LE(txn_global->oldest_id, txn_state->pinned_id)); - - /* - * Clear our entry from the global transaction session table. Any - * operation that needs to know about the ID for this checkpoint will - * consider the checkpoint ID in the global structure. Most operations - * can safely ignore the checkpoint ID (see the visible all check for - * details). 
- */ - txn_state->id = txn_state->pinned_id = - txn_state->metadata_pinned = WT_TXN_NONE; - - /* - * Set the checkpoint transaction's timestamp, if requested. - * - * We rely on having the global transaction data locked so the oldest - * timestamp can't move past the stable timestamp. - */ - WT_ASSERT(session, !F_ISSET(txn, - WT_TXN_HAS_TS_COMMIT | WT_TXN_HAS_TS_READ | - WT_TXN_TS_PUBLISHED | WT_TXN_PUBLIC_TS_READ)); - - if (use_timestamp) { - /* - * If the user wants timestamps then set the metadata - * checkpoint timestamp based on whether or not a stable - * timestamp is actually in use. Only set it when we're not - * running recovery because recovery doesn't set the recovery - * timestamp until its checkpoint is complete. - */ - if (txn_global->has_stable_timestamp) { - txn->read_timestamp = txn_global->stable_timestamp; - txn_global->checkpoint_timestamp = txn->read_timestamp; - F_SET(txn, WT_TXN_HAS_TS_READ); - if (!F_ISSET(conn, WT_CONN_RECOVERING)) - txn_global->meta_ckpt_timestamp = - txn->read_timestamp; - } else if (!F_ISSET(conn, WT_CONN_RECOVERING)) - txn_global->meta_ckpt_timestamp = - txn_global->recovery_timestamp; - } else if (!F_ISSET(conn, WT_CONN_RECOVERING)) - txn_global->meta_ckpt_timestamp = 0; - - __wt_writeunlock(session, &txn_global->rwlock); - - if (F_ISSET(txn, WT_TXN_HAS_TS_READ)) { - __wt_verbose_timestamp(session, txn->read_timestamp, - "Checkpoint requested at stable timestamp"); - - /* - * The snapshot we established when the transaction started may - * be too early to match the timestamp we just read. - * - * Get a new one. - */ - __wt_txn_get_snapshot(session); - } - - /* - * Get a list of handles we want to flush; for named checkpoints this - * may pull closed objects into the session cache. - * - * First, gather all handles, then start the checkpoint transaction, - * then release any clean handles. 
- */ - WT_ASSERT(session, session->ckpt_handle_next == 0); - WT_WITH_TABLE_READ_LOCK(session, ret = - __checkpoint_apply_all(session, cfg, __wt_checkpoint_get_handles)); - return (ret); + WT_CONFIG_ITEM cval; + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + WT_TXN *txn; + WT_TXN_GLOBAL *txn_global; + WT_TXN_STATE *txn_state; + const char *txn_cfg[] = { + WT_CONFIG_BASE(session, WT_SESSION_begin_transaction), "isolation=snapshot", NULL}; + bool use_timestamp; + + conn = S2C(session); + txn = &session->txn; + txn_global = &conn->txn_global; + txn_state = WT_SESSION_TXN_STATE(session); + + WT_RET(__wt_config_gets(session, cfg, "use_timestamp", &cval)); + use_timestamp = (cval.val != 0); + + /* + * Start a snapshot transaction for the checkpoint. + * + * Note: we don't go through the public API calls because they have + * side effects on cursors, which applications can hold open across + * calls to checkpoint. + */ + WT_RET(__wt_txn_begin(session, txn_cfg)); + + WT_DIAGNOSTIC_YIELD; + + /* Ensure a transaction ID is allocated prior to sharing it globally */ + WT_RET(__wt_txn_id_check(session)); + + /* Keep track of handles acquired for locking. */ + WT_RET(__wt_meta_track_on(session)); + *trackingp = true; + + /* + * Mark the connection as clean. If some data gets modified after generating checkpoint + * transaction id, connection will be reset to dirty when reconciliation marks the btree dirty + * on encountering the dirty page. + */ + conn->modified = false; + + /* + * Save the checkpoint session ID. + * + * We never do checkpoints in the default session (with id zero). + */ + WT_ASSERT(session, session->id != 0 && txn_global->checkpoint_id == 0); + txn_global->checkpoint_id = session->id; + + /* + * Remove the checkpoint transaction from the global table. + * + * This allows ordinary visibility checks to move forward because + * checkpoints often take a long time and only write to the metadata. 
+ */ + __wt_writelock(session, &txn_global->rwlock); + txn_global->checkpoint_state = *txn_state; + txn_global->checkpoint_state.pinned_id = txn->snap_min; + + /* + * Sanity check that the oldest ID hasn't moved on before we have cleared our entry. + */ + WT_ASSERT(session, WT_TXNID_LE(txn_global->oldest_id, txn_state->id) && + WT_TXNID_LE(txn_global->oldest_id, txn_state->pinned_id)); + + /* + * Clear our entry from the global transaction session table. Any operation that needs to know + * about the ID for this checkpoint will consider the checkpoint ID in the global structure. + * Most operations can safely ignore the checkpoint ID (see the visible all check for details). + */ + txn_state->id = txn_state->pinned_id = txn_state->metadata_pinned = WT_TXN_NONE; + + /* + * Set the checkpoint transaction's timestamp, if requested. + * + * We rely on having the global transaction data locked so the oldest + * timestamp can't move past the stable timestamp. + */ + WT_ASSERT(session, !F_ISSET(txn, WT_TXN_HAS_TS_COMMIT | WT_TXN_HAS_TS_READ | + WT_TXN_TS_PUBLISHED | WT_TXN_PUBLIC_TS_READ)); + + if (use_timestamp) { + /* + * If the user wants timestamps then set the metadata checkpoint timestamp based on whether + * or not a stable timestamp is actually in use. Only set it when we're not running recovery + * because recovery doesn't set the recovery timestamp until its checkpoint is complete. 
+ */ + if (txn_global->has_stable_timestamp) { + txn->read_timestamp = txn_global->stable_timestamp; + txn_global->checkpoint_timestamp = txn->read_timestamp; + F_SET(txn, WT_TXN_HAS_TS_READ); + if (!F_ISSET(conn, WT_CONN_RECOVERING)) + txn_global->meta_ckpt_timestamp = txn->read_timestamp; + } else if (!F_ISSET(conn, WT_CONN_RECOVERING)) + txn_global->meta_ckpt_timestamp = txn_global->recovery_timestamp; + } else if (!F_ISSET(conn, WT_CONN_RECOVERING)) + txn_global->meta_ckpt_timestamp = 0; + + __wt_writeunlock(session, &txn_global->rwlock); + + if (F_ISSET(txn, WT_TXN_HAS_TS_READ)) { + __wt_verbose_timestamp( + session, txn->read_timestamp, "Checkpoint requested at stable timestamp"); + + /* + * The snapshot we established when the transaction started may + * be too early to match the timestamp we just read. + * + * Get a new one. + */ + __wt_txn_get_snapshot(session); + } + + /* + * Get a list of handles we want to flush; for named checkpoints this + * may pull closed objects into the session cache. + * + * First, gather all handles, then start the checkpoint transaction, + * then release any clean handles. + */ + WT_ASSERT(session, session->ckpt_handle_next == 0); + WT_WITH_TABLE_READ_LOCK( + session, ret = __checkpoint_apply_all(session, cfg, __wt_checkpoint_get_handles)); + return (ret); } /* * __txn_checkpoint_can_skip -- - * Determine whether it's safe to skip taking a checkpoint. + * Determine whether it's safe to skip taking a checkpoint. 
*/ static int -__txn_checkpoint_can_skip(WT_SESSION_IMPL *session, - const char *cfg[], bool *fullp, bool *use_timestampp, bool *can_skipp) +__txn_checkpoint_can_skip( + WT_SESSION_IMPL *session, const char *cfg[], bool *fullp, bool *use_timestampp, bool *can_skipp) { - WT_CONFIG targetconf; - WT_CONFIG_ITEM cval, k, v; - WT_CONNECTION_IMPL *conn; - WT_TXN_GLOBAL *txn_global; - bool full, use_timestamp; - - /* - * Default to not skipping - also initialize the other output - * parameters - even though they will always be initialized unless - * there is an error and callers need to ignore the results on error. - */ - *can_skipp = *fullp = *use_timestampp = false; - - conn = S2C(session); - txn_global = &conn->txn_global; - - /* - * This function also parses out some configuration options and hands - * them back to the caller - make sure it does that parsing regardless - * of the result. - * - * Determine if this is going to be a full checkpoint, that is a - * checkpoint that applies to all data tables in a database. - */ - WT_RET(__wt_config_gets(session, cfg, "target", &cval)); - __wt_config_subinit(session, &targetconf, &cval); - *fullp = full = __wt_config_next(&targetconf, &k, &v) != 0; - - WT_RET(__wt_config_gets(session, cfg, "use_timestamp", &cval)); - *use_timestampp = use_timestamp = cval.val != 0; - - /* Never skip non-full checkpoints */ - if (!full) - return (0); - - /* Never skip if force is configured. */ - WT_RET(__wt_config_gets_def(session, cfg, "force", 0, &cval)); - if (cval.val != 0) - return (0); - - /* Never skip named checkpoints. */ - WT_RET(__wt_config_gets(session, cfg, "name", &cval)); - if (cval.len != 0) - return (0); - - /* - * It isn't currently safe to skip timestamp checkpoints - see WT-4958. - * We should fix this so we can skip timestamp checkpoints if they - * don't have new content. - */ - if (use_timestamp) - return (0); - - /* - * Skip checkpointing the database if nothing has been dirtied since - * the last checkpoint. 
That said there can be short instances when a - * btree gets marked dirty and the connection is yet to be. We might - * skip a checkpoint in that short instance, which is okay because by - * the next time we get to checkpoint, the connection would have been - * marked dirty and hence the checkpoint will not be skipped again. - */ - if (!conn->modified) { - *can_skipp = true; - return (0); - } - - /* - * If the checkpoint is using timestamps, and the stable timestamp - * hasn't been updated since the last checkpoint there is nothing - * more that could be written. - */ - if (use_timestamp && txn_global->has_stable_timestamp && - txn_global->last_ckpt_timestamp != WT_TS_NONE && - txn_global->last_ckpt_timestamp == txn_global->stable_timestamp) { - *can_skipp = true; - return (0); - } - - return (0); + WT_CONFIG targetconf; + WT_CONFIG_ITEM cval, k, v; + WT_CONNECTION_IMPL *conn; + WT_TXN_GLOBAL *txn_global; + bool full, use_timestamp; + + /* + * Default to not skipping - also initialize the other output parameters - even though they will + * always be initialized unless there is an error and callers need to ignore the results on + * error. + */ + *can_skipp = *fullp = *use_timestampp = false; + + conn = S2C(session); + txn_global = &conn->txn_global; + + /* + * This function also parses out some configuration options and hands + * them back to the caller - make sure it does that parsing regardless + * of the result. + * + * Determine if this is going to be a full checkpoint, that is a + * checkpoint that applies to all data tables in a database. + */ + WT_RET(__wt_config_gets(session, cfg, "target", &cval)); + __wt_config_subinit(session, &targetconf, &cval); + *fullp = full = __wt_config_next(&targetconf, &k, &v) != 0; + + WT_RET(__wt_config_gets(session, cfg, "use_timestamp", &cval)); + *use_timestampp = use_timestamp = cval.val != 0; + + /* Never skip non-full checkpoints */ + if (!full) + return (0); + + /* Never skip if force is configured. 
*/ + WT_RET(__wt_config_gets_def(session, cfg, "force", 0, &cval)); + if (cval.val != 0) + return (0); + + /* Never skip named checkpoints. */ + WT_RET(__wt_config_gets(session, cfg, "name", &cval)); + if (cval.len != 0) + return (0); + + /* + * It isn't currently safe to skip timestamp checkpoints - see WT-4958. We should fix this so we + * can skip timestamp checkpoints if they don't have new content. + */ + if (use_timestamp) + return (0); + + /* + * Skip checkpointing the database if nothing has been dirtied since the last checkpoint. That + * said there can be short instances when a btree gets marked dirty and the connection is yet to + * be. We might skip a checkpoint in that short instance, which is okay because by the next time + * we get to checkpoint, the connection would have been marked dirty and hence the checkpoint + * will not be skipped again. + */ + if (!conn->modified) { + *can_skipp = true; + return (0); + } + + /* + * If the checkpoint is using timestamps, and the stable timestamp hasn't been updated since the + * last checkpoint there is nothing more that could be written. + */ + if (use_timestamp && txn_global->has_stable_timestamp && + txn_global->last_ckpt_timestamp != WT_TS_NONE && + txn_global->last_ckpt_timestamp == txn_global->stable_timestamp) { + *can_skipp = true; + return (0); + } + + return (0); } /* * __txn_checkpoint -- - * Checkpoint a database or a list of objects in the database. + * Checkpoint a database or a list of objects in the database. 
*/ static int __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) { - WT_CACHE *cache; - WT_CONNECTION_IMPL *conn; - WT_DECL_RET; - WT_TXN *txn; - WT_TXN_GLOBAL *txn_global; - WT_TXN_ISOLATION saved_isolation; - wt_timestamp_t ckpt_tmp_ts; - uint64_t fsync_duration_usecs, generation, time_start, time_stop; - u_int i; - bool can_skip, failed, full, idle, logging, tracking, use_timestamp; - void *saved_meta_next; - - conn = S2C(session); - cache = conn->cache; - txn = &session->txn; - txn_global = &conn->txn_global; - saved_isolation = session->isolation; - full = idle = logging = tracking = use_timestamp = false; - - /* Avoid doing work if possible. */ - WT_RET(__txn_checkpoint_can_skip(session, - cfg, &full, &use_timestamp, &can_skip)); - if (can_skip) { - WT_STAT_CONN_INCR(session, txn_checkpoint_skipped); - return (0); - } - - /* - * Do a pass over the configuration arguments and figure out what kind - * of checkpoint this is. - */ - WT_RET(__checkpoint_apply_all(session, cfg, NULL)); - - logging = FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED); - - /* Reset the maximum page size seen by eviction. */ - conn->cache->evict_max_page_size = 0; - - /* Initialize the verbose tracking timer */ - __wt_epoch(session, &conn->ckpt_timer_start); - - /* Initialize the checkpoint progress tracking data */ - conn->ckpt_progress_msg_count = 0; - conn->ckpt_write_bytes = 0; - conn->ckpt_write_pages = 0; - - /* - * Update the global oldest ID so we do all possible cleanup. - * - * This is particularly important for compact, so that all dirty pages - * can be fully written. - */ - WT_ERR(__wt_txn_update_oldest( - session, WT_TXN_OLDEST_STRICT | WT_TXN_OLDEST_WAIT)); - - /* Flush data-sources before we start the checkpoint. */ - WT_ERR(__checkpoint_data_source(session, cfg)); - - /* - * Try to reduce the amount of dirty data in cache so there is less - * work do during the critical section of the checkpoint. 
- */ - __checkpoint_reduce_dirty_cache(session); - - /* Tell logging that we are about to start a database checkpoint. */ - if (full && logging) - WT_ERR(__wt_txn_checkpoint_log( - session, full, WT_TXN_LOG_CKPT_PREPARE, NULL)); - - __checkpoint_verbose_track(session, "starting transaction"); - - if (full) - __wt_epoch(session, &conn->ckpt_timer_scrub_end); - - /* - * Start the checkpoint for real. - * - * Bump the global checkpoint generation, used to figure out whether - * checkpoint has visited a tree. Use an atomic increment even though - * we are single-threaded because readers of the checkpoint generation - * don't hold the checkpoint lock. - * - * We do need to update it before clearing the checkpoint's entry out - * of the transaction table, or a thread evicting in a tree could - * ignore the checkpoint's transaction. - */ - generation = __wt_gen_next(session, WT_GEN_CHECKPOINT); - WT_STAT_CONN_SET(session, txn_checkpoint_generation, generation); - - /* - * We want to skip checkpointing clean handles whenever possible. That - * is, when the checkpoint is not named or forced. However, we need to - * take care about ordering with respect to the checkpoint transaction. - * - * We can't skip clean handles before starting the transaction or the - * checkpoint can miss updates in trees that become dirty as the - * checkpoint is starting. If we wait until the transaction has - * started before locking a handle, there could be a metadata-changing - * operation in between (e.g., salvage) that will cause a write - * conflict when the checkpoint goes to write the metadata. - * - * Hold the schema lock while starting the transaction and gathering - * handles so the set we get is complete and correct. 
- */ - WT_WITH_SCHEMA_LOCK(session, - ret = __checkpoint_prepare(session, &tracking, cfg)); - WT_ERR(ret); - - WT_ASSERT(session, txn->isolation == WT_ISO_SNAPSHOT); - - /* - * Unblock updates -- we can figure out that any updates to clean pages - * after this point are too new to be written in the checkpoint. - */ - cache->eviction_scrub_target = 0.0; - WT_STAT_CONN_SET(session, txn_checkpoint_scrub_target, 0); - - /* Tell logging that we have started a database checkpoint. */ - if (full && logging) - WT_ERR(__wt_txn_checkpoint_log( - session, full, WT_TXN_LOG_CKPT_START, NULL)); - - __checkpoint_timing_stress(session); - - WT_ERR(__checkpoint_apply(session, cfg, __checkpoint_tree_helper)); - - /* - * Clear the dhandle so the visibility check doesn't get confused about - * the snap min. Don't bother restoring the handle since it doesn't - * make sense to carry a handle across a checkpoint. - */ - session->dhandle = NULL; - - /* - * Record the timestamp from the transaction if we were successful. - * Store it in a temp variable now because it will be invalidated during - * commit but we don't want to set it until we know the checkpoint - * is successful. We have to set the system information before we - * release the snapshot. - */ - ckpt_tmp_ts = 0; - if (full) { - WT_ERR(__wt_meta_sysinfo_set(session)); - ckpt_tmp_ts = txn->read_timestamp; - } - - /* Release the snapshot so we aren't pinning updates in cache. */ - __wt_txn_release_snapshot(session); - - /* Mark all trees as open for business (particularly eviction). */ - WT_ERR(__checkpoint_apply(session, cfg, __checkpoint_presync)); - __wt_evict_server_wake(session); - - __checkpoint_verbose_track(session, "committing transaction"); - - /* - * Checkpoints have to hit disk (it would be reasonable to configure for - * lazy checkpoints, but we don't support them yet). 
- */ - time_start = __wt_clock(session); - WT_ERR(__checkpoint_apply(session, cfg, __wt_checkpoint_sync)); - time_stop = __wt_clock(session); - fsync_duration_usecs = WT_CLOCKDIFF_US(time_stop, time_start); - WT_STAT_CONN_INCR(session, txn_checkpoint_fsync_post); - WT_STAT_CONN_SET(session, - txn_checkpoint_fsync_post_duration, fsync_duration_usecs); - - __checkpoint_verbose_track(session, "sync completed"); - - /* - * Commit the transaction now that we are sure that all files in the - * checkpoint have been flushed to disk. It's OK to commit before - * checkpointing the metadata since we know that all files in the - * checkpoint are now in a consistent state. - */ - WT_ERR(__wt_txn_commit(session, NULL)); - - /* - * Ensure that the metadata changes are durable before the checkpoint - * is resolved. Do this by either checkpointing the metadata or syncing - * the log file. - * Recovery relies on the checkpoint LSN in the metadata only being - * updated by full checkpoints so only checkpoint the metadata for - * full or non-logged checkpoints. - * - * This is very similar to __wt_meta_track_off, ideally they would be - * merged. - */ - if (full || !logging) { - session->isolation = txn->isolation = WT_ISO_READ_UNCOMMITTED; - /* Disable metadata tracking during the metadata checkpoint. 
*/ - saved_meta_next = session->meta_track_next; - session->meta_track_next = NULL; - WT_WITH_DHANDLE(session, WT_SESSION_META_DHANDLE(session), - WT_WITH_METADATA_LOCK(session, - ret = __wt_checkpoint(session, cfg))); - session->meta_track_next = saved_meta_next; - WT_ERR(ret); - - WT_WITH_DHANDLE(session, - WT_SESSION_META_DHANDLE(session), - ret = __wt_checkpoint_sync(session, NULL)); - WT_ERR(ret); - - __checkpoint_verbose_track(session, "metadata sync completed"); - } else - WT_WITH_DHANDLE(session, - WT_SESSION_META_DHANDLE(session), - ret = __wt_txn_checkpoint_log( - session, false, WT_TXN_LOG_CKPT_SYNC, NULL)); - - /* - * Now that the metadata is stable, re-open the metadata file for - * regular eviction by clearing the checkpoint_pinned flag. - */ - txn_global->checkpoint_state.pinned_id = WT_TXN_NONE; - - if (full) { - __checkpoint_stats(session); - - /* - * If timestamps were used to define the content of the - * checkpoint update the saved last checkpoint timestamp, - * otherwise leave it alone. If a checkpoint is taken without - * timestamps, it's likely a bug, but we don't want to clear - * the saved last checkpoint timestamp regardless. - */ - if (use_timestamp) - conn->txn_global.last_ckpt_timestamp = ckpt_tmp_ts; - } + WT_CACHE *cache; + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + WT_TXN *txn; + WT_TXN_GLOBAL *txn_global; + WT_TXN_ISOLATION saved_isolation; + wt_timestamp_t ckpt_tmp_ts; + uint64_t fsync_duration_usecs, generation, time_start, time_stop; + u_int i; + bool can_skip, failed, full, idle, logging, tracking, use_timestamp; + void *saved_meta_next; + + conn = S2C(session); + cache = conn->cache; + txn = &session->txn; + txn_global = &conn->txn_global; + saved_isolation = session->isolation; + full = idle = logging = tracking = use_timestamp = false; + + /* Avoid doing work if possible. 
*/ + WT_RET(__txn_checkpoint_can_skip(session, cfg, &full, &use_timestamp, &can_skip)); + if (can_skip) { + WT_STAT_CONN_INCR(session, txn_checkpoint_skipped); + return (0); + } + + /* + * Do a pass over the configuration arguments and figure out what kind of checkpoint this is. + */ + WT_RET(__checkpoint_apply_all(session, cfg, NULL)); + + logging = FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED); + + /* Reset the maximum page size seen by eviction. */ + conn->cache->evict_max_page_size = 0; + + /* Initialize the verbose tracking timer */ + __wt_epoch(session, &conn->ckpt_timer_start); + + /* Initialize the checkpoint progress tracking data */ + conn->ckpt_progress_msg_count = 0; + conn->ckpt_write_bytes = 0; + conn->ckpt_write_pages = 0; + + /* + * Update the global oldest ID so we do all possible cleanup. + * + * This is particularly important for compact, so that all dirty pages + * can be fully written. + */ + WT_ERR(__wt_txn_update_oldest(session, WT_TXN_OLDEST_STRICT | WT_TXN_OLDEST_WAIT)); + + /* Flush data-sources before we start the checkpoint. */ + WT_ERR(__checkpoint_data_source(session, cfg)); + + /* + * Try to reduce the amount of dirty data in cache so there is less work do during the critical + * section of the checkpoint. + */ + __checkpoint_reduce_dirty_cache(session); + + /* Tell logging that we are about to start a database checkpoint. */ + if (full && logging) + WT_ERR(__wt_txn_checkpoint_log(session, full, WT_TXN_LOG_CKPT_PREPARE, NULL)); + + __checkpoint_verbose_track(session, "starting transaction"); + + if (full) + __wt_epoch(session, &conn->ckpt_timer_scrub_end); + + /* + * Start the checkpoint for real. + * + * Bump the global checkpoint generation, used to figure out whether + * checkpoint has visited a tree. Use an atomic increment even though + * we are single-threaded because readers of the checkpoint generation + * don't hold the checkpoint lock. 
+ * + * We do need to update it before clearing the checkpoint's entry out + * of the transaction table, or a thread evicting in a tree could + * ignore the checkpoint's transaction. + */ + generation = __wt_gen_next(session, WT_GEN_CHECKPOINT); + WT_STAT_CONN_SET(session, txn_checkpoint_generation, generation); + + /* + * We want to skip checkpointing clean handles whenever possible. That + * is, when the checkpoint is not named or forced. However, we need to + * take care about ordering with respect to the checkpoint transaction. + * + * We can't skip clean handles before starting the transaction or the + * checkpoint can miss updates in trees that become dirty as the + * checkpoint is starting. If we wait until the transaction has + * started before locking a handle, there could be a metadata-changing + * operation in between (e.g., salvage) that will cause a write + * conflict when the checkpoint goes to write the metadata. + * + * Hold the schema lock while starting the transaction and gathering + * handles so the set we get is complete and correct. + */ + WT_WITH_SCHEMA_LOCK(session, ret = __checkpoint_prepare(session, &tracking, cfg)); + WT_ERR(ret); + + WT_ASSERT(session, txn->isolation == WT_ISO_SNAPSHOT); + + /* + * Unblock updates -- we can figure out that any updates to clean pages after this point are too + * new to be written in the checkpoint. + */ + cache->eviction_scrub_target = 0.0; + WT_STAT_CONN_SET(session, txn_checkpoint_scrub_target, 0); + + /* Tell logging that we have started a database checkpoint. */ + if (full && logging) + WT_ERR(__wt_txn_checkpoint_log(session, full, WT_TXN_LOG_CKPT_START, NULL)); + + __checkpoint_timing_stress(session); + + WT_ERR(__checkpoint_apply(session, cfg, __checkpoint_tree_helper)); + + /* + * Clear the dhandle so the visibility check doesn't get confused about the snap min. Don't + * bother restoring the handle since it doesn't make sense to carry a handle across a + * checkpoint. 
+ */ + session->dhandle = NULL; + + /* + * Record the timestamp from the transaction if we were successful. Store it in a temp variable + * now because it will be invalidated during commit but we don't want to set it until we know + * the checkpoint is successful. We have to set the system information before we release the + * snapshot. + */ + ckpt_tmp_ts = 0; + if (full) { + WT_ERR(__wt_meta_sysinfo_set(session)); + ckpt_tmp_ts = txn->read_timestamp; + } + + /* Release the snapshot so we aren't pinning updates in cache. */ + __wt_txn_release_snapshot(session); + + /* Mark all trees as open for business (particularly eviction). */ + WT_ERR(__checkpoint_apply(session, cfg, __checkpoint_presync)); + __wt_evict_server_wake(session); + + __checkpoint_verbose_track(session, "committing transaction"); + + /* + * Checkpoints have to hit disk (it would be reasonable to configure for lazy checkpoints, but + * we don't support them yet). + */ + time_start = __wt_clock(session); + WT_ERR(__checkpoint_apply(session, cfg, __wt_checkpoint_sync)); + time_stop = __wt_clock(session); + fsync_duration_usecs = WT_CLOCKDIFF_US(time_stop, time_start); + WT_STAT_CONN_INCR(session, txn_checkpoint_fsync_post); + WT_STAT_CONN_SET(session, txn_checkpoint_fsync_post_duration, fsync_duration_usecs); + + __checkpoint_verbose_track(session, "sync completed"); + + /* + * Commit the transaction now that we are sure that all files in the checkpoint have been + * flushed to disk. It's OK to commit before checkpointing the metadata since we know that all + * files in the checkpoint are now in a consistent state. + */ + WT_ERR(__wt_txn_commit(session, NULL)); + + /* + * Ensure that the metadata changes are durable before the checkpoint + * is resolved. Do this by either checkpointing the metadata or syncing + * the log file. + * Recovery relies on the checkpoint LSN in the metadata only being + * updated by full checkpoints so only checkpoint the metadata for + * full or non-logged checkpoints. 
+ * + * This is very similar to __wt_meta_track_off, ideally they would be + * merged. + */ + if (full || !logging) { + session->isolation = txn->isolation = WT_ISO_READ_UNCOMMITTED; + /* Disable metadata tracking during the metadata checkpoint. */ + saved_meta_next = session->meta_track_next; + session->meta_track_next = NULL; + WT_WITH_DHANDLE(session, WT_SESSION_META_DHANDLE(session), + WT_WITH_METADATA_LOCK(session, ret = __wt_checkpoint(session, cfg))); + session->meta_track_next = saved_meta_next; + WT_ERR(ret); + + WT_WITH_DHANDLE( + session, WT_SESSION_META_DHANDLE(session), ret = __wt_checkpoint_sync(session, NULL)); + WT_ERR(ret); + + __checkpoint_verbose_track(session, "metadata sync completed"); + } else + WT_WITH_DHANDLE(session, WT_SESSION_META_DHANDLE(session), + ret = __wt_txn_checkpoint_log(session, false, WT_TXN_LOG_CKPT_SYNC, NULL)); + + /* + * Now that the metadata is stable, re-open the metadata file for regular eviction by clearing + * the checkpoint_pinned flag. + */ + txn_global->checkpoint_state.pinned_id = WT_TXN_NONE; + + if (full) { + __checkpoint_stats(session); + + /* + * If timestamps were used to define the content of the checkpoint update the saved last + * checkpoint timestamp, otherwise leave it alone. If a checkpoint is taken without + * timestamps, it's likely a bug, but we don't want to clear the saved last checkpoint + * timestamp regardless. + */ + if (use_timestamp) + conn->txn_global.last_ckpt_timestamp = ckpt_tmp_ts; + } err: - /* - * Reset the timer so that next checkpoint tracks the progress only if - * configured. - */ - conn->ckpt_timer_start.tv_sec = 0; - - /* - * XXX - * Rolling back the changes here is problematic. - * - * If we unroll here, we need a way to roll back changes to the avail - * list for each tree that was successfully synced before the error - * occurred. Otherwise, the next time we try this operation, we will - * try to free an old checkpoint again. 
- * - * OTOH, if we commit the changes after a failure, we have partially - * overwritten the checkpoint, so what ends up on disk is not - * consistent. - */ - failed = ret != 0; - if (failed) - conn->modified = true; - - session->isolation = txn->isolation = WT_ISO_READ_UNCOMMITTED; - if (tracking) - WT_TRET(__wt_meta_track_off(session, false, failed)); - - cache->eviction_scrub_target = 0.0; - WT_STAT_CONN_SET(session, txn_checkpoint_scrub_target, 0); - - if (F_ISSET(txn, WT_TXN_RUNNING)) { - /* - * Clear the dhandle so the visibility check doesn't get - * confused about the snap min. Don't bother restoring the - * handle since it doesn't make sense to carry a handle across - * a checkpoint. - */ - session->dhandle = NULL; - WT_TRET(__wt_txn_rollback(session, NULL)); - } - - /* - * Tell logging that we have finished a database checkpoint. Do not - * write a log record if the database was idle. - */ - if (full && logging) { - if (ret == 0 && - F_ISSET(((WT_CURSOR_BTREE *) - session->meta_cursor)->btree, WT_BTREE_SKIP_CKPT)) - idle = true; - WT_TRET(__wt_txn_checkpoint_log(session, full, - (ret == 0 && !idle) ? - WT_TXN_LOG_CKPT_STOP : WT_TXN_LOG_CKPT_CLEANUP, NULL)); - } - - for (i = 0; i < session->ckpt_handle_next; ++i) { - if (session->ckpt_handle[i] == NULL) - continue; - /* - * If the operation failed, mark all trees dirty so they are - * included if a future checkpoint can succeed. - */ - if (failed) - WT_WITH_DHANDLE(session, session->ckpt_handle[i], - __checkpoint_fail_reset(session)); - WT_WITH_DHANDLE(session, session->ckpt_handle[i], - WT_TRET(__wt_session_release_dhandle(session))); - } - - __wt_free(session, session->ckpt_handle); - session->ckpt_handle_allocated = session->ckpt_handle_next = 0; - - session->isolation = txn->isolation = saved_isolation; - return (ret); + /* + * Reset the timer so that next checkpoint tracks the progress only if configured. 
+ */ + conn->ckpt_timer_start.tv_sec = 0; + + /* + * XXX + * Rolling back the changes here is problematic. + * + * If we unroll here, we need a way to roll back changes to the avail + * list for each tree that was successfully synced before the error + * occurred. Otherwise, the next time we try this operation, we will + * try to free an old checkpoint again. + * + * OTOH, if we commit the changes after a failure, we have partially + * overwritten the checkpoint, so what ends up on disk is not + * consistent. + */ + failed = ret != 0; + if (failed) + conn->modified = true; + + session->isolation = txn->isolation = WT_ISO_READ_UNCOMMITTED; + if (tracking) + WT_TRET(__wt_meta_track_off(session, false, failed)); + + cache->eviction_scrub_target = 0.0; + WT_STAT_CONN_SET(session, txn_checkpoint_scrub_target, 0); + + if (F_ISSET(txn, WT_TXN_RUNNING)) { + /* + * Clear the dhandle so the visibility check doesn't get confused about the snap min. Don't + * bother restoring the handle since it doesn't make sense to carry a handle across a + * checkpoint. + */ + session->dhandle = NULL; + WT_TRET(__wt_txn_rollback(session, NULL)); + } + + /* + * Tell logging that we have finished a database checkpoint. Do not write a log record if the + * database was idle. + */ + if (full && logging) { + if (ret == 0 && + F_ISSET(((WT_CURSOR_BTREE *)session->meta_cursor)->btree, WT_BTREE_SKIP_CKPT)) + idle = true; + WT_TRET(__wt_txn_checkpoint_log(session, full, + (ret == 0 && !idle) ? WT_TXN_LOG_CKPT_STOP : WT_TXN_LOG_CKPT_CLEANUP, NULL)); + } + + for (i = 0; i < session->ckpt_handle_next; ++i) { + if (session->ckpt_handle[i] == NULL) + continue; + /* + * If the operation failed, mark all trees dirty so they are included if a future checkpoint + * can succeed. 
+ */ + if (failed) + WT_WITH_DHANDLE(session, session->ckpt_handle[i], __checkpoint_fail_reset(session)); + WT_WITH_DHANDLE( + session, session->ckpt_handle[i], WT_TRET(__wt_session_release_dhandle(session))); + } + + __wt_free(session, session->ckpt_handle); + session->ckpt_handle_allocated = session->ckpt_handle_next = 0; + + session->isolation = txn->isolation = saved_isolation; + return (ret); } /* * __txn_checkpoint_wrapper -- - * Checkpoint wrapper. + * Checkpoint wrapper. */ static int __txn_checkpoint_wrapper(WT_SESSION_IMPL *session, const char *cfg[]) { - WT_DECL_RET; - WT_TXN_GLOBAL *txn_global; + WT_DECL_RET; + WT_TXN_GLOBAL *txn_global; - txn_global = &S2C(session)->txn_global; + txn_global = &S2C(session)->txn_global; - WT_STAT_CONN_SET(session, txn_checkpoint_running, 1); - txn_global->checkpoint_running = true; + WT_STAT_CONN_SET(session, txn_checkpoint_running, 1); + txn_global->checkpoint_running = true; - ret = __txn_checkpoint(session, cfg); + ret = __txn_checkpoint(session, cfg); - WT_STAT_CONN_SET(session, txn_checkpoint_running, 0); - txn_global->checkpoint_running = false; + WT_STAT_CONN_SET(session, txn_checkpoint_running, 0); + txn_global->checkpoint_running = false; - return (ret); + return (ret); } /* * __wt_txn_checkpoint -- - * Checkpoint a database or a list of objects in the database. + * Checkpoint a database or a list of objects in the database. */ int __wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[], bool waiting) { - WT_DECL_RET; - uint32_t orig_flags; - - /* - * Reset open cursors. Do this explicitly, even though it will happen - * implicitly in the call to begin_transaction for the checkpoint, the - * checkpoint code will acquire the schema lock before we do that, and - * some implementation of WT_CURSOR::reset might need the schema lock. - */ - WT_RET(__wt_session_reset_cursors(session, false)); - - /* Ensure the metadata table is open before taking any locks. 
*/ - WT_RET(__wt_metadata_cursor(session, NULL)); - - /* - * Don't highjack the session checkpoint thread for eviction. - * - * Application threads are not generally available for potentially slow - * operations, but checkpoint does enough I/O it may be called upon to - * perform slow operations for the block manager. - * - * Application checkpoints wait until the checkpoint lock is available, - * compaction checkpoints don't. - * - * Checkpoints should always use a separate session for lookaside - * updates, otherwise those updates are pinned until the checkpoint - * commits. Also, there are unfortunate interactions between the - * special rules for lookaside eviction and the special handling of the - * checkpoint transaction. - */ + WT_DECL_RET; + uint32_t orig_flags; + + /* + * Reset open cursors. Do this explicitly, even though it will happen implicitly in the call to + * begin_transaction for the checkpoint, the checkpoint code will acquire the schema lock before + * we do that, and some implementation of WT_CURSOR::reset might need the schema lock. + */ + WT_RET(__wt_session_reset_cursors(session, false)); + + /* Ensure the metadata table is open before taking any locks. */ + WT_RET(__wt_metadata_cursor(session, NULL)); + +/* + * Don't highjack the session checkpoint thread for eviction. + * + * Application threads are not generally available for potentially slow + * operations, but checkpoint does enough I/O it may be called upon to + * perform slow operations for the block manager. + * + * Application checkpoints wait until the checkpoint lock is available, + * compaction checkpoints don't. + * + * Checkpoints should always use a separate session for lookaside + * updates, otherwise those updates are pinned until the checkpoint + * commits. Also, there are unfortunate interactions between the + * special rules for lookaside eviction and the special handling of the + * checkpoint transaction. 
+ */ #undef WT_CHECKPOINT_SESSION_FLAGS -#define WT_CHECKPOINT_SESSION_FLAGS \ - (WT_SESSION_CAN_WAIT | WT_SESSION_IGNORE_CACHE_SIZE) +#define WT_CHECKPOINT_SESSION_FLAGS (WT_SESSION_CAN_WAIT | WT_SESSION_IGNORE_CACHE_SIZE) #undef WT_CHECKPOINT_SESSION_FLAGS_OFF -#define WT_CHECKPOINT_SESSION_FLAGS_OFF \ - (WT_SESSION_LOOKASIDE_CURSOR) - orig_flags = F_MASK(session, - WT_CHECKPOINT_SESSION_FLAGS | WT_CHECKPOINT_SESSION_FLAGS_OFF); - F_SET(session, WT_CHECKPOINT_SESSION_FLAGS); - F_CLR(session, WT_CHECKPOINT_SESSION_FLAGS_OFF); - - /* - * Only one checkpoint can be active at a time, and checkpoints must run - * in the same order as they update the metadata. It's probably a bad - * idea to run checkpoints out of multiple threads, but as compaction - * calls checkpoint directly, it can be tough to avoid. Serialize here - * to ensure we don't get into trouble. - */ - if (waiting) - WT_WITH_CHECKPOINT_LOCK(session, - ret = __txn_checkpoint_wrapper(session, cfg)); - else - WT_WITH_CHECKPOINT_LOCK_NOWAIT(session, ret, - ret = __txn_checkpoint_wrapper(session, cfg)); - - F_CLR(session, WT_CHECKPOINT_SESSION_FLAGS); - F_SET(session, orig_flags); - - return (ret); +#define WT_CHECKPOINT_SESSION_FLAGS_OFF (WT_SESSION_LOOKASIDE_CURSOR) + orig_flags = F_MASK(session, WT_CHECKPOINT_SESSION_FLAGS | WT_CHECKPOINT_SESSION_FLAGS_OFF); + F_SET(session, WT_CHECKPOINT_SESSION_FLAGS); + F_CLR(session, WT_CHECKPOINT_SESSION_FLAGS_OFF); + + /* + * Only one checkpoint can be active at a time, and checkpoints must run in the same order as + * they update the metadata. It's probably a bad idea to run checkpoints out of multiple + * threads, but as compaction calls checkpoint directly, it can be tough to avoid. Serialize + * here to ensure we don't get into trouble. 
+ */ + if (waiting) + WT_WITH_CHECKPOINT_LOCK(session, ret = __txn_checkpoint_wrapper(session, cfg)); + else + WT_WITH_CHECKPOINT_LOCK_NOWAIT(session, ret, ret = __txn_checkpoint_wrapper(session, cfg)); + + F_CLR(session, WT_CHECKPOINT_SESSION_FLAGS); + F_SET(session, orig_flags); + + return (ret); } /* * __drop -- - * Drop all checkpoints with a specific name. + * Drop all checkpoints with a specific name. */ static void __drop(WT_CKPT *ckptbase, const char *name, size_t len) { - WT_CKPT *ckpt; - - /* - * If we're dropping internal checkpoints, match to the '.' separating - * the checkpoint name from the generational number, and take all that - * we can find. Applications aren't allowed to use any variant of this - * name, so the test is still pretty simple, if the leading bytes match, - * it's one we want to drop. - */ - if (strncmp(WT_CHECKPOINT, name, len) == 0) { - WT_CKPT_FOREACH(ckptbase, ckpt) - if (WT_PREFIX_MATCH(ckpt->name, WT_CHECKPOINT)) - F_SET(ckpt, WT_CKPT_DELETE); - } else - WT_CKPT_FOREACH(ckptbase, ckpt) - if (WT_STRING_MATCH(ckpt->name, name, len)) - F_SET(ckpt, WT_CKPT_DELETE); + WT_CKPT *ckpt; + + /* + * If we're dropping internal checkpoints, match to the '.' separating the checkpoint name from + * the generational number, and take all that we can find. Applications aren't allowed to use + * any variant of this name, so the test is still pretty simple, if the leading bytes match, + * it's one we want to drop. + */ + if (strncmp(WT_CHECKPOINT, name, len) == 0) { + WT_CKPT_FOREACH (ckptbase, ckpt) + if (WT_PREFIX_MATCH(ckpt->name, WT_CHECKPOINT)) + F_SET(ckpt, WT_CKPT_DELETE); + } else + WT_CKPT_FOREACH (ckptbase, ckpt) + if (WT_STRING_MATCH(ckpt->name, name, len)) + F_SET(ckpt, WT_CKPT_DELETE); } /* * __drop_from -- - * Drop all checkpoints after, and including, the named checkpoint. + * Drop all checkpoints after, and including, the named checkpoint. 
*/ static void __drop_from(WT_CKPT *ckptbase, const char *name, size_t len) { - WT_CKPT *ckpt; - bool matched; - - /* - * There's a special case -- if the name is "all", then we delete all - * of the checkpoints. - */ - if (WT_STRING_MATCH("all", name, len)) { - WT_CKPT_FOREACH(ckptbase, ckpt) - F_SET(ckpt, WT_CKPT_DELETE); - return; - } - - /* - * We use the first checkpoint we can find, that is, if there are two - * checkpoints with the same name in the list, we'll delete from the - * first match to the end. - */ - matched = false; - WT_CKPT_FOREACH(ckptbase, ckpt) { - if (!matched && !WT_STRING_MATCH(ckpt->name, name, len)) - continue; - - matched = true; - F_SET(ckpt, WT_CKPT_DELETE); - } + WT_CKPT *ckpt; + bool matched; + + /* + * There's a special case -- if the name is "all", then we delete all of the checkpoints. + */ + if (WT_STRING_MATCH("all", name, len)) { + WT_CKPT_FOREACH (ckptbase, ckpt) + F_SET(ckpt, WT_CKPT_DELETE); + return; + } + + /* + * We use the first checkpoint we can find, that is, if there are two checkpoints with the same + * name in the list, we'll delete from the first match to the end. + */ + matched = false; + WT_CKPT_FOREACH (ckptbase, ckpt) { + if (!matched && !WT_STRING_MATCH(ckpt->name, name, len)) + continue; + + matched = true; + F_SET(ckpt, WT_CKPT_DELETE); + } } /* * __drop_to -- - * Drop all checkpoints before, and including, the named checkpoint. + * Drop all checkpoints before, and including, the named checkpoint. */ static void __drop_to(WT_CKPT *ckptbase, const char *name, size_t len) { - WT_CKPT *ckpt, *mark; - - /* - * We use the last checkpoint we can find, that is, if there are two - * checkpoints with the same name in the list, we'll delete from the - * beginning to the second match, not the first. 
- */ - mark = NULL; - WT_CKPT_FOREACH(ckptbase, ckpt) - if (WT_STRING_MATCH(ckpt->name, name, len)) - mark = ckpt; - - if (mark == NULL) - return; - - WT_CKPT_FOREACH(ckptbase, ckpt) { - F_SET(ckpt, WT_CKPT_DELETE); - - if (ckpt == mark) - break; - } + WT_CKPT *ckpt, *mark; + + /* + * We use the last checkpoint we can find, that is, if there are two checkpoints with the same + * name in the list, we'll delete from the beginning to the second match, not the first. + */ + mark = NULL; + WT_CKPT_FOREACH (ckptbase, ckpt) + if (WT_STRING_MATCH(ckpt->name, name, len)) + mark = ckpt; + + if (mark == NULL) + return; + + WT_CKPT_FOREACH (ckptbase, ckpt) { + F_SET(ckpt, WT_CKPT_DELETE); + + if (ckpt == mark) + break; + } } /* * __checkpoint_lock_dirty_tree_int -- - * Helper for __checkpoint_lock_dirty_tree. Intended to be called while - * holding the hot backup lock. + * Helper for __checkpoint_lock_dirty_tree. Intended to be called while holding the hot backup + * lock. */ static int -__checkpoint_lock_dirty_tree_int( - WT_SESSION_IMPL *session, bool is_checkpoint, - bool force, WT_BTREE *btree, WT_CKPT *ckpt, WT_CKPT *ckptbase) +__checkpoint_lock_dirty_tree_int(WT_SESSION_IMPL *session, bool is_checkpoint, bool force, + WT_BTREE *btree, WT_CKPT *ckpt, WT_CKPT *ckptbase) { - WT_CONNECTION_IMPL *conn; - WT_DECL_RET; - - WT_UNUSED(is_checkpoint); - conn = S2C(session); - - /* - * We can't delete checkpoints if a backup cursor is open. WiredTiger - * checkpoints are uniquely named and it's OK to have multiple of them - * in the system: clear the delete flag for them, and otherwise fail. - * Hold the lock until we're done (blocking hot backups from starting), - * we don't want to race with a future hot backup. 
- */ - if (conn->hot_backup) - WT_CKPT_FOREACH(ckptbase, ckpt) { - if (!F_ISSET(ckpt, WT_CKPT_DELETE)) - continue; - if (WT_PREFIX_MATCH(ckpt->name, WT_CHECKPOINT)) { - F_CLR(ckpt, WT_CKPT_DELETE); - continue; - } - WT_RET_MSG(session, EBUSY, - "checkpoint %s blocked by hot backup: it would" - "delete an existing checkpoint, and checkpoints " - "cannot be deleted during a hot backup", - ckpt->name); - } - /* - * Mark old checkpoints that are being deleted and figure out which - * trees we can skip in this checkpoint. - */ - WT_RET(__checkpoint_mark_skip(session, ckptbase, force)); - if (F_ISSET(btree, WT_BTREE_SKIP_CKPT)) - return (0); - /* - * Lock the checkpoints that will be deleted. - * - * Checkpoints are only locked when tracking is enabled, which covers - * checkpoint and drop operations, but not close. The reasoning is - * there should be no access to a checkpoint during close, because any - * thread accessing a checkpoint will also have the current file handle - * open. - */ - if (WT_META_TRACKING(session)) - WT_CKPT_FOREACH(ckptbase, ckpt) { - if (!F_ISSET(ckpt, WT_CKPT_DELETE)) - continue; - /* - * We can't delete checkpoints referenced by a cursor. - * WiredTiger checkpoints are uniquely named and it's - * OK to have multiple in the system: clear the delete - * flag for them, and otherwise fail. - */ - ret = __wt_session_lock_checkpoint(session, ckpt->name); - if (ret == 0) - continue; - if (ret == EBUSY && - WT_PREFIX_MATCH(ckpt->name, WT_CHECKPOINT)) { - F_CLR(ckpt, WT_CKPT_DELETE); - continue; - } - WT_RET_MSG(session, ret, - "checkpoints cannot be dropped when in-use"); - } - /* - * There are special trees: those being bulk-loaded, salvaged, upgraded - * or verified during the checkpoint. They should never be part of a - * checkpoint: we will fail to lock them because the operations have - * exclusive access to the handles. Named checkpoints will fail in that - * case, ordinary checkpoints skip files that cannot be opened normally. 
- */ - WT_ASSERT(session, - !is_checkpoint || !F_ISSET(btree, WT_BTREE_SPECIAL_FLAGS)); - - return (0); + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + + WT_UNUSED(is_checkpoint); + conn = S2C(session); + + /* + * We can't delete checkpoints if a backup cursor is open. WiredTiger checkpoints are uniquely + * named and it's OK to have multiple of them in the system: clear the delete flag for them, and + * otherwise fail. Hold the lock until we're done (blocking hot backups from starting), we don't + * want to race with a future hot backup. + */ + if (conn->hot_backup) + WT_CKPT_FOREACH (ckptbase, ckpt) { + if (!F_ISSET(ckpt, WT_CKPT_DELETE)) + continue; + if (WT_PREFIX_MATCH(ckpt->name, WT_CHECKPOINT)) { + F_CLR(ckpt, WT_CKPT_DELETE); + continue; + } + WT_RET_MSG(session, EBUSY, + "checkpoint %s blocked by hot backup: it would" + "delete an existing checkpoint, and checkpoints " + "cannot be deleted during a hot backup", + ckpt->name); + } + /* + * Mark old checkpoints that are being deleted and figure out which trees we can skip in this + * checkpoint. + */ + WT_RET(__checkpoint_mark_skip(session, ckptbase, force)); + if (F_ISSET(btree, WT_BTREE_SKIP_CKPT)) + return (0); + /* + * Lock the checkpoints that will be deleted. + * + * Checkpoints are only locked when tracking is enabled, which covers + * checkpoint and drop operations, but not close. The reasoning is + * there should be no access to a checkpoint during close, because any + * thread accessing a checkpoint will also have the current file handle + * open. + */ + if (WT_META_TRACKING(session)) + WT_CKPT_FOREACH (ckptbase, ckpt) { + if (!F_ISSET(ckpt, WT_CKPT_DELETE)) + continue; + /* + * We can't delete checkpoints referenced by a cursor. WiredTiger checkpoints are + * uniquely named and it's OK to have multiple in the system: clear the delete flag for + * them, and otherwise fail. 
+ */ + ret = __wt_session_lock_checkpoint(session, ckpt->name); + if (ret == 0) + continue; + if (ret == EBUSY && WT_PREFIX_MATCH(ckpt->name, WT_CHECKPOINT)) { + F_CLR(ckpt, WT_CKPT_DELETE); + continue; + } + WT_RET_MSG(session, ret, "checkpoints cannot be dropped when in-use"); + } + /* + * There are special trees: those being bulk-loaded, salvaged, upgraded or verified during the + * checkpoint. They should never be part of a checkpoint: we will fail to lock them because the + * operations have exclusive access to the handles. Named checkpoints will fail in that case, + * ordinary checkpoints skip files that cannot be opened normally. + */ + WT_ASSERT(session, !is_checkpoint || !F_ISSET(btree, WT_BTREE_SPECIAL_FLAGS)); + + return (0); } /* * __checkpoint_lock_dirty_tree -- - * Decide whether the tree needs to be included in the checkpoint and if - * so, acquire the necessary locks. + * Decide whether the tree needs to be included in the checkpoint and if so, acquire the + * necessary locks. */ static int -__checkpoint_lock_dirty_tree(WT_SESSION_IMPL *session, - bool is_checkpoint, bool force, bool need_tracking, const char *cfg[]) +__checkpoint_lock_dirty_tree( + WT_SESSION_IMPL *session, bool is_checkpoint, bool force, bool need_tracking, const char *cfg[]) { - WT_BTREE *btree; - WT_CKPT *ckpt, *ckptbase; - WT_CONFIG dropconf; - WT_CONFIG_ITEM cval, k, v; - WT_DATA_HANDLE *dhandle; - WT_DECL_RET; - const char *name; - char *name_alloc; - - btree = S2BT(session); - ckpt = ckptbase = NULL; - dhandle = session->dhandle; - name_alloc = NULL; - - /* Only referenced in diagnostic builds. */ - WT_UNUSED(is_checkpoint); - - /* - * Only referenced in diagnostic builds and gcc 5.1 isn't satisfied - * with wrapping the entire assert condition in the unused macro. - */ - WT_UNUSED(need_tracking); - - /* - * Most callers need meta tracking to be on here, otherwise it is - * possible for this checkpoint to cleanup handles that are still in - * use. 
The exceptions are: - * - Checkpointing the metadata handle itself. - * - On connection close when we know there can't be any races. - */ - WT_ASSERT(session, !need_tracking || - WT_IS_METADATA(dhandle) || WT_META_TRACKING(session)); - - /* Get the list of checkpoints for this file. */ - WT_RET(__wt_meta_ckptlist_get(session, dhandle->name, true, &ckptbase)); - - /* This may be a named checkpoint, check the configuration. */ - cval.len = 0; - if (cfg != NULL) - WT_ERR(__wt_config_gets(session, cfg, "name", &cval)); - if (cval.len == 0) - name = WT_CHECKPOINT; - else { - WT_ERR(__checkpoint_name_ok(session, cval.str, cval.len)); - WT_ERR(__wt_strndup(session, cval.str, cval.len, &name_alloc)); - name = name_alloc; - } - - /* We may be dropping specific checkpoints, check the configuration. */ - if (cfg != NULL) { - cval.len = 0; - WT_ERR(__wt_config_gets(session, cfg, "drop", &cval)); - if (cval.len != 0) { - __wt_config_subinit(session, &dropconf, &cval); - while ((ret = - __wt_config_next(&dropconf, &k, &v)) == 0) { - /* Disallow unsafe checkpoint names. */ - if (v.len == 0) - WT_ERR(__checkpoint_name_ok( - session, k.str, k.len)); - else - WT_ERR(__checkpoint_name_ok( - session, v.str, v.len)); - - if (v.len == 0) - __drop(ckptbase, k.str, k.len); - else if (WT_STRING_MATCH("from", k.str, k.len)) - __drop_from(ckptbase, v.str, v.len); - else if (WT_STRING_MATCH("to", k.str, k.len)) - __drop_to(ckptbase, v.str, v.len); - else - WT_ERR_MSG(session, EINVAL, - "unexpected value for checkpoint " - "key: %.*s", - (int)k.len, k.str); - } - WT_ERR_NOTFOUND_OK(ret); - } - } - - /* Drop checkpoints with the same name as the one we're taking. */ - __drop(ckptbase, name, strlen(name)); - - /* Set the name of the new entry at the end of the list. */ - WT_CKPT_FOREACH(ckptbase, ckpt) - ; - WT_ERR(__wt_strdup(session, name, &ckpt->name)); - - /* - * There is some interaction between backups and checkpoints. 
Perform - * all backup related operations that the checkpoint needs now, while - * holding the hot backup read lock. - */ - WT_WITH_HOTBACKUP_READ_LOCK_UNCOND(session, - ret = __checkpoint_lock_dirty_tree_int( - session, is_checkpoint, force, btree, ckpt, ckptbase)); - WT_ERR(ret); - if (F_ISSET(btree, WT_BTREE_SKIP_CKPT)) - goto err; - - WT_ASSERT(session, btree->ckpt == NULL && - !F_ISSET(btree, WT_BTREE_SKIP_CKPT)); - btree->ckpt = ckptbase; - - if (0) { + WT_BTREE *btree; + WT_CKPT *ckpt, *ckptbase; + WT_CONFIG dropconf; + WT_CONFIG_ITEM cval, k, v; + WT_DATA_HANDLE *dhandle; + WT_DECL_RET; + char *name_alloc; + const char *name; + + btree = S2BT(session); + ckpt = ckptbase = NULL; + dhandle = session->dhandle; + name_alloc = NULL; + + /* Only referenced in diagnostic builds. */ + WT_UNUSED(is_checkpoint); + + /* + * Only referenced in diagnostic builds and gcc 5.1 isn't satisfied with wrapping the entire + * assert condition in the unused macro. + */ + WT_UNUSED(need_tracking); + + /* + * Most callers need meta tracking to be on here, otherwise it is + * possible for this checkpoint to cleanup handles that are still in + * use. The exceptions are: + * - Checkpointing the metadata handle itself. + * - On connection close when we know there can't be any races. + */ + WT_ASSERT(session, !need_tracking || WT_IS_METADATA(dhandle) || WT_META_TRACKING(session)); + + /* Get the list of checkpoints for this file. */ + WT_RET(__wt_meta_ckptlist_get(session, dhandle->name, true, &ckptbase)); + + /* This may be a named checkpoint, check the configuration. */ + cval.len = 0; + if (cfg != NULL) + WT_ERR(__wt_config_gets(session, cfg, "name", &cval)); + if (cval.len == 0) + name = WT_CHECKPOINT; + else { + WT_ERR(__checkpoint_name_ok(session, cval.str, cval.len)); + WT_ERR(__wt_strndup(session, cval.str, cval.len, &name_alloc)); + name = name_alloc; + } + + /* We may be dropping specific checkpoints, check the configuration. 
*/ + if (cfg != NULL) { + cval.len = 0; + WT_ERR(__wt_config_gets(session, cfg, "drop", &cval)); + if (cval.len != 0) { + __wt_config_subinit(session, &dropconf, &cval); + while ((ret = __wt_config_next(&dropconf, &k, &v)) == 0) { + /* Disallow unsafe checkpoint names. */ + if (v.len == 0) + WT_ERR(__checkpoint_name_ok(session, k.str, k.len)); + else + WT_ERR(__checkpoint_name_ok(session, v.str, v.len)); + + if (v.len == 0) + __drop(ckptbase, k.str, k.len); + else if (WT_STRING_MATCH("from", k.str, k.len)) + __drop_from(ckptbase, v.str, v.len); + else if (WT_STRING_MATCH("to", k.str, k.len)) + __drop_to(ckptbase, v.str, v.len); + else + WT_ERR_MSG(session, EINVAL, + "unexpected value for checkpoint " + "key: %.*s", + (int)k.len, k.str); + } + WT_ERR_NOTFOUND_OK(ret); + } + } + + /* Drop checkpoints with the same name as the one we're taking. */ + __drop(ckptbase, name, strlen(name)); + + /* Set the name of the new entry at the end of the list. */ + WT_CKPT_FOREACH (ckptbase, ckpt) + ; + WT_ERR(__wt_strdup(session, name, &ckpt->name)); + + /* + * There is some interaction between backups and checkpoints. Perform all backup related + * operations that the checkpoint needs now, while holding the hot backup read lock. + */ + WT_WITH_HOTBACKUP_READ_LOCK_UNCOND(session, + ret = __checkpoint_lock_dirty_tree_int(session, is_checkpoint, force, btree, ckpt, ckptbase)); + WT_ERR(ret); + if (F_ISSET(btree, WT_BTREE_SKIP_CKPT)) + goto err; + + WT_ASSERT(session, btree->ckpt == NULL && !F_ISSET(btree, WT_BTREE_SKIP_CKPT)); + btree->ckpt = ckptbase; + + if (0) { err: - __wt_meta_ckptlist_free(session, &ckptbase); - } - __wt_free(session, name_alloc); + __wt_meta_ckptlist_free(session, &ckptbase); + } + __wt_free(session, name_alloc); - return (ret); + return (ret); } /* * __checkpoint_mark_skip -- - * Figure out whether the checkpoint can be skipped for a tree. + * Figure out whether the checkpoint can be skipped for a tree. 
*/ static int -__checkpoint_mark_skip( - WT_SESSION_IMPL *session, WT_CKPT *ckptbase, bool force) +__checkpoint_mark_skip(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, bool force) { - WT_BTREE *btree; - WT_CKPT *ckpt; - int deleted; - const char *name; - - btree = S2BT(session); - - /* - * Check for clean objects not requiring a checkpoint. - * - * If we're closing a handle, and the object is clean, we can skip the - * checkpoint, whatever checkpoints we have are sufficient. (We might - * not have any checkpoints if the object was never modified, and that's - * OK: the object creation code doesn't mark the tree modified so we can - * skip newly created trees here.) - * - * If the application repeatedly checkpoints an object (imagine hourly - * checkpoints using the same explicit or internal name), there's no - * reason to repeat the checkpoint for clean objects. The test is if - * the only checkpoint we're deleting is the last one in the list and - * it has the same name as the checkpoint we're about to take, skip the - * work. (We can't skip checkpoints that delete more than the last - * checkpoint because deleting those checkpoints might free up space in - * the file.) This means an application toggling between two (or more) - * checkpoint names will repeatedly take empty checkpoints, but that's - * not likely enough to make detection worthwhile. - * - * Checkpoint read-only objects otherwise: the application must be able - * to open the checkpoint in a cursor after taking any checkpoint, which - * means it must exist. - */ - F_CLR(btree, WT_BTREE_SKIP_CKPT); - if (!btree->modified && !force) { - deleted = 0; - WT_CKPT_FOREACH(ckptbase, ckpt) - if (F_ISSET(ckpt, WT_CKPT_DELETE)) - ++deleted; - - /* - * Complicated test: if the tree is clean and last two - * checkpoints have the same name (correcting for internal - * checkpoint names with their generational suffix numbers), we - * can skip the checkpoint, there's nothing to do. 
The - * exception is if we're deleting two or more checkpoints: then - * we may save space. - */ - name = (ckpt - 1)->name; - if (ckpt > ckptbase + 1 && deleted < 2 && - (strcmp(name, (ckpt - 2)->name) == 0 || - (WT_PREFIX_MATCH(name, WT_CHECKPOINT) && - WT_PREFIX_MATCH((ckpt - 2)->name, WT_CHECKPOINT)))) { - F_SET(btree, WT_BTREE_SKIP_CKPT); - return (0); - } - } - - return (0); + WT_BTREE *btree; + WT_CKPT *ckpt; + int deleted; + const char *name; + + btree = S2BT(session); + + /* + * Check for clean objects not requiring a checkpoint. + * + * If we're closing a handle, and the object is clean, we can skip the + * checkpoint, whatever checkpoints we have are sufficient. (We might + * not have any checkpoints if the object was never modified, and that's + * OK: the object creation code doesn't mark the tree modified so we can + * skip newly created trees here.) + * + * If the application repeatedly checkpoints an object (imagine hourly + * checkpoints using the same explicit or internal name), there's no + * reason to repeat the checkpoint for clean objects. The test is if + * the only checkpoint we're deleting is the last one in the list and + * it has the same name as the checkpoint we're about to take, skip the + * work. (We can't skip checkpoints that delete more than the last + * checkpoint because deleting those checkpoints might free up space in + * the file.) This means an application toggling between two (or more) + * checkpoint names will repeatedly take empty checkpoints, but that's + * not likely enough to make detection worthwhile. + * + * Checkpoint read-only objects otherwise: the application must be able + * to open the checkpoint in a cursor after taking any checkpoint, which + * means it must exist. 
+ */ + F_CLR(btree, WT_BTREE_SKIP_CKPT); + if (!btree->modified && !force) { + deleted = 0; + WT_CKPT_FOREACH (ckptbase, ckpt) + if (F_ISSET(ckpt, WT_CKPT_DELETE)) + ++deleted; + + /* + * Complicated test: if the tree is clean and last two checkpoints have the same name + * (correcting for internal checkpoint names with their generational suffix numbers), we can + * skip the checkpoint, there's nothing to do. The exception is if we're deleting two or + * more checkpoints: then we may save space. + */ + name = (ckpt - 1)->name; + if (ckpt > ckptbase + 1 && deleted < 2 && + (strcmp(name, (ckpt - 2)->name) == 0 || + (WT_PREFIX_MATCH(name, WT_CHECKPOINT) && + WT_PREFIX_MATCH((ckpt - 2)->name, WT_CHECKPOINT)))) { + F_SET(btree, WT_BTREE_SKIP_CKPT); + return (0); + } + } + + return (0); } /* * __wt_checkpoint_tree_reconcile_update -- - * Update a checkpoint based on reconciliation results. + * Update a checkpoint based on reconciliation results. */ void -__wt_checkpoint_tree_reconcile_update( - WT_SESSION_IMPL *session, wt_timestamp_t newest_durable_ts, - wt_timestamp_t oldest_start_ts, uint64_t oldest_start_txn, - wt_timestamp_t newest_stop_ts, uint64_t newest_stop_txn) +__wt_checkpoint_tree_reconcile_update(WT_SESSION_IMPL *session, wt_timestamp_t newest_durable_ts, + wt_timestamp_t oldest_start_ts, uint64_t oldest_start_txn, wt_timestamp_t newest_stop_ts, + uint64_t newest_stop_txn) { - WT_BTREE *btree; - WT_CKPT *ckpt, *ckptbase; - - btree = S2BT(session); - - /* - * Reconciliation just wrote a checkpoint, everything has been written. - * Update the checkpoint with reconciliation information. The reason - * for this function is the reconciliation code just passes through the - * btree structure's checkpoint array, it doesn't know any more. 
- */ - ckptbase = btree->ckpt; - WT_CKPT_FOREACH(ckptbase, ckpt) - if (F_ISSET(ckpt, WT_CKPT_ADD)) { - ckpt->write_gen = btree->write_gen; - ckpt->newest_durable_ts = newest_durable_ts; - ckpt->oldest_start_ts = oldest_start_ts; - ckpt->oldest_start_txn = oldest_start_txn; - ckpt->newest_stop_ts = newest_stop_ts; - ckpt->newest_stop_txn = newest_stop_txn; - } + WT_BTREE *btree; + WT_CKPT *ckpt, *ckptbase; + + btree = S2BT(session); + + /* + * Reconciliation just wrote a checkpoint, everything has been written. Update the checkpoint + * with reconciliation information. The reason for this function is the reconciliation code just + * passes through the btree structure's checkpoint array, it doesn't know any more. + */ + ckptbase = btree->ckpt; + WT_CKPT_FOREACH (ckptbase, ckpt) + if (F_ISSET(ckpt, WT_CKPT_ADD)) { + ckpt->write_gen = btree->write_gen; + ckpt->newest_durable_ts = newest_durable_ts; + ckpt->oldest_start_ts = oldest_start_ts; + ckpt->oldest_start_txn = oldest_start_txn; + ckpt->newest_stop_ts = newest_stop_ts; + ckpt->newest_stop_txn = newest_stop_txn; + } } /* * __checkpoint_tree -- - * Checkpoint a single tree. - * Assumes all necessary locks have been acquired by the caller. + * Checkpoint a single tree. Assumes all necessary locks have been acquired by the caller. */ static int -__checkpoint_tree( - WT_SESSION_IMPL *session, bool is_checkpoint, const char *cfg[]) +__checkpoint_tree(WT_SESSION_IMPL *session, bool is_checkpoint, const char *cfg[]) { - WT_BM *bm; - WT_BTREE *btree; - WT_CONNECTION_IMPL *conn; - WT_DATA_HANDLE *dhandle; - WT_DECL_RET; - WT_LSN ckptlsn; - bool fake_ckpt, resolve_bm; - - WT_UNUSED(cfg); - - btree = S2BT(session); - bm = btree->bm; - conn = S2C(session); - dhandle = session->dhandle; - fake_ckpt = resolve_bm = false; - - /* - * Set the checkpoint LSN to the maximum LSN so that if logging is - * disabled, recovery will never roll old changes forward over the - * non-logged changes in this checkpoint. 
If logging is enabled, a - * real checkpoint LSN will be assigned for this checkpoint and - * overwrite this. - */ - WT_MAX_LSN(&ckptlsn); - - /* - * If an object has never been used (in other words, if it could become - * a bulk-loaded file), then we must fake the checkpoint. This is good - * because we don't write physical checkpoint blocks for just-created - * files, but it's not just a good idea. The reason is because deleting - * a physical checkpoint requires writing the file, and fake checkpoints - * can't write the file. If you (1) create a physical checkpoint for an - * empty file which writes blocks, (2) start bulk-loading records into - * the file, (3) during the bulk-load perform another checkpoint with - * the same name; in order to keep from having two checkpoints with the - * same name you would have to use the bulk-load's fake checkpoint to - * delete a physical checkpoint, and that will end in tears. - */ - if (is_checkpoint && btree->original) { - __wt_checkpoint_tree_reconcile_update(session, - WT_TS_NONE, WT_TS_NONE, WT_TXN_NONE, WT_TS_MAX, WT_TXN_MAX); - - fake_ckpt = true; - goto fake; - } - - /* - * Mark the root page dirty to ensure something gets written. (If the - * tree is modified, we must write the root page anyway, this doesn't - * add additional writes to the process. If the tree is not modified, - * we have to dirty the root page to ensure something gets written.) - * This is really about paranoia: if the tree modification value gets - * out of sync with the set of dirty pages (modify is set, but there - * are no dirty pages), we perform a checkpoint without any writes, no - * checkpoint is created, and then things get bad. - * While marking the root page as dirty, we do not want to dirty the - * btree because we are marking the btree as clean just after this call. - * Also, marking the btree dirty at this stage will unnecessarily mark - * the connection as dirty causing checkpoint-skip code to fail. 
- */ - WT_ERR(__wt_page_modify_init(session, btree->root.page)); - __wt_page_only_modify_set(session, btree->root.page); - - /* - * Clear the tree's modified flag; any changes before we clear the flag - * are guaranteed to be part of this checkpoint (unless reconciliation - * skips updates for transactional reasons), and changes subsequent to - * the checkpoint start, which might not be included, will re-set the - * modified flag. The "unless reconciliation skips updates" problem is - * handled in the reconciliation code: if reconciliation skips updates, - * it sets the modified flag itself. - */ - btree->modified = false; - WT_FULL_BARRIER(); - - /* Tell logging that a file checkpoint is starting. */ - if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED)) - WT_ERR(__wt_txn_checkpoint_log( - session, false, WT_TXN_LOG_CKPT_START, &ckptlsn)); - - /* Tell the block manager that a file checkpoint is starting. */ - WT_ERR(bm->checkpoint_start(bm, session)); - resolve_bm = true; - - /* Flush the file from the cache, creating the checkpoint. */ - if (is_checkpoint) - WT_ERR(__wt_sync_file(session, WT_SYNC_CHECKPOINT)); - else - WT_ERR(__wt_evict_file(session, WT_SYNC_CLOSE)); + WT_BM *bm; + WT_BTREE *btree; + WT_CONNECTION_IMPL *conn; + WT_DATA_HANDLE *dhandle; + WT_DECL_RET; + WT_LSN ckptlsn; + bool fake_ckpt, resolve_bm; + + WT_UNUSED(cfg); + + btree = S2BT(session); + bm = btree->bm; + conn = S2C(session); + dhandle = session->dhandle; + fake_ckpt = resolve_bm = false; + + /* + * Set the checkpoint LSN to the maximum LSN so that if logging is disabled, recovery will never + * roll old changes forward over the non-logged changes in this checkpoint. If logging is + * enabled, a real checkpoint LSN will be assigned for this checkpoint and overwrite this. + */ + WT_MAX_LSN(&ckptlsn); + + /* + * If an object has never been used (in other words, if it could become a bulk-loaded file), + * then we must fake the checkpoint. 
This is good because we don't write physical checkpoint + * blocks for just-created files, but it's not just a good idea. The reason is because deleting + * a physical checkpoint requires writing the file, and fake checkpoints can't write the file. + * If you (1) create a physical checkpoint for an empty file which writes blocks, (2) start + * bulk-loading records into the file, (3) during the bulk-load perform another checkpoint with + * the same name; in order to keep from having two checkpoints with the same name you would have + * to use the bulk-load's fake checkpoint to delete a physical checkpoint, and that will end in + * tears. + */ + if (is_checkpoint && btree->original) { + __wt_checkpoint_tree_reconcile_update( + session, WT_TS_NONE, WT_TS_NONE, WT_TXN_NONE, WT_TS_MAX, WT_TXN_MAX); + + fake_ckpt = true; + goto fake; + } + + /* + * Mark the root page dirty to ensure something gets written. (If the tree is modified, we must + * write the root page anyway, this doesn't add additional writes to the process. If the tree is + * not modified, we have to dirty the root page to ensure something gets written.) This is + * really about paranoia: if the tree modification value gets out of sync with the set of dirty + * pages (modify is set, but there are no dirty pages), we perform a checkpoint without any + * writes, no checkpoint is created, and then things get bad. While marking the root page as + * dirty, we do not want to dirty the btree because we are marking the btree as clean just after + * this call. Also, marking the btree dirty at this stage will unnecessarily mark the connection + * as dirty causing checkpoint-skip code to fail. 
+ */ + WT_ERR(__wt_page_modify_init(session, btree->root.page)); + __wt_page_only_modify_set(session, btree->root.page); + + /* + * Clear the tree's modified flag; any changes before we clear the flag are guaranteed to be + * part of this checkpoint (unless reconciliation skips updates for transactional reasons), and + * changes subsequent to the checkpoint start, which might not be included, will re-set the + * modified flag. The "unless reconciliation skips updates" problem is handled in the + * reconciliation code: if reconciliation skips updates, it sets the modified flag itself. + */ + btree->modified = false; + WT_FULL_BARRIER(); + + /* Tell logging that a file checkpoint is starting. */ + if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED)) + WT_ERR(__wt_txn_checkpoint_log(session, false, WT_TXN_LOG_CKPT_START, &ckptlsn)); + + /* Tell the block manager that a file checkpoint is starting. */ + WT_ERR(bm->checkpoint_start(bm, session)); + resolve_bm = true; + + /* Flush the file from the cache, creating the checkpoint. */ + if (is_checkpoint) + WT_ERR(__wt_sync_file(session, WT_SYNC_CHECKPOINT)); + else + WT_ERR(__wt_evict_file(session, WT_SYNC_CLOSE)); fake: - /* - * If we're faking a checkpoint and logging is enabled, recovery should - * roll forward any changes made between now and the next checkpoint, - * so set the checkpoint LSN to the beginning of time. - */ - if (fake_ckpt && FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED)) - WT_INIT_LSN(&ckptlsn); - - /* - * Update the object's metadata. - * - * If the object is the metadata, the call to __wt_meta_ckptlist_set - * will update the turtle file and swap the new one into place. We - * need to make sure the metadata is on disk before the turtle file is - * updated. - * - * If we are doing a checkpoint in a file without a transaction (e.g., - * closing a dirty tree before an exclusive operation like verify), - * the metadata update will be auto-committed. 
In that case, we need to - * sync the file here or we could roll forward the metadata in - * recovery and open a checkpoint that isn't yet durable. - */ - if (WT_IS_METADATA(dhandle) || - !F_ISSET(&session->txn, WT_TXN_RUNNING)) - WT_ERR(__wt_checkpoint_sync(session, NULL)); - - WT_ERR(__wt_meta_ckptlist_set( - session, dhandle->name, btree->ckpt, &ckptlsn)); - - /* - * If we wrote a checkpoint (rather than faking one), we have to resolve - * it. Normally, tracking is enabled and resolution deferred until - * transaction end. The exception is if the handle is being discarded, - * in which case the handle will be gone by the time we try to apply or - * unroll the meta tracking event. - */ - if (!fake_ckpt) { - resolve_bm = false; - if (WT_META_TRACKING(session) && is_checkpoint) - WT_ERR(__wt_meta_track_checkpoint(session)); - else - WT_ERR(bm->checkpoint_resolve(bm, session, false)); - } - - /* Tell logging that the checkpoint is complete. */ - if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED)) - WT_ERR(__wt_txn_checkpoint_log( - session, false, WT_TXN_LOG_CKPT_STOP, NULL)); + /* + * If we're faking a checkpoint and logging is enabled, recovery should roll forward any changes + * made between now and the next checkpoint, so set the checkpoint LSN to the beginning of time. + */ + if (fake_ckpt && FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED)) + WT_INIT_LSN(&ckptlsn); + + /* + * Update the object's metadata. + * + * If the object is the metadata, the call to __wt_meta_ckptlist_set + * will update the turtle file and swap the new one into place. We + * need to make sure the metadata is on disk before the turtle file is + * updated. + * + * If we are doing a checkpoint in a file without a transaction (e.g., + * closing a dirty tree before an exclusive operation like verify), + * the metadata update will be auto-committed. 
In that case, we need to + * sync the file here or we could roll forward the metadata in + * recovery and open a checkpoint that isn't yet durable. + */ + if (WT_IS_METADATA(dhandle) || !F_ISSET(&session->txn, WT_TXN_RUNNING)) + WT_ERR(__wt_checkpoint_sync(session, NULL)); + + WT_ERR(__wt_meta_ckptlist_set(session, dhandle->name, btree->ckpt, &ckptlsn)); + + /* + * If we wrote a checkpoint (rather than faking one), we have to resolve it. Normally, tracking + * is enabled and resolution deferred until transaction end. The exception is if the handle is + * being discarded, in which case the handle will be gone by the time we try to apply or unroll + * the meta tracking event. + */ + if (!fake_ckpt) { + resolve_bm = false; + if (WT_META_TRACKING(session) && is_checkpoint) + WT_ERR(__wt_meta_track_checkpoint(session)); + else + WT_ERR(bm->checkpoint_resolve(bm, session, false)); + } + + /* Tell logging that the checkpoint is complete. */ + if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED)) + WT_ERR(__wt_txn_checkpoint_log(session, false, WT_TXN_LOG_CKPT_STOP, NULL)); err: - /* Resolved the checkpoint for the block manager in the error path. */ - if (resolve_bm) - WT_TRET(bm->checkpoint_resolve(bm, session, ret != 0)); - - /* - * If the checkpoint didn't complete successfully, make sure the - * tree is marked dirty. - */ - if (ret != 0) { - btree->modified = true; - conn->modified = true; - } - - __wt_meta_ckptlist_free(session, &btree->ckpt); - - return (ret); + /* Resolved the checkpoint for the block manager in the error path. */ + if (resolve_bm) + WT_TRET(bm->checkpoint_resolve(bm, session, ret != 0)); + + /* + * If the checkpoint didn't complete successfully, make sure the tree is marked dirty. + */ + if (ret != 0) { + btree->modified = true; + conn->modified = true; + } + + __wt_meta_ckptlist_free(session, &btree->ckpt); + + return (ret); } /* * __checkpoint_presync -- - * Visit all handles after the checkpoint writes are complete and before - * syncing. 
At this point, all trees should be completely open for - * business. + * Visit all handles after the checkpoint writes are complete and before syncing. At this point, + * all trees should be completely open for business. */ static int __checkpoint_presync(WT_SESSION_IMPL *session, const char *cfg[]) { - WT_BTREE *btree; + WT_BTREE *btree; - WT_UNUSED(cfg); + WT_UNUSED(cfg); - btree = S2BT(session); - WT_ASSERT(session, - btree->checkpoint_gen == __wt_gen(session, WT_GEN_CHECKPOINT)); - btree->evict_walk_period = btree->evict_walk_saved; - return (0); + btree = S2BT(session); + WT_ASSERT(session, btree->checkpoint_gen == __wt_gen(session, WT_GEN_CHECKPOINT)); + btree->evict_walk_period = btree->evict_walk_saved; + return (0); } /* * __checkpoint_tree_helper -- - * Checkpoint a tree (suitable for use in *_apply functions). + * Checkpoint a tree (suitable for use in *_apply functions). */ static int __checkpoint_tree_helper(WT_SESSION_IMPL *session, const char *cfg[]) { - WT_BTREE *btree; - WT_DECL_RET; - WT_TXN *txn; - bool with_timestamp; - - btree = S2BT(session); - txn = &session->txn; - - /* Are we using a read timestamp for this checkpoint transaction? */ - with_timestamp = F_ISSET(txn, WT_TXN_HAS_TS_READ); - - /* - * For tables with immediate durability (indicated by having logging - * enabled), ignore any read timestamp configured for the checkpoint. - */ - if (__wt_btree_immediately_durable(session)) - F_CLR(txn, WT_TXN_HAS_TS_READ); - - ret = __checkpoint_tree(session, true, cfg); - - /* Restore the use of the timestamp for other tables. */ - if (with_timestamp) - F_SET(txn, WT_TXN_HAS_TS_READ); - - /* - * Whatever happened, we aren't visiting this tree again in this - * checkpoint. Don't keep updates pinned any longer. - */ - __checkpoint_update_generation(session); - - /* - * In case this tree was being skipped by the eviction server - * during the checkpoint, restore the previous state. 
- */ - btree->evict_walk_period = btree->evict_walk_saved; - - /* - * Wake the eviction server, in case application threads have - * stalled while the eviction server decided it couldn't make - * progress. Without this, application threads will be stalled - * until the eviction server next wakes. - */ - __wt_evict_server_wake(session); - - return (ret); + WT_BTREE *btree; + WT_DECL_RET; + WT_TXN *txn; + bool with_timestamp; + + btree = S2BT(session); + txn = &session->txn; + + /* Are we using a read timestamp for this checkpoint transaction? */ + with_timestamp = F_ISSET(txn, WT_TXN_HAS_TS_READ); + + /* + * For tables with immediate durability (indicated by having logging enabled), ignore any read + * timestamp configured for the checkpoint. + */ + if (__wt_btree_immediately_durable(session)) + F_CLR(txn, WT_TXN_HAS_TS_READ); + + ret = __checkpoint_tree(session, true, cfg); + + /* Restore the use of the timestamp for other tables. */ + if (with_timestamp) + F_SET(txn, WT_TXN_HAS_TS_READ); + + /* + * Whatever happened, we aren't visiting this tree again in this checkpoint. Don't keep updates + * pinned any longer. + */ + __checkpoint_update_generation(session); + + /* + * In case this tree was being skipped by the eviction server during the checkpoint, restore the + * previous state. + */ + btree->evict_walk_period = btree->evict_walk_saved; + + /* + * Wake the eviction server, in case application threads have stalled while the eviction server + * decided it couldn't make progress. Without this, application threads will be stalled until + * the eviction server next wakes. + */ + __wt_evict_server_wake(session); + + return (ret); } /* * __wt_checkpoint -- - * Checkpoint a file. + * Checkpoint a file. */ int __wt_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) { - WT_CONFIG_ITEM cval; - WT_DECL_RET; - bool force; - - /* Should not be called with a checkpoint handle. 
*/ - WT_ASSERT(session, session->dhandle->checkpoint == NULL); - - /* We must hold the metadata lock if checkpointing the metadata. */ - WT_ASSERT(session, !WT_IS_METADATA(session->dhandle) || - F_ISSET(session, WT_SESSION_LOCKED_METADATA)); - - WT_RET(__wt_config_gets_def(session, cfg, "force", 0, &cval)); - force = cval.val != 0; - WT_SAVE_DHANDLE(session, ret = __checkpoint_lock_dirty_tree( - session, true, force, true, cfg)); - WT_RET(ret); - if (F_ISSET(S2BT(session), WT_BTREE_SKIP_CKPT)) - return (0); - return (__checkpoint_tree(session, true, cfg)); + WT_CONFIG_ITEM cval; + WT_DECL_RET; + bool force; + + /* Should not be called with a checkpoint handle. */ + WT_ASSERT(session, session->dhandle->checkpoint == NULL); + + /* We must hold the metadata lock if checkpointing the metadata. */ + WT_ASSERT( + session, !WT_IS_METADATA(session->dhandle) || F_ISSET(session, WT_SESSION_LOCKED_METADATA)); + + WT_RET(__wt_config_gets_def(session, cfg, "force", 0, &cval)); + force = cval.val != 0; + WT_SAVE_DHANDLE(session, ret = __checkpoint_lock_dirty_tree(session, true, force, true, cfg)); + WT_RET(ret); + if (F_ISSET(S2BT(session), WT_BTREE_SKIP_CKPT)) + return (0); + return (__checkpoint_tree(session, true, cfg)); } /* * __wt_checkpoint_sync -- - * Sync a file that has been checkpointed, and wait for the result. + * Sync a file that has been checkpointed, and wait for the result. */ int __wt_checkpoint_sync(WT_SESSION_IMPL *session, const char *cfg[]) { - WT_BM *bm; + WT_BM *bm; - WT_UNUSED(cfg); + WT_UNUSED(cfg); - bm = S2BT(session)->bm; + bm = S2BT(session)->bm; - /* Should not be called with a checkpoint handle. */ - WT_ASSERT(session, session->dhandle->checkpoint == NULL); + /* Should not be called with a checkpoint handle. */ + WT_ASSERT(session, session->dhandle->checkpoint == NULL); - /* Unnecessary if checkpoint_sync has been configured "off". 
*/ - if (!F_ISSET(S2C(session), WT_CONN_CKPT_SYNC)) - return (0); + /* Unnecessary if checkpoint_sync has been configured "off". */ + if (!F_ISSET(S2C(session), WT_CONN_CKPT_SYNC)) + return (0); - return (bm->sync(bm, session, true)); + return (bm->sync(bm, session, true)); } /* * __wt_checkpoint_close -- - * Checkpoint a single file as part of closing the handle. + * Checkpoint a single file as part of closing the handle. */ int __wt_checkpoint_close(WT_SESSION_IMPL *session, bool final) { - WT_BTREE *btree; - WT_DECL_RET; - bool bulk, need_tracking; - - btree = S2BT(session); - bulk = F_ISSET(btree, WT_BTREE_BULK); - - /* - * We've done the final checkpoint before the final close, subsequent - * writes to normal objects are wasted effort. Discard the objects to - * validate exit accounting. - */ - if (final && !WT_IS_METADATA(session->dhandle)) - return (__wt_evict_file(session, WT_SYNC_DISCARD)); - - /* - * If closing an unmodified file, check that no update is required - * for active readers. - */ - if (!btree->modified && !bulk) { - WT_RET(__wt_txn_update_oldest( - session, WT_TXN_OLDEST_STRICT | WT_TXN_OLDEST_WAIT)); - return (__wt_txn_visible_all(session, btree->rec_max_txn, - btree->rec_max_timestamp) ? - __wt_evict_file(session, WT_SYNC_DISCARD) : EBUSY); - } - - /* - * Don't flush data from trees when there is a stable timestamp set: - * that can lead to files that are inconsistent on disk after a crash. - */ - if (btree->modified && !bulk && - S2C(session)->txn_global.has_stable_timestamp && - !__wt_btree_immediately_durable(session)) - return (__wt_set_return(session, EBUSY)); - - /* - * Turn on metadata tracking if: - * - The session is not already doing metadata tracking. - * - The file was not bulk loaded. - * - The close is not during connection close. 
- */ - need_tracking = !WT_META_TRACKING(session) && !bulk && !final; - - if (need_tracking) - WT_RET(__wt_meta_track_on(session)); - - WT_SAVE_DHANDLE(session, ret = __checkpoint_lock_dirty_tree( - session, false, false, need_tracking, NULL)); - WT_ASSERT(session, ret == 0); - if (ret == 0 && !F_ISSET(btree, WT_BTREE_SKIP_CKPT)) - ret = __checkpoint_tree(session, false, NULL); - - if (need_tracking) - WT_TRET(__wt_meta_track_off(session, true, ret != 0)); - - return (ret); + WT_BTREE *btree; + WT_DECL_RET; + bool bulk, need_tracking; + + btree = S2BT(session); + bulk = F_ISSET(btree, WT_BTREE_BULK); + + /* + * We've done the final checkpoint before the final close, subsequent writes to normal objects + * are wasted effort. Discard the objects to validate exit accounting. + */ + if (final && !WT_IS_METADATA(session->dhandle)) + return (__wt_evict_file(session, WT_SYNC_DISCARD)); + + /* + * If closing an unmodified file, check that no update is required for active readers. + */ + if (!btree->modified && !bulk) { + WT_RET(__wt_txn_update_oldest(session, WT_TXN_OLDEST_STRICT | WT_TXN_OLDEST_WAIT)); + return (__wt_txn_visible_all(session, btree->rec_max_txn, btree->rec_max_timestamp) ? + __wt_evict_file(session, WT_SYNC_DISCARD) : + EBUSY); + } + + /* + * Don't flush data from trees when there is a stable timestamp set: that can lead to files that + * are inconsistent on disk after a crash. + */ + if (btree->modified && !bulk && S2C(session)->txn_global.has_stable_timestamp && + !__wt_btree_immediately_durable(session)) + return (__wt_set_return(session, EBUSY)); + + /* + * Turn on metadata tracking if: + * - The session is not already doing metadata tracking. + * - The file was not bulk loaded. + * - The close is not during connection close. 
+ */ + need_tracking = !WT_META_TRACKING(session) && !bulk && !final; + + if (need_tracking) + WT_RET(__wt_meta_track_on(session)); + + WT_SAVE_DHANDLE( + session, ret = __checkpoint_lock_dirty_tree(session, false, false, need_tracking, NULL)); + WT_ASSERT(session, ret == 0); + if (ret == 0 && !F_ISSET(btree, WT_BTREE_SKIP_CKPT)) + ret = __checkpoint_tree(session, false, NULL); + + if (need_tracking) + WT_TRET(__wt_meta_track_off(session, true, ret != 0)); + + return (ret); } /* * __checkpoint_timing_stress -- - * Optionally add a 10 second delay to a checkpoint to simulate a long - * running checkpoint for debug purposes. The reason for this option is - * finding operations that can block while waiting for a checkpoint to - * complete. + * Optionally add a 10 second delay to a checkpoint to simulate a long running checkpoint for + * debug purposes. The reason for this option is finding operations that can block while waiting + * for a checkpoint to complete. */ static void __checkpoint_timing_stress(WT_SESSION_IMPL *session) { - WT_CONNECTION_IMPL *conn; - - conn = S2C(session); - - /* - * We only want to sleep if the flag is set and the checkpoint comes - * from the API, so check if the session used is either of the two - * sessions set aside for internal checkpoints. - */ - if (conn->ckpt_session != session && - conn->meta_ckpt_session != session && - FLD_ISSET(conn->timing_stress_flags, - WT_TIMING_STRESS_CHECKPOINT_SLOW)) - __wt_sleep(10, 0); + WT_CONNECTION_IMPL *conn; + + conn = S2C(session); + + /* + * We only want to sleep if the flag is set and the checkpoint comes from the API, so check if + * the session used is either of the two sessions set aside for internal checkpoints. 
+ */ + if (conn->ckpt_session != session && conn->meta_ckpt_session != session && + FLD_ISSET(conn->timing_stress_flags, WT_TIMING_STRESS_CHECKPOINT_SLOW)) + __wt_sleep(10, 0); } diff --git a/src/third_party/wiredtiger/src/txn/txn_ext.c b/src/third_party/wiredtiger/src/txn/txn_ext.c index 1f42ab5eb43..43d9c380eb5 100644 --- a/src/third_party/wiredtiger/src/txn/txn_ext.c +++ b/src/third_party/wiredtiger/src/txn/txn_ext.c @@ -10,97 +10,90 @@ /* * __wt_ext_transaction_id -- - * Return the session's transaction ID. + * Return the session's transaction ID. */ uint64_t __wt_ext_transaction_id(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session) { - WT_SESSION_IMPL *session; + WT_SESSION_IMPL *session; - (void)wt_api; /* Unused parameters */ - session = (WT_SESSION_IMPL *)wt_session; - /* Ignore failures: the only case is running out of transaction IDs. */ - WT_IGNORE_RET(__wt_txn_id_check(session)); - return (session->txn.id); + (void)wt_api; /* Unused parameters */ + session = (WT_SESSION_IMPL *)wt_session; + /* Ignore failures: the only case is running out of transaction IDs. */ + WT_IGNORE_RET(__wt_txn_id_check(session)); + return (session->txn.id); } /* * __wt_ext_transaction_isolation_level -- - * Return if the current transaction's isolation level. + * Return if the current transaction's isolation level. 
*/ int -__wt_ext_transaction_isolation_level( - WT_EXTENSION_API *wt_api, WT_SESSION *wt_session) +__wt_ext_transaction_isolation_level(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session) { - WT_SESSION_IMPL *session; - WT_TXN *txn; + WT_SESSION_IMPL *session; + WT_TXN *txn; - (void)wt_api; /* Unused parameters */ + (void)wt_api; /* Unused parameters */ - session = (WT_SESSION_IMPL *)wt_session; - txn = &session->txn; + session = (WT_SESSION_IMPL *)wt_session; + txn = &session->txn; - if (txn->isolation == WT_ISO_READ_COMMITTED) - return (WT_TXN_ISO_READ_COMMITTED); - if (txn->isolation == WT_ISO_READ_UNCOMMITTED) - return (WT_TXN_ISO_READ_UNCOMMITTED); - return (WT_TXN_ISO_SNAPSHOT); + if (txn->isolation == WT_ISO_READ_COMMITTED) + return (WT_TXN_ISO_READ_COMMITTED); + if (txn->isolation == WT_ISO_READ_UNCOMMITTED) + return (WT_TXN_ISO_READ_UNCOMMITTED); + return (WT_TXN_ISO_SNAPSHOT); } /* * __wt_ext_transaction_notify -- - * Request notification of transaction resolution. + * Request notification of transaction resolution. */ int -__wt_ext_transaction_notify( - WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, WT_TXN_NOTIFY *notify) +__wt_ext_transaction_notify(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, WT_TXN_NOTIFY *notify) { - WT_SESSION_IMPL *session; - WT_TXN *txn; + WT_SESSION_IMPL *session; + WT_TXN *txn; - (void)wt_api; /* Unused parameters */ + (void)wt_api; /* Unused parameters */ - session = (WT_SESSION_IMPL *)wt_session; - txn = &session->txn; + session = (WT_SESSION_IMPL *)wt_session; + txn = &session->txn; - /* - * XXX - * For now, a single slot for notifications: I'm not bothering with - * more than one because more than one data-source in a transaction - * doesn't work anyway. 
- */ - if (txn->notify == notify) - return (0); - if (txn->notify != NULL) - WT_RET_MSG( - session, WT_ERROR, "transaction notify already scheduled"); + /* + * XXX For now, a single slot for notifications: I'm not bothering with more than one because + * more than one data-source in a transaction doesn't work anyway. + */ + if (txn->notify == notify) + return (0); + if (txn->notify != NULL) + WT_RET_MSG(session, WT_ERROR, "transaction notify already scheduled"); - txn->notify = notify; + txn->notify = notify; - return (0); + return (0); } /* * __wt_ext_transaction_oldest -- - * Return the oldest transaction ID not yet visible to a running - * transaction. + * Return the oldest transaction ID not yet visible to a running transaction. */ uint64_t __wt_ext_transaction_oldest(WT_EXTENSION_API *wt_api) { - return (((WT_CONNECTION_IMPL *)wt_api->conn)->txn_global.oldest_id); + return (((WT_CONNECTION_IMPL *)wt_api->conn)->txn_global.oldest_id); } /* * __wt_ext_transaction_visible -- - * Return if the current transaction can see the given transaction ID. + * Return if the current transaction can see the given transaction ID. */ int __wt_ext_transaction_visible( - WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, uint64_t transaction_id) + WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, uint64_t transaction_id) { - (void)wt_api; /* Unused parameters */ + (void)wt_api; /* Unused parameters */ - return (__wt_txn_visible( - (WT_SESSION_IMPL *)wt_session, transaction_id, WT_TS_NONE)); + return (__wt_txn_visible((WT_SESSION_IMPL *)wt_session, transaction_id, WT_TS_NONE)); } diff --git a/src/third_party/wiredtiger/src/txn/txn_log.c b/src/third_party/wiredtiger/src/txn/txn_log.c index 07a1b1152cb..f74f0d45562 100644 --- a/src/third_party/wiredtiger/src/txn/txn_log.c +++ b/src/third_party/wiredtiger/src/txn/txn_log.c @@ -11,769 +11,724 @@ #ifdef HAVE_DIAGNOSTIC /* * __txn_op_log_row_key_check -- - * Confirm the cursor references the correct key. 
+ * Confirm the cursor references the correct key. */ static void __txn_op_log_row_key_check(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) { - WT_CURSOR *cursor; - WT_ITEM key; - WT_PAGE *page; - WT_ROW *rip; - int cmp; - - cursor = &cbt->iface; - WT_ASSERT(session, F_ISSET(cursor, WT_CURSTD_KEY_SET)); - - memset(&key, 0, sizeof(key)); - - /* - * We used to take the row-store logging key from the page referenced by - * the cursor, then switched to taking it from the cursor itself. Check - * they are the same. - * - * If the cursor references a WT_INSERT item, take the key from there, - * else take the key from the original page. - */ - if (cbt->ins == NULL) { - session = (WT_SESSION_IMPL *)cbt->iface.session; - page = cbt->ref->page; - WT_ASSERT(session, cbt->slot < page->entries); - rip = &page->pg_row[cbt->slot]; - WT_ASSERT(session, - __wt_row_leaf_key(session, page, rip, &key, false) == 0); - } else { - key.data = WT_INSERT_KEY(cbt->ins); - key.size = WT_INSERT_KEY_SIZE(cbt->ins); - } - - WT_ASSERT(session, __wt_compare( - session, cbt->btree->collator, &key, &cursor->key, &cmp) == 0); - WT_ASSERT(session, cmp == 0); - - __wt_buf_free(session, &key); + WT_CURSOR *cursor; + WT_ITEM key; + WT_PAGE *page; + WT_ROW *rip; + int cmp; + + cursor = &cbt->iface; + WT_ASSERT(session, F_ISSET(cursor, WT_CURSTD_KEY_SET)); + + memset(&key, 0, sizeof(key)); + + /* + * We used to take the row-store logging key from the page referenced by + * the cursor, then switched to taking it from the cursor itself. Check + * they are the same. + * + * If the cursor references a WT_INSERT item, take the key from there, + * else take the key from the original page. 
+ */ + if (cbt->ins == NULL) { + session = (WT_SESSION_IMPL *)cbt->iface.session; + page = cbt->ref->page; + WT_ASSERT(session, cbt->slot < page->entries); + rip = &page->pg_row[cbt->slot]; + WT_ASSERT(session, __wt_row_leaf_key(session, page, rip, &key, false) == 0); + } else { + key.data = WT_INSERT_KEY(cbt->ins); + key.size = WT_INSERT_KEY_SIZE(cbt->ins); + } + + WT_ASSERT(session, __wt_compare(session, cbt->btree->collator, &key, &cursor->key, &cmp) == 0); + WT_ASSERT(session, cmp == 0); + + __wt_buf_free(session, &key); } #endif /* * __txn_op_log -- - * Log an operation for the current transaction. + * Log an operation for the current transaction. */ static int -__txn_op_log(WT_SESSION_IMPL *session, WT_ITEM *logrec, - WT_TXN_OP *op, WT_CURSOR_BTREE *cbt, uint32_t fileid) +__txn_op_log( + WT_SESSION_IMPL *session, WT_ITEM *logrec, WT_TXN_OP *op, WT_CURSOR_BTREE *cbt, uint32_t fileid) { - WT_CURSOR *cursor; - WT_ITEM value; - WT_UPDATE *upd; - uint64_t recno; - - cursor = &cbt->iface; - upd = op->u.op_upd; - value.data = upd->data; - value.size = upd->size; - - /* - * Log the row- or column-store insert, modify, remove or update. Our - * caller doesn't log reserve operations, we shouldn't see them here. - */ - if (cbt->btree->type == BTREE_ROW) { + WT_CURSOR *cursor; + WT_ITEM value; + WT_UPDATE *upd; + uint64_t recno; + + cursor = &cbt->iface; + upd = op->u.op_upd; + value.data = upd->data; + value.size = upd->size; + + /* + * Log the row- or column-store insert, modify, remove or update. Our caller doesn't log reserve + * operations, we shouldn't see them here. 
+ */ + if (cbt->btree->type == BTREE_ROW) { #ifdef HAVE_DIAGNOSTIC - __txn_op_log_row_key_check(session, cbt); + __txn_op_log_row_key_check(session, cbt); #endif - switch (upd->type) { - case WT_UPDATE_MODIFY: - WT_RET(__wt_logop_row_modify_pack( - session, logrec, fileid, &cursor->key, &value)); - break; - case WT_UPDATE_STANDARD: - WT_RET(__wt_logop_row_put_pack( - session, logrec, fileid, &cursor->key, &value)); - break; - case WT_UPDATE_TOMBSTONE: - WT_RET(__wt_logop_row_remove_pack( - session, logrec, fileid, &cursor->key)); - break; - default: - return (__wt_illegal_value(session, upd->type)); - } - } else { - recno = WT_INSERT_RECNO(cbt->ins); - WT_ASSERT(session, recno != WT_RECNO_OOB); - - switch (upd->type) { - case WT_UPDATE_MODIFY: - WT_RET(__wt_logop_col_modify_pack( - session, logrec, fileid, recno, &value)); - break; - case WT_UPDATE_STANDARD: - WT_RET(__wt_logop_col_put_pack( - session, logrec, fileid, recno, &value)); - break; - case WT_UPDATE_TOMBSTONE: - WT_RET(__wt_logop_col_remove_pack( - session, logrec, fileid, recno)); - break; - default: - return (__wt_illegal_value(session, upd->type)); - } - } - - return (0); + switch (upd->type) { + case WT_UPDATE_MODIFY: + WT_RET(__wt_logop_row_modify_pack(session, logrec, fileid, &cursor->key, &value)); + break; + case WT_UPDATE_STANDARD: + WT_RET(__wt_logop_row_put_pack(session, logrec, fileid, &cursor->key, &value)); + break; + case WT_UPDATE_TOMBSTONE: + WT_RET(__wt_logop_row_remove_pack(session, logrec, fileid, &cursor->key)); + break; + default: + return (__wt_illegal_value(session, upd->type)); + } + } else { + recno = WT_INSERT_RECNO(cbt->ins); + WT_ASSERT(session, recno != WT_RECNO_OOB); + + switch (upd->type) { + case WT_UPDATE_MODIFY: + WT_RET(__wt_logop_col_modify_pack(session, logrec, fileid, recno, &value)); + break; + case WT_UPDATE_STANDARD: + WT_RET(__wt_logop_col_put_pack(session, logrec, fileid, recno, &value)); + break; + case WT_UPDATE_TOMBSTONE: + 
WT_RET(__wt_logop_col_remove_pack(session, logrec, fileid, recno)); + break; + default: + return (__wt_illegal_value(session, upd->type)); + } + } + + return (0); } /* * __txn_oplist_printlog -- - * Print a list of operations from a log record. + * Print a list of operations from a log record. */ static int -__txn_oplist_printlog(WT_SESSION_IMPL *session, - const uint8_t **pp, const uint8_t *end, WT_TXN_PRINTLOG_ARGS *args) +__txn_oplist_printlog( + WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, WT_TXN_PRINTLOG_ARGS *args) { - bool firstrecord; + bool firstrecord; - firstrecord = true; - WT_RET(__wt_fprintf(session, args->fs, " \"ops\": [\n")); + firstrecord = true; + WT_RET(__wt_fprintf(session, args->fs, " \"ops\": [\n")); - /* The logging subsystem zero-pads records. */ - while (*pp < end && **pp) { - if (!firstrecord) - WT_RET(__wt_fprintf(session, args->fs, ",\n")); - WT_RET(__wt_fprintf(session, args->fs, " {")); + /* The logging subsystem zero-pads records. */ + while (*pp < end && **pp) { + if (!firstrecord) + WT_RET(__wt_fprintf(session, args->fs, ",\n")); + WT_RET(__wt_fprintf(session, args->fs, " {")); - firstrecord = false; + firstrecord = false; - WT_RET(__wt_txn_op_printlog(session, pp, end, args)); - WT_RET(__wt_fprintf(session, args->fs, "\n }")); - } + WT_RET(__wt_txn_op_printlog(session, pp, end, args)); + WT_RET(__wt_fprintf(session, args->fs, "\n }")); + } - WT_RET(__wt_fprintf(session, args->fs, "\n ]\n")); + WT_RET(__wt_fprintf(session, args->fs, "\n ]\n")); - return (0); + return (0); } /* * __wt_txn_op_free -- - * Free memory associated with a transactional operation. + * Free memory associated with a transactional operation. */ void __wt_txn_op_free(WT_SESSION_IMPL *session, WT_TXN_OP *op) { - switch (op->type) { - case WT_TXN_OP_NONE: - /* - * The free function can be called more than once: when there's - * no operation, a free is unnecessary or has already been done. 
- */ - return; - case WT_TXN_OP_BASIC_COL: - case WT_TXN_OP_INMEM_COL: - case WT_TXN_OP_REF_DELETE: - case WT_TXN_OP_TRUNCATE_COL: - break; - - case WT_TXN_OP_BASIC_ROW: - case WT_TXN_OP_INMEM_ROW: - __wt_buf_free(session, &op->u.op_row.key); - break; - - case WT_TXN_OP_TRUNCATE_ROW: - __wt_buf_free(session, &op->u.truncate_row.start); - __wt_buf_free(session, &op->u.truncate_row.stop); - break; - } - - (void)__wt_atomic_subi32(&op->btree->dhandle->session_inuse, 1); - - op->type = WT_TXN_OP_NONE; - op->flags = 0; + switch (op->type) { + case WT_TXN_OP_NONE: + /* + * The free function can be called more than once: when there's no operation, a free is + * unnecessary or has already been done. + */ + return; + case WT_TXN_OP_BASIC_COL: + case WT_TXN_OP_INMEM_COL: + case WT_TXN_OP_REF_DELETE: + case WT_TXN_OP_TRUNCATE_COL: + break; + + case WT_TXN_OP_BASIC_ROW: + case WT_TXN_OP_INMEM_ROW: + __wt_buf_free(session, &op->u.op_row.key); + break; + + case WT_TXN_OP_TRUNCATE_ROW: + __wt_buf_free(session, &op->u.truncate_row.start); + __wt_buf_free(session, &op->u.truncate_row.stop); + break; + } + + (void)__wt_atomic_subi32(&op->btree->dhandle->session_inuse, 1); + + op->type = WT_TXN_OP_NONE; + op->flags = 0; } /* * __txn_logrec_init -- - * Allocate and initialize a buffer for a transaction's log records. + * Allocate and initialize a buffer for a transaction's log records. */ static int __txn_logrec_init(WT_SESSION_IMPL *session) { - WT_DECL_ITEM(logrec); - WT_DECL_RET; - WT_TXN *txn; - size_t header_size; - uint32_t rectype; - const char *fmt; - - txn = &session->txn; - rectype = WT_LOGREC_COMMIT; - fmt = WT_UNCHECKED_STRING(Iq); - - if (txn->logrec != NULL) - return (0); - - /* - * The only way we should ever get in here without a txn id is if we - * are recording diagnostic information. In that case, allocate an id. 
- */ - if (FLD_ISSET(S2C(session)->log_flags, WT_CONN_LOG_DEBUG_MODE) && - txn->id == WT_TXN_NONE) - WT_RET(__wt_txn_id_check(session)); - else - WT_ASSERT(session, txn->id != WT_TXN_NONE); - - WT_RET(__wt_struct_size(session, &header_size, fmt, rectype, txn->id)); - WT_RET(__wt_logrec_alloc(session, header_size, &logrec)); - - WT_ERR(__wt_struct_pack(session, - (uint8_t *)logrec->data + logrec->size, header_size, - fmt, rectype, txn->id)); - logrec->size += (uint32_t)header_size; - txn->logrec = logrec; - - if (0) { -err: __wt_logrec_free(session, &logrec); - } - return (ret); + WT_DECL_ITEM(logrec); + WT_DECL_RET; + WT_TXN *txn; + size_t header_size; + uint32_t rectype; + const char *fmt; + + txn = &session->txn; + rectype = WT_LOGREC_COMMIT; + fmt = WT_UNCHECKED_STRING(Iq); + + if (txn->logrec != NULL) { + WT_ASSERT(session, F_ISSET(txn, WT_TXN_HAS_ID)); + return (0); + } + + /* + * The only way we should ever get in here without a txn id is if we are recording diagnostic + * information. In that case, allocate an id. + */ + if (FLD_ISSET(S2C(session)->log_flags, WT_CONN_LOG_DEBUG_MODE) && txn->id == WT_TXN_NONE) + WT_RET(__wt_txn_id_check(session)); + else + WT_ASSERT(session, txn->id != WT_TXN_NONE); + + WT_RET(__wt_struct_size(session, &header_size, fmt, rectype, txn->id)); + WT_RET(__wt_logrec_alloc(session, header_size, &logrec)); + + WT_ERR(__wt_struct_pack( + session, (uint8_t *)logrec->data + logrec->size, header_size, fmt, rectype, txn->id)); + logrec->size += (uint32_t)header_size; + txn->logrec = logrec; + + if (0) { +err: + __wt_logrec_free(session, &logrec); + } + return (ret); } /* * __wt_txn_log_op -- - * Write the last logged operation into the in-memory buffer. + * Write the last logged operation into the in-memory buffer. 
*/ int __wt_txn_log_op(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) { - WT_CONNECTION_IMPL *conn; - WT_DECL_RET; - WT_ITEM *logrec; - WT_TXN *txn; - WT_TXN_OP *op; - - uint32_t fileid; - - conn = S2C(session); - txn = &session->txn; - - if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED) || - F_ISSET(session, WT_SESSION_NO_LOGGING) || - (F_ISSET(S2BT(session), WT_BTREE_NO_LOGGING) && - !FLD_ISSET(conn->log_flags, WT_CONN_LOG_DEBUG_MODE))) - return (0); - - /* We'd better have a transaction. */ - WT_ASSERT(session, - F_ISSET(txn, WT_TXN_RUNNING) && F_ISSET(txn, WT_TXN_HAS_ID)); - - WT_ASSERT(session, txn->mod_count > 0); - op = txn->mod + txn->mod_count - 1; - fileid = op->btree->id; - - /* - * If this operation is diagnostic only, set the ignore bit on the - * fileid so that recovery can skip it. - */ - if (F_ISSET(S2BT(session), WT_BTREE_NO_LOGGING) && - FLD_ISSET(conn->log_flags, WT_CONN_LOG_DEBUG_MODE)) - FLD_SET(fileid, WT_LOGOP_IGNORE); - - WT_RET(__txn_logrec_init(session)); - logrec = txn->logrec; - - switch (op->type) { - case WT_TXN_OP_NONE: - case WT_TXN_OP_INMEM_COL: - case WT_TXN_OP_INMEM_ROW: - case WT_TXN_OP_REF_DELETE: - /* Nothing to log, we're done. 
*/ - break; - case WT_TXN_OP_BASIC_COL: - case WT_TXN_OP_BASIC_ROW: - ret = __txn_op_log(session, logrec, op, cbt, fileid); - break; - case WT_TXN_OP_TRUNCATE_COL: - ret = __wt_logop_col_truncate_pack(session, logrec, fileid, - op->u.truncate_col.start, op->u.truncate_col.stop); - break; - case WT_TXN_OP_TRUNCATE_ROW: - ret = __wt_logop_row_truncate_pack(session, logrec, fileid, - &op->u.truncate_row.start, &op->u.truncate_row.stop, - (uint32_t)op->u.truncate_row.mode); - break; - } - return (ret); + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + WT_ITEM *logrec; + WT_TXN *txn; + WT_TXN_OP *op; + + uint32_t fileid; + + conn = S2C(session); + txn = &session->txn; + + if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED) || + F_ISSET(session, WT_SESSION_NO_LOGGING) || + (F_ISSET(S2BT(session), WT_BTREE_NO_LOGGING) && + !FLD_ISSET(conn->log_flags, WT_CONN_LOG_DEBUG_MODE))) + return (0); + + /* We'd better have a transaction. */ + WT_ASSERT(session, F_ISSET(txn, WT_TXN_RUNNING) && F_ISSET(txn, WT_TXN_HAS_ID)); + + WT_ASSERT(session, txn->mod_count > 0); + op = txn->mod + txn->mod_count - 1; + fileid = op->btree->id; + + /* + * If this operation is diagnostic only, set the ignore bit on the fileid so that recovery can + * skip it. + */ + if (F_ISSET(S2BT(session), WT_BTREE_NO_LOGGING) && + FLD_ISSET(conn->log_flags, WT_CONN_LOG_DEBUG_MODE)) + FLD_SET(fileid, WT_LOGOP_IGNORE); + + WT_RET(__txn_logrec_init(session)); + logrec = txn->logrec; + + switch (op->type) { + case WT_TXN_OP_NONE: + case WT_TXN_OP_INMEM_COL: + case WT_TXN_OP_INMEM_ROW: + case WT_TXN_OP_REF_DELETE: + /* Nothing to log, we're done. 
*/ + break; + case WT_TXN_OP_BASIC_COL: + case WT_TXN_OP_BASIC_ROW: + ret = __txn_op_log(session, logrec, op, cbt, fileid); + break; + case WT_TXN_OP_TRUNCATE_COL: + ret = __wt_logop_col_truncate_pack( + session, logrec, fileid, op->u.truncate_col.start, op->u.truncate_col.stop); + break; + case WT_TXN_OP_TRUNCATE_ROW: + ret = __wt_logop_row_truncate_pack(session, logrec, fileid, &op->u.truncate_row.start, + &op->u.truncate_row.stop, (uint32_t)op->u.truncate_row.mode); + break; + } + return (ret); } /* * __wt_txn_log_commit -- - * Write the operations of a transaction to the log at commit time. + * Write the operations of a transaction to the log at commit time. */ int __wt_txn_log_commit(WT_SESSION_IMPL *session, const char *cfg[]) { - WT_TXN *txn; - - WT_UNUSED(cfg); - txn = &session->txn; - /* - * If there are no log records there is nothing to do. - */ - if (txn->logrec == NULL) - return (0); - - /* Write updates to the log. */ - return (__wt_log_write(session, txn->logrec, NULL, txn->txn_logsync)); + WT_TXN *txn; + + WT_UNUSED(cfg); + txn = &session->txn; + /* + * If there are no log records there is nothing to do. + */ + if (txn->logrec == NULL) + return (0); + + /* Write updates to the log. */ + return (__wt_log_write(session, txn->logrec, NULL, txn->txn_logsync)); } /* * __txn_log_file_sync -- - * Write a log record for a file sync. + * Write a log record for a file sync. */ static int __txn_log_file_sync(WT_SESSION_IMPL *session, uint32_t flags, WT_LSN *lsnp) { - WT_BTREE *btree; - WT_DECL_ITEM(logrec); - WT_DECL_RET; - size_t header_size; - uint32_t rectype, start; - const char *fmt; - bool need_sync; - - btree = S2BT(session); - rectype = WT_LOGREC_FILE_SYNC; - start = LF_ISSET(WT_TXN_LOG_CKPT_START) ? 
1 : 0; - fmt = WT_UNCHECKED_STRING(III); - need_sync = LF_ISSET(WT_TXN_LOG_CKPT_SYNC); - - WT_RET(__wt_struct_size( - session, &header_size, fmt, rectype, btree->id, start)); - WT_RET(__wt_logrec_alloc(session, header_size, &logrec)); - - WT_ERR(__wt_struct_pack(session, - (uint8_t *)logrec->data + logrec->size, header_size, - fmt, rectype, btree->id, start)); - logrec->size += (uint32_t)header_size; - - WT_ERR(__wt_log_write( - session, logrec, lsnp, need_sync ? WT_LOG_FSYNC : 0)); -err: __wt_logrec_free(session, &logrec); - return (ret); + WT_BTREE *btree; + WT_DECL_ITEM(logrec); + WT_DECL_RET; + size_t header_size; + uint32_t rectype, start; + const char *fmt; + bool need_sync; + + btree = S2BT(session); + rectype = WT_LOGREC_FILE_SYNC; + start = LF_ISSET(WT_TXN_LOG_CKPT_START) ? 1 : 0; + fmt = WT_UNCHECKED_STRING(III); + need_sync = LF_ISSET(WT_TXN_LOG_CKPT_SYNC); + + WT_RET(__wt_struct_size(session, &header_size, fmt, rectype, btree->id, start)); + WT_RET(__wt_logrec_alloc(session, header_size, &logrec)); + + WT_ERR(__wt_struct_pack(session, (uint8_t *)logrec->data + logrec->size, header_size, fmt, + rectype, btree->id, start)); + logrec->size += (uint32_t)header_size; + + WT_ERR(__wt_log_write(session, logrec, lsnp, need_sync ? WT_LOG_FSYNC : 0)); +err: + __wt_logrec_free(session, &logrec); + return (ret); } /* * __wt_txn_checkpoint_logread -- - * Read a log record for a checkpoint operation. + * Read a log record for a checkpoint operation. 
*/ int -__wt_txn_checkpoint_logread(WT_SESSION_IMPL *session, - const uint8_t **pp, const uint8_t *end, WT_LSN *ckpt_lsn) +__wt_txn_checkpoint_logread( + WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, WT_LSN *ckpt_lsn) { - WT_DECL_RET; - WT_ITEM ckpt_snapshot_unused; - uint32_t ckpt_file, ckpt_offset; - u_int ckpt_nsnapshot_unused; - const char *fmt; - - fmt = WT_UNCHECKED_STRING(IIIu); - - if ((ret = __wt_struct_unpack(session, *pp, WT_PTRDIFF(end, *pp), fmt, - &ckpt_file, &ckpt_offset, - &ckpt_nsnapshot_unused, &ckpt_snapshot_unused)) != 0) - WT_RET_MSG(session, - ret, "txn_checkpoint_logread: unpack failure"); - WT_SET_LSN(ckpt_lsn, ckpt_file, ckpt_offset); - *pp = end; - return (0); + WT_DECL_RET; + WT_ITEM ckpt_snapshot_unused; + uint32_t ckpt_file, ckpt_offset; + u_int ckpt_nsnapshot_unused; + const char *fmt; + + fmt = WT_UNCHECKED_STRING(IIIu); + + if ((ret = __wt_struct_unpack(session, *pp, WT_PTRDIFF(end, *pp), fmt, &ckpt_file, &ckpt_offset, + &ckpt_nsnapshot_unused, &ckpt_snapshot_unused)) != 0) + WT_RET_MSG(session, ret, "txn_checkpoint_logread: unpack failure"); + WT_SET_LSN(ckpt_lsn, ckpt_file, ckpt_offset); + *pp = end; + return (0); } /* * __wt_txn_ts_log -- - * Write a log record recording timestamps in the transaction. + * Write a log record recording timestamps in the transaction. */ int __wt_txn_ts_log(WT_SESSION_IMPL *session) { - struct timespec t; - WT_CONNECTION_IMPL *conn; - WT_ITEM *logrec; - WT_TXN *txn; - wt_timestamp_t commit, durable, first, prepare, read; - - conn = S2C(session); - txn = &session->txn; - - if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED) || - F_ISSET(session, WT_SESSION_NO_LOGGING) || - !FLD_ISSET(conn->log_flags, WT_CONN_LOG_DEBUG_MODE)) - return (0); - - /* We'd better have a transaction running. 
*/ - WT_ASSERT(session, F_ISSET(txn, WT_TXN_RUNNING)); - - WT_RET(__txn_logrec_init(session)); - logrec = txn->logrec; - commit = durable = first = prepare = read = WT_TS_NONE; - if (F_ISSET(txn, WT_TXN_HAS_TS_COMMIT)) { - commit = txn->commit_timestamp; - first = txn->first_commit_timestamp; - } - if (F_ISSET(txn, WT_TXN_HAS_TS_DURABLE)) - durable = txn->durable_timestamp; - if (F_ISSET(txn, WT_TXN_HAS_TS_PREPARE)) - prepare = txn->prepare_timestamp; - if (F_ISSET(txn, WT_TXN_HAS_TS_READ)) - read = txn->read_timestamp; - - __wt_epoch(session, &t); - return (__wt_logop_txn_timestamp_pack(session, logrec, - (uint64_t)t.tv_sec, (uint64_t)t.tv_nsec, - commit, durable, first, prepare, read)); + struct timespec t; + WT_CONNECTION_IMPL *conn; + WT_ITEM *logrec; + WT_TXN *txn; + wt_timestamp_t commit, durable, first, prepare, read; + + conn = S2C(session); + txn = &session->txn; + + if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED) || + F_ISSET(session, WT_SESSION_NO_LOGGING) || + !FLD_ISSET(conn->log_flags, WT_CONN_LOG_DEBUG_MODE)) + return (0); + + /* + * There is a rare usage case of a prepared transaction that has no modifications, but then + * commits and sets timestamps. If an empty transaction has been prepared, don't bother writing + * a timestamp operation record. + */ + if (F_ISSET(txn, WT_TXN_PREPARE) && txn->mod_count == 0) + return (0); + + /* We'd better have a transaction running. 
*/ + WT_ASSERT(session, F_ISSET(txn, WT_TXN_RUNNING)); + + WT_RET(__txn_logrec_init(session)); + logrec = txn->logrec; + commit = durable = first = prepare = read = WT_TS_NONE; + if (F_ISSET(txn, WT_TXN_HAS_TS_COMMIT)) { + commit = txn->commit_timestamp; + first = txn->first_commit_timestamp; + } + if (F_ISSET(txn, WT_TXN_HAS_TS_DURABLE)) + durable = txn->durable_timestamp; + if (F_ISSET(txn, WT_TXN_HAS_TS_PREPARE)) + prepare = txn->prepare_timestamp; + if (F_ISSET(txn, WT_TXN_HAS_TS_READ)) + read = txn->read_timestamp; + + __wt_epoch(session, &t); + return (__wt_logop_txn_timestamp_pack(session, logrec, (uint64_t)t.tv_sec, (uint64_t)t.tv_nsec, + commit, durable, first, prepare, read)); } /* * __wt_txn_checkpoint_log -- - * Write a log record for a checkpoint operation. + * Write a log record for a checkpoint operation. */ int -__wt_txn_checkpoint_log( - WT_SESSION_IMPL *session, bool full, uint32_t flags, WT_LSN *lsnp) +__wt_txn_checkpoint_log(WT_SESSION_IMPL *session, bool full, uint32_t flags, WT_LSN *lsnp) { - WT_CONNECTION_IMPL *conn; - WT_DECL_ITEM(logrec); - WT_DECL_RET; - WT_ITEM *ckpt_snapshot, empty; - WT_LSN *ckpt_lsn; - WT_TXN *txn; - WT_TXN_GLOBAL *txn_global; - size_t recsize; - uint32_t i, rectype; - uint8_t *end, *p; - const char *fmt; - - conn = S2C(session); - txn_global = &conn->txn_global; - txn = &session->txn; - ckpt_lsn = &txn->ckpt_lsn; - - /* - * If this is a file sync, log it unless there is a full checkpoint in - * progress. - */ - if (!full) { - if (txn->full_ckpt) { - if (lsnp != NULL) - *lsnp = *ckpt_lsn; - return (0); - } - return (__txn_log_file_sync(session, flags, lsnp)); - } - - switch (flags) { - case WT_TXN_LOG_CKPT_PREPARE: - txn->full_ckpt = true; - - if (conn->compat_major >= WT_LOG_V2_MAJOR) { - /* - * Write the system log record containing a checkpoint - * start operation. 
- */ - rectype = WT_LOGREC_SYSTEM; - fmt = WT_UNCHECKED_STRING(I); - WT_ERR(__wt_struct_size( - session, &recsize, fmt, rectype)); - WT_ERR(__wt_logrec_alloc(session, recsize, &logrec)); - - WT_ERR(__wt_struct_pack(session, - (uint8_t *)logrec->data + logrec->size, recsize, - fmt, rectype)); - logrec->size += (uint32_t)recsize; - WT_ERR(__wt_logop_checkpoint_start_pack( - session, logrec)); - WT_ERR(__wt_log_write(session, logrec, ckpt_lsn, 0)); - } else { - WT_ERR(__wt_log_printf(session, - "CHECKPOINT: Starting record")); - WT_ERR(__wt_log_flush_lsn(session, ckpt_lsn, true)); - } - - /* - * We take and immediately release the visibility lock. - * Acquiring the write lock guarantees that any transaction - * that has written to the log has also made its transaction - * visible at this time. - */ - __wt_writelock(session, &txn_global->visibility_rwlock); - __wt_writeunlock(session, &txn_global->visibility_rwlock); - - /* - * We need to make sure that the log records in the checkpoint - * LSN are on disk. In particular to make sure that the - * current log file exists. - */ - WT_ERR(__wt_log_force_sync(session, ckpt_lsn)); - break; - case WT_TXN_LOG_CKPT_START: - /* Take a copy of the transaction snapshot. */ - txn->ckpt_nsnapshot = txn->snapshot_count; - recsize = (size_t)txn->ckpt_nsnapshot * WT_INTPACK64_MAXSIZE; - WT_ERR(__wt_scr_alloc(session, recsize, &txn->ckpt_snapshot)); - p = txn->ckpt_snapshot->mem; - end = p + recsize; - for (i = 0; i < txn->snapshot_count; i++) - WT_ERR(__wt_vpack_uint( - &p, WT_PTRDIFF(end, p), txn->snapshot[i])); - break; - case WT_TXN_LOG_CKPT_STOP: - /* - * During a clean connection close, we get here without the - * prepare or start steps. In that case, log the current LSN - * as the checkpoint LSN. - */ - if (!txn->full_ckpt) { - txn->ckpt_nsnapshot = 0; - WT_CLEAR(empty); - ckpt_snapshot = &empty; - WT_ERR(__wt_log_flush_lsn(session, ckpt_lsn, true)); - } else - ckpt_snapshot = txn->ckpt_snapshot; - - /* Write the checkpoint log record.
*/ - rectype = WT_LOGREC_CHECKPOINT; - fmt = WT_UNCHECKED_STRING(IIIIu); - WT_ERR(__wt_struct_size(session, &recsize, - fmt, rectype, ckpt_lsn->l.file, ckpt_lsn->l.offset, - txn->ckpt_nsnapshot, ckpt_snapshot)); - WT_ERR(__wt_logrec_alloc(session, recsize, &logrec)); - - WT_ERR(__wt_struct_pack(session, - (uint8_t *)logrec->data + logrec->size, recsize, - fmt, rectype, ckpt_lsn->l.file, ckpt_lsn->l.offset, - txn->ckpt_nsnapshot, ckpt_snapshot)); - logrec->size += (uint32_t)recsize; - WT_ERR(__wt_log_write(session, logrec, lsnp, - F_ISSET(conn, WT_CONN_CKPT_SYNC) ? - WT_LOG_FSYNC : 0)); - - /* - * If this full checkpoint completed successfully and there is - * no hot backup in progress and this is not an unclean - * recovery, tell the logging subsystem the checkpoint LSN so - * that it can archive. Do not update the logging checkpoint - * LSN if this is during a clean connection close, only during - * a full checkpoint. A clean close may not update any - * metadata LSN and we do not want to archive in that case. 
- */ - if (!conn->hot_backup && - (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_RECOVER_DIRTY) || - FLD_ISSET(conn->log_flags, WT_CONN_LOG_FORCE_DOWNGRADE)) && - txn->full_ckpt) - __wt_log_ckpt(session, ckpt_lsn); - - /* FALLTHROUGH */ - case WT_TXN_LOG_CKPT_CLEANUP: - /* Cleanup any allocated resources */ - WT_INIT_LSN(ckpt_lsn); - txn->ckpt_nsnapshot = 0; - __wt_scr_free(session, &txn->ckpt_snapshot); - txn->full_ckpt = false; - break; - default: - WT_ERR(__wt_illegal_value(session, flags)); - } - -err: __wt_logrec_free(session, &logrec); - return (ret); + WT_CONNECTION_IMPL *conn; + WT_DECL_ITEM(logrec); + WT_DECL_RET; + WT_ITEM *ckpt_snapshot, empty; + WT_LSN *ckpt_lsn; + WT_TXN *txn; + WT_TXN_GLOBAL *txn_global; + size_t recsize; + uint32_t i, rectype; + uint8_t *end, *p; + const char *fmt; + + conn = S2C(session); + txn_global = &conn->txn_global; + txn = &session->txn; + ckpt_lsn = &txn->ckpt_lsn; + + /* + * If this is a file sync, log it unless there is a full checkpoint in progress. + */ + if (!full) { + if (txn->full_ckpt) { + if (lsnp != NULL) + *lsnp = *ckpt_lsn; + return (0); + } + return (__txn_log_file_sync(session, flags, lsnp)); + } + + switch (flags) { + case WT_TXN_LOG_CKPT_PREPARE: + txn->full_ckpt = true; + + if (conn->compat_major >= WT_LOG_V2_MAJOR) { + /* + * Write the system log record containing a checkpoint start operation. 
+ */ + rectype = WT_LOGREC_SYSTEM; + fmt = WT_UNCHECKED_STRING(I); + WT_ERR(__wt_struct_size(session, &recsize, fmt, rectype)); + WT_ERR(__wt_logrec_alloc(session, recsize, &logrec)); + + WT_ERR(__wt_struct_pack( + session, (uint8_t *)logrec->data + logrec->size, recsize, fmt, rectype)); + logrec->size += (uint32_t)recsize; + WT_ERR(__wt_logop_checkpoint_start_pack(session, logrec)); + WT_ERR(__wt_log_write(session, logrec, ckpt_lsn, 0)); + } else { + WT_ERR(__wt_log_printf(session, "CHECKPOINT: Starting record")); + WT_ERR(__wt_log_flush_lsn(session, ckpt_lsn, true)); + } + + /* + * We take and immediately release the visibility lock. Acquiring the write lock guarantees + * that any transaction that has written to the log has also made its transaction visible at + * this time. + */ + __wt_writelock(session, &txn_global->visibility_rwlock); + __wt_writeunlock(session, &txn_global->visibility_rwlock); + + /* + * We need to make sure that the log records in the checkpoint LSN are on disk. In + * particular to make sure that the current log file exists. + */ + WT_ERR(__wt_log_force_sync(session, ckpt_lsn)); + break; + case WT_TXN_LOG_CKPT_START: + /* Take a copy of the transaction snapshot. */ + txn->ckpt_nsnapshot = txn->snapshot_count; + recsize = (size_t)txn->ckpt_nsnapshot * WT_INTPACK64_MAXSIZE; + WT_ERR(__wt_scr_alloc(session, recsize, &txn->ckpt_snapshot)); + p = txn->ckpt_snapshot->mem; + end = p + recsize; + for (i = 0; i < txn->snapshot_count; i++) + WT_ERR(__wt_vpack_uint(&p, WT_PTRDIFF(end, p), txn->snapshot[i])); + break; + case WT_TXN_LOG_CKPT_STOP: + /* + * During a clean connection close, we get here without the prepare or start steps. In that + * case, log the current LSN as the checkpoint LSN. + */ + if (!txn->full_ckpt) { + txn->ckpt_nsnapshot = 0; + WT_CLEAR(empty); + ckpt_snapshot = &empty; + WT_ERR(__wt_log_flush_lsn(session, ckpt_lsn, true)); + } else + ckpt_snapshot = txn->ckpt_snapshot; + + /* Write the checkpoint log record.
*/ + rectype = WT_LOGREC_CHECKPOINT; + fmt = WT_UNCHECKED_STRING(IIIIu); + WT_ERR(__wt_struct_size(session, &recsize, fmt, rectype, ckpt_lsn->l.file, + ckpt_lsn->l.offset, txn->ckpt_nsnapshot, ckpt_snapshot)); + WT_ERR(__wt_logrec_alloc(session, recsize, &logrec)); + + WT_ERR(__wt_struct_pack(session, (uint8_t *)logrec->data + logrec->size, recsize, fmt, + rectype, ckpt_lsn->l.file, ckpt_lsn->l.offset, txn->ckpt_nsnapshot, ckpt_snapshot)); + logrec->size += (uint32_t)recsize; + WT_ERR(__wt_log_write( + session, logrec, lsnp, F_ISSET(conn, WT_CONN_CKPT_SYNC) ? WT_LOG_FSYNC : 0)); + + /* + * If this full checkpoint completed successfully and there is no hot backup in progress and + * this is not an unclean recovery, tell the logging subsystem the checkpoint LSN so that it + * can archive. Do not update the logging checkpoint LSN if this is during a clean + * connection close, only during a full checkpoint. A clean close may not update any + * metadata LSN and we do not want to archive in that case. + */ + if (!conn->hot_backup && (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_RECOVER_DIRTY) || + FLD_ISSET(conn->log_flags, WT_CONN_LOG_FORCE_DOWNGRADE)) && + txn->full_ckpt) + __wt_log_ckpt(session, ckpt_lsn); + + /* FALLTHROUGH */ + case WT_TXN_LOG_CKPT_CLEANUP: + /* Cleanup any allocated resources */ + WT_INIT_LSN(ckpt_lsn); + txn->ckpt_nsnapshot = 0; + __wt_scr_free(session, &txn->ckpt_snapshot); + txn->full_ckpt = false; + break; + default: + WT_ERR(__wt_illegal_value(session, flags)); + } + +err: + __wt_logrec_free(session, &logrec); + return (ret); } /* * __wt_txn_truncate_log -- - * Begin truncating a range of a file. + * Begin truncating a range of a file. 
*/ int -__wt_txn_truncate_log( - WT_SESSION_IMPL *session, WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop) +__wt_txn_truncate_log(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop) { - WT_BTREE *btree; - WT_ITEM *item; - WT_TXN_OP *op; - - btree = S2BT(session); - - WT_RET(__txn_next_op(session, &op)); - - if (btree->type == BTREE_ROW) { - op->type = WT_TXN_OP_TRUNCATE_ROW; - op->u.truncate_row.mode = WT_TXN_TRUNC_ALL; - WT_CLEAR(op->u.truncate_row.start); - WT_CLEAR(op->u.truncate_row.stop); - if (start != NULL) { - op->u.truncate_row.mode = WT_TXN_TRUNC_START; - item = &op->u.truncate_row.start; - WT_RET(__wt_cursor_get_raw_key(&start->iface, item)); - WT_RET(__wt_buf_set( - session, item, item->data, item->size)); - } - if (stop != NULL) { - op->u.truncate_row.mode = - (op->u.truncate_row.mode == WT_TXN_TRUNC_ALL) ? - WT_TXN_TRUNC_STOP : WT_TXN_TRUNC_BOTH; - item = &op->u.truncate_row.stop; - WT_RET(__wt_cursor_get_raw_key(&stop->iface, item)); - WT_RET(__wt_buf_set( - session, item, item->data, item->size)); - } - } else { - op->type = WT_TXN_OP_TRUNCATE_COL; - op->u.truncate_col.start = - (start == NULL) ? WT_RECNO_OOB : start->recno; - op->u.truncate_col.stop = - (stop == NULL) ? WT_RECNO_OOB : stop->recno; - } - - /* Write that operation into the in-memory log. 
*/ - WT_RET(__wt_txn_log_op(session, NULL)); - - WT_ASSERT(session, !F_ISSET(session, WT_SESSION_LOGGING_INMEM)); - F_SET(session, WT_SESSION_LOGGING_INMEM); - return (0); + WT_BTREE *btree; + WT_ITEM *item; + WT_TXN_OP *op; + + btree = S2BT(session); + + WT_RET(__txn_next_op(session, &op)); + + if (btree->type == BTREE_ROW) { + op->type = WT_TXN_OP_TRUNCATE_ROW; + op->u.truncate_row.mode = WT_TXN_TRUNC_ALL; + WT_CLEAR(op->u.truncate_row.start); + WT_CLEAR(op->u.truncate_row.stop); + if (start != NULL) { + op->u.truncate_row.mode = WT_TXN_TRUNC_START; + item = &op->u.truncate_row.start; + WT_RET(__wt_cursor_get_raw_key(&start->iface, item)); + WT_RET(__wt_buf_set(session, item, item->data, item->size)); + } + if (stop != NULL) { + op->u.truncate_row.mode = + (op->u.truncate_row.mode == WT_TXN_TRUNC_ALL) ? WT_TXN_TRUNC_STOP : WT_TXN_TRUNC_BOTH; + item = &op->u.truncate_row.stop; + WT_RET(__wt_cursor_get_raw_key(&stop->iface, item)); + WT_RET(__wt_buf_set(session, item, item->data, item->size)); + } + } else { + op->type = WT_TXN_OP_TRUNCATE_COL; + op->u.truncate_col.start = (start == NULL) ? WT_RECNO_OOB : start->recno; + op->u.truncate_col.stop = (stop == NULL) ? WT_RECNO_OOB : stop->recno; + } + + /* Write that operation into the in-memory log. */ + WT_RET(__wt_txn_log_op(session, NULL)); + + WT_ASSERT(session, !F_ISSET(session, WT_SESSION_LOGGING_INMEM)); + F_SET(session, WT_SESSION_LOGGING_INMEM); + return (0); } /* * __wt_txn_truncate_end -- - * Finish truncating a range of a file. + * Finish truncating a range of a file. */ void __wt_txn_truncate_end(WT_SESSION_IMPL *session) { - F_CLR(session, WT_SESSION_LOGGING_INMEM); + F_CLR(session, WT_SESSION_LOGGING_INMEM); } /* * __txn_printlog -- - * Print a log record in a human-readable format. + * Print a log record in a human-readable format. 
*/ static int -__txn_printlog(WT_SESSION_IMPL *session, - WT_ITEM *rawrec, WT_LSN *lsnp, WT_LSN *next_lsnp, - void *cookie, int firstrecord) +__txn_printlog(WT_SESSION_IMPL *session, WT_ITEM *rawrec, WT_LSN *lsnp, WT_LSN *next_lsnp, + void *cookie, int firstrecord) { - WT_LOG_RECORD *logrec; - WT_TXN_PRINTLOG_ARGS *args; - uint64_t txnid; - uint32_t fileid, lsnfile, lsnoffset, rectype; - int32_t start; - const uint8_t *end, *p; - const char *msg; - bool compressed; - - WT_UNUSED(next_lsnp); - args = cookie; - - p = WT_LOG_SKIP_HEADER(rawrec->data); - end = (const uint8_t *)rawrec->data + rawrec->size; - logrec = (WT_LOG_RECORD *)rawrec->data; - compressed = F_ISSET(logrec, WT_LOG_RECORD_COMPRESSED); - - /* First, peek at the log record type. */ - WT_RET(__wt_logrec_read(session, &p, end, &rectype)); - - if (!firstrecord) - WT_RET(__wt_fprintf(session, args->fs, ",\n")); - - WT_RET(__wt_fprintf(session, args->fs, - " { \"lsn\" : [%" PRIu32 ",%" PRIu32 "],\n", - lsnp->l.file, lsnp->l.offset)); - WT_RET(__wt_fprintf(session, args->fs, - " \"hdr_flags\" : \"%s\",\n", compressed ? "compressed" : "")); - WT_RET(__wt_fprintf(session, args->fs, - " \"rec_len\" : %" PRIu32 ",\n", logrec->len)); - WT_RET(__wt_fprintf(session, args->fs, - " \"mem_len\" : %" PRIu32 ",\n", - compressed ? 
logrec->mem_len : logrec->len)); - - switch (rectype) { - case WT_LOGREC_CHECKPOINT: - WT_RET(__wt_struct_unpack(session, p, WT_PTRDIFF(end, p), - WT_UNCHECKED_STRING(II), &lsnfile, &lsnoffset)); - WT_RET(__wt_fprintf(session, args->fs, - " \"type\" : \"checkpoint\",\n")); - WT_RET(__wt_fprintf(session, args->fs, - " \"ckpt_lsn\" : [%" PRIu32 ",%" PRIu32 "]\n", - lsnfile, lsnoffset)); - break; - - case WT_LOGREC_COMMIT: - WT_RET(__wt_vunpack_uint(&p, WT_PTRDIFF(end, p), &txnid)); - WT_RET(__wt_fprintf(session, args->fs, - " \"type\" : \"commit\",\n")); - WT_RET(__wt_fprintf(session, args->fs, - " \"txnid\" : %" PRIu64 ",\n", txnid)); - WT_RET(__txn_oplist_printlog(session, &p, end, args)); - break; - - case WT_LOGREC_FILE_SYNC: - WT_RET(__wt_struct_unpack(session, p, WT_PTRDIFF(end, p), - WT_UNCHECKED_STRING(Ii), &fileid, &start)); - WT_RET(__wt_fprintf(session, args->fs, - " \"type\" : \"file_sync\",\n")); - WT_RET(__wt_fprintf(session, args->fs, - " \"fileid\" : %" PRIu32 ",\n", fileid)); - WT_RET(__wt_fprintf(session, args->fs, - " \"start\" : %" PRId32 "\n", start)); - break; - - case WT_LOGREC_MESSAGE: - WT_RET(__wt_struct_unpack(session, p, WT_PTRDIFF(end, p), - WT_UNCHECKED_STRING(S), &msg)); - WT_RET(__wt_fprintf(session, args->fs, - " \"type\" : \"message\",\n")); - WT_RET(__wt_fprintf(session, args->fs, - " \"message\" : \"%s\"\n", msg)); - break; - - case WT_LOGREC_SYSTEM: - WT_RET(__wt_struct_unpack(session, p, WT_PTRDIFF(end, p), - WT_UNCHECKED_STRING(II), &lsnfile, &lsnoffset)); - WT_RET(__wt_fprintf(session, args->fs, - " \"type\" : \"system\",\n")); - WT_RET(__txn_oplist_printlog(session, &p, end, args)); - break; - } - - WT_RET(__wt_fprintf(session, args->fs, " }")); - - return (0); + WT_LOG_RECORD *logrec; + WT_TXN_PRINTLOG_ARGS *args; + uint64_t txnid; + uint32_t fileid, lsnfile, lsnoffset, rectype; + int32_t start; + const uint8_t *end, *p; + const char *msg; + bool compressed; + + WT_UNUSED(next_lsnp); + args = cookie; + + p = 
WT_LOG_SKIP_HEADER(rawrec->data); + end = (const uint8_t *)rawrec->data + rawrec->size; + logrec = (WT_LOG_RECORD *)rawrec->data; + compressed = F_ISSET(logrec, WT_LOG_RECORD_COMPRESSED); + + /* First, peek at the log record type. */ + WT_RET(__wt_logrec_read(session, &p, end, &rectype)); + + if (!firstrecord) + WT_RET(__wt_fprintf(session, args->fs, ",\n")); + + WT_RET(__wt_fprintf(session, args->fs, " { \"lsn\" : [%" PRIu32 ",%" PRIu32 "],\n", + lsnp->l.file, lsnp->l.offset)); + WT_RET(__wt_fprintf( + session, args->fs, " \"hdr_flags\" : \"%s\",\n", compressed ? "compressed" : "")); + WT_RET(__wt_fprintf(session, args->fs, " \"rec_len\" : %" PRIu32 ",\n", logrec->len)); + WT_RET(__wt_fprintf(session, args->fs, " \"mem_len\" : %" PRIu32 ",\n", + compressed ? logrec->mem_len : logrec->len)); + + switch (rectype) { + case WT_LOGREC_CHECKPOINT: + WT_RET(__wt_struct_unpack( + session, p, WT_PTRDIFF(end, p), WT_UNCHECKED_STRING(II), &lsnfile, &lsnoffset)); + WT_RET(__wt_fprintf(session, args->fs, " \"type\" : \"checkpoint\",\n")); + WT_RET(__wt_fprintf( + session, args->fs, " \"ckpt_lsn\" : [%" PRIu32 ",%" PRIu32 "]\n", lsnfile, lsnoffset)); + break; + + case WT_LOGREC_COMMIT: + WT_RET(__wt_vunpack_uint(&p, WT_PTRDIFF(end, p), &txnid)); + WT_RET(__wt_fprintf(session, args->fs, " \"type\" : \"commit\",\n")); + WT_RET(__wt_fprintf(session, args->fs, " \"txnid\" : %" PRIu64 ",\n", txnid)); + WT_RET(__txn_oplist_printlog(session, &p, end, args)); + break; + + case WT_LOGREC_FILE_SYNC: + WT_RET(__wt_struct_unpack( + session, p, WT_PTRDIFF(end, p), WT_UNCHECKED_STRING(Ii), &fileid, &start)); + WT_RET(__wt_fprintf(session, args->fs, " \"type\" : \"file_sync\",\n")); + WT_RET(__wt_fprintf(session, args->fs, " \"fileid\" : %" PRIu32 ",\n", fileid)); + WT_RET(__wt_fprintf(session, args->fs, " \"start\" : %" PRId32 "\n", start)); + break; + + case WT_LOGREC_MESSAGE: + WT_RET(__wt_struct_unpack(session, p, WT_PTRDIFF(end, p), WT_UNCHECKED_STRING(S), &msg)); + 
WT_RET(__wt_fprintf(session, args->fs, " \"type\" : \"message\",\n")); + WT_RET(__wt_fprintf(session, args->fs, " \"message\" : \"%s\"\n", msg)); + break; + + case WT_LOGREC_SYSTEM: + WT_RET(__wt_struct_unpack( + session, p, WT_PTRDIFF(end, p), WT_UNCHECKED_STRING(II), &lsnfile, &lsnoffset)); + WT_RET(__wt_fprintf(session, args->fs, " \"type\" : \"system\",\n")); + WT_RET(__txn_oplist_printlog(session, &p, end, args)); + break; + } + + WT_RET(__wt_fprintf(session, args->fs, " }")); + + return (0); } /* * __wt_txn_printlog -- - * Print the log in a human-readable format. + * Print the log in a human-readable format. */ int __wt_txn_printlog(WT_SESSION *wt_session, const char *ofile, uint32_t flags) - WT_GCC_FUNC_ATTRIBUTE((visibility("default"))) + WT_GCC_FUNC_ATTRIBUTE((visibility("default"))) { - WT_DECL_RET; - WT_FSTREAM *fs; - WT_SESSION_IMPL *session; - WT_TXN_PRINTLOG_ARGS args; - - session = (WT_SESSION_IMPL *)wt_session; - if (ofile == NULL) - fs = WT_STDOUT(session); - else - WT_RET(__wt_fopen(session, ofile, - WT_FS_OPEN_CREATE | WT_FS_OPEN_FIXED, - WT_STREAM_WRITE, &fs)); - - WT_ERR(__wt_fprintf(session, fs, "[\n")); - args.fs = fs; - args.flags = flags; - WT_ERR(__wt_log_scan( - session, NULL, WT_LOGSCAN_FIRST, __txn_printlog, &args)); - ret = __wt_fprintf(session, fs, "\n]\n"); - -err: if (ofile != NULL) - WT_TRET(__wt_fclose(session, &fs)); - - return (ret); + WT_DECL_RET; + WT_FSTREAM *fs; + WT_SESSION_IMPL *session; + WT_TXN_PRINTLOG_ARGS args; + + session = (WT_SESSION_IMPL *)wt_session; + if (ofile == NULL) + fs = WT_STDOUT(session); + else + WT_RET( + __wt_fopen(session, ofile, WT_FS_OPEN_CREATE | WT_FS_OPEN_FIXED, WT_STREAM_WRITE, &fs)); + + WT_ERR(__wt_fprintf(session, fs, "[\n")); + args.fs = fs; + args.flags = flags; + WT_ERR(__wt_log_scan(session, NULL, WT_LOGSCAN_FIRST, __txn_printlog, &args)); + ret = __wt_fprintf(session, fs, "\n]\n"); + +err: + if (ofile != NULL) + WT_TRET(__wt_fclose(session, &fs)); + + return (ret); } diff --git 
a/src/third_party/wiredtiger/src/txn/txn_nsnap.c b/src/third_party/wiredtiger/src/txn/txn_nsnap.c index 533c67b70b0..f652e23c87d 100644 --- a/src/third_party/wiredtiger/src/txn/txn_nsnap.c +++ b/src/third_party/wiredtiger/src/txn/txn_nsnap.c @@ -10,420 +10,397 @@ /* * __nsnap_destroy -- - * Destroy a named snapshot structure. + * Destroy a named snapshot structure. */ static void __nsnap_destroy(WT_SESSION_IMPL *session, WT_NAMED_SNAPSHOT *nsnap) { - __wt_free(session, nsnap->name); - __wt_free(session, nsnap->snapshot); - __wt_free(session, nsnap); + __wt_free(session, nsnap->name); + __wt_free(session, nsnap->snapshot); + __wt_free(session, nsnap); } /* * __nsnap_drop_one -- - * Drop a single named snapshot. The named snapshot lock must be held - * write locked. + * Drop a single named snapshot. The named snapshot lock must be held write locked. */ static int __nsnap_drop_one(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *name) { - WT_NAMED_SNAPSHOT *found; - WT_TXN_GLOBAL *txn_global; - - txn_global = &S2C(session)->txn_global; - - TAILQ_FOREACH(found, &txn_global->nsnaph, q) - if (WT_STRING_MATCH(found->name, name->str, name->len)) - break; - - if (found == NULL) - return (WT_NOTFOUND); - - /* Bump the global ID if we are removing the first entry */ - if (found == TAILQ_FIRST(&txn_global->nsnaph)) { - WT_ASSERT(session, !__wt_txn_visible_all( - session, txn_global->nsnap_oldest_id, WT_TS_NONE)); - txn_global->nsnap_oldest_id = (TAILQ_NEXT(found, q) != NULL) ? 
- TAILQ_NEXT(found, q)->pinned_id : WT_TXN_NONE; - WT_DIAGNOSTIC_YIELD; - WT_ASSERT(session, txn_global->nsnap_oldest_id == WT_TXN_NONE || - !__wt_txn_visible_all( - session, txn_global->nsnap_oldest_id, WT_TS_NONE)); - } - TAILQ_REMOVE(&txn_global->nsnaph, found, q); - __nsnap_destroy(session, found); - WT_STAT_CONN_INCR(session, txn_snapshots_dropped); - - return (0); + WT_NAMED_SNAPSHOT *found; + WT_TXN_GLOBAL *txn_global; + + txn_global = &S2C(session)->txn_global; + + TAILQ_FOREACH (found, &txn_global->nsnaph, q) + if (WT_STRING_MATCH(found->name, name->str, name->len)) + break; + + if (found == NULL) + return (WT_NOTFOUND); + + /* Bump the global ID if we are removing the first entry */ + if (found == TAILQ_FIRST(&txn_global->nsnaph)) { + WT_ASSERT(session, !__wt_txn_visible_all(session, txn_global->nsnap_oldest_id, WT_TS_NONE)); + txn_global->nsnap_oldest_id = + (TAILQ_NEXT(found, q) != NULL) ? TAILQ_NEXT(found, q)->pinned_id : WT_TXN_NONE; + WT_DIAGNOSTIC_YIELD; + WT_ASSERT(session, txn_global->nsnap_oldest_id == WT_TXN_NONE || + !__wt_txn_visible_all(session, txn_global->nsnap_oldest_id, WT_TS_NONE)); + } + TAILQ_REMOVE(&txn_global->nsnaph, found, q); + __nsnap_destroy(session, found); + WT_STAT_CONN_INCR(session, txn_snapshots_dropped); + + return (0); } /* * __nsnap_drop_to -- - * Drop named snapshots, if the name is NULL all snapshots will be - * dropped. The named snapshot lock must be held write locked. + * Drop named snapshots, if the name is NULL all snapshots will be dropped. The named snapshot + * lock must be held write locked. */ static int __nsnap_drop_to(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *name, bool inclusive) { - WT_NAMED_SNAPSHOT *last, *nsnap, *prev; - WT_TXN_GLOBAL *txn_global; - uint64_t new_nsnap_oldest; - - last = nsnap = prev = NULL; - txn_global = &S2C(session)->txn_global; - - if (TAILQ_EMPTY(&txn_global->nsnaph)) { - if (name == NULL) - return (0); - /* - * Dropping specific snapshots when there aren't any it's an - * error. 
- */ - WT_RET_MSG(session, EINVAL, - "Named snapshot '%.*s' for drop not found", - (int)name->len, name->str); - } - - /* - * The new ID will be none if we are removing all named snapshots - * which is the default behavior of this loop. - */ - new_nsnap_oldest = WT_TXN_NONE; - if (name != NULL) { - TAILQ_FOREACH(last, &txn_global->nsnaph, q) { - if (WT_STRING_MATCH(last->name, name->str, name->len)) - break; - prev = last; - } - if (last == NULL) - WT_RET_MSG(session, EINVAL, - "Named snapshot '%.*s' for drop not found", - (int)name->len, name->str); - - if (!inclusive) { - /* We are done if a drop before points to the head */ - if (prev == 0) - return (0); - last = prev; - } - - if (TAILQ_NEXT(last, q) != NULL) - new_nsnap_oldest = TAILQ_NEXT(last, q)->pinned_id; - } - - do { - nsnap = TAILQ_FIRST(&txn_global->nsnaph); - WT_ASSERT(session, nsnap != NULL); - TAILQ_REMOVE(&txn_global->nsnaph, nsnap, q); - __nsnap_destroy(session, nsnap); - WT_STAT_CONN_INCR(session, txn_snapshots_dropped); - /* Last will be NULL in the all case so it will never match */ - } while (nsnap != last && !TAILQ_EMPTY(&txn_global->nsnaph)); - - /* Now that the queue of named snapshots is updated, update the ID */ - WT_ASSERT(session, !__wt_txn_visible_all( - session, txn_global->nsnap_oldest_id, WT_TS_NONE) && - (new_nsnap_oldest == WT_TXN_NONE || - WT_TXNID_LE(txn_global->nsnap_oldest_id, new_nsnap_oldest))); - txn_global->nsnap_oldest_id = new_nsnap_oldest; - WT_DIAGNOSTIC_YIELD; - WT_ASSERT(session, - new_nsnap_oldest == WT_TXN_NONE || - !__wt_txn_visible_all(session, new_nsnap_oldest, WT_TS_NONE)); - - return (0); + WT_NAMED_SNAPSHOT *last, *nsnap, *prev; + WT_TXN_GLOBAL *txn_global; + uint64_t new_nsnap_oldest; + + last = nsnap = prev = NULL; + txn_global = &S2C(session)->txn_global; + + if (TAILQ_EMPTY(&txn_global->nsnaph)) { + if (name == NULL) + return (0); + /* + * Dropping specific snapshots when there aren't any it's an error. 
+ */ + WT_RET_MSG( + session, EINVAL, "Named snapshot '%.*s' for drop not found", (int)name->len, name->str); + } + + /* + * The new ID will be none if we are removing all named snapshots which is the default behavior + * of this loop. + */ + new_nsnap_oldest = WT_TXN_NONE; + if (name != NULL) { + TAILQ_FOREACH (last, &txn_global->nsnaph, q) { + if (WT_STRING_MATCH(last->name, name->str, name->len)) + break; + prev = last; + } + if (last == NULL) + WT_RET_MSG(session, EINVAL, "Named snapshot '%.*s' for drop not found", (int)name->len, + name->str); + + if (!inclusive) { + /* We are done if a drop before points to the head */ + if (prev == 0) + return (0); + last = prev; + } + + if (TAILQ_NEXT(last, q) != NULL) + new_nsnap_oldest = TAILQ_NEXT(last, q)->pinned_id; + } + + do { + nsnap = TAILQ_FIRST(&txn_global->nsnaph); + WT_ASSERT(session, nsnap != NULL); + TAILQ_REMOVE(&txn_global->nsnaph, nsnap, q); + __nsnap_destroy(session, nsnap); + WT_STAT_CONN_INCR(session, txn_snapshots_dropped); + /* Last will be NULL in the all case so it will never match */ + } while (nsnap != last && !TAILQ_EMPTY(&txn_global->nsnaph)); + + /* Now that the queue of named snapshots is updated, update the ID */ + WT_ASSERT(session, !__wt_txn_visible_all(session, txn_global->nsnap_oldest_id, WT_TS_NONE) && + (new_nsnap_oldest == WT_TXN_NONE || + WT_TXNID_LE(txn_global->nsnap_oldest_id, new_nsnap_oldest))); + txn_global->nsnap_oldest_id = new_nsnap_oldest; + WT_DIAGNOSTIC_YIELD; + WT_ASSERT(session, new_nsnap_oldest == WT_TXN_NONE || + !__wt_txn_visible_all(session, new_nsnap_oldest, WT_TS_NONE)); + + return (0); } /* * __wt_txn_named_snapshot_begin -- - * Begin an named in-memory snapshot. + * Begin an named in-memory snapshot. 
*/ int __wt_txn_named_snapshot_begin(WT_SESSION_IMPL *session, const char *cfg[]) { - WT_CONFIG_ITEM cval; - WT_DECL_RET; - WT_NAMED_SNAPSHOT *nsnap, *nsnap_new; - WT_TXN *txn; - WT_TXN_GLOBAL *txn_global; - const char *txn_cfg[] = - { WT_CONFIG_BASE(session, WT_SESSION_begin_transaction), - "isolation=snapshot", NULL }; - bool include_updates, started_txn; - - started_txn = false; - nsnap_new = NULL; - txn_global = &S2C(session)->txn_global; - txn = &session->txn; - - WT_RET(__wt_config_gets_def(session, cfg, "include_updates", 0, &cval)); - include_updates = cval.val != 0; - - WT_RET(__wt_config_gets_def(session, cfg, "name", 0, &cval)); - WT_ASSERT(session, cval.len != 0); - - if (!F_ISSET(txn, WT_TXN_RUNNING)) { - if (include_updates) - WT_RET_MSG(session, EINVAL, "A transaction must be " - "running to include updates in a named snapshot"); - - WT_RET(__wt_txn_begin(session, txn_cfg)); - started_txn = true; - } - if (!include_updates) - F_SET(txn, WT_TXN_READONLY); - - /* Save a copy of the transaction's snapshot. */ - WT_ERR(__wt_calloc_one(session, &nsnap_new)); - nsnap = nsnap_new; - WT_ERR(__wt_strndup(session, cval.str, cval.len, &nsnap->name)); - - /* - * To include updates from a writing transaction, make sure a - * transaction ID has been allocated. - */ - if (include_updates) { - WT_ERR(__wt_txn_id_check(session)); - WT_ASSERT(session, txn->id != WT_TXN_NONE); - nsnap->id = txn->id; - } else - nsnap->id = WT_TXN_NONE; - nsnap->pinned_id = WT_SESSION_TXN_STATE(session)->pinned_id; - nsnap->snap_min = txn->snap_min; - nsnap->snap_max = txn->snap_max; - if (txn->snapshot_count > 0) { - WT_ERR(__wt_calloc_def( - session, txn->snapshot_count, &nsnap->snapshot)); - memcpy(nsnap->snapshot, txn->snapshot, - txn->snapshot_count * sizeof(*nsnap->snapshot)); - } - nsnap->snapshot_count = txn->snapshot_count; - - /* Update the list. */ - - /* - * The semantic is that a new snapshot with the same name as an - * existing snapshot will replace the old one. 
- */ - WT_ERR_NOTFOUND_OK(__nsnap_drop_one(session, &cval)); - - if (TAILQ_EMPTY(&txn_global->nsnaph)) { - WT_ASSERT(session, txn_global->nsnap_oldest_id == WT_TXN_NONE && - !__wt_txn_visible_all( - session, nsnap_new->pinned_id, WT_TS_NONE)); - __wt_readlock(session, &txn_global->rwlock); - txn_global->nsnap_oldest_id = nsnap_new->pinned_id; - __wt_readunlock(session, &txn_global->rwlock); - } - TAILQ_INSERT_TAIL(&txn_global->nsnaph, nsnap_new, q); - WT_STAT_CONN_INCR(session, txn_snapshots_created); - nsnap_new = NULL; - -err: if (started_txn) { + WT_CONFIG_ITEM cval; + WT_DECL_RET; + WT_NAMED_SNAPSHOT *nsnap, *nsnap_new; + WT_TXN *txn; + WT_TXN_GLOBAL *txn_global; + const char *txn_cfg[] = { + WT_CONFIG_BASE(session, WT_SESSION_begin_transaction), "isolation=snapshot", NULL}; + bool include_updates, started_txn; + + started_txn = false; + nsnap_new = NULL; + txn_global = &S2C(session)->txn_global; + txn = &session->txn; + + WT_RET(__wt_config_gets_def(session, cfg, "include_updates", 0, &cval)); + include_updates = cval.val != 0; + + WT_RET(__wt_config_gets_def(session, cfg, "name", 0, &cval)); + WT_ASSERT(session, cval.len != 0); + + if (!F_ISSET(txn, WT_TXN_RUNNING)) { + if (include_updates) + WT_RET_MSG(session, EINVAL, + "A transaction must be " + "running to include updates in a named snapshot"); + + WT_RET(__wt_txn_begin(session, txn_cfg)); + started_txn = true; + } + if (!include_updates) + F_SET(txn, WT_TXN_READONLY); + + /* Save a copy of the transaction's snapshot. */ + WT_ERR(__wt_calloc_one(session, &nsnap_new)); + nsnap = nsnap_new; + WT_ERR(__wt_strndup(session, cval.str, cval.len, &nsnap->name)); + + /* + * To include updates from a writing transaction, make sure a transaction ID has been allocated. 
+ */ + if (include_updates) { + WT_ERR(__wt_txn_id_check(session)); + WT_ASSERT(session, txn->id != WT_TXN_NONE); + nsnap->id = txn->id; + } else + nsnap->id = WT_TXN_NONE; + nsnap->pinned_id = WT_SESSION_TXN_STATE(session)->pinned_id; + nsnap->snap_min = txn->snap_min; + nsnap->snap_max = txn->snap_max; + if (txn->snapshot_count > 0) { + WT_ERR(__wt_calloc_def(session, txn->snapshot_count, &nsnap->snapshot)); + memcpy(nsnap->snapshot, txn->snapshot, txn->snapshot_count * sizeof(*nsnap->snapshot)); + } + nsnap->snapshot_count = txn->snapshot_count; + + /* Update the list. */ + + /* + * The semantic is that a new snapshot with the same name as an existing snapshot will replace + * the old one. + */ + WT_ERR_NOTFOUND_OK(__nsnap_drop_one(session, &cval)); + + if (TAILQ_EMPTY(&txn_global->nsnaph)) { + WT_ASSERT(session, txn_global->nsnap_oldest_id == WT_TXN_NONE && + !__wt_txn_visible_all(session, nsnap_new->pinned_id, WT_TS_NONE)); + __wt_readlock(session, &txn_global->rwlock); + txn_global->nsnap_oldest_id = nsnap_new->pinned_id; + __wt_readunlock(session, &txn_global->rwlock); + } + TAILQ_INSERT_TAIL(&txn_global->nsnaph, nsnap_new, q); + WT_STAT_CONN_INCR(session, txn_snapshots_created); + nsnap_new = NULL; + +err: + if (started_txn) { #ifdef HAVE_DIAGNOSTIC - uint64_t pinned_id = WT_SESSION_TXN_STATE(session)->pinned_id; + uint64_t pinned_id = WT_SESSION_TXN_STATE(session)->pinned_id; #endif - WT_TRET(__wt_txn_rollback(session, NULL)); - WT_DIAGNOSTIC_YIELD; - WT_ASSERT(session, - !__wt_txn_visible_all(session, pinned_id, WT_TS_NONE)); - } + WT_TRET(__wt_txn_rollback(session, NULL)); + WT_DIAGNOSTIC_YIELD; + WT_ASSERT(session, !__wt_txn_visible_all(session, pinned_id, WT_TS_NONE)); + } - if (nsnap_new != NULL) - __nsnap_destroy(session, nsnap_new); + if (nsnap_new != NULL) + __nsnap_destroy(session, nsnap_new); - return (ret); + return (ret); } /* * __wt_txn_named_snapshot_drop -- - * Drop named snapshots + * Drop named snapshots */ int 
__wt_txn_named_snapshot_drop(WT_SESSION_IMPL *session, const char *cfg[]) { - WT_CONFIG objectconf; - WT_CONFIG_ITEM all_config, before_config, k, names_config, to_config, v; - WT_DECL_RET; - - WT_RET(__wt_config_gets_def(session, cfg, "drop.all", 0, &all_config)); - WT_RET(__wt_config_gets_def( - session, cfg, "drop.names", 0, &names_config)); - WT_RET(__wt_config_gets_def(session, cfg, "drop.to", 0, &to_config)); - WT_RET(__wt_config_gets_def( - session, cfg, "drop.before", 0, &before_config)); - - if (all_config.val != 0) - WT_RET(__nsnap_drop_to(session, NULL, true)); - else if (before_config.len != 0) - WT_RET(__nsnap_drop_to(session, &before_config, false)); - else if (to_config.len != 0) - WT_RET(__nsnap_drop_to(session, &to_config, true)); - - /* We are done if there are no named drops */ - - if (names_config.len != 0) { - __wt_config_subinit(session, &objectconf, &names_config); - while ((ret = __wt_config_next(&objectconf, &k, &v)) == 0) { - ret = __nsnap_drop_one(session, &k); - if (ret != 0) - WT_RET_MSG(session, EINVAL, - "Named snapshot '%.*s' for drop not found", - (int)k.len, k.str); - } - if (ret == WT_NOTFOUND) - ret = 0; - } - - return (ret); + WT_CONFIG objectconf; + WT_CONFIG_ITEM all_config, before_config, k, names_config, to_config, v; + WT_DECL_RET; + + WT_RET(__wt_config_gets_def(session, cfg, "drop.all", 0, &all_config)); + WT_RET(__wt_config_gets_def(session, cfg, "drop.names", 0, &names_config)); + WT_RET(__wt_config_gets_def(session, cfg, "drop.to", 0, &to_config)); + WT_RET(__wt_config_gets_def(session, cfg, "drop.before", 0, &before_config)); + + if (all_config.val != 0) + WT_RET(__nsnap_drop_to(session, NULL, true)); + else if (before_config.len != 0) + WT_RET(__nsnap_drop_to(session, &before_config, false)); + else if (to_config.len != 0) + WT_RET(__nsnap_drop_to(session, &to_config, true)); + + /* We are done if there are no named drops */ + + if (names_config.len != 0) { + __wt_config_subinit(session, &objectconf, &names_config); 
+ while ((ret = __wt_config_next(&objectconf, &k, &v)) == 0) { + ret = __nsnap_drop_one(session, &k); + if (ret != 0) + WT_RET_MSG( + session, EINVAL, "Named snapshot '%.*s' for drop not found", (int)k.len, k.str); + } + if (ret == WT_NOTFOUND) + ret = 0; + } + + return (ret); } /* * __wt_txn_named_snapshot_get -- - * Lookup a named snapshot for a transaction. + * Lookup a named snapshot for a transaction. */ int __wt_txn_named_snapshot_get(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *nameval) { - WT_NAMED_SNAPSHOT *nsnap; - WT_TXN *txn; - WT_TXN_GLOBAL *txn_global; - WT_TXN_STATE *txn_state; - - txn = &session->txn; - txn_global = &S2C(session)->txn_global; - txn_state = WT_SESSION_TXN_STATE(session); - - txn->isolation = WT_ISO_SNAPSHOT; - if (session->ncursors > 0) - WT_RET(__wt_session_copy_values(session)); - - __wt_readlock(session, &txn_global->nsnap_rwlock); - TAILQ_FOREACH(nsnap, &txn_global->nsnaph, q) - if (WT_STRING_MATCH(nsnap->name, nameval->str, nameval->len)) { - /* - * Acquire the scan lock so the oldest ID can't move - * forward without seeing our pinned ID. 
- */ - __wt_readlock(session, &txn_global->rwlock); - txn_state->pinned_id = nsnap->pinned_id; - __wt_readunlock(session, &txn_global->rwlock); - - WT_ASSERT(session, !__wt_txn_visible_all( - session, txn_state->pinned_id, WT_TS_NONE) && - txn_global->nsnap_oldest_id != WT_TXN_NONE && - WT_TXNID_LE(txn_global->nsnap_oldest_id, - txn_state->pinned_id)); - txn->snap_min = nsnap->snap_min; - txn->snap_max = nsnap->snap_max; - if ((txn->snapshot_count = nsnap->snapshot_count) != 0) - memcpy(txn->snapshot, nsnap->snapshot, - nsnap->snapshot_count * - sizeof(*nsnap->snapshot)); - if (nsnap->id != WT_TXN_NONE) { - WT_ASSERT(session, txn->id == WT_TXN_NONE); - txn->id = nsnap->id; - F_SET(txn, WT_TXN_READONLY); - } - F_SET(txn, WT_TXN_HAS_SNAPSHOT); - break; - } - __wt_readunlock(session, &txn_global->nsnap_rwlock); - - if (nsnap == NULL) - WT_RET_MSG(session, EINVAL, - "Named snapshot '%.*s' not found", - (int)nameval->len, nameval->str); - - /* Flag that this transaction is opened on a named snapshot */ - F_SET(txn, WT_TXN_NAMED_SNAPSHOT); - - return (0); + WT_NAMED_SNAPSHOT *nsnap; + WT_TXN *txn; + WT_TXN_GLOBAL *txn_global; + WT_TXN_STATE *txn_state; + + txn = &session->txn; + txn_global = &S2C(session)->txn_global; + txn_state = WT_SESSION_TXN_STATE(session); + + txn->isolation = WT_ISO_SNAPSHOT; + if (session->ncursors > 0) + WT_RET(__wt_session_copy_values(session)); + + __wt_readlock(session, &txn_global->nsnap_rwlock); + TAILQ_FOREACH (nsnap, &txn_global->nsnaph, q) + if (WT_STRING_MATCH(nsnap->name, nameval->str, nameval->len)) { + /* + * Acquire the scan lock so the oldest ID can't move forward without seeing our pinned + * ID. 
+ */ + __wt_readlock(session, &txn_global->rwlock); + txn_state->pinned_id = nsnap->pinned_id; + __wt_readunlock(session, &txn_global->rwlock); + + WT_ASSERT(session, !__wt_txn_visible_all(session, txn_state->pinned_id, WT_TS_NONE) && + txn_global->nsnap_oldest_id != WT_TXN_NONE && + WT_TXNID_LE(txn_global->nsnap_oldest_id, txn_state->pinned_id)); + txn->snap_min = nsnap->snap_min; + txn->snap_max = nsnap->snap_max; + if ((txn->snapshot_count = nsnap->snapshot_count) != 0) + memcpy( + txn->snapshot, nsnap->snapshot, nsnap->snapshot_count * sizeof(*nsnap->snapshot)); + if (nsnap->id != WT_TXN_NONE) { + WT_ASSERT(session, txn->id == WT_TXN_NONE); + txn->id = nsnap->id; + F_SET(txn, WT_TXN_READONLY); + } + F_SET(txn, WT_TXN_HAS_SNAPSHOT); + break; + } + __wt_readunlock(session, &txn_global->nsnap_rwlock); + + if (nsnap == NULL) + WT_RET_MSG( + session, EINVAL, "Named snapshot '%.*s' not found", (int)nameval->len, nameval->str); + + /* Flag that this transaction is opened on a named snapshot */ + F_SET(txn, WT_TXN_NAMED_SNAPSHOT); + + return (0); } /* * __wt_txn_named_snapshot_config -- - * Check the configuration for a named snapshot + * Check the configuration for a named snapshot */ int -__wt_txn_named_snapshot_config(WT_SESSION_IMPL *session, - const char *cfg[], bool *has_create, bool *has_drops) +__wt_txn_named_snapshot_config( + WT_SESSION_IMPL *session, const char *cfg[], bool *has_create, bool *has_drops) { - WT_CONFIG_ITEM all_config, before_config, names_config, to_config; - WT_CONFIG_ITEM cval; - WT_TXN *txn; - - txn = &session->txn; - *has_create = *has_drops = false; - - /* Verify that the name is legal. 
*/ - WT_RET(__wt_config_gets_def(session, cfg, "name", 0, &cval)); - if (cval.len != 0) { - if (WT_STRING_MATCH("all", cval.str, cval.len)) - WT_RET_MSG(session, EINVAL, - "Can't create snapshot with reserved \"all\" name"); - - WT_RET(__wt_name_check(session, cval.str, cval.len)); - - if (F_ISSET(txn, WT_TXN_RUNNING) && - txn->isolation != WT_ISO_SNAPSHOT) - WT_RET_MSG(session, EINVAL, - "Can't create a named snapshot from a running " - "transaction that isn't snapshot isolation"); - else if (F_ISSET(txn, WT_TXN_RUNNING) && txn->mod_count != 0) - WT_RET_MSG(session, EINVAL, - "Can't create a named snapshot from a running " - "transaction that has made updates"); - *has_create = true; - } - - /* Verify that the drop configuration is sane. */ - WT_RET(__wt_config_gets_def(session, cfg, "drop.all", 0, &all_config)); - WT_RET(__wt_config_gets_def( - session, cfg, "drop.names", 0, &names_config)); - WT_RET(__wt_config_gets_def(session, cfg, "drop.to", 0, &to_config)); - WT_RET(__wt_config_gets_def( - session, cfg, "drop.before", 0, &before_config)); - - /* Avoid more work if no drops are configured. 
*/ - if (all_config.val != 0 || names_config.len != 0 || - before_config.len != 0 || to_config.len != 0) { - if (before_config.len != 0 && to_config.len != 0) - WT_RET_MSG(session, EINVAL, - "Illegal configuration; named snapshot drop can't " - "specify both before and to options"); - if (all_config.val != 0 && (names_config.len != 0 || - to_config.len != 0 || before_config.len != 0)) - WT_RET_MSG(session, EINVAL, - "Illegal configuration; named snapshot drop can't " - "specify all and any other options"); - *has_drops = true; - } - - if (!*has_create && !*has_drops) - WT_RET_MSG(session, EINVAL, - "WT_SESSION::snapshot API called without any drop or " - "name option"); - - return (0); + WT_CONFIG_ITEM all_config, before_config, names_config, to_config; + WT_CONFIG_ITEM cval; + WT_TXN *txn; + + txn = &session->txn; + *has_create = *has_drops = false; + + /* Verify that the name is legal. */ + WT_RET(__wt_config_gets_def(session, cfg, "name", 0, &cval)); + if (cval.len != 0) { + if (WT_STRING_MATCH("all", cval.str, cval.len)) + WT_RET_MSG(session, EINVAL, "Can't create snapshot with reserved \"all\" name"); + + WT_RET(__wt_name_check(session, cval.str, cval.len)); + + if (F_ISSET(txn, WT_TXN_RUNNING) && txn->isolation != WT_ISO_SNAPSHOT) + WT_RET_MSG(session, EINVAL, + "Can't create a named snapshot from a running " + "transaction that isn't snapshot isolation"); + else if (F_ISSET(txn, WT_TXN_RUNNING) && txn->mod_count != 0) + WT_RET_MSG(session, EINVAL, + "Can't create a named snapshot from a running " + "transaction that has made updates"); + *has_create = true; + } + + /* Verify that the drop configuration is sane. 
*/ + WT_RET(__wt_config_gets_def(session, cfg, "drop.all", 0, &all_config)); + WT_RET(__wt_config_gets_def(session, cfg, "drop.names", 0, &names_config)); + WT_RET(__wt_config_gets_def(session, cfg, "drop.to", 0, &to_config)); + WT_RET(__wt_config_gets_def(session, cfg, "drop.before", 0, &before_config)); + + /* Avoid more work if no drops are configured. */ + if (all_config.val != 0 || names_config.len != 0 || before_config.len != 0 || + to_config.len != 0) { + if (before_config.len != 0 && to_config.len != 0) + WT_RET_MSG(session, EINVAL, + "Illegal configuration; named snapshot drop can't " + "specify both before and to options"); + if (all_config.val != 0 && + (names_config.len != 0 || to_config.len != 0 || before_config.len != 0)) + WT_RET_MSG(session, EINVAL, + "Illegal configuration; named snapshot drop can't " + "specify all and any other options"); + *has_drops = true; + } + + if (!*has_create && !*has_drops) + WT_RET_MSG(session, EINVAL, + "WT_SESSION::snapshot API called without any drop or " + "name option"); + + return (0); } /* * __wt_txn_named_snapshot_destroy -- - * Destroy all named snapshots on connection close + * Destroy all named snapshots on connection close */ void __wt_txn_named_snapshot_destroy(WT_SESSION_IMPL *session) { - WT_NAMED_SNAPSHOT *nsnap; - WT_TXN_GLOBAL *txn_global; + WT_NAMED_SNAPSHOT *nsnap; + WT_TXN_GLOBAL *txn_global; - txn_global = &S2C(session)->txn_global; - txn_global->nsnap_oldest_id = WT_TXN_NONE; + txn_global = &S2C(session)->txn_global; + txn_global->nsnap_oldest_id = WT_TXN_NONE; - while ((nsnap = TAILQ_FIRST(&txn_global->nsnaph)) != NULL) { - TAILQ_REMOVE(&txn_global->nsnaph, nsnap, q); - __nsnap_destroy(session, nsnap); - } + while ((nsnap = TAILQ_FIRST(&txn_global->nsnaph)) != NULL) { + TAILQ_REMOVE(&txn_global->nsnaph, nsnap, q); + __nsnap_destroy(session, nsnap); + } } diff --git a/src/third_party/wiredtiger/src/txn/txn_recover.c b/src/third_party/wiredtiger/src/txn/txn_recover.c index 504b2c0e8b4..17e0b61c904 
100644 --- a/src/third_party/wiredtiger/src/txn/txn_recover.c +++ b/src/third_party/wiredtiger/src/txn/txn_recover.c @@ -10,776 +10,717 @@ /* State maintained during recovery. */ typedef struct { - const char *uri; /* File URI. */ - WT_CURSOR *c; /* Cursor used for recovery. */ - WT_LSN ckpt_lsn; /* File's checkpoint LSN. */ + const char *uri; /* File URI. */ + WT_CURSOR *c; /* Cursor used for recovery. */ + WT_LSN ckpt_lsn; /* File's checkpoint LSN. */ } WT_RECOVERY_FILE; typedef struct { - WT_SESSION_IMPL *session; - - /* Files from the metadata, indexed by file ID. */ - WT_RECOVERY_FILE *files; - size_t file_alloc; /* Allocated size of files array. */ - u_int max_fileid; /* Maximum file ID seen. */ - u_int nfiles; /* Number of files in the metadata. */ - - WT_LSN ckpt_lsn; /* Start LSN for main recovery loop. */ - WT_LSN max_ckpt_lsn; /* Maximum checkpoint LSN seen. */ - WT_LSN max_rec_lsn; /* Maximum recovery LSN seen. */ - - bool missing; /* Were there missing files? */ - bool metadata_only; /* - * Set during the first recovery pass, - * when only the metadata is recovered. - */ + WT_SESSION_IMPL *session; + + /* Files from the metadata, indexed by file ID. */ + WT_RECOVERY_FILE *files; + size_t file_alloc; /* Allocated size of files array. */ + u_int max_fileid; /* Maximum file ID seen. */ + u_int nfiles; /* Number of files in the metadata. */ + + WT_LSN ckpt_lsn; /* Start LSN for main recovery loop. */ + WT_LSN max_ckpt_lsn; /* Maximum checkpoint LSN seen. */ + WT_LSN max_rec_lsn; /* Maximum recovery LSN seen. */ + + bool missing; /* Were there missing files? */ + bool metadata_only; /* + * Set during the first recovery pass, + * when only the metadata is recovered. + */ } WT_RECOVERY; /* * __recovery_cursor -- - * Get a cursor for a recovery operation. + * Get a cursor for a recovery operation. 
*/ static int -__recovery_cursor(WT_SESSION_IMPL *session, WT_RECOVERY *r, - WT_LSN *lsnp, u_int id, bool duplicate, WT_CURSOR **cp) +__recovery_cursor( + WT_SESSION_IMPL *session, WT_RECOVERY *r, WT_LSN *lsnp, u_int id, bool duplicate, WT_CURSOR **cp) { - WT_CURSOR *c; - bool metadata_op; - const char *cfg[] = { WT_CONFIG_BASE( - session, WT_SESSION_open_cursor), "overwrite", NULL }; - - c = NULL; - - /* - * File ids with the bit set to ignore this operation are skipped. - */ - if (WT_LOGOP_IS_IGNORED(id)) - return (0); - /* - * Metadata operations have an id of 0. Match operations based - * on the id and the current pass of recovery for metadata. - * - * Only apply operations in the correct metadata phase, and if the LSN - * is more recent than the last checkpoint. If there is no entry for a - * file, assume it was dropped or missing after a hot backup. - */ - metadata_op = id == WT_METAFILE_ID; - if (r->metadata_only != metadata_op) - ; - else if (id >= r->nfiles || r->files[id].uri == NULL) { - /* If a file is missing, output a verbose message once. */ - if (!r->missing) - __wt_verbose(session, WT_VERB_RECOVERY, - "No file found with ID %u (max %u)", - id, r->nfiles); - r->missing = true; - } else if (__wt_log_cmp(lsnp, &r->files[id].ckpt_lsn) >= 0) { - /* - * We're going to apply the operation. Get the cursor, opening - * one if none is cached. - */ - if ((c = r->files[id].c) == NULL) { - WT_RET(__wt_open_cursor( - session, r->files[id].uri, NULL, cfg, &c)); - r->files[id].c = c; - } - } - - if (duplicate && c != NULL) - WT_RET(__wt_open_cursor( - session, r->files[id].uri, NULL, cfg, &c)); - - *cp = c; - return (0); + WT_CURSOR *c; + const char *cfg[] = {WT_CONFIG_BASE(session, WT_SESSION_open_cursor), "overwrite", NULL}; + bool metadata_op; + + c = NULL; + + /* + * File ids with the bit set to ignore this operation are skipped. + */ + if (WT_LOGOP_IS_IGNORED(id)) + return (0); + /* + * Metadata operations have an id of 0. 
Match operations based + * on the id and the current pass of recovery for metadata. + * + * Only apply operations in the correct metadata phase, and if the LSN + * is more recent than the last checkpoint. If there is no entry for a + * file, assume it was dropped or missing after a hot backup. + */ + metadata_op = id == WT_METAFILE_ID; + if (r->metadata_only != metadata_op) + ; + else if (id >= r->nfiles || r->files[id].uri == NULL) { + /* If a file is missing, output a verbose message once. */ + if (!r->missing) + __wt_verbose( + session, WT_VERB_RECOVERY, "No file found with ID %u (max %u)", id, r->nfiles); + r->missing = true; + } else if (__wt_log_cmp(lsnp, &r->files[id].ckpt_lsn) >= 0) { + /* + * We're going to apply the operation. Get the cursor, opening one if none is cached. + */ + if ((c = r->files[id].c) == NULL) { + WT_RET(__wt_open_cursor(session, r->files[id].uri, NULL, cfg, &c)); + r->files[id].c = c; + } + } + + if (duplicate && c != NULL) + WT_RET(__wt_open_cursor(session, r->files[id].uri, NULL, cfg, &c)); + + *cp = c; + return (0); } /* * Helper to a cursor if this operation is to be applied during recovery. */ -#define GET_RECOVERY_CURSOR(session, r, lsnp, fileid, cp) \ - ret = __recovery_cursor(session, r, lsnp, fileid, false, cp); \ - __wt_verbose(session, WT_VERB_RECOVERY, \ - "%s op %" PRIu32 " to file %" PRIu32 " at LSN %" PRIu32 \ - "/%" PRIu32, \ - ret != 0 ? "Error" : \ - cursor == NULL ? "Skipping" : "Applying", \ - optype, fileid, (lsnp)->l.file, (lsnp)->l.offset); \ - WT_ERR(ret); \ - if (cursor == NULL) \ - break +#define GET_RECOVERY_CURSOR(session, r, lsnp, fileid, cp) \ + ret = __recovery_cursor(session, r, lsnp, fileid, false, cp); \ + __wt_verbose(session, WT_VERB_RECOVERY, \ + "%s op %" PRIu32 " to file %" PRIu32 " at LSN %" PRIu32 "/%" PRIu32, \ + ret != 0 ? "Error" : cursor == NULL ? 
"Skipping" : "Applying", optype, fileid, \ + (lsnp)->l.file, (lsnp)->l.offset); \ + WT_ERR(ret); \ + if (cursor == NULL) \ + break /* * __txn_op_apply -- - * Apply a transactional operation during recovery. + * Apply a transactional operation during recovery. */ static int -__txn_op_apply( - WT_RECOVERY *r, WT_LSN *lsnp, const uint8_t **pp, const uint8_t *end) +__txn_op_apply(WT_RECOVERY *r, WT_LSN *lsnp, const uint8_t **pp, const uint8_t *end) { - WT_CURSOR *cursor, *start, *stop; - WT_DECL_RET; - WT_ITEM key, start_key, stop_key, value; - WT_SESSION_IMPL *session; - wt_timestamp_t commit, durable, first, prepare, read; - uint64_t recno, start_recno, stop_recno, t_nsec, t_sec; - uint32_t fileid, mode, optype, opsize; - - session = r->session; - cursor = NULL; - - /* Peek at the size and the type. */ - WT_ERR(__wt_logop_read(session, pp, end, &optype, &opsize)); - end = *pp + opsize; - - /* - * If it is an operation type that should be ignored, we're done. - * Note that file ids within known operations also use the same - * macros to indicate that operation should be ignored. - */ - if (WT_LOGOP_IS_IGNORED(optype)) { - *pp += opsize; - goto done; - } - - switch (optype) { - case WT_LOGOP_COL_MODIFY: - WT_ERR(__wt_logop_col_modify_unpack(session, pp, end, - &fileid, &recno, &value)); - GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor); - cursor->set_key(cursor, recno); - if ((ret = cursor->search(cursor)) != 0) - WT_ERR_NOTFOUND_OK(ret); - else { - /* - * Build/insert a complete value during recovery rather - * than using cursor modify to create a partial update - * (for no particular reason than simplicity). 
- */ - WT_ERR(__wt_modify_apply(cursor, value.data)); - WT_ERR(cursor->insert(cursor)); - } - break; - - case WT_LOGOP_COL_PUT: - WT_ERR(__wt_logop_col_put_unpack(session, pp, end, - &fileid, &recno, &value)); - GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor); - cursor->set_key(cursor, recno); - __wt_cursor_set_raw_value(cursor, &value); - WT_ERR(cursor->insert(cursor)); - break; - - case WT_LOGOP_COL_REMOVE: - WT_ERR(__wt_logop_col_remove_unpack(session, pp, end, - &fileid, &recno)); - GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor); - cursor->set_key(cursor, recno); - WT_ERR(cursor->remove(cursor)); - break; - - case WT_LOGOP_COL_TRUNCATE: - WT_ERR(__wt_logop_col_truncate_unpack(session, pp, end, - &fileid, &start_recno, &stop_recno)); - GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor); - - /* Set up the cursors. */ - if (start_recno == WT_RECNO_OOB) { - start = NULL; - stop = cursor; - } else if (stop_recno == WT_RECNO_OOB) { - start = cursor; - stop = NULL; - } else { - start = cursor; - WT_ERR(__recovery_cursor( - session, r, lsnp, fileid, true, &stop)); - } - - /* Set the keys. */ - if (start != NULL) - start->set_key(start, start_recno); - if (stop != NULL) - stop->set_key(stop, stop_recno); - - WT_TRET(session->iface.truncate(&session->iface, NULL, - start, stop, NULL)); - /* If we opened a duplicate cursor, close it now. */ - if (stop != NULL && stop != cursor) - WT_TRET(stop->close(stop)); - WT_ERR(ret); - break; - - case WT_LOGOP_ROW_MODIFY: - WT_ERR(__wt_logop_row_modify_unpack(session, pp, end, - &fileid, &key, &value)); - GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor); - __wt_cursor_set_raw_key(cursor, &key); - if ((ret = cursor->search(cursor)) != 0) - WT_ERR_NOTFOUND_OK(ret); - else { - /* - * Build/insert a complete value during recovery rather - * than using cursor modify to create a partial update - * (for no particular reason than simplicity). 
- */ - WT_ERR(__wt_modify_apply(cursor, value.data)); - WT_ERR(cursor->insert(cursor)); - } - break; - - case WT_LOGOP_ROW_PUT: - WT_ERR(__wt_logop_row_put_unpack(session, pp, end, - &fileid, &key, &value)); - GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor); - __wt_cursor_set_raw_key(cursor, &key); - __wt_cursor_set_raw_value(cursor, &value); - WT_ERR(cursor->insert(cursor)); - break; - - case WT_LOGOP_ROW_REMOVE: - WT_ERR(__wt_logop_row_remove_unpack(session, pp, end, - &fileid, &key)); - GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor); - __wt_cursor_set_raw_key(cursor, &key); - WT_ERR(cursor->remove(cursor)); - break; - - case WT_LOGOP_ROW_TRUNCATE: - WT_ERR(__wt_logop_row_truncate_unpack(session, pp, end, - &fileid, &start_key, &stop_key, &mode)); - GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor); - /* Set up the cursors. */ - start = stop = NULL; - switch (mode) { - case WT_TXN_TRUNC_ALL: - /* Both cursors stay NULL. */ - break; - case WT_TXN_TRUNC_BOTH: - start = cursor; - WT_ERR(__recovery_cursor( - session, r, lsnp, fileid, true, &stop)); - break; - case WT_TXN_TRUNC_START: - start = cursor; - break; - case WT_TXN_TRUNC_STOP: - stop = cursor; - break; - default: - WT_ERR(__wt_illegal_value(session, mode)); - } - - /* Set the keys. */ - if (start != NULL) - __wt_cursor_set_raw_key(start, &start_key); - if (stop != NULL) - __wt_cursor_set_raw_key(stop, &stop_key); - - WT_TRET(session->iface.truncate(&session->iface, NULL, - start, stop, NULL)); - /* If we opened a duplicate cursor, close it now. */ - if (stop != NULL && stop != cursor) - WT_TRET(stop->close(stop)); - WT_ERR(ret); - break; - case WT_LOGOP_TXN_TIMESTAMP: - /* - * Timestamp records are informational only. We have to - * unpack it to properly move forward in the log record - * to the next operation, but otherwise ignore. 
- */ - WT_ERR(__wt_logop_txn_timestamp_unpack(session, pp, end, &t_sec, - &t_nsec, &commit, &durable, &first, &prepare, &read)); - break; - default: - WT_ERR(__wt_illegal_value(session, optype)); - } + WT_CURSOR *cursor, *start, *stop; + WT_DECL_RET; + WT_ITEM key, start_key, stop_key, value; + WT_SESSION_IMPL *session; + wt_timestamp_t commit, durable, first, prepare, read; + uint64_t recno, start_recno, stop_recno, t_nsec, t_sec; + uint32_t fileid, mode, optype, opsize; + + session = r->session; + cursor = NULL; + + /* Peek at the size and the type. */ + WT_ERR(__wt_logop_read(session, pp, end, &optype, &opsize)); + end = *pp + opsize; + + /* + * If it is an operation type that should be ignored, we're done. Note that file ids within + * known operations also use the same macros to indicate that operation should be ignored. + */ + if (WT_LOGOP_IS_IGNORED(optype)) { + *pp += opsize; + goto done; + } + + switch (optype) { + case WT_LOGOP_COL_MODIFY: + WT_ERR(__wt_logop_col_modify_unpack(session, pp, end, &fileid, &recno, &value)); + GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor); + cursor->set_key(cursor, recno); + if ((ret = cursor->search(cursor)) != 0) + WT_ERR_NOTFOUND_OK(ret); + else { + /* + * Build/insert a complete value during recovery rather + * than using cursor modify to create a partial update + * (for no particular reason than simplicity). 
+ */ + WT_ERR(__wt_modify_apply(cursor, value.data)); + WT_ERR(cursor->insert(cursor)); + } + break; + + case WT_LOGOP_COL_PUT: + WT_ERR(__wt_logop_col_put_unpack(session, pp, end, &fileid, &recno, &value)); + GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor); + cursor->set_key(cursor, recno); + __wt_cursor_set_raw_value(cursor, &value); + WT_ERR(cursor->insert(cursor)); + break; + + case WT_LOGOP_COL_REMOVE: + WT_ERR(__wt_logop_col_remove_unpack(session, pp, end, &fileid, &recno)); + GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor); + cursor->set_key(cursor, recno); + WT_ERR(cursor->remove(cursor)); + break; + + case WT_LOGOP_COL_TRUNCATE: + WT_ERR( + __wt_logop_col_truncate_unpack(session, pp, end, &fileid, &start_recno, &stop_recno)); + GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor); + + /* Set up the cursors. */ + if (start_recno == WT_RECNO_OOB) { + start = NULL; + stop = cursor; + } else if (stop_recno == WT_RECNO_OOB) { + start = cursor; + stop = NULL; + } else { + start = cursor; + WT_ERR(__recovery_cursor(session, r, lsnp, fileid, true, &stop)); + } + + /* Set the keys. */ + if (start != NULL) + start->set_key(start, start_recno); + if (stop != NULL) + stop->set_key(stop, stop_recno); + + WT_TRET(session->iface.truncate(&session->iface, NULL, start, stop, NULL)); + /* If we opened a duplicate cursor, close it now. */ + if (stop != NULL && stop != cursor) + WT_TRET(stop->close(stop)); + WT_ERR(ret); + break; + + case WT_LOGOP_ROW_MODIFY: + WT_ERR(__wt_logop_row_modify_unpack(session, pp, end, &fileid, &key, &value)); + GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor); + __wt_cursor_set_raw_key(cursor, &key); + if ((ret = cursor->search(cursor)) != 0) + WT_ERR_NOTFOUND_OK(ret); + else { + /* + * Build/insert a complete value during recovery rather + * than using cursor modify to create a partial update + * (for no particular reason than simplicity). 
+ */ + WT_ERR(__wt_modify_apply(cursor, value.data)); + WT_ERR(cursor->insert(cursor)); + } + break; + + case WT_LOGOP_ROW_PUT: + WT_ERR(__wt_logop_row_put_unpack(session, pp, end, &fileid, &key, &value)); + GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor); + __wt_cursor_set_raw_key(cursor, &key); + __wt_cursor_set_raw_value(cursor, &value); + WT_ERR(cursor->insert(cursor)); + break; + + case WT_LOGOP_ROW_REMOVE: + WT_ERR(__wt_logop_row_remove_unpack(session, pp, end, &fileid, &key)); + GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor); + __wt_cursor_set_raw_key(cursor, &key); + WT_ERR(cursor->remove(cursor)); + break; + + case WT_LOGOP_ROW_TRUNCATE: + WT_ERR( + __wt_logop_row_truncate_unpack(session, pp, end, &fileid, &start_key, &stop_key, &mode)); + GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor); + /* Set up the cursors. */ + start = stop = NULL; + switch (mode) { + case WT_TXN_TRUNC_ALL: + /* Both cursors stay NULL. */ + break; + case WT_TXN_TRUNC_BOTH: + start = cursor; + WT_ERR(__recovery_cursor(session, r, lsnp, fileid, true, &stop)); + break; + case WT_TXN_TRUNC_START: + start = cursor; + break; + case WT_TXN_TRUNC_STOP: + stop = cursor; + break; + default: + WT_ERR(__wt_illegal_value(session, mode)); + } + + /* Set the keys. */ + if (start != NULL) + __wt_cursor_set_raw_key(start, &start_key); + if (stop != NULL) + __wt_cursor_set_raw_key(stop, &stop_key); + + WT_TRET(session->iface.truncate(&session->iface, NULL, start, stop, NULL)); + /* If we opened a duplicate cursor, close it now. */ + if (stop != NULL && stop != cursor) + WT_TRET(stop->close(stop)); + WT_ERR(ret); + break; + case WT_LOGOP_TXN_TIMESTAMP: + /* + * Timestamp records are informational only. We have to unpack it to properly move forward + * in the log record to the next operation, but otherwise ignore. 
+ */ + WT_ERR(__wt_logop_txn_timestamp_unpack( + session, pp, end, &t_sec, &t_nsec, &commit, &durable, &first, &prepare, &read)); + break; + default: + WT_ERR(__wt_illegal_value(session, optype)); + } done: - /* Reset the cursor so it doesn't block eviction. */ - if (cursor != NULL) - WT_ERR(cursor->reset(cursor)); - return (0); - -err: __wt_err(session, ret, - "operation apply failed during recovery: operation type %" - PRIu32 " at LSN %" PRIu32 "/%" PRIu32, - optype, lsnp->l.file, lsnp->l.offset); - return (ret); + /* Reset the cursor so it doesn't block eviction. */ + if (cursor != NULL) + WT_ERR(cursor->reset(cursor)); + return (0); + +err: + __wt_err(session, ret, "operation apply failed during recovery: operation type %" PRIu32 + " at LSN %" PRIu32 "/%" PRIu32, + optype, lsnp->l.file, lsnp->l.offset); + return (ret); } /* * __txn_commit_apply -- - * Apply a commit record during recovery. + * Apply a commit record during recovery. */ static int -__txn_commit_apply( - WT_RECOVERY *r, WT_LSN *lsnp, const uint8_t **pp, const uint8_t *end) +__txn_commit_apply(WT_RECOVERY *r, WT_LSN *lsnp, const uint8_t **pp, const uint8_t *end) { - /* The logging subsystem zero-pads records. */ - while (*pp < end && **pp) - WT_RET(__txn_op_apply(r, lsnp, pp, end)); + /* The logging subsystem zero-pads records. */ + while (*pp < end && **pp) + WT_RET(__txn_op_apply(r, lsnp, pp, end)); - return (0); + return (0); } /* * __txn_log_recover -- - * Roll the log forward to recover committed changes. + * Roll the log forward to recover committed changes. 
*/ static int -__txn_log_recover(WT_SESSION_IMPL *session, - WT_ITEM *logrec, WT_LSN *lsnp, WT_LSN *next_lsnp, - void *cookie, int firstrecord) +__txn_log_recover(WT_SESSION_IMPL *session, WT_ITEM *logrec, WT_LSN *lsnp, WT_LSN *next_lsnp, + void *cookie, int firstrecord) { - WT_DECL_RET; - WT_RECOVERY *r; - uint64_t txnid_unused; - uint32_t rectype; - const uint8_t *end, *p; - - r = cookie; - p = WT_LOG_SKIP_HEADER(logrec->data); - end = (const uint8_t *)logrec->data + logrec->size; - WT_UNUSED(firstrecord); - - /* First, peek at the log record type. */ - WT_RET(__wt_logrec_read(session, &p, end, &rectype)); - - /* - * Record the highest LSN we process during the metadata phase. - * If not the metadata phase, then stop at that LSN. - */ - if (r->metadata_only) - r->max_rec_lsn = *next_lsnp; - else if (__wt_log_cmp(lsnp, &r->max_rec_lsn) >= 0) - return (0); - - switch (rectype) { - case WT_LOGREC_CHECKPOINT: - if (r->metadata_only) - WT_RET(__wt_txn_checkpoint_logread( - session, &p, end, &r->ckpt_lsn)); - break; - - case WT_LOGREC_COMMIT: - if ((ret = __wt_vunpack_uint( - &p, WT_PTRDIFF(end, p), &txnid_unused)) != 0) - WT_RET_MSG( - session, ret, "txn_log_recover: unpack failure"); - WT_RET(__txn_commit_apply(r, lsnp, &p, end)); - break; - } - - return (0); + WT_DECL_RET; + WT_RECOVERY *r; + uint64_t txnid_unused; + uint32_t rectype; + const uint8_t *end, *p; + + r = cookie; + p = WT_LOG_SKIP_HEADER(logrec->data); + end = (const uint8_t *)logrec->data + logrec->size; + WT_UNUSED(firstrecord); + + /* First, peek at the log record type. */ + WT_RET(__wt_logrec_read(session, &p, end, &rectype)); + + /* + * Record the highest LSN we process during the metadata phase. If not the metadata phase, then + * stop at that LSN. 
+ */ + if (r->metadata_only) + r->max_rec_lsn = *next_lsnp; + else if (__wt_log_cmp(lsnp, &r->max_rec_lsn) >= 0) + return (0); + + switch (rectype) { + case WT_LOGREC_CHECKPOINT: + if (r->metadata_only) + WT_RET(__wt_txn_checkpoint_logread(session, &p, end, &r->ckpt_lsn)); + break; + + case WT_LOGREC_COMMIT: + if ((ret = __wt_vunpack_uint(&p, WT_PTRDIFF(end, p), &txnid_unused)) != 0) + WT_RET_MSG(session, ret, "txn_log_recover: unpack failure"); + WT_RET(__txn_commit_apply(r, lsnp, &p, end)); + break; + } + + return (0); } /* * __recovery_set_checkpoint_timestamp -- - * Set the checkpoint timestamp as retrieved from the metadata file. + * Set the checkpoint timestamp as retrieved from the metadata file. */ static int __recovery_set_checkpoint_timestamp(WT_RECOVERY *r) { - WT_CONFIG_ITEM cval; - WT_CONNECTION_IMPL *conn; - WT_DECL_RET; - WT_SESSION_IMPL *session; - wt_timestamp_t ckpt_timestamp; - char ts_string[WT_TS_INT_STRING_SIZE], *sys_config; - - sys_config = NULL; - - session = r->session; - conn = S2C(session); - /* - * Read the system checkpoint information from the metadata file and - * save the stable timestamp of the last checkpoint for later query. - * This gets saved in the connection. - */ - ckpt_timestamp = 0; - - /* Search in the metadata for the system information. */ - WT_ERR_NOTFOUND_OK( - __wt_metadata_search(session, WT_SYSTEM_CKPT_URI, &sys_config)); - if (sys_config != NULL) { - WT_CLEAR(cval); - WT_ERR_NOTFOUND_OK(__wt_config_getones( - session, sys_config, "checkpoint_timestamp", &cval)); - if (cval.len != 0) { - __wt_verbose(session, WT_VERB_RECOVERY, - "Recovery timestamp %.*s", - (int)cval.len, cval.str); - WT_ERR(__wt_txn_parse_timestamp_raw(session, - "recovery", &ckpt_timestamp, &cval)); - } - } - - /* - * Set the recovery checkpoint timestamp and the metadata checkpoint - * timestamp so that the checkpoint after recovery writes the correct - * value into the metadata. 
- */ - conn->txn_global.meta_ckpt_timestamp = - conn->txn_global.recovery_timestamp = ckpt_timestamp; - - __wt_verbose(session, - WT_VERB_RECOVERY | WT_VERB_RECOVERY_PROGRESS, - "Set global recovery timestamp: %s", - __wt_timestamp_to_string( - conn->txn_global.recovery_timestamp, ts_string)); - -err: __wt_free(session, sys_config); - return (ret); + WT_CONFIG_ITEM cval; + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + WT_SESSION_IMPL *session; + wt_timestamp_t ckpt_timestamp; + char ts_string[WT_TS_INT_STRING_SIZE], *sys_config; + + sys_config = NULL; + + session = r->session; + conn = S2C(session); + /* + * Read the system checkpoint information from the metadata file and save the stable timestamp + * of the last checkpoint for later query. This gets saved in the connection. + */ + ckpt_timestamp = 0; + + /* Search in the metadata for the system information. */ + WT_ERR_NOTFOUND_OK(__wt_metadata_search(session, WT_SYSTEM_CKPT_URI, &sys_config)); + if (sys_config != NULL) { + WT_CLEAR(cval); + WT_ERR_NOTFOUND_OK(__wt_config_getones(session, sys_config, "checkpoint_timestamp", &cval)); + if (cval.len != 0) { + __wt_verbose( + session, WT_VERB_RECOVERY, "Recovery timestamp %.*s", (int)cval.len, cval.str); + WT_ERR(__wt_txn_parse_timestamp_raw(session, "recovery", &ckpt_timestamp, &cval)); + } + } + + /* + * Set the recovery checkpoint timestamp and the metadata checkpoint timestamp so that the + * checkpoint after recovery writes the correct value into the metadata. + */ + conn->txn_global.meta_ckpt_timestamp = conn->txn_global.recovery_timestamp = ckpt_timestamp; + + __wt_verbose(session, WT_VERB_RECOVERY | WT_VERB_RECOVERY_PROGRESS, + "Set global recovery timestamp: %s", + __wt_timestamp_to_string(conn->txn_global.recovery_timestamp, ts_string)); + +err: + __wt_free(session, sys_config); + return (ret); } /* * __recovery_setup_file -- - * Set up the recovery slot for a file. + * Set up the recovery slot for a file. 
*/ static int __recovery_setup_file(WT_RECOVERY *r, const char *uri, const char *config) { - WT_CONFIG_ITEM cval; - WT_LSN lsn; - uint32_t fileid, lsnfile, lsnoffset; - - WT_RET(__wt_config_getones(r->session, config, "id", &cval)); - fileid = (uint32_t)cval.val; - - /* Track the largest file ID we have seen. */ - if (fileid > r->max_fileid) - r->max_fileid = fileid; - - if (r->nfiles <= fileid) { - WT_RET(__wt_realloc_def( - r->session, &r->file_alloc, fileid + 1, &r->files)); - r->nfiles = fileid + 1; - } - - if (r->files[fileid].uri != NULL) - WT_PANIC_RET(r->session, WT_PANIC, - "metadata corruption: files %s and %s have the same " - "file ID %u", - uri, r->files[fileid].uri, fileid); - WT_RET(__wt_strdup(r->session, uri, &r->files[fileid].uri)); - WT_RET( - __wt_config_getones(r->session, config, "checkpoint_lsn", &cval)); - /* If there is checkpoint logged for the file, apply everything. */ - if (cval.type != WT_CONFIG_ITEM_STRUCT) - WT_INIT_LSN(&lsn); - /* NOLINTNEXTLINE(cert-err34-c) */ - else if (sscanf(cval.str, - "(%" SCNu32 ",%" SCNu32 ")", &lsnfile, &lsnoffset) == 2) - WT_SET_LSN(&lsn, lsnfile, lsnoffset); - else - WT_RET_MSG(r->session, EINVAL, - "Failed to parse checkpoint LSN '%.*s'", - (int)cval.len, cval.str); - r->files[fileid].ckpt_lsn = lsn; - - __wt_verbose(r->session, WT_VERB_RECOVERY, - "Recovering %s with id %" PRIu32 " @ (%" PRIu32 ", %" PRIu32 ")", - uri, fileid, lsn.l.file, lsn.l.offset); - - if ((!WT_IS_MAX_LSN(&lsn) && !WT_IS_INIT_LSN(&lsn)) && - (WT_IS_MAX_LSN(&r->max_ckpt_lsn) || - __wt_log_cmp(&lsn, &r->max_ckpt_lsn) > 0)) - r->max_ckpt_lsn = lsn; - - return (0); + WT_CONFIG_ITEM cval; + WT_LSN lsn; + uint32_t fileid, lsnfile, lsnoffset; + + WT_RET(__wt_config_getones(r->session, config, "id", &cval)); + fileid = (uint32_t)cval.val; + + /* Track the largest file ID we have seen. 
*/ + if (fileid > r->max_fileid) + r->max_fileid = fileid; + + if (r->nfiles <= fileid) { + WT_RET(__wt_realloc_def(r->session, &r->file_alloc, fileid + 1, &r->files)); + r->nfiles = fileid + 1; + } + + if (r->files[fileid].uri != NULL) + WT_PANIC_RET(r->session, WT_PANIC, + "metadata corruption: files %s and %s have the same " + "file ID %u", + uri, r->files[fileid].uri, fileid); + WT_RET(__wt_strdup(r->session, uri, &r->files[fileid].uri)); + WT_RET(__wt_config_getones(r->session, config, "checkpoint_lsn", &cval)); + /* If there is checkpoint logged for the file, apply everything. */ + if (cval.type != WT_CONFIG_ITEM_STRUCT) + WT_INIT_LSN(&lsn); + /* NOLINTNEXTLINE(cert-err34-c) */ + else if (sscanf(cval.str, "(%" SCNu32 ",%" SCNu32 ")", &lsnfile, &lsnoffset) == 2) + WT_SET_LSN(&lsn, lsnfile, lsnoffset); + else + WT_RET_MSG( + r->session, EINVAL, "Failed to parse checkpoint LSN '%.*s'", (int)cval.len, cval.str); + r->files[fileid].ckpt_lsn = lsn; + + __wt_verbose(r->session, WT_VERB_RECOVERY, + "Recovering %s with id %" PRIu32 " @ (%" PRIu32 ", %" PRIu32 ")", uri, fileid, lsn.l.file, + lsn.l.offset); + + if ((!WT_IS_MAX_LSN(&lsn) && !WT_IS_INIT_LSN(&lsn)) && + (WT_IS_MAX_LSN(&r->max_ckpt_lsn) || __wt_log_cmp(&lsn, &r->max_ckpt_lsn) > 0)) + r->max_ckpt_lsn = lsn; + + return (0); } /* * __recovery_free -- - * Free the recovery state. + * Free the recovery state. 
*/ static int __recovery_free(WT_RECOVERY *r) { - WT_CURSOR *c; - WT_DECL_RET; - WT_SESSION_IMPL *session; - u_int i; - - session = r->session; - for (i = 0; i < r->nfiles; i++) { - __wt_free(session, r->files[i].uri); - if ((c = r->files[i].c) != NULL) - WT_TRET(c->close(c)); - } - - __wt_free(session, r->files); - return (ret); + WT_CURSOR *c; + WT_DECL_RET; + WT_SESSION_IMPL *session; + u_int i; + + session = r->session; + for (i = 0; i < r->nfiles; i++) { + __wt_free(session, r->files[i].uri); + if ((c = r->files[i].c) != NULL) + WT_TRET(c->close(c)); + } + + __wt_free(session, r->files); + return (ret); } /* * __recovery_file_scan -- - * Scan the files referenced from the metadata and gather information - * about them for recovery. + * Scan the files referenced from the metadata and gather information about them for recovery. */ static int __recovery_file_scan(WT_RECOVERY *r) { - WT_CURSOR *c; - WT_DECL_RET; - int cmp; - const char *uri, *config; - - /* Scan through all files in the metadata. */ - c = r->files[0].c; - c->set_key(c, "file:"); - if ((ret = c->search_near(c, &cmp)) != 0) { - /* Is the metadata empty? */ - WT_RET_NOTFOUND_OK(ret); - return (0); - } - if (cmp < 0) - WT_RET_NOTFOUND_OK(c->next(c)); - for (; ret == 0; ret = c->next(c)) { - WT_RET(c->get_key(c, &uri)); - if (!WT_PREFIX_MATCH(uri, "file:")) - break; - WT_RET(c->get_value(c, &config)); - WT_RET(__recovery_setup_file(r, uri, config)); - } - WT_RET_NOTFOUND_OK(ret); - return (0); + WT_CURSOR *c; + WT_DECL_RET; + int cmp; + const char *uri, *config; + + /* Scan through all files in the metadata. */ + c = r->files[0].c; + c->set_key(c, "file:"); + if ((ret = c->search_near(c, &cmp)) != 0) { + /* Is the metadata empty? 
*/ + WT_RET_NOTFOUND_OK(ret); + return (0); + } + if (cmp < 0) + WT_RET_NOTFOUND_OK(c->next(c)); + for (; ret == 0; ret = c->next(c)) { + WT_RET(c->get_key(c, &uri)); + if (!WT_PREFIX_MATCH(uri, "file:")) + break; + WT_RET(c->get_value(c, &config)); + WT_RET(__recovery_setup_file(r, uri, config)); + } + WT_RET_NOTFOUND_OK(ret); + return (0); } /* * __wt_txn_recover -- - * Run recovery. + * Run recovery. */ int __wt_txn_recover(WT_SESSION_IMPL *session) { - WT_CONNECTION_IMPL *conn; - WT_CURSOR *metac; - WT_DECL_RET; - WT_RECOVERY r; - WT_RECOVERY_FILE *metafile; - char *config; - bool do_checkpoint, eviction_started, needs_rec, was_backup; - - conn = S2C(session); - WT_CLEAR(r); - WT_INIT_LSN(&r.ckpt_lsn); - config = NULL; - do_checkpoint = true; - eviction_started = false; - was_backup = F_ISSET(conn, WT_CONN_WAS_BACKUP); - - /* We need a real session for recovery. */ - WT_RET(__wt_open_internal_session(conn, "txn-recover", - false, WT_SESSION_NO_LOGGING, &session)); - r.session = session; - WT_MAX_LSN(&r.max_ckpt_lsn); - WT_MAX_LSN(&r.max_rec_lsn); - conn->txn_global.recovery_timestamp = - conn->txn_global.meta_ckpt_timestamp = 0; - - F_SET(conn, WT_CONN_RECOVERING); - WT_ERR(__wt_metadata_search(session, WT_METAFILE_URI, &config)); - WT_ERR(__recovery_setup_file(&r, WT_METAFILE_URI, config)); - WT_ERR(__wt_metadata_cursor_open(session, NULL, &metac)); - metafile = &r.files[WT_METAFILE_ID]; - metafile->c = metac; - - /* - * If no log was found (including if logging is disabled), or if the - * last checkpoint was done with logging disabled, recovery should not - * run. Scan the metadata to figure out the largest file ID. - */ - if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_EXISTED) || - WT_IS_MAX_LSN(&metafile->ckpt_lsn)) { - /* - * Detect if we're going from logging disabled to enabled. - * We need to know this to verify LSNs and start at the correct - * log file later. 
If someone ran with logging, then disabled - * it and removed all the log files and then turned logging back - * on, we have to start logging in the log file number that is - * larger than any checkpoint LSN we have from the earlier time. - */ - WT_ERR(__recovery_file_scan(&r)); - /* - * The array can be re-allocated in recovery_file_scan. Reset - * our pointer after scanning all the files. - */ - metafile = &r.files[WT_METAFILE_ID]; - conn->next_file_id = r.max_fileid; - - if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED) && - WT_IS_MAX_LSN(&metafile->ckpt_lsn) && - !WT_IS_MAX_LSN(&r.max_ckpt_lsn)) - WT_ERR(__wt_log_reset(session, r.max_ckpt_lsn.l.file)); - else - do_checkpoint = false; - goto done; - } - - /* - * First, do a pass through the log to recover the metadata, and - * establish the last checkpoint LSN. Skip this when opening a hot - * backup: we already have the correct metadata in that case. - * - * If we're running with salvage and we hit an error, we ignore it - * and continue. In salvage we want to recover whatever part of the - * data we can from the last checkpoint up until whatever problem we - * detect in the log file. In salvage, we ignore errors from scanning - * the log so recovery can continue. Other errors remain errors. - */ - if (!was_backup) { - r.metadata_only = true; - /* - * If this is a read-only connection, check if the checkpoint - * LSN in the metadata file is up to date, indicating a clean - * shutdown. - */ - if (F_ISSET(conn, WT_CONN_READONLY)) { - WT_ERR(__wt_log_needs_recovery( - session, &metafile->ckpt_lsn, &needs_rec)); - if (needs_rec) - WT_ERR_MSG(session, WT_RUN_RECOVERY, - "Read-only database needs recovery"); - } - if (WT_IS_INIT_LSN(&metafile->ckpt_lsn)) - ret = __wt_log_scan(session, - NULL, WT_LOGSCAN_FIRST, __txn_log_recover, &r); - else { - /* - * Start at the last checkpoint LSN referenced in the - * metadata. 
If we see the end of a checkpoint while - * scanning, we will change the full scan to start from - * there. - */ - r.ckpt_lsn = metafile->ckpt_lsn; - ret = __wt_log_scan(session, - &metafile->ckpt_lsn, WT_LOGSCAN_RECOVER_METADATA, - __txn_log_recover, &r); - } - if (F_ISSET(conn, WT_CONN_SALVAGE)) - ret = 0; - /* - * If log scan couldn't find a file we expected to be around, - * this indicates a corruption of some sort. - */ - if (ret == ENOENT) { - F_SET(conn, WT_CONN_DATA_CORRUPTION); - ret = WT_ERROR; - } - - WT_ERR(ret); - } - - /* Scan the metadata to find the live files and their IDs. */ - WT_ERR(__recovery_file_scan(&r)); - /* - * Clear this out. We no longer need it and it could have been - * re-allocated when scanning the files. - */ - WT_NOT_READ(metafile, NULL); - - /* - * We no longer need the metadata cursor: close it to avoid pinning any - * resources that could block eviction during recovery. - */ - r.files[0].c = NULL; - WT_ERR(metac->close(metac)); - - /* - * Now, recover all the files apart from the metadata. - * Pass WT_LOGSCAN_RECOVER so that old logs get truncated. - */ - r.metadata_only = false; - __wt_verbose(session, WT_VERB_RECOVERY | WT_VERB_RECOVERY_PROGRESS, - "Main recovery loop: starting at %" PRIu32 "/%" PRIu32 - " to %" PRIu32 "/%" PRIu32, r.ckpt_lsn.l.file, r.ckpt_lsn.l.offset, - r.max_rec_lsn.l.file, r.max_rec_lsn.l.offset); - WT_ERR(__wt_log_needs_recovery(session, &r.ckpt_lsn, &needs_rec)); - /* - * Check if the database was shut down cleanly. If not - * return an error if the user does not want automatic - * recovery. 
- */ - if (needs_rec && - (FLD_ISSET(conn->log_flags, WT_CONN_LOG_RECOVER_ERR) || - F_ISSET(conn, WT_CONN_READONLY))) { - if (F_ISSET(conn, WT_CONN_READONLY)) - WT_ERR_MSG(session, WT_RUN_RECOVERY, - "Read-only database needs recovery"); - WT_ERR_MSG(session, WT_RUN_RECOVERY, "Database needs recovery"); - } - - if (F_ISSET(conn, WT_CONN_READONLY)) { - do_checkpoint = false; - goto done; - } - - /* - * Recovery can touch more data than fits in cache, so it relies on - * regular eviction to manage paging. Start eviction threads for - * recovery without LAS cursors. - */ - WT_ERR(__wt_evict_create(session)); - eviction_started = true; - - /* - * Always run recovery even if it was a clean shutdown only if - * this is not a read-only connection. - * We can consider skipping it in the future. - */ - if (needs_rec) - FLD_SET(conn->log_flags, WT_CONN_LOG_RECOVER_DIRTY); - if (WT_IS_INIT_LSN(&r.ckpt_lsn)) - ret = __wt_log_scan(session, NULL, - WT_LOGSCAN_FIRST | WT_LOGSCAN_RECOVER, - __txn_log_recover, &r); - else - ret = __wt_log_scan(session, &r.ckpt_lsn, - WT_LOGSCAN_RECOVER, __txn_log_recover, &r); - if (F_ISSET(conn, WT_CONN_SALVAGE)) - ret = 0; - WT_ERR(ret); - - conn->next_file_id = r.max_fileid; - -done: WT_ERR(__recovery_set_checkpoint_timestamp(&r)); - if (do_checkpoint) - /* - * Forcibly log a checkpoint so the next open is fast and keep - * the metadata up to date with the checkpoint LSN and - * archiving. - */ - WT_ERR(session->iface.checkpoint(&session->iface, "force=1")); - - /* - * If we're downgrading and have newer log files, force an archive, - * no matter what the archive setting is. 
- */ - if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_FORCE_DOWNGRADE)) - WT_ERR(__wt_log_truncate_files(session, NULL, true)); - FLD_SET(conn->log_flags, WT_CONN_LOG_RECOVER_DONE); - -err: WT_TRET(__recovery_free(&r)); - __wt_free(session, config); - FLD_CLR(conn->log_flags, WT_CONN_LOG_RECOVER_DIRTY); - - if (ret != 0) { - FLD_SET(conn->log_flags, WT_CONN_LOG_RECOVER_FAILED); - __wt_err(session, ret, "Recovery failed"); - } - - /* - * Destroy the eviction threads that were started in support of - * recovery. They will be restarted once the lookaside table is - * created. - */ - if (eviction_started) - WT_TRET(__wt_evict_destroy(session)); - - WT_TRET(session->iface.close(&session->iface, NULL)); - F_CLR(conn, WT_CONN_RECOVERING); - - return (ret); + WT_CONNECTION_IMPL *conn; + WT_CURSOR *metac; + WT_DECL_RET; + WT_RECOVERY r; + WT_RECOVERY_FILE *metafile; + char *config; + bool do_checkpoint, eviction_started, needs_rec, was_backup; + + conn = S2C(session); + WT_CLEAR(r); + WT_INIT_LSN(&r.ckpt_lsn); + config = NULL; + do_checkpoint = true; + eviction_started = false; + was_backup = F_ISSET(conn, WT_CONN_WAS_BACKUP); + + /* We need a real session for recovery. */ + WT_RET(__wt_open_internal_session(conn, "txn-recover", false, WT_SESSION_NO_LOGGING, &session)); + r.session = session; + WT_MAX_LSN(&r.max_ckpt_lsn); + WT_MAX_LSN(&r.max_rec_lsn); + conn->txn_global.recovery_timestamp = conn->txn_global.meta_ckpt_timestamp = 0; + + F_SET(conn, WT_CONN_RECOVERING); + WT_ERR(__wt_metadata_search(session, WT_METAFILE_URI, &config)); + WT_ERR(__recovery_setup_file(&r, WT_METAFILE_URI, config)); + WT_ERR(__wt_metadata_cursor_open(session, NULL, &metac)); + metafile = &r.files[WT_METAFILE_ID]; + metafile->c = metac; + + /* + * If no log was found (including if logging is disabled), or if the last checkpoint was done + * with logging disabled, recovery should not run. Scan the metadata to figure out the largest + * file ID. 
+ */ + if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_EXISTED) || WT_IS_MAX_LSN(&metafile->ckpt_lsn)) { + /* + * Detect if we're going from logging disabled to enabled. We need to know this to verify + * LSNs and start at the correct log file later. If someone ran with logging, then disabled + * it and removed all the log files and then turned logging back on, we have to start + * logging in the log file number that is larger than any checkpoint LSN we have from the + * earlier time. + */ + WT_ERR(__recovery_file_scan(&r)); + /* + * The array can be re-allocated in recovery_file_scan. Reset our pointer after scanning all + * the files. + */ + metafile = &r.files[WT_METAFILE_ID]; + conn->next_file_id = r.max_fileid; + + if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED) && WT_IS_MAX_LSN(&metafile->ckpt_lsn) && + !WT_IS_MAX_LSN(&r.max_ckpt_lsn)) + WT_ERR(__wt_log_reset(session, r.max_ckpt_lsn.l.file)); + else + do_checkpoint = false; + goto done; + } + + /* + * First, do a pass through the log to recover the metadata, and + * establish the last checkpoint LSN. Skip this when opening a hot + * backup: we already have the correct metadata in that case. + * + * If we're running with salvage and we hit an error, we ignore it + * and continue. In salvage we want to recover whatever part of the + * data we can from the last checkpoint up until whatever problem we + * detect in the log file. In salvage, we ignore errors from scanning + * the log so recovery can continue. Other errors remain errors. + */ + if (!was_backup) { + r.metadata_only = true; + /* + * If this is a read-only connection, check if the checkpoint LSN in the metadata file is up + * to date, indicating a clean shutdown. 
+ */ + if (F_ISSET(conn, WT_CONN_READONLY)) { + WT_ERR(__wt_log_needs_recovery(session, &metafile->ckpt_lsn, &needs_rec)); + if (needs_rec) + WT_ERR_MSG(session, WT_RUN_RECOVERY, "Read-only database needs recovery"); + } + if (WT_IS_INIT_LSN(&metafile->ckpt_lsn)) + ret = __wt_log_scan(session, NULL, WT_LOGSCAN_FIRST, __txn_log_recover, &r); + else { + /* + * Start at the last checkpoint LSN referenced in the metadata. If we see the end of a + * checkpoint while scanning, we will change the full scan to start from there. + */ + r.ckpt_lsn = metafile->ckpt_lsn; + ret = __wt_log_scan( + session, &metafile->ckpt_lsn, WT_LOGSCAN_RECOVER_METADATA, __txn_log_recover, &r); + } + if (F_ISSET(conn, WT_CONN_SALVAGE)) + ret = 0; + /* + * If log scan couldn't find a file we expected to be around, this indicates a corruption of + * some sort. + */ + if (ret == ENOENT) { + F_SET(conn, WT_CONN_DATA_CORRUPTION); + ret = WT_ERROR; + } + + WT_ERR(ret); + } + + /* Scan the metadata to find the live files and their IDs. */ + WT_ERR(__recovery_file_scan(&r)); + /* + * Clear this out. We no longer need it and it could have been re-allocated when scanning the + * files. + */ + WT_NOT_READ(metafile, NULL); + + /* + * We no longer need the metadata cursor: close it to avoid pinning any resources that could + * block eviction during recovery. + */ + r.files[0].c = NULL; + WT_ERR(metac->close(metac)); + + /* + * Now, recover all the files apart from the metadata. Pass WT_LOGSCAN_RECOVER so that old logs + * get truncated. + */ + r.metadata_only = false; + __wt_verbose(session, WT_VERB_RECOVERY | WT_VERB_RECOVERY_PROGRESS, + "Main recovery loop: starting at %" PRIu32 "/%" PRIu32 " to %" PRIu32 "/%" PRIu32, + r.ckpt_lsn.l.file, r.ckpt_lsn.l.offset, r.max_rec_lsn.l.file, r.max_rec_lsn.l.offset); + WT_ERR(__wt_log_needs_recovery(session, &r.ckpt_lsn, &needs_rec)); + /* + * Check if the database was shut down cleanly. If not return an error if the user does not want + * automatic recovery. 
+ */ + if (needs_rec && + (FLD_ISSET(conn->log_flags, WT_CONN_LOG_RECOVER_ERR) || F_ISSET(conn, WT_CONN_READONLY))) { + if (F_ISSET(conn, WT_CONN_READONLY)) + WT_ERR_MSG(session, WT_RUN_RECOVERY, "Read-only database needs recovery"); + WT_ERR_MSG(session, WT_RUN_RECOVERY, "Database needs recovery"); + } + + if (F_ISSET(conn, WT_CONN_READONLY)) { + do_checkpoint = false; + goto done; + } + + /* + * Recovery can touch more data than fits in cache, so it relies on regular eviction to manage + * paging. Start eviction threads for recovery without LAS cursors. + */ + WT_ERR(__wt_evict_create(session)); + eviction_started = true; + + /* + * Always run recovery even if it was a clean shutdown only if this is not a read-only + * connection. We can consider skipping it in the future. + */ + if (needs_rec) + FLD_SET(conn->log_flags, WT_CONN_LOG_RECOVER_DIRTY); + if (WT_IS_INIT_LSN(&r.ckpt_lsn)) + ret = __wt_log_scan( + session, NULL, WT_LOGSCAN_FIRST | WT_LOGSCAN_RECOVER, __txn_log_recover, &r); + else + ret = __wt_log_scan(session, &r.ckpt_lsn, WT_LOGSCAN_RECOVER, __txn_log_recover, &r); + if (F_ISSET(conn, WT_CONN_SALVAGE)) + ret = 0; + WT_ERR(ret); + + conn->next_file_id = r.max_fileid; + +done: + WT_ERR(__recovery_set_checkpoint_timestamp(&r)); + if (do_checkpoint) + /* + * Forcibly log a checkpoint so the next open is fast and keep the metadata up to date with + * the checkpoint LSN and archiving. + */ + WT_ERR(session->iface.checkpoint(&session->iface, "force=1")); + + /* + * If we're downgrading and have newer log files, force an archive, no matter what the archive + * setting is. 
+ */ + if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_FORCE_DOWNGRADE)) + WT_ERR(__wt_log_truncate_files(session, NULL, true)); + FLD_SET(conn->log_flags, WT_CONN_LOG_RECOVER_DONE); + +err: + WT_TRET(__recovery_free(&r)); + __wt_free(session, config); + FLD_CLR(conn->log_flags, WT_CONN_LOG_RECOVER_DIRTY); + + if (ret != 0) { + FLD_SET(conn->log_flags, WT_CONN_LOG_RECOVER_FAILED); + __wt_err(session, ret, "Recovery failed"); + } + + /* + * Destroy the eviction threads that were started in support of recovery. They will be restarted + * once the lookaside table is created. + */ + if (eviction_started) + WT_TRET(__wt_evict_destroy(session)); + + WT_TRET(session->iface.close(&session->iface, NULL)); + F_CLR(conn, WT_CONN_RECOVERING); + + return (ret); } diff --git a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c index 01dad40f85f..97c83c47414 100644 --- a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c +++ b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c @@ -10,556 +10,524 @@ /* * __txn_rollback_to_stable_lookaside_fixup -- - * Remove any updates that need to be rolled back from the lookaside file. + * Remove any updates that need to be rolled back from the lookaside file. */ static int __txn_rollback_to_stable_lookaside_fixup(WT_SESSION_IMPL *session) { - WT_CONNECTION_IMPL *conn; - WT_CURSOR *cursor; - WT_DECL_RET; - WT_ITEM las_key, las_value; - WT_TXN_GLOBAL *txn_global; - wt_timestamp_t durable_timestamp, las_timestamp, rollback_timestamp; - uint64_t las_counter, las_pageid, las_total, las_txnid; - uint32_t las_id, session_flags; - uint8_t prepare_state, upd_type; - - conn = S2C(session); - cursor = NULL; - las_total = 0; - session_flags = 0; /* [-Werror=maybe-uninitialized] */ - - /* - * Copy the stable timestamp, otherwise we'd need to lock it each time - * it's accessed. 
Even though the stable timestamp isn't supposed to be - * updated while rolling back, accessing it without a lock would - * violate protocol. - */ - txn_global = &conn->txn_global; - WT_ORDERED_READ(rollback_timestamp, txn_global->stable_timestamp); - - __wt_las_cursor(session, &cursor, &session_flags); - - /* Discard pages we read as soon as we're done with them. */ - F_SET(session, WT_SESSION_READ_WONT_NEED); - - /* Walk the file. */ - __wt_writelock(session, &conn->cache->las_sweepwalk_lock); - while ((ret = cursor->next(cursor)) == 0) { - ++las_total; - WT_ERR(cursor->get_key(cursor, - &las_pageid, &las_id, &las_counter, &las_key)); - - /* Check the file ID so we can skip durable tables */ - if (las_id >= conn->stable_rollback_maxfile) - WT_PANIC_RET(session, EINVAL, "file ID %" PRIu32 - " in lookaside table larger than max %" PRIu32, - las_id, conn->stable_rollback_maxfile); - if (__bit_test(conn->stable_rollback_bitstring, las_id)) - continue; - - WT_ERR(cursor->get_value( - cursor, &las_txnid, &las_timestamp, - &durable_timestamp, &prepare_state, &upd_type, &las_value)); - - /* - * Entries with no timestamp will have a timestamp of zero, - * which will fail the following check and cause them to never - * be removed. 
- */ - if (rollback_timestamp < durable_timestamp) { - WT_ERR(cursor->remove(cursor)); - WT_STAT_CONN_INCR(session, txn_rollback_las_removed); - --las_total; - } - } - WT_ERR_NOTFOUND_OK(ret); -err: if (ret == 0) { - conn->cache->las_insert_count = las_total; - conn->cache->las_remove_count = 0; - } - __wt_writeunlock(session, &conn->cache->las_sweepwalk_lock); - WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags)); - - F_CLR(session, WT_SESSION_READ_WONT_NEED); - - return (ret); + WT_CONNECTION_IMPL *conn; + WT_CURSOR *cursor; + WT_DECL_RET; + WT_ITEM las_key, las_value; + WT_TXN_GLOBAL *txn_global; + wt_timestamp_t durable_timestamp, las_timestamp, rollback_timestamp; + uint64_t las_counter, las_pageid, las_total, las_txnid; + uint32_t las_id, session_flags; + uint8_t prepare_state, upd_type; + + conn = S2C(session); + cursor = NULL; + las_total = 0; + session_flags = 0; /* [-Werror=maybe-uninitialized] */ + + /* + * Copy the stable timestamp, otherwise we'd need to lock it each time it's accessed. Even + * though the stable timestamp isn't supposed to be updated while rolling back, accessing it + * without a lock would violate protocol. + */ + txn_global = &conn->txn_global; + WT_ORDERED_READ(rollback_timestamp, txn_global->stable_timestamp); + + __wt_las_cursor(session, &cursor, &session_flags); + + /* Discard pages we read as soon as we're done with them. */ + F_SET(session, WT_SESSION_READ_WONT_NEED); + + /* Walk the file. 
*/ + __wt_writelock(session, &conn->cache->las_sweepwalk_lock); + while ((ret = cursor->next(cursor)) == 0) { + ++las_total; + WT_ERR(cursor->get_key(cursor, &las_pageid, &las_id, &las_counter, &las_key)); + + /* Check the file ID so we can skip durable tables */ + if (las_id >= conn->stable_rollback_maxfile) + WT_PANIC_RET(session, EINVAL, + "file ID %" PRIu32 " in lookaside table larger than max %" PRIu32, las_id, + conn->stable_rollback_maxfile); + if (__bit_test(conn->stable_rollback_bitstring, las_id)) + continue; + + WT_ERR(cursor->get_value(cursor, &las_txnid, &las_timestamp, &durable_timestamp, + &prepare_state, &upd_type, &las_value)); + + /* + * Entries with no timestamp will have a timestamp of zero, which will fail the following + * check and cause them to never be removed. + */ + if (rollback_timestamp < durable_timestamp) { + WT_ERR(cursor->remove(cursor)); + WT_STAT_CONN_INCR(session, txn_rollback_las_removed); + --las_total; + } + } + WT_ERR_NOTFOUND_OK(ret); +err: + if (ret == 0) { + conn->cache->las_insert_count = las_total; + conn->cache->las_remove_count = 0; + } + __wt_writeunlock(session, &conn->cache->las_sweepwalk_lock); + WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags)); + + F_CLR(session, WT_SESSION_READ_WONT_NEED); + + return (ret); } /* * __txn_abort_newer_update -- - * Abort updates in an update change with timestamps newer than the - * rollback timestamp. + * Abort updates in an update change with timestamps newer than the rollback timestamp. */ static void -__txn_abort_newer_update(WT_SESSION_IMPL *session, - WT_UPDATE *first_upd, wt_timestamp_t rollback_timestamp) +__txn_abort_newer_update( + WT_SESSION_IMPL *session, WT_UPDATE *first_upd, wt_timestamp_t rollback_timestamp) { - WT_UPDATE *upd; - - for (upd = first_upd; upd != NULL; upd = upd->next) { - /* - * Updates with no timestamp will have a timestamp of zero and - * will never be rolled back. 
If the table is configured for - * strict timestamp checking, assert that all more recent - * updates were also rolled back. - */ - if (upd->txnid == WT_TXN_ABORTED || - upd->start_ts == WT_TS_NONE) { - if (upd == first_upd) - first_upd = upd->next; - } else if (rollback_timestamp < upd->durable_ts) { - /* - * If any updates are aborted, all newer updates - * better be aborted as well. - * - * Timestamp ordering relies on the validations at - * the time of commit. Thus if the table is not - * configured for key consistency check, the - * the timestamps could be out of order here. - */ - WT_ASSERT(session, - !FLD_ISSET(S2BT(session)->assert_flags, - WT_ASSERT_COMMIT_TS_KEYS) || - upd == first_upd); - first_upd = upd->next; - - upd->txnid = WT_TXN_ABORTED; - WT_STAT_CONN_INCR(session, txn_rollback_upd_aborted); - upd->durable_ts = upd->start_ts = WT_TS_NONE; - } - } + WT_UPDATE *upd; + + for (upd = first_upd; upd != NULL; upd = upd->next) { + /* + * Updates with no timestamp will have a timestamp of zero and will never be rolled back. If + * the table is configured for strict timestamp checking, assert that all more recent + * updates were also rolled back. + */ + if (upd->txnid == WT_TXN_ABORTED || upd->start_ts == WT_TS_NONE) { + if (upd == first_upd) + first_upd = upd->next; + } else if (rollback_timestamp < upd->durable_ts) { + /* + * If any updates are aborted, all newer updates + * better be aborted as well. + * + * Timestamp ordering relies on the validations at + * the time of commit. Thus if the table is not + * configured for key consistency check, the + * the timestamps could be out of order here. 
+ */ + WT_ASSERT(session, !FLD_ISSET(S2BT(session)->assert_flags, WT_ASSERT_COMMIT_TS_KEYS) || + upd == first_upd); + first_upd = upd->next; + + upd->txnid = WT_TXN_ABORTED; + WT_STAT_CONN_INCR(session, txn_rollback_upd_aborted); + upd->durable_ts = upd->start_ts = WT_TS_NONE; + } + } } /* * __txn_abort_newer_insert -- - * Apply the update abort check to each entry in an insert skip list + * Apply the update abort check to each entry in an insert skip list */ static void -__txn_abort_newer_insert(WT_SESSION_IMPL *session, - WT_INSERT_HEAD *head, wt_timestamp_t rollback_timestamp) +__txn_abort_newer_insert( + WT_SESSION_IMPL *session, WT_INSERT_HEAD *head, wt_timestamp_t rollback_timestamp) { - WT_INSERT *ins; + WT_INSERT *ins; - WT_SKIP_FOREACH(ins, head) - __txn_abort_newer_update(session, ins->upd, rollback_timestamp); + WT_SKIP_FOREACH (ins, head) + __txn_abort_newer_update(session, ins->upd, rollback_timestamp); } /* * __txn_abort_newer_col_var -- - * Abort updates on a variable length col leaf page with timestamps newer - * than the rollback timestamp. + * Abort updates on a variable length col leaf page with timestamps newer than the rollback + * timestamp. 
*/ static void __txn_abort_newer_col_var( - WT_SESSION_IMPL *session, WT_PAGE *page, wt_timestamp_t rollback_timestamp) + WT_SESSION_IMPL *session, WT_PAGE *page, wt_timestamp_t rollback_timestamp) { - WT_COL *cip; - WT_INSERT_HEAD *ins; - uint32_t i; - - /* Review the changes to the original on-page data items */ - WT_COL_FOREACH(page, cip, i) - if ((ins = WT_COL_UPDATE(page, cip)) != NULL) - __txn_abort_newer_insert(session, - ins, rollback_timestamp); - - /* Review the append list */ - if ((ins = WT_COL_APPEND(page)) != NULL) - __txn_abort_newer_insert(session, ins, rollback_timestamp); + WT_COL *cip; + WT_INSERT_HEAD *ins; + uint32_t i; + + /* Review the changes to the original on-page data items */ + WT_COL_FOREACH (page, cip, i) + if ((ins = WT_COL_UPDATE(page, cip)) != NULL) + __txn_abort_newer_insert(session, ins, rollback_timestamp); + + /* Review the append list */ + if ((ins = WT_COL_APPEND(page)) != NULL) + __txn_abort_newer_insert(session, ins, rollback_timestamp); } /* * __txn_abort_newer_col_fix -- - * Abort updates on a fixed length col leaf page with timestamps newer than - * the rollback timestamp. + * Abort updates on a fixed length col leaf page with timestamps newer than the rollback + * timestamp. 
*/ static void __txn_abort_newer_col_fix( - WT_SESSION_IMPL *session, WT_PAGE *page, wt_timestamp_t rollback_timestamp) + WT_SESSION_IMPL *session, WT_PAGE *page, wt_timestamp_t rollback_timestamp) { - WT_INSERT_HEAD *ins; + WT_INSERT_HEAD *ins; - /* Review the changes to the original on-page data items */ - if ((ins = WT_COL_UPDATE_SINGLE(page)) != NULL) - __txn_abort_newer_insert(session, ins, rollback_timestamp); + /* Review the changes to the original on-page data items */ + if ((ins = WT_COL_UPDATE_SINGLE(page)) != NULL) + __txn_abort_newer_insert(session, ins, rollback_timestamp); - /* Review the append list */ - if ((ins = WT_COL_APPEND(page)) != NULL) - __txn_abort_newer_insert(session, ins, rollback_timestamp); + /* Review the append list */ + if ((ins = WT_COL_APPEND(page)) != NULL) + __txn_abort_newer_insert(session, ins, rollback_timestamp); } /* * __txn_abort_newer_row_leaf -- - * Abort updates on a row leaf page with timestamps newer than the - * rollback timestamp. + * Abort updates on a row leaf page with timestamps newer than the rollback timestamp. */ static void __txn_abort_newer_row_leaf( - WT_SESSION_IMPL *session, WT_PAGE *page, wt_timestamp_t rollback_timestamp) + WT_SESSION_IMPL *session, WT_PAGE *page, wt_timestamp_t rollback_timestamp) { - WT_INSERT_HEAD *insert; - WT_ROW *rip; - WT_UPDATE *upd; - uint32_t i; - - /* - * Review the insert list for keys before the first entry on the disk - * page. - */ - if ((insert = WT_ROW_INSERT_SMALLEST(page)) != NULL) - __txn_abort_newer_insert(session, insert, rollback_timestamp); - - /* - * Review updates that belong to keys that are on the disk image, - * as well as for keys inserted since the page was read from disk. 
- */ - WT_ROW_FOREACH(page, rip, i) { - if ((upd = WT_ROW_UPDATE(page, rip)) != NULL) - __txn_abort_newer_update( - session, upd, rollback_timestamp); - - if ((insert = WT_ROW_INSERT(page, rip)) != NULL) - __txn_abort_newer_insert( - session, insert, rollback_timestamp); - } + WT_INSERT_HEAD *insert; + WT_ROW *rip; + WT_UPDATE *upd; + uint32_t i; + + /* + * Review the insert list for keys before the first entry on the disk page. + */ + if ((insert = WT_ROW_INSERT_SMALLEST(page)) != NULL) + __txn_abort_newer_insert(session, insert, rollback_timestamp); + + /* + * Review updates that belong to keys that are on the disk image, as well as for keys inserted + * since the page was read from disk. + */ + WT_ROW_FOREACH (page, rip, i) { + if ((upd = WT_ROW_UPDATE(page, rip)) != NULL) + __txn_abort_newer_update(session, upd, rollback_timestamp); + + if ((insert = WT_ROW_INSERT(page, rip)) != NULL) + __txn_abort_newer_insert(session, insert, rollback_timestamp); + } } /* * __txn_abort_newer_updates -- - * Abort updates on this page newer than the timestamp. + * Abort updates on this page newer than the timestamp. */ static int -__txn_abort_newer_updates( - WT_SESSION_IMPL *session, WT_REF *ref, wt_timestamp_t rollback_timestamp) +__txn_abort_newer_updates(WT_SESSION_IMPL *session, WT_REF *ref, wt_timestamp_t rollback_timestamp) { - WT_DECL_RET; - WT_PAGE *page; - uint32_t read_flags; - bool local_read; - - /* - * If we created a page image with updates the need to be rolled back, - * read the history into cache now and make sure the page is marked - * dirty. Otherwise, the history we need could be swept from the - * lookaside table before the page is read because the lookaside sweep - * code has no way to tell that the page image is invalid. - * - * So, if there is lookaside history for a page, first check if the - * history needs to be rolled back make sure that history is loaded - * into cache. 
That is, if skew_newest is true, so the disk image - * potentially contained unstable updates, and the history is more - * recent than the rollback timestamp. - * - * Also, we have separately discarded any lookaside history more recent - * than the rollback timestamp. For page_las structures in cache, - * reset any future timestamps back to the rollback timestamp. This - * allows those structures to be discarded once the rollback timestamp - * is stable (crucially for tests, they can be discarded if the - * connection is closed right after a rollback_to_stable call). - */ - local_read = false; - read_flags = WT_READ_WONT_NEED; - if (ref->page_las != NULL) { - if (ref->page_las->skew_newest && rollback_timestamp < - ref->page_las->unstable_durable_timestamp) { - /* - * Make sure we get back a page with history, not a - * limbo page. - */ - WT_ASSERT(session, - !F_ISSET(&session->txn, WT_TXN_HAS_SNAPSHOT)); - WT_RET(__wt_page_in(session, ref, read_flags)); - WT_ASSERT(session, ref->state != WT_REF_LIMBO && - ref->page != NULL && - __wt_page_is_modified(ref->page)); - local_read = true; - } - if (ref->page_las->max_timestamp > rollback_timestamp) - ref->page_las->max_timestamp = rollback_timestamp; - if (ref->page_las->unstable_durable_timestamp > - rollback_timestamp) - ref->page_las->unstable_durable_timestamp = - rollback_timestamp; - if (ref->page_las->unstable_timestamp > rollback_timestamp) - ref->page_las->unstable_timestamp = rollback_timestamp; - } - - /* Review deleted page saved to the ref */ - if (ref->page_del != NULL && - rollback_timestamp < ref->page_del->durable_timestamp) - WT_ERR(__wt_delete_page_rollback(session, ref)); - - /* - * If we have a ref with no page, or the page is clean, there is - * nothing to roll back. 
- * - * This check for a clean page is partly an optimization (checkpoint - * only marks pages clean when they have no unwritten updates so - * there's no point visiting them again), but also covers a corner case - * of a checkpoint with use_timestamp=false. Such a checkpoint - * effectively moves the stable timestamp forward, because changes that - * are written in the checkpoint cannot be reliably rolled back. The - * actual stable timestamp doesn't change, though, so if we try to roll - * back clean pages the in-memory tree can get out of sync with the - * on-disk tree. - */ - if ((page = ref->page) == NULL || !__wt_page_is_modified(page)) - goto err; - - switch (page->type) { - case WT_PAGE_COL_FIX: - __txn_abort_newer_col_fix(session, page, rollback_timestamp); - break; - case WT_PAGE_COL_VAR: - __txn_abort_newer_col_var(session, page, rollback_timestamp); - break; - case WT_PAGE_COL_INT: - case WT_PAGE_ROW_INT: - /* - * There is nothing to do for internal pages, since we aren't - * rolling back far enough to potentially include reconciled - * changes - and thus won't need to roll back structure - * changes on internal pages. - */ - break; - case WT_PAGE_ROW_LEAF: - __txn_abort_newer_row_leaf(session, page, rollback_timestamp); - break; - default: - WT_ERR(__wt_illegal_value(session, page->type)); - } - -err: if (local_read) - WT_TRET(__wt_page_release(session, ref, read_flags)); - return (ret); + WT_DECL_RET; + WT_PAGE *page; + uint32_t read_flags; + bool local_read; + + /* + * If we created a page image with updates the need to be rolled back, + * read the history into cache now and make sure the page is marked + * dirty. Otherwise, the history we need could be swept from the + * lookaside table before the page is read because the lookaside sweep + * code has no way to tell that the page image is invalid. 
+ * + * So, if there is lookaside history for a page, first check if the + * history needs to be rolled back make sure that history is loaded + * into cache. That is, if skew_newest is true, so the disk image + * potentially contained unstable updates, and the history is more + * recent than the rollback timestamp. + * + * Also, we have separately discarded any lookaside history more recent + * than the rollback timestamp. For page_las structures in cache, + * reset any future timestamps back to the rollback timestamp. This + * allows those structures to be discarded once the rollback timestamp + * is stable (crucially for tests, they can be discarded if the + * connection is closed right after a rollback_to_stable call). + */ + local_read = false; + read_flags = WT_READ_WONT_NEED; + if (ref->page_las != NULL) { + if (ref->page_las->skew_newest && + rollback_timestamp < ref->page_las->unstable_durable_timestamp) { + /* + * Make sure we get back a page with history, not a limbo page. + */ + WT_ASSERT(session, !F_ISSET(&session->txn, WT_TXN_HAS_SNAPSHOT)); + WT_RET(__wt_page_in(session, ref, read_flags)); + WT_ASSERT(session, + ref->state != WT_REF_LIMBO && ref->page != NULL && __wt_page_is_modified(ref->page)); + local_read = true; + } + if (ref->page_las->max_timestamp > rollback_timestamp) + ref->page_las->max_timestamp = rollback_timestamp; + if (ref->page_las->unstable_durable_timestamp > rollback_timestamp) + ref->page_las->unstable_durable_timestamp = rollback_timestamp; + if (ref->page_las->unstable_timestamp > rollback_timestamp) + ref->page_las->unstable_timestamp = rollback_timestamp; + } + + /* Review deleted page saved to the ref */ + if (ref->page_del != NULL && rollback_timestamp < ref->page_del->durable_timestamp) + WT_ERR(__wt_delete_page_rollback(session, ref)); + + /* + * If we have a ref with no page, or the page is clean, there is + * nothing to roll back. 
+ * + * This check for a clean page is partly an optimization (checkpoint + * only marks pages clean when they have no unwritten updates so + * there's no point visiting them again), but also covers a corner case + * of a checkpoint with use_timestamp=false. Such a checkpoint + * effectively moves the stable timestamp forward, because changes that + * are written in the checkpoint cannot be reliably rolled back. The + * actual stable timestamp doesn't change, though, so if we try to roll + * back clean pages the in-memory tree can get out of sync with the + * on-disk tree. + */ + if ((page = ref->page) == NULL || !__wt_page_is_modified(page)) + goto err; + + switch (page->type) { + case WT_PAGE_COL_FIX: + __txn_abort_newer_col_fix(session, page, rollback_timestamp); + break; + case WT_PAGE_COL_VAR: + __txn_abort_newer_col_var(session, page, rollback_timestamp); + break; + case WT_PAGE_COL_INT: + case WT_PAGE_ROW_INT: + /* + * There is nothing to do for internal pages, since we aren't rolling back far enough to + * potentially include reconciled changes - and thus won't need to roll back structure + * changes on internal pages. + */ + break; + case WT_PAGE_ROW_LEAF: + __txn_abort_newer_row_leaf(session, page, rollback_timestamp); + break; + default: + WT_ERR(__wt_illegal_value(session, page->type)); + } + +err: + if (local_read) + WT_TRET(__wt_page_release(session, ref, read_flags)); + return (ret); } /* * __txn_rollback_to_stable_btree_walk -- - * Called for each open handle - choose to either skip or wipe the commits + * Called for each open handle - choose to either skip or wipe the commits */ static int -__txn_rollback_to_stable_btree_walk( - WT_SESSION_IMPL *session, wt_timestamp_t rollback_timestamp) +__txn_rollback_to_stable_btree_walk(WT_SESSION_IMPL *session, wt_timestamp_t rollback_timestamp) { - WT_DECL_RET; - WT_REF *child_ref, *ref; - - /* Walk the tree, marking commits aborted where appropriate. 
*/ - ref = NULL; - while ((ret = __wt_tree_walk(session, &ref, - WT_READ_CACHE | WT_READ_NO_EVICT | WT_READ_WONT_NEED)) == 0 && - ref != NULL) { - if (WT_PAGE_IS_INTERNAL(ref->page)) { - WT_INTL_FOREACH_BEGIN(session, ref->page, child_ref) { - WT_RET(__txn_abort_newer_updates( - session, child_ref, rollback_timestamp)); - } WT_INTL_FOREACH_END; - } else - WT_RET(__txn_abort_newer_updates( - session, ref, rollback_timestamp)); - } - return (ret); + WT_DECL_RET; + WT_REF *child_ref, *ref; + + /* Walk the tree, marking commits aborted where appropriate. */ + ref = NULL; + while ((ret = __wt_tree_walk( + session, &ref, WT_READ_CACHE | WT_READ_NO_EVICT | WT_READ_WONT_NEED)) == 0 && + ref != NULL) { + if (WT_PAGE_IS_INTERNAL(ref->page)) { + WT_INTL_FOREACH_BEGIN (session, ref->page, child_ref) { + WT_RET(__txn_abort_newer_updates(session, child_ref, rollback_timestamp)); + } + WT_INTL_FOREACH_END; + } else + WT_RET(__txn_abort_newer_updates(session, ref, rollback_timestamp)); + } + return (ret); } /* * __txn_rollback_eviction_drain -- - * Wait for eviction to drain from a tree. + * Wait for eviction to drain from a tree. 
*/ static int __txn_rollback_eviction_drain(WT_SESSION_IMPL *session, const char *cfg[]) { - WT_UNUSED(cfg); + WT_UNUSED(cfg); - WT_RET(__wt_evict_file_exclusive_on(session)); - __wt_evict_file_exclusive_off(session); - return (0); + WT_RET(__wt_evict_file_exclusive_on(session)); + __wt_evict_file_exclusive_off(session); + return (0); } /* * __txn_rollback_to_stable_btree -- - * Called for each open handle - choose to either skip or wipe the commits + * Called for each open handle - choose to either skip or wipe the commits */ static int __txn_rollback_to_stable_btree(WT_SESSION_IMPL *session, const char *cfg[]) { - WT_BTREE *btree; - WT_CONNECTION_IMPL *conn; - WT_DECL_RET; - WT_TXN_GLOBAL *txn_global; - wt_timestamp_t rollback_timestamp; - - WT_UNUSED(cfg); - - btree = S2BT(session); - conn = S2C(session); - txn_global = &conn->txn_global; - - /* - * Immediately durable files don't get their commits wiped. This case - * mostly exists to support the semantic required for the oplog in - * MongoDB - updates that have been made to the oplog should not be - * aborted. It also wouldn't be safe to roll back updates for any - * table that had it's records logged, since those updates would be - * recovered after a crash making them inconsistent. - */ - if (__wt_btree_immediately_durable(session)) { - /* - * Add the btree ID to the bitstring, so we can exclude any - * lookaside entries for this btree. - */ - if (btree->id >= conn->stable_rollback_maxfile) - WT_PANIC_RET(session, EINVAL, "btree file ID %" PRIu32 - " larger than max %" PRIu32, - btree->id, conn->stable_rollback_maxfile); - __bit_set(conn->stable_rollback_bitstring, btree->id); - return (0); - } - - /* There is never anything to do for checkpoint handles */ - if (session->dhandle->checkpoint != NULL) - return (0); - - /* There is nothing to do on an empty tree. */ - if (btree->root.page == NULL) - return (0); - - /* - * Copy the stable timestamp, otherwise we'd need to lock it each time - * it's accessed. 
Even though the stable timestamp isn't supposed to be - * updated while rolling back, accessing it without a lock would - * violate protocol. - */ - WT_ORDERED_READ(rollback_timestamp, txn_global->stable_timestamp); - - /* - * Ensure the eviction server is out of the file - we don't - * want it messing with us. This step shouldn't be required, but - * it simplifies some of the reasoning about what state trees can - * be in. - */ - WT_RET(__wt_evict_file_exclusive_on(session)); - WT_WITH_PAGE_INDEX(session, ret = - __txn_rollback_to_stable_btree_walk(session, rollback_timestamp)); - __wt_evict_file_exclusive_off(session); - - return (ret); + WT_BTREE *btree; + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + WT_TXN_GLOBAL *txn_global; + wt_timestamp_t rollback_timestamp; + + WT_UNUSED(cfg); + + btree = S2BT(session); + conn = S2C(session); + txn_global = &conn->txn_global; + + /* + * Immediately durable files don't get their commits wiped. This case mostly exists to support + * the semantic required for the oplog in MongoDB - updates that have been made to the oplog + * should not be aborted. It also wouldn't be safe to roll back updates for any table that had + * it's records logged, since those updates would be recovered after a crash making them + * inconsistent. + */ + if (__wt_btree_immediately_durable(session)) { + /* + * Add the btree ID to the bitstring, so we can exclude any lookaside entries for this + * btree. + */ + if (btree->id >= conn->stable_rollback_maxfile) + WT_PANIC_RET(session, EINVAL, "btree file ID %" PRIu32 " larger than max %" PRIu32, + btree->id, conn->stable_rollback_maxfile); + __bit_set(conn->stable_rollback_bitstring, btree->id); + return (0); + } + + /* There is never anything to do for checkpoint handles */ + if (session->dhandle->checkpoint != NULL) + return (0); + + /* There is nothing to do on an empty tree. 
*/ + if (btree->root.page == NULL) + return (0); + + /* + * Copy the stable timestamp, otherwise we'd need to lock it each time it's accessed. Even + * though the stable timestamp isn't supposed to be updated while rolling back, accessing it + * without a lock would violate protocol. + */ + WT_ORDERED_READ(rollback_timestamp, txn_global->stable_timestamp); + + /* + * Ensure the eviction server is out of the file - we don't want it messing with us. This step + * shouldn't be required, but it simplifies some of the reasoning about what state trees can be + * in. + */ + WT_RET(__wt_evict_file_exclusive_on(session)); + WT_WITH_PAGE_INDEX( + session, ret = __txn_rollback_to_stable_btree_walk(session, rollback_timestamp)); + __wt_evict_file_exclusive_off(session); + + return (ret); } /* * __txn_rollback_to_stable_check -- - * Ensure the rollback request is reasonable. + * Ensure the rollback request is reasonable. */ static int __txn_rollback_to_stable_check(WT_SESSION_IMPL *session) { - WT_CONNECTION_IMPL *conn; - WT_DECL_RET; - WT_TXN_GLOBAL *txn_global; - bool txn_active; - - conn = S2C(session); - txn_global = &conn->txn_global; - if (!txn_global->has_stable_timestamp) - WT_RET_MSG(session, EINVAL, - "rollback_to_stable requires a stable timestamp"); - - /* - * Help the user comply with the requirement that there are no - * concurrent operations. Protect against spurious conflicts with the - * sweep server: we exclude it from running concurrent with rolling - * back the lookaside contents. 
- */ - __wt_writelock(session, &conn->cache->las_sweepwalk_lock); - ret = __wt_txn_activity_check(session, &txn_active); - __wt_writeunlock(session, &conn->cache->las_sweepwalk_lock); - - if (ret == 0 && txn_active) - WT_RET_MSG(session, EINVAL, - "rollback_to_stable illegal with active transactions"); - - return (ret); + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + WT_TXN_GLOBAL *txn_global; + bool txn_active; + + conn = S2C(session); + txn_global = &conn->txn_global; + if (!txn_global->has_stable_timestamp) + WT_RET_MSG(session, EINVAL, "rollback_to_stable requires a stable timestamp"); + + /* + * Help the user comply with the requirement that there are no concurrent operations. Protect + * against spurious conflicts with the sweep server: we exclude it from running concurrent with + * rolling back the lookaside contents. + */ + __wt_writelock(session, &conn->cache->las_sweepwalk_lock); + ret = __wt_txn_activity_check(session, &txn_active); +#ifdef HAVE_DIAGNOSTIC + if (txn_active) + WT_TRET(__wt_verbose_dump_txn(session)); +#endif + __wt_writeunlock(session, &conn->cache->las_sweepwalk_lock); + + if (ret == 0 && txn_active) + WT_RET_MSG(session, EINVAL, "rollback_to_stable illegal with active transactions"); + + return (ret); } /* * __txn_rollback_to_stable -- - * Rollback all in-memory state related to timestamps more recent than - * the passed in timestamp. + * Rollback all in-memory state related to timestamps more recent than the passed in timestamp. */ static int __txn_rollback_to_stable(WT_SESSION_IMPL *session, const char *cfg[]) { - WT_CONNECTION_IMPL *conn; - WT_DECL_RET; - - conn = S2C(session); - - WT_STAT_CONN_INCR(session, txn_rollback_to_stable); - /* - * Mark that a rollback operation is in progress and wait for eviction - * to drain. This is necessary because lookaside eviction uses - * transactions and causes the check for a quiescent system to fail. 
- * - * Configuring lookaside eviction off isn't atomic, safe because the - * flag is only otherwise set when closing down the database. Assert - * to avoid confusion in the future. - */ - WT_ASSERT(session, !F_ISSET(conn, WT_CONN_EVICTION_NO_LOOKASIDE)); - F_SET(conn, WT_CONN_EVICTION_NO_LOOKASIDE); - - WT_ERR(__wt_conn_btree_apply(session, - NULL, __txn_rollback_eviction_drain, NULL, cfg)); - - WT_ERR(__txn_rollback_to_stable_check(session)); - - F_CLR(conn, WT_CONN_EVICTION_NO_LOOKASIDE); - - /* - * Allocate a non-durable btree bitstring. We increment the global - * value before using it, so the current value is already in use, and - * hence we need to add one here. - */ - conn->stable_rollback_maxfile = conn->next_file_id + 1; - WT_ERR(__bit_alloc(session, - conn->stable_rollback_maxfile, &conn->stable_rollback_bitstring)); - WT_ERR(__wt_conn_btree_apply(session, - NULL, __txn_rollback_to_stable_btree, NULL, cfg)); - - /* - * Clear any offending content from the lookaside file. This must be - * done after the in-memory application, since the process of walking - * trees in cache populates a list that is used to check which - * lookaside records should be removed. - */ - if (!F_ISSET(conn, WT_CONN_IN_MEMORY)) - WT_ERR(__txn_rollback_to_stable_lookaside_fixup(session)); - -err: F_CLR(conn, WT_CONN_EVICTION_NO_LOOKASIDE); - __wt_free(session, conn->stable_rollback_bitstring); - return (ret); + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + + conn = S2C(session); + + WT_STAT_CONN_INCR(session, txn_rollback_to_stable); + /* + * Mark that a rollback operation is in progress and wait for eviction + * to drain. This is necessary because lookaside eviction uses + * transactions and causes the check for a quiescent system to fail. + * + * Configuring lookaside eviction off isn't atomic, safe because the + * flag is only otherwise set when closing down the database. Assert + * to avoid confusion in the future. 
+ */ + WT_ASSERT(session, !F_ISSET(conn, WT_CONN_EVICTION_NO_LOOKASIDE)); + F_SET(conn, WT_CONN_EVICTION_NO_LOOKASIDE); + + WT_ERR(__wt_conn_btree_apply(session, NULL, __txn_rollback_eviction_drain, NULL, cfg)); + + WT_ERR(__txn_rollback_to_stable_check(session)); + + F_CLR(conn, WT_CONN_EVICTION_NO_LOOKASIDE); + + /* + * Allocate a non-durable btree bitstring. We increment the global value before using it, so the + * current value is already in use, and hence we need to add one here. + */ + conn->stable_rollback_maxfile = conn->next_file_id + 1; + WT_ERR(__bit_alloc(session, conn->stable_rollback_maxfile, &conn->stable_rollback_bitstring)); + WT_ERR(__wt_conn_btree_apply(session, NULL, __txn_rollback_to_stable_btree, NULL, cfg)); + + /* + * Clear any offending content from the lookaside file. This must be done after the in-memory + * application, since the process of walking trees in cache populates a list that is used to + * check which lookaside records should be removed. + */ + if (!F_ISSET(conn, WT_CONN_IN_MEMORY)) + WT_ERR(__txn_rollback_to_stable_lookaside_fixup(session)); + +err: + F_CLR(conn, WT_CONN_EVICTION_NO_LOOKASIDE); + __wt_free(session, conn->stable_rollback_bitstring); + return (ret); } /* * __wt_txn_rollback_to_stable -- - * Rollback all in-memory state related to timestamps more recent than - * the passed in timestamp. + * Rollback all in-memory state related to timestamps more recent than the passed in timestamp. */ int __wt_txn_rollback_to_stable(WT_SESSION_IMPL *session, const char *cfg[]) { - WT_DECL_RET; - - /* - * Don't use the connection's default session: we are working on data - * handles and (a) don't want to cache all of them forever, plus (b) - * can't guarantee that no other method will be called concurrently. 
- */ - WT_RET(__wt_open_internal_session(S2C(session), - "txn rollback_to_stable", true, 0, &session)); - ret = __txn_rollback_to_stable(session, cfg); - WT_TRET(session->iface.close(&session->iface, NULL)); - - return (ret); + WT_DECL_RET; + + /* + * Don't use the connection's default session: we are working on data handles and (a) don't want + * to cache all of them forever, plus (b) can't guarantee that no other method will be called + * concurrently. + */ + WT_RET(__wt_open_internal_session(S2C(session), "txn rollback_to_stable", true, 0, &session)); + ret = __txn_rollback_to_stable(session, cfg); + WT_TRET(session->iface.close(&session->iface, NULL)); + + return (ret); } diff --git a/src/third_party/wiredtiger/src/txn/txn_timestamp.c b/src/third_party/wiredtiger/src/txn/txn_timestamp.c index 84b9c290641..2d9291ebbce 100644 --- a/src/third_party/wiredtiger/src/txn/txn_timestamp.c +++ b/src/third_party/wiredtiger/src/txn/txn_timestamp.c @@ -10,1397 +10,1285 @@ /* * __wt_timestamp_to_string -- - * Convert a timestamp to the MongoDB string representation. + * Convert a timestamp to the MongoDB string representation. */ char * __wt_timestamp_to_string(wt_timestamp_t ts, char *ts_string) { - WT_IGNORE_RET(__wt_snprintf(ts_string, WT_TS_INT_STRING_SIZE, - "(%" PRIu32 ",%" PRIu32 ")", - (uint32_t)((ts >> 32) & 0xffffffff), (uint32_t)(ts & 0xffffffff))); - return (ts_string); + WT_IGNORE_RET(__wt_snprintf(ts_string, WT_TS_INT_STRING_SIZE, "(%" PRIu32 ",%" PRIu32 ")", + (uint32_t)((ts >> 32) & 0xffffffff), (uint32_t)(ts & 0xffffffff))); + return (ts_string); } /* * __wt_timestamp_to_hex_string -- - * Convert a timestamp to hex string representation. + * Convert a timestamp to hex string representation. 
*/ void __wt_timestamp_to_hex_string(wt_timestamp_t ts, char *hex_timestamp) { - char *p, v; - - if (ts == 0) { - hex_timestamp[0] = '0'; - hex_timestamp[1] = '\0'; - return; - } - if (ts == WT_TS_MAX) { -#define WT_TS_MAX_HEX_STRING "ffffffffffffffff" - (void)memcpy(hex_timestamp, - WT_TS_MAX_HEX_STRING, strlen(WT_TS_MAX_HEX_STRING) + 1); - return; - } - - for (p = hex_timestamp; ts != 0; ts >>= 4) - *p++ = (char)__wt_hex((u_char)(ts & 0x0f)); - *p = '\0'; - - /* Reverse the string. */ - for (--p; p > hex_timestamp;) { - v = *p; - *p-- = *hex_timestamp; - *hex_timestamp++ = v; - } + char *p, v; + + if (ts == 0) { + hex_timestamp[0] = '0'; + hex_timestamp[1] = '\0'; + return; + } + if (ts == WT_TS_MAX) { +#define WT_TS_MAX_HEX_STRING "ffffffffffffffff" + (void)memcpy(hex_timestamp, WT_TS_MAX_HEX_STRING, strlen(WT_TS_MAX_HEX_STRING) + 1); + return; + } + + for (p = hex_timestamp; ts != 0; ts >>= 4) + *p++ = (char)__wt_hex((u_char)(ts & 0x0f)); + *p = '\0'; + + /* Reverse the string. */ + for (--p; p > hex_timestamp;) { + v = *p; + *p-- = *hex_timestamp; + *hex_timestamp++ = v; + } } /* * __wt_verbose_timestamp -- - * Output a verbose message along with the specified timestamp. + * Output a verbose message along with the specified timestamp. */ void -__wt_verbose_timestamp( - WT_SESSION_IMPL *session, wt_timestamp_t ts, const char *msg) +__wt_verbose_timestamp(WT_SESSION_IMPL *session, wt_timestamp_t ts, const char *msg) { - char ts_string[WT_TS_INT_STRING_SIZE]; + char ts_string[WT_TS_INT_STRING_SIZE]; - __wt_verbose(session, - WT_VERB_TIMESTAMP, "Timestamp %s: %s", - __wt_timestamp_to_string(ts, ts_string), msg); + __wt_verbose( + session, WT_VERB_TIMESTAMP, "Timestamp %s: %s", __wt_timestamp_to_string(ts, ts_string), msg); } /* * __wt_txn_parse_timestamp_raw -- - * Decodes and sets a timestamp. Don't do any checking. + * Decodes and sets a timestamp. Don't do any checking. 
*/ int -__wt_txn_parse_timestamp_raw(WT_SESSION_IMPL *session, const char *name, - wt_timestamp_t *timestamp, WT_CONFIG_ITEM *cval) +__wt_txn_parse_timestamp_raw( + WT_SESSION_IMPL *session, const char *name, wt_timestamp_t *timestamp, WT_CONFIG_ITEM *cval) { - static const int8_t hextable[] = { - -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, - 0, 1, 2, 3, 4, 5, 6, 7, - 8, 9, -1, -1, -1, -1, -1, -1, - -1, 10, 11, 12, 13, 14, 15, -1, - -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, - -1, 10, 11, 12, 13, 14, 15, -1 - }; - wt_timestamp_t ts; - size_t len; - int hex_val; - const char *hex_itr; - - *timestamp = 0; - - if (cval->len == 0) - return (0); - - /* Protect against unexpectedly long hex strings. */ - if (cval->len > 2 * sizeof(wt_timestamp_t)) - WT_RET_MSG(session, EINVAL, - "%s timestamp too long '%.*s'", - name, (int)cval->len, cval->str); - - for (ts = 0, hex_itr = cval->str, len = cval->len; len > 0; --len) { - if ((size_t)*hex_itr < WT_ELEMENTS(hextable)) - hex_val = hextable[(size_t)*hex_itr++]; - else - hex_val = -1; - if (hex_val < 0) - WT_RET_MSG(session, EINVAL, - "Failed to parse %s timestamp '%.*s'", - name, (int)cval->len, cval->str); - ts = (ts << 4) | (uint64_t)hex_val; - } - *timestamp = ts; - - return (0); + static const int8_t hextable[] = {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1, + -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 10, 11, 12, 13, 14, 15, -1}; + wt_timestamp_t ts; + size_t len; + int hex_val; + const char *hex_itr; + + *timestamp = 0; 
+ + if (cval->len == 0) + return (0); + + /* Protect against unexpectedly long hex strings. */ + if (cval->len > 2 * sizeof(wt_timestamp_t)) + WT_RET_MSG( + session, EINVAL, "%s timestamp too long '%.*s'", name, (int)cval->len, cval->str); + + for (ts = 0, hex_itr = cval->str, len = cval->len; len > 0; --len) { + if ((size_t)*hex_itr < WT_ELEMENTS(hextable)) + hex_val = hextable[(size_t)*hex_itr++]; + else + hex_val = -1; + if (hex_val < 0) + WT_RET_MSG(session, EINVAL, "Failed to parse %s timestamp '%.*s'", name, (int)cval->len, + cval->str); + ts = (ts << 4) | (uint64_t)hex_val; + } + *timestamp = ts; + + return (0); } /* * __wt_txn_parse_timestamp -- - * Decodes and sets a timestamp checking it is non-zero. + * Decodes and sets a timestamp checking it is non-zero. */ int -__wt_txn_parse_timestamp(WT_SESSION_IMPL *session, const char *name, - wt_timestamp_t *timestamp, WT_CONFIG_ITEM *cval) +__wt_txn_parse_timestamp( + WT_SESSION_IMPL *session, const char *name, wt_timestamp_t *timestamp, WT_CONFIG_ITEM *cval) { - WT_RET(__wt_txn_parse_timestamp_raw(session, name, timestamp, cval)); - if (cval->len != 0 && *timestamp == WT_TS_NONE) - WT_RET_MSG(session, EINVAL, - "Failed to parse %s timestamp '%.*s': zero not permitted", - name, (int)cval->len, cval->str); + WT_RET(__wt_txn_parse_timestamp_raw(session, name, timestamp, cval)); + if (cval->len != 0 && *timestamp == WT_TS_NONE) + WT_RET_MSG(session, EINVAL, "Failed to parse %s timestamp '%.*s': zero not permitted", name, + (int)cval->len, cval->str); - return (0); + return (0); } /* * __txn_get_read_timestamp -- - * Get the read timestamp from the transaction. Additionally - * return bool to specify whether the transaction has set - * clear read queue flag. + * Get the read timestamp from the transaction. Additionally return bool to specify whether the + * transaction has set clear read queue flag. 
*/ static bool -__txn_get_read_timestamp( - WT_TXN *txn, wt_timestamp_t *read_timestampp) +__txn_get_read_timestamp(WT_TXN *txn, wt_timestamp_t *read_timestampp) { - WT_ORDERED_READ(*read_timestampp, txn->read_timestamp); - return (!txn->clear_read_q); + WT_ORDERED_READ(*read_timestampp, txn->read_timestamp); + return (!txn->clear_read_q); } /* * __wt_txn_get_pinned_timestamp -- - * Calculate the current pinned timestamp. + * Calculate the current pinned timestamp. */ int -__wt_txn_get_pinned_timestamp( - WT_SESSION_IMPL *session, wt_timestamp_t *tsp, uint32_t flags) +__wt_txn_get_pinned_timestamp(WT_SESSION_IMPL *session, wt_timestamp_t *tsp, uint32_t flags) { - WT_CONNECTION_IMPL *conn; - WT_TXN *txn; - WT_TXN_GLOBAL *txn_global; - wt_timestamp_t tmp_read_ts, tmp_ts; - bool include_oldest, txn_has_write_lock; - - conn = S2C(session); - txn_global = &conn->txn_global; - include_oldest = LF_ISSET(WT_TXN_TS_INCLUDE_OLDEST); - txn_has_write_lock = LF_ISSET(WT_TXN_TS_ALREADY_LOCKED); - - if (include_oldest && !txn_global->has_oldest_timestamp) - return (WT_NOTFOUND); - - if (!txn_has_write_lock) - __wt_readlock(session, &txn_global->rwlock); - - tmp_ts = include_oldest ? txn_global->oldest_timestamp : 0; - - /* Check for a running checkpoint */ - if (LF_ISSET(WT_TXN_TS_INCLUDE_CKPT) && - txn_global->checkpoint_timestamp != WT_TS_NONE && - (tmp_ts == 0 || txn_global->checkpoint_timestamp < tmp_ts)) - tmp_ts = txn_global->checkpoint_timestamp; - if (!txn_has_write_lock) - __wt_readunlock(session, &txn_global->rwlock); - - /* Look for the oldest ordinary reader. */ - __wt_readlock(session, &txn_global->read_timestamp_rwlock); - TAILQ_FOREACH(txn, &txn_global->read_timestamph, read_timestampq) { - /* - * Skip any transactions on the queue that are not active. - * Copy out value of read timestamp to prevent possible - * race where a transaction resets its read timestamp while - * we traverse the queue. 
- */ - if (!__txn_get_read_timestamp(txn, &tmp_read_ts)) - continue; - /* - * A zero timestamp is possible here only when the oldest - * timestamp is not accounted for. - */ - if (tmp_ts == 0 || tmp_read_ts < tmp_ts) - tmp_ts = tmp_read_ts; - /* - * We break on the first active txn on the list. - */ - break; - } - __wt_readunlock(session, &txn_global->read_timestamp_rwlock); - - if (!include_oldest && tmp_ts == 0) - return (WT_NOTFOUND); - *tsp = tmp_ts; - - return (0); + WT_CONNECTION_IMPL *conn; + WT_TXN *txn; + WT_TXN_GLOBAL *txn_global; + wt_timestamp_t tmp_read_ts, tmp_ts; + bool include_oldest, txn_has_write_lock; + + conn = S2C(session); + txn_global = &conn->txn_global; + include_oldest = LF_ISSET(WT_TXN_TS_INCLUDE_OLDEST); + txn_has_write_lock = LF_ISSET(WT_TXN_TS_ALREADY_LOCKED); + + if (include_oldest && !txn_global->has_oldest_timestamp) + return (WT_NOTFOUND); + + if (!txn_has_write_lock) + __wt_readlock(session, &txn_global->rwlock); + + tmp_ts = include_oldest ? txn_global->oldest_timestamp : 0; + + /* Check for a running checkpoint */ + if (LF_ISSET(WT_TXN_TS_INCLUDE_CKPT) && txn_global->checkpoint_timestamp != WT_TS_NONE && + (tmp_ts == 0 || txn_global->checkpoint_timestamp < tmp_ts)) + tmp_ts = txn_global->checkpoint_timestamp; + if (!txn_has_write_lock) + __wt_readunlock(session, &txn_global->rwlock); + + /* Look for the oldest ordinary reader. */ + __wt_readlock(session, &txn_global->read_timestamp_rwlock); + TAILQ_FOREACH (txn, &txn_global->read_timestamph, read_timestampq) { + /* + * Skip any transactions on the queue that are not active. Copy out value of read timestamp + * to prevent possible race where a transaction resets its read timestamp while we traverse + * the queue. + */ + if (!__txn_get_read_timestamp(txn, &tmp_read_ts)) + continue; + /* + * A zero timestamp is possible here only when the oldest timestamp is not accounted for. 
+ */ + if (tmp_ts == 0 || tmp_read_ts < tmp_ts) + tmp_ts = tmp_read_ts; + /* + * We break on the first active txn on the list. + */ + break; + } + __wt_readunlock(session, &txn_global->read_timestamp_rwlock); + + if (!include_oldest && tmp_ts == 0) + return (WT_NOTFOUND); + *tsp = tmp_ts; + + return (0); } /* * __txn_get_published_timestamp -- - * Get the current durable timestamp for a given transaction. If there is - * an explicit durable timestamp, this function will return the commit - * timestamp since this is implied. If there is neither a commit nor a - * durable timestamp, this function will return 0. + * Get the current durable timestamp for a given transaction. If there is an explicit durable + * timestamp, this function will return the commit timestamp since this is implied. If there is + * neither a commit nor a durable timestamp, this function will return 0. */ static inline wt_timestamp_t __txn_get_published_timestamp(WT_SESSION_IMPL *session, WT_TXN *txn) { - wt_timestamp_t ts; - - /* - * Any checking of bit flags in this logic is invalid. __wt_txn_release - * may have already been called on this transaction which will set the - * flags member to 0. So we need to deduce which timestamp to use purely - * by inspecting the timestamp members which we deliberately preserve - * for reader threads such as ourselves. - * - * In the non-prepared case, the first commit will either be less than - * the commit (in the case of multiple commits) in which case we should - * return the first commit. Or it will be equal to the commit (in the - * case of a single commit) and we can return durable (which is mirrored - * from the commit timestamp). - * - * In the prepared case, the first commit will always be equal to the - * commit so we'll return durable. 
- */ - if (txn->commit_timestamp != txn->first_commit_timestamp) - ts = txn->first_commit_timestamp; - else - ts = txn->durable_timestamp; - - WT_ASSERT(session, ts != WT_TS_NONE); - return (ts); + wt_timestamp_t ts; + + /* + * Any checking of bit flags in this logic is invalid. __wt_txn_release + * may have already been called on this transaction which will set the + * flags member to 0. So we need to deduce which timestamp to use purely + * by inspecting the timestamp members which we deliberately preserve + * for reader threads such as ourselves. + * + * In the non-prepared case, the first commit will either be less than + * the commit (in the case of multiple commits) in which case we should + * return the first commit. Or it will be equal to the commit (in the + * case of a single commit) and we can return durable (which is mirrored + * from the commit timestamp). + * + * In the prepared case, the first commit will always be equal to the + * commit so we'll return durable. + */ + if (txn->commit_timestamp != txn->first_commit_timestamp) + ts = txn->first_commit_timestamp; + else + ts = txn->durable_timestamp; + + WT_ASSERT(session, ts != WT_TS_NONE); + return (ts); } /* * __txn_global_query_timestamp -- - * Query a timestamp on the global transaction. + * Query a timestamp on the global transaction. 
*/ static int -__txn_global_query_timestamp( - WT_SESSION_IMPL *session, wt_timestamp_t *tsp, const char *cfg[]) +__txn_global_query_timestamp(WT_SESSION_IMPL *session, wt_timestamp_t *tsp, const char *cfg[]) { - WT_CONFIG_ITEM cval; - WT_CONNECTION_IMPL *conn; - WT_TXN *txn; - WT_TXN_GLOBAL *txn_global; - wt_timestamp_t ts, tmpts; - - conn = S2C(session); - txn_global = &conn->txn_global; - - WT_STAT_CONN_INCR(session, txn_query_ts); - WT_RET(__wt_config_gets(session, cfg, "get", &cval)); - if (WT_STRING_MATCH("all_committed", cval.str, cval.len) || - WT_STRING_MATCH("all_durable", cval.str, cval.len)) { - if (!txn_global->has_durable_timestamp) - return (WT_NOTFOUND); - ts = txn_global->durable_timestamp; - WT_ASSERT(session, ts != WT_TS_NONE); - - /* - * Skip straight to the commit queue if no running transactions - * have an explicit durable timestamp. - */ - if (TAILQ_EMPTY(&txn_global->durable_timestamph)) - goto done; - /* - * Compare with the least recently durable transaction. - */ - __wt_readlock(session, &txn_global->durable_timestamp_rwlock); - TAILQ_FOREACH(txn, &txn_global->durable_timestamph, - durable_timestampq) { - if (txn->clear_durable_q) - continue; - - tmpts = __txn_get_published_timestamp(session, txn) - 1; - if (tmpts < ts) - ts = tmpts; - break; - } - __wt_readunlock(session, &txn_global->durable_timestamp_rwlock); - - /* - * If a transaction is committing with a durable timestamp of 1, - * we could return zero here, which is unexpected. Fail instead. - */ - if (ts == WT_TS_NONE) - return (WT_NOTFOUND); - } else if (WT_STRING_MATCH("last_checkpoint", cval.str, cval.len)) - /* Read-only value forever. No lock needed. 
*/ - ts = txn_global->last_ckpt_timestamp; - else if (WT_STRING_MATCH("oldest", cval.str, cval.len)) { - if (!txn_global->has_oldest_timestamp) - return (WT_NOTFOUND); - ts = txn_global->oldest_timestamp; - } else if (WT_STRING_MATCH("oldest_reader", cval.str, cval.len)) - WT_RET(__wt_txn_get_pinned_timestamp( - session, &ts, WT_TXN_TS_INCLUDE_CKPT)); - else if (WT_STRING_MATCH("pinned", cval.str, cval.len)) - WT_RET(__wt_txn_get_pinned_timestamp(session, &ts, - WT_TXN_TS_INCLUDE_CKPT | WT_TXN_TS_INCLUDE_OLDEST)); - else if (WT_STRING_MATCH("recovery", cval.str, cval.len)) - /* Read-only value forever. No lock needed. */ - ts = txn_global->recovery_timestamp; - else if (WT_STRING_MATCH("stable", cval.str, cval.len)) { - if (!txn_global->has_stable_timestamp) - return (WT_NOTFOUND); - ts = txn_global->stable_timestamp; - } else - WT_RET_MSG(session, EINVAL, - "unknown timestamp query %.*s", (int)cval.len, cval.str); - -done: *tsp = ts; - return (0); + WT_CONFIG_ITEM cval; + WT_CONNECTION_IMPL *conn; + WT_TXN *txn; + WT_TXN_GLOBAL *txn_global; + wt_timestamp_t ts, tmpts; + + conn = S2C(session); + txn_global = &conn->txn_global; + + WT_STAT_CONN_INCR(session, txn_query_ts); + WT_RET(__wt_config_gets(session, cfg, "get", &cval)); + if (WT_STRING_MATCH("all_committed", cval.str, cval.len) || + WT_STRING_MATCH("all_durable", cval.str, cval.len)) { + if (!txn_global->has_durable_timestamp) + return (WT_NOTFOUND); + ts = txn_global->durable_timestamp; + WT_ASSERT(session, ts != WT_TS_NONE); + + /* + * Skip straight to the commit queue if no running transactions have an explicit durable + * timestamp. + */ + if (TAILQ_EMPTY(&txn_global->durable_timestamph)) + goto done; + /* + * Compare with the least recently durable transaction. 
+ */ + __wt_readlock(session, &txn_global->durable_timestamp_rwlock); + TAILQ_FOREACH (txn, &txn_global->durable_timestamph, durable_timestampq) { + if (txn->clear_durable_q) + continue; + + tmpts = __txn_get_published_timestamp(session, txn) - 1; + if (tmpts < ts) + ts = tmpts; + break; + } + __wt_readunlock(session, &txn_global->durable_timestamp_rwlock); + + /* + * If a transaction is committing with a durable timestamp of 1, we could return zero here, + * which is unexpected. Fail instead. + */ + if (ts == WT_TS_NONE) + return (WT_NOTFOUND); + } else if (WT_STRING_MATCH("last_checkpoint", cval.str, cval.len)) + /* Read-only value forever. No lock needed. */ + ts = txn_global->last_ckpt_timestamp; + else if (WT_STRING_MATCH("oldest", cval.str, cval.len)) { + if (!txn_global->has_oldest_timestamp) + return (WT_NOTFOUND); + ts = txn_global->oldest_timestamp; + } else if (WT_STRING_MATCH("oldest_reader", cval.str, cval.len)) + WT_RET(__wt_txn_get_pinned_timestamp(session, &ts, WT_TXN_TS_INCLUDE_CKPT)); + else if (WT_STRING_MATCH("pinned", cval.str, cval.len)) + WT_RET(__wt_txn_get_pinned_timestamp( + session, &ts, WT_TXN_TS_INCLUDE_CKPT | WT_TXN_TS_INCLUDE_OLDEST)); + else if (WT_STRING_MATCH("recovery", cval.str, cval.len)) + /* Read-only value forever. No lock needed. */ + ts = txn_global->recovery_timestamp; + else if (WT_STRING_MATCH("stable", cval.str, cval.len)) { + if (!txn_global->has_stable_timestamp) + return (WT_NOTFOUND); + ts = txn_global->stable_timestamp; + } else + WT_RET_MSG(session, EINVAL, "unknown timestamp query %.*s", (int)cval.len, cval.str); + +done: + *tsp = ts; + return (0); } /* * __txn_query_timestamp -- - * Query a timestamp within this session's transaction. + * Query a timestamp within this session's transaction. 
*/ static int -__txn_query_timestamp( - WT_SESSION_IMPL *session, wt_timestamp_t *tsp, const char *cfg[]) +__txn_query_timestamp(WT_SESSION_IMPL *session, wt_timestamp_t *tsp, const char *cfg[]) { - WT_CONFIG_ITEM cval; - WT_TXN *txn; - - txn = &session->txn; - - WT_STAT_CONN_INCR(session, session_query_ts); - if (!F_ISSET(txn, WT_TXN_RUNNING)) - return (WT_NOTFOUND); - - WT_RET(__wt_config_gets(session, cfg, "get", &cval)); - if (WT_STRING_MATCH("commit", cval.str, cval.len)) - *tsp = txn->commit_timestamp; - else if (WT_STRING_MATCH("first_commit", cval.str, cval.len)) - *tsp = txn->first_commit_timestamp; - else if (WT_STRING_MATCH("prepare", cval.str, cval.len)) - *tsp = txn->prepare_timestamp; - else if (WT_STRING_MATCH("read", cval.str, cval.len)) - *tsp = txn->read_timestamp; - else - WT_RET_MSG(session, EINVAL, - "unknown timestamp query %.*s", (int)cval.len, cval.str); - - return (0); + WT_CONFIG_ITEM cval; + WT_TXN *txn; + + txn = &session->txn; + + WT_STAT_CONN_INCR(session, session_query_ts); + if (!F_ISSET(txn, WT_TXN_RUNNING)) + return (WT_NOTFOUND); + + WT_RET(__wt_config_gets(session, cfg, "get", &cval)); + if (WT_STRING_MATCH("commit", cval.str, cval.len)) + *tsp = txn->commit_timestamp; + else if (WT_STRING_MATCH("first_commit", cval.str, cval.len)) + *tsp = txn->first_commit_timestamp; + else if (WT_STRING_MATCH("prepare", cval.str, cval.len)) + *tsp = txn->prepare_timestamp; + else if (WT_STRING_MATCH("read", cval.str, cval.len)) + *tsp = txn->read_timestamp; + else + WT_RET_MSG(session, EINVAL, "unknown timestamp query %.*s", (int)cval.len, cval.str); + + return (0); } /* * __wt_txn_query_timestamp -- - * Query a timestamp. The caller may query the global transaction or the - * session's transaction. + * Query a timestamp. The caller may query the global transaction or the session's transaction. 
*/ int -__wt_txn_query_timestamp(WT_SESSION_IMPL *session, - char *hex_timestamp, const char *cfg[], bool global_txn) +__wt_txn_query_timestamp( + WT_SESSION_IMPL *session, char *hex_timestamp, const char *cfg[], bool global_txn) { - wt_timestamp_t ts; + wt_timestamp_t ts; - if (global_txn) - WT_RET(__txn_global_query_timestamp(session, &ts, cfg)); - else - WT_RET(__txn_query_timestamp(session, &ts, cfg)); + if (global_txn) + WT_RET(__txn_global_query_timestamp(session, &ts, cfg)); + else + WT_RET(__txn_query_timestamp(session, &ts, cfg)); - __wt_timestamp_to_hex_string(ts, hex_timestamp); - return (0); + __wt_timestamp_to_hex_string(ts, hex_timestamp); + return (0); } /* * __wt_txn_update_pinned_timestamp -- - * Update the pinned timestamp (the oldest timestamp that has to be - * maintained for current or future readers). + * Update the pinned timestamp (the oldest timestamp that has to be maintained for current or + * future readers). */ int __wt_txn_update_pinned_timestamp(WT_SESSION_IMPL *session, bool force) { - WT_DECL_RET; - WT_TXN_GLOBAL *txn_global; - wt_timestamp_t last_pinned_timestamp, pinned_timestamp; - - txn_global = &S2C(session)->txn_global; - - /* Skip locking and scanning when the oldest timestamp is pinned. */ - if (txn_global->oldest_is_pinned) - return (0); - - /* Scan to find the global pinned timestamp. */ - if ((ret = __wt_txn_get_pinned_timestamp( - session, &pinned_timestamp, WT_TXN_TS_INCLUDE_OLDEST)) != 0) - return (ret == WT_NOTFOUND ? 0 : ret); - - if (txn_global->has_pinned_timestamp && !force) { - last_pinned_timestamp = txn_global->pinned_timestamp; - - if (pinned_timestamp <= last_pinned_timestamp) - return (0); - } - - __wt_writelock(session, &txn_global->rwlock); - /* - * Scan the global pinned timestamp again, it's possible that it got - * changed after the previous scan. 
- */ - if ((ret = __wt_txn_get_pinned_timestamp(session, &pinned_timestamp, - WT_TXN_TS_ALREADY_LOCKED | WT_TXN_TS_INCLUDE_OLDEST)) != 0) { - __wt_writeunlock(session, &txn_global->rwlock); - return (ret == WT_NOTFOUND ? 0 : ret); - } - - if (!txn_global->has_pinned_timestamp || force || - txn_global->pinned_timestamp < pinned_timestamp) { - txn_global->pinned_timestamp = pinned_timestamp; - txn_global->has_pinned_timestamp = true; - txn_global->oldest_is_pinned = - txn_global->pinned_timestamp == - txn_global->oldest_timestamp; - txn_global->stable_is_pinned = - txn_global->pinned_timestamp == - txn_global->stable_timestamp; - __wt_verbose_timestamp(session, - pinned_timestamp, "Updated pinned timestamp"); - } - __wt_writeunlock(session, &txn_global->rwlock); - - return (0); + WT_DECL_RET; + WT_TXN_GLOBAL *txn_global; + wt_timestamp_t last_pinned_timestamp, pinned_timestamp; + + txn_global = &S2C(session)->txn_global; + + /* Skip locking and scanning when the oldest timestamp is pinned. */ + if (txn_global->oldest_is_pinned) + return (0); + + /* Scan to find the global pinned timestamp. */ + if ((ret = __wt_txn_get_pinned_timestamp( + session, &pinned_timestamp, WT_TXN_TS_INCLUDE_OLDEST)) != 0) + return (ret == WT_NOTFOUND ? 0 : ret); + + if (txn_global->has_pinned_timestamp && !force) { + last_pinned_timestamp = txn_global->pinned_timestamp; + + if (pinned_timestamp <= last_pinned_timestamp) + return (0); + } + + __wt_writelock(session, &txn_global->rwlock); + /* + * Scan the global pinned timestamp again, it's possible that it got changed after the previous + * scan. + */ + if ((ret = __wt_txn_get_pinned_timestamp( + session, &pinned_timestamp, WT_TXN_TS_ALREADY_LOCKED | WT_TXN_TS_INCLUDE_OLDEST)) != 0) { + __wt_writeunlock(session, &txn_global->rwlock); + return (ret == WT_NOTFOUND ? 
0 : ret); + } + + if (!txn_global->has_pinned_timestamp || force || + txn_global->pinned_timestamp < pinned_timestamp) { + txn_global->pinned_timestamp = pinned_timestamp; + txn_global->has_pinned_timestamp = true; + txn_global->oldest_is_pinned = txn_global->pinned_timestamp == txn_global->oldest_timestamp; + txn_global->stable_is_pinned = txn_global->pinned_timestamp == txn_global->stable_timestamp; + __wt_verbose_timestamp(session, pinned_timestamp, "Updated pinned timestamp"); + } + __wt_writeunlock(session, &txn_global->rwlock); + + return (0); } /* * __wt_txn_global_set_timestamp -- - * Set a global transaction timestamp. + * Set a global transaction timestamp. */ int __wt_txn_global_set_timestamp(WT_SESSION_IMPL *session, const char *cfg[]) { - WT_CONFIG_ITEM cval; - WT_CONFIG_ITEM durable_cval, oldest_cval, stable_cval; - WT_TXN_GLOBAL *txn_global; - wt_timestamp_t durable_ts, oldest_ts, stable_ts; - wt_timestamp_t last_oldest_ts, last_stable_ts; - char ts_string[2][WT_TS_INT_STRING_SIZE]; - bool force, has_durable, has_oldest, has_stable; - - txn_global = &S2C(session)->txn_global; - - WT_STAT_CONN_INCR(session, txn_set_ts); - - /* - * TODO: When we remove all_committed, we need to remove this too. - * For now, we're temporarily aliasing the global commit timestamp to - * the global durable timestamp. 
- */ - WT_RET(__wt_config_gets_def(session, - cfg, "commit_timestamp", 0, &durable_cval)); - has_durable = durable_cval.len != 0; - if (has_durable) - WT_STAT_CONN_INCR(session, txn_set_ts_durable); - - if (!has_durable) { - WT_RET(__wt_config_gets_def(session, - cfg, "durable_timestamp", 0, &durable_cval)); - has_durable = durable_cval.len != 0; - if (has_durable) - WT_STAT_CONN_INCR(session, txn_set_ts_durable); - } - - WT_RET(__wt_config_gets_def(session, - cfg, "oldest_timestamp", 0, &oldest_cval)); - has_oldest = oldest_cval.len != 0; - if (has_oldest) - WT_STAT_CONN_INCR(session, txn_set_ts_oldest); - - WT_RET(__wt_config_gets_def(session, - cfg, "stable_timestamp", 0, &stable_cval)); - has_stable = stable_cval.len != 0; - if (has_stable) - WT_STAT_CONN_INCR(session, txn_set_ts_stable); - - /* If no timestamp was supplied, there's nothing to do. */ - if (!has_durable && !has_oldest && !has_stable) - return (0); - - /* - * Parsing will initialize the timestamp to zero even if - * it is not configured. - */ - WT_RET(__wt_txn_parse_timestamp( - session, "durable", &durable_ts, &durable_cval)); - WT_RET(__wt_txn_parse_timestamp( - session, "oldest", &oldest_ts, &oldest_cval)); - WT_RET(__wt_txn_parse_timestamp( - session, "stable", &stable_ts, &stable_cval)); - - WT_RET(__wt_config_gets_def(session, - cfg, "force", 0, &cval)); - force = cval.val != 0; - - if (force) - goto set; - - __wt_readlock(session, &txn_global->rwlock); - - last_oldest_ts = txn_global->oldest_timestamp; - last_stable_ts = txn_global->stable_timestamp; - - /* - * First do error checking on the timestamp values. The - * oldest timestamp must always be less than or equal to - * the stable timestamp. If we're only setting one - * then compare against the system timestamp. If we're - * setting both then compare the passed in values. 
- */ - if (!has_durable && txn_global->has_durable_timestamp) - durable_ts = txn_global->durable_timestamp; - if (!has_oldest && txn_global->has_oldest_timestamp) - oldest_ts = last_oldest_ts; - if (!has_stable && txn_global->has_stable_timestamp) - stable_ts = last_stable_ts; - - /* - * If a durable timestamp was supplied, check that it is no older than - * either the stable timestamp or the oldest timestamp. - */ - if (has_durable && (has_oldest || - txn_global->has_oldest_timestamp) && oldest_ts > durable_ts) { - __wt_readunlock(session, &txn_global->rwlock); - WT_RET_MSG(session, EINVAL, - "set_timestamp: oldest timestamp %s must not be later than " - "durable timestamp %s", - __wt_timestamp_to_string(oldest_ts, ts_string[0]), - __wt_timestamp_to_string(durable_ts, ts_string[1])); - } - - if (has_durable && (has_stable || - txn_global->has_stable_timestamp) && stable_ts > durable_ts) { - __wt_readunlock(session, &txn_global->rwlock); - WT_RET_MSG(session, EINVAL, - "set_timestamp: stable timestamp %s must not be later than " - "durable timestamp %s", - __wt_timestamp_to_string(stable_ts, ts_string[0]), - __wt_timestamp_to_string(durable_ts, ts_string[1])); - } - - /* - * The oldest and stable timestamps must always satisfy the condition - * that oldest <= stable. - */ - if ((has_oldest || has_stable) && - (has_oldest || txn_global->has_oldest_timestamp) && - (has_stable || - txn_global->has_stable_timestamp) && oldest_ts > stable_ts) { - __wt_readunlock(session, &txn_global->rwlock); - WT_RET_MSG(session, EINVAL, - "set_timestamp: oldest timestamp %s must not be later than " - "stable timestamp %s", - __wt_timestamp_to_string(oldest_ts, ts_string[0]), - __wt_timestamp_to_string(stable_ts, ts_string[1])); - } - - __wt_readunlock(session, &txn_global->rwlock); - - /* Check if we are actually updating anything. 
*/ - if (has_oldest && - txn_global->has_oldest_timestamp && oldest_ts <= last_oldest_ts) - has_oldest = false; - - if (has_stable && - txn_global->has_stable_timestamp && stable_ts <= last_stable_ts) - has_stable = false; - - if (!has_durable && !has_oldest && !has_stable) - return (0); - -set: __wt_writelock(session, &txn_global->rwlock); - /* - * This method can be called from multiple threads, check that we are - * moving the global timestamps forwards. - * - * The exception is the durable timestamp, where the application can - * move it backwards (in fact, it only really makes sense to explicitly - * move it backwards because it otherwise tracks the largest - * durable_timestamp so it moves forward whenever transactions are - * assigned timestamps). - */ - if (has_durable) { - txn_global->durable_timestamp = durable_ts; - txn_global->has_durable_timestamp = true; - WT_STAT_CONN_INCR(session, txn_set_ts_durable_upd); - __wt_verbose_timestamp(session, durable_ts, - "Updated global durable timestamp"); - } - - if (has_oldest && (!txn_global->has_oldest_timestamp || force || - oldest_ts > txn_global->oldest_timestamp)) { - txn_global->oldest_timestamp = oldest_ts; - WT_STAT_CONN_INCR(session, txn_set_ts_oldest_upd); - txn_global->has_oldest_timestamp = true; - txn_global->oldest_is_pinned = false; - __wt_verbose_timestamp(session, oldest_ts, - "Updated global oldest timestamp"); - } - - if (has_stable && (!txn_global->has_stable_timestamp || force || - stable_ts > txn_global->stable_timestamp)) { - txn_global->stable_timestamp = stable_ts; - WT_STAT_CONN_INCR(session, txn_set_ts_stable_upd); - txn_global->has_stable_timestamp = true; - txn_global->stable_is_pinned = false; - __wt_verbose_timestamp(session, stable_ts, - "Updated global stable timestamp"); - } - __wt_writeunlock(session, &txn_global->rwlock); - - if (has_oldest || has_stable) - WT_RET(__wt_txn_update_pinned_timestamp(session, force)); - - return (0); + WT_CONFIG_ITEM cval; + WT_CONFIG_ITEM 
durable_cval, oldest_cval, stable_cval; + WT_TXN_GLOBAL *txn_global; + wt_timestamp_t durable_ts, oldest_ts, stable_ts; + wt_timestamp_t last_oldest_ts, last_stable_ts; + char ts_string[2][WT_TS_INT_STRING_SIZE]; + bool force, has_durable, has_oldest, has_stable; + + txn_global = &S2C(session)->txn_global; + + WT_STAT_CONN_INCR(session, txn_set_ts); + + /* + * TODO: When we remove all_committed, we need to remove this too. For now, we're temporarily + * aliasing the global commit timestamp to the global durable timestamp. + */ + WT_RET(__wt_config_gets_def(session, cfg, "commit_timestamp", 0, &durable_cval)); + has_durable = durable_cval.len != 0; + if (has_durable) + WT_STAT_CONN_INCR(session, txn_set_ts_durable); + + if (!has_durable) { + WT_RET(__wt_config_gets_def(session, cfg, "durable_timestamp", 0, &durable_cval)); + has_durable = durable_cval.len != 0; + if (has_durable) + WT_STAT_CONN_INCR(session, txn_set_ts_durable); + } + + WT_RET(__wt_config_gets_def(session, cfg, "oldest_timestamp", 0, &oldest_cval)); + has_oldest = oldest_cval.len != 0; + if (has_oldest) + WT_STAT_CONN_INCR(session, txn_set_ts_oldest); + + WT_RET(__wt_config_gets_def(session, cfg, "stable_timestamp", 0, &stable_cval)); + has_stable = stable_cval.len != 0; + if (has_stable) + WT_STAT_CONN_INCR(session, txn_set_ts_stable); + + /* If no timestamp was supplied, there's nothing to do. */ + if (!has_durable && !has_oldest && !has_stable) + return (0); + + /* + * Parsing will initialize the timestamp to zero even if it is not configured. 
+ */ + WT_RET(__wt_txn_parse_timestamp(session, "durable", &durable_ts, &durable_cval)); + WT_RET(__wt_txn_parse_timestamp(session, "oldest", &oldest_ts, &oldest_cval)); + WT_RET(__wt_txn_parse_timestamp(session, "stable", &stable_ts, &stable_cval)); + + WT_RET(__wt_config_gets_def(session, cfg, "force", 0, &cval)); + force = cval.val != 0; + + if (force) + goto set; + + __wt_readlock(session, &txn_global->rwlock); + + last_oldest_ts = txn_global->oldest_timestamp; + last_stable_ts = txn_global->stable_timestamp; + + /* + * First do error checking on the timestamp values. The oldest timestamp must always be less + * than or equal to the stable timestamp. If we're only setting one then compare against the + * system timestamp. If we're setting both then compare the passed in values. + */ + if (!has_durable && txn_global->has_durable_timestamp) + durable_ts = txn_global->durable_timestamp; + if (!has_oldest && txn_global->has_oldest_timestamp) + oldest_ts = last_oldest_ts; + if (!has_stable && txn_global->has_stable_timestamp) + stable_ts = last_stable_ts; + + /* + * If a durable timestamp was supplied, check that it is no older than either the stable + * timestamp or the oldest timestamp. 
+ */ + if (has_durable && (has_oldest || txn_global->has_oldest_timestamp) && oldest_ts > durable_ts) { + __wt_readunlock(session, &txn_global->rwlock); + WT_RET_MSG(session, EINVAL, + "set_timestamp: oldest timestamp %s must not be later than " + "durable timestamp %s", + __wt_timestamp_to_string(oldest_ts, ts_string[0]), + __wt_timestamp_to_string(durable_ts, ts_string[1])); + } + + if (has_durable && (has_stable || txn_global->has_stable_timestamp) && stable_ts > durable_ts) { + __wt_readunlock(session, &txn_global->rwlock); + WT_RET_MSG(session, EINVAL, + "set_timestamp: stable timestamp %s must not be later than " + "durable timestamp %s", + __wt_timestamp_to_string(stable_ts, ts_string[0]), + __wt_timestamp_to_string(durable_ts, ts_string[1])); + } + + /* + * The oldest and stable timestamps must always satisfy the condition that oldest <= stable. + */ + if ((has_oldest || has_stable) && (has_oldest || txn_global->has_oldest_timestamp) && + (has_stable || txn_global->has_stable_timestamp) && oldest_ts > stable_ts) { + __wt_readunlock(session, &txn_global->rwlock); + WT_RET_MSG(session, EINVAL, + "set_timestamp: oldest timestamp %s must not be later than " + "stable timestamp %s", + __wt_timestamp_to_string(oldest_ts, ts_string[0]), + __wt_timestamp_to_string(stable_ts, ts_string[1])); + } + + __wt_readunlock(session, &txn_global->rwlock); + + /* Check if we are actually updating anything. */ + if (has_oldest && txn_global->has_oldest_timestamp && oldest_ts <= last_oldest_ts) + has_oldest = false; + + if (has_stable && txn_global->has_stable_timestamp && stable_ts <= last_stable_ts) + has_stable = false; + + if (!has_durable && !has_oldest && !has_stable) + return (0); + +set: + __wt_writelock(session, &txn_global->rwlock); + /* + * This method can be called from multiple threads, check that we are + * moving the global timestamps forwards. 
+ * + * The exception is the durable timestamp, where the application can + * move it backwards (in fact, it only really makes sense to explicitly + * move it backwards because it otherwise tracks the largest + * durable_timestamp so it moves forward whenever transactions are + * assigned timestamps). + */ + if (has_durable) { + txn_global->durable_timestamp = durable_ts; + txn_global->has_durable_timestamp = true; + WT_STAT_CONN_INCR(session, txn_set_ts_durable_upd); + __wt_verbose_timestamp(session, durable_ts, "Updated global durable timestamp"); + } + + if (has_oldest && + (!txn_global->has_oldest_timestamp || force || oldest_ts > txn_global->oldest_timestamp)) { + txn_global->oldest_timestamp = oldest_ts; + WT_STAT_CONN_INCR(session, txn_set_ts_oldest_upd); + txn_global->has_oldest_timestamp = true; + txn_global->oldest_is_pinned = false; + __wt_verbose_timestamp(session, oldest_ts, "Updated global oldest timestamp"); + } + + if (has_stable && + (!txn_global->has_stable_timestamp || force || stable_ts > txn_global->stable_timestamp)) { + txn_global->stable_timestamp = stable_ts; + WT_STAT_CONN_INCR(session, txn_set_ts_stable_upd); + txn_global->has_stable_timestamp = true; + txn_global->stable_is_pinned = false; + __wt_verbose_timestamp(session, stable_ts, "Updated global stable timestamp"); + } + __wt_writeunlock(session, &txn_global->rwlock); + + if (has_oldest || has_stable) + WT_RET(__wt_txn_update_pinned_timestamp(session, force)); + + return (0); } /* * __txn_assert_after_reads -- - * Assert that commit and prepare timestamps are greater than the latest - * active read timestamp, if any. + * Assert that commit and prepare timestamps are greater than the latest active read timestamp, + * if any. 
*/ static int __txn_assert_after_reads( - WT_SESSION_IMPL *session, const char *op, wt_timestamp_t ts, WT_TXN **prevp) + WT_SESSION_IMPL *session, const char *op, wt_timestamp_t ts, WT_TXN **prevp) { #ifdef HAVE_DIAGNOSTIC - WT_TXN *prev, *txn = &session->txn; - WT_TXN_GLOBAL *txn_global = &S2C(session)->txn_global; - wt_timestamp_t tmp_timestamp; - char ts_string[2][WT_TS_INT_STRING_SIZE]; - - __wt_readlock(session, &txn_global->read_timestamp_rwlock); - prev = TAILQ_LAST( - &txn_global->read_timestamph, __wt_txn_rts_qh); - while (prev != NULL) { - /* - * Skip self and non-active transactions. Copy out value of - * read timestamp to prevent possible race where a transaction - * resets its read timestamp while we traverse the queue. - */ - if (!__txn_get_read_timestamp(prev, &tmp_timestamp) || - prev == txn) { - prev = TAILQ_PREV( - prev, __wt_txn_rts_qh, read_timestampq); - continue; - } - - if (tmp_timestamp >= ts) { - __wt_readunlock(session, - &txn_global->read_timestamp_rwlock); - WT_RET_MSG(session, EINVAL, - "%s timestamp %s must be greater than the " - "latest active read timestamp %s ", - op, - __wt_timestamp_to_string(ts, ts_string[0]), - __wt_timestamp_to_string( - tmp_timestamp, ts_string[1])); - } - break; - } - - __wt_readunlock(session, &txn_global->read_timestamp_rwlock); - - if (prevp != NULL) - *prevp = prev; + WT_TXN *prev, *txn = &session->txn; + WT_TXN_GLOBAL *txn_global = &S2C(session)->txn_global; + wt_timestamp_t tmp_timestamp; + char ts_string[2][WT_TS_INT_STRING_SIZE]; + + __wt_readlock(session, &txn_global->read_timestamp_rwlock); + prev = TAILQ_LAST(&txn_global->read_timestamph, __wt_txn_rts_qh); + while (prev != NULL) { + /* + * Skip self and non-active transactions. Copy out value of read timestamp to prevent + * possible race where a transaction resets its read timestamp while we traverse the queue. 
+ */ + if (!__txn_get_read_timestamp(prev, &tmp_timestamp) || prev == txn) { + prev = TAILQ_PREV(prev, __wt_txn_rts_qh, read_timestampq); + continue; + } + + if (tmp_timestamp >= ts) { + __wt_readunlock(session, &txn_global->read_timestamp_rwlock); + WT_RET_MSG(session, EINVAL, + "%s timestamp %s must be greater than the " + "latest active read timestamp %s ", + op, __wt_timestamp_to_string(ts, ts_string[0]), + __wt_timestamp_to_string(tmp_timestamp, ts_string[1])); + } + break; + } + + __wt_readunlock(session, &txn_global->read_timestamp_rwlock); + + if (prevp != NULL) + *prevp = prev; #else - WT_UNUSED(session); - WT_UNUSED(op); - WT_UNUSED(ts); - WT_UNUSED(prevp); + WT_UNUSED(session); + WT_UNUSED(op); + WT_UNUSED(ts); + WT_UNUSED(prevp); #endif - return (0); + return (0); } /* * __wt_txn_set_commit_timestamp -- - * Validate the commit timestamp of a transaction. - * If the commit timestamp is less than the oldest timestamp and - * transaction is configured to roundup timestamps of a prepared - * transaction, then we will roundup the commit timestamp to the prepare - * timestamp of the transaction. + * Validate the commit timestamp of a transaction. If the commit timestamp is less than the + * oldest timestamp and transaction is configured to roundup timestamps of a prepared + * transaction, then we will roundup the commit timestamp to the prepare timestamp of the + * transaction. */ int -__wt_txn_set_commit_timestamp( - WT_SESSION_IMPL *session, wt_timestamp_t commit_ts) +__wt_txn_set_commit_timestamp(WT_SESSION_IMPL *session, wt_timestamp_t commit_ts) { - WT_TXN *txn = &session->txn; - WT_TXN_GLOBAL *txn_global = &S2C(session)->txn_global; - wt_timestamp_t oldest_ts, stable_ts; - char ts_string[2][WT_TS_INT_STRING_SIZE]; - bool has_oldest_ts, has_stable_ts; - - /* Added this redundant initialization to circumvent build failure. 
*/ - oldest_ts = stable_ts = WT_TS_NONE; - - if (txn->isolation != WT_ISO_SNAPSHOT) - WT_RET_MSG(session, EINVAL, "setting a commit_timestamp" - " requires a transaction running at snapshot" - " isolation"); - - /* - * Compare against the oldest and the stable timestamp. Return an error - * if the given timestamp is less than oldest and/or stable timestamp. - */ - has_oldest_ts = txn_global->has_oldest_timestamp; - if (has_oldest_ts) - oldest_ts = txn_global->oldest_timestamp; - has_stable_ts = txn_global->has_stable_timestamp; - if (has_stable_ts) - stable_ts = txn_global->stable_timestamp; - - if (!F_ISSET(txn, WT_TXN_HAS_TS_PREPARE)) { - /* - * For a non-prepared transactions the commit timestamp should - * not be less than the stable timestamp. - */ - if (has_oldest_ts && commit_ts < oldest_ts) - WT_RET_MSG(session, EINVAL, - "commit timestamp %s is less than the oldest " - "timestamp %s", - __wt_timestamp_to_string(commit_ts, ts_string[0]), - __wt_timestamp_to_string(oldest_ts, ts_string[1])); - - if (has_stable_ts && commit_ts < stable_ts) - WT_RET_MSG(session, EINVAL, - "commit timestamp %s is less than the stable " - "timestamp %s", - __wt_timestamp_to_string(commit_ts, ts_string[0]), - __wt_timestamp_to_string(stable_ts, ts_string[1])); - - /* - * Compare against the commit timestamp of the current - * transaction. Return an error if the given timestamp is - * older than the first commit timestamp. - */ - if (F_ISSET(txn, WT_TXN_HAS_TS_COMMIT) && - commit_ts < txn->first_commit_timestamp) - WT_RET_MSG(session, EINVAL, - "commit timestamp %s older than the first " - "commit timestamp %s for this transaction", - __wt_timestamp_to_string(commit_ts, ts_string[0]), - __wt_timestamp_to_string( - txn->first_commit_timestamp, ts_string[1])); - - /* - * FIXME: - * WT-4779 disabled to buy time to understand a test failure. 
- * WT_RET(__txn_assert_after_reads( - * session, "commit", commit_ts, NULL)); - */ - } else { - /* - * For a prepared transaction, the commit timestamp should not - * be less than the prepare timestamp. - */ - if (txn->prepare_timestamp > commit_ts) { - if (!F_ISSET(txn, WT_TXN_TS_ROUND_PREPARED)) - WT_RET_MSG(session, EINVAL, - "commit timestamp %s is less than the " - "prepare timestamp %s for this transaction", - __wt_timestamp_to_string( - commit_ts, ts_string[0]), - __wt_timestamp_to_string( - txn->prepare_timestamp, ts_string[1])); - commit_ts = txn->prepare_timestamp; - } - } - - WT_ASSERT(session, !F_ISSET(txn, WT_TXN_HAS_TS_DURABLE) || - txn->durable_timestamp == txn->commit_timestamp); - txn->commit_timestamp = commit_ts; - /* - * First time copy the commit timestamp to the first commit timestamp. - */ - if (!F_ISSET(txn, WT_TXN_HAS_TS_COMMIT)) - txn->first_commit_timestamp = commit_ts; - - /* - * Only mirror the commit timestamp if there isn't already an explicit - * durable timestamp. This might happen if we set a commit timestamp, - * set a durable timestamp and then subsequently set the commit - * timestamp again. - */ - if (!F_ISSET(txn, WT_TXN_HAS_TS_DURABLE)) - txn->durable_timestamp = commit_ts; - - F_SET(txn, WT_TXN_HAS_TS_COMMIT); - return (0); + WT_TXN *txn = &session->txn; + WT_TXN_GLOBAL *txn_global = &S2C(session)->txn_global; + wt_timestamp_t oldest_ts, stable_ts; + char ts_string[2][WT_TS_INT_STRING_SIZE]; + bool has_oldest_ts, has_stable_ts; + + /* Added this redundant initialization to circumvent build failure. */ + oldest_ts = stable_ts = WT_TS_NONE; + + if (txn->isolation != WT_ISO_SNAPSHOT) + WT_RET_MSG(session, EINVAL, + "setting a commit_timestamp" + " requires a transaction running at snapshot" + " isolation"); + + /* + * Compare against the oldest and the stable timestamp. Return an error if the given timestamp + * is less than oldest and/or stable timestamp. 
+ */ + has_oldest_ts = txn_global->has_oldest_timestamp; + if (has_oldest_ts) + oldest_ts = txn_global->oldest_timestamp; + has_stable_ts = txn_global->has_stable_timestamp; + if (has_stable_ts) + stable_ts = txn_global->stable_timestamp; + + if (!F_ISSET(txn, WT_TXN_HAS_TS_PREPARE)) { + /* + * For a non-prepared transactions the commit timestamp should not be less than the stable + * timestamp. + */ + if (has_oldest_ts && commit_ts < oldest_ts) + WT_RET_MSG(session, EINVAL, + "commit timestamp %s is less than the oldest " + "timestamp %s", + __wt_timestamp_to_string(commit_ts, ts_string[0]), + __wt_timestamp_to_string(oldest_ts, ts_string[1])); + + if (has_stable_ts && commit_ts < stable_ts) + WT_RET_MSG(session, EINVAL, + "commit timestamp %s is less than the stable " + "timestamp %s", + __wt_timestamp_to_string(commit_ts, ts_string[0]), + __wt_timestamp_to_string(stable_ts, ts_string[1])); + + /* + * Compare against the commit timestamp of the current transaction. Return an error if the + * given timestamp is older than the first commit timestamp. + */ + if (F_ISSET(txn, WT_TXN_HAS_TS_COMMIT) && commit_ts < txn->first_commit_timestamp) + WT_RET_MSG(session, EINVAL, + "commit timestamp %s older than the first " + "commit timestamp %s for this transaction", + __wt_timestamp_to_string(commit_ts, ts_string[0]), + __wt_timestamp_to_string(txn->first_commit_timestamp, ts_string[1])); + + /* + * FIXME: + * WT-4779 disabled to buy time to understand a test failure. + * WT_RET(__txn_assert_after_reads( + * session, "commit", commit_ts, NULL)); + */ + } else { + /* + * For a prepared transaction, the commit timestamp should not be less than the prepare + * timestamp. 
+ */ + if (txn->prepare_timestamp > commit_ts) { + if (!F_ISSET(txn, WT_TXN_TS_ROUND_PREPARED)) + WT_RET_MSG(session, EINVAL, + "commit timestamp %s is less than the " + "prepare timestamp %s for this transaction", + __wt_timestamp_to_string(commit_ts, ts_string[0]), + __wt_timestamp_to_string(txn->prepare_timestamp, ts_string[1])); + commit_ts = txn->prepare_timestamp; + } + } + + WT_ASSERT(session, + !F_ISSET(txn, WT_TXN_HAS_TS_DURABLE) || txn->durable_timestamp == txn->commit_timestamp); + txn->commit_timestamp = commit_ts; + /* + * First time copy the commit timestamp to the first commit timestamp. + */ + if (!F_ISSET(txn, WT_TXN_HAS_TS_COMMIT)) + txn->first_commit_timestamp = commit_ts; + + /* + * Only mirror the commit timestamp if there isn't already an explicit durable timestamp. This + * might happen if we set a commit timestamp, set a durable timestamp and then subsequently set + * the commit timestamp again. + */ + if (!F_ISSET(txn, WT_TXN_HAS_TS_DURABLE)) + txn->durable_timestamp = commit_ts; + + F_SET(txn, WT_TXN_HAS_TS_COMMIT); + return (0); } /* * __wt_txn_set_durable_timestamp -- - * Validate the durable timestamp of a transaction. + * Validate the durable timestamp of a transaction. */ int -__wt_txn_set_durable_timestamp( - WT_SESSION_IMPL *session, wt_timestamp_t durable_ts) +__wt_txn_set_durable_timestamp(WT_SESSION_IMPL *session, wt_timestamp_t durable_ts) { - WT_TXN *txn = &session->txn; - WT_TXN_GLOBAL *txn_global = &S2C(session)->txn_global; - wt_timestamp_t oldest_ts, stable_ts; - char ts_string[2][WT_TS_INT_STRING_SIZE]; - bool has_oldest_ts, has_stable_ts; - - /* Added this redundant initialization to circumvent build failure. 
*/ - oldest_ts = stable_ts = 0; - - if (!F_ISSET(txn, WT_TXN_PREPARE)) - WT_RET_MSG(session, EINVAL, - "durable timestamp should not be specified for " - "non-prepared transaction"); - - if (!F_ISSET(txn, WT_TXN_HAS_TS_COMMIT)) - WT_RET_MSG(session, EINVAL, - "commit timestamp is needed before the durable timestamp"); - - /* - * Compare against the oldest and the stable timestamp. Return an error - * if the given timestamp is less than oldest and/or stable timestamp. - */ - has_oldest_ts = txn_global->has_oldest_timestamp; - if (has_oldest_ts) - oldest_ts = txn_global->oldest_timestamp; - has_stable_ts = txn_global->has_stable_timestamp; - if (has_stable_ts) - stable_ts = txn_global->stable_timestamp; - - /* - * For a non-prepared transactions the commit timestamp should - * not be less than the stable timestamp. - */ - if (has_oldest_ts && durable_ts < oldest_ts) - WT_RET_MSG(session, EINVAL, - "durable timestamp %s is less than the oldest timestamp %s", - __wt_timestamp_to_string(durable_ts, ts_string[0]), - __wt_timestamp_to_string(oldest_ts, ts_string[1])); - - if (has_stable_ts && durable_ts < stable_ts) - WT_RET_MSG(session, EINVAL, - "durable timestamp %s is less than the stable timestamp %s", - __wt_timestamp_to_string(durable_ts, ts_string[0]), - __wt_timestamp_to_string(stable_ts, ts_string[1])); - - /* Check if the durable timestamp is less than the commit timestamp. 
*/ - if (durable_ts < txn->commit_timestamp) - WT_RET_MSG(session, EINVAL, - "durable timestamp %s is less than the commit timestamp %s " - "for this transaction", - __wt_timestamp_to_string(durable_ts, ts_string[0]), - __wt_timestamp_to_string( - txn->commit_timestamp, ts_string[1])); - - txn->durable_timestamp = durable_ts; - F_SET(txn, WT_TXN_HAS_TS_DURABLE); - - return (0); + WT_TXN *txn = &session->txn; + WT_TXN_GLOBAL *txn_global = &S2C(session)->txn_global; + wt_timestamp_t oldest_ts, stable_ts; + char ts_string[2][WT_TS_INT_STRING_SIZE]; + bool has_oldest_ts, has_stable_ts; + + /* Added this redundant initialization to circumvent build failure. */ + oldest_ts = stable_ts = 0; + + if (!F_ISSET(txn, WT_TXN_PREPARE)) + WT_RET_MSG(session, EINVAL, + "durable timestamp should not be specified for " + "non-prepared transaction"); + + if (!F_ISSET(txn, WT_TXN_HAS_TS_COMMIT)) + WT_RET_MSG(session, EINVAL, "commit timestamp is needed before the durable timestamp"); + + /* + * Compare against the oldest and the stable timestamp. Return an error if the given timestamp + * is less than oldest and/or stable timestamp. + */ + has_oldest_ts = txn_global->has_oldest_timestamp; + if (has_oldest_ts) + oldest_ts = txn_global->oldest_timestamp; + has_stable_ts = txn_global->has_stable_timestamp; + if (has_stable_ts) + stable_ts = txn_global->stable_timestamp; + + /* + * For a non-prepared transactions the commit timestamp should not be less than the stable + * timestamp. 
+ */ + if (has_oldest_ts && durable_ts < oldest_ts) + WT_RET_MSG(session, EINVAL, "durable timestamp %s is less than the oldest timestamp %s", + __wt_timestamp_to_string(durable_ts, ts_string[0]), + __wt_timestamp_to_string(oldest_ts, ts_string[1])); + + if (has_stable_ts && durable_ts < stable_ts) + WT_RET_MSG(session, EINVAL, "durable timestamp %s is less than the stable timestamp %s", + __wt_timestamp_to_string(durable_ts, ts_string[0]), + __wt_timestamp_to_string(stable_ts, ts_string[1])); + + /* Check if the durable timestamp is less than the commit timestamp. */ + if (durable_ts < txn->commit_timestamp) + WT_RET_MSG(session, EINVAL, + "durable timestamp %s is less than the commit timestamp %s " + "for this transaction", + __wt_timestamp_to_string(durable_ts, ts_string[0]), + __wt_timestamp_to_string(txn->commit_timestamp, ts_string[1])); + + txn->durable_timestamp = durable_ts; + F_SET(txn, WT_TXN_HAS_TS_DURABLE); + + return (0); } /* * __wt_txn_set_prepare_timestamp -- - * Validate and set the prepare timestamp of a transaction. + * Validate and set the prepare timestamp of a transaction. */ int -__wt_txn_set_prepare_timestamp( - WT_SESSION_IMPL *session, wt_timestamp_t prepare_ts) +__wt_txn_set_prepare_timestamp(WT_SESSION_IMPL *session, wt_timestamp_t prepare_ts) { - WT_TXN *prev, *txn = &session->txn; - WT_TXN_GLOBAL *txn_global = &S2C(session)->txn_global; - wt_timestamp_t oldest_ts; - char ts_string[2][WT_TS_INT_STRING_SIZE]; - - WT_RET(__wt_txn_context_prepare_check(session)); - - if (F_ISSET(txn, WT_TXN_HAS_TS_PREPARE)) - WT_RET_MSG(session, EINVAL, "prepare timestamp is already set"); - - if (F_ISSET(txn, WT_TXN_HAS_TS_COMMIT)) - WT_RET_MSG(session, EINVAL, "commit timestamp " - "should not have been set before the prepare timestamp"); - - WT_RET(__txn_assert_after_reads(session, "prepare", prepare_ts, &prev)); - - /* - * Check whether the prepare timestamp is less than the oldest - * timestamp. 
- */ - oldest_ts = txn_global->oldest_timestamp; - if (prepare_ts < oldest_ts) { - /* - * Check whether the prepare timestamp needs to be rounded up to - * the oldest timestamp. - */ - if (F_ISSET(txn, WT_TXN_TS_ROUND_PREPARED)) { - /* - * Check that there are no active readers. That would - * be a violation of preconditions for rounding - * timestamps of prepared transactions. - */ - WT_ASSERT(session, prev == NULL); - - __wt_verbose(session, WT_VERB_TIMESTAMP, - "prepare timestamp %s rounded to oldest " - "timestamp %s", - __wt_timestamp_to_string(prepare_ts, ts_string[0]), - __wt_timestamp_to_string(oldest_ts, ts_string[1])); - - prepare_ts = oldest_ts; - } else - WT_RET_MSG(session, EINVAL, - "prepare timestamp %s is older than the oldest " - "timestamp %s", - __wt_timestamp_to_string(prepare_ts, ts_string[0]), - __wt_timestamp_to_string(oldest_ts, ts_string[1])); - } - txn->prepare_timestamp = prepare_ts; - F_SET(txn, WT_TXN_HAS_TS_PREPARE); - - return (0); + WT_TXN *prev, *txn = &session->txn; + WT_TXN_GLOBAL *txn_global = &S2C(session)->txn_global; + wt_timestamp_t oldest_ts; + char ts_string[2][WT_TS_INT_STRING_SIZE]; + + WT_RET(__wt_txn_context_prepare_check(session)); + + if (F_ISSET(txn, WT_TXN_HAS_TS_PREPARE)) + WT_RET_MSG(session, EINVAL, "prepare timestamp is already set"); + + if (F_ISSET(txn, WT_TXN_HAS_TS_COMMIT)) + WT_RET_MSG(session, EINVAL, + "commit timestamp " + "should not have been set before the prepare timestamp"); + + WT_RET(__txn_assert_after_reads(session, "prepare", prepare_ts, &prev)); + + /* + * Check whether the prepare timestamp is less than the oldest timestamp. + */ + oldest_ts = txn_global->oldest_timestamp; + if (prepare_ts < oldest_ts) { + /* + * Check whether the prepare timestamp needs to be rounded up to the oldest timestamp. + */ + if (F_ISSET(txn, WT_TXN_TS_ROUND_PREPARED)) { + /* + * Check that there are no active readers. That would be a violation of preconditions + * for rounding timestamps of prepared transactions. 
+ */ + WT_ASSERT(session, prev == NULL); + + __wt_verbose(session, WT_VERB_TIMESTAMP, + "prepare timestamp %s rounded to oldest " + "timestamp %s", + __wt_timestamp_to_string(prepare_ts, ts_string[0]), + __wt_timestamp_to_string(oldest_ts, ts_string[1])); + + prepare_ts = oldest_ts; + } else + WT_RET_MSG(session, EINVAL, + "prepare timestamp %s is older than the oldest " + "timestamp %s", + __wt_timestamp_to_string(prepare_ts, ts_string[0]), + __wt_timestamp_to_string(oldest_ts, ts_string[1])); + } + txn->prepare_timestamp = prepare_ts; + F_SET(txn, WT_TXN_HAS_TS_PREPARE); + + return (0); } /* * __wt_txn_set_read_timestamp -- - * Parse a request to set a transaction's read_timestamp. + * Parse a request to set a transaction's read_timestamp. */ int -__wt_txn_set_read_timestamp( - WT_SESSION_IMPL *session, wt_timestamp_t read_ts) +__wt_txn_set_read_timestamp(WT_SESSION_IMPL *session, wt_timestamp_t read_ts) { - WT_TXN *txn = &session->txn; - WT_TXN_GLOBAL *txn_global = &S2C(session)->txn_global; - wt_timestamp_t ts_oldest; - char ts_string[2][WT_TS_INT_STRING_SIZE]; - bool did_roundup_to_oldest; - - WT_RET(__wt_txn_context_prepare_check(session)); - - /* Read timestamps imply / require snapshot isolation. */ - if (!F_ISSET(txn, WT_TXN_RUNNING)) - txn->isolation = WT_ISO_SNAPSHOT; - else if (txn->isolation != WT_ISO_SNAPSHOT) - WT_RET_MSG(session, EINVAL, "setting a read_timestamp" - " requires a transaction running at snapshot" - " isolation"); - - /* Read timestamps can't change once set. */ - if (F_ISSET(txn, WT_TXN_HAS_TS_READ)) - WT_RET_MSG(session, EINVAL, "a read_timestamp" - " may only be set once per transaction"); - - /* - * This code is not using the timestamp validate function to - * avoid a race between checking and setting transaction - * timestamp. 
- */ - __wt_readlock(session, &txn_global->rwlock); - ts_oldest = txn_global->oldest_timestamp; - did_roundup_to_oldest = false; - if (read_ts < ts_oldest) { - /* - * If given read timestamp is earlier than oldest - * timestamp then round the read timestamp to - * oldest timestamp. - */ - if (F_ISSET(txn, WT_TXN_TS_ROUND_READ)) { - txn->read_timestamp = ts_oldest; - did_roundup_to_oldest = true; - } else { - __wt_readunlock(session, &txn_global->rwlock); - - /* - * In some cases, MongoDB sets a read timestamp older - * than the oldest timestamp, relying on WiredTiger's - * concurrency to detect and fail the set. In other - * cases it's a bug and MongoDB wants error context to - * make it easier to find those problems. Don't output - * an error message because that logs a MongoDB error, - * use an informational message to provide the context - * instead. - */ - WT_RET(__wt_msg(session, "read timestamp " - "%s less than the oldest timestamp %s", - __wt_timestamp_to_string(read_ts, ts_string[0]), - __wt_timestamp_to_string(ts_oldest, ts_string[1]))); - return (EINVAL); - } - } else - txn->read_timestamp = read_ts; - - __wt_txn_publish_read_timestamp(session); - __wt_readunlock(session, &txn_global->rwlock); - - /* - * This message is generated here to reduce the span of critical - * section. - */ - if (did_roundup_to_oldest) - __wt_verbose(session, WT_VERB_TIMESTAMP, "read " - "timestamp %s : rounded to oldest timestamp %s", - __wt_timestamp_to_string(read_ts, ts_string[0]), - __wt_timestamp_to_string(ts_oldest, ts_string[1])); - - /* - * If we already have a snapshot, it may be too early to match - * the timestamp (including the one we just read, if rounding - * to oldest). Get a new one. 
- */ - if (F_ISSET(txn, WT_TXN_RUNNING)) - __wt_txn_get_snapshot(session); - - return (0); + WT_TXN *txn = &session->txn; + WT_TXN_GLOBAL *txn_global = &S2C(session)->txn_global; + wt_timestamp_t ts_oldest; + char ts_string[2][WT_TS_INT_STRING_SIZE]; + bool did_roundup_to_oldest; + + WT_RET(__wt_txn_context_prepare_check(session)); + + /* Read timestamps imply / require snapshot isolation. */ + if (!F_ISSET(txn, WT_TXN_RUNNING)) + txn->isolation = WT_ISO_SNAPSHOT; + else if (txn->isolation != WT_ISO_SNAPSHOT) + WT_RET_MSG(session, EINVAL, + "setting a read_timestamp" + " requires a transaction running at snapshot" + " isolation"); + + /* Read timestamps can't change once set. */ + if (F_ISSET(txn, WT_TXN_HAS_TS_READ)) + WT_RET_MSG(session, EINVAL, + "a read_timestamp" + " may only be set once per transaction"); + + /* + * This code is not using the timestamp validate function to avoid a race between checking and + * setting transaction timestamp. + */ + __wt_readlock(session, &txn_global->rwlock); + ts_oldest = txn_global->oldest_timestamp; + did_roundup_to_oldest = false; + if (read_ts < ts_oldest) { + /* + * If given read timestamp is earlier than oldest timestamp then round the read timestamp to + * oldest timestamp. + */ + if (F_ISSET(txn, WT_TXN_TS_ROUND_READ)) { + txn->read_timestamp = ts_oldest; + did_roundup_to_oldest = true; + } else { + __wt_readunlock(session, &txn_global->rwlock); + + /* + * In some cases, MongoDB sets a read timestamp older than the oldest timestamp, relying + * on WiredTiger's concurrency to detect and fail the set. In other cases it's a bug and + * MongoDB wants error context to make it easier to find those problems. Don't output an + * error message because that logs a MongoDB error, use an informational message to + * provide the context instead. 
+ */ + WT_RET(__wt_msg(session, + "read timestamp " + "%s less than the oldest timestamp %s", + __wt_timestamp_to_string(read_ts, ts_string[0]), + __wt_timestamp_to_string(ts_oldest, ts_string[1]))); + return (EINVAL); + } + } else + txn->read_timestamp = read_ts; + + __wt_txn_publish_read_timestamp(session); + __wt_readunlock(session, &txn_global->rwlock); + + /* + * This message is generated here to reduce the span of critical section. + */ + if (did_roundup_to_oldest) + __wt_verbose(session, WT_VERB_TIMESTAMP, + "read " + "timestamp %s : rounded to oldest timestamp %s", + __wt_timestamp_to_string(read_ts, ts_string[0]), + __wt_timestamp_to_string(ts_oldest, ts_string[1])); + + /* + * If we already have a snapshot, it may be too early to match the timestamp (including the one + * we just read, if rounding to oldest). Get a new one. + */ + if (F_ISSET(txn, WT_TXN_RUNNING)) + __wt_txn_get_snapshot(session); + + return (0); } /* * __wt_txn_set_timestamp -- - * Parse a request to set a timestamp in a transaction. + * Parse a request to set a timestamp in a transaction. */ int __wt_txn_set_timestamp(WT_SESSION_IMPL *session, const char *cfg[]) { - WT_CONFIG_ITEM cval; - WT_DECL_RET; - wt_timestamp_t ts; - bool set_ts; - - set_ts = false; - WT_TRET(__wt_txn_context_check(session, true)); - - /* Look for a commit timestamp. */ - ret = __wt_config_gets_def(session, cfg, "commit_timestamp", 0, &cval); - WT_RET_NOTFOUND_OK(ret); - if (ret == 0 && cval.len != 0) { - WT_RET(__wt_txn_parse_timestamp(session, "commit", &ts, &cval)); - WT_RET(__wt_txn_set_commit_timestamp(session, ts)); - set_ts = true; - } - - /* - * Look for a durable timestamp. Durable timestamp should be set only - * after setting the commit timestamp. 
- */ - ret = __wt_config_gets_def( - session, cfg, "durable_timestamp", 0, &cval); - WT_RET_NOTFOUND_OK(ret); - if (ret == 0 && cval.len != 0) { - WT_RET(__wt_txn_parse_timestamp( - session, "durable", &ts, &cval)); - WT_RET(__wt_txn_set_durable_timestamp(session, ts)); - } - - __wt_txn_publish_timestamp(session); - - /* Look for a read timestamp. */ - WT_RET(__wt_config_gets_def(session, cfg, "read_timestamp", 0, &cval)); - if (ret == 0 && cval.len != 0) { - WT_RET(__wt_txn_parse_timestamp(session, "read", &ts, &cval)); - set_ts = true; - WT_RET(__wt_txn_set_read_timestamp(session, ts)); - } - - /* Look for a prepare timestamp. */ - WT_RET(__wt_config_gets_def(session, - cfg, "prepare_timestamp", 0, &cval)); - if (ret == 0 && cval.len != 0) { - WT_RET(__wt_txn_parse_timestamp( - session, "prepare", &ts, &cval)); - WT_RET(__wt_txn_set_prepare_timestamp(session, ts)); - } - if (set_ts) - WT_RET(__wt_txn_ts_log(session)); - - return (0); + WT_CONFIG_ITEM cval; + WT_DECL_RET; + wt_timestamp_t ts; + bool set_ts; + + set_ts = false; + WT_TRET(__wt_txn_context_check(session, true)); + + /* Look for a commit timestamp. */ + ret = __wt_config_gets_def(session, cfg, "commit_timestamp", 0, &cval); + WT_RET_NOTFOUND_OK(ret); + if (ret == 0 && cval.len != 0) { + WT_RET(__wt_txn_parse_timestamp(session, "commit", &ts, &cval)); + WT_RET(__wt_txn_set_commit_timestamp(session, ts)); + set_ts = true; + } + + /* + * Look for a durable timestamp. Durable timestamp should be set only after setting the commit + * timestamp. + */ + ret = __wt_config_gets_def(session, cfg, "durable_timestamp", 0, &cval); + WT_RET_NOTFOUND_OK(ret); + if (ret == 0 && cval.len != 0) { + WT_RET(__wt_txn_parse_timestamp(session, "durable", &ts, &cval)); + WT_RET(__wt_txn_set_durable_timestamp(session, ts)); + } + + __wt_txn_publish_timestamp(session); + + /* Look for a read timestamp. 
*/ + WT_RET(__wt_config_gets_def(session, cfg, "read_timestamp", 0, &cval)); + if (ret == 0 && cval.len != 0) { + WT_RET(__wt_txn_parse_timestamp(session, "read", &ts, &cval)); + set_ts = true; + WT_RET(__wt_txn_set_read_timestamp(session, ts)); + } + + /* Look for a prepare timestamp. */ + WT_RET(__wt_config_gets_def(session, cfg, "prepare_timestamp", 0, &cval)); + if (ret == 0 && cval.len != 0) { + WT_RET(__wt_txn_parse_timestamp(session, "prepare", &ts, &cval)); + WT_RET(__wt_txn_set_prepare_timestamp(session, ts)); + } + if (set_ts) + WT_RET(__wt_txn_ts_log(session)); + + return (0); } /* * __wt_txn_publish_timestamp -- - * Publish a transaction's timestamp to the durable queue. + * Publish a transaction's timestamp to the durable queue. */ void __wt_txn_publish_timestamp(WT_SESSION_IMPL *session) { - WT_TXN *qtxn, *txn, *txn_tmp; - WT_TXN_GLOBAL *txn_global; - wt_timestamp_t ts; - uint64_t walked; - - txn = &session->txn; - txn_global = &S2C(session)->txn_global; - - if (F_ISSET(txn, WT_TXN_TS_PUBLISHED)) - return; - - if (F_ISSET(txn, WT_TXN_HAS_TS_DURABLE)) - ts = txn->durable_timestamp; - else if (F_ISSET(txn, WT_TXN_HAS_TS_COMMIT)) { - /* - * If we know for a fact that this is a prepared transaction and - * we only have a commit timestamp, don't add to the durable - * queue. If we poll all_durable after setting the commit - * timestamp of a prepared transaction, that prepared - * transaction should NOT be visible. - */ - if (F_ISSET(txn, WT_TXN_PREPARE)) - return; - ts = txn->commit_timestamp; - } else - return; - - __wt_writelock(session, &txn_global->durable_timestamp_rwlock); - /* - * If our transaction is on the queue remove it first. The timestamp - * may move earlier so we otherwise might not remove ourselves before - * finding where to insert ourselves (which would result in a list - * loop) and we don't want to walk more of the list than needed. 
- */ - if (txn->clear_durable_q) { - TAILQ_REMOVE(&txn_global->durable_timestamph, - txn, durable_timestampq); - WT_PUBLISH(txn->clear_durable_q, false); - --txn_global->durable_timestampq_len; - } - /* - * Walk the list to look for where to insert our own transaction - * and remove any transactions that are not active. We stop when - * we get to the location where we want to insert. - */ - if (TAILQ_EMPTY(&txn_global->durable_timestamph)) { - TAILQ_INSERT_HEAD( - &txn_global->durable_timestamph, txn, durable_timestampq); - WT_STAT_CONN_INCR(session, txn_durable_queue_empty); - } else { - /* Walk from the start, removing cleared entries. */ - walked = 0; - TAILQ_FOREACH_SAFE(qtxn, &txn_global->durable_timestamph, - durable_timestampq, txn_tmp) { - ++walked; - /* - * Stop on the first entry that we cannot clear. - */ - if (!qtxn->clear_durable_q) - break; - - TAILQ_REMOVE(&txn_global->durable_timestamph, - qtxn, durable_timestampq); - WT_PUBLISH(qtxn->clear_durable_q, false); - --txn_global->durable_timestampq_len; - } - - /* - * Now walk backwards from the end to find the correct position - * for the insert. 
- */ - qtxn = TAILQ_LAST( - &txn_global->durable_timestamph, __wt_txn_dts_qh); - while (qtxn != NULL && - __txn_get_published_timestamp(session, qtxn) > ts) { - ++walked; - qtxn = TAILQ_PREV( - qtxn, __wt_txn_dts_qh, durable_timestampq); - } - if (qtxn == NULL) { - TAILQ_INSERT_HEAD(&txn_global->durable_timestamph, - txn, durable_timestampq); - WT_STAT_CONN_INCR(session, txn_durable_queue_head); - } else - TAILQ_INSERT_AFTER(&txn_global->durable_timestamph, - qtxn, txn, durable_timestampq); - WT_STAT_CONN_INCRV(session, txn_durable_queue_walked, walked); - } - ++txn_global->durable_timestampq_len; - WT_STAT_CONN_INCR(session, txn_durable_queue_inserts); - txn->clear_durable_q = false; - F_SET(txn, WT_TXN_TS_PUBLISHED); - __wt_writeunlock(session, &txn_global->durable_timestamp_rwlock); + WT_TXN *qtxn, *txn, *txn_tmp; + WT_TXN_GLOBAL *txn_global; + wt_timestamp_t ts; + uint64_t walked; + + txn = &session->txn; + txn_global = &S2C(session)->txn_global; + + if (F_ISSET(txn, WT_TXN_TS_PUBLISHED)) + return; + + if (F_ISSET(txn, WT_TXN_HAS_TS_DURABLE)) + ts = txn->durable_timestamp; + else if (F_ISSET(txn, WT_TXN_HAS_TS_COMMIT)) { + /* + * If we know for a fact that this is a prepared transaction and we only have a commit + * timestamp, don't add to the durable queue. If we poll all_durable after setting the + * commit timestamp of a prepared transaction, that prepared transaction should NOT be + * visible. + */ + if (F_ISSET(txn, WT_TXN_PREPARE)) + return; + ts = txn->commit_timestamp; + } else + return; + + __wt_writelock(session, &txn_global->durable_timestamp_rwlock); + /* + * If our transaction is on the queue remove it first. The timestamp may move earlier so we + * otherwise might not remove ourselves before finding where to insert ourselves (which would + * result in a list loop) and we don't want to walk more of the list than needed. 
+ */ + if (txn->clear_durable_q) { + TAILQ_REMOVE(&txn_global->durable_timestamph, txn, durable_timestampq); + WT_PUBLISH(txn->clear_durable_q, false); + --txn_global->durable_timestampq_len; + } + /* + * Walk the list to look for where to insert our own transaction and remove any transactions + * that are not active. We stop when we get to the location where we want to insert. + */ + if (TAILQ_EMPTY(&txn_global->durable_timestamph)) { + TAILQ_INSERT_HEAD(&txn_global->durable_timestamph, txn, durable_timestampq); + WT_STAT_CONN_INCR(session, txn_durable_queue_empty); + } else { + /* Walk from the start, removing cleared entries. */ + walked = 0; + TAILQ_FOREACH_SAFE(qtxn, &txn_global->durable_timestamph, durable_timestampq, txn_tmp) + { + ++walked; + /* + * Stop on the first entry that we cannot clear. + */ + if (!qtxn->clear_durable_q) + break; + + TAILQ_REMOVE(&txn_global->durable_timestamph, qtxn, durable_timestampq); + WT_PUBLISH(qtxn->clear_durable_q, false); + --txn_global->durable_timestampq_len; + } + + /* + * Now walk backwards from the end to find the correct position for the insert. + */ + qtxn = TAILQ_LAST(&txn_global->durable_timestamph, __wt_txn_dts_qh); + while (qtxn != NULL && __txn_get_published_timestamp(session, qtxn) > ts) { + ++walked; + qtxn = TAILQ_PREV(qtxn, __wt_txn_dts_qh, durable_timestampq); + } + if (qtxn == NULL) { + TAILQ_INSERT_HEAD(&txn_global->durable_timestamph, txn, durable_timestampq); + WT_STAT_CONN_INCR(session, txn_durable_queue_head); + } else + TAILQ_INSERT_AFTER(&txn_global->durable_timestamph, qtxn, txn, durable_timestampq); + WT_STAT_CONN_INCRV(session, txn_durable_queue_walked, walked); + } + ++txn_global->durable_timestampq_len; + WT_STAT_CONN_INCR(session, txn_durable_queue_inserts); + txn->clear_durable_q = false; + F_SET(txn, WT_TXN_TS_PUBLISHED); + __wt_writeunlock(session, &txn_global->durable_timestamp_rwlock); } /* * __wt_txn_clear_durable_timestamp -- - * Clear a transaction's published durable timestamp. 
+ * Clear a transaction's published durable timestamp. */ void __wt_txn_clear_durable_timestamp(WT_SESSION_IMPL *session) { - WT_TXN *txn; - uint32_t flags; - - txn = &session->txn; - - if (!F_ISSET(txn, WT_TXN_TS_PUBLISHED)) - return; - flags = txn->flags; - LF_CLR(WT_TXN_TS_PUBLISHED); - - /* - * Notify other threads that our transaction is inactive and can be - * cleaned up safely from the durable timestamp queue whenever the next - * thread walks the queue. We do not need to remove it now. - */ - WT_PUBLISH(txn->clear_durable_q, true); - WT_PUBLISH(txn->flags, flags); + WT_TXN *txn; + uint32_t flags; + + txn = &session->txn; + + if (!F_ISSET(txn, WT_TXN_TS_PUBLISHED)) + return; + flags = txn->flags; + LF_CLR(WT_TXN_TS_PUBLISHED); + + /* + * Notify other threads that our transaction is inactive and can be cleaned up safely from the + * durable timestamp queue whenever the next thread walks the queue. We do not need to remove it + * now. + */ + WT_PUBLISH(txn->clear_durable_q, true); + WT_PUBLISH(txn->flags, flags); } /* * __wt_txn_publish_read_timestamp -- - * Publish a transaction's read timestamp. + * Publish a transaction's read timestamp. */ void __wt_txn_publish_read_timestamp(WT_SESSION_IMPL *session) { - WT_TXN *qtxn, *txn, *txn_tmp; - WT_TXN_GLOBAL *txn_global; - wt_timestamp_t tmp_timestamp; - uint64_t walked; - - txn = &session->txn; - txn_global = &S2C(session)->txn_global; - - if (F_ISSET(txn, WT_TXN_PUBLIC_TS_READ)) - return; - - __wt_writelock(session, &txn_global->read_timestamp_rwlock); - /* - * If our transaction is on the queue remove it first. The timestamp - * may move earlier so we otherwise might not remove ourselves before - * finding where to insert ourselves (which would result in a list - * loop) and we don't want to walk more of the list than needed. 
- */ - if (txn->clear_read_q) { - TAILQ_REMOVE(&txn_global->read_timestamph, - txn, read_timestampq); - WT_PUBLISH(txn->clear_read_q, false); - --txn_global->read_timestampq_len; - } - /* - * Walk the list to look for where to insert our own transaction - * and remove any transactions that are not active. We stop when - * we get to the location where we want to insert. - */ - if (TAILQ_EMPTY(&txn_global->read_timestamph)) { - TAILQ_INSERT_HEAD( - &txn_global->read_timestamph, txn, read_timestampq); - WT_STAT_CONN_INCR(session, txn_read_queue_empty); - } else { - /* Walk from the start, removing cleared entries. */ - walked = 0; - TAILQ_FOREACH_SAFE(qtxn, &txn_global->read_timestamph, - read_timestampq, txn_tmp) { - ++walked; - if (!qtxn->clear_read_q) - break; - - TAILQ_REMOVE(&txn_global->read_timestamph, - qtxn, read_timestampq); - WT_PUBLISH(qtxn->clear_read_q, false); - --txn_global->read_timestampq_len; - } - - /* - * Now walk backwards from the end to find the correct position - * for the insert. - */ - qtxn = TAILQ_LAST( - &txn_global->read_timestamph, __wt_txn_rts_qh); - while (qtxn != NULL) { - if (!__txn_get_read_timestamp(qtxn, &tmp_timestamp) || - tmp_timestamp > txn->read_timestamp) { - ++walked; - qtxn = TAILQ_PREV(qtxn, - __wt_txn_rts_qh, read_timestampq); - } else - break; - } - if (qtxn == NULL) { - TAILQ_INSERT_HEAD(&txn_global->read_timestamph, - txn, read_timestampq); - WT_STAT_CONN_INCR(session, txn_read_queue_head); - } else - TAILQ_INSERT_AFTER(&txn_global->read_timestamph, - qtxn, txn, read_timestampq); - WT_STAT_CONN_INCRV(session, txn_read_queue_walked, walked); - } - /* - * We do not set the read timestamp here. It has been set in the caller - * because special processing for round to oldest. 
- */ - ++txn_global->read_timestampq_len; - WT_STAT_CONN_INCR(session, txn_read_queue_inserts); - txn->clear_read_q = false; - F_SET(txn, WT_TXN_HAS_TS_READ | WT_TXN_PUBLIC_TS_READ); - __wt_writeunlock(session, &txn_global->read_timestamp_rwlock); + WT_TXN *qtxn, *txn, *txn_tmp; + WT_TXN_GLOBAL *txn_global; + wt_timestamp_t tmp_timestamp; + uint64_t walked; + + txn = &session->txn; + txn_global = &S2C(session)->txn_global; + + if (F_ISSET(txn, WT_TXN_PUBLIC_TS_READ)) + return; + + __wt_writelock(session, &txn_global->read_timestamp_rwlock); + /* + * If our transaction is on the queue remove it first. The timestamp may move earlier so we + * otherwise might not remove ourselves before finding where to insert ourselves (which would + * result in a list loop) and we don't want to walk more of the list than needed. + */ + if (txn->clear_read_q) { + TAILQ_REMOVE(&txn_global->read_timestamph, txn, read_timestampq); + WT_PUBLISH(txn->clear_read_q, false); + --txn_global->read_timestampq_len; + } + /* + * Walk the list to look for where to insert our own transaction and remove any transactions + * that are not active. We stop when we get to the location where we want to insert. + */ + if (TAILQ_EMPTY(&txn_global->read_timestamph)) { + TAILQ_INSERT_HEAD(&txn_global->read_timestamph, txn, read_timestampq); + WT_STAT_CONN_INCR(session, txn_read_queue_empty); + } else { + /* Walk from the start, removing cleared entries. */ + walked = 0; + TAILQ_FOREACH_SAFE(qtxn, &txn_global->read_timestamph, read_timestampq, txn_tmp) + { + ++walked; + if (!qtxn->clear_read_q) + break; + + TAILQ_REMOVE(&txn_global->read_timestamph, qtxn, read_timestampq); + WT_PUBLISH(qtxn->clear_read_q, false); + --txn_global->read_timestampq_len; + } + + /* + * Now walk backwards from the end to find the correct position for the insert. 
+ */ + qtxn = TAILQ_LAST(&txn_global->read_timestamph, __wt_txn_rts_qh); + while (qtxn != NULL) { + if (!__txn_get_read_timestamp(qtxn, &tmp_timestamp) || + tmp_timestamp > txn->read_timestamp) { + ++walked; + qtxn = TAILQ_PREV(qtxn, __wt_txn_rts_qh, read_timestampq); + } else + break; + } + if (qtxn == NULL) { + TAILQ_INSERT_HEAD(&txn_global->read_timestamph, txn, read_timestampq); + WT_STAT_CONN_INCR(session, txn_read_queue_head); + } else + TAILQ_INSERT_AFTER(&txn_global->read_timestamph, qtxn, txn, read_timestampq); + WT_STAT_CONN_INCRV(session, txn_read_queue_walked, walked); + } + /* + * We do not set the read timestamp here. It has been set in the caller because special + * processing for round to oldest. + */ + ++txn_global->read_timestampq_len; + WT_STAT_CONN_INCR(session, txn_read_queue_inserts); + txn->clear_read_q = false; + F_SET(txn, WT_TXN_HAS_TS_READ | WT_TXN_PUBLIC_TS_READ); + __wt_writeunlock(session, &txn_global->read_timestamp_rwlock); } /* * __wt_txn_clear_read_timestamp -- - * Clear a transaction's published read timestamp. + * Clear a transaction's published read timestamp. 
*/ void __wt_txn_clear_read_timestamp(WT_SESSION_IMPL *session) { - WT_TXN *txn; - uint32_t flags; + WT_TXN *txn; + uint32_t flags; - txn = &session->txn; + txn = &session->txn; - if (!F_ISSET(txn, WT_TXN_PUBLIC_TS_READ)) { - txn->read_timestamp = WT_TS_NONE; - return; - } + if (!F_ISSET(txn, WT_TXN_PUBLIC_TS_READ)) { + txn->read_timestamp = WT_TS_NONE; + return; + } #ifdef HAVE_DIAGNOSTIC - { - WT_TXN_GLOBAL *txn_global; - wt_timestamp_t pinned_ts; - - txn_global = &S2C(session)->txn_global; - pinned_ts = txn_global->pinned_timestamp; - WT_ASSERT(session, txn->read_timestamp >= pinned_ts); - } + { + WT_TXN_GLOBAL *txn_global; + wt_timestamp_t pinned_ts; + + txn_global = &S2C(session)->txn_global; + pinned_ts = txn_global->pinned_timestamp; + WT_ASSERT(session, txn->read_timestamp >= pinned_ts); + } #endif - flags = txn->flags; - LF_CLR(WT_TXN_PUBLIC_TS_READ); - - /* - * Notify other threads that our transaction is inactive and can be - * cleaned up safely from the read timestamp queue whenever the - * next thread walks the queue. We do not need to remove it now. - */ - WT_PUBLISH(txn->clear_read_q, true); - WT_PUBLISH(txn->flags, flags); - txn->read_timestamp = WT_TS_NONE; + flags = txn->flags; + LF_CLR(WT_TXN_PUBLIC_TS_READ); + + /* + * Notify other threads that our transaction is inactive and can be cleaned up safely from the + * read timestamp queue whenever the next thread walks the queue. We do not need to remove it + * now. + */ + WT_PUBLISH(txn->clear_read_q, true); + WT_PUBLISH(txn->flags, flags); + txn->read_timestamp = WT_TS_NONE; } /* * __wt_txn_clear_timestamp_queues -- - * We're about to clear the session and overwrite the txn structure. - * Remove ourselves from the commit timestamp queue and the read - * timestamp queue if we're on either of them. + * We're about to clear the session and overwrite the txn structure. Remove ourselves from the + * commit timestamp queue and the read timestamp queue if we're on either of them. 
*/ void __wt_txn_clear_timestamp_queues(WT_SESSION_IMPL *session) { - WT_TXN *txn; - WT_TXN_GLOBAL *txn_global; - - txn = &session->txn; - txn_global = &S2C(session)->txn_global; - - if (!txn->clear_durable_q && !txn->clear_read_q) - return; - - if (txn->clear_durable_q) { - __wt_writelock(session, &txn_global->durable_timestamp_rwlock); - /* - * Recheck after acquiring the lock. - */ - if (txn->clear_durable_q) { - TAILQ_REMOVE(&txn_global->durable_timestamph, - txn, durable_timestampq); - --txn_global->durable_timestampq_len; - txn->clear_durable_q = false; - } - __wt_writeunlock( - session, &txn_global->durable_timestamp_rwlock); - } - if (txn->clear_read_q) { - __wt_writelock(session, &txn_global->read_timestamp_rwlock); - /* - * Recheck after acquiring the lock. - */ - if (txn->clear_read_q) { - TAILQ_REMOVE( - &txn_global->read_timestamph, txn, read_timestampq); - --txn_global->read_timestampq_len; - txn->clear_read_q = false; - } - __wt_writeunlock(session, &txn_global->read_timestamp_rwlock); - } + WT_TXN *txn; + WT_TXN_GLOBAL *txn_global; + + txn = &session->txn; + txn_global = &S2C(session)->txn_global; + + if (!txn->clear_durable_q && !txn->clear_read_q) + return; + + if (txn->clear_durable_q) { + __wt_writelock(session, &txn_global->durable_timestamp_rwlock); + /* + * Recheck after acquiring the lock. + */ + if (txn->clear_durable_q) { + TAILQ_REMOVE(&txn_global->durable_timestamph, txn, durable_timestampq); + --txn_global->durable_timestampq_len; + txn->clear_durable_q = false; + } + __wt_writeunlock(session, &txn_global->durable_timestamp_rwlock); + } + if (txn->clear_read_q) { + __wt_writelock(session, &txn_global->read_timestamp_rwlock); + /* + * Recheck after acquiring the lock. + */ + if (txn->clear_read_q) { + TAILQ_REMOVE(&txn_global->read_timestamph, txn, read_timestampq); + --txn_global->read_timestampq_len; + txn->clear_read_q = false; + } + __wt_writeunlock(session, &txn_global->read_timestamp_rwlock); + } } |