diff options
field | value | date |
---|---|---|
author | Alex Gorrod <alexander.gorrod@mongodb.com> | 2015-07-01 17:58:06 +1000 |
committer | Michael Cahill <michael.cahill@mongodb.com> | 2015-07-02 13:37:37 +1000 |
commit | 7a9e1bdade725fd59a0fd87ca77c7dd66aeba1ec (patch) | |
tree | caa348cb25b880993b55a4e6efa599c173de4a11 | |
parent | 9f72da7968f828753276aaf92afbb0c82f75a3b4 (diff) | |
download | mongo-7a9e1bdade725fd59a0fd87ca77c7dd66aeba1ec.tar.gz | |
WT-1982 Fix a window where reconciliation could go back in transaction time.
(cherry picked from commit cd1704d6c4d84c5db8ae6b471c658945ffa226f9)
-rw-r--r-- | src/btree/bt_sync.c | 12 | ||||
-rw-r--r-- | src/evict/evict_lru.c | 2 | ||||
-rw-r--r-- | src/evict/evict_page.c | 3 | ||||
-rw-r--r-- | src/include/btmem.h | 5 | ||||
-rw-r--r-- | src/include/txn.h | 5 | ||||
-rw-r--r-- | src/include/txn.i | 30 | ||||
-rw-r--r-- | src/reconcile/rec_write.c | 21 | ||||
-rw-r--r-- | src/txn/txn.c | 45 | ||||
-rw-r--r-- | src/txn/txn_ckpt.c | 91 |
9 files changed, 129 insertions, 85 deletions
diff --git a/src/btree/bt_sync.c b/src/btree/bt_sync.c index cc52f63f1f5..ca3b8f327b3 100644 --- a/src/btree/bt_sync.c +++ b/src/btree/bt_sync.c @@ -190,6 +190,18 @@ err: /* On error, clear any left-over tree walk. */ if (btree->checkpointing) { /* + * Update the checkpoint generation for this handle so visible + * updates newer than the checkpoint can be evicted. + * + * This has to be published before eviction is enabled again, + * so that eviction knows that the checkpoint has completed. + */ + WT_PUBLISH(btree->checkpoint_gen, + S2C(session)->txn_global.checkpoint_gen); + WT_STAT_FAST_DATA_SET(session, + btree_checkpoint_generation, btree->checkpoint_gen); + + /* * Clear the checkpoint flag and push the change; not required, * but publishing the change means stalled eviction gets moving * as soon as possible. diff --git a/src/evict/evict_lru.c b/src/evict/evict_lru.c index 7b47effc86f..63a905539ce 100644 --- a/src/evict/evict_lru.c +++ b/src/evict/evict_lru.c @@ -1479,7 +1479,7 @@ __wt_cache_wait(WT_SESSION_IMPL *session, int full) * to make sure there is free space in the cache. */ txn_global = &S2C(session)->txn_global; - txn_state = &txn_global->states[session->id]; + txn_state = WT_SESSION_TXN_STATE(session); busy = txn_state->id != WT_TXN_NONE || session->nhazard > 0 || (txn_state->snap_min != WT_TXN_NONE && diff --git a/src/evict/evict_page.c b/src/evict/evict_page.c index 92ad8d296df..fe08916b24c 100644 --- a/src/evict/evict_page.c +++ b/src/evict/evict_page.c @@ -59,6 +59,9 @@ __wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, int exclusive) conn = S2C(session); + /* Checkpoints should never do eviction. 
*/ + WT_ASSERT(session, !WT_SESSION_IS_CHECKPOINT(session)); + page = ref->page; forced_eviction = (page->read_gen == WT_READGEN_OLDEST); inmem_split = 0; diff --git a/src/include/btmem.h b/src/include/btmem.h index 77ad31fc3f6..23b17ef2cd3 100644 --- a/src/include/btmem.h +++ b/src/include/btmem.h @@ -194,6 +194,11 @@ struct __wt_page_modify { /* The largest update transaction ID (approximate). */ uint64_t update_txn; +#ifdef HAVE_DIAGNOSTIC + /* Check that transaction time moves forward. */ + uint64_t last_oldest_id; +#endif + /* Dirty bytes added to the cache. */ size_t bytes_dirty; diff --git a/src/include/txn.h b/src/include/txn.h index dbaa11309ab..d2b369a41c4 100644 --- a/src/include/txn.h +++ b/src/include/txn.h @@ -25,6 +25,9 @@ #define WT_SESSION_TXN_STATE(s) (&S2C(s)->txn_global.states[(s)->id]) +#define WT_SESSION_IS_CHECKPOINT(s) \ + ((s)->id != 0 && (s)->id == S2C(s)->txn_global.checkpoint_id) + struct WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) __wt_txn_state { volatile uint64_t id; volatile uint64_t snap_min; @@ -54,7 +57,7 @@ struct __wt_txn_global { */ volatile uint32_t checkpoint_id; /* Checkpoint's session ID */ volatile uint64_t checkpoint_gen; - volatile uint64_t checkpoint_snap_min; + volatile uint64_t checkpoint_pinned; WT_TXN_STATE *states; /* Per-session transaction states */ }; diff --git a/src/include/txn.i b/src/include/txn.i index f0b0534ff4a..a9b19ca1ff5 100644 --- a/src/include/txn.i +++ b/src/include/txn.i @@ -98,8 +98,8 @@ __wt_txn_oldest_id(WT_SESSION_IMPL *session) { WT_BTREE *btree; WT_TXN_GLOBAL *txn_global; - uint64_t checkpoint_snap_min, oldest_id; - uint32_t checkpoint_id; + uint64_t checkpoint_pinned, oldest_id; + uint32_t checkpoint_gen; txn_global = &S2C(session)->txn_global; btree = S2BT_SAFE(session); @@ -108,9 +108,9 @@ __wt_txn_oldest_id(WT_SESSION_IMPL *session) * Take a local copy of these IDs in case they are updated while we are * checking visibility. 
*/ - checkpoint_id = txn_global->checkpoint_id; - checkpoint_snap_min = txn_global->checkpoint_snap_min; - oldest_id = txn_global->oldest_id; + WT_ORDERED_READ(oldest_id, txn_global->oldest_id); + WT_ORDERED_READ(checkpoint_gen, txn_global->checkpoint_gen); + WT_ORDERED_READ(checkpoint_pinned, txn_global->checkpoint_pinned); /* * Checkpoint transactions often fall behind ordinary application @@ -122,17 +122,13 @@ __wt_txn_oldest_id(WT_SESSION_IMPL *session) * checkpoint, or this handle is up to date with the active checkpoint * then it's safe to ignore the checkpoint ID in the visibility check. */ - if (checkpoint_snap_min != WT_TXN_NONE && - checkpoint_id != session->id && (btree == NULL || - btree->checkpoint_gen != txn_global->checkpoint_gen) && - TXNID_LT(checkpoint_snap_min, oldest_id)) - /* - * Use the checkpoint ID for the visibility check if it is the - * oldest ID in the system. - */ - oldest_id = checkpoint_snap_min; + if (checkpoint_pinned == WT_TXN_NONE || + TXNID_LT(oldest_id, checkpoint_pinned) || + WT_SESSION_IS_CHECKPOINT(session) || + (btree != NULL && btree->checkpoint_gen == checkpoint_gen)) + return (oldest_id); - return (oldest_id); + return (checkpoint_pinned); } /* @@ -340,7 +336,7 @@ __wt_txn_id_check(WT_SESSION_IMPL *session) if (!F_ISSET(txn, TXN_HAS_ID)) { conn = S2C(session); txn_global = &conn->txn_global; - txn_state = &txn_global->states[session->id]; + txn_state = WT_SESSION_TXN_STATE(session); WT_ASSERT(session, txn_state->id == WT_TXN_NONE); @@ -432,7 +428,7 @@ __wt_txn_cursor_op(WT_SESSION_IMPL *session) txn = &session->txn; txn_global = &S2C(session)->txn_global; - txn_state = &txn_global->states[session->id]; + txn_state = WT_SESSION_TXN_STATE(session); /* * If there is no transaction running (so we don't have an ID), and no diff --git a/src/reconcile/rec_write.c b/src/reconcile/rec_write.c index 6478bdd5613..14ab05fbb25 100644 --- a/src/reconcile/rec_write.c +++ b/src/reconcile/rec_write.c @@ -363,6 +363,19 @@ 
__wt_reconcile(WT_SESSION_IMPL *session, WT_STAT_FAST_DATA_INCR(session, rec_pages_eviction); } +#ifdef HAVE_DIAGNOSTIC + { + /* + * Check that transaction time always moves forward for a given page. + * If this check fails, reconciliation can free something that a future + * reconciliation will need. + */ + uint64_t oldest_id = __wt_txn_oldest_id(session); + WT_ASSERT(session, TXNID_LE(mod->last_oldest_id, oldest_id)); + mod->last_oldest_id = oldest_id; + } +#endif + /* Record the most recent transaction ID we will *not* write. */ mod->disk_snap_min = session->txn.snap_min; @@ -839,6 +852,7 @@ static inline int __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, WT_ROW *rip, WT_CELL_UNPACK *vpack, WT_UPDATE **updp) { + WT_DECL_RET; WT_ITEM ovfl; WT_PAGE *page; WT_UPDATE *upd, *upd_list, *upd_ovfl; @@ -977,8 +991,11 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, */ if (vpack != NULL && vpack->raw == WT_CELL_VALUE_OVFL_RM && !__wt_txn_visible_all(session, min_txn)) { - WT_RET(__wt_ovfl_txnc_search( - page, vpack->data, vpack->size, &ovfl)); + if ((ret = __wt_ovfl_txnc_search( + page, vpack->data, vpack->size, &ovfl)) != 0) + WT_PANIC_RET(session, ret, + "cached overflow item discarded early"); + /* * Create an update structure with an impossibly low transaction * ID and append it to the update list we're about to save. 
diff --git a/src/txn/txn.c b/src/txn/txn.c index d488f7929e1..f6f5a695b4f 100644 --- a/src/txn/txn.c +++ b/src/txn/txn.c @@ -60,7 +60,6 @@ __wt_txn_release_snapshot(WT_SESSION_IMPL *session) WT_ASSERT(session, txn_state->snap_min == WT_TXN_NONE || session->txn.isolation == TXN_ISO_READ_UNCOMMITTED || - session->id == S2C(session)->txn_global.checkpoint_id || !__wt_txn_visible_all(session, txn_state->snap_min)); txn_state->snap_min = WT_TXN_NONE; @@ -80,13 +79,13 @@ __wt_txn_get_snapshot(WT_SESSION_IMPL *session) WT_TXN_STATE *s, *txn_state; uint64_t current_id, id; uint64_t prev_oldest_id, snap_min; - uint32_t ckpt_id, i, n, session_cnt; + uint32_t i, n, session_cnt; int32_t count; conn = S2C(session); txn = &session->txn; txn_global = &conn->txn_global; - txn_state = &txn_global->states[session->id]; + txn_state = WT_SESSION_TXN_STATE(session); current_id = snap_min = txn_global->current; prev_oldest_id = txn_global->oldest_id; @@ -119,12 +118,7 @@ __wt_txn_get_snapshot(WT_SESSION_IMPL *session) /* Walk the array of concurrent transactions. */ WT_ORDERED_READ(session_cnt, conn->session_cnt); - ckpt_id = txn_global->checkpoint_id; for (i = n = 0, s = txn_global->states; i < session_cnt; i++, s++) { - /* Skip the checkpoint transaction; it is never read from. */ - if (i == ckpt_id) - continue; - /* * Build our snapshot of any concurrent transaction IDs. * @@ -183,7 +177,7 @@ __wt_txn_update_oldest(WT_SESSION_IMPL *session, int force) WT_TXN_GLOBAL *txn_global; WT_TXN_STATE *s; uint64_t current_id, id, oldest_id, prev_oldest_id, snap_min; - uint32_t ckpt_id, i, session_cnt; + uint32_t i, session_cnt; int32_t count; int last_running_moved; @@ -219,12 +213,7 @@ __wt_txn_update_oldest(WT_SESSION_IMPL *session, int force) /* Walk the array of concurrent transactions. 
*/ WT_ORDERED_READ(session_cnt, conn->session_cnt); - ckpt_id = txn_global->checkpoint_id; for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) { - /* Skip the checkpoint transaction; it is never read from. */ - if (i == ckpt_id) - continue; - /* * Update the oldest ID. * @@ -266,15 +255,7 @@ __wt_txn_update_oldest(WT_SESSION_IMPL *session, int force) if (TXNID_LT(prev_oldest_id, oldest_id) && WT_ATOMIC_CAS4(txn_global->scan_count, 1, -1)) { WT_ORDERED_READ(session_cnt, conn->session_cnt); - ckpt_id = txn_global->checkpoint_id; for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) { - /* - * Skip the checkpoint transaction; it is never read - * from. - */ - if (i == ckpt_id) - continue; - if ((id = s->id) != WT_TXN_NONE && TXNID_LT(id, oldest_id)) oldest_id = id; @@ -358,10 +339,17 @@ __wt_txn_release(WT_SESSION_IMPL *session) txn->notify = NULL; txn_global = &S2C(session)->txn_global; - txn_state = &txn_global->states[session->id]; + txn_state = WT_SESSION_TXN_STATE(session); /* Clear the transaction's ID from the global table. */ - if (F_ISSET(txn, TXN_HAS_ID)) { + if (WT_SESSION_IS_CHECKPOINT(session)) { + WT_ASSERT(session, txn_state->id == WT_TXN_NONE); + txn->id = WT_TXN_NONE; + + /* Clear the global checkpoint transaction IDs. 
*/ + txn_global->checkpoint_id = 0; + txn_global->checkpoint_pinned = WT_TXN_NONE; + } else if (F_ISSET(txn, TXN_HAS_ID)) { WT_ASSERT(session, txn_state->id != WT_TXN_NONE && txn->id != WT_TXN_NONE); WT_PUBLISH(txn_state->id, WT_TXN_NONE); @@ -418,6 +406,7 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[]) */ __wt_txn_release_snapshot(session); ret = __wt_txn_log_commit(session, cfg); + WT_ASSERT(session, ret == 0); } /* @@ -548,19 +537,19 @@ __wt_txn_stats_update(WT_SESSION_IMPL *session) WT_TXN_GLOBAL *txn_global; WT_CONNECTION_IMPL *conn; WT_CONNECTION_STATS *stats; - uint64_t checkpoint_snap_min; + uint64_t checkpoint_pinned; conn = S2C(session); txn_global = &conn->txn_global; stats = &conn->stats; - checkpoint_snap_min = txn_global->checkpoint_snap_min; + checkpoint_pinned = txn_global->checkpoint_pinned; WT_STAT_SET(stats, txn_pinned_range, txn_global->current - txn_global->oldest_id); WT_STAT_SET(stats, txn_pinned_checkpoint_range, - checkpoint_snap_min == WT_TXN_NONE ? - 0 : txn_global->current - checkpoint_snap_min); + checkpoint_pinned == WT_TXN_NONE ? 
+ 0 : txn_global->current - checkpoint_pinned); } /* diff --git a/src/txn/txn_ckpt.c b/src/txn/txn_ckpt.c index 8be05734190..08d8b778371 100644 --- a/src/txn/txn_ckpt.c +++ b/src/txn/txn_ckpt.c @@ -349,6 +349,7 @@ __wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) WT_TXN *txn; WT_TXN_GLOBAL *txn_global; WT_TXN_ISOLATION saved_isolation; + WT_TXN_STATE *txn_state; const char *txn_cfg[] = { WT_CONFIG_BASE(session, session_begin_transaction), "isolation=snapshot", NULL }; @@ -358,6 +359,7 @@ __wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) conn = S2C(session); txn_global = &conn->txn_global; + txn_state = WT_SESSION_TXN_STATE(session); saved_isolation = session->isolation; txn = &session->txn; full = idle = logging = tracking = 0; @@ -426,6 +428,22 @@ __wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) WT_ERR(__checkpoint_verbose_track(session, "starting transaction", &verb_timer)); + if (full) + WT_ERR(__wt_epoch(session, &start)); + + /* + * Bump the global checkpoint generation, used to figure out whether + * checkpoint has visited a tree. There is no need for this to be + * atomic: it is only written while holding the checkpoint lock. + * + * We do need to update it before clearing the checkpoint's entry out + * of the transaction table, or a thread evicting in a tree could + * ignore the checkpoint's transaction. + */ + ++txn_global->checkpoint_gen; + WT_STAT_FAST_CONN_SET(session, + txn_checkpoint_generation, txn_global->checkpoint_gen); + /* * Start a snapshot transaction for the checkpoint. * @@ -433,30 +451,44 @@ __wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) * side effects on cursors, which applications can hold open across * calls to checkpoint. 
*/ - if (full) - WT_ERR(__wt_epoch(session, &start)); WT_ERR(__wt_txn_begin(session, txn_cfg)); /* Ensure a transaction ID is allocated prior to sharing it globally */ WT_ERR(__wt_txn_id_check(session)); /* - * Save a copy of the checkpoint session ID so that refresh can skip - * the checkpoint transactions. We never do checkpoints in the default - * session with id zero. Save a copy of the snap min so that visibility - * checks for the checkpoint use the right ID. + * Save the checkpoint session ID. We never do checkpoints in the + * default session (with id zero). */ - WT_ASSERT(session, session->id != 0); + WT_ASSERT(session, session->id != 0 && txn_global->checkpoint_id == 0); txn_global->checkpoint_id = session->id; - txn_global->checkpoint_snap_min = session->txn.snap_min; + + txn_global->checkpoint_pinned = + WT_MIN(txn_state->id, txn_state->snap_min); /* - * No need for this to be atomic it is only written while holding the - * checkpoint lock. + * We're about to clear the checkpoint transaction from the global + * state table so the oldest ID can move forward. Make sure everything + * we've done above is scheduled. */ - txn_global->checkpoint_gen += 1; - WT_STAT_FAST_CONN_SET(session, - txn_checkpoint_generation, txn_global->checkpoint_gen); + WT_FULL_BARRIER(); + + /* + * Sanity check that the oldest ID hasn't moved on before we have + * cleared our entry. + */ + WT_ASSERT(session, + TXNID_LE(txn_global->oldest_id, txn_state->id) && + TXNID_LE(txn_global->oldest_id, txn_state->snap_min)); + + /* + * Clear our entry from the global transaction session table. Any + * operation that needs to know about the ID for this checkpoint will + * consider the checkpoint ID in the global structure. Most operations + * can safely ignore the checkpoint ID (see the visible all check for + * details). + */ + txn_state->id = txn_state->snap_min = WT_TXN_NONE; /* Tell logging that we have started a database checkpoint. 
*/ if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED) && full) { @@ -477,10 +509,6 @@ __wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) /* Release the snapshot so we aren't pinning pages in cache. */ __wt_txn_release_snapshot(session); - /* Clear the global checkpoint transaction IDs */ - txn_global->checkpoint_id = 0; - txn_global->checkpoint_snap_min = WT_TXN_NONE; - WT_ERR(__checkpoint_verbose_track(session, "committing transaction", &verb_timer)); @@ -553,10 +581,6 @@ err: /* WT_TRET(__wt_txn_rollback(session, NULL)); } - /* Ensure the checkpoint IDs are cleared on the error path. */ - txn_global->checkpoint_id = 0; - txn_global->checkpoint_snap_min = WT_TXN_NONE; - /* * Tell logging that we have finished a database checkpoint. Do not * write a log record if the database was idle. @@ -809,10 +833,8 @@ __checkpoint_worker( force = 1; } if (!btree->modified && !force) { - if (!is_checkpoint) { - F_SET(btree, WT_BTREE_SKIP_CKPT); - goto done; - } + if (!is_checkpoint) + goto nockpt; deleted = 0; WT_CKPT_FOREACH(ckptbase, ckpt) @@ -831,7 +853,12 @@ __checkpoint_worker( (WT_PREFIX_MATCH(name, WT_CHECKPOINT) && WT_PREFIX_MATCH((ckpt - 1)->name, WT_CHECKPOINT))) && deleted < 2) { - F_SET(btree, WT_BTREE_SKIP_CKPT); +nockpt: F_SET(btree, WT_BTREE_SKIP_CKPT); + WT_PUBLISH(btree->checkpoint_gen, + S2C(session)->txn_global.checkpoint_gen); + WT_STAT_FAST_DATA_SET(session, + btree_checkpoint_generation, + btree->checkpoint_gen); goto done; } } @@ -1059,16 +1086,8 @@ fake: /* WT_ERR(__wt_txn_checkpoint_log( session, 0, WT_TXN_LOG_CKPT_STOP, NULL)); - /* - * Update the checkpoint generation for this handle so visible - * updates newer than the checkpoint can be evicted. - */ -done: btree->checkpoint_gen = conn->txn_global.checkpoint_gen; - WT_STAT_FAST_DATA_SET(session, - btree_checkpoint_generation, btree->checkpoint_gen); - -err: - /* +done: +err: /* * If the checkpoint didn't complete successfully, make sure the * tree is marked dirty. */ |