summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Alex Gorrod <alexander.gorrod@mongodb.com> 2015-07-01 17:58:06 +1000
committer Michael Cahill <michael.cahill@mongodb.com> 2015-07-02 13:37:37 +1000
commit 7a9e1bdade725fd59a0fd87ca77c7dd66aeba1ec (patch)
tree caa348cb25b880993b55a4e6efa599c173de4a11
parent 9f72da7968f828753276aaf92afbb0c82f75a3b4 (diff)
download mongo-7a9e1bdade725fd59a0fd87ca77c7dd66aeba1ec.tar.gz
WT-1982 Fix a window where reconciliation could go back in transaction time.
(cherry picked from commit cd1704d6c4d84c5db8ae6b471c658945ffa226f9)
-rw-r--r-- src/btree/bt_sync.c 12
-rw-r--r-- src/evict/evict_lru.c 2
-rw-r--r-- src/evict/evict_page.c 3
-rw-r--r-- src/include/btmem.h 5
-rw-r--r-- src/include/txn.h 5
-rw-r--r-- src/include/txn.i 30
-rw-r--r-- src/reconcile/rec_write.c 21
-rw-r--r-- src/txn/txn.c 45
-rw-r--r-- src/txn/txn_ckpt.c 91
9 files changed, 129 insertions, 85 deletions
diff --git a/src/btree/bt_sync.c b/src/btree/bt_sync.c
index cc52f63f1f5..ca3b8f327b3 100644
--- a/src/btree/bt_sync.c
+++ b/src/btree/bt_sync.c
@@ -190,6 +190,18 @@ err: /* On error, clear any left-over tree walk. */
if (btree->checkpointing) {
/*
+ * Update the checkpoint generation for this handle so visible
+ * updates newer than the checkpoint can be evicted.
+ *
+ * This has to be published before eviction is enabled again,
+ * so that eviction knows that the checkpoint has completed.
+ */
+ WT_PUBLISH(btree->checkpoint_gen,
+ S2C(session)->txn_global.checkpoint_gen);
+ WT_STAT_FAST_DATA_SET(session,
+ btree_checkpoint_generation, btree->checkpoint_gen);
+
+ /*
* Clear the checkpoint flag and push the change; not required,
* but publishing the change means stalled eviction gets moving
* as soon as possible.
diff --git a/src/evict/evict_lru.c b/src/evict/evict_lru.c
index 7b47effc86f..63a905539ce 100644
--- a/src/evict/evict_lru.c
+++ b/src/evict/evict_lru.c
@@ -1479,7 +1479,7 @@ __wt_cache_wait(WT_SESSION_IMPL *session, int full)
* to make sure there is free space in the cache.
*/
txn_global = &S2C(session)->txn_global;
- txn_state = &txn_global->states[session->id];
+ txn_state = WT_SESSION_TXN_STATE(session);
busy = txn_state->id != WT_TXN_NONE ||
session->nhazard > 0 ||
(txn_state->snap_min != WT_TXN_NONE &&
diff --git a/src/evict/evict_page.c b/src/evict/evict_page.c
index 92ad8d296df..fe08916b24c 100644
--- a/src/evict/evict_page.c
+++ b/src/evict/evict_page.c
@@ -59,6 +59,9 @@ __wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, int exclusive)
conn = S2C(session);
+ /* Checkpoints should never do eviction. */
+ WT_ASSERT(session, !WT_SESSION_IS_CHECKPOINT(session));
+
page = ref->page;
forced_eviction = (page->read_gen == WT_READGEN_OLDEST);
inmem_split = 0;
diff --git a/src/include/btmem.h b/src/include/btmem.h
index 77ad31fc3f6..23b17ef2cd3 100644
--- a/src/include/btmem.h
+++ b/src/include/btmem.h
@@ -194,6 +194,11 @@ struct __wt_page_modify {
/* The largest update transaction ID (approximate). */
uint64_t update_txn;
+#ifdef HAVE_DIAGNOSTIC
+ /* Check that transaction time moves forward. */
+ uint64_t last_oldest_id;
+#endif
+
/* Dirty bytes added to the cache. */
size_t bytes_dirty;
diff --git a/src/include/txn.h b/src/include/txn.h
index dbaa11309ab..d2b369a41c4 100644
--- a/src/include/txn.h
+++ b/src/include/txn.h
@@ -25,6 +25,9 @@
#define WT_SESSION_TXN_STATE(s) (&S2C(s)->txn_global.states[(s)->id])
+#define WT_SESSION_IS_CHECKPOINT(s) \
+ ((s)->id != 0 && (s)->id == S2C(s)->txn_global.checkpoint_id)
+
struct WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) __wt_txn_state {
volatile uint64_t id;
volatile uint64_t snap_min;
@@ -54,7 +57,7 @@ struct __wt_txn_global {
*/
volatile uint32_t checkpoint_id; /* Checkpoint's session ID */
volatile uint64_t checkpoint_gen;
- volatile uint64_t checkpoint_snap_min;
+ volatile uint64_t checkpoint_pinned;
WT_TXN_STATE *states; /* Per-session transaction states */
};
diff --git a/src/include/txn.i b/src/include/txn.i
index f0b0534ff4a..a9b19ca1ff5 100644
--- a/src/include/txn.i
+++ b/src/include/txn.i
@@ -98,8 +98,8 @@ __wt_txn_oldest_id(WT_SESSION_IMPL *session)
{
WT_BTREE *btree;
WT_TXN_GLOBAL *txn_global;
- uint64_t checkpoint_snap_min, oldest_id;
- uint32_t checkpoint_id;
+ uint64_t checkpoint_pinned, oldest_id;
+ uint32_t checkpoint_gen;
txn_global = &S2C(session)->txn_global;
btree = S2BT_SAFE(session);
@@ -108,9 +108,9 @@ __wt_txn_oldest_id(WT_SESSION_IMPL *session)
* Take a local copy of these IDs in case they are updated while we are
* checking visibility.
*/
- checkpoint_id = txn_global->checkpoint_id;
- checkpoint_snap_min = txn_global->checkpoint_snap_min;
- oldest_id = txn_global->oldest_id;
+ WT_ORDERED_READ(oldest_id, txn_global->oldest_id);
+ WT_ORDERED_READ(checkpoint_gen, txn_global->checkpoint_gen);
+ WT_ORDERED_READ(checkpoint_pinned, txn_global->checkpoint_pinned);
/*
* Checkpoint transactions often fall behind ordinary application
@@ -122,17 +122,13 @@ __wt_txn_oldest_id(WT_SESSION_IMPL *session)
* checkpoint, or this handle is up to date with the active checkpoint
* then it's safe to ignore the checkpoint ID in the visibility check.
*/
- if (checkpoint_snap_min != WT_TXN_NONE &&
- checkpoint_id != session->id && (btree == NULL ||
- btree->checkpoint_gen != txn_global->checkpoint_gen) &&
- TXNID_LT(checkpoint_snap_min, oldest_id))
- /*
- * Use the checkpoint ID for the visibility check if it is the
- * oldest ID in the system.
- */
- oldest_id = checkpoint_snap_min;
+ if (checkpoint_pinned == WT_TXN_NONE ||
+ TXNID_LT(oldest_id, checkpoint_pinned) ||
+ WT_SESSION_IS_CHECKPOINT(session) ||
+ (btree != NULL && btree->checkpoint_gen == checkpoint_gen))
+ return (oldest_id);
- return (oldest_id);
+ return (checkpoint_pinned);
}
/*
@@ -340,7 +336,7 @@ __wt_txn_id_check(WT_SESSION_IMPL *session)
if (!F_ISSET(txn, TXN_HAS_ID)) {
conn = S2C(session);
txn_global = &conn->txn_global;
- txn_state = &txn_global->states[session->id];
+ txn_state = WT_SESSION_TXN_STATE(session);
WT_ASSERT(session, txn_state->id == WT_TXN_NONE);
@@ -432,7 +428,7 @@ __wt_txn_cursor_op(WT_SESSION_IMPL *session)
txn = &session->txn;
txn_global = &S2C(session)->txn_global;
- txn_state = &txn_global->states[session->id];
+ txn_state = WT_SESSION_TXN_STATE(session);
/*
* If there is no transaction running (so we don't have an ID), and no
diff --git a/src/reconcile/rec_write.c b/src/reconcile/rec_write.c
index 6478bdd5613..14ab05fbb25 100644
--- a/src/reconcile/rec_write.c
+++ b/src/reconcile/rec_write.c
@@ -363,6 +363,19 @@ __wt_reconcile(WT_SESSION_IMPL *session,
WT_STAT_FAST_DATA_INCR(session, rec_pages_eviction);
}
+#ifdef HAVE_DIAGNOSTIC
+ {
+ /*
+ * Check that transaction time always moves forward for a given page.
+ * If this check fails, reconciliation can free something that a future
+ * reconciliation will need.
+ */
+ uint64_t oldest_id = __wt_txn_oldest_id(session);
+ WT_ASSERT(session, TXNID_LE(mod->last_oldest_id, oldest_id));
+ mod->last_oldest_id = oldest_id;
+ }
+#endif
+
/* Record the most recent transaction ID we will *not* write. */
mod->disk_snap_min = session->txn.snap_min;
@@ -839,6 +852,7 @@ static inline int
__rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r,
WT_INSERT *ins, WT_ROW *rip, WT_CELL_UNPACK *vpack, WT_UPDATE **updp)
{
+ WT_DECL_RET;
WT_ITEM ovfl;
WT_PAGE *page;
WT_UPDATE *upd, *upd_list, *upd_ovfl;
@@ -977,8 +991,11 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r,
*/
if (vpack != NULL && vpack->raw == WT_CELL_VALUE_OVFL_RM &&
!__wt_txn_visible_all(session, min_txn)) {
- WT_RET(__wt_ovfl_txnc_search(
- page, vpack->data, vpack->size, &ovfl));
+ if ((ret = __wt_ovfl_txnc_search(
+ page, vpack->data, vpack->size, &ovfl)) != 0)
+ WT_PANIC_RET(session, ret,
+ "cached overflow item discarded early");
+
/*
* Create an update structure with an impossibly low transaction
* ID and append it to the update list we're about to save.
diff --git a/src/txn/txn.c b/src/txn/txn.c
index d488f7929e1..f6f5a695b4f 100644
--- a/src/txn/txn.c
+++ b/src/txn/txn.c
@@ -60,7 +60,6 @@ __wt_txn_release_snapshot(WT_SESSION_IMPL *session)
WT_ASSERT(session,
txn_state->snap_min == WT_TXN_NONE ||
session->txn.isolation == TXN_ISO_READ_UNCOMMITTED ||
- session->id == S2C(session)->txn_global.checkpoint_id ||
!__wt_txn_visible_all(session, txn_state->snap_min));
txn_state->snap_min = WT_TXN_NONE;
@@ -80,13 +79,13 @@ __wt_txn_get_snapshot(WT_SESSION_IMPL *session)
WT_TXN_STATE *s, *txn_state;
uint64_t current_id, id;
uint64_t prev_oldest_id, snap_min;
- uint32_t ckpt_id, i, n, session_cnt;
+ uint32_t i, n, session_cnt;
int32_t count;
conn = S2C(session);
txn = &session->txn;
txn_global = &conn->txn_global;
- txn_state = &txn_global->states[session->id];
+ txn_state = WT_SESSION_TXN_STATE(session);
current_id = snap_min = txn_global->current;
prev_oldest_id = txn_global->oldest_id;
@@ -119,12 +118,7 @@ __wt_txn_get_snapshot(WT_SESSION_IMPL *session)
/* Walk the array of concurrent transactions. */
WT_ORDERED_READ(session_cnt, conn->session_cnt);
- ckpt_id = txn_global->checkpoint_id;
for (i = n = 0, s = txn_global->states; i < session_cnt; i++, s++) {
- /* Skip the checkpoint transaction; it is never read from. */
- if (i == ckpt_id)
- continue;
-
/*
* Build our snapshot of any concurrent transaction IDs.
*
@@ -183,7 +177,7 @@ __wt_txn_update_oldest(WT_SESSION_IMPL *session, int force)
WT_TXN_GLOBAL *txn_global;
WT_TXN_STATE *s;
uint64_t current_id, id, oldest_id, prev_oldest_id, snap_min;
- uint32_t ckpt_id, i, session_cnt;
+ uint32_t i, session_cnt;
int32_t count;
int last_running_moved;
@@ -219,12 +213,7 @@ __wt_txn_update_oldest(WT_SESSION_IMPL *session, int force)
/* Walk the array of concurrent transactions. */
WT_ORDERED_READ(session_cnt, conn->session_cnt);
- ckpt_id = txn_global->checkpoint_id;
for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) {
- /* Skip the checkpoint transaction; it is never read from. */
- if (i == ckpt_id)
- continue;
-
/*
* Update the oldest ID.
*
@@ -266,15 +255,7 @@ __wt_txn_update_oldest(WT_SESSION_IMPL *session, int force)
if (TXNID_LT(prev_oldest_id, oldest_id) &&
WT_ATOMIC_CAS4(txn_global->scan_count, 1, -1)) {
WT_ORDERED_READ(session_cnt, conn->session_cnt);
- ckpt_id = txn_global->checkpoint_id;
for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) {
- /*
- * Skip the checkpoint transaction; it is never read
- * from.
- */
- if (i == ckpt_id)
- continue;
-
if ((id = s->id) != WT_TXN_NONE &&
TXNID_LT(id, oldest_id))
oldest_id = id;
@@ -358,10 +339,17 @@ __wt_txn_release(WT_SESSION_IMPL *session)
txn->notify = NULL;
txn_global = &S2C(session)->txn_global;
- txn_state = &txn_global->states[session->id];
+ txn_state = WT_SESSION_TXN_STATE(session);
/* Clear the transaction's ID from the global table. */
- if (F_ISSET(txn, TXN_HAS_ID)) {
+ if (WT_SESSION_IS_CHECKPOINT(session)) {
+ WT_ASSERT(session, txn_state->id == WT_TXN_NONE);
+ txn->id = WT_TXN_NONE;
+
+ /* Clear the global checkpoint transaction IDs. */
+ txn_global->checkpoint_id = 0;
+ txn_global->checkpoint_pinned = WT_TXN_NONE;
+ } else if (F_ISSET(txn, TXN_HAS_ID)) {
WT_ASSERT(session, txn_state->id != WT_TXN_NONE &&
txn->id != WT_TXN_NONE);
WT_PUBLISH(txn_state->id, WT_TXN_NONE);
@@ -418,6 +406,7 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[])
*/
__wt_txn_release_snapshot(session);
ret = __wt_txn_log_commit(session, cfg);
+ WT_ASSERT(session, ret == 0);
}
/*
@@ -548,19 +537,19 @@ __wt_txn_stats_update(WT_SESSION_IMPL *session)
WT_TXN_GLOBAL *txn_global;
WT_CONNECTION_IMPL *conn;
WT_CONNECTION_STATS *stats;
- uint64_t checkpoint_snap_min;
+ uint64_t checkpoint_pinned;
conn = S2C(session);
txn_global = &conn->txn_global;
stats = &conn->stats;
- checkpoint_snap_min = txn_global->checkpoint_snap_min;
+ checkpoint_pinned = txn_global->checkpoint_pinned;
WT_STAT_SET(stats, txn_pinned_range,
txn_global->current - txn_global->oldest_id);
WT_STAT_SET(stats, txn_pinned_checkpoint_range,
- checkpoint_snap_min == WT_TXN_NONE ?
- 0 : txn_global->current - checkpoint_snap_min);
+ checkpoint_pinned == WT_TXN_NONE ?
+ 0 : txn_global->current - checkpoint_pinned);
}
/*
diff --git a/src/txn/txn_ckpt.c b/src/txn/txn_ckpt.c
index 8be05734190..08d8b778371 100644
--- a/src/txn/txn_ckpt.c
+++ b/src/txn/txn_ckpt.c
@@ -349,6 +349,7 @@ __wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
WT_TXN *txn;
WT_TXN_GLOBAL *txn_global;
WT_TXN_ISOLATION saved_isolation;
+ WT_TXN_STATE *txn_state;
const char *txn_cfg[] =
{ WT_CONFIG_BASE(session, session_begin_transaction),
"isolation=snapshot", NULL };
@@ -358,6 +359,7 @@ __wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
conn = S2C(session);
txn_global = &conn->txn_global;
+ txn_state = WT_SESSION_TXN_STATE(session);
saved_isolation = session->isolation;
txn = &session->txn;
full = idle = logging = tracking = 0;
@@ -426,6 +428,22 @@ __wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
WT_ERR(__checkpoint_verbose_track(session,
"starting transaction", &verb_timer));
+ if (full)
+ WT_ERR(__wt_epoch(session, &start));
+
+ /*
+ * Bump the global checkpoint generation, used to figure out whether
+ * checkpoint has visited a tree. There is no need for this to be
+ * atomic: it is only written while holding the checkpoint lock.
+ *
+ * We do need to update it before clearing the checkpoint's entry out
+ * of the transaction table, or a thread evicting in a tree could
+ * ignore the checkpoint's transaction.
+ */
+ ++txn_global->checkpoint_gen;
+ WT_STAT_FAST_CONN_SET(session,
+ txn_checkpoint_generation, txn_global->checkpoint_gen);
+
/*
* Start a snapshot transaction for the checkpoint.
*
@@ -433,30 +451,44 @@ __wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
* side effects on cursors, which applications can hold open across
* calls to checkpoint.
*/
- if (full)
- WT_ERR(__wt_epoch(session, &start));
WT_ERR(__wt_txn_begin(session, txn_cfg));
/* Ensure a transaction ID is allocated prior to sharing it globally */
WT_ERR(__wt_txn_id_check(session));
/*
- * Save a copy of the checkpoint session ID so that refresh can skip
- * the checkpoint transactions. We never do checkpoints in the default
- * session with id zero. Save a copy of the snap min so that visibility
- * checks for the checkpoint use the right ID.
+ * Save the checkpoint session ID. We never do checkpoints in the
+ * default session (with id zero).
*/
- WT_ASSERT(session, session->id != 0);
+ WT_ASSERT(session, session->id != 0 && txn_global->checkpoint_id == 0);
txn_global->checkpoint_id = session->id;
- txn_global->checkpoint_snap_min = session->txn.snap_min;
+
+ txn_global->checkpoint_pinned =
+ WT_MIN(txn_state->id, txn_state->snap_min);
/*
- * No need for this to be atomic it is only written while holding the
- * checkpoint lock.
+ * We're about to clear the checkpoint transaction from the global
+ * state table so the oldest ID can move forward. Make sure everything
+ * we've done above is scheduled.
*/
- txn_global->checkpoint_gen += 1;
- WT_STAT_FAST_CONN_SET(session,
- txn_checkpoint_generation, txn_global->checkpoint_gen);
+ WT_FULL_BARRIER();
+
+ /*
+ * Sanity check that the oldest ID hasn't moved on before we have
+ * cleared our entry.
+ */
+ WT_ASSERT(session,
+ TXNID_LE(txn_global->oldest_id, txn_state->id) &&
+ TXNID_LE(txn_global->oldest_id, txn_state->snap_min));
+
+ /*
+ * Clear our entry from the global transaction session table. Any
+ * operation that needs to know about the ID for this checkpoint will
+ * consider the checkpoint ID in the global structure. Most operations
+ * can safely ignore the checkpoint ID (see the visible all check for
+ * details).
+ */
+ txn_state->id = txn_state->snap_min = WT_TXN_NONE;
/* Tell logging that we have started a database checkpoint. */
if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED) && full) {
@@ -477,10 +509,6 @@ __wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
/* Release the snapshot so we aren't pinning pages in cache. */
__wt_txn_release_snapshot(session);
- /* Clear the global checkpoint transaction IDs */
- txn_global->checkpoint_id = 0;
- txn_global->checkpoint_snap_min = WT_TXN_NONE;
-
WT_ERR(__checkpoint_verbose_track(session,
"committing transaction", &verb_timer));
@@ -553,10 +581,6 @@ err: /*
WT_TRET(__wt_txn_rollback(session, NULL));
}
- /* Ensure the checkpoint IDs are cleared on the error path. */
- txn_global->checkpoint_id = 0;
- txn_global->checkpoint_snap_min = WT_TXN_NONE;
-
/*
* Tell logging that we have finished a database checkpoint. Do not
* write a log record if the database was idle.
@@ -809,10 +833,8 @@ __checkpoint_worker(
force = 1;
}
if (!btree->modified && !force) {
- if (!is_checkpoint) {
- F_SET(btree, WT_BTREE_SKIP_CKPT);
- goto done;
- }
+ if (!is_checkpoint)
+ goto nockpt;
deleted = 0;
WT_CKPT_FOREACH(ckptbase, ckpt)
@@ -831,7 +853,12 @@ __checkpoint_worker(
(WT_PREFIX_MATCH(name, WT_CHECKPOINT) &&
WT_PREFIX_MATCH((ckpt - 1)->name, WT_CHECKPOINT))) &&
deleted < 2) {
- F_SET(btree, WT_BTREE_SKIP_CKPT);
+nockpt: F_SET(btree, WT_BTREE_SKIP_CKPT);
+ WT_PUBLISH(btree->checkpoint_gen,
+ S2C(session)->txn_global.checkpoint_gen);
+ WT_STAT_FAST_DATA_SET(session,
+ btree_checkpoint_generation,
+ btree->checkpoint_gen);
goto done;
}
}
@@ -1059,16 +1086,8 @@ fake: /*
WT_ERR(__wt_txn_checkpoint_log(
session, 0, WT_TXN_LOG_CKPT_STOP, NULL));
- /*
- * Update the checkpoint generation for this handle so visible
- * updates newer than the checkpoint can be evicted.
- */
-done: btree->checkpoint_gen = conn->txn_global.checkpoint_gen;
- WT_STAT_FAST_DATA_SET(session,
- btree_checkpoint_generation, btree->checkpoint_gen);
-
-err:
- /*
+done:
+err: /*
* If the checkpoint didn't complete successfully, make sure the
* tree is marked dirty.
*/