summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMichael Cahill <michael.cahill@mongodb.com>2016-10-04 10:11:27 +1100
committerAlex Gorrod <alexander.gorrod@mongodb.com>2016-10-04 10:11:27 +1100
commit213053e190333eecf91556875317a44ac7f283f6 (patch)
tree25404b2d9224b12ce5288ddbb06d1ce049227143
parent17221485164bd2041f0bfba2b9eff6d8a627344c (diff)
downloadmongo-213053e190333eecf91556875317a44ac7f283f6.tar.gz
WT-2933 Fix a race between named snapshots and checkpoints. (#3068)
-rw-r--r--src/btree/bt_sync.c6
-rw-r--r--src/evict/evict_lru.c2
-rw-r--r--src/include/cache.i2
-rw-r--r--src/include/txn.h10
-rw-r--r--src/include/txn.i6
-rw-r--r--src/lsm/lsm_cursor.c19
-rw-r--r--src/session/session_api.c11
-rw-r--r--src/txn/txn.c38
-rw-r--r--src/txn/txn_ckpt.c4
-rw-r--r--src/txn/txn_nsnap.c53
-rw-r--r--test/format/lrt.c16
11 files changed, 110 insertions, 57 deletions
diff --git a/src/btree/bt_sync.c b/src/btree/bt_sync.c
index c322548c059..c9234bb5a45 100644
--- a/src/btree/bt_sync.c
+++ b/src/btree/bt_sync.c
@@ -24,14 +24,14 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
WT_REF *walk;
WT_TXN *txn;
uint64_t internal_bytes, internal_pages, leaf_bytes, leaf_pages;
- uint64_t oldest_id, saved_snap_min;
+ uint64_t oldest_id, saved_pinned_id;
uint32_t flags;
conn = S2C(session);
btree = S2BT(session);
walk = NULL;
txn = &session->txn;
- saved_snap_min = WT_SESSION_TXN_STATE(session)->snap_min;
+ saved_pinned_id = WT_SESSION_TXN_STATE(session)->pinned_id;
flags = WT_READ_CACHE | WT_READ_NO_GEN;
internal_bytes = leaf_bytes = 0;
@@ -225,7 +225,7 @@ err: /* On error, clear any left-over tree walk. */
* snapshot active when we started, release it.
*/
if (txn->isolation == WT_ISO_READ_COMMITTED &&
- saved_snap_min == WT_TXN_NONE)
+ saved_pinned_id == WT_TXN_NONE)
__wt_txn_release_snapshot(session);
/* Clear the checkpoint flag and push the change. */
diff --git a/src/evict/evict_lru.c b/src/evict/evict_lru.c
index acc81f566a5..91cc64e7d27 100644
--- a/src/evict/evict_lru.c
+++ b/src/evict/evict_lru.c
@@ -1816,7 +1816,7 @@ __wt_cache_eviction_worker(WT_SESSION_IMPL *session, bool busy, u_int pct_full)
* limit the work to 5 evictions and return. If that's not the
* case, we can do more.
*/
- if (!busy && txn_state->snap_min != WT_TXN_NONE &&
+ if (!busy && txn_state->pinned_id != WT_TXN_NONE &&
txn_global->current != txn_global->oldest_id)
busy = true;
max_pages_evicted = busy ? 5 : 20;
diff --git a/src/include/cache.i b/src/include/cache.i
index 4255d04ec37..17ab39e97d2 100644
--- a/src/include/cache.i
+++ b/src/include/cache.i
@@ -355,7 +355,7 @@ __wt_cache_eviction_check(WT_SESSION_IMPL *session, bool busy, bool *didworkp)
txn_state = WT_SESSION_TXN_STATE(session);
busy = busy || txn_state->id != WT_TXN_NONE ||
session->nhazard > 0 ||
- (txn_state->snap_min != WT_TXN_NONE &&
+ (txn_state->pinned_id != WT_TXN_NONE &&
txn_global->current != txn_global->oldest_id);
/*
diff --git a/src/include/txn.h b/src/include/txn.h
index 2e41ae8620d..8128e8e4cc2 100644
--- a/src/include/txn.h
+++ b/src/include/txn.h
@@ -49,9 +49,9 @@
WT_ASSERT((s), (s)->txn.forced_iso > 0); \
(s)->txn.forced_iso--; \
WT_ASSERT((s), txn_state->id == saved_state.id && \
- (txn_state->snap_min == saved_state.snap_min || \
- saved_state.snap_min == WT_TXN_NONE)); \
- txn_state->snap_min = saved_state.snap_min; \
+ (txn_state->pinned_id == saved_state.pinned_id || \
+ saved_state.pinned_id == WT_TXN_NONE)); \
+ txn_state->pinned_id = saved_state.pinned_id; \
} while (0)
struct __wt_named_snapshot {
@@ -59,14 +59,14 @@ struct __wt_named_snapshot {
TAILQ_ENTRY(__wt_named_snapshot) q;
- uint64_t snap_min, snap_max;
+ uint64_t pinned_id, snap_min, snap_max;
uint64_t *snapshot;
uint32_t snapshot_count;
};
struct WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) __wt_txn_state {
volatile uint64_t id;
- volatile uint64_t snap_min;
+ volatile uint64_t pinned_id;
};
struct __wt_txn_global {
diff --git a/src/include/txn.i b/src/include/txn.i
index 1a8851a9a2a..cf7e2eafc65 100644
--- a/src/include/txn.i
+++ b/src/include/txn.i
@@ -309,7 +309,7 @@ __wt_txn_idle_cache_check(WT_SESSION_IMPL *session)
* WT_TXN_HAS_SNAPSHOT.
*/
if (F_ISSET(txn, WT_TXN_RUNNING) &&
- !F_ISSET(txn, WT_TXN_HAS_ID) && txn_state->snap_min == WT_TXN_NONE)
+ !F_ISSET(txn, WT_TXN_HAS_ID) && txn_state->pinned_id == WT_TXN_NONE)
WT_RET(__wt_cache_eviction_check(session, false, NULL));
return (0);
@@ -480,8 +480,8 @@ __wt_txn_cursor_op(WT_SESSION_IMPL *session)
* positioned on a value, it can't be freed.
*/
if (txn->isolation == WT_ISO_READ_UNCOMMITTED) {
- if (txn_state->snap_min == WT_TXN_NONE)
- txn_state->snap_min = txn_global->last_running;
+ if (txn_state->pinned_id == WT_TXN_NONE)
+ txn_state->pinned_id = txn_global->last_running;
} else if (!F_ISSET(txn, WT_TXN_HAS_SNAPSHOT))
WT_RET(__wt_txn_get_snapshot(session));
diff --git a/src/lsm/lsm_cursor.c b/src/lsm/lsm_cursor.c
index e98f59e7b05..b9a6dd18b7a 100644
--- a/src/lsm/lsm_cursor.c
+++ b/src/lsm/lsm_cursor.c
@@ -165,8 +165,7 @@ __clsm_enter(WT_CURSOR_LSM *clsm, bool reset, bool update)
WT_LSM_TREE *lsm_tree;
WT_SESSION_IMPL *session;
WT_TXN *txn;
- uint64_t *switch_txnp;
- uint64_t snap_min;
+ uint64_t pinned_id, *switchp;
lsm_tree = clsm->lsm_tree;
session = (WT_SESSION_IMPL *)clsm->iface.session;
@@ -226,8 +225,8 @@ __clsm_enter(WT_CURSOR_LSM *clsm, bool reset, bool update)
* that overlaps with our snapshot is a potential
* conflict.
*
- * Note that the global snap_min is correct here: it
- * tracks concurrent transactions excluding special
+ * Note that the pinned ID is correct here: it tracks
+ * concurrent transactions excluding special
* transactions such as checkpoint (which we can't
* conflict with because checkpoint only writes the
* metadata, which is not an LSM tree).
@@ -237,17 +236,17 @@ __clsm_enter(WT_CURSOR_LSM *clsm, bool reset, bool update)
F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) {
WT_ASSERT(session,
F_ISSET(txn, WT_TXN_HAS_SNAPSHOT));
- snap_min =
- WT_SESSION_TXN_STATE(session)->snap_min;
- for (switch_txnp =
+ pinned_id =
+ WT_SESSION_TXN_STATE(session)->pinned_id;
+ for (switchp =
&clsm->switch_txn[clsm->nchunks - 2];
clsm->nupdates < clsm->nchunks;
- clsm->nupdates++, switch_txnp--) {
- if (WT_TXNID_LT(*switch_txnp, snap_min))
+ clsm->nupdates++, switchp--) {
+ if (WT_TXNID_LT(*switchp, pinned_id))
break;
WT_ASSERT(session,
!__wt_txn_visible_all(
- session, *switch_txnp));
+ session, *switchp));
}
}
}
diff --git a/src/session/session_api.c b/src/session/session_api.c
index 0d3fcad3184..426354b4994 100644
--- a/src/session/session_api.c
+++ b/src/session/session_api.c
@@ -68,9 +68,10 @@ __wt_session_copy_values(WT_SESSION_IMPL *session)
* unless the cursor is reading from a checkpoint.
*/
WT_TXN_STATE *txn_state = WT_SESSION_TXN_STATE(session);
- WT_ASSERT(session, txn_state->snap_min != WT_TXN_NONE ||
- (WT_PREFIX_MATCH(cursor->uri, "file:") &&
- F_ISSET((WT_CURSOR_BTREE *)cursor, WT_CBT_NO_TXN)));
+ WT_ASSERT(session,
+ txn_state->pinned_id != WT_TXN_NONE ||
+ (WT_PREFIX_MATCH(cursor->uri, "file:") &&
+ F_ISSET((WT_CURSOR_BTREE *)cursor, WT_CBT_NO_TXN)));
#endif
F_CLR(cursor, WT_CURSTD_VALUE_INT);
@@ -1417,10 +1418,10 @@ __session_transaction_pinned_range(WT_SESSION *wt_session, uint64_t *prange)
/* Assign pinned to the lesser of id or snap_min */
if (txn_state->id != WT_TXN_NONE &&
- WT_TXNID_LT(txn_state->id, txn_state->snap_min))
+ WT_TXNID_LT(txn_state->id, txn_state->pinned_id))
pinned = txn_state->id;
else
- pinned = txn_state->snap_min;
+ pinned = txn_state->pinned_id;
if (pinned == WT_TXN_NONE)
*prange = 0;
diff --git a/src/txn/txn.c b/src/txn/txn.c
index 2ba48954456..3b24bcd505d 100644
--- a/src/txn/txn.c
+++ b/src/txn/txn.c
@@ -96,11 +96,11 @@ __wt_txn_release_snapshot(WT_SESSION_IMPL *session)
txn_state = WT_SESSION_TXN_STATE(session);
WT_ASSERT(session,
- txn_state->snap_min == WT_TXN_NONE ||
+ txn_state->pinned_id == WT_TXN_NONE ||
session->txn.isolation == WT_ISO_READ_UNCOMMITTED ||
- !__wt_txn_visible_all(session, txn_state->snap_min));
+ !__wt_txn_visible_all(session, txn_state->pinned_id));
- txn_state->snap_min = WT_TXN_NONE;
+ txn_state->pinned_id = WT_TXN_NONE;
F_CLR(txn, WT_TXN_HAS_SNAPSHOT);
}
@@ -117,7 +117,7 @@ __wt_txn_get_snapshot(WT_SESSION_IMPL *session)
WT_TXN_GLOBAL *txn_global;
WT_TXN_STATE *s, *txn_state;
uint64_t current_id, id;
- uint64_t prev_oldest_id, snap_min;
+ uint64_t prev_oldest_id, pinned_id;
uint32_t i, n, session_cnt;
conn = S2C(session);
@@ -135,21 +135,21 @@ __wt_txn_get_snapshot(WT_SESSION_IMPL *session)
WT_PAUSE();
WT_RET(ret);
- current_id = snap_min = txn_global->current;
+ current_id = pinned_id = txn_global->current;
prev_oldest_id = txn_global->oldest_id;
/*
* Include the checkpoint transaction, if one is running: we should
* ignore any uncommitted changes the checkpoint has written to the
* metadata. We don't have to keep the checkpoint's changes pinned so
- * don't including it in the published snap_min.
+ * don't including it in the published pinned ID.
*/
if ((id = txn_global->checkpoint_txnid) != WT_TXN_NONE)
txn->snapshot[n++] = id;
/* For pure read-only workloads, avoid scanning. */
if (prev_oldest_id == current_id) {
- txn_state->snap_min = current_id;
+ txn_state->pinned_id = current_id;
/* Check that the oldest ID has not moved in the meantime. */
WT_ASSERT(session, prev_oldest_id == txn_global->oldest_id);
goto done;
@@ -172,18 +172,18 @@ __wt_txn_get_snapshot(WT_SESSION_IMPL *session)
(id = s->id) != WT_TXN_NONE &&
WT_TXNID_LE(prev_oldest_id, id)) {
txn->snapshot[n++] = id;
- if (WT_TXNID_LT(id, snap_min))
- snap_min = id;
+ if (WT_TXNID_LT(id, pinned_id))
+ pinned_id = id;
}
}
/*
- * If we got a new snapshot, update the published snap_min for this
+ * If we got a new snapshot, update the published pinned ID for this
* session.
*/
- WT_ASSERT(session, WT_TXNID_LE(prev_oldest_id, snap_min));
+ WT_ASSERT(session, WT_TXNID_LE(prev_oldest_id, pinned_id));
WT_ASSERT(session, prev_oldest_id == txn_global->oldest_id);
- txn_state->snap_min = snap_min;
+ txn_state->pinned_id = pinned_id;
done: __wt_readunlock(session, txn_global->scan_rwlock);
__txn_sort_snapshot(session, n, current_id);
@@ -232,13 +232,13 @@ __txn_oldest_scan(WT_SESSION_IMPL *session,
/*
* !!!
- * Note: Don't ignore snap_min values older than the previous
- * oldest ID. Read-uncommitted operations publish snap_min
+ * Note: Don't ignore pinned ID values older than the previous
+ * oldest ID. Read-uncommitted operations publish pinned ID
* values without acquiring the scan lock to protect the global
- * table. See the comment in __wt_txn_cursor_op for
- * more details.
+ * table. See the comment in __wt_txn_cursor_op for more
+ * details.
*/
- if ((id = s->snap_min) != WT_TXN_NONE &&
+ if ((id = s->pinned_id) != WT_TXN_NONE &&
WT_TXNID_LT(id, oldest_id)) {
oldest_id = id;
oldest_session = &conn->sessions[i];
@@ -673,7 +673,7 @@ __wt_txn_init(WT_SESSION_IMPL *session)
if (S2C(session)->txn_global.states != NULL) {
WT_TXN_STATE *txn_state;
txn_state = WT_SESSION_TXN_STATE(session);
- WT_ASSERT(session, txn_state->snap_min == WT_TXN_NONE);
+ WT_ASSERT(session, txn_state->pinned_id == WT_TXN_NONE);
}
#endif
@@ -773,7 +773,7 @@ __wt_txn_global_init(WT_SESSION_IMPL *session, const char *cfg[])
WT_CACHE_LINE_ALIGNMENT_VERIFY(session, txn_global->states);
for (i = 0, s = txn_global->states; i < conn->session_size; i++, s++)
- s->id = s->snap_min = WT_TXN_NONE;
+ s->id = s->pinned_id = WT_TXN_NONE;
return (0);
}
diff --git a/src/txn/txn_ckpt.c b/src/txn/txn_ckpt.c
index 3aad95f5a9f..03cfcf080c1 100644
--- a/src/txn/txn_ckpt.c
+++ b/src/txn/txn_ckpt.c
@@ -689,7 +689,7 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
*/
WT_ASSERT(session,
WT_TXNID_LE(txn_global->oldest_id, txn_state->id) &&
- WT_TXNID_LE(txn_global->oldest_id, txn_state->snap_min));
+ WT_TXNID_LE(txn_global->oldest_id, txn_state->pinned_id));
/*
* Clear our entry from the global transaction session table. Any
@@ -698,7 +698,7 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
* can safely ignore the checkpoint ID (see the visible all check for
* details).
*/
- txn_state->id = txn_state->snap_min = WT_TXN_NONE;
+ txn_state->id = txn_state->pinned_id = WT_TXN_NONE;
__wt_writeunlock(session, txn_global->scan_rwlock);
/*
diff --git a/src/txn/txn_nsnap.c b/src/txn/txn_nsnap.c
index 8f7e93238de..5cc139eba93 100644
--- a/src/txn/txn_nsnap.c
+++ b/src/txn/txn_nsnap.c
@@ -42,9 +42,16 @@ __nsnap_drop_one(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *name)
return (WT_NOTFOUND);
/* Bump the global ID if we are removing the first entry */
- if (found == TAILQ_FIRST(&txn_global->nsnaph))
+ if (found == TAILQ_FIRST(&txn_global->nsnaph)) {
+ WT_ASSERT(session, !__wt_txn_visible_all(
+ session, txn_global->nsnap_oldest_id));
txn_global->nsnap_oldest_id = (TAILQ_NEXT(found, q) != NULL) ?
- TAILQ_NEXT(found, q)->snap_min : WT_TXN_NONE;
+ TAILQ_NEXT(found, q)->pinned_id : WT_TXN_NONE;
+ WT_DIAGNOSTIC_YIELD;
+ WT_ASSERT(session, txn_global->nsnap_oldest_id == WT_TXN_NONE ||
+ !__wt_txn_visible_all(
+ session, txn_global->nsnap_oldest_id));
+ }
TAILQ_REMOVE(&txn_global->nsnaph, found, q);
__nsnap_destroy(session, found);
WT_STAT_CONN_INCR(session, txn_snapshots_dropped);
@@ -104,7 +111,7 @@ __nsnap_drop_to(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *name, bool inclusive)
}
if (TAILQ_NEXT(last, q) != NULL)
- new_nsnap_oldest = TAILQ_NEXT(last, q)->snap_min;
+ new_nsnap_oldest = TAILQ_NEXT(last, q)->pinned_id;
}
do {
@@ -117,7 +124,15 @@ __nsnap_drop_to(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *name, bool inclusive)
} while (nsnap != last && !TAILQ_EMPTY(&txn_global->nsnaph));
/* Now that the queue of named snapshots is updated, update the ID */
+ WT_ASSERT(session, !__wt_txn_visible_all(
+ session, txn_global->nsnap_oldest_id) &&
+ (new_nsnap_oldest == WT_TXN_NONE ||
+ WT_TXNID_LE(txn_global->nsnap_oldest_id, new_nsnap_oldest)));
txn_global->nsnap_oldest_id = new_nsnap_oldest;
+ WT_DIAGNOSTIC_YIELD;
+ WT_ASSERT(session,
+ new_nsnap_oldest == WT_TXN_NONE ||
+ !__wt_txn_visible_all(session, new_nsnap_oldest));
return (ret);
}
@@ -134,6 +149,7 @@ __wt_txn_named_snapshot_begin(WT_SESSION_IMPL *session, const char *cfg[])
WT_NAMED_SNAPSHOT *nsnap, *nsnap_new;
WT_TXN *txn;
WT_TXN_GLOBAL *txn_global;
+ uint64_t pinned_id;
const char *txn_cfg[] =
{ WT_CONFIG_BASE(session, WT_SESSION_begin_transaction),
"isolation=snapshot", NULL };
@@ -157,6 +173,7 @@ __wt_txn_named_snapshot_begin(WT_SESSION_IMPL *session, const char *cfg[])
WT_ERR(__wt_calloc_one(session, &nsnap_new));
nsnap = nsnap_new;
WT_ERR(__wt_strndup(session, cval.str, cval.len, &nsnap->name));
+ nsnap->pinned_id = pinned_id = WT_SESSION_TXN_STATE(session)->pinned_id;
nsnap->snap_min = txn->snap_min;
nsnap->snap_max = txn->snap_max;
if (txn->snapshot_count > 0) {
@@ -175,15 +192,22 @@ __wt_txn_named_snapshot_begin(WT_SESSION_IMPL *session, const char *cfg[])
*/
WT_ERR_NOTFOUND_OK(__nsnap_drop_one(session, &cval));
- if (TAILQ_EMPTY(&txn_global->nsnaph))
- txn_global->nsnap_oldest_id = nsnap_new->snap_min;
+ if (TAILQ_EMPTY(&txn_global->nsnaph)) {
+ WT_ASSERT(session, txn_global->nsnap_oldest_id == WT_TXN_NONE &&
+ !__wt_txn_visible_all(session, nsnap_new->pinned_id));
+ __wt_readlock(session, txn_global->scan_rwlock);
+ txn_global->nsnap_oldest_id = nsnap_new->pinned_id;
+ __wt_readunlock(session, txn_global->scan_rwlock);
+ }
TAILQ_INSERT_TAIL(&txn_global->nsnaph, nsnap_new, q);
WT_STAT_CONN_INCR(session, txn_snapshots_created);
nsnap_new = NULL;
-err: if (started_txn)
+err: if (started_txn) {
WT_TRET(__wt_txn_rollback(session, NULL));
- else if (ret == 0)
+ WT_DIAGNOSTIC_YIELD;
+ WT_ASSERT(session, !__wt_txn_visible_all(session, pinned_id));
+ } else if (ret == 0)
F_SET(txn, WT_TXN_NAMED_SNAPSHOT);
if (nsnap_new != NULL)
@@ -258,7 +282,20 @@ __wt_txn_named_snapshot_get(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *nameval)
__wt_readlock(session, txn_global->nsnap_rwlock);
TAILQ_FOREACH(nsnap, &txn_global->nsnaph, q)
if (WT_STRING_MATCH(nsnap->name, nameval->str, nameval->len)) {
- txn->snap_min = txn_state->snap_min = nsnap->snap_min;
+ /*
+ * Acquire the scan lock so the oldest ID can't move
+ * forward without seeing our pinned ID.
+ */
+ __wt_readlock(session, txn_global->scan_rwlock);
+ txn_state->pinned_id = nsnap->pinned_id;
+ __wt_readunlock(session, txn_global->scan_rwlock);
+
+ WT_ASSERT(session, !__wt_txn_visible_all(
+ session, txn_state->pinned_id) &&
+ txn_global->nsnap_oldest_id != WT_TXN_NONE &&
+ WT_TXNID_LE(txn_global->nsnap_oldest_id,
+ txn_state->pinned_id));
+ txn->snap_min = nsnap->snap_min;
txn->snap_max = nsnap->snap_max;
if ((txn->snapshot_count = nsnap->snapshot_count) != 0)
memcpy(txn->snapshot, nsnap->snapshot,
diff --git a/test/format/lrt.c b/test/format/lrt.c
index 937525522fa..69d6b22d71f 100644
--- a/test/format/lrt.c
+++ b/test/format/lrt.c
@@ -96,6 +96,22 @@ lrt(void *arg)
pinned = 0;
} else {
/*
+ * Test named snapshots: create a snapshot, wait to
+ * give the transaction state time to move forward,
+ * then start a transaction with the named snapshot,
+ * drop it, then commit the transaction. This exercises
+ * most of the named snapshot logic under load.
+ */
+ testutil_check(session->snapshot(session, "name=test"));
+ sleep(1);
+ testutil_check(session->begin_transaction(
+ session, "snapshot=test"));
+ testutil_check(session->snapshot(
+ session, "drop=(all)"));
+ testutil_check(session->commit_transaction(
+ session, NULL));
+
+ /*
* Begin transaction: without an explicit transaction,
* the snapshot is only kept around while a cursor is
* positioned. As soon as the cursor loses its position