diff options
author | Michael Cahill <michael.cahill@mongodb.com> | 2016-10-04 10:11:27 +1100 |
---|---|---|
committer | Alex Gorrod <alexander.gorrod@mongodb.com> | 2016-10-04 10:11:27 +1100 |
commit | 213053e190333eecf91556875317a44ac7f283f6 (patch) | |
tree | 25404b2d9224b12ce5288ddbb06d1ce049227143 | |
parent | 17221485164bd2041f0bfba2b9eff6d8a627344c (diff) | |
download | mongo-213053e190333eecf91556875317a44ac7f283f6.tar.gz |
WT-2933 Fix a race between named snapshots and checkpoints. (#3068)
-rw-r--r-- | src/btree/bt_sync.c | 6 | ||||
-rw-r--r-- | src/evict/evict_lru.c | 2 | ||||
-rw-r--r-- | src/include/cache.i | 2 | ||||
-rw-r--r-- | src/include/txn.h | 10 | ||||
-rw-r--r-- | src/include/txn.i | 6 | ||||
-rw-r--r-- | src/lsm/lsm_cursor.c | 19 | ||||
-rw-r--r-- | src/session/session_api.c | 11 | ||||
-rw-r--r-- | src/txn/txn.c | 38 | ||||
-rw-r--r-- | src/txn/txn_ckpt.c | 4 | ||||
-rw-r--r-- | src/txn/txn_nsnap.c | 53 | ||||
-rw-r--r-- | test/format/lrt.c | 16 |
11 files changed, 110 insertions, 57 deletions
diff --git a/src/btree/bt_sync.c b/src/btree/bt_sync.c index c322548c059..c9234bb5a45 100644 --- a/src/btree/bt_sync.c +++ b/src/btree/bt_sync.c @@ -24,14 +24,14 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) WT_REF *walk; WT_TXN *txn; uint64_t internal_bytes, internal_pages, leaf_bytes, leaf_pages; - uint64_t oldest_id, saved_snap_min; + uint64_t oldest_id, saved_pinned_id; uint32_t flags; conn = S2C(session); btree = S2BT(session); walk = NULL; txn = &session->txn; - saved_snap_min = WT_SESSION_TXN_STATE(session)->snap_min; + saved_pinned_id = WT_SESSION_TXN_STATE(session)->pinned_id; flags = WT_READ_CACHE | WT_READ_NO_GEN; internal_bytes = leaf_bytes = 0; @@ -225,7 +225,7 @@ err: /* On error, clear any left-over tree walk. */ * snapshot active when we started, release it. */ if (txn->isolation == WT_ISO_READ_COMMITTED && - saved_snap_min == WT_TXN_NONE) + saved_pinned_id == WT_TXN_NONE) __wt_txn_release_snapshot(session); /* Clear the checkpoint flag and push the change. */ diff --git a/src/evict/evict_lru.c b/src/evict/evict_lru.c index acc81f566a5..91cc64e7d27 100644 --- a/src/evict/evict_lru.c +++ b/src/evict/evict_lru.c @@ -1816,7 +1816,7 @@ __wt_cache_eviction_worker(WT_SESSION_IMPL *session, bool busy, u_int pct_full) * limit the work to 5 evictions and return. If that's not the * case, we can do more. */ - if (!busy && txn_state->snap_min != WT_TXN_NONE && + if (!busy && txn_state->pinned_id != WT_TXN_NONE && txn_global->current != txn_global->oldest_id) busy = true; max_pages_evicted = busy ? 5 : 20; diff --git a/src/include/cache.i b/src/include/cache.i index 4255d04ec37..17ab39e97d2 100644 --- a/src/include/cache.i +++ b/src/include/cache.i @@ -355,7 +355,7 @@ __wt_cache_eviction_check(WT_SESSION_IMPL *session, bool busy, bool *didworkp) txn_state = WT_SESSION_TXN_STATE(session); busy = busy || txn_state->id != WT_TXN_NONE || session->nhazard > 0 || - (txn_state->snap_min != WT_TXN_NONE && + (txn_state->pinned_id != WT_TXN_NONE && txn_global->current != txn_global->oldest_id); /* diff --git a/src/include/txn.h b/src/include/txn.h index 2e41ae8620d..8128e8e4cc2 100644 --- a/src/include/txn.h +++ b/src/include/txn.h @@ -49,9 +49,9 @@ WT_ASSERT((s), (s)->txn.forced_iso > 0); \ (s)->txn.forced_iso--; \ WT_ASSERT((s), txn_state->id == saved_state.id && \ - (txn_state->snap_min == saved_state.snap_min || \ - saved_state.snap_min == WT_TXN_NONE)); \ - txn_state->snap_min = saved_state.snap_min; \ + (txn_state->pinned_id == saved_state.pinned_id || \ + saved_state.pinned_id == WT_TXN_NONE)); \ + txn_state->pinned_id = saved_state.pinned_id; \ } while (0) struct __wt_named_snapshot { @@ -59,14 +59,14 @@ struct __wt_named_snapshot { TAILQ_ENTRY(__wt_named_snapshot) q; - uint64_t snap_min, snap_max; + uint64_t pinned_id, snap_min, snap_max; uint64_t *snapshot; uint32_t snapshot_count; }; struct WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) __wt_txn_state { volatile uint64_t id; - volatile uint64_t snap_min; + volatile uint64_t pinned_id; }; struct __wt_txn_global { diff --git a/src/include/txn.i b/src/include/txn.i index 1a8851a9a2a..cf7e2eafc65 100644 --- a/src/include/txn.i +++ b/src/include/txn.i @@ -309,7 +309,7 @@ __wt_txn_idle_cache_check(WT_SESSION_IMPL *session) * WT_TXN_HAS_SNAPSHOT. */ if (F_ISSET(txn, WT_TXN_RUNNING) && - !F_ISSET(txn, WT_TXN_HAS_ID) && txn_state->snap_min == WT_TXN_NONE) + !F_ISSET(txn, WT_TXN_HAS_ID) && txn_state->pinned_id == WT_TXN_NONE) WT_RET(__wt_cache_eviction_check(session, false, NULL)); return (0); @@ -480,8 +480,8 @@ __wt_txn_cursor_op(WT_SESSION_IMPL *session) * positioned on a value, it can't be freed. */ if (txn->isolation == WT_ISO_READ_UNCOMMITTED) { - if (txn_state->snap_min == WT_TXN_NONE) - txn_state->snap_min = txn_global->last_running; + if (txn_state->pinned_id == WT_TXN_NONE) + txn_state->pinned_id = txn_global->last_running; } else if (!F_ISSET(txn, WT_TXN_HAS_SNAPSHOT)) WT_RET(__wt_txn_get_snapshot(session)); diff --git a/src/lsm/lsm_cursor.c b/src/lsm/lsm_cursor.c index e98f59e7b05..b9a6dd18b7a 100644 --- a/src/lsm/lsm_cursor.c +++ b/src/lsm/lsm_cursor.c @@ -165,8 +165,7 @@ __clsm_enter(WT_CURSOR_LSM *clsm, bool reset, bool update) WT_LSM_TREE *lsm_tree; WT_SESSION_IMPL *session; WT_TXN *txn; - uint64_t *switch_txnp; - uint64_t snap_min; + uint64_t pinned_id, *switchp; lsm_tree = clsm->lsm_tree; session = (WT_SESSION_IMPL *)clsm->iface.session; @@ -226,8 +225,8 @@ __clsm_enter(WT_CURSOR_LSM *clsm, bool reset, bool update) * that overlaps with our snapshot is a potential * conflict. * - * Note that the global snap_min is correct here: it - * tracks concurrent transactions excluding special + * Note that the pinned ID is correct here: it tracks + * concurrent transactions excluding special * transactions such as checkpoint (which we can't * conflict with because checkpoint only writes the * metadata, which is not an LSM tree). @@ -237,17 +236,17 @@ __clsm_enter(WT_CURSOR_LSM *clsm, bool reset, bool update) F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) { WT_ASSERT(session, F_ISSET(txn, WT_TXN_HAS_SNAPSHOT)); - snap_min = - WT_SESSION_TXN_STATE(session)->snap_min; - for (switch_txnp = + pinned_id = + WT_SESSION_TXN_STATE(session)->pinned_id; + for (switchp = &clsm->switch_txn[clsm->nchunks - 2]; clsm->nupdates < clsm->nchunks; - clsm->nupdates++, switch_txnp--) { - if (WT_TXNID_LT(*switch_txnp, snap_min)) + clsm->nupdates++, switchp--) { + if (WT_TXNID_LT(*switchp, pinned_id)) break; WT_ASSERT(session, !__wt_txn_visible_all( - session, *switch_txnp)); + session, *switchp)); } } } diff --git a/src/session/session_api.c b/src/session/session_api.c index 0d3fcad3184..426354b4994 100644 --- a/src/session/session_api.c +++ b/src/session/session_api.c @@ -68,9 +68,10 @@ __wt_session_copy_values(WT_SESSION_IMPL *session) * unless the cursor is reading from a checkpoint. */ WT_TXN_STATE *txn_state = WT_SESSION_TXN_STATE(session); - WT_ASSERT(session, txn_state->snap_min != WT_TXN_NONE || - (WT_PREFIX_MATCH(cursor->uri, "file:") && - F_ISSET((WT_CURSOR_BTREE *)cursor, WT_CBT_NO_TXN))); + WT_ASSERT(session, + txn_state->pinned_id != WT_TXN_NONE || + (WT_PREFIX_MATCH(cursor->uri, "file:") && + F_ISSET((WT_CURSOR_BTREE *)cursor, WT_CBT_NO_TXN))); #endif F_CLR(cursor, WT_CURSTD_VALUE_INT); @@ -1417,10 +1418,10 @@ __session_transaction_pinned_range(WT_SESSION *wt_session, uint64_t *prange) /* Assign pinned to the lesser of id or snap_min */ if (txn_state->id != WT_TXN_NONE && - WT_TXNID_LT(txn_state->id, txn_state->snap_min)) + WT_TXNID_LT(txn_state->id, txn_state->pinned_id)) pinned = txn_state->id; else - pinned = txn_state->snap_min; + pinned = txn_state->pinned_id; if (pinned == WT_TXN_NONE) *prange = 0; diff --git a/src/txn/txn.c b/src/txn/txn.c index 2ba48954456..3b24bcd505d 100644 --- a/src/txn/txn.c +++ b/src/txn/txn.c @@ -96,11 +96,11 @@ __wt_txn_release_snapshot(WT_SESSION_IMPL *session) txn_state = WT_SESSION_TXN_STATE(session); WT_ASSERT(session, - txn_state->snap_min == WT_TXN_NONE || + txn_state->pinned_id == WT_TXN_NONE || session->txn.isolation == WT_ISO_READ_UNCOMMITTED || - !__wt_txn_visible_all(session, txn_state->snap_min)); + !__wt_txn_visible_all(session, txn_state->pinned_id)); - txn_state->snap_min = WT_TXN_NONE; + txn_state->pinned_id = WT_TXN_NONE; F_CLR(txn, WT_TXN_HAS_SNAPSHOT); } @@ -117,7 +117,7 @@ __wt_txn_get_snapshot(WT_SESSION_IMPL *session) WT_TXN_GLOBAL *txn_global; WT_TXN_STATE *s, *txn_state; uint64_t current_id, id; - uint64_t prev_oldest_id, snap_min; + uint64_t prev_oldest_id, pinned_id; uint32_t i, n, session_cnt; conn = S2C(session); @@ -135,21 +135,21 @@ __wt_txn_get_snapshot(WT_SESSION_IMPL *session) WT_PAUSE(); WT_RET(ret); - current_id = snap_min = txn_global->current; + current_id = pinned_id = txn_global->current; prev_oldest_id = txn_global->oldest_id; /* * Include the checkpoint transaction, if one is running: we should * ignore any uncommitted changes the checkpoint has written to the * metadata. We don't have to keep the checkpoint's changes pinned so - * don't including it in the published snap_min. + * don't including it in the published pinned ID. */ if ((id = txn_global->checkpoint_txnid) != WT_TXN_NONE) txn->snapshot[n++] = id; /* For pure read-only workloads, avoid scanning. */ if (prev_oldest_id == current_id) { - txn_state->snap_min = current_id; + txn_state->pinned_id = current_id; /* Check that the oldest ID has not moved in the meantime. */ WT_ASSERT(session, prev_oldest_id == txn_global->oldest_id); goto done; @@ -172,18 +172,18 @@ __wt_txn_get_snapshot(WT_SESSION_IMPL *session) (id = s->id) != WT_TXN_NONE && WT_TXNID_LE(prev_oldest_id, id)) { txn->snapshot[n++] = id; - if (WT_TXNID_LT(id, snap_min)) - snap_min = id; + if (WT_TXNID_LT(id, pinned_id)) + pinned_id = id; } } /* - * If we got a new snapshot, update the published snap_min for this + * If we got a new snapshot, update the published pinned ID for this * session. */ - WT_ASSERT(session, WT_TXNID_LE(prev_oldest_id, snap_min)); + WT_ASSERT(session, WT_TXNID_LE(prev_oldest_id, pinned_id)); WT_ASSERT(session, prev_oldest_id == txn_global->oldest_id); - txn_state->snap_min = snap_min; + txn_state->pinned_id = pinned_id; done: __wt_readunlock(session, txn_global->scan_rwlock); __txn_sort_snapshot(session, n, current_id); @@ -232,13 +232,13 @@ __txn_oldest_scan(WT_SESSION_IMPL *session, /* * !!! - * Note: Don't ignore snap_min values older than the previous - * oldest ID. Read-uncommitted operations publish snap_min + * Note: Don't ignore pinned ID values older than the previous + * oldest ID. Read-uncommitted operations publish pinned ID * values without acquiring the scan lock to protect the global - * table. See the comment in __wt_txn_cursor_op for - * more details. + * table. See the comment in __wt_txn_cursor_op for more + * details. */ - if ((id = s->snap_min) != WT_TXN_NONE && + if ((id = s->pinned_id) != WT_TXN_NONE && WT_TXNID_LT(id, oldest_id)) { oldest_id = id; oldest_session = &conn->sessions[i]; @@ -673,7 +673,7 @@ __wt_txn_init(WT_SESSION_IMPL *session) if (S2C(session)->txn_global.states != NULL) { WT_TXN_STATE *txn_state; txn_state = WT_SESSION_TXN_STATE(session); - WT_ASSERT(session, txn_state->snap_min == WT_TXN_NONE); + WT_ASSERT(session, txn_state->pinned_id == WT_TXN_NONE); } #endif @@ -773,7 +773,7 @@ __wt_txn_global_init(WT_SESSION_IMPL *session, const char *cfg[]) WT_CACHE_LINE_ALIGNMENT_VERIFY(session, txn_global->states); for (i = 0, s = txn_global->states; i < conn->session_size; i++, s++) - s->id = s->snap_min = WT_TXN_NONE; + s->id = s->pinned_id = WT_TXN_NONE; return (0); } diff --git a/src/txn/txn_ckpt.c b/src/txn/txn_ckpt.c index 3aad95f5a9f..03cfcf080c1 100644 --- a/src/txn/txn_ckpt.c +++ b/src/txn/txn_ckpt.c @@ -689,7 +689,7 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) */ WT_ASSERT(session, WT_TXNID_LE(txn_global->oldest_id, txn_state->id) && - WT_TXNID_LE(txn_global->oldest_id, txn_state->snap_min)); + WT_TXNID_LE(txn_global->oldest_id, txn_state->pinned_id)); /* * Clear our entry from the global transaction session table. Any @@ -698,7 +698,7 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) * can safely ignore the checkpoint ID (see the visible all check for * details). */ - txn_state->id = txn_state->snap_min = WT_TXN_NONE; + txn_state->id = txn_state->pinned_id = WT_TXN_NONE; __wt_writeunlock(session, txn_global->scan_rwlock); /* diff --git a/src/txn/txn_nsnap.c b/src/txn/txn_nsnap.c index 8f7e93238de..5cc139eba93 100644 --- a/src/txn/txn_nsnap.c +++ b/src/txn/txn_nsnap.c @@ -42,9 +42,16 @@ __nsnap_drop_one(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *name) return (WT_NOTFOUND); /* Bump the global ID if we are removing the first entry */ - if (found == TAILQ_FIRST(&txn_global->nsnaph)) + if (found == TAILQ_FIRST(&txn_global->nsnaph)) { + WT_ASSERT(session, !__wt_txn_visible_all( + session, txn_global->nsnap_oldest_id)); txn_global->nsnap_oldest_id = (TAILQ_NEXT(found, q) != NULL) ? - TAILQ_NEXT(found, q)->snap_min : WT_TXN_NONE; + TAILQ_NEXT(found, q)->pinned_id : WT_TXN_NONE; + WT_DIAGNOSTIC_YIELD; + WT_ASSERT(session, txn_global->nsnap_oldest_id == WT_TXN_NONE || + !__wt_txn_visible_all( + session, txn_global->nsnap_oldest_id)); + } TAILQ_REMOVE(&txn_global->nsnaph, found, q); __nsnap_destroy(session, found); WT_STAT_CONN_INCR(session, txn_snapshots_dropped); @@ -104,7 +111,7 @@ __nsnap_drop_to(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *name, bool inclusive) } if (TAILQ_NEXT(last, q) != NULL) - new_nsnap_oldest = TAILQ_NEXT(last, q)->snap_min; + new_nsnap_oldest = TAILQ_NEXT(last, q)->pinned_id; } do { @@ -117,7 +124,15 @@ __nsnap_drop_to(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *name, bool inclusive) } while (nsnap != last && !TAILQ_EMPTY(&txn_global->nsnaph)); /* Now that the queue of named snapshots is updated, update the ID */ + WT_ASSERT(session, !__wt_txn_visible_all( + session, txn_global->nsnap_oldest_id) && + (new_nsnap_oldest == WT_TXN_NONE || + WT_TXNID_LE(txn_global->nsnap_oldest_id, new_nsnap_oldest))); txn_global->nsnap_oldest_id = new_nsnap_oldest; + WT_DIAGNOSTIC_YIELD; + WT_ASSERT(session, + new_nsnap_oldest == WT_TXN_NONE || + !__wt_txn_visible_all(session, new_nsnap_oldest)); return (ret); } @@ -134,6 +149,7 @@ __wt_txn_named_snapshot_begin(WT_SESSION_IMPL *session, const char *cfg[]) WT_NAMED_SNAPSHOT *nsnap, *nsnap_new; WT_TXN *txn; WT_TXN_GLOBAL *txn_global; + uint64_t pinned_id; const char *txn_cfg[] = { WT_CONFIG_BASE(session, WT_SESSION_begin_transaction), "isolation=snapshot", NULL }; @@ -157,6 +173,7 @@ __wt_txn_named_snapshot_begin(WT_SESSION_IMPL *session, const char *cfg[]) WT_ERR(__wt_calloc_one(session, &nsnap_new)); nsnap = nsnap_new; WT_ERR(__wt_strndup(session, cval.str, cval.len, &nsnap->name)); + nsnap->pinned_id = pinned_id = WT_SESSION_TXN_STATE(session)->pinned_id; nsnap->snap_min = txn->snap_min; nsnap->snap_max = txn->snap_max; if (txn->snapshot_count > 0) { @@ -175,15 +192,22 @@ __wt_txn_named_snapshot_begin(WT_SESSION_IMPL *session, const char *cfg[]) */ WT_ERR_NOTFOUND_OK(__nsnap_drop_one(session, &cval)); - if (TAILQ_EMPTY(&txn_global->nsnaph)) - txn_global->nsnap_oldest_id = nsnap_new->snap_min; + if (TAILQ_EMPTY(&txn_global->nsnaph)) { + WT_ASSERT(session, txn_global->nsnap_oldest_id == WT_TXN_NONE && + !__wt_txn_visible_all(session, nsnap_new->pinned_id)); + __wt_readlock(session, txn_global->scan_rwlock); + txn_global->nsnap_oldest_id = nsnap_new->pinned_id; + __wt_readunlock(session, txn_global->scan_rwlock); + } TAILQ_INSERT_TAIL(&txn_global->nsnaph, nsnap_new, q); WT_STAT_CONN_INCR(session, txn_snapshots_created); nsnap_new = NULL; -err: if (started_txn) +err: if (started_txn) { WT_TRET(__wt_txn_rollback(session, NULL)); - else if (ret == 0) + WT_DIAGNOSTIC_YIELD; + WT_ASSERT(session, !__wt_txn_visible_all(session, pinned_id)); + } else if (ret == 0) F_SET(txn, WT_TXN_NAMED_SNAPSHOT); if (nsnap_new != NULL) @@ -258,7 +282,20 @@ __wt_txn_named_snapshot_get(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *nameval) __wt_readlock(session, txn_global->nsnap_rwlock); TAILQ_FOREACH(nsnap, &txn_global->nsnaph, q) if (WT_STRING_MATCH(nsnap->name, nameval->str, nameval->len)) { - txn->snap_min = txn_state->snap_min = nsnap->snap_min; + /* + * Acquire the scan lock so the oldest ID can't move + * forward without seeing our pinned ID. + */ + __wt_readlock(session, txn_global->scan_rwlock); + txn_state->pinned_id = nsnap->pinned_id; + __wt_readunlock(session, txn_global->scan_rwlock); + + WT_ASSERT(session, !__wt_txn_visible_all( + session, txn_state->pinned_id) && + txn_global->nsnap_oldest_id != WT_TXN_NONE && + WT_TXNID_LE(txn_global->nsnap_oldest_id, + txn_state->pinned_id)); + txn->snap_min = nsnap->snap_min; txn->snap_max = nsnap->snap_max; if ((txn->snapshot_count = nsnap->snapshot_count) != 0) memcpy(txn->snapshot, nsnap->snapshot, diff --git a/test/format/lrt.c b/test/format/lrt.c index 937525522fa..69d6b22d71f 100644 --- a/test/format/lrt.c +++ b/test/format/lrt.c @@ -96,6 +96,22 @@ lrt(void *arg) pinned = 0; } else { /* + * Test named snapshots: create a snapshot, wait to + * give the transaction state time to move forward, + * then start a transaction with the named snapshot, + * drop it, then commit the transaction. This exercises + * most of the named snapshot logic under load. + */ + testutil_check(session->snapshot(session, "name=test")); + sleep(1); + testutil_check(session->begin_transaction( + session, "snapshot=test")); + testutil_check(session->snapshot( + session, "drop=(all)")); + testutil_check(session->commit_transaction( + session, NULL)); + + /* * Begin transaction: without an explicit transaction, * the snapshot is only kept around while a cursor is * positioned. As soon as the cursor loses its position |