diff options
author | Michael Cahill <michael.cahill@mongodb.com> | 2016-07-27 16:18:00 +1000 |
---|---|---|
committer | Alex Gorrod <alexander.gorrod@mongodb.com> | 2016-07-27 16:18:00 +1000 |
commit | 7e1a47dd45735a8ee98aaf24f60406a0bb682359 (patch) | |
tree | b9f5502ee918c8222b1edc9ee31d953f464c4af0 | |
parent | e892427f099623146e0a5343d95ceb6dbb082d0f (diff) | |
download | mongo-7e1a47dd45735a8ee98aaf24f60406a0bb682359.tar.gz |
WT-2798 Fix data consistency bug with table creates during a checkpoint. (#2910)
When logging is disabled, a create operation (and potentially other
metadata updates) could write partially completed checkpoint metadata,
leaving on-disk files inconsistent until the checkpoint completes.
-rw-r--r-- | bench/wtperf/idle_table_cycle.c | 3 | ||||
-rw-r--r-- | bench/wtperf/runners/checkpoint_schema_race.wtperf | 20 | ||||
-rw-r--r-- | src/include/txn.h | 1 | ||||
-rw-r--r-- | src/lsm/lsm_cursor.c | 9 | ||||
-rw-r--r-- | src/reconcile/rec_write.c | 12 | ||||
-rw-r--r-- | src/txn/txn.c | 34 | ||||
-rw-r--r-- | src/txn/txn_ckpt.c | 20 |
7 files changed, 75 insertions, 24 deletions
diff --git a/bench/wtperf/idle_table_cycle.c b/bench/wtperf/idle_table_cycle.c index b699b5b9dd1..3c079bb560f 100644 --- a/bench/wtperf/idle_table_cycle.c +++ b/bench/wtperf/idle_table_cycle.c @@ -129,7 +129,8 @@ cycle_idle_tables(void *arg) * Drop the table. Keep retrying on EBUSY failure - it is an * expected return when checkpoints are happening. */ - while ((ret = session->drop(session, uri, "force")) == EBUSY) + while ((ret = session->drop( + session, uri, "force,checkpoint_wait=false")) == EBUSY) __wt_sleep(1, 0); if (ret != 0 && ret != EBUSY) { diff --git a/bench/wtperf/runners/checkpoint_schema_race.wtperf b/bench/wtperf/runners/checkpoint_schema_race.wtperf new file mode 100644 index 00000000000..ade8e88ee9b --- /dev/null +++ b/bench/wtperf/runners/checkpoint_schema_race.wtperf @@ -0,0 +1,20 @@ +# Check create and drop behavior concurrent with checkpoints (WT-2798). +# Setup a multiple tables and a cache size large enough that checkpoints can +# take a long time. +conn_config="cache_size=8GB,log=(enabled=false),checkpoint=(wait=30)" +table_config="leaf_page_max=4k,internal_page_max=16k,type=file" +icount=10000000 +table_count=100 +table_count_idle=100 +# Turn on create/drop of idle tables, but don't worry if individual operations +# take a long time. +idle_table_cycle=120 +populate_threads=5 +checkpoint_threads=0 +report_interval=5 +# 100 million +random_range=10000000 +run_time=300 +# Setup a workload that dirties a lot of the cache +threads=((count=2,reads=1),(count=2,inserts=1),(count=2,updates=1)) +value_sz=500 diff --git a/src/include/txn.h b/src/include/txn.h index d10738cc670..2e41ae8620d 100644 --- a/src/include/txn.h +++ b/src/include/txn.h @@ -98,6 +98,7 @@ struct __wt_txn_global { volatile uint32_t checkpoint_id; /* Checkpoint's session ID */ volatile uint64_t checkpoint_gen; volatile uint64_t checkpoint_pinned; + volatile uint64_t checkpoint_txnid; /* Checkpoint's txn ID */ /* Named snapshot state. */ WT_RWLOCK *nsnap_rwlock; diff --git a/src/lsm/lsm_cursor.c b/src/lsm/lsm_cursor.c index a0e636768f0..bedef6a8596 100644 --- a/src/lsm/lsm_cursor.c +++ b/src/lsm/lsm_cursor.c @@ -225,13 +225,20 @@ __clsm_enter(WT_CURSOR_LSM *clsm, bool reset, bool update) * transaction ID in each chunk: any transaction ID * that overlaps with our snapshot is a potential * conflict. + * + * Note that the global snap_min is correct here: it + * tracks concurrent transactions excluding special + * transactions such as checkpoint (which we can't + * conflict with because checkpoint only writes the + * metadata, which is not an LSM tree). */ clsm->nupdates = 1; if (txn->isolation == WT_ISO_SNAPSHOT && F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) { WT_ASSERT(session, F_ISSET(txn, WT_TXN_HAS_SNAPSHOT)); - snap_min = txn->snap_min; + snap_min = + WT_SESSION_TXN_STATE(session)->snap_min; for (switch_txnp = &clsm->switch_txn[clsm->nchunks - 2]; clsm->nupdates < clsm->nchunks; diff --git a/src/reconcile/rec_write.c b/src/reconcile/rec_write.c index 6bd1e9dd84b..75dcb481cd4 100644 --- a/src/reconcile/rec_write.c +++ b/src/reconcile/rec_write.c @@ -1150,8 +1150,18 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, */ if (!skipped && (F_ISSET(btree, WT_BTREE_LOOKASIDE) || - __wt_txn_visible_all(session, max_txn))) + __wt_txn_visible_all(session, max_txn))) { + /* + * The checkpoint transaction is special. Make sure we never + * write (metadata) updates from a checkpoint in a concurrent + * session. + */ + WT_ASSERT(session, *updp == NULL || + (txnid = (*updp)->txnid) == WT_TXN_NONE || + txnid != S2C(session)->txn_global.checkpoint_txnid || + WT_SESSION_IS_CHECKPOINT(session)); return (0); + } /* * In some cases, there had better not be skipped updates or updates not diff --git a/src/txn/txn.c b/src/txn/txn.c index dd4384d9a9a..31d9aef4cbc 100644 --- a/src/txn/txn.c +++ b/src/txn/txn.c @@ -124,6 +124,7 @@ __wt_txn_get_snapshot(WT_SESSION_IMPL *session) txn = &session->txn; txn_global = &conn->txn_global; txn_state = WT_SESSION_TXN_STATE(session); + n = 0; /* * Spin waiting for the lock: the sleeps in our blocking readlock @@ -137,20 +138,26 @@ __wt_txn_get_snapshot(WT_SESSION_IMPL *session) current_id = snap_min = txn_global->current; prev_oldest_id = txn_global->oldest_id; + /* + * Include the checkpoint transaction, if one is running: we should + * ignore any uncommitted changes the checkpoint has written to the + * metadata. We don't have to keep the checkpoint's changes pinned so + * don't including it in the published snap_min. + */ + if ((id = txn_global->checkpoint_txnid) != WT_TXN_NONE) + txn->snapshot[n++] = id; + /* For pure read-only workloads, avoid scanning. */ if (prev_oldest_id == current_id) { txn_state->snap_min = current_id; - __txn_sort_snapshot(session, 0, current_id); - /* Check that the oldest ID has not moved in the meantime. */ WT_ASSERT(session, prev_oldest_id == txn_global->oldest_id); - WT_RET(__wt_readunlock(session, txn_global->scan_rwlock)); - return (0); + goto done; } /* Walk the array of concurrent transactions. */ WT_ORDERED_READ(session_cnt, conn->session_cnt); - for (i = n = 0, s = txn_global->states; i < session_cnt; i++, s++) { + for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) { /* * Build our snapshot of any concurrent transaction IDs. * @@ -178,8 +185,7 @@ __wt_txn_get_snapshot(WT_SESSION_IMPL *session) WT_ASSERT(session, prev_oldest_id == txn_global->oldest_id); txn_state->snap_min = snap_min; - WT_RET(__wt_readunlock(session, txn_global->scan_rwlock)); - +done: WT_RET(__wt_readunlock(session, txn_global->scan_rwlock)); __txn_sort_snapshot(session, n, current_id); return (0); } @@ -433,18 +439,22 @@ __wt_txn_release(WT_SESSION_IMPL *session) WT_TXN_STATE *txn_state; txn = &session->txn; - WT_ASSERT(session, txn->mod_count == 0); - txn->notify = NULL; - txn_global = &S2C(session)->txn_global; txn_state = WT_SESSION_TXN_STATE(session); + WT_ASSERT(session, txn->mod_count == 0); + txn->notify = NULL; + /* Clear the transaction's ID from the global table. */ if (WT_SESSION_IS_CHECKPOINT(session)) { WT_ASSERT(session, txn_state->id == WT_TXN_NONE); - txn->id = WT_TXN_NONE; + txn->id = txn_global->checkpoint_txnid = WT_TXN_NONE; - /* Clear the global checkpoint transaction IDs. */ + /* + * Be extra careful to cleanup everything for checkpoints: once + * the global checkpoint ID is cleared, we can no longer tell + * if this session is doing a checkpoint. + */ txn_global->checkpoint_id = 0; txn_global->checkpoint_pinned = WT_TXN_NONE; } else if (F_ISSET(txn, WT_TXN_HAS_ID)) { diff --git a/src/txn/txn_ckpt.c b/src/txn/txn_ckpt.c index 661702d7f17..62f6d099d0f 100644 --- a/src/txn/txn_ckpt.c +++ b/src/txn/txn_ckpt.c @@ -478,21 +478,22 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) WT_ERR(__wt_txn_id_check(session)); /* - * Save the checkpoint session ID. We never do checkpoints in the - * default session (with id zero). + * Save the checkpoint session ID. + * + * We never do checkpoints in the default session (with id zero). */ WT_ASSERT(session, session->id != 0 && txn_global->checkpoint_id == 0); txn_global->checkpoint_id = session->id; - txn_global->checkpoint_pinned = - WT_MIN(txn_state->id, txn_state->snap_min); - /* - * We're about to clear the checkpoint transaction from the global - * state table so the oldest ID can move forward. Make sure everything - * we've done above is scheduled. + * Remove the checkpoint transaction from the global table. + * + * This allows ordinary visibility checks to move forward because + * checkpoints often take a long time and only write to the metadata. */ - WT_FULL_BARRIER(); + WT_ERR(__wt_writelock(session, txn_global->scan_rwlock)); + txn_global->checkpoint_txnid = txn->id; + txn_global->checkpoint_pinned = WT_MIN(txn->id, txn->snap_min); /* * Sanity check that the oldest ID hasn't moved on before we have @@ -510,6 +511,7 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) * details). */ txn_state->id = txn_state->snap_min = WT_TXN_NONE; + WT_ERR(__wt_writeunlock(session, txn_global->scan_rwlock)); /* Tell logging that we have started a database checkpoint. */ if (full && logging) |