summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Michael Cahill <michael.cahill@mongodb.com> 2016-07-27 16:18:00 +1000
committer Michael Cahill <michael.cahill@mongodb.com> 2016-07-29 16:19:02 +1000
commit 2969c02ab9a2e5407754e3afc278f871f93fd7b0 (patch)
tree 4392f44584f53c22fa680bd8fe455d05fa8d6f9f
parent 82323cf72fe5c4806527120e2045cecaa0c16d74 (diff)
download mongo-2969c02ab9a2e5407754e3afc278f871f93fd7b0.tar.gz
WT-2798 Fix data consistency bug with table creates during a checkpoint. (#2910)
When logging is disabled, a create operation (and potentially other metadata updates) could write partially completed checkpoint metadata, leaving on-disk files inconsistent until the checkpoint completes. (cherry picked from commit 7e1a47dd45735a8ee98aaf24f60406a0bb682359)
-rw-r--r-- bench/wtperf/idle_table_cycle.c 3
-rw-r--r-- bench/wtperf/runners/checkpoint_schema_race.wtperf 20
-rw-r--r-- src/include/txn.h 1
-rw-r--r-- src/lsm/lsm_cursor.c 9
-rw-r--r-- src/reconcile/rec_write.c 12
-rw-r--r-- src/txn/txn.c 34
-rw-r--r-- src/txn/txn_ckpt.c 20
7 files changed, 75 insertions, 24 deletions
diff --git a/bench/wtperf/idle_table_cycle.c b/bench/wtperf/idle_table_cycle.c
index b699b5b9dd1..3c079bb560f 100644
--- a/bench/wtperf/idle_table_cycle.c
+++ b/bench/wtperf/idle_table_cycle.c
@@ -129,7 +129,8 @@ cycle_idle_tables(void *arg)
* Drop the table. Keep retrying on EBUSY failure - it is an
* expected return when checkpoints are happening.
*/
- while ((ret = session->drop(session, uri, "force")) == EBUSY)
+ while ((ret = session->drop(
+ session, uri, "force,checkpoint_wait=false")) == EBUSY)
__wt_sleep(1, 0);
if (ret != 0 && ret != EBUSY) {
diff --git a/bench/wtperf/runners/checkpoint_schema_race.wtperf b/bench/wtperf/runners/checkpoint_schema_race.wtperf
new file mode 100644
index 00000000000..ade8e88ee9b
--- /dev/null
+++ b/bench/wtperf/runners/checkpoint_schema_race.wtperf
@@ -0,0 +1,20 @@
+# Check create and drop behavior concurrent with checkpoints (WT-2798).
+# Set up multiple tables and a cache size large enough that checkpoints can
+# take a long time.
+conn_config="cache_size=8GB,log=(enabled=false),checkpoint=(wait=30)"
+table_config="leaf_page_max=4k,internal_page_max=16k,type=file"
+icount=10000000
+table_count=100
+table_count_idle=100
+# Turn on create/drop of idle tables, but don't worry if individual operations
+# take a long time.
+idle_table_cycle=120
+populate_threads=5
+checkpoint_threads=0
+report_interval=5
+# 10 million
+random_range=10000000
+run_time=300
+# Set up a workload that dirties a lot of the cache
+threads=((count=2,reads=1),(count=2,inserts=1),(count=2,updates=1))
+value_sz=500
diff --git a/src/include/txn.h b/src/include/txn.h
index d10738cc670..2e41ae8620d 100644
--- a/src/include/txn.h
+++ b/src/include/txn.h
@@ -98,6 +98,7 @@ struct __wt_txn_global {
volatile uint32_t checkpoint_id; /* Checkpoint's session ID */
volatile uint64_t checkpoint_gen;
volatile uint64_t checkpoint_pinned;
+ volatile uint64_t checkpoint_txnid; /* Checkpoint's txn ID */
/* Named snapshot state. */
WT_RWLOCK *nsnap_rwlock;
diff --git a/src/lsm/lsm_cursor.c b/src/lsm/lsm_cursor.c
index 78235fb6a92..55e311fd273 100644
--- a/src/lsm/lsm_cursor.c
+++ b/src/lsm/lsm_cursor.c
@@ -219,13 +219,20 @@ __clsm_enter(WT_CURSOR_LSM *clsm, bool reset, bool update)
* transaction ID in each chunk: any transaction ID
* that overlaps with our snapshot is a potential
* conflict.
+ *
+ * Note that the global snap_min is correct here: it
+ * tracks concurrent transactions excluding special
+ * transactions such as checkpoint (which we can't
+ * conflict with because checkpoint only writes the
+ * metadata, which is not an LSM tree).
*/
clsm->nupdates = 1;
if (txn->isolation == WT_ISO_SNAPSHOT &&
F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) {
WT_ASSERT(session,
F_ISSET(txn, WT_TXN_HAS_SNAPSHOT));
- snap_min = txn->snap_min;
+ snap_min =
+ WT_SESSION_TXN_STATE(session)->snap_min;
for (switch_txnp =
&clsm->switch_txn[clsm->nchunks - 2];
clsm->nupdates < clsm->nchunks;
diff --git a/src/reconcile/rec_write.c b/src/reconcile/rec_write.c
index b49946bb10e..6bcb5457385 100644
--- a/src/reconcile/rec_write.c
+++ b/src/reconcile/rec_write.c
@@ -1135,8 +1135,18 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r,
*/
if (!skipped &&
(F_ISSET(btree, WT_BTREE_LOOKASIDE) ||
- __wt_txn_visible_all(session, max_txn)))
+ __wt_txn_visible_all(session, max_txn))) {
+ /*
+ * The checkpoint transaction is special. Make sure we never
+ * write (metadata) updates from a checkpoint in a concurrent
+ * session.
+ */
+ WT_ASSERT(session, *updp == NULL ||
+ (txnid = (*updp)->txnid) == WT_TXN_NONE ||
+ txnid != S2C(session)->txn_global.checkpoint_txnid ||
+ WT_SESSION_IS_CHECKPOINT(session));
return (0);
+ }
/*
* In some cases, there had better not be skipped updates or updates not
diff --git a/src/txn/txn.c b/src/txn/txn.c
index dd4384d9a9a..31d9aef4cbc 100644
--- a/src/txn/txn.c
+++ b/src/txn/txn.c
@@ -124,6 +124,7 @@ __wt_txn_get_snapshot(WT_SESSION_IMPL *session)
txn = &session->txn;
txn_global = &conn->txn_global;
txn_state = WT_SESSION_TXN_STATE(session);
+ n = 0;
/*
* Spin waiting for the lock: the sleeps in our blocking readlock
@@ -137,20 +138,26 @@ __wt_txn_get_snapshot(WT_SESSION_IMPL *session)
current_id = snap_min = txn_global->current;
prev_oldest_id = txn_global->oldest_id;
+ /*
+ * Include the checkpoint transaction, if one is running: we should
+ * ignore any uncommitted changes the checkpoint has written to the
+ * metadata. We don't have to keep the checkpoint's changes pinned so
+ * don't include it in the published snap_min.
+ */
+ if ((id = txn_global->checkpoint_txnid) != WT_TXN_NONE)
+ txn->snapshot[n++] = id;
+
/* For pure read-only workloads, avoid scanning. */
if (prev_oldest_id == current_id) {
txn_state->snap_min = current_id;
- __txn_sort_snapshot(session, 0, current_id);
-
/* Check that the oldest ID has not moved in the meantime. */
WT_ASSERT(session, prev_oldest_id == txn_global->oldest_id);
- WT_RET(__wt_readunlock(session, txn_global->scan_rwlock));
- return (0);
+ goto done;
}
/* Walk the array of concurrent transactions. */
WT_ORDERED_READ(session_cnt, conn->session_cnt);
- for (i = n = 0, s = txn_global->states; i < session_cnt; i++, s++) {
+ for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) {
/*
* Build our snapshot of any concurrent transaction IDs.
*
@@ -178,8 +185,7 @@ __wt_txn_get_snapshot(WT_SESSION_IMPL *session)
WT_ASSERT(session, prev_oldest_id == txn_global->oldest_id);
txn_state->snap_min = snap_min;
- WT_RET(__wt_readunlock(session, txn_global->scan_rwlock));
-
+done: WT_RET(__wt_readunlock(session, txn_global->scan_rwlock));
__txn_sort_snapshot(session, n, current_id);
return (0);
}
@@ -433,18 +439,22 @@ __wt_txn_release(WT_SESSION_IMPL *session)
WT_TXN_STATE *txn_state;
txn = &session->txn;
- WT_ASSERT(session, txn->mod_count == 0);
- txn->notify = NULL;
-
txn_global = &S2C(session)->txn_global;
txn_state = WT_SESSION_TXN_STATE(session);
+ WT_ASSERT(session, txn->mod_count == 0);
+ txn->notify = NULL;
+
/* Clear the transaction's ID from the global table. */
if (WT_SESSION_IS_CHECKPOINT(session)) {
WT_ASSERT(session, txn_state->id == WT_TXN_NONE);
- txn->id = WT_TXN_NONE;
+ txn->id = txn_global->checkpoint_txnid = WT_TXN_NONE;
- /* Clear the global checkpoint transaction IDs. */
+ /*
+ * Be extra careful to clean up everything for checkpoints: once
+ * the global checkpoint ID is cleared, we can no longer tell
+ * if this session is doing a checkpoint.
+ */
txn_global->checkpoint_id = 0;
txn_global->checkpoint_pinned = WT_TXN_NONE;
} else if (F_ISSET(txn, WT_TXN_HAS_ID)) {
diff --git a/src/txn/txn_ckpt.c b/src/txn/txn_ckpt.c
index 51d26b9aed6..82c1fe7bdfe 100644
--- a/src/txn/txn_ckpt.c
+++ b/src/txn/txn_ckpt.c
@@ -475,21 +475,22 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
WT_ERR(__wt_txn_id_check(session));
/*
- * Save the checkpoint session ID. We never do checkpoints in the
- * default session (with id zero).
+ * Save the checkpoint session ID.
+ *
+ * We never do checkpoints in the default session (with id zero).
*/
WT_ASSERT(session, session->id != 0 && txn_global->checkpoint_id == 0);
txn_global->checkpoint_id = session->id;
- txn_global->checkpoint_pinned =
- WT_MIN(txn_state->id, txn_state->snap_min);
-
/*
- * We're about to clear the checkpoint transaction from the global
- * state table so the oldest ID can move forward. Make sure everything
- * we've done above is scheduled.
+ * Remove the checkpoint transaction from the global table.
+ *
+ * This allows ordinary visibility checks to move forward because
+ * checkpoints often take a long time and only write to the metadata.
*/
- WT_FULL_BARRIER();
+ WT_ERR(__wt_writelock(session, txn_global->scan_rwlock));
+ txn_global->checkpoint_txnid = txn->id;
+ txn_global->checkpoint_pinned = WT_MIN(txn->id, txn->snap_min);
/*
* Sanity check that the oldest ID hasn't moved on before we have
@@ -507,6 +508,7 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
* details).
*/
txn_state->id = txn_state->snap_min = WT_TXN_NONE;
+ WT_ERR(__wt_writeunlock(session, txn_global->scan_rwlock));
/* Tell logging that we have started a database checkpoint. */
if (full && logging)