WT-2798 Fix data consistency bug with table creates during a checkpoint. (#2910)

When logging is disabled, a create operation (and potentially other metadata updates) could write partially completed checkpoint metadata, leaving on-disk files inconsistent until the checkpoint completes. (cherry picked from commit 7e1a47dd45735a8ee98aaf24f60406a0bb682359)
author: Michael Cahill <michael.cahill@mongodb.com> 2016-07-27 16:18:00 +1000
committer: Michael Cahill <michael.cahill@mongodb.com> 2016-07-29 16:19:02 +1000
commit: 2969c02ab9a2e5407754e3afc278f871f93fd7b0 (patch)
tree: 4392f44584f53c22fa680bd8fe455d05fa8d6f9f
parent: 82323cf72fe5c4806527120e2045cecaa0c16d74 (diff)
download: mongo-2969c02ab9a2e5407754e3afc278f871f93fd7b0.tar.gz
7 files changed, 75 insertions, 24 deletions
diff --git a/bench/wtperf/idle_table_cycle.c b/bench/wtperf/idle_table_cycle.c
index b699b5b9dd1..3c079bb560f 100644
--- a/bench/wtperf/idle_table_cycle.c
+++ b/bench/wtperf/idle_table_cycle.c
@@ -129,7 +129,8 @@ cycle_idle_tables(void *arg)
 		 * Drop the table. Keep retrying on EBUSY failure - it is an
 		 * expected return when checkpoints are happening.
 		 */
-		while ((ret = session->drop(session, uri, "force")) == EBUSY)
+		while ((ret = session->drop(
+		    session, uri, "force,checkpoint_wait=false")) == EBUSY)
 			__wt_sleep(1, 0);
 
 		if (ret != 0 && ret != EBUSY) {
diff --git a/bench/wtperf/runners/checkpoint_schema_race.wtperf b/bench/wtperf/runners/checkpoint_schema_race.wtperf
new file mode 100644
index 00000000000..ade8e88ee9b
--- /dev/null
+++ b/bench/wtperf/runners/checkpoint_schema_race.wtperf
@@ -0,0 +1,20 @@
+# Check create and drop behavior concurrent with checkpoints (WT-2798).
+# Set up multiple tables and a cache size large enough that checkpoints can
+# take a long time.
+conn_config="cache_size=8GB,log=(enabled=false),checkpoint=(wait=30)"
+table_config="leaf_page_max=4k,internal_page_max=16k,type=file"
+icount=10000000
+table_count=100
+table_count_idle=100
+# Turn on create/drop of idle tables, but don't worry if individual operations
+# take a long time.
+idle_table_cycle=120
+populate_threads=5
+checkpoint_threads=0
+report_interval=5
+# 10 million
+random_range=10000000
+run_time=300
+# Set up a workload that dirties a lot of the cache
+threads=((count=2,reads=1),(count=2,inserts=1),(count=2,updates=1))
+value_sz=500
diff --git a/src/include/txn.h b/src/include/txn.h
index d10738cc670..2e41ae8620d 100644
--- a/src/include/txn.h
+++ b/src/include/txn.h
@@ -98,6 +98,7 @@ struct __wt_txn_global {
 	volatile uint32_t checkpoint_id;	/* Checkpoint's session ID */
 	volatile uint64_t checkpoint_gen;
 	volatile uint64_t checkpoint_pinned;
+	volatile uint64_t checkpoint_txnid;	/* Checkpoint's txn ID */
 
 	/* Named snapshot state. */
 	WT_RWLOCK *nsnap_rwlock;
diff --git a/src/lsm/lsm_cursor.c b/src/lsm/lsm_cursor.c
index 78235fb6a92..55e311fd273 100644
--- a/src/lsm/lsm_cursor.c
+++ b/src/lsm/lsm_cursor.c
@@ -219,13 +219,20 @@ __clsm_enter(WT_CURSOR_LSM *clsm, bool reset, bool update)
 			 * transaction ID in each chunk: any transaction ID
 			 * that overlaps with our snapshot is a potential
 			 * conflict.
+			 *
+			 * Note that the global snap_min is correct here: it
+			 * tracks concurrent transactions excluding special
+			 * transactions such as checkpoint (which we can't
+			 * conflict with because checkpoint only writes the
+			 * metadata, which is not an LSM tree).
 			 */
 			clsm->nupdates = 1;
 			if (txn->isolation == WT_ISO_SNAPSHOT &&
 			    F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) {
 				WT_ASSERT(session,
 				    F_ISSET(txn, WT_TXN_HAS_SNAPSHOT));
-				snap_min = txn->snap_min;
+				snap_min =
+				    WT_SESSION_TXN_STATE(session)->snap_min;
 				for (switch_txnp =
 				    &clsm->switch_txn[clsm->nchunks - 2];
 				    clsm->nupdates < clsm->nchunks;
diff --git a/src/reconcile/rec_write.c b/src/reconcile/rec_write.c
index b49946bb10e..6bcb5457385 100644
--- a/src/reconcile/rec_write.c
+++ b/src/reconcile/rec_write.c
@@ -1135,8 +1135,18 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r,
 	 */
 	if (!skipped &&
 	    (F_ISSET(btree, WT_BTREE_LOOKASIDE) ||
-	    __wt_txn_visible_all(session, max_txn)))
+	    __wt_txn_visible_all(session, max_txn))) {
+		/*
+		 * The checkpoint transaction is special.  Make sure we never
+		 * write (metadata) updates from a checkpoint in a concurrent
+		 * session.
+		 */
+		WT_ASSERT(session, *updp == NULL ||
+		    (txnid = (*updp)->txnid) == WT_TXN_NONE ||
+		    txnid != S2C(session)->txn_global.checkpoint_txnid ||
+		    WT_SESSION_IS_CHECKPOINT(session));
 		return (0);
+	}
 
 	/*
 	 * In some cases, there had better not be skipped updates or updates not
diff --git a/src/txn/txn.c b/src/txn/txn.c
index dd4384d9a9a..31d9aef4cbc 100644
--- a/src/txn/txn.c
+++ b/src/txn/txn.c
@@ -124,6 +124,7 @@ __wt_txn_get_snapshot(WT_SESSION_IMPL *session)
 	txn = &session->txn;
 	txn_global = &conn->txn_global;
 	txn_state = WT_SESSION_TXN_STATE(session);
+	n = 0;
 
 	/*
 	 * Spin waiting for the lock: the sleeps in our blocking readlock
@@ -137,20 +138,26 @@ __wt_txn_get_snapshot(WT_SESSION_IMPL *session)
 	current_id = snap_min = txn_global->current;
 	prev_oldest_id = txn_global->oldest_id;
 
+	/*
+	 * Include the checkpoint transaction, if one is running: we should
+	 * ignore any uncommitted changes the checkpoint has written to the
+	 * metadata.  We don't have to keep the checkpoint's changes pinned so
+	 * don't include it in the published snap_min.
+	 */
+	if ((id = txn_global->checkpoint_txnid) != WT_TXN_NONE)
+		txn->snapshot[n++] = id;
+
 	/* For pure read-only workloads, avoid scanning. */
 	if (prev_oldest_id == current_id) {
 		txn_state->snap_min = current_id;
-		__txn_sort_snapshot(session, 0, current_id);
-
 		/* Check that the oldest ID has not moved in the meantime. */
 		WT_ASSERT(session, prev_oldest_id == txn_global->oldest_id);
-		WT_RET(__wt_readunlock(session, txn_global->scan_rwlock));
-		return (0);
+		goto done;
 	}
 
 	/* Walk the array of concurrent transactions. */
 	WT_ORDERED_READ(session_cnt, conn->session_cnt);
-	for (i = n = 0, s = txn_global->states; i < session_cnt; i++, s++) {
+	for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) {
 		/*
 		 * Build our snapshot of any concurrent transaction IDs.
 		 *
@@ -178,8 +185,7 @@ __wt_txn_get_snapshot(WT_SESSION_IMPL *session)
 	WT_ASSERT(session, prev_oldest_id == txn_global->oldest_id);
 	txn_state->snap_min = snap_min;
 
-	WT_RET(__wt_readunlock(session, txn_global->scan_rwlock));
-
+done:	WT_RET(__wt_readunlock(session, txn_global->scan_rwlock));
 	__txn_sort_snapshot(session, n, current_id);
 	return (0);
 }
@@ -433,18 +439,22 @@ __wt_txn_release(WT_SESSION_IMPL *session)
 	WT_TXN_STATE *txn_state;
 
 	txn = &session->txn;
-	WT_ASSERT(session, txn->mod_count == 0);
-	txn->notify = NULL;
-
 	txn_global = &S2C(session)->txn_global;
 	txn_state = WT_SESSION_TXN_STATE(session);
 
+	WT_ASSERT(session, txn->mod_count == 0);
+	txn->notify = NULL;
+
 	/* Clear the transaction's ID from the global table. */
 	if (WT_SESSION_IS_CHECKPOINT(session)) {
 		WT_ASSERT(session, txn_state->id == WT_TXN_NONE);
-		txn->id = WT_TXN_NONE;
+		txn->id = txn_global->checkpoint_txnid = WT_TXN_NONE;
 
-		/* Clear the global checkpoint transaction IDs. */
+		/*
+		 * Be extra careful to clean up everything for checkpoints: once
+		 * the global checkpoint ID is cleared, we can no longer tell
+		 * if this session is doing a checkpoint.
+		 */
 		txn_global->checkpoint_id = 0;
 		txn_global->checkpoint_pinned = WT_TXN_NONE;
 	} else if (F_ISSET(txn, WT_TXN_HAS_ID)) {
diff --git a/src/txn/txn_ckpt.c b/src/txn/txn_ckpt.c
index 51d26b9aed6..82c1fe7bdfe 100644
--- a/src/txn/txn_ckpt.c
+++ b/src/txn/txn_ckpt.c
@@ -475,21 +475,22 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
 	WT_ERR(__wt_txn_id_check(session));
 
 	/*
-	 * Save the checkpoint session ID.  We never do checkpoints in the
-	 * default session (with id zero).
+	 * Save the checkpoint session ID.
+	 *
+	 * We never do checkpoints in the default session (with id zero).
 	 */
 	WT_ASSERT(session, session->id != 0 && txn_global->checkpoint_id == 0);
 	txn_global->checkpoint_id = session->id;
 
-	txn_global->checkpoint_pinned =
-	    WT_MIN(txn_state->id, txn_state->snap_min);
-
 	/*
-	 * We're about to clear the checkpoint transaction from the global
-	 * state table so the oldest ID can move forward.  Make sure everything
-	 * we've done above is scheduled.
+	 * Remove the checkpoint transaction from the global table.
+	 *
+	 * This allows ordinary visibility checks to move forward because
+	 * checkpoints often take a long time and only write to the metadata.
 	 */
-	WT_FULL_BARRIER();
+	WT_ERR(__wt_writelock(session, txn_global->scan_rwlock));
+	txn_global->checkpoint_txnid = txn->id;
+	txn_global->checkpoint_pinned = WT_MIN(txn->id, txn->snap_min);
 
 	/*
 	 * Sanity check that the oldest ID hasn't moved on before we have
@@ -507,6 +508,7 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
 	 * details).
 	 */
 	txn_state->id = txn_state->snap_min = WT_TXN_NONE;
+	WT_ERR(__wt_writeunlock(session, txn_global->scan_rwlock));
 
 	/* Tell logging that we have started a database checkpoint. */
 	if (full && logging)
author	Michael Cahill <michael.cahill@mongodb.com>	2016-07-27 16:18:00 +1000
committer	Michael Cahill <michael.cahill@mongodb.com>	2016-07-29 16:19:02 +1000
commit	2969c02ab9a2e5407754e3afc278f871f93fd7b0 (patch)
tree	4392f44584f53c22fa680bd8fe455d05fa8d6f9f
parent	82323cf72fe5c4806527120e2045cecaa0c16d74 (diff)
download	mongo-2969c02ab9a2e5407754e3afc278f871f93fd7b0.tar.gz