summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorA. Jesse Jiryu Davis <jesse@mongodb.com>2020-06-26 17:04:01 -0400
committerEvergreen Agent <no-reply@evergreen.mongodb.com>2020-07-11 23:55:48 +0000
commit74ab0cdac56e1cc62fc96d3ca3be3ddfa54f2bcb (patch)
tree77d0347dc607247ac8ec1c64ee7e593de3fbbeea
parent4b918b3f7693c42ad74dfa5030e7cf2ffc3d1ddb (diff)
downloadmongo-74ab0cdac56e1cc62fc96d3ca3be3ddfa54f2bcb.tar.gz
SERVER-48518 Fix rollback via refetch anomaly
-rw-r--r--buildscripts/resmokeconfig/suites/replica_sets_multiversion.yml1
-rw-r--r--etc/backports_required_for_multiversion_tests.yml2
-rw-r--r--jstests/replsets/libs/rollback_test.js44
-rw-r--r--jstests/replsets/rollback_via_refetch_anomaly.js194
-rw-r--r--jstests/replsets/rollback_via_refetch_survives_nonexistent_collection_drop.js3
-rw-r--r--jstests/replsets/transactions_after_rollback_via_refetch.js3
-rw-r--r--src/mongo/db/repl/replication_coordinator_impl.cpp16
-rw-r--r--src/mongo/db/storage/storage_engine_impl.cpp1
-rw-r--r--src/mongo/db/storage/storage_engine_impl.h1
-rw-r--r--src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.cpp4
-rw-r--r--src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.h2
11 files changed, 260 insertions, 11 deletions
diff --git a/buildscripts/resmokeconfig/suites/replica_sets_multiversion.yml b/buildscripts/resmokeconfig/suites/replica_sets_multiversion.yml
index 87577f23687..1173146fa1e 100644
--- a/buildscripts/resmokeconfig/suites/replica_sets_multiversion.yml
+++ b/buildscripts/resmokeconfig/suites/replica_sets_multiversion.yml
@@ -10,6 +10,7 @@ selector:
- jstests/replsets/initial_sync_rename_collection.js
- jstests/replsets/initial_sync_drop_collection.js
- jstests/replsets/step_down_chaining_disabled.js
+ - jstests/replsets/rollback_via_refetch_anomaly.js
executor:
config:
shell_options:
diff --git a/etc/backports_required_for_multiversion_tests.yml b/etc/backports_required_for_multiversion_tests.yml
index 86a1d4829d1..e0e27c3e764 100644
--- a/etc/backports_required_for_multiversion_tests.yml
+++ b/etc/backports_required_for_multiversion_tests.yml
@@ -40,6 +40,8 @@ replica_sets_jscore_multiversion_passthrough:
test_file: jstests/core/apply_ops_system_dot_views.js
- ticket: SERVER-48614
test_file: jstests/core/wildcard_index_partial_index.js
+- ticket: SERVER-48518
+ test_file: jstests/replsets/rollback_via_refetch_anomaly.js
sharding_jscore_multiversion_passthrough:
- ticket: SERVER-47773
diff --git a/jstests/replsets/libs/rollback_test.js b/jstests/replsets/libs/rollback_test.js
index 7ab5068aa97..b4420dc67b0 100644
--- a/jstests/replsets/libs/rollback_test.js
+++ b/jstests/replsets/libs/rollback_test.js
@@ -429,12 +429,46 @@ function RollbackTest(name = "RollbackTest", replSet) {
};
/**
+ * Insert on primary until its lastApplied is >= the rollback node's.
+ */
+ const _awaitPrimaryAppliedSurpassesRollbackApplied = function() {
+ log(`Waiting for lastApplied on sync source ${curPrimary.host} to surpass lastApplied` +
+ ` on rollback node ${curSecondary.host}`);
+
+ function lastApplied(node) {
+ const reply = assert.commandWorked(node.adminCommand({replSetGetStatus: 1}));
+ return reply.optimes.appliedOpTime.ts;
+ }
+
+ const rollbackApplied = lastApplied(curSecondary);
+ assert.soon(() => {
+ const primaryApplied = lastApplied(curPrimary);
+ jsTestLog(
+ `lastApplied on sync source ${curPrimary.host}:` +
+ ` ${tojson(primaryApplied)}, lastApplied on rollback node ${curSecondary.host}:` +
+ ` ${tojson(rollbackApplied)}`);
+
+ if (timestampCmp(primaryApplied, rollbackApplied) >= 0) {
+ return true;
+ }
+
+ let crudColl = curPrimary.getDB("test")["awaitPrimaryAppliedSurpassesRollbackApplied"];
+ assert.commandWorked(crudColl.insertOne({}));
+ }, "primary's lastApplied never surpassed rollback node's");
+ };
+
+ /**
* Transition to the second stage of rollback testing, where we isolate the old primary and
* elect the old secondary as the new primary. Then, operations can be performed on the new
* primary so that that optimes diverge and previous operations on the old primary will be
* rolled back.
+ *
+ * If awaitPrimaryAppliedSurpassesRollbackApplied, insert on new primary until its lastApplied
+ * is >= the rollback node's. Useful for testing rollback via refetch, which completes rollback
+ * recovery when new lastApplied >= old top of oplog.
*/
- this.transitionToSyncSourceOperationsBeforeRollback = function() {
+ this.transitionToSyncSourceOperationsBeforeRollback = function(
+ {awaitPrimaryAppliedSurpassesRollbackApplied = false} = {}) {
const start = new Date();
transitionIfAllowed(State.kSyncSourceOpsBeforeRollback);
@@ -498,6 +532,10 @@ function RollbackTest(name = "RollbackTest", replSet) {
lastRBID = assert.commandWorked(curSecondary.adminCommand("replSetGetRBID")).rbid;
+ if (awaitPrimaryAppliedSurpassesRollbackApplied) {
+ _awaitPrimaryAppliedSurpassesRollbackApplied();
+ }
+
log(`RollbackTest transition to ${curState} took ${(new Date() - start)} ms`);
// The current primary, which is the old secondary, will later become the sync source.
return curPrimary;
@@ -550,6 +588,10 @@ function RollbackTest(name = "RollbackTest", replSet) {
return curSecondary;
};
+ this.getTieBreaker = function() {
+ return tiebreakerNode;
+ };
+
this.restartNode = function(nodeId, signal, startOptions, allowedExitCode) {
assert(signal === SIGKILL || signal === SIGTERM, `Received unknown signal: ${signal}`);
assert.gte(nodeId, 0, "Invalid argument to RollbackTest.restartNode()");
diff --git a/jstests/replsets/rollback_via_refetch_anomaly.js b/jstests/replsets/rollback_via_refetch_anomaly.js
new file mode 100644
index 00000000000..1a3f192e498
--- /dev/null
+++ b/jstests/replsets/rollback_via_refetch_anomaly.js
@@ -0,0 +1,194 @@
+/**
+ * Rolled-back documents must not be visible when querying a recovered secondary.
+ *
+ * This test attempts to reproduce SERVER-48518. In the following description, 'T' is the tiebreaker
+ * node, 'P1' and 'P2' are primaries in terms 1 and 2.
+ *
+ * - Begin in RollbackTest.kSteadyStateOps with primary P1, all nodes connected:
+ *
+ * T
+ * / \
+ * P1 - P2
+ * primary secondary
+ *
+ * - Insert _id 0 into P1 at timestamp 1.
+ * - Transition to kRollbackOps by disconnecting P1 from P2:
+ *
+ * T
+ * /
+ * P1 P2
+ * primary secondary
+ *
+ * - Insert _id 1 into P1 at timestamp 2.
+ *
+ * TS 1 TS 2
+ * insert 0 insert 1
+ * P1 --------------->
+ * P2 --->
+ *
+ * - Isolate P1 from T, connect P2 to T:
+ *
+ * T
+ * \
+ * P1 P2
+ * primary new primary
+ *
+ * (Same as RollbackTest.transitionToSyncSourceOperationsBeforeRollback(), except do *not* trigger a
+ * stepdown on P1.)
+ *
+ * - Step up P2, which writes a no-op to its oplog at timestamp 3.
+ *
+ * TS 1 TS 2
+ * insert 0 insert 1
+ *
+ * P1 --------------->
+ * P2 ----*
+ * \
+ * *-------------------------->
+ * no-op
+ * TS 3
+ *
+ * - Delete _id 0 and 1 from P1 at timestamp 4.
+ *
+ * TS 1 TS 2 TS 4
+ * insert 0 insert 1 delete 0 and 1
+ *
+ * P1 --------------------------------------------------------------->
+ * P2 ----*
+ * \
+ * *-------------------------->
+ * no-op
+ * TS 3
+ *
+ * - Reconnect P1 to P2 so it rolls back.
+ *
+ * T
+ * \
+ * P1 - P2
+ * rollback new primary
+ *
+ * Rollback via refetch undoes the delete of _id 0 by reinserting _id 0 in P1 with an
+ * untimestamped write. (It can't undo the delete of _id 1, since P2 doesn't have _id 1.)
+ *
+ * Before we fixed SERVER-48518, P1 served queries at lastApplied = top of P2's oplog = TS 3,
+ * which includes _id 0, _id 1, and _id 0 again (it was reinserted with an untimestamped write).
+ * To fix SERVER-48518, P1 won't transition from ROLLBACK until its lastApplied >= max(P1's oplog
+ * top, P2's oplog top) = TS 4.
+ *
+ * - Write to P2 so it advances >= timestamp 4 and satisfies P1's conditions to finish rollback.
+ * - Wait for P1 to finish rollback and transition to SECONDARY.
+ * - Query P1 and check that rolled-back records aren't visible.
+ *
+ * To end the test, RollbackTest.transitionToSteadyStateOperations won't work, we've diverged from
+ * the state it expects, so we end the test manually. Reconnect P1 to T, enable replication on T,
+ * and stop the replica set.
+ *
+ * T
+ * / \
+ * P1 - P2
+ * secondary new primary
+ */
+
+(function() {
+"use strict";
+
+load("jstests/core/txns/libs/prepare_helpers.js");
+load("jstests/libs/write_concern_util.js");
+load("jstests/replsets/libs/rollback_test.js");
+
+const rst = new ReplSetTest({
+ nodes: 3,
+ useBridge: true,
+ nodeOptions: {
+ enableMajorityReadConcern: "false",
+ setParameter: {logComponentVerbosity: tojsononeline({replication: 2})}
+ }
+});
+
+rst.startSet();
+const config = rst.getReplSetConfig();
+config.members[2].priority = 0;
+config.settings = {
+ chainingAllowed: false
+};
+rst.initiateWithHighElectionTimeout(config);
+
+const rollbackTest = new RollbackTest(jsTestName(), rst);
+const P1 = rollbackTest.getPrimary();
+const P2 = rollbackTest.getSecondary();
+const T = rollbackTest.getTieBreaker();
+
+jsTestLog(`Node P1: ${P1.host}, P2: ${P2.host}, T: ${T.host}`);
+
+let testDB = P1.getDB(jsTestName());
+const testColl = testDB.getCollection("test");
+
+let reply = assert.commandWorked(testColl.insert({_id: 0}, {"writeConcern": {"w": "majority"}}));
+jsTestLog(`Inserted _id 0 into P1 ${reply.operationTime}`);
+
+rollbackTest.transitionToRollbackOperations();
+reply = assert.commandWorked(testColl.insert({_id: 1}, {"writeConcern": {"w": 1}}));
+jsTestLog(`Inserted _id 1 into P1 ${reply.operationTime}`);
+
+jsTestLog("Isolating P1 from tiebreaker");
+P1.disconnect([T]);
+
+jsTestLog("Reconnecting P2 to the tiebreaker");
+P2.reconnect([T]);
+
+jsTestLog("Step up P2");
+assert.soonNoExcept(() => {
+ const res = P2.adminCommand({replSetStepUp: 1});
+ return res.ok;
+}, "Failed to step up P2", ReplSetTest.kDefaultTimeoutMS);
+// "Transition to primary complete; database writes are now permitted".
+checkLog.containsJson(P2, 21331);
+jsTestLog("P2 stepped up");
+
+reply = assert.commandWorked(testDB.runCommand({delete: "test", deletes: [{q: {}, limit: 0}]}));
+jsTestLog(`Deleted from P1 at ${reply.operationTime}`);
+
+// Ensure P1's lastApplied > P2's, even if P1's set-up entry was written at the same timestamp as
+// P2's delete timestamp.
+assert.soon(() => {
+ testDB.runCommand({insert: "otherCollection", documents: [{}]});
+ function lastApplied(node) {
+ const reply = assert.commandWorked(node.adminCommand({replSetGetStatus: 1}));
+ return reply.optimes.appliedOpTime.ts;
+ }
+ const P1applied = lastApplied(P1);
+ const P2applied = lastApplied(P2);
+ jsTestLog(`P1 lastApplied ${P1applied}, P2 lastApplied ${P2applied}`);
+ return timestampCmp(P1applied, P2applied) > 0;
+}, "P1's lastApplied never surpassed P2's");
+
+jsTestLog("Reconnecting P1 to P2, so P1 rolls back");
+P1.reconnect([P2]);
+// "Rollback using the 'rollbackViaRefetch' method".
+checkLog.containsJson(P1, 21103);
+// "Finding the Common Point".
+checkLog.containsJson(P1, 21682);
+// "We cannot transition to SECONDARY state because our 'lastApplied' optime is less than the
+// initial data timestamp and enableMajorityReadConcern = false".
+checkLog.containsJson(P1, 4851800);
+
+reply = assert.commandWorked(
+ P2.getDB(jsTestName()).runCommand({insert: "anotherCollection", documents: [{}]}));
+jsTestLog(`Inserted into P2 at ${tojson(reply.operationTime)}`);
+
+jsTestLog("Wait for P1 to enter SECONDARY");
+waitForState(P1, ReplSetTest.State.SECONDARY);
+
+// Both counts should be 1. If SERVER-48518 isn't fixed then itCount() = 3: _ids 0, 1, and 0 again!
+jsTestLog("Check collection count");
+let itCount = testColl.find().itcount();
+let fastCount = testColl.count();
+assert.eq(itCount,
+ fastCount,
+ `count: ${fastCount}, itCount: ${itCount}, data: ${tojson(testColl.find().toArray())}`);
+
+jsTestLog("Check succeeded. Ending test.");
+P1.reconnect([T]);
+restartServerReplication(T);
+rst.stopSet();
+}());
diff --git a/jstests/replsets/rollback_via_refetch_survives_nonexistent_collection_drop.js b/jstests/replsets/rollback_via_refetch_survives_nonexistent_collection_drop.js
index e9bd8876672..37bb9da1d31 100644
--- a/jstests/replsets/rollback_via_refetch_survives_nonexistent_collection_drop.js
+++ b/jstests/replsets/rollback_via_refetch_survives_nonexistent_collection_drop.js
@@ -42,7 +42,8 @@ assert.commandWorked(rollbackDB.adminCommand(
assert.commandWorked(rollbackDB.runCommand({create: collName}));
// Step down the current primary and elect the node that does not have the collection.
-rollbackTest.transitionToSyncSourceOperationsBeforeRollback();
+rollbackTest.transitionToSyncSourceOperationsBeforeRollback(
+ {awaitPrimaryAppliedSurpassesRollbackApplied: true});
jsTestLog("Attempting to roll back.");
// Make the old primary rollback against the new primary. This attempt should fail because the
diff --git a/jstests/replsets/transactions_after_rollback_via_refetch.js b/jstests/replsets/transactions_after_rollback_via_refetch.js
index 80ef4a8ded9..16c1e56b6d6 100644
--- a/jstests/replsets/transactions_after_rollback_via_refetch.js
+++ b/jstests/replsets/transactions_after_rollback_via_refetch.js
@@ -70,7 +70,8 @@ CommonOps(rollbackTest.getPrimary());
let rollbackNode = rollbackTest.transitionToRollbackOperations();
RollbackOps(rollbackNode);
-let syncSourceNode = rollbackTest.transitionToSyncSourceOperationsBeforeRollback();
+let syncSourceNode = rollbackTest.transitionToSyncSourceOperationsBeforeRollback(
+ {awaitPrimaryAppliedSurpassesRollbackApplied: true});
SyncSourceOps(syncSourceNode);
// Wait for rollback to finish.
diff --git a/src/mongo/db/repl/replication_coordinator_impl.cpp b/src/mongo/db/repl/replication_coordinator_impl.cpp
index 0a85a178ac7..ed9a5753956 100644
--- a/src/mongo/db/repl/replication_coordinator_impl.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl.cpp
@@ -5122,6 +5122,22 @@ void ReplicationCoordinatorImpl::finishRecoveryIfEligible(OperationContext* opCt
return;
}
+ // Rolling back with eMRC false, we set initialDataTimestamp to max(local oplog top, source's
+ // oplog top), then rollback via refetch. Data is inconsistent until lastApplied >=
+ // initialDataTimestamp.
+ auto initialTs = opCtx->getServiceContext()->getStorageEngine()->getInitialDataTimestamp();
+ if (lastApplied.getTimestamp() < initialTs) {
+ invariant(!serverGlobalParams.enableMajorityReadConcern);
+ LOGV2_DEBUG(4851800,
+ 2,
+ "We cannot transition to SECONDARY state because our 'lastApplied' optime is "
+ "less than the initial data timestamp and enableMajorityReadConcern = false",
+ "minValid"_attr = minValid,
+ "lastApplied"_attr = lastApplied,
+ "initialDataTimestamp"_attr = initialTs);
+ return;
+ }
+
// Execute the transition to SECONDARY.
auto status = setFollowerMode(MemberState::RS_SECONDARY);
if (!status.isOK()) {
diff --git a/src/mongo/db/storage/storage_engine_impl.cpp b/src/mongo/db/storage/storage_engine_impl.cpp
index bcded20bb6d..1e3977ee532 100644
--- a/src/mongo/db/storage/storage_engine_impl.cpp
+++ b/src/mongo/db/storage/storage_engine_impl.cpp
@@ -778,7 +778,6 @@ void StorageEngineImpl::setStableTimestamp(Timestamp stableTimestamp, bool force
}
void StorageEngineImpl::setInitialDataTimestamp(Timestamp initialDataTimestamp) {
- _initialDataTimestamp = initialDataTimestamp;
_engine->setInitialDataTimestamp(initialDataTimestamp);
}
diff --git a/src/mongo/db/storage/storage_engine_impl.h b/src/mongo/db/storage/storage_engine_impl.h
index 6ef1a605992..7f658ba910b 100644
--- a/src/mongo/db/storage/storage_engine_impl.h
+++ b/src/mongo/db/storage/storage_engine_impl.h
@@ -398,7 +398,6 @@ private:
const bool _supportsDocLocking;
const bool _supportsCappedCollections;
- Timestamp _initialDataTimestamp = Timestamp::kAllowUnstableCheckpointsSentinel;
std::unique_ptr<RecordStore> _catalogRecordStore;
std::unique_ptr<DurableCatalogImpl> _catalog;
diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.cpp b/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.cpp
index 3fa21ec4f8d..7d98f3c7205 100644
--- a/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.cpp
+++ b/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.cpp
@@ -2400,10 +2400,6 @@ Timestamp WiredTigerKVEngine::getCheckpointTimestamp() const {
return Timestamp(_getCheckpointTimestamp());
}
-Timestamp WiredTigerKVEngine::getInitialDataTimestamp() const {
- return Timestamp(_initialDataTimestamp.load());
-}
-
std::uint64_t WiredTigerKVEngine::_getCheckpointTimestamp() const {
char buf[(2 * 8 /*bytes in hex*/) + 1 /*nul terminator*/];
invariantWTOK(_conn->query_timestamp(_conn, buf, "get=last_checkpoint"));
diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.h b/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.h
index df0dccc4373..09152ec80fe 100644
--- a/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.h
+++ b/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.h
@@ -323,8 +323,6 @@ public:
Timestamp getOldestTimestamp() const override;
Timestamp getCheckpointTimestamp() const override;
- Timestamp getInitialDataTimestamp() const;
-
/**
* Returns the data file path associated with an ident on disk. Returns boost::none if the data
* file can not be found. This will attempt to locate a file even if the storage engine's own