summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorWilliam Schultz <william.schultz@mongodb.com>2020-01-22 22:54:43 +0000
committerevergreen <evergreen@mongodb.com>2020-01-22 22:54:43 +0000
commit759e930c88081aa0fb86e34a3ce7b2ed190c806e (patch)
tree4e89fdb981bc212ec94ed2fbe28ed0b6252db003
parent7f0125ebf52d2695bd5790790ad9825a95e0496a (diff)
downloadmongo-759e930c88081aa0fb86e34a3ce7b2ed190c806e.tar.gz
SERVER-45010 Avoid taking a checkpoint on clean shutdown if stableTimestamp < initialDataTimestamp
-rw-r--r--etc/backports_required_for_multiversion_tests.yml2
-rw-r--r--jstests/replsets/rollback_dup_ids_clean_shutdown_during_rollback.js55
-rw-r--r--src/mongo/db/repl/bgsync.cpp10
-rw-r--r--src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.cpp12
4 files changed, 78 insertions, 1 deletions
diff --git a/etc/backports_required_for_multiversion_tests.yml b/etc/backports_required_for_multiversion_tests.yml
index dba91a4caa7..61d69292c6d 100644
--- a/etc/backports_required_for_multiversion_tests.yml
+++ b/etc/backports_required_for_multiversion_tests.yml
@@ -49,6 +49,8 @@ replica_sets_multiversion:
test_file: jstests/replsets/read_operations_during_step_up.js
- ticket: SERVER-42825
test_file: jstests/replsets/step_down_on_secondary.js
+- ticket: SERVER-45010
+ test_file: jstests/replsets/rollback_dup_ids_clean_shutdown_during_rollback.js
sharding_multiversion:
- ticket: SERVER-38691
diff --git a/jstests/replsets/rollback_dup_ids_clean_shutdown_during_rollback.js b/jstests/replsets/rollback_dup_ids_clean_shutdown_during_rollback.js
new file mode 100644
index 00000000000..db623b1d67d
--- /dev/null
+++ b/jstests/replsets/rollback_dup_ids_clean_shutdown_during_rollback.js
@@ -0,0 +1,55 @@
+// When run with --majorityReadConcern=off, this test reproduces the bug described in SERVER-38925,
+// where rolling back a delete followed by a restart produces documents with duplicate _id.
+//
+// In this test we also make sure that a clean shutdown during rollback does not overwrite the
+// unstable checkpoints taken during the rollback process.
+//
+// @tags: [requires_persistence]
+//
+(function() {
+"use strict";
+
+load("jstests/replsets/libs/rollback_test.js");
+
+TestData.rollbackShutdowns = true;
+let dbName = "test";
+let sourceCollName = "coll";
+
+let doc1 = {_id: 1, x: "document_of_interest"};
+
+let CommonOps = (node) => {
+ // Insert a document that will exist on all nodes.
+ assert.commandWorked(node.getDB(dbName)[sourceCollName].insert(doc1));
+};
+
+let RollbackOps = (node) => {
+ // Delete the document on rollback node so it will be refetched from sync source.
+ assert.commandWorked(node.getDB(dbName)[sourceCollName].remove({_id: 1}));
+};
+
+// Set up Rollback Test.
+let rollbackTest = new RollbackTest();
+CommonOps(rollbackTest.getPrimary());
+
+let rollbackNode = rollbackTest.transitionToRollbackOperations();
+// Have rollback hang after it has taken an unstable checkpoint but before it completes.
+rollbackNode.adminCommand({configureFailPoint: 'bgSyncHangAfterRunRollback', mode: 'alwaysOn'});
+RollbackOps(rollbackNode);
+
+// Advance the test so the rollback node starts rollback; it will then hang at the failpoint.
+rollbackTest.transitionToSyncSourceOperationsBeforeRollback();
+rollbackTest.transitionToSyncSourceOperationsDuringRollback();
+
+jsTestLog("Waiting for rollback node to hit failpoint.");
+checkLog.contains(rollbackNode, "bgSyncHangAfterRunRollback failpoint is set");
+
+// Sending a shutdown signal to the node should cause us to break out of the hung failpoint, so we
+// don't need to explicitly turn the failpoint off.
+jsTestLog("Restarting rollback node with a clean shutdown.");
+rollbackTest.restartNode(0, 15 /* SIGTERM */);
+
+rollbackTest.transitionToSteadyStateOperations();
+
+// Check the replica set.
+rollbackTest.stop();
+}()); \ No newline at end of file
diff --git a/src/mongo/db/repl/bgsync.cpp b/src/mongo/db/repl/bgsync.cpp
index e08f286b5be..bbe81147ee0 100644
--- a/src/mongo/db/repl/bgsync.cpp
+++ b/src/mongo/db/repl/bgsync.cpp
@@ -147,6 +147,9 @@ MONGO_FAIL_POINT_DEFINE(rollbackHangBeforeStart);
// Failpoint to override the time to sleep before retrying sync source selection.
MONGO_FAIL_POINT_DEFINE(forceBgSyncSyncSourceRetryWaitMS);
+// Failpoint which makes _produce() hang after _runRollback() returns, until disabled or shutdown.
+MONGO_FAIL_POINT_DEFINE(bgSyncHangAfterRunRollback);
+
BackgroundSync::BackgroundSync(
ReplicationCoordinator* replicationCoordinator,
ReplicationCoordinatorExternalState* replicationCoordinatorExternalState,
@@ -515,6 +518,13 @@ void BackgroundSync::_produce() {
auto storageInterface = StorageInterface::get(opCtx.get());
_runRollback(
opCtx.get(), fetcherReturnStatus, source, syncSourceResp.rbid, storageInterface);
+
+ if (bgSyncHangAfterRunRollback.shouldFail()) {
+ log() << "bgSyncHangAfterRunRollback failpoint is set.";
+ while (MONGO_unlikely(bgSyncHangAfterRunRollback.shouldFail()) && !inShutdown()) {
+ mongo::sleepmillis(100);
+ }
+ }
} else if (fetcherReturnStatus == ErrorCodes::InvalidBSON) {
Seconds blacklistDuration(60);
warning() << "Fetcher got invalid BSON while querying oplog. Blacklisting sync source "
diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.cpp b/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.cpp
index 4d01620b268..f85299d0422 100644
--- a/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.cpp
+++ b/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.cpp
@@ -91,6 +91,7 @@
#include "mongo/util/exit.h"
#include "mongo/util/log.h"
#include "mongo/util/processinfo.h"
+#include "mongo/util/quick_exit.h"
#include "mongo/util/scopeguard.h"
#include "mongo/util/time_support.h"
@@ -974,7 +975,16 @@ void WiredTigerKVEngine::cleanShutdown() {
closeConfig += "use_timestamp=false,";
}
- invariantWTOK(_conn->close(_conn, closeConfig.c_str()));
+ const Timestamp stableTimestamp = getStableTimestamp();
+ const Timestamp initialDataTimestamp = getInitialDataTimestamp();
+ if (stableTimestamp >= initialDataTimestamp) {
+ invariantWTOK(_conn->close(_conn, closeConfig.c_str()));
+ } else {
+ log() << "Skipping checkpoint during clean shutdown because stableTimestamp ("
+ << stableTimestamp << ") is less than the initialDataTimestamp ("
+ << initialDataTimestamp << ")";
+ quickExit(EXIT_SUCCESS);
+ }
_conn = nullptr;
}