diff options
-rw-r--r-- | etc/backports_required_for_multiversion_tests.yml | 2 | ||||
-rw-r--r-- | jstests/replsets/rollback_dup_ids_clean_shutdown_during_rollback.js | 55 | ||||
-rw-r--r-- | src/mongo/db/repl/bgsync.cpp | 10 | ||||
-rw-r--r-- | src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.cpp | 12 |
4 files changed, 78 insertions, 1 deletions
diff --git a/etc/backports_required_for_multiversion_tests.yml b/etc/backports_required_for_multiversion_tests.yml index dba91a4caa7..61d69292c6d 100644 --- a/etc/backports_required_for_multiversion_tests.yml +++ b/etc/backports_required_for_multiversion_tests.yml @@ -49,6 +49,8 @@ replica_sets_multiversion: test_file: jstests/replsets/read_operations_during_step_up.js - ticket: SERVER-42825 test_file: jstests/replsets/step_down_on_secondary.js +- ticket: SERVER-45010 + test_file: jstests/replsets/rollback_dup_ids_clean_shutdown_during_rollback.js sharding_multiversion: - ticket: SERVER-38691 diff --git a/jstests/replsets/rollback_dup_ids_clean_shutdown_during_rollback.js b/jstests/replsets/rollback_dup_ids_clean_shutdown_during_rollback.js new file mode 100644 index 00000000000..db623b1d67d --- /dev/null +++ b/jstests/replsets/rollback_dup_ids_clean_shutdown_during_rollback.js @@ -0,0 +1,55 @@ +// When run with --majorityReadConcern=off, this test reproduces the bug described in SERVER-38925, +// where rolling back a delete followed by a restart produces documents with duplicate _id. +// +// In this test we also make sure that a clean shutdown during rollback does not overwrite the +// unstable checkpoints taken during the rollback process. +// +// @tags: [requires_persistence] +// +(function() { +"use strict"; + +load("jstests/replsets/libs/rollback_test.js"); + +TestData.rollbackShutdowns = true; +let dbName = "test"; +let sourceCollName = "coll"; + +let doc1 = {_id: 1, x: "document_of_interest"}; + +let CommonOps = (node) => { + // Insert a document that will exist on all nodes. + assert.commandWorked(node.getDB(dbName)[sourceCollName].insert(doc1)); +}; + +let RollbackOps = (node) => { + // Delete the document on rollback node so it will be refetched from sync source. + assert.commandWorked(node.getDB(dbName)[sourceCollName].remove({_id: 1})); +}; + +// Set up Rollback Test. 
+let rollbackTest = new RollbackTest(); +CommonOps(rollbackTest.getPrimary()); + +let rollbackNode = rollbackTest.transitionToRollbackOperations(); +// Have rollback hang after it has taken an unstable checkpoint but before it completes. +rollbackNode.adminCommand({configureFailPoint: 'bgSyncHangAfterRunRollback', mode: 'alwaysOn'}); +RollbackOps(rollbackNode); + +// Advance the test so the rollback node starts rolling back; it then hangs at the failpoint. +rollbackTest.transitionToSyncSourceOperationsBeforeRollback(); +rollbackTest.transitionToSyncSourceOperationsDuringRollback(); + +jsTestLog("Waiting for rollback node to hit failpoint."); +checkLog.contains(rollbackNode, "bgSyncHangAfterRunRollback failpoint is set"); + +// Sending a shutdown signal to the node should cause us to break out of the hung failpoint, so we +// don't need to explicitly turn the failpoint off. +jsTestLog("Restarting rollback node with a clean shutdown."); +rollbackTest.restartNode(0, 15 /* SIGTERM */); + +rollbackTest.transitionToSteadyStateOperations(); + +// Check the replica set. +rollbackTest.stop(); +}());
\ No newline at end of file diff --git a/src/mongo/db/repl/bgsync.cpp b/src/mongo/db/repl/bgsync.cpp index e08f286b5be..bbe81147ee0 100644 --- a/src/mongo/db/repl/bgsync.cpp +++ b/src/mongo/db/repl/bgsync.cpp @@ -147,6 +147,9 @@ MONGO_FAIL_POINT_DEFINE(rollbackHangBeforeStart); // Failpoint to override the time to sleep before retrying sync source selection. MONGO_FAIL_POINT_DEFINE(forceBgSyncSyncSourceRetryWaitMS); +// Failpoint which causes rollback to hang after completing. +MONGO_FAIL_POINT_DEFINE(bgSyncHangAfterRunRollback); + BackgroundSync::BackgroundSync( ReplicationCoordinator* replicationCoordinator, ReplicationCoordinatorExternalState* replicationCoordinatorExternalState, @@ -515,6 +518,13 @@ void BackgroundSync::_produce() { auto storageInterface = StorageInterface::get(opCtx.get()); _runRollback( opCtx.get(), fetcherReturnStatus, source, syncSourceResp.rbid, storageInterface); + + if (bgSyncHangAfterRunRollback.shouldFail()) { + log() << "bgSyncHangAfterRunRollback failpoint is set."; + while (MONGO_unlikely(bgSyncHangAfterRunRollback.shouldFail()) && !inShutdown()) { + mongo::sleepmillis(100); + } + } } else if (fetcherReturnStatus == ErrorCodes::InvalidBSON) { Seconds blacklistDuration(60); warning() << "Fetcher got invalid BSON while querying oplog. 
Blacklisting sync source " diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.cpp b/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.cpp index 4d01620b268..f85299d0422 100644 --- a/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.cpp +++ b/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.cpp @@ -91,6 +91,7 @@ #include "mongo/util/exit.h" #include "mongo/util/log.h" #include "mongo/util/processinfo.h" +#include "mongo/util/quick_exit.h" #include "mongo/util/scopeguard.h" #include "mongo/util/time_support.h" @@ -974,7 +975,16 @@ void WiredTigerKVEngine::cleanShutdown() { closeConfig += "use_timestamp=false,"; } - invariantWTOK(_conn->close(_conn, closeConfig.c_str())); + const Timestamp stableTimestamp = getStableTimestamp(); + const Timestamp initialDataTimestamp = getInitialDataTimestamp(); + if (stableTimestamp >= initialDataTimestamp) { + invariantWTOK(_conn->close(_conn, closeConfig.c_str())); + } else { + log() << "Skipping checkpoint during clean shutdown because stableTimestamp (" + << stableTimestamp << ") is less than the initialDataTimestamp (" + << initialDataTimestamp << ")"; + quickExit(EXIT_SUCCESS); + } _conn = nullptr; } |