diff options
author | Matthew Russotto <matthew.russotto@10gen.com> | 2017-06-15 10:39:43 -0400 |
---|---|---|
committer | Matthew Russotto <matthew.russotto@10gen.com> | 2017-06-15 10:39:43 -0400 |
commit | daa09e3bfa5382cfc51de3f339a486cfc8aed219 (patch) | |
tree | b697d3197f1690239bca797ba15d27c799f3350d | |
parent | 4ab6ccc79febd7b3e76c792fa67efaae4060dc05 (diff) | |
download | mongo-daa09e3bfa5382cfc51de3f339a486cfc8aed219.tar.gz |
SERVER-28877 Fix cancel race which can cause crashes during elections.
(cherry picked from commit 7844ffc9be25e16daa9615949941b8749ad3de9f)
-rw-r--r-- | src/mongo/db/repl/replication_executor.cpp | 11 | ||||
-rw-r--r-- | src/mongo/db/repl/replication_executor.h | 1 | ||||
-rw-r--r-- | src/mongo/db/repl/replication_executor_test.cpp | 80 |
3 files changed, 92 insertions, 0 deletions
diff --git a/src/mongo/db/repl/replication_executor.cpp b/src/mongo/db/repl/replication_executor.cpp index bf55ef462df..2b1c7c6124f 100644 --- a/src/mongo/db/repl/replication_executor.cpp +++ b/src/mongo/db/repl/replication_executor.cpp @@ -428,7 +428,9 @@ void ReplicationExecutor::_doOperation(OperationContext* txn, return; Callback* callback = _getCallbackFromHandle(cbHandle); const WorkQueue::iterator iter = callback->_iter; + callback->_isRemoved = true; iter->callback = CallbackHandle(); + iter->isNetworkOperation = false; _freeQueue.splice(_freeQueue.begin(), *workQueue, iter); lk.unlock(); { @@ -495,7 +497,10 @@ ReplicationExecutor::getWork() { } const WorkItem work = *_readyQueue.begin(); const CallbackHandle cbHandle = work.callback; + Callback* callback = _getCallbackFromHandle(cbHandle); + callback->_isRemoved = true; _readyQueue.begin()->callback = CallbackHandle(); + _readyQueue.begin()->isNetworkOperation = false; _freeQueue.splice(_freeQueue.begin(), _readyQueue, _readyQueue.begin()); return std::make_pair(work, cbHandle); } @@ -592,6 +597,7 @@ ReplicationExecutor::Callback::Callback(ReplicationExecutor* executor, _callbackFn(callbackFn), _isCanceled(false), _isSleeper(false), + _isRemoved(false), _iter(iter), _finishedEvent(finishedEvent) {} @@ -604,6 +610,11 @@ bool ReplicationExecutor::Callback::isCanceled() const { void ReplicationExecutor::Callback::cancel() { stdx::unique_lock<stdx::mutex> lk(_executor->_mutex); + // If this element has already been removed from the queues, + // the cancel is too late and has no effect. + if (_isRemoved) + return; + _isCanceled = true; if (_isSleeper) { diff --git a/src/mongo/db/repl/replication_executor.h b/src/mongo/db/repl/replication_executor.h index c46ef4a659a..9402a66f364 100644 --- a/src/mongo/db/repl/replication_executor.h +++ b/src/mongo/db/repl/replication_executor.h @@ -363,6 +363,7 @@ private: CallbackFn _callbackFn; bool _isCanceled; bool _isSleeper; + bool _isRemoved; WorkQueue::iterator _iter; EventHandle _finishedEvent; }; diff --git a/src/mongo/db/repl/replication_executor_test.cpp b/src/mongo/db/repl/replication_executor_test.cpp index 5bcf567b0af..60259be0b52 100644 --- a/src/mongo/db/repl/replication_executor_test.cpp +++ b/src/mongo/db/repl/replication_executor_test.cpp @@ -52,6 +52,7 @@ namespace repl { namespace { using executor::NetworkInterfaceMock; +using executor::RemoteCommandResponse; using unittest::assertGet; const int64_t prngSeed = 1; @@ -251,6 +252,85 @@ TEST_F(ReplicationExecutorTest, ScheduleCallbackAtAFutureTime) { } +TEST_F(ReplicationExecutorTest, TestForCancelRace) { + launchExecutorThread(); + getNet()->exitNetwork(); + + unittest::Barrier enterCallback(2U), runCallback(2U); + + ReplicationExecutor& executor = getReplExecutor(); + bool firstEventDone = false; + bool firstEventCanceled = false; + auto fn = [&executor, &enterCallback, &runCallback, &firstEventDone, &firstEventCanceled]( + const executor::TaskExecutor::RemoteCommandCallbackArgs& cbData) { + // This barrier lets the test code wait until we're in the callback. + enterCallback.countDownAndWait(); + // This barrier lets the test code keep us in the callback until it has run the cancel. + runCallback.countDownAndWait(); + firstEventCanceled = !cbData.response.getStatus().isOK(); + firstEventDone = true; + }; + + // First, schedule a network event to run. + const executor::RemoteCommandRequest request( + HostAndPort("test1", 1234), "mydb", BSON("nothing" << 0)); + auto firstCallback = assertGet(executor.scheduleRemoteCommand(request, fn)); + + // Now let the request happen. + // We need to run the network on another thread, because the test + // fixture will hang waiting for the callbacks to complete. + auto timeThread = stdx::thread([this] { + getNet()->enterNetwork(); + ASSERT(getNet()->hasReadyRequests()); + auto noi = getNet()->getNextReadyRequest(); + getNet()->scheduleResponse(noi, getNet()->now(), RemoteCommandResponse()); + getNet()->runReadyNetworkOperations(); + getNet()->exitNetwork(); + }); + + // Wait until we're in the callback. + enterCallback.countDownAndWait(); + + // Schedule a different network event to run. + bool secondEventDone = false; + bool secondEventCanceled = false; + auto fn2 = [&executor, &secondEventDone, &secondEventCanceled]( + const executor::TaskExecutor::RemoteCommandCallbackArgs& cbData) { + secondEventCanceled = !cbData.response.getStatus().isOK(); + secondEventDone = true; + }; + auto secondCallback = assertGet(executor.scheduleRemoteCommand(request, fn2)); + ASSERT_FALSE(firstEventDone); // The first event should be stuck at runCallback barrier. + // Cancel the first callback. This cancel should have no effect as the callback has + // already been started. + executor.cancel(firstCallback); + + // Let the first callback continue to completion. + runCallback.countDownAndWait(); + + // Now the time thread can continue. + timeThread.join(); + + // The first event should be done, the second event should be pending. + ASSERT(firstEventDone); + ASSERT_FALSE(secondEventDone); + + // Run the network thread, which should run the second request. + { + getNet()->enterNetwork(); + // The second request should be ready. + ASSERT(getNet()->hasReadyRequests()) << "Second request is not ready (cancelled?)"; + auto noi = getNet()->getNextReadyRequest(); + getNet()->scheduleResponse(noi, getNet()->now(), RemoteCommandResponse()); + getNet()->runReadyNetworkOperations(); + getNet()->exitNetwork(); + } + + // The second callback should have run without being canceled. + ASSERT_TRUE(secondEventDone); + ASSERT_FALSE(secondEventCanceled); +} + } // namespace } // namespace repl } // namespace mongo |