diff options
author | Matthew Russotto <matthew.russotto@10gen.com> | 2017-05-18 13:04:58 -0400 |
---|---|---|
committer | Matthew Russotto <matthew.russotto@10gen.com> | 2017-05-18 13:04:58 -0400 |
commit | 7844ffc9be25e16daa9615949941b8749ad3de9f (patch) | |
tree | 6363fc9b10453d35926ae08ec0a5aafb73b5229a | |
parent | 4e6736dd8def2d0643a657d1900c38e491592abb (diff) | |
download | mongo-7844ffc9be25e16daa9615949941b8749ad3de9f.tar.gz |
SERVER-28877 Fix cancel race which can cause crashes during elections.
-rw-r--r-- | src/mongo/db/repl/replication_executor.cpp | 11 | ||||
-rw-r--r-- | src/mongo/db/repl/replication_executor.h | 1 | ||||
-rw-r--r-- | src/mongo/db/repl/replication_executor_test.cpp | 77 |
3 files changed, 89 insertions, 0 deletions
diff --git a/src/mongo/db/repl/replication_executor.cpp b/src/mongo/db/repl/replication_executor.cpp index 9fcf7b01359..cfdaa785b1a 100644 --- a/src/mongo/db/repl/replication_executor.cpp +++ b/src/mongo/db/repl/replication_executor.cpp @@ -414,7 +414,9 @@ void ReplicationExecutor::_doOperation(OperationContext* opCtx, return; Callback* callback = _getCallbackFromHandle(cbHandle); const WorkQueue::iterator iter = callback->_iter; + callback->_isRemoved = true; iter->callback = CallbackHandle(); + iter->isNetworkOperation = false; _freeQueue.splice(_freeQueue.begin(), *workQueue, iter); lk.unlock(); { @@ -481,7 +483,10 @@ ReplicationExecutor::getWork() { } const WorkItem work = *_readyQueue.begin(); const CallbackHandle cbHandle = work.callback; + Callback* callback = _getCallbackFromHandle(cbHandle); + callback->_isRemoved = true; _readyQueue.begin()->callback = CallbackHandle(); + _readyQueue.begin()->isNetworkOperation = false; _freeQueue.splice(_freeQueue.begin(), _readyQueue, _readyQueue.begin()); return std::make_pair(work, cbHandle); } @@ -582,6 +587,7 @@ ReplicationExecutor::Callback::Callback(ReplicationExecutor* executor, _callbackFn(callbackFn), _isCanceled(false), _isSleeper(false), + _isRemoved(false), _iter(iter), _finishedEvent(finishedEvent) {} @@ -594,6 +600,11 @@ bool ReplicationExecutor::Callback::isCanceled() const { void ReplicationExecutor::Callback::cancel() { stdx::unique_lock<stdx::mutex> lk(_executor->_mutex); + // If this element has already been removed from the queues, + // the cancel is too late and has no effect. + if (_isRemoved) + return; + _isCanceled = true; if (_isSleeper) { diff --git a/src/mongo/db/repl/replication_executor.h b/src/mongo/db/repl/replication_executor.h index 37b05e2fa66..2da3e3f3deb 100644 --- a/src/mongo/db/repl/replication_executor.h +++ b/src/mongo/db/repl/replication_executor.h @@ -361,6 +361,7 @@ private: CallbackFn _callbackFn; bool _isCanceled; bool _isSleeper; + bool _isRemoved; WorkQueue::iterator _iter; EventHandle _finishedEvent; }; diff --git a/src/mongo/db/repl/replication_executor_test.cpp b/src/mongo/db/repl/replication_executor_test.cpp index 831259951b4..e0159598eb6 100644 --- a/src/mongo/db/repl/replication_executor_test.cpp +++ b/src/mongo/db/repl/replication_executor_test.cpp @@ -291,6 +291,83 @@ TEST_F(ReplicationExecutorTest, CallbacksAreInvokedOnClientThreads) { ASSERT_TRUE(haveClientInCallback); } +TEST_F(ReplicationExecutorTest, TestForCancelRace) { + launchExecutorThread(); + getNet()->exitNetwork(); + + unittest::Barrier enterCallback(2U), runCallback(2U); + + ReplicationExecutor& executor = getReplExecutor(); + bool firstEventDone = false; + bool firstEventCanceled = false; + auto fn = [&executor, &enterCallback, &runCallback, &firstEventDone, &firstEventCanceled]( + const executor::TaskExecutor::RemoteCommandCallbackArgs& cbData) { + // This barrier lets the test code wait until we're in the callback. + enterCallback.countDownAndWait(); + // This barrier lets the test code keep us in the callback until it has run the cancel. + runCallback.countDownAndWait(); + firstEventCanceled = !cbData.response.status.isOK(); + firstEventDone = true; + }; + + // First, schedule a network event to run. + const executor::RemoteCommandRequest request( + HostAndPort("test1", 1234), "mydb", BSON("nothing" << 0), nullptr); + auto firstCallback = assertGet(executor.scheduleRemoteCommand(request, fn)); + + // Now let the request happen. + // We need to run the network on another thread, because the test + // fixture will hang waiting for the callbacks to complete. + auto timeThread = stdx::thread([this] { + executor::NetworkInterfaceMock::InNetworkGuard guard(getNet()); + ASSERT(getNet()->hasReadyRequests()); + auto noi = getNet()->getNextReadyRequest(); + getNet()->scheduleSuccessfulResponse(noi, {}); + getNet()->runReadyNetworkOperations(); + }); + + // Wait until we're in the callback. + enterCallback.countDownAndWait(); + + // Schedule a different network event to run. + bool secondEventDone = false; + bool secondEventCanceled = false; + auto fn2 = [&executor, &secondEventDone, &secondEventCanceled]( + const executor::TaskExecutor::RemoteCommandCallbackArgs& cbData) { + secondEventCanceled = !cbData.response.status.isOK(); + secondEventDone = true; + }; + auto secondCallback = assertGet(executor.scheduleRemoteCommand(request, fn2)); + ASSERT_FALSE(firstEventDone); // The first event should be stuck at runCallback barrier. + // Cancel the first callback. This cancel should have no effect as the callback has + // already been started. + executor.cancel(firstCallback); + + // Let the first callback continue to completion. + runCallback.countDownAndWait(); + + // Now the time thread can continue. + timeThread.join(); + + // The first event should be done, the second event should be pending. + ASSERT(firstEventDone); + ASSERT_FALSE(secondEventDone); + + // Run the network thread, which should run the second request. + { + executor::NetworkInterfaceMock::InNetworkGuard guard(getNet()); + // The second request should be ready. + ASSERT(getNet()->hasReadyRequests()) << "Second request is not ready (cancelled?)"; + auto noi = getNet()->getNextReadyRequest(); + getNet()->scheduleSuccessfulResponse(noi, {}); + getNet()->runReadyNetworkOperations(); + } + + // The second callback should have run without being canceled. + ASSERT_TRUE(secondEventDone); + ASSERT_FALSE(secondEventCanceled); +} + } // namespace } // namespace repl } // namespace mongo |