diff options
author | Matthew Russotto <matthew.russotto@10gen.com> | 2017-05-18 13:04:58 -0400 |
---|---|---|
committer | Matthew Russotto <matthew.russotto@10gen.com> | 2017-06-14 11:55:02 -0400 |
commit | ca14d6a53664d228d0115be55a33f5fcd11d7fd2 (patch) | |
tree | d4b4e331de0949f20745b158b870fa86a49b7404 /src | |
parent | 786f52b43e6aced5a2c85efb42d8b9e363e3ad03 (diff) | |
download | mongo-ca14d6a53664d228d0115be55a33f5fcd11d7fd2.tar.gz |
SERVER-28877 Fix cancel race which can cause crashes during elections.
(cherry picked from commit 7844ffc9be25e16daa9615949941b8749ad3de9f)
Diffstat (limited to 'src')
-rw-r--r-- | src/mongo/db/repl/replication_executor.cpp | 11 | ||||
-rw-r--r-- | src/mongo/db/repl/replication_executor.h | 1 | ||||
-rw-r--r-- | src/mongo/db/repl/replication_executor_test.cpp | 77 |
3 files changed, 89 insertions, 0 deletions
diff --git a/src/mongo/db/repl/replication_executor.cpp b/src/mongo/db/repl/replication_executor.cpp index 7497793b6ca..9990aabedb4 100644 --- a/src/mongo/db/repl/replication_executor.cpp +++ b/src/mongo/db/repl/replication_executor.cpp @@ -427,7 +427,9 @@ void ReplicationExecutor::_doOperation(OperationContext* txn, return; Callback* callback = _getCallbackFromHandle(cbHandle); const WorkQueue::iterator iter = callback->_iter; + callback->_isRemoved = true; iter->callback = CallbackHandle(); + iter->isNetworkOperation = false; _freeQueue.splice(_freeQueue.begin(), *workQueue, iter); lk.unlock(); { @@ -494,7 +496,10 @@ ReplicationExecutor::getWork() { } const WorkItem work = *_readyQueue.begin(); const CallbackHandle cbHandle = work.callback; + Callback* callback = _getCallbackFromHandle(cbHandle); + callback->_isRemoved = true; _readyQueue.begin()->callback = CallbackHandle(); + _readyQueue.begin()->isNetworkOperation = false; _freeQueue.splice(_freeQueue.begin(), _readyQueue, _readyQueue.begin()); return std::make_pair(work, cbHandle); } @@ -595,6 +600,7 @@ ReplicationExecutor::Callback::Callback(ReplicationExecutor* executor, _callbackFn(callbackFn), _isCanceled(false), _isSleeper(false), + _isRemoved(false), _iter(iter), _finishedEvent(finishedEvent) {} @@ -607,6 +613,11 @@ bool ReplicationExecutor::Callback::isCanceled() const { void ReplicationExecutor::Callback::cancel() { stdx::unique_lock<stdx::mutex> lk(_executor->_mutex); + // If this element has already been removed from the queues, + // the cancel is too late and has no effect. + if (_isRemoved) + return; + _isCanceled = true; if (_isSleeper) { diff --git a/src/mongo/db/repl/replication_executor.h b/src/mongo/db/repl/replication_executor.h index b317ca587ec..b0e837083b2 100644 --- a/src/mongo/db/repl/replication_executor.h +++ b/src/mongo/db/repl/replication_executor.h @@ -366,6 +366,7 @@ private: CallbackFn _callbackFn; bool _isCanceled; bool _isSleeper; + bool _isRemoved; WorkQueue::iterator _iter; EventHandle _finishedEvent; }; diff --git a/src/mongo/db/repl/replication_executor_test.cpp b/src/mongo/db/repl/replication_executor_test.cpp index 98cb267f0da..ba41df46769 100644 --- a/src/mongo/db/repl/replication_executor_test.cpp +++ b/src/mongo/db/repl/replication_executor_test.cpp @@ -291,6 +291,83 @@ TEST_F(ReplicationExecutorTest, CallbacksAreInvokedOnClientThreads) { ASSERT_TRUE(haveClientInCallback); } +TEST_F(ReplicationExecutorTest, TestForCancelRace) { + launchExecutorThread(); + getNet()->exitNetwork(); + + unittest::Barrier enterCallback(2U), runCallback(2U); + + ReplicationExecutor& executor = getReplExecutor(); + bool firstEventDone = false; + bool firstEventCanceled = false; + auto fn = [&executor, &enterCallback, &runCallback, &firstEventDone, &firstEventCanceled]( + const executor::TaskExecutor::RemoteCommandCallbackArgs& cbData) { + // This barrier lets the test code wait until we're in the callback. + enterCallback.countDownAndWait(); + // This barrier lets the test code keep us in the callback until it has run the cancel. + runCallback.countDownAndWait(); + firstEventCanceled = !cbData.response.status.isOK(); + firstEventDone = true; + }; + + // First, schedule a network event to run. + const executor::RemoteCommandRequest request( + HostAndPort("test1", 1234), "mydb", BSON("nothing" << 0), nullptr); + auto firstCallback = assertGet(executor.scheduleRemoteCommand(request, fn)); + + // Now let the request happen. + // We need to run the network on another thread, because the test + // fixture will hang waiting for the callbacks to complete. + auto timeThread = stdx::thread([this] { + executor::NetworkInterfaceMock::InNetworkGuard guard(getNet()); + ASSERT(getNet()->hasReadyRequests()); + auto noi = getNet()->getNextReadyRequest(); + getNet()->scheduleSuccessfulResponse(noi, {}); + getNet()->runReadyNetworkOperations(); + }); + + // Wait until we're in the callback. + enterCallback.countDownAndWait(); + + // Schedule a different network event to run. + bool secondEventDone = false; + bool secondEventCanceled = false; + auto fn2 = [&executor, &secondEventDone, &secondEventCanceled]( + const executor::TaskExecutor::RemoteCommandCallbackArgs& cbData) { + secondEventCanceled = !cbData.response.status.isOK(); + secondEventDone = true; + }; + auto secondCallback = assertGet(executor.scheduleRemoteCommand(request, fn2)); + ASSERT_FALSE(firstEventDone); // The first event should be stuck at runCallback barrier. + // Cancel the first callback. This cancel should have no effect as the callback has + // already been started. + executor.cancel(firstCallback); + + // Let the first callback continue to completion. + runCallback.countDownAndWait(); + + // Now the time thread can continue. + timeThread.join(); + + // The first event should be done, the second event should be pending. + ASSERT(firstEventDone); + ASSERT_FALSE(secondEventDone); + + // Run the network thread, which should run the second request. + { + executor::NetworkInterfaceMock::InNetworkGuard guard(getNet()); + // The second request should be ready. + ASSERT(getNet()->hasReadyRequests()) << "Second request is not ready (cancelled?)"; + auto noi = getNet()->getNextReadyRequest(); + getNet()->scheduleSuccessfulResponse(noi, {}); + getNet()->runReadyNetworkOperations(); + } + + // The second callback should have run without being canceled. + ASSERT_TRUE(secondEventDone); + ASSERT_FALSE(secondEventCanceled); +} + } // namespace } // namespace repl } // namespace mongo |