summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorMatthew Russotto <matthew.russotto@10gen.com>2017-05-18 13:04:58 -0400
committerMatthew Russotto <matthew.russotto@10gen.com>2017-06-14 11:55:02 -0400
commitca14d6a53664d228d0115be55a33f5fcd11d7fd2 (patch)
treed4b4e331de0949f20745b158b870fa86a49b7404 /src
parent786f52b43e6aced5a2c85efb42d8b9e363e3ad03 (diff)
downloadmongo-ca14d6a53664d228d0115be55a33f5fcd11d7fd2.tar.gz
SERVER-28877 Fix cancel race which can cause crashes during elections.
(cherry picked from commit 7844ffc9be25e16daa9615949941b8749ad3de9f)
Diffstat (limited to 'src')
-rw-r--r--src/mongo/db/repl/replication_executor.cpp11
-rw-r--r--src/mongo/db/repl/replication_executor.h1
-rw-r--r--src/mongo/db/repl/replication_executor_test.cpp77
3 files changed, 89 insertions, 0 deletions
diff --git a/src/mongo/db/repl/replication_executor.cpp b/src/mongo/db/repl/replication_executor.cpp
index 7497793b6ca..9990aabedb4 100644
--- a/src/mongo/db/repl/replication_executor.cpp
+++ b/src/mongo/db/repl/replication_executor.cpp
@@ -427,7 +427,9 @@ void ReplicationExecutor::_doOperation(OperationContext* txn,
return;
Callback* callback = _getCallbackFromHandle(cbHandle);
const WorkQueue::iterator iter = callback->_iter;
+ callback->_isRemoved = true;
iter->callback = CallbackHandle();
+ iter->isNetworkOperation = false;
_freeQueue.splice(_freeQueue.begin(), *workQueue, iter);
lk.unlock();
{
@@ -494,7 +496,10 @@ ReplicationExecutor::getWork() {
}
const WorkItem work = *_readyQueue.begin();
const CallbackHandle cbHandle = work.callback;
+ Callback* callback = _getCallbackFromHandle(cbHandle);
+ callback->_isRemoved = true;
_readyQueue.begin()->callback = CallbackHandle();
+ _readyQueue.begin()->isNetworkOperation = false;
_freeQueue.splice(_freeQueue.begin(), _readyQueue, _readyQueue.begin());
return std::make_pair(work, cbHandle);
}
@@ -595,6 +600,7 @@ ReplicationExecutor::Callback::Callback(ReplicationExecutor* executor,
_callbackFn(callbackFn),
_isCanceled(false),
_isSleeper(false),
+ _isRemoved(false),
_iter(iter),
_finishedEvent(finishedEvent) {}
@@ -607,6 +613,11 @@ bool ReplicationExecutor::Callback::isCanceled() const {
void ReplicationExecutor::Callback::cancel() {
stdx::unique_lock<stdx::mutex> lk(_executor->_mutex);
+ // If this element has already been removed from the queues,
+ // the cancel is too late and has no effect.
+ if (_isRemoved)
+ return;
+
_isCanceled = true;
if (_isSleeper) {
diff --git a/src/mongo/db/repl/replication_executor.h b/src/mongo/db/repl/replication_executor.h
index b317ca587ec..b0e837083b2 100644
--- a/src/mongo/db/repl/replication_executor.h
+++ b/src/mongo/db/repl/replication_executor.h
@@ -366,6 +366,7 @@ private:
CallbackFn _callbackFn;
bool _isCanceled;
bool _isSleeper;
+ bool _isRemoved;
WorkQueue::iterator _iter;
EventHandle _finishedEvent;
};
diff --git a/src/mongo/db/repl/replication_executor_test.cpp b/src/mongo/db/repl/replication_executor_test.cpp
index 98cb267f0da..ba41df46769 100644
--- a/src/mongo/db/repl/replication_executor_test.cpp
+++ b/src/mongo/db/repl/replication_executor_test.cpp
@@ -291,6 +291,83 @@ TEST_F(ReplicationExecutorTest, CallbacksAreInvokedOnClientThreads) {
ASSERT_TRUE(haveClientInCallback);
}
+TEST_F(ReplicationExecutorTest, TestForCancelRace) {
+ launchExecutorThread();
+ getNet()->exitNetwork();
+
+ unittest::Barrier enterCallback(2U), runCallback(2U);
+
+ ReplicationExecutor& executor = getReplExecutor();
+ bool firstEventDone = false;
+ bool firstEventCanceled = false;
+ auto fn = [&executor, &enterCallback, &runCallback, &firstEventDone, &firstEventCanceled](
+ const executor::TaskExecutor::RemoteCommandCallbackArgs& cbData) {
+ // This barrier lets the test code wait until we're in the callback.
+ enterCallback.countDownAndWait();
+ // This barrier lets the test code keep us in the callback until it has run the cancel.
+ runCallback.countDownAndWait();
+ firstEventCanceled = !cbData.response.status.isOK();
+ firstEventDone = true;
+ };
+
+ // First, schedule a network event to run.
+ const executor::RemoteCommandRequest request(
+ HostAndPort("test1", 1234), "mydb", BSON("nothing" << 0), nullptr);
+ auto firstCallback = assertGet(executor.scheduleRemoteCommand(request, fn));
+
+ // Now let the request happen.
+ // We need to run the network on another thread, because the test
+ // fixture will hang waiting for the callbacks to complete.
+ auto timeThread = stdx::thread([this] {
+ executor::NetworkInterfaceMock::InNetworkGuard guard(getNet());
+ ASSERT(getNet()->hasReadyRequests());
+ auto noi = getNet()->getNextReadyRequest();
+ getNet()->scheduleSuccessfulResponse(noi, {});
+ getNet()->runReadyNetworkOperations();
+ });
+
+ // Wait until we're in the callback.
+ enterCallback.countDownAndWait();
+
+ // Schedule a different network event to run.
+ bool secondEventDone = false;
+ bool secondEventCanceled = false;
+ auto fn2 = [&executor, &secondEventDone, &secondEventCanceled](
+ const executor::TaskExecutor::RemoteCommandCallbackArgs& cbData) {
+ secondEventCanceled = !cbData.response.status.isOK();
+ secondEventDone = true;
+ };
+ auto secondCallback = assertGet(executor.scheduleRemoteCommand(request, fn2));
+ ASSERT_FALSE(firstEventDone); // The first event should be stuck at runCallback barrier.
+ // Cancel the first callback. This cancel should have no effect as the callback has
+ // already been started.
+ executor.cancel(firstCallback);
+
+ // Let the first callback continue to completion.
+ runCallback.countDownAndWait();
+
+ // Now the time thread can continue.
+ timeThread.join();
+
+ // The first event should be done, the second event should be pending.
+ ASSERT(firstEventDone);
+ ASSERT_FALSE(secondEventDone);
+
+ // Run the network thread, which should run the second request.
+ {
+ executor::NetworkInterfaceMock::InNetworkGuard guard(getNet());
+ // The second request should be ready.
+ ASSERT(getNet()->hasReadyRequests()) << "Second request is not ready (cancelled?)";
+ auto noi = getNet()->getNextReadyRequest();
+ getNet()->scheduleSuccessfulResponse(noi, {});
+ getNet()->runReadyNetworkOperations();
+ }
+
+ // The second callback should have run without being canceled.
+ ASSERT_TRUE(secondEventDone);
+ ASSERT_FALSE(secondEventCanceled);
+}
+
} // namespace
} // namespace repl
} // namespace mongo