summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMatthew Russotto <matthew.russotto@10gen.com>2017-05-18 13:04:58 -0400
committerMatthew Russotto <matthew.russotto@10gen.com>2017-05-18 13:04:58 -0400
commit7844ffc9be25e16daa9615949941b8749ad3de9f (patch)
tree6363fc9b10453d35926ae08ec0a5aafb73b5229a
parent4e6736dd8def2d0643a657d1900c38e491592abb (diff)
downloadmongo-7844ffc9be25e16daa9615949941b8749ad3de9f.tar.gz
SERVER-28877 Fix cancel race which can cause crashes during elections.
-rw-r--r--src/mongo/db/repl/replication_executor.cpp11
-rw-r--r--src/mongo/db/repl/replication_executor.h1
-rw-r--r--src/mongo/db/repl/replication_executor_test.cpp77
3 files changed, 89 insertions, 0 deletions
diff --git a/src/mongo/db/repl/replication_executor.cpp b/src/mongo/db/repl/replication_executor.cpp
index 9fcf7b01359..cfdaa785b1a 100644
--- a/src/mongo/db/repl/replication_executor.cpp
+++ b/src/mongo/db/repl/replication_executor.cpp
@@ -414,7 +414,9 @@ void ReplicationExecutor::_doOperation(OperationContext* opCtx,
return;
Callback* callback = _getCallbackFromHandle(cbHandle);
const WorkQueue::iterator iter = callback->_iter;
+ callback->_isRemoved = true;
iter->callback = CallbackHandle();
+ iter->isNetworkOperation = false;
_freeQueue.splice(_freeQueue.begin(), *workQueue, iter);
lk.unlock();
{
@@ -481,7 +483,10 @@ ReplicationExecutor::getWork() {
}
const WorkItem work = *_readyQueue.begin();
const CallbackHandle cbHandle = work.callback;
+ Callback* callback = _getCallbackFromHandle(cbHandle);
+ callback->_isRemoved = true;
_readyQueue.begin()->callback = CallbackHandle();
+ _readyQueue.begin()->isNetworkOperation = false;
_freeQueue.splice(_freeQueue.begin(), _readyQueue, _readyQueue.begin());
return std::make_pair(work, cbHandle);
}
@@ -582,6 +587,7 @@ ReplicationExecutor::Callback::Callback(ReplicationExecutor* executor,
_callbackFn(callbackFn),
_isCanceled(false),
_isSleeper(false),
+ _isRemoved(false),
_iter(iter),
_finishedEvent(finishedEvent) {}
@@ -594,6 +600,11 @@ bool ReplicationExecutor::Callback::isCanceled() const {
void ReplicationExecutor::Callback::cancel() {
stdx::unique_lock<stdx::mutex> lk(_executor->_mutex);
+ // If this element has already been removed from the queues,
+ // the cancel is too late and has no effect.
+ if (_isRemoved)
+ return;
+
_isCanceled = true;
if (_isSleeper) {
diff --git a/src/mongo/db/repl/replication_executor.h b/src/mongo/db/repl/replication_executor.h
index 37b05e2fa66..2da3e3f3deb 100644
--- a/src/mongo/db/repl/replication_executor.h
+++ b/src/mongo/db/repl/replication_executor.h
@@ -361,6 +361,7 @@ private:
CallbackFn _callbackFn;
bool _isCanceled;
bool _isSleeper;
+ bool _isRemoved;
WorkQueue::iterator _iter;
EventHandle _finishedEvent;
};
diff --git a/src/mongo/db/repl/replication_executor_test.cpp b/src/mongo/db/repl/replication_executor_test.cpp
index 831259951b4..e0159598eb6 100644
--- a/src/mongo/db/repl/replication_executor_test.cpp
+++ b/src/mongo/db/repl/replication_executor_test.cpp
@@ -291,6 +291,83 @@ TEST_F(ReplicationExecutorTest, CallbacksAreInvokedOnClientThreads) {
ASSERT_TRUE(haveClientInCallback);
}
+TEST_F(ReplicationExecutorTest, TestForCancelRace) {
+ launchExecutorThread();
+ getNet()->exitNetwork();
+
+ unittest::Barrier enterCallback(2U), runCallback(2U);
+
+ ReplicationExecutor& executor = getReplExecutor();
+ bool firstEventDone = false;
+ bool firstEventCanceled = false;
+ auto fn = [&executor, &enterCallback, &runCallback, &firstEventDone, &firstEventCanceled](
+ const executor::TaskExecutor::RemoteCommandCallbackArgs& cbData) {
+ // This barrier lets the test code wait until we're in the callback.
+ enterCallback.countDownAndWait();
+ // This barrier lets the test code keep us in the callback until it has run the cancel.
+ runCallback.countDownAndWait();
+ firstEventCanceled = !cbData.response.status.isOK();
+ firstEventDone = true;
+ };
+
+ // First, schedule a network event to run.
+ const executor::RemoteCommandRequest request(
+ HostAndPort("test1", 1234), "mydb", BSON("nothing" << 0), nullptr);
+ auto firstCallback = assertGet(executor.scheduleRemoteCommand(request, fn));
+
+ // Now let the request happen.
+ // We need to run the network on another thread, because the test
+ // fixture will hang waiting for the callbacks to complete.
+ auto timeThread = stdx::thread([this] {
+ executor::NetworkInterfaceMock::InNetworkGuard guard(getNet());
+ ASSERT(getNet()->hasReadyRequests());
+ auto noi = getNet()->getNextReadyRequest();
+ getNet()->scheduleSuccessfulResponse(noi, {});
+ getNet()->runReadyNetworkOperations();
+ });
+
+ // Wait until we're in the callback.
+ enterCallback.countDownAndWait();
+
+ // Schedule a different network event to run.
+ bool secondEventDone = false;
+ bool secondEventCanceled = false;
+ auto fn2 = [&executor, &secondEventDone, &secondEventCanceled](
+ const executor::TaskExecutor::RemoteCommandCallbackArgs& cbData) {
+ secondEventCanceled = !cbData.response.status.isOK();
+ secondEventDone = true;
+ };
+ auto secondCallback = assertGet(executor.scheduleRemoteCommand(request, fn2));
+ ASSERT_FALSE(firstEventDone); // The first event should be stuck at runCallback barrier.
+ // Cancel the first callback. This cancel should have no effect as the callback has
+ // already been started.
+ executor.cancel(firstCallback);
+
+ // Let the first callback continue to completion.
+ runCallback.countDownAndWait();
+
+ // Now the time thread can continue.
+ timeThread.join();
+
+ // The first event should be done, the second event should be pending.
+ ASSERT(firstEventDone);
+ ASSERT_FALSE(secondEventDone);
+
+ // Run the network thread, which should run the second request.
+ {
+ executor::NetworkInterfaceMock::InNetworkGuard guard(getNet());
+ // The second request should be ready.
+ ASSERT(getNet()->hasReadyRequests()) << "Second request is not ready (cancelled?)";
+ auto noi = getNet()->getNextReadyRequest();
+ getNet()->scheduleSuccessfulResponse(noi, {});
+ getNet()->runReadyNetworkOperations();
+ }
+
+ // The second callback should have run without being canceled.
+ ASSERT_TRUE(secondEventDone);
+ ASSERT_FALSE(secondEventCanceled);
+}
+
} // namespace
} // namespace repl
} // namespace mongo