summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMatthew Russotto <matthew.russotto@10gen.com>2017-06-15 10:39:43 -0400
committerMatthew Russotto <matthew.russotto@10gen.com>2017-06-15 10:39:43 -0400
commitdaa09e3bfa5382cfc51de3f339a486cfc8aed219 (patch)
treeb697d3197f1690239bca797ba15d27c799f3350d
parent4ab6ccc79febd7b3e76c792fa67efaae4060dc05 (diff)
downloadmongo-daa09e3bfa5382cfc51de3f339a486cfc8aed219.tar.gz
SERVER-28877 Fix cancel race which can cause crashes during elections.
(cherry picked from commit 7844ffc9be25e16daa9615949941b8749ad3de9f)
-rw-r--r--src/mongo/db/repl/replication_executor.cpp11
-rw-r--r--src/mongo/db/repl/replication_executor.h1
-rw-r--r--src/mongo/db/repl/replication_executor_test.cpp80
3 files changed, 92 insertions, 0 deletions
diff --git a/src/mongo/db/repl/replication_executor.cpp b/src/mongo/db/repl/replication_executor.cpp
index bf55ef462df..2b1c7c6124f 100644
--- a/src/mongo/db/repl/replication_executor.cpp
+++ b/src/mongo/db/repl/replication_executor.cpp
@@ -428,7 +428,9 @@ void ReplicationExecutor::_doOperation(OperationContext* txn,
return;
Callback* callback = _getCallbackFromHandle(cbHandle);
const WorkQueue::iterator iter = callback->_iter;
+ callback->_isRemoved = true;
iter->callback = CallbackHandle();
+ iter->isNetworkOperation = false;
_freeQueue.splice(_freeQueue.begin(), *workQueue, iter);
lk.unlock();
{
@@ -495,7 +497,10 @@ ReplicationExecutor::getWork() {
}
const WorkItem work = *_readyQueue.begin();
const CallbackHandle cbHandle = work.callback;
+ Callback* callback = _getCallbackFromHandle(cbHandle);
+ callback->_isRemoved = true;
_readyQueue.begin()->callback = CallbackHandle();
+ _readyQueue.begin()->isNetworkOperation = false;
_freeQueue.splice(_freeQueue.begin(), _readyQueue, _readyQueue.begin());
return std::make_pair(work, cbHandle);
}
@@ -592,6 +597,7 @@ ReplicationExecutor::Callback::Callback(ReplicationExecutor* executor,
_callbackFn(callbackFn),
_isCanceled(false),
_isSleeper(false),
+ _isRemoved(false),
_iter(iter),
_finishedEvent(finishedEvent) {}
@@ -604,6 +610,11 @@ bool ReplicationExecutor::Callback::isCanceled() const {
void ReplicationExecutor::Callback::cancel() {
stdx::unique_lock<stdx::mutex> lk(_executor->_mutex);
+ // If this element has already been removed from the queues,
+ // the cancel is too late and has no effect.
+ if (_isRemoved)
+ return;
+
_isCanceled = true;
if (_isSleeper) {
diff --git a/src/mongo/db/repl/replication_executor.h b/src/mongo/db/repl/replication_executor.h
index c46ef4a659a..9402a66f364 100644
--- a/src/mongo/db/repl/replication_executor.h
+++ b/src/mongo/db/repl/replication_executor.h
@@ -363,6 +363,7 @@ private:
CallbackFn _callbackFn;
bool _isCanceled;
bool _isSleeper;
+ bool _isRemoved;
WorkQueue::iterator _iter;
EventHandle _finishedEvent;
};
diff --git a/src/mongo/db/repl/replication_executor_test.cpp b/src/mongo/db/repl/replication_executor_test.cpp
index 5bcf567b0af..60259be0b52 100644
--- a/src/mongo/db/repl/replication_executor_test.cpp
+++ b/src/mongo/db/repl/replication_executor_test.cpp
@@ -52,6 +52,7 @@ namespace repl {
namespace {
using executor::NetworkInterfaceMock;
+using executor::RemoteCommandResponse;
using unittest::assertGet;
const int64_t prngSeed = 1;
@@ -251,6 +252,85 @@ TEST_F(ReplicationExecutorTest, ScheduleCallbackAtAFutureTime) {
}
+TEST_F(ReplicationExecutorTest, TestForCancelRace) {
+ launchExecutorThread();
+ getNet()->exitNetwork();
+
+ unittest::Barrier enterCallback(2U), runCallback(2U);
+
+ ReplicationExecutor& executor = getReplExecutor();
+ bool firstEventDone = false;
+ bool firstEventCanceled = false;
+ auto fn = [&executor, &enterCallback, &runCallback, &firstEventDone, &firstEventCanceled](
+ const executor::TaskExecutor::RemoteCommandCallbackArgs& cbData) {
+ // This barrier lets the test code wait until we're in the callback.
+ enterCallback.countDownAndWait();
+ // This barrier lets the test code keep us in the callback until it has run the cancel.
+ runCallback.countDownAndWait();
+ firstEventCanceled = !cbData.response.getStatus().isOK();
+ firstEventDone = true;
+ };
+
+ // First, schedule a network event to run.
+ const executor::RemoteCommandRequest request(
+ HostAndPort("test1", 1234), "mydb", BSON("nothing" << 0));
+ auto firstCallback = assertGet(executor.scheduleRemoteCommand(request, fn));
+
+ // Now let the request happen.
+ // We need to run the network on another thread, because the test
+ // fixture will hang waiting for the callbacks to complete.
+ auto timeThread = stdx::thread([this] {
+ getNet()->enterNetwork();
+ ASSERT(getNet()->hasReadyRequests());
+ auto noi = getNet()->getNextReadyRequest();
+ getNet()->scheduleResponse(noi, getNet()->now(), RemoteCommandResponse());
+ getNet()->runReadyNetworkOperations();
+ getNet()->exitNetwork();
+ });
+
+ // Wait until we're in the callback.
+ enterCallback.countDownAndWait();
+
+ // Schedule a different network event to run.
+ bool secondEventDone = false;
+ bool secondEventCanceled = false;
+ auto fn2 = [&executor, &secondEventDone, &secondEventCanceled](
+ const executor::TaskExecutor::RemoteCommandCallbackArgs& cbData) {
+ secondEventCanceled = !cbData.response.getStatus().isOK();
+ secondEventDone = true;
+ };
+ auto secondCallback = assertGet(executor.scheduleRemoteCommand(request, fn2));
+ ASSERT_FALSE(firstEventDone); // The first event should be stuck at runCallback barrier.
+ // Cancel the first callback. This cancel should have no effect as the callback has
+ // already been started.
+ executor.cancel(firstCallback);
+
+ // Let the first callback continue to completion.
+ runCallback.countDownAndWait();
+
+ // Now the time thread can continue.
+ timeThread.join();
+
+ // The first event should be done, the second event should be pending.
+ ASSERT(firstEventDone);
+ ASSERT_FALSE(secondEventDone);
+
+ // Run the network thread, which should run the second request.
+ {
+ getNet()->enterNetwork();
+ // The second request should be ready.
+ ASSERT(getNet()->hasReadyRequests()) << "Second request is not ready (cancelled?)";
+ auto noi = getNet()->getNextReadyRequest();
+ getNet()->scheduleResponse(noi, getNet()->now(), RemoteCommandResponse());
+ getNet()->runReadyNetworkOperations();
+ getNet()->exitNetwork();
+ }
+
+ // The second callback should have run without being canceled.
+ ASSERT_TRUE(secondEventDone);
+ ASSERT_FALSE(secondEventCanceled);
+}
+
} // namespace
} // namespace repl
} // namespace mongo