summaryrefslogtreecommitdiff
path: root/src/mongo/s/query/async_results_merger.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/mongo/s/query/async_results_merger.cpp')
-rw-r--r--src/mongo/s/query/async_results_merger.cpp31
1 files changed, 22 insertions, 9 deletions
diff --git a/src/mongo/s/query/async_results_merger.cpp b/src/mongo/s/query/async_results_merger.cpp
index c82de6a3bbe..984c2f85184 100644
--- a/src/mongo/s/query/async_results_merger.cpp
+++ b/src/mongo/s/query/async_results_merger.cpp
@@ -304,6 +304,14 @@ Status AsyncResultsMerger::askForNextBatch_inlock(size_t remoteIndex) {
return Status::OK();
}
+/*
+ * Note: When nextEvent() is called to do retries, only the remotes with retriable errors will
+ * be rescheduled because:
+ *
+ * 1. Other pending remotes still have callback assigned to them.
+ * 2. Remotes that already has some result will have a non-empty buffer.
+ * 3. Remotes that reached maximum retries will be in 'exhausted' state.
+ */
StatusWith<executor::TaskExecutor::EventHandle> AsyncResultsMerger::nextEvent() {
stdx::lock_guard<stdx::mutex> lk(_mutex);
@@ -441,24 +449,29 @@ void AsyncResultsMerger::handleBatchResponse(
}
}
- // If the error is retriable, schedule another request.
+ // If we can still retry the initial cursor establishment, reset the state so it can be
+ // retried the next time nextEvent is called. Never retry getMores to avoid
+ // accidentally skipping results.
if (!remote.cursorId && remote.retryCount < kMaxNumFailedHostRetryAttempts &&
ShardRegistry::kAllRetriableErrors.count(cursorResponseStatus.getStatus().code())) {
+ invariant(remote.docBuffer.empty());
+
LOG(1) << "Initial cursor establishment failed with retriable error and will be retried"
<< causedBy(cursorResponseStatus.getStatus());
++remote.retryCount;
- // Since we potentially updated the targeter that the last host it chose might be
- // faulty, the call below may end up getting a different host.
- remote.status = askForNextBatch_inlock(remoteIndex);
- if (remote.status.isOK()) {
- return;
+ remote.status = Status::OK(); // Reset status so it can be retried.
+
+ // Signal the merger thread to make it retry this remote again.
+ if (_currentEvent.isValid()) {
+ // To prevent ourselves from signalling the event twice,
+ // we set '_currentEvent' as invalid after signalling it.
+ _executor->signalEvent(_currentEvent);
+ _currentEvent = executor::TaskExecutor::EventHandle();
}
- // If we end up here, it means we failed to schedule the retry request, which is a more
- // severe error that should not be retried. Just pass through to the error handling
- // logic below.
+ return;
} else {
remote.status = cursorResponseStatus.getStatus();
}