diff options
Diffstat (limited to 'src/mongo/s/query/async_results_merger.cpp')
-rw-r--r-- | src/mongo/s/query/async_results_merger.cpp | 31 |
1 files changed, 22 insertions, 9 deletions
diff --git a/src/mongo/s/query/async_results_merger.cpp b/src/mongo/s/query/async_results_merger.cpp index c82de6a3bbe..984c2f85184 100644 --- a/src/mongo/s/query/async_results_merger.cpp +++ b/src/mongo/s/query/async_results_merger.cpp @@ -304,6 +304,14 @@ Status AsyncResultsMerger::askForNextBatch_inlock(size_t remoteIndex) { return Status::OK(); } +/* + * Note: When nextEvent() is called to do retries, only the remotes with retriable errors will + * be rescheduled because: + * + * 1. Other pending remotes still have callback assigned to them. + * 2. Remotes that already has some result will have a non-empty buffer. + * 3. Remotes that reached maximum retries will be in 'exhausted' state. + */ StatusWith<executor::TaskExecutor::EventHandle> AsyncResultsMerger::nextEvent() { stdx::lock_guard<stdx::mutex> lk(_mutex); @@ -441,24 +449,29 @@ void AsyncResultsMerger::handleBatchResponse( } } - // If the error is retriable, schedule another request. + // If we can still retry the initial cursor establishment, reset the state so it can be + // retried the next time nextEvent is called. Never retry getMores to avoid + // accidentally skipping results. if (!remote.cursorId && remote.retryCount < kMaxNumFailedHostRetryAttempts && ShardRegistry::kAllRetriableErrors.count(cursorResponseStatus.getStatus().code())) { + invariant(remote.docBuffer.empty()); + LOG(1) << "Initial cursor establishment failed with retriable error and will be retried" << causedBy(cursorResponseStatus.getStatus()); ++remote.retryCount; - // Since we potentially updated the targeter that the last host it chose might be - // faulty, the call below may end up getting a different host. - remote.status = askForNextBatch_inlock(remoteIndex); - if (remote.status.isOK()) { - return; + remote.status = Status::OK(); // Reset status so it can be retried. + + // Signal the merger thread to make it retry this remote again. + if (_currentEvent.isValid()) { + // To prevent ourselves from signalling the event twice, + // we set '_currentEvent' as invalid after signalling it. + _executor->signalEvent(_currentEvent); + _currentEvent = executor::TaskExecutor::EventHandle(); } - // If we end up here, it means we failed to schedule the retry request, which is a more - // severe error that should not be retried. Just pass through to the error handling - // logic below. + return; } else { remote.status = cursorResponseStatus.getStatus(); } |