diff options
author | Nick Vatamaniuc <vatamane@apache.org> | 2017-10-30 14:41:23 -0400 |
---|---|---|
committer | Joan Touzet <joant@atypical.net> | 2017-10-31 20:01:12 -0400 |
commit | f8c082453c78a6a434bffdbb31460cde98494446 (patch) | |
tree | 8aeb588c9b8db14d63ce7577b2c515017c2bc65c | |
parent | 235ac7115070b42193746b9ba5f9c2ab0caa97c5 (diff) | |
download | couchdb-f8c082453c78a6a434bffdbb31460cde98494446.tar.gz |
Configurable delay before retrying on missing_doc error
Implement a configurable delay before retrying a document fetch in the replicator.
missing_doc exceptions usually happen when there is a continuous replication
set up and the source is updated. The change might appear in the changes feed,
but when the worker tries to fetch the document's revisions, it talks to a
node where internal replication hasn't caught up yet, and so it throws an exception.
Previously the delay was hard-coded at 0 (that is, retrying was immediate). The
replication would still make progress, but only after crashing, retrying, and
generating a lot of unnecessary log noise. Since updating a source while
continuous replication is running is a common scenario, it's worth optimizing
for it and avoiding wasting resources and spamming logs.
-rw-r--r-- | rel/overlay/etc/default.ini | 9 | ||||
-rw-r--r-- | src/couch_replicator/src/couch_replicator_worker.erl | 7 |
2 files changed, 16 insertions, 0 deletions
diff --git a/rel/overlay/etc/default.ini b/rel/overlay/etc/default.ini index 4e61deb60..745e5a8e4 100644 --- a/rel/overlay/etc/default.ini +++ b/rel/overlay/etc/default.ini @@ -399,6 +399,15 @@ verify_ssl_certificates = false ssl_certificate_max_depth = 3 ; Maximum document ID length for replication. ;max_document_id_length = 0 +; How much time to wait before retrying after a missing doc exception. This +; exception happens if the document was seen in the changes feed, but internal +; replication hasn't caught up yet, and fetching document's revisions +; fails. This is a common scenario when the source is updated while continuous +; replication is running. The retry period would depend on how quickly internal +; replication is expected to catch up. In general this is an optimisation to +; avoid crashing the whole replication job, which would consume more resources +; and add log noise. +;missing_doc_retry_msec = 2000 [compaction_daemon] ; The delay, in seconds, between each check for which database and view indexes diff --git a/src/couch_replicator/src/couch_replicator_worker.erl b/src/couch_replicator/src/couch_replicator_worker.erl index 45ccefa10..db6b72b2e 100644 --- a/src/couch_replicator/src/couch_replicator_worker.erl +++ b/src/couch_replicator/src/couch_replicator_worker.erl @@ -31,6 +31,7 @@ -define(MAX_BULK_ATT_SIZE, 64 * 1024). -define(MAX_BULK_ATTS_PER_DOC, 8). -define(STATS_DELAY, 10000000). % 10 seconds (in microseconds) +-define(MISSING_DOC_RETRY_MSEC, 2000). -import(couch_replicator_utils, [ open_db/1, @@ -314,11 +315,17 @@ fetch_doc(Source, {Id, Revs, PAs}, DocHandler, Acc) -> couch_log:error("Retrying fetch and update of document `~s` as it is " "unexpectedly missing. 
Missing revisions are: ~s", [Id, couch_doc:revs_to_strs(Revs)]), + WaitMSec = config:get_integer("replicator", "missing_doc_retry_msec", + ?MISSING_DOC_RETRY_MSEC), + timer:sleep(WaitMSec), couch_replicator_api_wrap:open_doc_revs(Source, Id, Revs, [latest], DocHandler, Acc); throw:{missing_stub, _} -> couch_log:error("Retrying fetch and update of document `~s` due to out of " "sync attachment stubs. Missing revisions are: ~s", [Id, couch_doc:revs_to_strs(Revs)]), + WaitMSec = config:get_integer("replicator", "missing_doc_retry_msec", + ?MISSING_DOC_RETRY_MSEC), + timer:sleep(WaitMSec), couch_replicator_api_wrap:open_doc_revs(Source, Id, Revs, [latest], DocHandler, Acc) end. |