summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNick Vatamaniuc <vatamane@apache.org>2017-10-30 14:41:23 -0400
committerNick Vatamaniuc <nickva@users.noreply.github.com>2017-10-31 15:45:32 -0400
commit40b9f85f0be775fe5508f12332130f2695262595 (patch)
treee09535ead3923295d92ec7bc6ac3bf313807fbda
parent0ce05d26a565188aa4e2b0181788b244fe385e01 (diff)
downloadcouchdb-40b9f85f0be775fe5508f12332130f2695262595.tar.gz
Configurable delay before retrying on missing_doc error
Implement a configurable delay before retrying a document fetch in the replicator. missing_doc exceptions usually happen when a continuous replication is set up and the source is updated. The change might appear in the changes feed, but when a worker tries to fetch the document's revisions it talks to a node where internal replication hasn't caught up, and so it throws an exception. Previously the delay was hard-coded at 0 (that is, retrying was immediate). The replication would still make progress, but only after crashing, retrying, and generating a lot of unnecessary log noise. Since updating a source while continuous replication is running is a common scenario, it's worth optimizing for it to avoid wasting resources and spamming logs.
-rw-r--r--rel/overlay/etc/default.ini9
-rw-r--r--src/couch_replicator/src/couch_replicator_worker.erl7
2 files changed, 16 insertions, 0 deletions
diff --git a/rel/overlay/etc/default.ini b/rel/overlay/etc/default.ini
index 4e61deb60..745e5a8e4 100644
--- a/rel/overlay/etc/default.ini
+++ b/rel/overlay/etc/default.ini
@@ -399,6 +399,15 @@ verify_ssl_certificates = false
ssl_certificate_max_depth = 3
; Maximum document ID length for replication.
;max_document_id_length = 0
+; How much time to wait before retrying after a missing doc exception. This
+; exception happens if the document was seen in the changes feed, but internal
+; replication hasn't caught up yet, and fetching document's revisions
+; fails. This is a common scenario when source is updated while continuous
+; replication is running. The retry period would depend on how quickly internal
+; replication is expected to catch up. In general this is an optimisation to
+; avoid crashing the whole replication job, which would consume more resources
+; and add log noise.
+;missing_doc_retry_msec = 2000
[compaction_daemon]
; The delay, in seconds, between each check for which database and view indexes
diff --git a/src/couch_replicator/src/couch_replicator_worker.erl b/src/couch_replicator/src/couch_replicator_worker.erl
index 45ccefa10..db6b72b2e 100644
--- a/src/couch_replicator/src/couch_replicator_worker.erl
+++ b/src/couch_replicator/src/couch_replicator_worker.erl
@@ -31,6 +31,7 @@
-define(MAX_BULK_ATT_SIZE, 64 * 1024).
-define(MAX_BULK_ATTS_PER_DOC, 8).
-define(STATS_DELAY, 10000000). % 10 seconds (in microseconds)
+-define(MISSING_DOC_RETRY_MSEC, 2000).
-import(couch_replicator_utils, [
open_db/1,
@@ -314,11 +315,17 @@ fetch_doc(Source, {Id, Revs, PAs}, DocHandler, Acc) ->
couch_log:error("Retrying fetch and update of document `~s` as it is "
"unexpectedly missing. Missing revisions are: ~s",
[Id, couch_doc:revs_to_strs(Revs)]),
+ WaitMSec = config:get_integer("replicator", "missing_doc_retry_msec",
+ ?MISSING_DOC_RETRY_MSEC),
+ timer:sleep(WaitMSec),
couch_replicator_api_wrap:open_doc_revs(Source, Id, Revs, [latest], DocHandler, Acc);
throw:{missing_stub, _} ->
couch_log:error("Retrying fetch and update of document `~s` due to out of "
"sync attachment stubs. Missing revisions are: ~s",
[Id, couch_doc:revs_to_strs(Revs)]),
+ WaitMSec = config:get_integer("replicator", "missing_doc_retry_msec",
+ ?MISSING_DOC_RETRY_MSEC),
+ timer:sleep(WaitMSec),
couch_replicator_api_wrap:open_doc_revs(Source, Id, Revs, [latest], DocHandler, Acc)
end.