[fixup|_job] improve comments — prototype/fdb-replicator

author: Nick Vatamaniuc <vatamane@apache.org> 2020-09-15 14:56:18 -0400
committer: Nick Vatamaniuc <vatamane@apache.org> 2020-09-15 14:56:18 -0400
commit: 65b41ed8d9f7ffcc5fa031db6f9318bac660401e (patch)
tree: ed7fa7969ef07298f37f7a033b806e4b33828961
parent: be80e84f5ac27a787a7af14af1c1661eaaa34568 (diff)
download: couchdb-prototype/fdb-replicator.tar.gz
1 files changed, 10 insertions, 3 deletions
diff --git a/src/couch_replicator/src/couch_replicator_job.erl b/src/couch_replicator/src/couch_replicator_job.erl
index ae8aa9da3..ed3d00d7b 100644
--- a/src/couch_replicator/src/couch_replicator_job.erl
+++ b/src/couch_replicator/src/couch_replicator_job.erl
@@ -297,10 +297,11 @@ handle_info({'EXIT', Pid, Reason}, #rep_state{changes_queue=Pid} = State) ->
 handle_info({'EXIT', Pid, normal}, #rep_state{workers = Workers} = State) ->
     case Workers -- [Pid] of
         Workers ->
-            couch_log:error("unknown pid bit the dust ~p ~n", [Pid]),
+            %% Processes might be linked by replicator's auth plugins so
+            %% we tolerate them exiting `normal` here and don't crash
+            LogMsg = "~p: unknown pid exited `normal` ~p",
+            couch_log:error(LogMsg, [?MODULE, Pid]),
             {noreply, State#rep_state{workers = Workers}};
-            %% not clear why a stop was here before
-            %%{stop, {unknown_process_died, Pid, normal}, State};
         [] ->
             catch unlink(State#rep_state.changes_manager),
             catch exit(State#rep_state.changes_manager, kill),
@@ -564,11 +565,17 @@ check_ownership(#{jtx := true} = JTx, Job, JobData) ->
             case couch_replicator_jobs:get_job_data(JTx, OtherJobId) of
                 {ok, #{?STATE := S, ?DB_NAME := null}} when
                         S == ?ST_RUNNING; S == ?ST_PENDING ->
+                    % Conflicting job is a transient job, not associated with a
+                    % _replicator doc, so we let this job retry. This is also
+                    % partly done for compatibility with previous replicator
+                    % behavior.
                     Error = <<"Duplicate job running: ", OtherJobId/binary>>,
                     reschedule_on_error(JTx, Job, JobData, Error),
                     not_owner;
                 {ok, #{?STATE := S, ?DB_NAME := <<_/binary>>}} when
                         S == ?ST_RUNNING; S == ?ST_PENDING ->
+                    % Conflicting job is a permanent replication job, so this
+                    % job is marked as failed.
                     Error = <<"Duplicate job running: ", OtherJobId/binary>>,
                     fail_job(JTx, Job, JobData, Error),
                     not_owner;
author	Nick Vatamaniuc <vatamane@apache.org>	2020-09-15 14:56:18 -0400
committer	Nick Vatamaniuc <vatamane@apache.org>	2020-09-15 14:56:18 -0400
commit	65b41ed8d9f7ffcc5fa031db6f9318bac660401e (patch)
tree	ed7fa7969ef07298f37f7a033b806e4b33828961
parent	be80e84f5ac27a787a7af14af1c1661eaaa34568 (diff)
download	couchdb-prototype/fdb-replicator.tar.gz