summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNick Vatamaniuc <vatamane@apache.org>2020-06-09 17:53:56 -0400
committerNick Vatamaniuc <nickva@users.noreply.github.com>2020-06-10 01:05:41 -0400
commita7803fb2023b72b684f8d2f1198363b9d6723400 (patch)
treeae274a76de4bc3c9327244776d7f0be6bb57f80e
parent887d740ede15a7e2aafee899e3adff8024a7f9ef (diff)
downloadcouchdb-a7803fb2023b72b684f8d2f1198363b9d6723400.tar.gz
In replicator, when rescheduling, pick only pending jobs which are not running
Previously, when pending jobs were picked in the `ets:foldl` traversal, both running and non-running jobs were considered and a large number of running jobs could displace pending jobs in the accumulator. In the worst case, no crashed jobs would be restarted during rescheduling.
-rw-r--r--src/couch_replicator/src/couch_replicator_scheduler.erl16
1 files changed, 16 insertions, 0 deletions
diff --git a/src/couch_replicator/src/couch_replicator_scheduler.erl b/src/couch_replicator/src/couch_replicator_scheduler.erl
index 53c040e8c..641443a7c 100644
--- a/src/couch_replicator/src/couch_replicator_scheduler.erl
+++ b/src/couch_replicator/src/couch_replicator_scheduler.erl
@@ -456,6 +456,9 @@ pending_jobs(Count) when is_integer(Count), Count > 0 ->
[Job || {_Started, Job} <- gb_sets:to_list(Set1)].
+pending_fold(#job{pid = Pid}, Acc) when is_pid(Pid) ->
+ Acc;
+
pending_fold(Job, {Set, Now, Count, HealthThreshold}) ->
Set1 = case {not_recently_crashed(Job, Now, HealthThreshold),
gb_sets:size(Set) >= Count} of
@@ -1051,6 +1054,7 @@ scheduler_test_() ->
[
t_pending_jobs_simple(),
t_pending_jobs_skip_crashed(),
+ t_pending_jobs_skip_running(),
t_one_job_starts(),
t_no_jobs_start_if_max_is_0(),
t_one_job_starts_if_max_is_1(),
@@ -1112,6 +1116,18 @@ t_pending_jobs_skip_crashed() ->
end).
+t_pending_jobs_skip_running() ->
+ ?_test(begin
+ Job1 = continuous(1),
+ Job2 = continuous_running(2),
+ Job3 = oneshot(3),
+ Job4 = oneshot_running(4),
+ Jobs = [Job1, Job2, Job3, Job4],
+ setup_jobs(Jobs),
+ ?assertEqual([Job1, Job3], pending_jobs(4))
+ end).
+
+
t_one_job_starts() ->
?_test(begin
setup_jobs([oneshot(1)]),