author    Nick Vatamaniuc <vatamane@apache.org>  2018-04-10 10:31:53 -0400
committer Nick Vatamaniuc <nickva@users.noreply.github.com>  2018-04-12 16:54:44 -0400
commit    b0f673fb51bf521f96729499e939e29f0c58fe8c (patch)
tree      e3c626ce03135a5faac60254cf1048b823cbf66d
parent    3d1eecb576cc26d17d23ab1658d0d0932580a63d (diff)
download  couchdb-b0f673fb51bf521f96729499e939e29f0c58fe8c.tar.gz
In _scheduler/docs fix `crashing` state showing as `pending` sometimes
Replication jobs are backed off based on the number of consecutive crashes: we count the number of crashes in a row and penalize jobs with an exponential wait based on that number. After a job runs without crashing for 2 minutes, we consider it healthy and stop going back through its history looking for crashes.

Previously, a job's state was set to `crashing` only if there were consecutive errors. So a job could have run for 3 minutes, then the user deletes the source database, and the job crashes and stops. Until it ran again, its state would have been shown as `pending`. For internal accounting purposes that is correct, but it is confusing for the user, because the last event in the job's history is a crash.

This commit makes sure that if the last event in a job's history is a crash, the user will see the job as `crashing` with the respective crash reason. The scheduling algorithm didn't change.

Fixes #1276
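The exponential backoff itself is not part of this diff, so as a rough illustration, here is a minimal sketch of how a newest-first history of {Event, Timestamp} entries could be scanned for consecutive crashes and turned into a penalty wait. consecutive_crashes/2 is a real function in couch_replicator_scheduler.erl (it is called in the first hunk below), but the body shown here, the backoff_wait_sec/1 helper, and the wait constants are assumptions for illustration only, not the module's actual code.

%% A minimal sketch, assuming a newest-first history of {Event, Timestamp}
%% entries; not the module's actual implementation.
-module(backoff_sketch).
-export([consecutive_crashes/2, backoff_wait_sec/1]).

-define(BASE_WAIT_SEC, 30).   % hypothetical base penalty
-define(MAX_WAIT_SEC, 3600).  % hypothetical cap on the penalty

%% Count crashes in a row, newest first. A run that lasted longer than
%% HealthThreshold seconds before crashing breaks the streak, so older
%% crashes stop penalizing the job.
consecutive_crashes(History, HealthThreshold) ->
    consecutive_crashes(History, HealthThreshold, 0).

consecutive_crashes([{{crashed, _Reason}, CrashT}, {started, StartT} | Rest],
        HealthThreshold, Count) when CrashT - StartT =< HealthThreshold ->
    consecutive_crashes(Rest, HealthThreshold, Count + 1);
consecutive_crashes(_History, _HealthThreshold, Count) ->
    Count.

%% Exponential wait derived from the crash count, capped at a maximum.
backoff_wait_sec(0) ->
    0;
backoff_wait_sec(ErrorCount) when ErrorCount > 0 ->
    min(?MAX_WAIT_SEC, ?BASE_WAIT_SEC * (1 bsl (ErrorCount - 1))).

With a history like the one in t_job_summary_crashing_many_times() below this sketch counts two consecutive crashes, while the single long-lived run in t_job_summary_crashing_once() counts none, which matches the expected error_count values in those tests.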
-rw-r--r--  src/couch_replicator/src/couch_replicator_scheduler.erl | 82
1 file changed, 78 insertions(+), 4 deletions(-)
diff --git a/src/couch_replicator/src/couch_replicator_scheduler.erl b/src/couch_replicator/src/couch_replicator_scheduler.erl
index 0b396346a..50896c548 100644
--- a/src/couch_replicator/src/couch_replicator_scheduler.erl
+++ b/src/couch_replicator/src/couch_replicator_scheduler.erl
@@ -138,11 +138,15 @@ job_summary(JobId, HealthThreshold) ->
ErrorCount = consecutive_crashes(History, HealthThreshold),
{State, Info} = case {Pid, ErrorCount} of
{undefined, 0} ->
- {pending, null};
+ case History of
+ [{{crashed, Error}, _When} | _] ->
+ {crashing, crash_reason_json(Error)};
+ [_ | _] ->
+ {pending, null}
+ end;
{undefined, ErrorCount} when ErrorCount > 0 ->
[{{crashed, Error}, _When} | _] = History,
- ErrMsg = couch_replicator_utils:rep_error_to_binary(Error),
- {crashing, ErrMsg};
+ {crashing, crash_reason_json(Error)};
{Pid, ErrorCount} when is_pid(Pid) ->
{running, null}
end,
@@ -1021,7 +1025,11 @@ scheduler_test_() ->
t_oneshot_will_hog_the_scheduler(),
t_if_excess_is_trimmed_rotation_doesnt_happen(),
t_if_transient_job_crashes_it_gets_removed(),
- t_if_permanent_job_crashes_it_stays_in_ets()
+ t_if_permanent_job_crashes_it_stays_in_ets(),
+ t_job_summary_running(),
+ t_job_summary_pending(),
+ t_job_summary_crashing_once(),
+ t_job_summary_crashing_many_times()
]
}.
@@ -1300,6 +1308,72 @@ t_if_permanent_job_crashes_it_stays_in_ets() ->
end).
+t_job_summary_running() ->
+ ?_test(begin
+ Job = #job{
+ id = job1,
+ pid = mock_pid(),
+ history = [added()],
+ rep = #rep{
+ db_name = <<"db1">>,
+ source = <<"s">>,
+ target = <<"t">>
+ }
+ },
+ setup_jobs([Job]),
+ Summary = job_summary(job1, ?DEFAULT_HEALTH_THRESHOLD_SEC),
+ ?assertEqual(running, proplists:get_value(state, Summary)),
+ ?assertEqual(null, proplists:get_value(info, Summary)),
+ ?assertEqual(0, proplists:get_value(error_count, Summary))
+ end).
+
+
+t_job_summary_pending() ->
+ ?_test(begin
+ Job = #job{
+ id = job1,
+ pid = undefined,
+ history = [stopped(20), started(10), added()],
+ rep = #rep{source = <<"s">>, target = <<"t">>}
+ },
+ setup_jobs([Job]),
+ Summary = job_summary(job1, ?DEFAULT_HEALTH_THRESHOLD_SEC),
+ ?assertEqual(pending, proplists:get_value(state, Summary)),
+ ?assertEqual(null, proplists:get_value(info, Summary)),
+ ?assertEqual(0, proplists:get_value(error_count, Summary))
+ end).
+
+
+t_job_summary_crashing_once() ->
+ ?_test(begin
+ Job = #job{
+ id = job1,
+ history = [crashed(?DEFAULT_HEALTH_THRESHOLD_SEC + 1), started(0)],
+ rep = #rep{source = <<"s">>, target = <<"t">>}
+ },
+ setup_jobs([Job]),
+ Summary = job_summary(job1, ?DEFAULT_HEALTH_THRESHOLD_SEC),
+ ?assertEqual(crashing, proplists:get_value(state, Summary)),
+ ?assertEqual(<<"some_reason">>, proplists:get_value(info, Summary)),
+ ?assertEqual(0, proplists:get_value(error_count, Summary))
+ end).
+
+
+t_job_summary_crashing_many_times() ->
+ ?_test(begin
+ Job = #job{
+ id = job1,
+ history = [crashed(4), started(3), crashed(2), started(1)],
+ rep = #rep{source = <<"s">>, target = <<"t">>}
+ },
+ setup_jobs([Job]),
+ Summary = job_summary(job1, ?DEFAULT_HEALTH_THRESHOLD_SEC),
+ ?assertEqual(crashing, proplists:get_value(state, Summary)),
+ ?assertEqual(<<"some_reason">>, proplists:get_value(info, Summary)),
+ ?assertEqual(2, proplists:get_value(error_count, Summary))
+ end).
+
+
% Test helper functions
setup() ->