author | Nick Vatamaniuc <vatamane@apache.org> | 2018-04-10 10:31:53 -0400
---|---|---
committer | Nick Vatamaniuc <nickva@users.noreply.github.com> | 2018-04-12 16:54:44 -0400
commit | b0f673fb51bf521f96729499e939e29f0c58fe8c (patch) |
tree | e3c626ce03135a5faac60254cf1048b823cbf66d |
parent | 3d1eecb576cc26d17d23ab1658d0d0932580a63d (diff) |
download | couchdb-b0f673fb51bf521f96729499e939e29f0c58fe8c.tar.gz |
In _scheduler/docs fix `crashing` state showing as `pending` sometimes
Replication jobs are backed off based on the number of consecutive crashes:
we count the number of crashes in a row and then penalize jobs with an
exponential wait based on that number. Once a job runs without crashing for 2
minutes, we consider it healthy and stop going back through its history
looking for crashes.
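To illustrate the counting described above, here is a minimal sketch, not the
actual couch_replicator_scheduler implementation; the newest-first
{Event, TimestampSec} history shape and the 5-second base interval are
assumptions made for illustration only:

```erlang
-module(backoff_sketch).
-export([consecutive_crashes/2, penalty_sec/1]).

%% Count crashes from the newest event backwards, stopping as soon as a
%% run lasted at least HealthThresholdSec (i.e. the job was "healthy").
consecutive_crashes(History, HealthThresholdSec) ->
    count(History, HealthThresholdSec, 0).

count([{{crashed, _Reason}, CrashedT}, {started, StartedT} | Rest], Threshold, N)
        when CrashedT - StartedT < Threshold ->
    %% The job crashed before reaching the health threshold; count this
    %% crash and keep scanning further back in the history.
    count(Rest, Threshold, N + 1);
count(_History, _Threshold, N) ->
    %% History exhausted, or the newest run was long enough to be healthy.
    N.

%% Exponential penalty: each extra consecutive crash doubles the wait.
penalty_sec(0) -> 0;
penalty_sec(CrashCount) when CrashCount > 0 -> 5 * (1 bsl CrashCount).
```

For example, with a 120-second threshold,
`consecutive_crashes([{{crashed, e}, 4}, {started, 3}, {{crashed, e}, 2}, {started, 1}], 120)`
returns 2, mirroring the `t_job_summary_crashing_many_times` test in the diff
below.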
Previously a job's state was set to `crashing` only if there were consecutive
errors. So a job could have run for 3 minutes, then the user deletes the
source database, and the job crashes and stops. Until it ran again, its state
would have been shown as `pending`. For internal accounting purposes that's
correct, but it is confusing for the user, because the last event in the
job's history is a crash.
This commit makes sure that if the last event in a job's history is a crash,
the user will see the job as `crashing`, along with the respective crash
reason. The scheduling algorithm itself didn't change.
Fixes #1276
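Distilled from the diff below, a simplified sketch of the new state decision,
restructured here as function clauses with `crash_reason_json/1` and the
surrounding record handling elided:

```erlang
%% A Pid-less job with zero consecutive crashes is now reported as
%% `crashing` when the newest history event is a crash, instead of
%% falling through to `pending`.
state_and_info(Pid, _ErrorCount, _History) when is_pid(Pid) ->
    {running, null};
state_and_info(undefined, 0, [{{crashed, Error}, _When} | _]) ->
    {crashing, Error};  % the fix: newest event is a crash
state_and_info(undefined, 0, [_ | _]) ->
    {pending, null};
state_and_info(undefined, ErrorCount, [{{crashed, Error}, _When} | _])
        when ErrorCount > 0 ->
    {crashing, Error}.
```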
-rw-r--r-- | src/couch_replicator/src/couch_replicator_scheduler.erl | 82 |
1 file changed, 78 insertions, 4 deletions
diff --git a/src/couch_replicator/src/couch_replicator_scheduler.erl b/src/couch_replicator/src/couch_replicator_scheduler.erl
index 0b396346a..50896c548 100644
--- a/src/couch_replicator/src/couch_replicator_scheduler.erl
+++ b/src/couch_replicator/src/couch_replicator_scheduler.erl
@@ -138,11 +138,15 @@ job_summary(JobId, HealthThreshold) ->
             ErrorCount = consecutive_crashes(History, HealthThreshold),
             {State, Info} = case {Pid, ErrorCount} of
                 {undefined, 0} ->
-                    {pending, null};
+                    case History of
+                        [{{crashed, Error}, _When} | _] ->
+                            {crashing, crash_reason_json(Error)};
+                        [_ | _] ->
+                            {pending, null}
+                    end;
                 {undefined, ErrorCount} when ErrorCount > 0 ->
                     [{{crashed, Error}, _When} | _] = History,
-                    ErrMsg = couch_replicator_utils:rep_error_to_binary(Error),
-                    {crashing, ErrMsg};
+                    {crashing, crash_reason_json(Error)};
                 {Pid, ErrorCount} when is_pid(Pid) ->
                     {running, null}
             end,
@@ -1021,7 +1025,11 @@ scheduler_test_() ->
             t_oneshot_will_hog_the_scheduler(),
             t_if_excess_is_trimmed_rotation_doesnt_happen(),
             t_if_transient_job_crashes_it_gets_removed(),
-            t_if_permanent_job_crashes_it_stays_in_ets()
+            t_if_permanent_job_crashes_it_stays_in_ets(),
+            t_job_summary_running(),
+            t_job_summary_pending(),
+            t_job_summary_crashing_once(),
+            t_job_summary_crashing_many_times()
         ]
     }.
 
@@ -1300,6 +1308,72 @@ t_if_permanent_job_crashes_it_stays_in_ets() ->
     end).
 
 
+t_job_summary_running() ->
+    ?_test(begin
+        Job = #job{
+            id = job1,
+            pid = mock_pid(),
+            history = [added()],
+            rep = #rep{
+                db_name = <<"db1">>,
+                source = <<"s">>,
+                target = <<"t">>
+            }
+        },
+        setup_jobs([Job]),
+        Summary = job_summary(job1, ?DEFAULT_HEALTH_THRESHOLD_SEC),
+        ?assertEqual(running, proplists:get_value(state, Summary)),
+        ?assertEqual(null, proplists:get_value(info, Summary)),
+        ?assertEqual(0, proplists:get_value(error_count, Summary))
+    end).
+
+
+t_job_summary_pending() ->
+    ?_test(begin
+        Job = #job{
+            id = job1,
+            pid = undefined,
+            history = [stopped(20), started(10), added()],
+            rep = #rep{source = <<"s">>, target = <<"t">>}
+        },
+        setup_jobs([Job]),
+        Summary = job_summary(job1, ?DEFAULT_HEALTH_THRESHOLD_SEC),
+        ?assertEqual(pending, proplists:get_value(state, Summary)),
+        ?assertEqual(null, proplists:get_value(info, Summary)),
+        ?assertEqual(0, proplists:get_value(error_count, Summary))
+    end).
+
+
+t_job_summary_crashing_once() ->
+    ?_test(begin
+        Job = #job{
+            id = job1,
+            history = [crashed(?DEFAULT_HEALTH_THRESHOLD_SEC + 1), started(0)],
+            rep = #rep{source = <<"s">>, target = <<"t">>}
+        },
+        setup_jobs([Job]),
+        Summary = job_summary(job1, ?DEFAULT_HEALTH_THRESHOLD_SEC),
+        ?assertEqual(crashing, proplists:get_value(state, Summary)),
+        ?assertEqual(<<"some_reason">>, proplists:get_value(info, Summary)),
+        ?assertEqual(0, proplists:get_value(error_count, Summary))
+    end).
+
+
+t_job_summary_crashing_many_times() ->
+    ?_test(begin
+        Job = #job{
+            id = job1,
+            history = [crashed(4), started(3), crashed(2), started(1)],
+            rep = #rep{source = <<"s">>, target = <<"t">>}
+        },
+        setup_jobs([Job]),
+        Summary = job_summary(job1, ?DEFAULT_HEALTH_THRESHOLD_SEC),
+        ?assertEqual(crashing, proplists:get_value(state, Summary)),
+        ?assertEqual(<<"some_reason">>, proplists:get_value(info, Summary)),
+        ?assertEqual(2, proplists:get_value(error_count, Summary))
+    end).
+
+
 % Test helper functions
 
 setup() ->