author    Nick Vatamaniuc <vatamane@apache.org>  2018-04-10 10:31:53 -0400
committer Nick Vatamaniuc <nickva@users.noreply.github.com>  2018-04-12 16:54:44 -0400
commit    b0f673fb51bf521f96729499e939e29f0c58fe8c (patch)
tree      e3c626ce03135a5faac60254cf1048b823cbf66d
parent    3d1eecb576cc26d17d23ab1658d0d0932580a63d (diff)
download  couchdb-b0f673fb51bf521f96729499e939e29f0c58fe8c.tar.gz
In _scheduler/docs fix `crashing` state showing as `pending` sometimes
Replication jobs are backed off based on the number of consecutive crashes: we count the number of crashes in a row and penalize jobs with an exponential wait based on that number. After a job runs without crashing for 2 minutes, we consider it healthy and stop going back through its history looking for crashes.

Previously, a job's state was set to `crashing` only if there were consecutive errors. So a job could have run for 3 minutes, then the user deletes the source database, and the job crashes and stops. Until it ran again, its state would have been shown as `pending`. For internal accounting purposes that is correct, but it is confusing for the user, because the last event in the job's history is a crash.

This commit makes sure that if the last event in a job's history is a crash, the user will see the job as `crashing` with the respective crash reason. The scheduling algorithm didn't change.

Fixes #1276
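The exponential backoff itself is not part of this diff, so as a rough illustration, here is a minimal sketch of how a newest-first history of {Event, Timestamp} entries could be scanned for consecutive crashes and turned into a penalty wait. consecutive_crashes/2 is a real function in couch_replicator_scheduler.erl (it is called in the first hunk below), but the body shown here, the backoff_wait_sec/1 helper, and the wait constants are assumptions for illustration only, not the module's actual code.

%% A minimal sketch, assuming a newest-first history of {Event, Timestamp}
%% entries; not the module's actual implementation.
-module(backoff_sketch).
-export([consecutive_crashes/2, backoff_wait_sec/1]).

-define(BASE_WAIT_SEC, 30).   % hypothetical base penalty
-define(MAX_WAIT_SEC, 3600).  % hypothetical cap on the penalty

%% Count crashes in a row, newest first. A run that lasted longer than
%% HealthThreshold seconds before crashing breaks the streak, so older
%% crashes stop penalizing the job.
consecutive_crashes(History, HealthThreshold) ->
    consecutive_crashes(History, HealthThreshold, 0).

consecutive_crashes([{{crashed, _Reason}, CrashT}, {started, StartT} | Rest],
        HealthThreshold, Count) when CrashT - StartT =< HealthThreshold ->
    consecutive_crashes(Rest, HealthThreshold, Count + 1);
consecutive_crashes(_History, _HealthThreshold, Count) ->
    Count.

%% Exponential wait derived from the crash count, capped at a maximum.
backoff_wait_sec(0) ->
    0;
backoff_wait_sec(ErrorCount) when ErrorCount > 0 ->
    min(?MAX_WAIT_SEC, ?BASE_WAIT_SEC * (1 bsl (ErrorCount - 1))).

With a history like the one in t_job_summary_crashing_many_times() below this sketch counts two consecutive crashes, while the single long-lived run in t_job_summary_crashing_once() counts none, which matches the expected error_count values in those tests.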
-rw-r--r--  src/couch_replicator/src/couch_replicator_scheduler.erl | 82
1 file changed, 78 insertions(+), 4 deletions(-)
diff --git a/src/couch_replicator/src/couch_replicator_scheduler.erl b/src/couch_replicator/src/couch_replicator_scheduler.erl
index 0b396346a..50896c548 100644
--- a/src/couch_replicator/src/couch_replicator_scheduler.erl
+++ b/src/couch_replicator/src/couch_replicator_scheduler.erl
@@ -138,11 +138,15 @@ job_summary(JobId, HealthThreshold) ->
ErrorCount = consecutive_crashes(History, HealthThreshold),
{State, Info} = case {Pid, ErrorCount} of
{undefined, 0} ->
- {pending, null};
+ case History of
+ [{{crashed, Error}, _When} | _] ->
+ {crashing, crash_reason_json(Error)};
+ [_ | _] ->
+ {pending, null}
+ end;
{undefined, ErrorCount} when ErrorCount > 0 ->
[{{crashed, Error}, _When} | _] = History,
- ErrMsg = couch_replicator_utils:rep_error_to_binary(Error),
- {crashing, ErrMsg};
+ {crashing, crash_reason_json(Error)};
{Pid, ErrorCount} when is_pid(Pid) ->
{running, null}
end,
@@ -1021,7 +1025,11 @@ scheduler_test_() ->
t_oneshot_will_hog_the_scheduler(),
t_if_excess_is_trimmed_rotation_doesnt_happen(),
t_if_transient_job_crashes_it_gets_removed(),
- t_if_permanent_job_crashes_it_stays_in_ets()
+ t_if_permanent_job_crashes_it_stays_in_ets(),
+ t_job_summary_running(),
+ t_job_summary_pending(),
+ t_job_summary_crashing_once(),
+ t_job_summary_crashing_many_times()
]
}.
@@ -1300,6 +1308,72 @@ t_if_permanent_job_crashes_it_stays_in_ets() ->
end).
+t_job_summary_running() ->
+ ?_test(begin
+ Job = #job{
+ id = job1,
+ pid = mock_pid(),
+ history = [added()],
+ rep = #rep{
+ db_name = <<"db1">>,
+ source = <<"s">>,
+ target = <<"t">>
+ }
+ },
+ setup_jobs([Job]),
+ Summary = job_summary(job1, ?DEFAULT_HEALTH_THRESHOLD_SEC),
+ ?assertEqual(running, proplists:get_value(state, Summary)),
+ ?assertEqual(null, proplists:get_value(info, Summary)),
+ ?assertEqual(0, proplists:get_value(error_count, Summary))
+ end).
+
+
+t_job_summary_pending() ->
+ ?_test(begin
+ Job = #job{
+ id = job1,
+ pid = undefined,
+ history = [stopped(20), started(10), added()],
+ rep = #rep{source = <<"s">>, target = <<"t">>}
+ },
+ setup_jobs([Job]),
+ Summary = job_summary(job1, ?DEFAULT_HEALTH_THRESHOLD_SEC),
+ ?assertEqual(pending, proplists:get_value(state, Summary)),
+ ?assertEqual(null, proplists:get_value(info, Summary)),
+ ?assertEqual(0, proplists:get_value(error_count, Summary))
+ end).
+
+
+t_job_summary_crashing_once() ->
+ ?_test(begin
+ Job = #job{
+ id = job1,
+ history = [crashed(?DEFAULT_HEALTH_THRESHOLD_SEC + 1), started(0)],
+ rep = #rep{source = <<"s">>, target = <<"t">>}
+ },
+ setup_jobs([Job]),
+ Summary = job_summary(job1, ?DEFAULT_HEALTH_THRESHOLD_SEC),
+ ?assertEqual(crashing, proplists:get_value(state, Summary)),
+ ?assertEqual(<<"some_reason">>, proplists:get_value(info, Summary)),
+ ?assertEqual(0, proplists:get_value(error_count, Summary))
+ end).
+
+
+t_job_summary_crashing_many_times() ->
+ ?_test(begin
+ Job = #job{
+ id = job1,
+ history = [crashed(4), started(3), crashed(2), started(1)],
+ rep = #rep{source = <<"s">>, target = <<"t">>}
+ },
+ setup_jobs([Job]),
+ Summary = job_summary(job1, ?DEFAULT_HEALTH_THRESHOLD_SEC),
+ ?assertEqual(crashing, proplists:get_value(state, Summary)),
+ ?assertEqual(<<"some_reason">>, proplists:get_value(info, Summary)),
+ ?assertEqual(2, proplists:get_value(error_count, Summary))
+ end).
+
+
% Test helper functions
setup() ->