summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNick Vatamaniuc <vatamane@apache.org>2017-09-28 13:19:52 -0400
committerNick Vatamaniuc <nickva@users.noreply.github.com>2017-09-28 15:58:47 -0400
commit2684561e38378ee084d25ce09634c6ef783ea5ac (patch)
tree319df48afebb678e94604e4df3d4d131a17390ce
parent30fcd7bb9fc570136269db0c9750e718847e1640 (diff)
downloadcouchdb-2684561e38378ee084d25ce09634c6ef783ea5ac.tar.gz
Clean up replicator logs
Previously replicator was unnesessarily verbose during crashes. This commit reduces the vorbosity and make the error messages more helpful. Most of the replication failures happen in the startup phase when both target and source are opened. That's a good place to handle common errors, and there were a few already handled (db not found, lack of authorization). This commit adds another other common one - inability to resolve endpoint host names. This covers cases were there user mistypes the host name or there is a DNS issue. Also during the startup phase, if an error occurs a stacktrace was logged in addition to the whole state of the #rep{} record. Most of the rep record and the stack are not that useful compared to how much noise it generates. So instead, log only a few relevant fields from #rep{} and only the top 2 stack frames. Combined with dns lookup failure this change results in almost a 4x (2KB vs 500B) reduction in log noise while providing better debugging information. One last source of excessive log noise the dumping of the full replicator job state during crashes. This included both the #rep and the #rep_state records. Those have a lot of redundnat information, and since they are dumped as tuples, it was hard to use and find the values of each individual field. In this case `format_status/2` was improved to dump only a selected set of field along with their names. This results in another 3x reduction in log noise.
-rw-r--r--src/couch_replicator/src/couch_replicator_scheduler_job.erl122
1 files changed, 116 insertions, 6 deletions
diff --git a/src/couch_replicator/src/couch_replicator_scheduler_job.erl b/src/couch_replicator/src/couch_replicator_scheduler_job.erl
index 08a24886c..bc7438757 100644
--- a/src/couch_replicator/src/couch_replicator_scheduler_job.erl
+++ b/src/couch_replicator/src/couch_replicator_scheduler_job.erl
@@ -363,10 +363,11 @@ handle_info(timeout, InitArgs) ->
{stop, {shutdown, max_backoff}, {error, InitArgs}};
Class:Error ->
ShutdownReason = {error, replication_start_error(Error)},
+ StackTop2 = lists:sublist(erlang:get_stacktrace(), 2),
% Shutdown state is a hack as it is not really the state of the
% gen_server (it failed to initialize, so it doesn't have one).
% Shutdown state is used to pass extra info about why start failed.
- ShutdownState = {error, Class, erlang:get_stacktrace(), InitArgs},
+ ShutdownState = {error, Class, StackTop2, InitArgs},
{stop, {shutdown, ShutdownReason}, ShutdownState}
end.
@@ -399,11 +400,20 @@ terminate({shutdown, max_backoff}, {error, InitArgs}) ->
couch_replicator_notifier:notify({error, RepId, max_backoff});
terminate({shutdown, {error, Error}}, {error, Class, Stack, InitArgs}) ->
- #rep{id=RepId} = InitArgs,
+ #rep{
+ id = {BaseId, Ext} = RepId,
+ source = Source0,
+ target = Target0,
+ doc_id = DocId,
+ db_name = DbName
+ } = InitArgs,
+ Source = couch_replicator_api_wrap:db_uri(Source0),
+ Target = couch_replicator_api_wrap:db_uri(Target0),
+ RepIdStr = BaseId ++ Ext,
+ Msg = "~p:~p: Replication ~s failed to start ~p -> ~p doc ~p:~p stack:~p",
+ couch_log:error(Msg, [Class, Error, RepIdStr, Source, Target, DbName,
+ DocId, Stack]),
couch_stats:increment_counter([couch_replicator, failed_starts]),
- CleanInitArgs = rep_strip_creds(InitArgs),
- couch_log:error("~p:~p: Replication failed to start for args ~p: ~p",
- [Class, Error, CleanInitArgs, Stack]),
couch_replicator_notifier:notify({error, RepId, Error});
terminate({shutdown, max_backoff}, State) ->
@@ -441,7 +451,38 @@ code_change(_OldVsn, #rep_state{}=State, _Extra) ->
format_status(_Opt, [_PDict, State]) ->
- [{data, [{"State", state_strip_creds(State)}]}].
+ #rep_state{
+ source = Source,
+ target = Target,
+ rep_details = RepDetails,
+ start_seq = StartSeq,
+ source_seq = SourceSeq,
+ committed_seq = CommitedSeq,
+ current_through_seq = ThroughSeq,
+ highest_seq_done = HighestSeqDone,
+ session_id = SessionId
+ } = state_strip_creds(State),
+ #rep{
+ id = RepId,
+ options = Options,
+ type = Type,
+ doc_id = DocId,
+ db_name = DbName
+ } = RepDetails,
+ [
+ {rep_id, RepId},
+ {source, couch_replicator_api_wrap:db_uri(Source)},
+ {target, couch_replicator_api_wrap:db_uri(Target)},
+ {db_name, DbName},
+ {doc_id, DocId},
+ {options, Options},
+ {session_id, SessionId},
+ {start_seq, StartSeq},
+ {source_seq, SourceSeq},
+ {committed_seq, CommitedSeq},
+ {current_through_seq, ThroughSeq},
+ {highest_seq_done, HighestSeqDone}
+ ].
startup_jitter() ->
@@ -989,5 +1030,74 @@ replication_start_error({unauthorized, DbUri}) ->
{unauthorized, <<"unauthorized to access or create database ", DbUri/binary>>};
replication_start_error({db_not_found, DbUri}) ->
{db_not_found, <<"could not open ", DbUri/binary>>};
+replication_start_error({http_request_failed, _Method, Url0,
+ {error, {error, {conn_failed, {error, nxdomain}}}}}) ->
+ Url = ?l2b(couch_util:url_strip_password(Url0)),
+ {nxdomain, <<"could not resolve ", Url/binary>>};
+replication_start_error({http_request_failed, Method0, Url0,
+ {error, {code, Code}}}) when is_integer(Code) ->
+ Url = ?l2b(couch_util:url_strip_password(Url0)),
+ Method = ?l2b(Method0),
+ {http_error_code, Code, <<Method/binary, " ", Url/binary>>};
replication_start_error(Error) ->
Error.
+
+
+
+-ifdef(TEST).
+
+-include_lib("eunit/include/eunit.hrl").
+
+
+replication_start_error_test() ->
+ ?assertEqual({unauthorized, <<"unauthorized to access or create database"
+ " http://x/y">>}, replication_start_error({unauthorized,
+ <<"http://x/y">>})),
+ ?assertEqual({db_not_found, <<"could not open http://x/y">>},
+ replication_start_error({db_not_found, <<"http://x/y">>})),
+ ?assertEqual({nxdomain,<<"could not resolve http://x/y">>},
+ replication_start_error({http_request_failed, "GET", "http://x/y",
+ {error, {error, {conn_failed, {error, nxdomain}}}}})),
+ ?assertEqual({http_error_code,503,<<"GET http://x/y">>},
+ replication_start_error({http_request_failed, "GET", "http://x/y",
+ {error, {code, 503}}})).
+
+
+scheduler_job_format_status_test() ->
+ Source = <<"http://u:p@h1/d1">>,
+ Target = <<"http://u:p@h2/d2">>,
+ Rep = #rep{
+ id = {"base", "+ext"},
+ source = couch_replicator_docs:parse_rep_db(Source, [], []),
+ target = couch_replicator_docs:parse_rep_db(Target, [], []),
+ options = [{create_target, true}],
+ doc_id = <<"mydoc">>,
+ db_name = <<"mydb">>
+ },
+ State = #rep_state{
+ rep_details = Rep,
+ source = Rep#rep.source,
+ target = Rep#rep.target,
+ session_id = <<"a">>,
+ start_seq = <<"1">>,
+ source_seq = <<"2">>,
+ committed_seq = <<"3">>,
+ current_through_seq = <<"4">>,
+ highest_seq_done = <<"5">>
+ },
+ Format = format_status(opts_ignored, [pdict, State]),
+ ?assertEqual("http://u:*****@h1/d1/", proplists:get_value(source, Format)),
+ ?assertEqual("http://u:*****@h2/d2/", proplists:get_value(target, Format)),
+ ?assertEqual({"base", "+ext"}, proplists:get_value(rep_id, Format)),
+ ?assertEqual([{create_target, true}], proplists:get_value(options, Format)),
+ ?assertEqual(<<"mydoc">>, proplists:get_value(doc_id, Format)),
+ ?assertEqual(<<"mydb">>, proplists:get_value(db_name, Format)),
+ ?assertEqual(<<"a">>, proplists:get_value(session_id, Format)),
+ ?assertEqual(<<"1">>, proplists:get_value(start_seq, Format)),
+ ?assertEqual(<<"2">>, proplists:get_value(source_seq, Format)),
+ ?assertEqual(<<"3">>, proplists:get_value(committed_seq, Format)),
+ ?assertEqual(<<"4">>, proplists:get_value(current_through_seq, Format)),
+ ?assertEqual(<<"5">>, proplists:get_value(highest_seq_done, Format)).
+
+
+-endif.