diff options
author | Nick Vatamaniuc <vatamane@apache.org> | 2017-09-28 13:19:52 -0400 |
---|---|---|
committer | Nick Vatamaniuc <nickva@users.noreply.github.com> | 2017-09-28 15:58:47 -0400 |
commit | 2684561e38378ee084d25ce09634c6ef783ea5ac (patch) | |
tree | 319df48afebb678e94604e4df3d4d131a17390ce | |
parent | 30fcd7bb9fc570136269db0c9750e718847e1640 (diff) | |
download | couchdb-2684561e38378ee084d25ce09634c6ef783ea5ac.tar.gz |
Clean up replicator logs
Previously replicator was unnesessarily verbose during crashes. This commit
reduces the vorbosity and make the error messages more helpful.
Most of the replication failures happen in the startup phase when both target
and source are opened. That's a good place to handle common errors, and there
were a few already handled (db not found, lack of authorization). This commit
adds another other common one - inability to resolve endpoint host names. This
covers cases were there user mistypes the host name or there is a DNS issue.
Also during the startup phase, if an error occurs a stacktrace was logged in
addition to the whole state of the #rep{} record. Most of the rep record and
the stack are not that useful compared to how much noise it generates. So
instead, log only a few relevant fields from #rep{} and only the top 2 stack
frames. Combined with dns lookup failure this change results in almost a 4x
(2KB vs 500B) reduction in log noise while providing better debugging
information.
One last source of excessive log noise the dumping of the full replicator job
state during crashes. This included both the #rep and the #rep_state records.
Those have a lot of redundnat information, and since they are dumped as tuples,
it was hard to use and find the values of each individual field. In this case
`format_status/2` was improved to dump only a selected set of field along with
their names. This results in another 3x reduction in log noise.
-rw-r--r-- | src/couch_replicator/src/couch_replicator_scheduler_job.erl | 122 |
1 files changed, 116 insertions, 6 deletions
diff --git a/src/couch_replicator/src/couch_replicator_scheduler_job.erl b/src/couch_replicator/src/couch_replicator_scheduler_job.erl index 08a24886c..bc7438757 100644 --- a/src/couch_replicator/src/couch_replicator_scheduler_job.erl +++ b/src/couch_replicator/src/couch_replicator_scheduler_job.erl @@ -363,10 +363,11 @@ handle_info(timeout, InitArgs) -> {stop, {shutdown, max_backoff}, {error, InitArgs}}; Class:Error -> ShutdownReason = {error, replication_start_error(Error)}, + StackTop2 = lists:sublist(erlang:get_stacktrace(), 2), % Shutdown state is a hack as it is not really the state of the % gen_server (it failed to initialize, so it doesn't have one). % Shutdown state is used to pass extra info about why start failed. - ShutdownState = {error, Class, erlang:get_stacktrace(), InitArgs}, + ShutdownState = {error, Class, StackTop2, InitArgs}, {stop, {shutdown, ShutdownReason}, ShutdownState} end. @@ -399,11 +400,20 @@ terminate({shutdown, max_backoff}, {error, InitArgs}) -> couch_replicator_notifier:notify({error, RepId, max_backoff}); terminate({shutdown, {error, Error}}, {error, Class, Stack, InitArgs}) -> - #rep{id=RepId} = InitArgs, + #rep{ + id = {BaseId, Ext} = RepId, + source = Source0, + target = Target0, + doc_id = DocId, + db_name = DbName + } = InitArgs, + Source = couch_replicator_api_wrap:db_uri(Source0), + Target = couch_replicator_api_wrap:db_uri(Target0), + RepIdStr = BaseId ++ Ext, + Msg = "~p:~p: Replication ~s failed to start ~p -> ~p doc ~p:~p stack:~p", + couch_log:error(Msg, [Class, Error, RepIdStr, Source, Target, DbName, + DocId, Stack]), couch_stats:increment_counter([couch_replicator, failed_starts]), - CleanInitArgs = rep_strip_creds(InitArgs), - couch_log:error("~p:~p: Replication failed to start for args ~p: ~p", - [Class, Error, CleanInitArgs, Stack]), couch_replicator_notifier:notify({error, RepId, Error}); terminate({shutdown, max_backoff}, State) -> @@ -441,7 +451,38 @@ code_change(_OldVsn, #rep_state{}=State, _Extra) -> format_status(_Opt, [_PDict, State]) -> - [{data, [{"State", state_strip_creds(State)}]}]. + #rep_state{ + source = Source, + target = Target, + rep_details = RepDetails, + start_seq = StartSeq, + source_seq = SourceSeq, + committed_seq = CommitedSeq, + current_through_seq = ThroughSeq, + highest_seq_done = HighestSeqDone, + session_id = SessionId + } = state_strip_creds(State), + #rep{ + id = RepId, + options = Options, + type = Type, + doc_id = DocId, + db_name = DbName + } = RepDetails, + [ + {rep_id, RepId}, + {source, couch_replicator_api_wrap:db_uri(Source)}, + {target, couch_replicator_api_wrap:db_uri(Target)}, + {db_name, DbName}, + {doc_id, DocId}, + {options, Options}, + {session_id, SessionId}, + {start_seq, StartSeq}, + {source_seq, SourceSeq}, + {committed_seq, CommitedSeq}, + {current_through_seq, ThroughSeq}, + {highest_seq_done, HighestSeqDone} + ]. startup_jitter() -> @@ -989,5 +1030,74 @@ replication_start_error({unauthorized, DbUri}) -> {unauthorized, <<"unauthorized to access or create database ", DbUri/binary>>}; replication_start_error({db_not_found, DbUri}) -> {db_not_found, <<"could not open ", DbUri/binary>>}; +replication_start_error({http_request_failed, _Method, Url0, + {error, {error, {conn_failed, {error, nxdomain}}}}}) -> + Url = ?l2b(couch_util:url_strip_password(Url0)), + {nxdomain, <<"could not resolve ", Url/binary>>}; +replication_start_error({http_request_failed, Method0, Url0, + {error, {code, Code}}}) when is_integer(Code) -> + Url = ?l2b(couch_util:url_strip_password(Url0)), + Method = ?l2b(Method0), + {http_error_code, Code, <<Method/binary, " ", Url/binary>>}; replication_start_error(Error) -> Error. + + + +-ifdef(TEST). + +-include_lib("eunit/include/eunit.hrl"). + + +replication_start_error_test() -> + ?assertEqual({unauthorized, <<"unauthorized to access or create database" + " http://x/y">>}, replication_start_error({unauthorized, + <<"http://x/y">>})), + ?assertEqual({db_not_found, <<"could not open http://x/y">>}, + replication_start_error({db_not_found, <<"http://x/y">>})), + ?assertEqual({nxdomain,<<"could not resolve http://x/y">>}, + replication_start_error({http_request_failed, "GET", "http://x/y", + {error, {error, {conn_failed, {error, nxdomain}}}}})), + ?assertEqual({http_error_code,503,<<"GET http://x/y">>}, + replication_start_error({http_request_failed, "GET", "http://x/y", + {error, {code, 503}}})). + + +scheduler_job_format_status_test() -> + Source = <<"http://u:p@h1/d1">>, + Target = <<"http://u:p@h2/d2">>, + Rep = #rep{ + id = {"base", "+ext"}, + source = couch_replicator_docs:parse_rep_db(Source, [], []), + target = couch_replicator_docs:parse_rep_db(Target, [], []), + options = [{create_target, true}], + doc_id = <<"mydoc">>, + db_name = <<"mydb">> + }, + State = #rep_state{ + rep_details = Rep, + source = Rep#rep.source, + target = Rep#rep.target, + session_id = <<"a">>, + start_seq = <<"1">>, + source_seq = <<"2">>, + committed_seq = <<"3">>, + current_through_seq = <<"4">>, + highest_seq_done = <<"5">> + }, + Format = format_status(opts_ignored, [pdict, State]), + ?assertEqual("http://u:*****@h1/d1/", proplists:get_value(source, Format)), + ?assertEqual("http://u:*****@h2/d2/", proplists:get_value(target, Format)), + ?assertEqual({"base", "+ext"}, proplists:get_value(rep_id, Format)), + ?assertEqual([{create_target, true}], proplists:get_value(options, Format)), + ?assertEqual(<<"mydoc">>, proplists:get_value(doc_id, Format)), + ?assertEqual(<<"mydb">>, proplists:get_value(db_name, Format)), + ?assertEqual(<<"a">>, proplists:get_value(session_id, Format)), + ?assertEqual(<<"1">>, proplists:get_value(start_seq, Format)), + ?assertEqual(<<"2">>, proplists:get_value(source_seq, Format)), + ?assertEqual(<<"3">>, proplists:get_value(committed_seq, Format)), + ?assertEqual(<<"4">>, proplists:get_value(current_through_seq, Format)), + ?assertEqual(<<"5">>, proplists:get_value(highest_seq_done, Format)). + + +-endif. |