diff options
author | Will Holley <willholley@apache.org> | 2023-03-13 16:48:15 +0000 |
---|---|---|
committer | Will Holley <will.holley@uk.ibm.com> | 2023-03-20 12:31:51 +0000 |
commit | f7a22d4cd466675986076e24dba3b6ed4db393fc (patch) | |
tree | 87fd95b8cfa702bcccbb978b044129ec6ad69c4b | |
parent | a25336f917c68b03da826f9ef2a0665e1a842438 (diff) | |
download | couchdb-f7a22d4cd466675986076e24dba3b6ed4db393fc.tar.gz |
feat: add type and descriptions to prometheus output
The `/_node/_local/_prometheus` endpoint is missing a `TYPE` annotation for
`couchdb_httpd_status_codes`.
In addition, it contains no `HELP` annotations, which
are useful when exploring the metrics, particularly where
metrics do not strictly match those returned by the `_stats` or
`_system` endpoints.
This PR adds the missing `TYPE` annotation and adds `HELP` annotations
to all metrics.
The spec for the prometheus text format is at
https://github.com/prometheus/docs/blob/main/content/docs/instrumenting/exposition_formats.md,
for reference.
It also adds additional spacing between the metrics series, making it
easier for humans to parse.
## couch_prometheus_util:to_prom/3
`couch_prometheus_util:to_prom/3` is replaced by `couch_prometheus_util:to_prom/4`,
which now expects a description alongside the metric name and type.
## couch_prometheus_util:couch_to_prom/3
`couch_prometheus_util:couch_to_prom/3` now extracts the metrics
description from the metric metadata returned by `couch_stats`.
In some cases, where the metrics are transformed e.g. from multiple
metrics to a single metric with a tag, the description is explicitly
specified to match the new metric semantics.
-rw-r--r-- | src/couch_prometheus/src/couch_prometheus_server.erl | 83 | ||||
-rw-r--r-- | src/couch_prometheus/src/couch_prometheus_util.erl | 103 |
2 files changed, 135 insertions, 51 deletions
diff --git a/src/couch_prometheus/src/couch_prometheus_server.erl b/src/couch_prometheus/src/couch_prometheus_server.erl index 7597c7e28..5e446a914 100644 --- a/src/couch_prometheus/src/couch_prometheus_server.erl +++ b/src/couch_prometheus/src/couch_prometheus_server.erl @@ -16,7 +16,7 @@ -import(couch_prometheus_util, [ couch_to_prom/3, - to_prom/3, + to_prom/4, to_prom_summary/2 ]). @@ -116,7 +116,7 @@ get_system_stats() -> ]). get_uptime_stat() -> - to_prom(uptime_seconds, counter, couch_app:uptime() div 1000). + to_prom(uptime_seconds, counter, "couchdb uptime", couch_app:uptime() div 1000). get_vm_stats() -> MemLabels = lists:map( @@ -131,29 +131,70 @@ get_vm_stats() -> ProcCount = erlang:system_info(process_count), ProcLimit = erlang:system_info(process_limit), [ - to_prom(erlang_memory_bytes, gauge, MemLabels), - to_prom(erlang_gc_collections_total, counter, NumGCs), - to_prom(erlang_gc_words_reclaimed_total, counter, WordsReclaimed), - to_prom(erlang_context_switches_total, counter, CtxSwitches), - to_prom(erlang_reductions_total, counter, Reds), - to_prom(erlang_processes, gauge, ProcCount), - to_prom(erlang_process_limit, gauge, ProcLimit) + to_prom( + erlang_memory_bytes, + gauge, + "size of memory dynamically allocated by the Erlang emulator", + MemLabels + ), + to_prom( + erlang_gc_collections_total, + counter, + "number of garbage collections by the Erlang emulator", + NumGCs + ), + to_prom( + erlang_gc_words_reclaimed_total, + counter, + "number of words reclaimed by garbage collections", + WordsReclaimed + ), + to_prom( + erlang_context_switches_total, counter, "total number of context switches", CtxSwitches + ), + to_prom(erlang_reductions_total, counter, "total number of reductions", Reds), + to_prom(erlang_processes, gauge, "the number of Erlang processes", ProcCount), + to_prom( + erlang_process_limit, + gauge, + "the maximum number of simultaneously existing Erlang processes", + ProcLimit + ) ]. 
get_io_stats() -> {{input, In}, {output, Out}} = erlang:statistics(io), [ - to_prom(erlang_io_recv_bytes_total, counter, In), - to_prom(erlang_io_sent_bytes_total, counter, Out) + to_prom( + erlang_io_recv_bytes_total, + counter, + "the total number of bytes received through ports", + In + ), + to_prom( + erlang_io_sent_bytes_total, counter, "the total number of bytes output to ports", Out + ) ]. get_message_queue_stats() -> QLenFun = fun(Name) -> message_queue_len(whereis(Name)) end, Queues = lists:map(QLenFun, registered()), [ - to_prom(erlang_message_queues, gauge, lists:sum(Queues)), - to_prom(erlang_message_queue_min, gauge, lists:min(Queues)), - to_prom(erlang_message_queue_max, gauge, lists:max(Queues)) + to_prom( + erlang_message_queues, gauge, "total size of all message queues", lists:sum(Queues) + ), + to_prom( + erlang_message_queue_min, + gauge, + "minimum size across all message queues", + lists:min(Queues) + ), + to_prom( + erlang_message_queue_max, + gauge, + "maximum size across all message queues", + lists:max(Queues) + ) ]. message_queue_len(undefined) -> @@ -177,13 +218,18 @@ get_run_queue_stats() -> {lists:sum(SQs), DCQ} end, [ - to_prom(erlang_scheduler_queues, gauge, Normal), - to_prom(erlang_dirty_cpu_scheduler_queues, gauge, Dirty) + to_prom(erlang_scheduler_queues, gauge, "the total size of all normal run queues", Normal), + to_prom( + erlang_dirty_cpu_scheduler_queues, + gauge, + "the total size of all dirty CPU scheduler run queues", + Dirty + ) ]. get_ets_stats() -> NumTabs = length(ets:all()), - to_prom(erlang_ets_table, gauge, NumTabs). + to_prom(erlang_ets_table, gauge, "number of ETS tables", NumTabs). 
drain_refresh_messages() -> receive @@ -205,7 +251,8 @@ system_stats_test() -> lists:foreach( fun(Line) -> ?assert(is_binary(Line)), - ?assert((starts_with(<<"couchdb_">>, Line) orelse starts_with(<<"# TYPE ">>, Line))) + Trimmed = string:trim(Line), + ?assert(starts_with(<<"couchdb_">>, Trimmed) orelse starts_with(<<"# ">>, Trimmed)) end, get_system_stats() ). diff --git a/src/couch_prometheus/src/couch_prometheus_util.erl b/src/couch_prometheus/src/couch_prometheus_util.erl index 7147a6539..9e4a74e49 100644 --- a/src/couch_prometheus/src/couch_prometheus_util.erl +++ b/src/couch_prometheus/src/couch_prometheus_util.erl @@ -15,56 +15,78 @@ -export([ couch_to_prom/3, to_bin/1, - to_prom/3, + to_prom/4, to_prom_summary/2 ]). -include("couch_prometheus.hrl"). couch_to_prom([couch_log, level, alert], Info, _All) -> - to_prom(couch_log_requests_total, counter, {[{level, alert}], val(Info)}); + to_prom(couch_log_requests_total, counter, "number of logged messages", { + [{level, alert}], val(Info) + }); couch_to_prom([couch_log, level, Level], Info, _All) -> to_prom(couch_log_requests_total, {[{level, Level}], val(Info)}); couch_to_prom([couch_replicator, checkpoints, failure], Info, _All) -> - to_prom(couch_replicator_checkpoints_failure_total, counter, val(Info)); + to_prom(couch_replicator_checkpoints_failure_total, counter, desc(Info), val(Info)); couch_to_prom([couch_replicator, checkpoints, success], Info, All) -> Total = val(Info) + val([couch_replicator, checkpoints, failure], All), - to_prom(couch_replicator_checkpoints_total, counter, Total); + to_prom(couch_replicator_checkpoints_total, counter, "number of checkpoint saves", Total); couch_to_prom([couch_replicator, responses, failure], Info, _All) -> - to_prom(couch_replicator_responses_failure_total, counter, val(Info)); + to_prom(couch_replicator_responses_failure_total, counter, desc(Info), val(Info)); couch_to_prom([couch_replicator, responses, success], Info, All) -> Total = val(Info) + 
val([couch_replicator, responses, failure], All), - to_prom(couch_replicator_responses_total, counter, Total); + to_prom( + couch_replicator_responses_total, + counter, + "number of HTTP responses received by the replicator", + Total + ); couch_to_prom([couch_replicator, stream_responses, failure], Info, _All) -> - to_prom(couch_replicator_stream_responses_failure_total, counter, val(Info)); + to_prom(couch_replicator_stream_responses_failure_total, counter, desc(Info), val(Info)); couch_to_prom([couch_replicator, stream_responses, success], Info, All) -> Total = val(Info) + val([couch_replicator, stream_responses, failure], All), - to_prom(couch_replicator_stream_responses_total, counter, Total); + to_prom( + couch_replicator_stream_responses_total, + counter, + "number of streaming HTTP responses received by the replicator", + Total + ); couch_to_prom([couchdb, auth_cache_hits], Info, All) -> Total = val(Info) + val([couchdb, auth_cache_misses], All), - to_prom(auth_cache_requests_total, counter, Total); + to_prom(auth_cache_requests_total, counter, "number of authentication cache requests", Total); couch_to_prom([couchdb, auth_cache_misses], Info, _All) -> - to_prom(auth_cache_misses_total, counter, val(Info)); + to_prom(auth_cache_misses_total, counter, desc(Info), val(Info)); +% force a # TYPE and # HELP definition for httpd_request_methods couch_to_prom([couchdb, httpd_request_methods, 'COPY'], Info, _All) -> - to_prom(httpd_request_methods, counter, {[{method, 'COPY'}], val(Info)}); + to_prom(httpd_request_methods, counter, "number of HTTP requests by method", { + [{method, 'COPY'}], val(Info) + }); couch_to_prom([couchdb, httpd_request_methods, Method], Info, _All) -> to_prom(httpd_request_methods, {[{method, Method}], val(Info)}); +% force a # TYPE and # HELP definition for httpd_status_codes +couch_to_prom([couchdb, httpd_status_codes, 200], Info, _All) -> + to_prom(httpd_status_codes, counter, "number of HTTP responses by status code", { + [{code, 200}], 
val(Info) + }); couch_to_prom([couchdb, httpd_status_codes, Code], Info, _All) -> to_prom(httpd_status_codes, {[{code, Code}], val(Info)}); couch_to_prom([ddoc_cache, hit], Info, All) -> Total = val(Info) + val([ddoc_cache, miss], All), - to_prom(ddoc_cache_requests_total, counter, Total); + to_prom(ddoc_cache_requests_total, counter, "number of design doc cache requests", Total); couch_to_prom([ddoc_cache, miss], Info, _All) -> - to_prom(ddoc_cache_requests_failures_total, counter, val(Info)); + to_prom(ddoc_cache_requests_failures_total, counter, desc(Info), val(Info)); couch_to_prom([ddoc_cache, recovery], Info, _All) -> - to_prom(ddoc_cache_requests_recovery_total, counter, val(Info)); + to_prom(ddoc_cache_requests_recovery_total, counter, desc(Info), val(Info)); couch_to_prom([fabric, read_repairs, failure], Info, _All) -> - to_prom(fabric_read_repairs_failures_total, counter, val(Info)); + to_prom(fabric_read_repairs_failures_total, counter, desc(Info), val(Info)); couch_to_prom([fabric, read_repairs, success], Info, All) -> Total = val(Info) + val([fabric, read_repairs, failure], All), - to_prom(fabric_read_repairs_total, counter, Total); + to_prom(fabric_read_repairs_total, counter, "number of fabric read repairs", Total); couch_to_prom([rexi, streams, timeout, init_stream], Info, _All) -> - to_prom(rexi_streams_timeout_total, counter, {[{stage, init_stream}], val(Info)}); + to_prom(rexi_streams_timeout_total, counter, "number of rexi stream timeouts", { + [{stage, init_stream}], val(Info) + }); couch_to_prom([rexi_streams, timeout, Stage], Info, _All) -> to_prom(rexi_streams_timeout_total, {[{stage, Stage}], val(Info)}); couch_to_prom([couchdb | Rest], Info, All) -> @@ -73,15 +95,22 @@ couch_to_prom(Path, Info, _All) -> case lists:keyfind(type, 1, Info) of {type, counter} -> Metric = counter_metric(Path), - to_prom(Metric, counter, val(Info)); + to_prom(Metric, counter, desc(Info), val(Info)); {type, gauge} -> - to_prom(path_to_name(Path), gauge, 
val(Info)); + to_prom(path_to_name(Path), gauge, desc(Info), val(Info)); {type, histogram} -> to_prom_summary(Path, Info) end. -to_prom(Metric, Type, Data) -> - TypeStr = to_bin(io_lib:format("# TYPE ~s ~s", [to_prom_name(Metric), Type])), +type_def(Metric, Type, Desc) -> + Name = to_prom_name(Metric), + [ + to_bin(io_lib:format("\n# HELP ~s ~s\r", [Name, Desc])), + to_bin(io_lib:format("# TYPE ~s ~s", [Name, Type])) + ]. + +to_prom(Metric, Type, Desc, Data) -> + TypeStr = type_def(Metric, Type, Desc), [TypeStr] ++ to_prom(Metric, Data). to_prom(Metric, Instances) when is_list(Instances) -> @@ -130,7 +159,7 @@ to_prom_summary(Path, Info) -> SumStat = to_prom(SumMetric, Count * Mean), CountMetric = path_to_name(Path ++ ["seconds", "count"]), CountStat = to_prom(CountMetric, Count), - to_prom(Metric, summary, Quantiles) ++ [SumStat, CountStat]. + to_prom(Metric, summary, desc(Info), Quantiles) ++ [SumStat, CountStat]. to_prom_name(Metric) -> to_bin(io_lib:format("couchdb_~s", [Metric])). @@ -168,23 +197,31 @@ val(Key, Stats) -> {Key, Data} = lists:keyfind(Key, 1, Stats), val(Data). +desc(Info) -> + {desc, V} = lists:keyfind(desc, 1, Info), + V. + -ifdef(TEST). -include_lib("couch/include/couch_eunit.hrl"). to_prom_counter_test() -> - ?assertEqual( - <<"couchdb_ddoc_cache 10">>, - test_to_prom_output(ddoc_cache, counter, 10) - ), - ?assertEqual( - <<"couchdb_httpd_status_codes{code=\"200\"} 3">>, - test_to_prom_output(httpd_status_codes, counter, {[{code, 200}], 3}) - ). + [ + ?assertEqual( + <<"couchdb_ddoc_cache 10">>, + test_to_prom_output(ddoc_cache, counter, "size of ddoc cache", 10) + ), + ?assertEqual( + <<"couchdb_httpd_status_codes{code=\"200\"} 3">>, + test_to_prom_output(httpd_status_codes, counter, "HTTP request status by code", { + [{code, 200}], 3 + }) + ) + ]. to_prom_gauge_test() -> ?assertEqual( <<"couchdb_temperature_celsius 36">>, - test_to_prom_output(temperature_celsius, gauge, 36) + test_to_prom_output(temperature_celsius, gauge, "temp", 36) ). 
to_prom_summary_test() -> @@ -232,8 +269,8 @@ counter_metric_test_() -> ) ]. -test_to_prom_output(Metric, Type, Val) -> - Out = to_prom(Metric, Type, Val), +test_to_prom_output(Metric, Type, Desc, Val) -> + Out = to_prom(Metric, Type, Desc, Val), lists:nth(2, Out). test_to_prom_summary_output(Metric, Info) -> |