summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorWill Holley <willholley@apache.org>2023-03-13 16:48:15 +0000
committerWill Holley <will.holley@uk.ibm.com>2023-03-20 12:31:51 +0000
commitf7a22d4cd466675986076e24dba3b6ed4db393fc (patch)
tree87fd95b8cfa702bcccbb978b044129ec6ad69c4b
parenta25336f917c68b03da826f9ef2a0665e1a842438 (diff)
downloadcouchdb-f7a22d4cd466675986076e24dba3b6ed4db393fc.tar.gz
feat: add type and descriptions to prometheus output
The `/_node/_local/_prometheus` endpoint is missing a `TYPE` annotation for `couchdb_httpd_status_codes`. In addition, it contains no `HELP` annotations, which are useful when exploring the metrics, particularly where metrics do not strictly match those returned by the `_stats` or `_system` endpoints. This PR adds the missing `TYPE` annotation and adds `HELP` annotations to all metrics. The spec for the Prometheus text format is at https://github.com/prometheus/docs/blob/main/content/docs/instrumenting/exposition_formats.md, for reference. It also adds additional spacing between the metric series, making the output easier for humans to parse. ## couch_prometheus_util:to_prom/3 `couch_prometheus_util:to_prom/3` is replaced by `couch_prometheus_util:to_prom/4`, which now expects a description alongside the metric name and type. ## couch_prometheus_util:couch_to_prom/3 `couch_prometheus_util:couch_to_prom/3` now extracts the metric description from the metric metadata returned by `couch_stats`. In some cases, where the metrics are transformed, e.g. from multiple metrics to a single metric with a tag, the description is explicitly specified to match the new metric semantics.
-rw-r--r--src/couch_prometheus/src/couch_prometheus_server.erl83
-rw-r--r--src/couch_prometheus/src/couch_prometheus_util.erl103
2 files changed, 135 insertions, 51 deletions
diff --git a/src/couch_prometheus/src/couch_prometheus_server.erl b/src/couch_prometheus/src/couch_prometheus_server.erl
index 7597c7e28..5e446a914 100644
--- a/src/couch_prometheus/src/couch_prometheus_server.erl
+++ b/src/couch_prometheus/src/couch_prometheus_server.erl
@@ -16,7 +16,7 @@
-import(couch_prometheus_util, [
couch_to_prom/3,
- to_prom/3,
+ to_prom/4,
to_prom_summary/2
]).
@@ -116,7 +116,7 @@ get_system_stats() ->
]).
get_uptime_stat() ->
- to_prom(uptime_seconds, counter, couch_app:uptime() div 1000).
+ to_prom(uptime_seconds, counter, "couchdb uptime", couch_app:uptime() div 1000).
get_vm_stats() ->
MemLabels = lists:map(
@@ -131,29 +131,70 @@ get_vm_stats() ->
ProcCount = erlang:system_info(process_count),
ProcLimit = erlang:system_info(process_limit),
[
- to_prom(erlang_memory_bytes, gauge, MemLabels),
- to_prom(erlang_gc_collections_total, counter, NumGCs),
- to_prom(erlang_gc_words_reclaimed_total, counter, WordsReclaimed),
- to_prom(erlang_context_switches_total, counter, CtxSwitches),
- to_prom(erlang_reductions_total, counter, Reds),
- to_prom(erlang_processes, gauge, ProcCount),
- to_prom(erlang_process_limit, gauge, ProcLimit)
+ to_prom(
+ erlang_memory_bytes,
+ gauge,
+ "size of memory dynamically allocated by the Erlang emulator",
+ MemLabels
+ ),
+ to_prom(
+ erlang_gc_collections_total,
+ counter,
+ "number of garbage collections by the Erlang emulator",
+ NumGCs
+ ),
+ to_prom(
+ erlang_gc_words_reclaimed_total,
+ counter,
+ "number of words reclaimed by garbage collections",
+ WordsReclaimed
+ ),
+ to_prom(
+ erlang_context_switches_total, counter, "total number of context switches", CtxSwitches
+ ),
+ to_prom(erlang_reductions_total, counter, "total number of reductions", Reds),
+ to_prom(erlang_processes, gauge, "the number of Erlang processes", ProcCount),
+ to_prom(
+ erlang_process_limit,
+ gauge,
+ "the maximum number of simultaneously existing Erlang processes",
+ ProcLimit
+ )
].
get_io_stats() ->
{{input, In}, {output, Out}} = erlang:statistics(io),
[
- to_prom(erlang_io_recv_bytes_total, counter, In),
- to_prom(erlang_io_sent_bytes_total, counter, Out)
+ to_prom(
+ erlang_io_recv_bytes_total,
+ counter,
+ "the total number of bytes received through ports",
+ In
+ ),
+ to_prom(
+ erlang_io_sent_bytes_total, counter, "the total number of bytes output to ports", Out
+ )
].
get_message_queue_stats() ->
QLenFun = fun(Name) -> message_queue_len(whereis(Name)) end,
Queues = lists:map(QLenFun, registered()),
[
- to_prom(erlang_message_queues, gauge, lists:sum(Queues)),
- to_prom(erlang_message_queue_min, gauge, lists:min(Queues)),
- to_prom(erlang_message_queue_max, gauge, lists:max(Queues))
+ to_prom(
+ erlang_message_queues, gauge, "total size of all message queues", lists:sum(Queues)
+ ),
+ to_prom(
+ erlang_message_queue_min,
+ gauge,
+ "minimum size across all message queues",
+ lists:min(Queues)
+ ),
+ to_prom(
+ erlang_message_queue_max,
+ gauge,
+ "maximum size across all message queues",
+ lists:max(Queues)
+ )
].
message_queue_len(undefined) ->
@@ -177,13 +218,18 @@ get_run_queue_stats() ->
{lists:sum(SQs), DCQ}
end,
[
- to_prom(erlang_scheduler_queues, gauge, Normal),
- to_prom(erlang_dirty_cpu_scheduler_queues, gauge, Dirty)
+ to_prom(erlang_scheduler_queues, gauge, "the total size of all normal run queues", Normal),
+ to_prom(
+ erlang_dirty_cpu_scheduler_queues,
+ gauge,
+ "the total size of all dirty CPU scheduler run queues",
+ Dirty
+ )
].
get_ets_stats() ->
NumTabs = length(ets:all()),
- to_prom(erlang_ets_table, gauge, NumTabs).
+ to_prom(erlang_ets_table, gauge, "number of ETS tables", NumTabs).
drain_refresh_messages() ->
receive
@@ -205,7 +251,8 @@ system_stats_test() ->
lists:foreach(
fun(Line) ->
?assert(is_binary(Line)),
- ?assert((starts_with(<<"couchdb_">>, Line) orelse starts_with(<<"# TYPE ">>, Line)))
+ Trimmed = string:trim(Line),
+ ?assert(starts_with(<<"couchdb_">>, Trimmed) orelse starts_with(<<"# ">>, Trimmed))
end,
get_system_stats()
).
diff --git a/src/couch_prometheus/src/couch_prometheus_util.erl b/src/couch_prometheus/src/couch_prometheus_util.erl
index 7147a6539..9e4a74e49 100644
--- a/src/couch_prometheus/src/couch_prometheus_util.erl
+++ b/src/couch_prometheus/src/couch_prometheus_util.erl
@@ -15,56 +15,78 @@
-export([
couch_to_prom/3,
to_bin/1,
- to_prom/3,
+ to_prom/4,
to_prom_summary/2
]).
-include("couch_prometheus.hrl").
couch_to_prom([couch_log, level, alert], Info, _All) ->
- to_prom(couch_log_requests_total, counter, {[{level, alert}], val(Info)});
+ to_prom(couch_log_requests_total, counter, "number of logged messages", {
+ [{level, alert}], val(Info)
+ });
couch_to_prom([couch_log, level, Level], Info, _All) ->
to_prom(couch_log_requests_total, {[{level, Level}], val(Info)});
couch_to_prom([couch_replicator, checkpoints, failure], Info, _All) ->
- to_prom(couch_replicator_checkpoints_failure_total, counter, val(Info));
+ to_prom(couch_replicator_checkpoints_failure_total, counter, desc(Info), val(Info));
couch_to_prom([couch_replicator, checkpoints, success], Info, All) ->
Total = val(Info) + val([couch_replicator, checkpoints, failure], All),
- to_prom(couch_replicator_checkpoints_total, counter, Total);
+ to_prom(couch_replicator_checkpoints_total, counter, "number of checkpoint saves", Total);
couch_to_prom([couch_replicator, responses, failure], Info, _All) ->
- to_prom(couch_replicator_responses_failure_total, counter, val(Info));
+ to_prom(couch_replicator_responses_failure_total, counter, desc(Info), val(Info));
couch_to_prom([couch_replicator, responses, success], Info, All) ->
Total = val(Info) + val([couch_replicator, responses, failure], All),
- to_prom(couch_replicator_responses_total, counter, Total);
+ to_prom(
+ couch_replicator_responses_total,
+ counter,
+ "number of HTTP responses received by the replicator",
+ Total
+ );
couch_to_prom([couch_replicator, stream_responses, failure], Info, _All) ->
- to_prom(couch_replicator_stream_responses_failure_total, counter, val(Info));
+ to_prom(couch_replicator_stream_responses_failure_total, counter, desc(Info), val(Info));
couch_to_prom([couch_replicator, stream_responses, success], Info, All) ->
Total = val(Info) + val([couch_replicator, stream_responses, failure], All),
- to_prom(couch_replicator_stream_responses_total, counter, Total);
+ to_prom(
+ couch_replicator_stream_responses_total,
+ counter,
+ "number of streaming HTTP responses received by the replicator",
+ Total
+ );
couch_to_prom([couchdb, auth_cache_hits], Info, All) ->
Total = val(Info) + val([couchdb, auth_cache_misses], All),
- to_prom(auth_cache_requests_total, counter, Total);
+ to_prom(auth_cache_requests_total, counter, "number of authentication cache requests", Total);
couch_to_prom([couchdb, auth_cache_misses], Info, _All) ->
- to_prom(auth_cache_misses_total, counter, val(Info));
+ to_prom(auth_cache_misses_total, counter, desc(Info), val(Info));
+% force a # TYPE and # HELP definition for httpd_request_methods
couch_to_prom([couchdb, httpd_request_methods, 'COPY'], Info, _All) ->
- to_prom(httpd_request_methods, counter, {[{method, 'COPY'}], val(Info)});
+ to_prom(httpd_request_methods, counter, "number of HTTP requests by method", {
+ [{method, 'COPY'}], val(Info)
+ });
couch_to_prom([couchdb, httpd_request_methods, Method], Info, _All) ->
to_prom(httpd_request_methods, {[{method, Method}], val(Info)});
+% force a # TYPE and # HELP definition for httpd_status_codes
+couch_to_prom([couchdb, httpd_status_codes, 200], Info, _All) ->
+ to_prom(httpd_status_codes, counter, "number of HTTP responses by status code", {
+ [{code, 200}], val(Info)
+ });
couch_to_prom([couchdb, httpd_status_codes, Code], Info, _All) ->
to_prom(httpd_status_codes, {[{code, Code}], val(Info)});
couch_to_prom([ddoc_cache, hit], Info, All) ->
Total = val(Info) + val([ddoc_cache, miss], All),
- to_prom(ddoc_cache_requests_total, counter, Total);
+ to_prom(ddoc_cache_requests_total, counter, "number of design doc cache requests", Total);
couch_to_prom([ddoc_cache, miss], Info, _All) ->
- to_prom(ddoc_cache_requests_failures_total, counter, val(Info));
+ to_prom(ddoc_cache_requests_failures_total, counter, desc(Info), val(Info));
couch_to_prom([ddoc_cache, recovery], Info, _All) ->
- to_prom(ddoc_cache_requests_recovery_total, counter, val(Info));
+ to_prom(ddoc_cache_requests_recovery_total, counter, desc(Info), val(Info));
couch_to_prom([fabric, read_repairs, failure], Info, _All) ->
- to_prom(fabric_read_repairs_failures_total, counter, val(Info));
+ to_prom(fabric_read_repairs_failures_total, counter, desc(Info), val(Info));
couch_to_prom([fabric, read_repairs, success], Info, All) ->
Total = val(Info) + val([fabric, read_repairs, failure], All),
- to_prom(fabric_read_repairs_total, counter, Total);
+ to_prom(fabric_read_repairs_total, counter, "number of fabric read repairs", Total);
couch_to_prom([rexi, streams, timeout, init_stream], Info, _All) ->
- to_prom(rexi_streams_timeout_total, counter, {[{stage, init_stream}], val(Info)});
+ to_prom(rexi_streams_timeout_total, counter, "number of rexi stream timeouts", {
+ [{stage, init_stream}], val(Info)
+ });
couch_to_prom([rexi_streams, timeout, Stage], Info, _All) ->
to_prom(rexi_streams_timeout_total, {[{stage, Stage}], val(Info)});
couch_to_prom([couchdb | Rest], Info, All) ->
@@ -73,15 +95,22 @@ couch_to_prom(Path, Info, _All) ->
case lists:keyfind(type, 1, Info) of
{type, counter} ->
Metric = counter_metric(Path),
- to_prom(Metric, counter, val(Info));
+ to_prom(Metric, counter, desc(Info), val(Info));
{type, gauge} ->
- to_prom(path_to_name(Path), gauge, val(Info));
+ to_prom(path_to_name(Path), gauge, desc(Info), val(Info));
{type, histogram} ->
to_prom_summary(Path, Info)
end.
-to_prom(Metric, Type, Data) ->
- TypeStr = to_bin(io_lib:format("# TYPE ~s ~s", [to_prom_name(Metric), Type])),
+type_def(Metric, Type, Desc) ->
+ Name = to_prom_name(Metric),
+ [
+ to_bin(io_lib:format("\n# HELP ~s ~s\r", [Name, Desc])),
+ to_bin(io_lib:format("# TYPE ~s ~s", [Name, Type]))
+ ].
+
+to_prom(Metric, Type, Desc, Data) ->
+ TypeStr = type_def(Metric, Type, Desc),
[TypeStr] ++ to_prom(Metric, Data).
to_prom(Metric, Instances) when is_list(Instances) ->
@@ -130,7 +159,7 @@ to_prom_summary(Path, Info) ->
SumStat = to_prom(SumMetric, Count * Mean),
CountMetric = path_to_name(Path ++ ["seconds", "count"]),
CountStat = to_prom(CountMetric, Count),
- to_prom(Metric, summary, Quantiles) ++ [SumStat, CountStat].
+ to_prom(Metric, summary, desc(Info), Quantiles) ++ [SumStat, CountStat].
to_prom_name(Metric) ->
to_bin(io_lib:format("couchdb_~s", [Metric])).
@@ -168,23 +197,31 @@ val(Key, Stats) ->
{Key, Data} = lists:keyfind(Key, 1, Stats),
val(Data).
+desc(Info) ->
+ {desc, V} = lists:keyfind(desc, 1, Info),
+ V.
+
-ifdef(TEST).
-include_lib("couch/include/couch_eunit.hrl").
to_prom_counter_test() ->
- ?assertEqual(
- <<"couchdb_ddoc_cache 10">>,
- test_to_prom_output(ddoc_cache, counter, 10)
- ),
- ?assertEqual(
- <<"couchdb_httpd_status_codes{code=\"200\"} 3">>,
- test_to_prom_output(httpd_status_codes, counter, {[{code, 200}], 3})
- ).
+ [
+ ?assertEqual(
+ <<"couchdb_ddoc_cache 10">>,
+ test_to_prom_output(ddoc_cache, counter, "size of ddoc cache", 10)
+ ),
+ ?assertEqual(
+ <<"couchdb_httpd_status_codes{code=\"200\"} 3">>,
+ test_to_prom_output(httpd_status_codes, counter, "HTTP request status by code", {
+ [{code, 200}], 3
+ })
+ )
+ ].
to_prom_gauge_test() ->
?assertEqual(
<<"couchdb_temperature_celsius 36">>,
- test_to_prom_output(temperature_celsius, gauge, 36)
+ test_to_prom_output(temperature_celsius, gauge, "temp", 36)
).
to_prom_summary_test() ->
@@ -232,8 +269,8 @@ counter_metric_test_() ->
)
].
-test_to_prom_output(Metric, Type, Val) ->
- Out = to_prom(Metric, Type, Val),
+test_to_prom_output(Metric, Type, Desc, Val) ->
+ Out = to_prom(Metric, Type, Desc, Val),
lists:nth(2, Out).
test_to_prom_summary_output(Metric, Info) ->