feat: add type and descriptions to prometheus output

The `/_node/_local/_prometheus` is a missing `TYPE` annotation for `couchdb_httpd_status_codes`. In addition, it contains no `HELP` annotations, which are useful when exploring the metrics, particularly where metrics do not strictly match those returned by the `_stats` or `_system` endpoints. This PR adds the missing `TYPE` annotation and adds `HELP` annotations to all metrics. The spec for the prometheus text format is at https://github.com/prometheus/docs/blob/main/content/docs/instrumenting/exposition_formats.md, for reference. It also adds additional spacing between the metrics series, making it easier for humans to parse. ## couch_prometheus_util:to_prom/3 `couch_prometheus_util:to_prom/3` is replaced by `couch_prometheus_util:to_prom/4`. which now expects a description alongside the metric name and type. ## couch_prometheus_util:couch_to_prom/3 `couch_prometheus_util:couch_to_prom/3` now extracts the metrics description from the metric metadata returned by `couch_stats`. In some cases, where the metrics are transformed e.g. from multiple metrics to a single metric with a tag, the description is explicitly specified to match the new metric semantics.
author: Will Holley <willholley@apache.org> 2023-03-13 16:48:15 +0000
committer: Will Holley <will.holley@uk.ibm.com> 2023-03-20 12:31:51 +0000
commit: f7a22d4cd466675986076e24dba3b6ed4db393fc (patch)
tree: 87fd95b8cfa702bcccbb978b044129ec6ad69c4b
parent: a25336f917c68b03da826f9ef2a0665e1a842438 (diff)
download: couchdb-f7a22d4cd466675986076e24dba3b6ed4db393fc.tar.gz
2 files changed, 135 insertions, 51 deletions
diff --git a/src/couch_prometheus/src/couch_prometheus_server.erl b/src/couch_prometheus/src/couch_prometheus_server.erl
index 7597c7e28..5e446a914 100644
--- a/src/couch_prometheus/src/couch_prometheus_server.erl
+++ b/src/couch_prometheus/src/couch_prometheus_server.erl
@@ -16,7 +16,7 @@
 
 -import(couch_prometheus_util, [
     couch_to_prom/3,
-    to_prom/3,
+    to_prom/4,
     to_prom_summary/2
 ]).
 
@@ -116,7 +116,7 @@ get_system_stats() ->
     ]).
 
 get_uptime_stat() ->
-    to_prom(uptime_seconds, counter, couch_app:uptime() div 1000).
+    to_prom(uptime_seconds, counter, "couchdb uptime", couch_app:uptime() div 1000).
 
 get_vm_stats() ->
     MemLabels = lists:map(
@@ -131,29 +131,70 @@ get_vm_stats() ->
     ProcCount = erlang:system_info(process_count),
     ProcLimit = erlang:system_info(process_limit),
     [
-        to_prom(erlang_memory_bytes, gauge, MemLabels),
-        to_prom(erlang_gc_collections_total, counter, NumGCs),
-        to_prom(erlang_gc_words_reclaimed_total, counter, WordsReclaimed),
-        to_prom(erlang_context_switches_total, counter, CtxSwitches),
-        to_prom(erlang_reductions_total, counter, Reds),
-        to_prom(erlang_processes, gauge, ProcCount),
-        to_prom(erlang_process_limit, gauge, ProcLimit)
+        to_prom(
+            erlang_memory_bytes,
+            gauge,
+            "size of memory dynamically allocated by the Erlang emulator",
+            MemLabels
+        ),
+        to_prom(
+            erlang_gc_collections_total,
+            counter,
+            "number of garbage collections by the Erlang emulator",
+            NumGCs
+        ),
+        to_prom(
+            erlang_gc_words_reclaimed_total,
+            counter,
+            "number of words reclaimed by garbage collections",
+            WordsReclaimed
+        ),
+        to_prom(
+            erlang_context_switches_total, counter, "total number of context switches", CtxSwitches
+        ),
+        to_prom(erlang_reductions_total, counter, "total number of reductions", Reds),
+        to_prom(erlang_processes, gauge, "the number of Erlang processes", ProcCount),
+        to_prom(
+            erlang_process_limit,
+            gauge,
+            "the maximum number of simultaneously existing Erlang processes",
+            ProcLimit
+        )
     ].
 
 get_io_stats() ->
     {{input, In}, {output, Out}} = erlang:statistics(io),
     [
-        to_prom(erlang_io_recv_bytes_total, counter, In),
-        to_prom(erlang_io_sent_bytes_total, counter, Out)
+        to_prom(
+            erlang_io_recv_bytes_total,
+            counter,
+            "the total number of bytes received through ports",
+            In
+        ),
+        to_prom(
+            erlang_io_sent_bytes_total, counter, "the total number of bytes output to ports", Out
+        )
     ].
 
 get_message_queue_stats() ->
     QLenFun = fun(Name) -> message_queue_len(whereis(Name)) end,
     Queues = lists:map(QLenFun, registered()),
     [
-        to_prom(erlang_message_queues, gauge, lists:sum(Queues)),
-        to_prom(erlang_message_queue_min, gauge, lists:min(Queues)),
-        to_prom(erlang_message_queue_max, gauge, lists:max(Queues))
+        to_prom(
+            erlang_message_queues, gauge, "total size of all message queues", lists:sum(Queues)
+        ),
+        to_prom(
+            erlang_message_queue_min,
+            gauge,
+            "minimum size across all message queues",
+            lists:min(Queues)
+        ),
+        to_prom(
+            erlang_message_queue_max,
+            gauge,
+            "maximum size across all message queues",
+            lists:max(Queues)
+        )
     ].
 
 message_queue_len(undefined) ->
@@ -177,13 +218,18 @@ get_run_queue_stats() ->
                 {lists:sum(SQs), DCQ}
         end,
     [
-        to_prom(erlang_scheduler_queues, gauge, Normal),
-        to_prom(erlang_dirty_cpu_scheduler_queues, gauge, Dirty)
+        to_prom(erlang_scheduler_queues, gauge, "the total size of all normal run queues", Normal),
+        to_prom(
+            erlang_dirty_cpu_scheduler_queues,
+            gauge,
+            "the total size of all dirty CPU scheduler run queues",
+            Dirty
+        )
     ].
 
 get_ets_stats() ->
     NumTabs = length(ets:all()),
-    to_prom(erlang_ets_table, gauge, NumTabs).
+    to_prom(erlang_ets_table, gauge, "number of ETS tables", NumTabs).
 
 drain_refresh_messages() ->
     receive
@@ -205,7 +251,8 @@ system_stats_test() ->
     lists:foreach(
         fun(Line) ->
             ?assert(is_binary(Line)),
-            ?assert((starts_with(<<"couchdb_">>, Line) orelse starts_with(<<"# TYPE ">>, Line)))
+            Trimmed = string:trim(Line),
+            ?assert(starts_with(<<"couchdb_">>, Trimmed) orelse starts_with(<<"# ">>, Trimmed))
         end,
         get_system_stats()
     ).
diff --git a/src/couch_prometheus/src/couch_prometheus_util.erl b/src/couch_prometheus/src/couch_prometheus_util.erl
index 7147a6539..9e4a74e49 100644
--- a/src/couch_prometheus/src/couch_prometheus_util.erl
+++ b/src/couch_prometheus/src/couch_prometheus_util.erl
@@ -15,56 +15,78 @@
 -export([
     couch_to_prom/3,
     to_bin/1,
-    to_prom/3,
+    to_prom/4,
     to_prom_summary/2
 ]).
 
 -include("couch_prometheus.hrl").
 
 couch_to_prom([couch_log, level, alert], Info, _All) ->
-    to_prom(couch_log_requests_total, counter, {[{level, alert}], val(Info)});
+    to_prom(couch_log_requests_total, counter, "number of logged messages", {
+        [{level, alert}], val(Info)
+    });
 couch_to_prom([couch_log, level, Level], Info, _All) ->
     to_prom(couch_log_requests_total, {[{level, Level}], val(Info)});
 couch_to_prom([couch_replicator, checkpoints, failure], Info, _All) ->
-    to_prom(couch_replicator_checkpoints_failure_total, counter, val(Info));
+    to_prom(couch_replicator_checkpoints_failure_total, counter, desc(Info), val(Info));
 couch_to_prom([couch_replicator, checkpoints, success], Info, All) ->
     Total = val(Info) + val([couch_replicator, checkpoints, failure], All),
-    to_prom(couch_replicator_checkpoints_total, counter, Total);
+    to_prom(couch_replicator_checkpoints_total, counter, "number of checkpoint saves", Total);
 couch_to_prom([couch_replicator, responses, failure], Info, _All) ->
-    to_prom(couch_replicator_responses_failure_total, counter, val(Info));
+    to_prom(couch_replicator_responses_failure_total, counter, desc(Info), val(Info));
 couch_to_prom([couch_replicator, responses, success], Info, All) ->
     Total = val(Info) + val([couch_replicator, responses, failure], All),
-    to_prom(couch_replicator_responses_total, counter, Total);
+    to_prom(
+        couch_replicator_responses_total,
+        counter,
+        "number of HTTP responses received by the replicator",
+        Total
+    );
 couch_to_prom([couch_replicator, stream_responses, failure], Info, _All) ->
-    to_prom(couch_replicator_stream_responses_failure_total, counter, val(Info));
+    to_prom(couch_replicator_stream_responses_failure_total, counter, desc(Info), val(Info));
 couch_to_prom([couch_replicator, stream_responses, success], Info, All) ->
     Total = val(Info) + val([couch_replicator, stream_responses, failure], All),
-    to_prom(couch_replicator_stream_responses_total, counter, Total);
+    to_prom(
+        couch_replicator_stream_responses_total,
+        counter,
+        "number of streaming HTTP responses received by the replicator",
+        Total
+    );
 couch_to_prom([couchdb, auth_cache_hits], Info, All) ->
     Total = val(Info) + val([couchdb, auth_cache_misses], All),
-    to_prom(auth_cache_requests_total, counter, Total);
+    to_prom(auth_cache_requests_total, counter, "number of authentication cache requests", Total);
 couch_to_prom([couchdb, auth_cache_misses], Info, _All) ->
-    to_prom(auth_cache_misses_total, counter, val(Info));
+    to_prom(auth_cache_misses_total, counter, desc(Info), val(Info));
+% force a # TYPE and # HELP definition for httpd_request_methods
 couch_to_prom([couchdb, httpd_request_methods, 'COPY'], Info, _All) ->
-    to_prom(httpd_request_methods, counter, {[{method, 'COPY'}], val(Info)});
+    to_prom(httpd_request_methods, counter, "number of HTTP requests by method", {
+        [{method, 'COPY'}], val(Info)
+    });
 couch_to_prom([couchdb, httpd_request_methods, Method], Info, _All) ->
     to_prom(httpd_request_methods, {[{method, Method}], val(Info)});
+% force a # TYPE and # HELP definition for httpd_status_codes
+couch_to_prom([couchdb, httpd_status_codes, 200], Info, _All) ->
+    to_prom(httpd_status_codes, counter, "number of HTTP responses by status code", {
+        [{code, 200}], val(Info)
+    });
 couch_to_prom([couchdb, httpd_status_codes, Code], Info, _All) ->
     to_prom(httpd_status_codes, {[{code, Code}], val(Info)});
 couch_to_prom([ddoc_cache, hit], Info, All) ->
     Total = val(Info) + val([ddoc_cache, miss], All),
-    to_prom(ddoc_cache_requests_total, counter, Total);
+    to_prom(ddoc_cache_requests_total, counter, "number of design doc cache requests", Total);
 couch_to_prom([ddoc_cache, miss], Info, _All) ->
-    to_prom(ddoc_cache_requests_failures_total, counter, val(Info));
+    to_prom(ddoc_cache_requests_failures_total, counter, desc(Info), val(Info));
 couch_to_prom([ddoc_cache, recovery], Info, _All) ->
-    to_prom(ddoc_cache_requests_recovery_total, counter, val(Info));
+    to_prom(ddoc_cache_requests_recovery_total, counter, desc(Info), val(Info));
 couch_to_prom([fabric, read_repairs, failure], Info, _All) ->
-    to_prom(fabric_read_repairs_failures_total, counter, val(Info));
+    to_prom(fabric_read_repairs_failures_total, counter, desc(Info), val(Info));
 couch_to_prom([fabric, read_repairs, success], Info, All) ->
     Total = val(Info) + val([fabric, read_repairs, failure], All),
-    to_prom(fabric_read_repairs_total, counter, Total);
+    to_prom(fabric_read_repairs_total, counter, "number of fabric read repairs", Total);
 couch_to_prom([rexi, streams, timeout, init_stream], Info, _All) ->
-    to_prom(rexi_streams_timeout_total, counter, {[{stage, init_stream}], val(Info)});
+    to_prom(rexi_streams_timeout_total, counter, "number of rexi stream timeouts", {
+        [{stage, init_stream}], val(Info)
+    });
 couch_to_prom([rexi_streams, timeout, Stage], Info, _All) ->
     to_prom(rexi_streams_timeout_total, {[{stage, Stage}], val(Info)});
 couch_to_prom([couchdb | Rest], Info, All) ->
@@ -73,15 +95,22 @@ couch_to_prom(Path, Info, _All) ->
     case lists:keyfind(type, 1, Info) of
         {type, counter} ->
             Metric = counter_metric(Path),
-            to_prom(Metric, counter, val(Info));
+            to_prom(Metric, counter, desc(Info), val(Info));
         {type, gauge} ->
-            to_prom(path_to_name(Path), gauge, val(Info));
+            to_prom(path_to_name(Path), gauge, desc(Info), val(Info));
         {type, histogram} ->
             to_prom_summary(Path, Info)
     end.
 
-to_prom(Metric, Type, Data) ->
-    TypeStr = to_bin(io_lib:format("# TYPE ~s ~s", [to_prom_name(Metric), Type])),
+type_def(Metric, Type, Desc) ->
+    Name = to_prom_name(Metric),
+    [
+        to_bin(io_lib:format("\n# HELP ~s ~s\r", [Name, Desc])),
+        to_bin(io_lib:format("# TYPE ~s ~s", [Name, Type]))
+    ].
+
+to_prom(Metric, Type, Desc, Data) ->
+    TypeStr = type_def(Metric, Type, Desc),
     [TypeStr] ++ to_prom(Metric, Data).
 
 to_prom(Metric, Instances) when is_list(Instances) ->
@@ -130,7 +159,7 @@ to_prom_summary(Path, Info) ->
     SumStat = to_prom(SumMetric, Count * Mean),
     CountMetric = path_to_name(Path ++ ["seconds", "count"]),
     CountStat = to_prom(CountMetric, Count),
-    to_prom(Metric, summary, Quantiles) ++ [SumStat, CountStat].
+    to_prom(Metric, summary, desc(Info), Quantiles) ++ [SumStat, CountStat].
 
 to_prom_name(Metric) ->
     to_bin(io_lib:format("couchdb_~s", [Metric])).
@@ -168,23 +197,31 @@ val(Key, Stats) ->
     {Key, Data} = lists:keyfind(Key, 1, Stats),
     val(Data).
 
+desc(Info) ->
+    {desc, V} = lists:keyfind(desc, 1, Info),
+    V.
+
 -ifdef(TEST).
 -include_lib("couch/include/couch_eunit.hrl").
 
 to_prom_counter_test() ->
-    ?assertEqual(
-        <<"couchdb_ddoc_cache 10">>,
-        test_to_prom_output(ddoc_cache, counter, 10)
-    ),
-    ?assertEqual(
-        <<"couchdb_httpd_status_codes{code=\"200\"} 3">>,
-        test_to_prom_output(httpd_status_codes, counter, {[{code, 200}], 3})
-    ).
+    [
+        ?assertEqual(
+            <<"couchdb_ddoc_cache 10">>,
+            test_to_prom_output(ddoc_cache, counter, "size of ddoc cache", 10)
+        ),
+        ?assertEqual(
+            <<"couchdb_httpd_status_codes{code=\"200\"} 3">>,
+            test_to_prom_output(httpd_status_codes, counter, "HTTP request status by code", {
+                [{code, 200}], 3
+            })
+        )
+    ].
 
 to_prom_gauge_test() ->
     ?assertEqual(
         <<"couchdb_temperature_celsius 36">>,
-        test_to_prom_output(temperature_celsius, gauge, 36)
+        test_to_prom_output(temperature_celsius, gauge, "temp", 36)
     ).
 
 to_prom_summary_test() ->
@@ -232,8 +269,8 @@ counter_metric_test_() ->
         )
     ].
 
-test_to_prom_output(Metric, Type, Val) ->
-    Out = to_prom(Metric, Type, Val),
+test_to_prom_output(Metric, Type, Desc, Val) ->
+    Out = to_prom(Metric, Type, Desc, Val),
     lists:nth(2, Out).
 
 test_to_prom_summary_output(Metric, Info) ->
author	Will Holley <willholley@apache.org>	2023-03-13 16:48:15 +0000
committer	Will Holley <will.holley@uk.ibm.com>	2023-03-20 12:31:51 +0000
commit	f7a22d4cd466675986076e24dba3b6ed4db393fc (patch)
tree	87fd95b8cfa702bcccbb978b044129ec6ad69c4b
parent	a25336f917c68b03da826f9ef2a0665e1a842438 (diff)
download	couchdb-f7a22d4cd466675986076e24dba3b6ed4db393fc.tar.gz