summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Will Holley <will.holley@uk.ibm.com> 2023-04-12 15:08:45 +0000
committer: Will Holley <will.holley@uk.ibm.com> 2023-04-14 11:49:33 +0000
commit: bb33142628d6de25f12ef8a6281d73c9f9965fc2 (patch)
tree: 3e3687f60b7b661a9c82dbff84ad23ec777035bc
parent: b7686a9da8ec3059ce9a8e9df1fee837ad1a1ea3 (diff)
download: couchdb-bb33142628d6de25f12ef8a6281d73c9f9965fc2.tar.gz
feat (prometheus): couch_db_updater and couch_file queue stats
# What

Adds summary metrics for couch_db_updater and couch_file, the same as returned by the `_system` endpoint.

Unlike the other message queue stats, these are returned as a Prometheus summary type across the following metrics, using `couch_db_updater` as an example:

* couchdb_erlang_message_queue_couch_db_updater{quantile="0.5"}
* couchdb_erlang_message_queue_couch_db_updater{quantile="0.9"}
* couchdb_erlang_message_queue_couch_db_updater{quantile="0.99"}
* couchdb_erlang_message_queue_couch_db_updater_sum
* couchdb_erlang_message_queue_couch_db_updater_count

The count metric represents the number of processes and the sum is the total size of all message queues for those processes.

In addition, min and max message queue sizes are returned, matching the `_system` endpoint response:

* couchdb_erlang_message_queue_couch_db_updater_min
* couchdb_erlang_message_queue_couch_db_updater_max

# How

This represents a new type of metric in the prometheus endpoint - the existing `summary` types have all been for latency histograms - so a new utility function `pid_to_prom_summary` is added to format the message queue stats into prometheus metrics series.

In `chttpd_node` I've extracted the formatting step from the `db_pid_stats` function to allow for re-use between `chttpd_node` and `couch_prometheus_server`, where the result is formatted differently. `chttpd_node` doesn't seem like the best place to put shared code like this, but there doesn't seem to be an obvious alternative place to extract it to, so I've left it for now.
-rw-r--r--  src/chttpd/src/chttpd_node.erl  11
-rw-r--r--  src/couch_prometheus/src/couch_prometheus_server.erl  33
-rw-r--r--  src/couch_prometheus/src/couch_prometheus_util.erl  1
3 files changed, 41 insertions, 4 deletions
diff --git a/src/chttpd/src/chttpd_node.erl b/src/chttpd/src/chttpd_node.erl
index bb3cf4798..ef586e174 100644
--- a/src/chttpd/src/chttpd_node.erl
+++ b/src/chttpd/src/chttpd_node.erl
@@ -287,7 +287,7 @@ get_stats() ->
{NumberOfGCs, WordsReclaimed, _} = statistics(garbage_collection),
{{input, Input}, {output, Output}} = statistics(io),
- {CF, CDU} = db_pid_stats(),
+ {CF, CDU} = db_pid_stats_formatted(),
MessageQueuesHist = [
{couch_file, {CF}},
{couch_db_updater, {CDU}}
@@ -315,6 +315,10 @@ get_stats() ->
{distribution, {get_distribution_stats()}}
].
+%% Fetches the {CouchFile, CouchDbUpdater} pair produced by db_pid_stats/0
+%% and passes each half through format_pid_stats/1, returning the two
+%% formatted results as a tuple in the same order.
+db_pid_stats_formatted() ->
+    {FileQueues, UpdaterQueues} = db_pid_stats(),
+    FormattedFiles = format_pid_stats(FileQueues),
+    FormattedUpdaters = format_pid_stats(UpdaterQueues),
+    {FormattedFiles, FormattedUpdaters}.
+
db_pid_stats() ->
{monitors, M} = process_info(whereis(couch_stats_process_tracker), monitors),
Candidates = [Pid || {process, Pid} <- M],
@@ -323,7 +327,7 @@ db_pid_stats() ->
{CouchFiles, CouchDbUpdaters}.
db_pid_stats(Mod, Candidates) ->
- Mailboxes = lists:foldl(
+ lists:foldl(
fun(Pid, Acc) ->
case process_info(Pid, [message_queue_len, dictionary]) of
undefined ->
@@ -343,8 +347,7 @@ db_pid_stats(Mod, Candidates) ->
end,
[],
Candidates
- ),
- format_pid_stats(Mailboxes).
+ ).
format_pid_stats([]) ->
[];
diff --git a/src/couch_prometheus/src/couch_prometheus_server.erl b/src/couch_prometheus/src/couch_prometheus_server.erl
index 7699c4fc4..0acc7543b 100644
--- a/src/couch_prometheus/src/couch_prometheus_server.erl
+++ b/src/couch_prometheus/src/couch_prometheus_server.erl
@@ -17,6 +17,7 @@
-import(couch_prometheus_util, [
couch_to_prom/3,
to_prom/4,
+ to_prom/2,
to_prom_summary/2
]).
@@ -110,6 +111,7 @@ get_system_stats() ->
get_uptime_stat(),
get_io_stats(),
get_message_queue_stats(),
+ get_db_pid_stats(),
get_run_queue_stats(),
get_vm_stats(),
get_ets_stats(),
@@ -220,6 +222,37 @@ get_message_queue_stats() ->
to_prom(erlang_message_queue_size, gauge, "size of message queue", QueueLenByLabel)
].
+%% Builds the Prometheus series describing message queue sizes for the
+%% couch_file and couch_db_updater processes, from the pair returned by
+%% chttpd_node:db_pid_stats/0.
+%%
+%% Returns a two-element list: the couch_file series first, then the
+%% couch_db_updater series. Each element is whatever
+%% pid_to_prom_summary/3 produces for that group ([] when the group is
+%% empty).
+get_db_pid_stats() ->
+    {CouchFileQueues, DbUpdaterQueues} = chttpd_node:db_pid_stats(),
+    Specs = [
+        {
+            "erlang_message_queue_couch_file",
+            "size of message queue across couch_file processes",
+            CouchFileQueues
+        },
+        {
+            "erlang_message_queue_couch_db_updater",
+            "size of message queue across couch_db_updater processes",
+            DbUpdaterQueues
+        }
+    ],
+    [pid_to_prom_summary(Name, Help, Queues) || {Name, Help, Queues} <- Specs].
+
+%% Formats a list of per-process values (here: message queue lengths) as a
+%% Prometheus summary plus min/max series.
+%%
+%% Metric    - metric name as a list (string); suffixes are appended with ++.
+%% Desc      - description passed through to to_prom/4.
+%% Mailboxes - list of values, one per process. An empty list yields [].
+%%
+%% For a non-empty list the result is the to_prom/4 output for the 0.5, 0.9
+%% and 0.99 quantiles, followed by the to_prom/2 results for the "_sum",
+%% "_count", "_min" and "_max" series, in that order.
+pid_to_prom_summary(_, _, []) ->
+    [];
+pid_to_prom_summary(Metric, Desc, Mailboxes) ->
+    Ordered = lists:sort(Mailboxes),
+    Total = length(Ordered),
+    %% Each quantile is the element at rank round(Total * Q). Total >= 1 in
+    %% this clause, so the rank is always within 1..Total and lists:nth/2
+    %% cannot fail (round(0.5) is 1 in Erlang).
+    Quantiles = [
+        {[{quantile, Label}], lists:nth(round(Total * Q), Ordered)}
+     || {Label, Q} <- [{<<"0.5">>, 0.5}, {<<"0.9">>, 0.9}, {<<"0.99">>, 0.99}]
+    ],
+    %% The list is sorted ascending, so its head is the minimum and its last
+    %% element is the maximum.
+    Extras = [
+        to_prom(Metric ++ [Suffix], Value)
+     || {Suffix, Value} <- [
+            {"_sum", lists:sum(Ordered)},
+            {"_count", Total},
+            {"_min", hd(Ordered)},
+            {"_max", lists:last(Ordered)}
+        ]
+    ],
+    to_prom(Metric, summary, Desc, Quantiles) ++ Extras.
+
get_run_queue_stats() ->
%% Workaround for https://bugs.erlang.org/browse/ERL-1355
{SQ, DCQ} = chttpd_node:run_queues(),
diff --git a/src/couch_prometheus/src/couch_prometheus_util.erl b/src/couch_prometheus/src/couch_prometheus_util.erl
index 5775b9693..4665ba7f9 100644
--- a/src/couch_prometheus/src/couch_prometheus_util.erl
+++ b/src/couch_prometheus/src/couch_prometheus_util.erl
@@ -16,6 +16,7 @@
couch_to_prom/3,
to_bin/1,
to_prom/4,
+ to_prom/2,
to_prom_summary/2
]).