diff options
author | Will Holley <will.holley@uk.ibm.com> | 2023-04-12 15:08:45 +0000 |
---|---|---|
committer | Will Holley <will.holley@uk.ibm.com> | 2023-04-14 11:49:33 +0000 |
commit | bb33142628d6de25f12ef8a6281d73c9f9965fc2 (patch) | |
tree | 3e3687f60b7b661a9c82dbff84ad23ec777035bc | |
parent | b7686a9da8ec3059ce9a8e9df1fee837ad1a1ea3 (diff) | |
download | couchdb-bb33142628d6de25f12ef8a6281d73c9f9965fc2.tar.gz |
feat (prometheus): couch_db_updater and couch_file queue stats
# What
Adds summary metrics for couch_db_updater and couch_file, the same as
returned by the `_system` endpoint.
Unlike the other message queue stats, these are returned as a Prometheus
summary type across the following metrics, using `couch_db_updater` as
an example:
* couchdb_erlang_message_queue_couch_db_updater{quantile="0.5"}
* couchdb_erlang_message_queue_couch_db_updater{quantile="0.9"}
* couchdb_erlang_message_queue_couch_db_updater{quantile="0.99"}
* couchdb_erlang_message_queue_couch_db_updater_sum
* couchdb_erlang_message_queue_couch_db_updater_count
The count metric represents the number of processes and the sum is the
total size of all message queues for those processes.
In addition, min and max message queue sizes are returned, matching
the `_system` endpoint response:
* couchdb_erlang_message_queue_couch_db_updater_min
* couchdb_erlang_message_queue_couch_db_updater_max
# How
This introduces a new kind of metric in the Prometheus endpoint - the
existing `summary` types have all been for latency histograms - so
a new utility function `pid_to_prom_summary` is added to format the
message queue stats into Prometheus metric series.
In `chttpd_node` I've extracted the formatting step from the `db_pid_stats`
function to allow for re-use between `chttpd_node` and
`couch_prometheus_server`, where the result is formatted differently.
`chttpd_node` doesn't seem like the best place to put shared code like
this, but there doesn't seem to be an obvious alternative place to
extract it to, so I've left it for now.
-rw-r--r-- | src/chttpd/src/chttpd_node.erl | 11 | ||||
-rw-r--r-- | src/couch_prometheus/src/couch_prometheus_server.erl | 33 | ||||
-rw-r--r-- | src/couch_prometheus/src/couch_prometheus_util.erl | 1 |
3 files changed, 41 insertions, 4 deletions
diff --git a/src/chttpd/src/chttpd_node.erl b/src/chttpd/src/chttpd_node.erl index bb3cf4798..ef586e174 100644 --- a/src/chttpd/src/chttpd_node.erl +++ b/src/chttpd/src/chttpd_node.erl @@ -287,7 +287,7 @@ get_stats() -> {NumberOfGCs, WordsReclaimed, _} = statistics(garbage_collection), {{input, Input}, {output, Output}} = statistics(io), - {CF, CDU} = db_pid_stats(), + {CF, CDU} = db_pid_stats_formatted(), MessageQueuesHist = [ {couch_file, {CF}}, {couch_db_updater, {CDU}} @@ -315,6 +315,10 @@ get_stats() -> {distribution, {get_distribution_stats()}} ]. +db_pid_stats_formatted() -> + {CF, CDU} = db_pid_stats(), + {format_pid_stats(CF), format_pid_stats(CDU)}. + db_pid_stats() -> {monitors, M} = process_info(whereis(couch_stats_process_tracker), monitors), Candidates = [Pid || {process, Pid} <- M], @@ -323,7 +327,7 @@ db_pid_stats() -> {CouchFiles, CouchDbUpdaters}. db_pid_stats(Mod, Candidates) -> - Mailboxes = lists:foldl( + lists:foldl( fun(Pid, Acc) -> case process_info(Pid, [message_queue_len, dictionary]) of undefined -> @@ -343,8 +347,7 @@ db_pid_stats(Mod, Candidates) -> end, [], Candidates - ), - format_pid_stats(Mailboxes). + ). format_pid_stats([]) -> []; diff --git a/src/couch_prometheus/src/couch_prometheus_server.erl b/src/couch_prometheus/src/couch_prometheus_server.erl index 7699c4fc4..0acc7543b 100644 --- a/src/couch_prometheus/src/couch_prometheus_server.erl +++ b/src/couch_prometheus/src/couch_prometheus_server.erl @@ -17,6 +17,7 @@ -import(couch_prometheus_util, [ couch_to_prom/3, to_prom/4, + to_prom/2, to_prom_summary/2 ]). @@ -110,6 +111,7 @@ get_system_stats() -> get_uptime_stat(), get_io_stats(), get_message_queue_stats(), + get_db_pid_stats(), get_run_queue_stats(), get_vm_stats(), get_ets_stats(), @@ -220,6 +222,37 @@ get_message_queue_stats() -> to_prom(erlang_message_queue_size, gauge, "size of message queue", QueueLenByLabel) ]. 
+get_db_pid_stats() -> + {CF, CDU} = chttpd_node:db_pid_stats(), + [ + pid_to_prom_summary( + "erlang_message_queue_couch_file", + "size of message queue across couch_file processes", + CF + ), + pid_to_prom_summary( + "erlang_message_queue_couch_db_updater", + "size of message queue across couch_db_updater processes", + CDU + ) + ]. + +pid_to_prom_summary(_, _, []) -> + []; +pid_to_prom_summary(Metric, Desc, Mailboxes) -> + Sorted = lists:sort(Mailboxes), + Count = length(Sorted), + Quantiles = [ + {[{quantile, <<"0.5">>}], lists:nth(round(Count * 0.5), Sorted)}, + {[{quantile, <<"0.9">>}], lists:nth(round(Count * 0.9), Sorted)}, + {[{quantile, <<"0.99">>}], lists:nth(round(Count * 0.99), Sorted)} + ], + SumStat = to_prom(Metric ++ ["_sum"], lists:sum(Sorted)), + CountStat = to_prom(Metric ++ ["_count"], length(Sorted)), + MinStat = to_prom(Metric ++ ["_min"], hd(Sorted)), + MaxStat = to_prom(Metric ++ ["_max"], lists:last(Sorted)), + to_prom(Metric, summary, Desc, Quantiles) ++ [SumStat, CountStat, MinStat, MaxStat]. + get_run_queue_stats() -> %% Workaround for https://bugs.erlang.org/browse/ERL-1355 {SQ, DCQ} = chttpd_node:run_queues(), diff --git a/src/couch_prometheus/src/couch_prometheus_util.erl b/src/couch_prometheus/src/couch_prometheus_util.erl index 5775b9693..4665ba7f9 100644 --- a/src/couch_prometheus/src/couch_prometheus_util.erl +++ b/src/couch_prometheus/src/couch_prometheus_util.erl @@ -16,6 +16,7 @@ couch_to_prom/3, to_bin/1, to_prom/4, + to_prom/2, to_prom_summary/2 ]). |