summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorWill Holley <will.holley@uk.ibm.com>2023-04-12 18:10:26 +0000
committerWill Holley <will.holley@uk.ibm.com>2023-04-14 14:31:58 +0000
commit16a350b9764343fabbffa22abcbe1ccd54096fe7 (patch)
tree51c2f58a54fec7797d4ccd9bf61614750ad041be
parentbb33142628d6de25f12ef8a6281d73c9f9965fc2 (diff)
downloadcouchdb-16a350b9764343fabbffa22abcbe1ccd54096fe7.tar.gz
feat (prometheus): add Erlang distribution stats
# Why The _prometheus endpoint was missing the erlang distribution stats returned by the _system endpoint. This is useful when diagnosing networking issues between couchdb nodes. # How Adds a new function `couch_prometheus_server:get_distribution_stats/0`. This gathers the distribution stats in a similar fashion to `chttpd_node:get_distribution_stats/0` but formats them in a more prometheus-friendly way. Naming convention follows prometheus standards, so the type of the value is appended to the metric name and, where counter types are used, a "_total" suffix is added. For example: ``` couchdb_erlang_distribution_recv_oct_bytes_total{node="node2@127.0.0.1"} 30609 couchdb_erlang_distribution_recv_oct_bytes_total{node="node3@127.0.0.1"} 28392 ```
-rw-r--r--src/couch_prometheus/src/couch_prometheus_server.erl137
1 files changed, 136 insertions, 1 deletions
diff --git a/src/couch_prometheus/src/couch_prometheus_server.erl b/src/couch_prometheus/src/couch_prometheus_server.erl
index 0acc7543b..d40efc702 100644
--- a/src/couch_prometheus/src/couch_prometheus_server.erl
+++ b/src/couch_prometheus/src/couch_prometheus_server.erl
@@ -116,7 +116,8 @@ get_system_stats() ->
get_vm_stats(),
get_ets_stats(),
get_internal_replication_jobs_stat(),
- get_membership_stat()
+ get_membership_stat(),
+ get_distribution_stats()
]).
get_uptime_stat() ->
@@ -266,6 +267,140 @@ get_run_queue_stats() ->
)
].
+% Folds the socket statistics of one distribution connection into MapAcc.
+%
+% inet:getstat/1 returns a list of {StatOption, Value} pairs for a single
+% socket. This function inverts that shape: the result is a map keyed on
+% the stat option, where each value is a list of {Labels, Value} samples
+% with the node name as the only label, e.g.
+% #{
+%     recv_oct => [{[{node, 'node2@127.0.0.1'}], 30609}],
+%     recv_cnt => [{[{node, 'node2@127.0.0.1'}], 123}],
+%     ...
+% }
+% Samples are appended, so repeated calls (one per node) keep the nodes
+% in the order they were folded.
+%
+% Where the socket stats cannot be fetched - inet:getstat/1 returns
+% {error, Reason} or raises - MapAcc is returned unchanged, i.e. no
+% sample is reported for the specified node.
+-spec get_sock_stats({Node, Socket}, MapAcc) ->
+    #{OptionValue := [{[{node, Node}], Value}]}
+when
+    Node :: node(),
+    Socket :: inet:socket(),
+    OptionValue :: inet:stat_option(),
+    Value :: integer(),
+    MapAcc :: #{OptionValue := [{[{node, Node}], Value}]}.
+get_sock_stats({Node, Socket}, MapAcc) ->
+    try inet:getstat(Socket) of
+        {ok, Stats} ->
+            lists:foldl(
+                fun({StatOption, Value}, Map0) ->
+                    Sample = {[{node, Node}], Value},
+                    % update_with/4 (rather than /3) so that a stat option
+                    % missing from the accumulator starts a new list
+                    % instead of raising {badkey, StatOption}; exceptions
+                    % in this branch are not covered by the catch below.
+                    maps:update_with(
+                        StatOption,
+                        fun(Samples) -> Samples ++ [Sample] end,
+                        [Sample],
+                        Map0
+                    )
+                end,
+                MapAcc,
+                Stats
+            );
+        {error, _Reason} ->
+            % e.g. {error, einval} for a socket that has already been
+            % closed. Without this clause the unmatched result would
+            % surface as a try_clause error, which the catch section
+            % does not intercept.
+            MapAcc
+    catch
+        _:_ ->
+            % inet:getstat/1 itself raised (presumably because Socket is
+            % not something inet can query): no result for this node.
+            MapAcc
+    end.
+
+get_distribution_stats() ->
+    % inet:getstat/1 groups its output per socket (one list of
+    % {StatOption, Value} per connection). Each distribution stat has a
+    % different type, so the grouping is inverted here: every stat option
+    % (https://www.erlang.org/doc/man/inet.html#getstat-2) is exposed as
+    % its own metric, with the erlang node as a label.
+    %
+    % One row per exposed metric:
+    % {StatOption, MetricName, MetricType, Description}
+    Metrics = [
+        {recv_oct, erlang_distribution_recv_oct_bytes_total, counter,
+            "Number of bytes received by the socket."},
+        {recv_cnt, erlang_distribution_recv_cnt_packets_total, counter,
+            "number of packets received by the socket."},
+        {recv_max, erlang_distribution_recv_max_bytes, gauge,
+            "size of the largest packet, in bytes, received by the socket."},
+        {recv_avg, erlang_distribution_recv_avg_bytes, gauge,
+            "average size of packets, in bytes, received by the socket."},
+        {recv_dvi, erlang_distribution_recv_dvi_bytes, gauge,
+            "average packet size deviation, in bytes, received by the socket."},
+        {send_oct, erlang_distribution_send_oct_bytes_total, counter,
+            "Number of bytes sent by the socket."},
+        {send_cnt, erlang_distribution_send_cnt_packets_total, counter,
+            "number of packets sent by the socket."},
+        {send_max, erlang_distribution_send_max_bytes, gauge,
+            "size of the largest packet, in bytes, sent by the socket."},
+        {send_avg, erlang_distribution_send_avg_bytes, gauge,
+            "average size of packets, in bytes, sent by the socket."},
+        {send_pend, erlang_distribution_send_pend_bytes, gauge,
+            "number of bytes waiting to be sent by the socket."}
+    ],
+    % Start every stat option with an empty sample list, so a metric is
+    % still emitted when no node contributes a value for it.
+    EmptyStats = maps:from_list([{StatOption, []} || {StatOption, _, _, _} <- Metrics]),
+    % Fold over the {Node, Socket} pairs of the distribution controllers,
+    % collecting per stat option a list of labels/value pairs, e.g.
+    % recv_oct => [{[{node, 'node2@127.0.0.1'}], 30609},
+    %              {[{node, 'node3@127.0.0.1'}], 28392}]
+    DistStats = lists:foldl(
+        fun get_sock_stats/2,
+        EmptyStats,
+        erlang:system_info(dist_ctrl)
+    ),
+    [
+        to_prom(MetricName, MetricType, Description, maps:get(StatOption, DistStats))
+     || {StatOption, MetricName, MetricType, Description} <- Metrics
+    ].
+
% Reports the number of ETS tables returned by ets:all/0 as a gauge.
get_ets_stats() ->
    to_prom(
        erlang_ets_table,
        gauge,
        "number of ETS tables",
        length(ets:all())
    ).