diff options
author | Will Holley <will.holley@uk.ibm.com> | 2023-04-12 07:30:06 +0000 |
---|---|---|
committer | Will Holley <will.holley@uk.ibm.com> | 2023-04-14 07:52:50 +0000 |
commit | b7686a9da8ec3059ce9a8e9df1fee837ad1a1ea3 (patch) | |
tree | 70136427dbd1364c8c7bdb3cf36cbe10e72fdffd | |
parent | 865d5f8987e6b264aae5bcf2a943dafb521c424e (diff) | |
download | couchdb-b7686a9da8ec3059ce9a8e9df1fee837ad1a1ea3.tar.gz |
feat (prometheus): include aggregated couch/index message queues
In #3860 and #3366 we added sharding to `couch_index_server` and
`couch_server`.
The `_system` endpoint surfaces a "fake" message queue for each of these
contining the aggregated queue size across all shards. This commit
adds the same for the `_prometheus` endpoint.
Originally I had thought to just filter out the per-shard queue lengths
as we've not found these to be useful in Cloudant, but I'll leave them
in for now for consistency with the `_system` endpoint. Arguably, we
should filter in both places if there's agreement that the per-shard
queue lengths are just noise.
-rw-r--r-- | .devcontainer/devcontainer.json | 2 | ||||
-rw-r--r-- | src/chttpd/src/chttpd_node.erl | 30 | ||||
-rw-r--r-- | src/couch_prometheus/src/couch_prometheus_server.erl | 37 |
3 files changed, 24 insertions, 45 deletions
diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index 5e577d96d..3920cd9dd 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -9,7 +9,7 @@ // apache/couchdbci-debian:bullseye-erlang-24.3.4.2 // apache/couchdbci-debian:bullseye-erlang-23.3.4.15 // - "COUCHDB_IMAGE": "apache/couchdbci-debian:bullseye-erlang-25.0.2" + "COUCHDB_IMAGE": "apache/couchdbci-debian:bullseye-erlang-24.3.4.10" } }, diff --git a/src/chttpd/src/chttpd_node.erl b/src/chttpd/src/chttpd_node.erl index a63236db7..bb3cf4798 100644 --- a/src/chttpd/src/chttpd_node.erl +++ b/src/chttpd/src/chttpd_node.erl @@ -16,7 +16,9 @@ -export([ handle_node_req/1, get_stats/0, - run_queues/0 + run_queues/0, + message_queues/0, + db_pid_stats/0 ]). -include_lib("couch/include/couch_db.hrl"). @@ -284,14 +286,13 @@ get_stats() -> ], {NumberOfGCs, WordsReclaimed, _} = statistics(garbage_collection), {{input, Input}, {output, Output}} = statistics(io), + {CF, CDU} = db_pid_stats(), - MessageQueues0 = [ + MessageQueuesHist = [ {couch_file, {CF}}, - {couch_db_updater, {CDU}}, - {couch_server, couch_server:aggregate_queue_len()}, - {index_server, couch_index_server:aggregate_queue_len()} + {couch_db_updater, {CDU}} ], - MessageQueues = MessageQueues0 ++ message_queues(registered()), + MessageQueues = message_queues(), {SQ, DCQ} = run_queues(), [ {uptime, couch_app:uptime() div 1000}, @@ -309,7 +310,7 @@ get_stats() -> {stale_proc_count, couch_proc_manager:get_stale_proc_count()}, {process_count, erlang:system_info(process_count)}, {process_limit, erlang:system_info(process_limit)}, - {message_queues, {MessageQueues}}, + {message_queues, {MessageQueuesHist ++ MessageQueues}}, {internal_replication_jobs, mem3_sync:get_backlog()}, {distribution, {get_distribution_stats()}} ]. @@ -385,15 +386,22 @@ get_distribution_stats() -> erlang:system_info(dist_ctrl) ). -message_queues(Registered) -> - lists:map( +-spec message_queues() -> + [{Name :: atom(), Length :: pos_integer()}]. +message_queues() -> + MessageQueuesAgg = [ + {couch_server, couch_server:aggregate_queue_len()}, + {index_server, couch_index_server:aggregate_queue_len()} + ], + MessageQueuesReg = lists:map( fun(Name) -> Type = message_queue_len, {Type, Length} = process_info(whereis(Name), Type), {Name, Length} end, - Registered - ). + registered() + ), + MessageQueuesAgg ++ MessageQueuesReg. %% Workaround for https://bugs.erlang.org/browse/ERL-1355 run_queues() -> diff --git a/src/couch_prometheus/src/couch_prometheus_server.erl b/src/couch_prometheus/src/couch_prometheus_server.erl index 05cd26265..7699c4fc4 100644 --- a/src/couch_prometheus/src/couch_prometheus_server.erl +++ b/src/couch_prometheus/src/couch_prometheus_server.erl @@ -198,8 +198,7 @@ get_io_stats() -> ]. get_message_queue_stats() -> - QFun = fun(Name) -> {Name, message_queue_len(whereis(Name))} end, - Queues = lists:map(QFun, registered()), + Queues = chttpd_node:message_queues(), QueueLens = lists:map(fun({_, Len}) -> Len end, Queues), QueueLenByLabel = lists:map(fun({Name, Len}) -> {[{queue_name, Name}], Len} end, Queues), [ @@ -221,33 +220,16 @@ get_message_queue_stats() -> to_prom(erlang_message_queue_size, gauge, "size of message queue", QueueLenByLabel) ]. -message_queue_len(undefined) -> - 0; -message_queue_len(Pid) when is_pid(Pid) -> - case erlang:process_info(Pid, message_queue_len) of - {message_queue_len, N} -> - N; - _ -> - 0 - end. - get_run_queue_stats() -> %% Workaround for https://bugs.erlang.org/browse/ERL-1355 - {Normal, Dirty} = - case erlang:system_info(dirty_cpu_schedulers) > 0 of - false -> - {statistics(run_queue), 0}; - true -> - [DCQ | SQs] = lists:reverse(statistics(run_queue_lengths)), - {lists:sum(SQs), DCQ} - end, + {SQ, DCQ} = chttpd_node:run_queues(), [ - to_prom(erlang_scheduler_queues, gauge, "the total size of all normal run queues", Normal), + to_prom(erlang_scheduler_queues, gauge, "the total size of all normal run queues", SQ), to_prom( erlang_dirty_cpu_scheduler_queues, gauge, "the total size of all dirty CPU scheduler run queues", - Dirty + DCQ ) ]. @@ -271,17 +253,6 @@ update_refresh_timer() -> -include_lib("couch/include/couch_eunit.hrl"). -message_queue_len_test() -> - self() ! refresh, - ?assert(message_queue_len(self()) >= 1), - ?assertEqual(0, message_queue_len(undefined)), - {Pid, Ref} = spawn_monitor(fun() -> ok end), - receive - {'DOWN', Ref, process, Pid, _} -> - ok - end, - ?assertEqual(0, message_queue_len(Pid)). - drain_refresh_messages_test() -> self() ! refresh, {messages, Mq0} = erlang:process_info(self(), messages), |