summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMichael Klishin <klishinm@vmware.com>2021-08-17 22:16:30 +0300
committerGitHub <noreply@github.com>2021-08-17 22:16:30 +0300
commitea75297bf875e43c733dc414d535584f54a533af (patch)
tree48791cebce99650a3a5565319c423fb6fc223b47
parent645a076742a2340cfe931804eb7c2a21c728b515 (diff)
parent1ea83beadd1bee272e7bc9509da05187e32a6e8b (diff)
downloadrabbitmq-server-git-ea75297bf875e43c733dc414d535584f54a533af.tar.gz
Merge pull request #3312 from rabbitmq/alarms-metric1
Expose alarms as Prometheus metrics
-rw-r--r--deps/rabbitmq_prometheus/src/collectors/prometheus_rabbitmq_alarm_metrics_collector.erl80
-rw-r--r--deps/rabbitmq_prometheus/src/rabbit_prometheus_dispatcher.erl3
-rw-r--r--release-notes/3.9.4.md9
3 files changed, 91 insertions, 1 deletions
diff --git a/deps/rabbitmq_prometheus/src/collectors/prometheus_rabbitmq_alarm_metrics_collector.erl b/deps/rabbitmq_prometheus/src/collectors/prometheus_rabbitmq_alarm_metrics_collector.erl
new file mode 100644
index 0000000000..b750aa0f6a
--- /dev/null
+++ b/deps/rabbitmq_prometheus/src/collectors/prometheus_rabbitmq_alarm_metrics_collector.erl
@@ -0,0 +1,80 @@
+%% This Source Code Form is subject to the terms of the Mozilla Public
+%% License, v. 2.0. If a copy of the MPL was not distributed with this
+%% file, You can obtain one at https://mozilla.org/MPL/2.0/.
+%%
+%% Copyright (c) 2007-2021 VMware, Inc. or its affiliates. All rights reserved.
+%%
+-module(prometheus_rabbitmq_alarm_metrics_collector).
+
+-export([register/0, deregister_cleanup/1, collect_mf/2]).
+
+-import(prometheus_model_helpers, [create_mf/4, untyped_metric/1]).
+
+-include_lib("prometheus/include/prometheus.hrl").
+
+-behaviour(prometheus_collector).
+
+-define(METRIC_NAME_PREFIX, "rabbitmq_alarms_").
+
+%%====================================================================
+%% Collector API
+%%====================================================================
+
+%% @doc Registers this module as a collector by calling
+%% `prometheus_registry:register_collector/1' with `?MODULE'.
+%% Returns `ok'; any other result from the registry fails the match
+%% and crashes the caller with `badmatch'.
+register() ->
+    Outcome = prometheus_registry:register_collector(?MODULE),
+    ok = Outcome.
+
+%% @doc Exported next to `collect_mf/2' for the `prometheus_collector'
+%% behaviour (presumably its deregistration hook -- confirm against the
+%% behaviour definition). The argument is ignored; always returns `ok'.
+deregister_cleanup(_Registry) ->
+    ok.
+
+%% @doc Emits one untyped 0/1 metric family per known alarm: file descriptor
+%% limit, free disk space watermark and VM memory watermark. A metric is 1
+%% when `rabbit_alarm:get_local_alarms/1' reports the matching alarm and 0
+%% otherwise.
+%%
+%% `Callback' is invoked once per metric family. The registry argument is
+%% not used.
+%%
+%% When the alarm list cannot be obtained (a non-list result, or an
+%% `exit:{timeout, _}'), the problem is logged and no alarm metrics are
+%% emitted for this scrape. Always returns `ok' in the handled cases.
+-spec collect_mf(_Registry, Callback) -> ok
+    when _Registry :: prometheus_registry:registry(),
+         Callback :: prometheus_collector:callback().
+collect_mf(_Registry, Callback) ->
+    try
+        %% NOTE(review): 500 is presumably a timeout in milliseconds -- confirm
+        %% against rabbit_alarm:get_local_alarms/1.
+        case rabbit_alarm:get_local_alarms(500) %% TODO: figure out timeout
+        of
+            Alarms when is_list(Alarms) ->
+                ActiveAlarms = lists:foldl(fun mark_active_alarm/2, #{}, Alarms),
+
+                Callback(create_mf(?METRIC_NAME(<<"file_descriptor_limit">>),
+                                   <<"is 1 if file descriptor limit alarm is in effect">>,
+                                   untyped,
+                                   [untyped_metric(maps:get(file_descriptor_limit,
+                                                            ActiveAlarms,
+                                                            0))])),
+                Callback(create_mf(?METRIC_NAME(<<"free_disk_space_watermark">>),
+                                   <<"is 1 if free disk space watermark alarm is in effect">>,
+                                   untyped,
+                                   [untyped_metric(maps:get(disk_limit, ActiveAlarms, 0))])),
+                Callback(create_mf(?METRIC_NAME(<<"memory_used_watermark">>),
+                                   <<"is 1 if VM memory watermark alarm is in effect">>,
+                                   untyped,
+                                   [untyped_metric(maps:get(memory_limit, ActiveAlarms, 0))])),
+                ok;
+            Error ->
+                rabbit_log:error("alarm_metrics_collector failed to emit metrics: "
+                                 "rabbit_alarm:get_local_alarms returned ~p",
+                                 [Error]),
+                %% We are not going to render any alarm metrics here.
+                %% Breaks continuity but at least doesn't crash the
+                %% whole scraping endpoint
+                ok
+        end
+    catch
+        exit:{timeout, _} ->
+            rabbit_log:error("alarm_metrics_collector failed to emit metrics: "
+                             "rabbit_alarm:get_local_alarms timed out"),
+            %% We are not going to render any alarm metrics here.
+            %% Breaks continuity but at least doesn't crash the
+            %% whole scraping endpoint
+            ok
+    end.
+
+%% Fold step: flags a recognised alarm as active (value 1) in the accumulator
+%% map, keyed by the name collect_mf/2 later looks up.
+mark_active_alarm({{resource_limit, disk, _}, _}, Acc) ->
+    Acc#{disk_limit => 1};
+mark_active_alarm({{resource_limit, memory, _}, _}, Acc) ->
+    Acc#{memory_limit => 1};
+mark_active_alarm({file_descriptor_limit, _}, Acc) ->
+    Acc#{file_descriptor_limit => 1};
+%% Any other alarm term has no metric here and is skipped. Without this
+%% clause an unrecognised alarm raises function_clause, which the
+%% exit:{timeout, _} handler in collect_mf/2 does not catch.
+mark_active_alarm(_Unrecognised, Acc) ->
+    Acc.
diff --git a/deps/rabbitmq_prometheus/src/rabbit_prometheus_dispatcher.erl b/deps/rabbitmq_prometheus/src/rabbit_prometheus_dispatcher.erl
index 39d79f256f..bf397d8ca1 100644
--- a/deps/rabbitmq_prometheus/src/rabbit_prometheus_dispatcher.erl
+++ b/deps/rabbitmq_prometheus/src/rabbit_prometheus_dispatcher.erl
@@ -15,7 +15,8 @@ build_dispatcher() ->
{ok, _} = application:ensure_all_started(prometheus),
prometheus_registry:register_collectors([
prometheus_rabbitmq_core_metrics_collector,
- prometheus_rabbitmq_global_metrics_collector]),
+ prometheus_rabbitmq_global_metrics_collector,
+ prometheus_rabbitmq_alarm_metrics_collector]),
prometheus_registry:register_collectors('per-object', [
prometheus_vm_system_info_collector,
prometheus_vm_dist_collector,
diff --git a/release-notes/3.9.4.md b/release-notes/3.9.4.md
index a1ddcbe05d..51a0d3bc64 100644
--- a/release-notes/3.9.4.md
+++ b/release-notes/3.9.4.md
@@ -15,6 +15,15 @@ consistent release schedule.
#### Enhancements
+ * New Prometheus metrics for alarms:
+ * `rabbitmq_alarms_file_descriptor_limit` 1|0
+ * `rabbitmq_alarms_free_disk_space_watermark` 1|0
+ * `rabbitmq_alarms_memory_used_watermark` 1|0
+
+ While some of the alarms have cluster-wide effect, these metrics are node-local.
+
+ GitHub issue: [#2653](https://github.com/rabbitmq/rabbitmq-server/pull/2653)
+
* Nodes will now use four more environment variables, if set: `RABBITMQ_DEFAULT_USER` (overrides `default_user` in `rabbitmq.conf`), `RABBITMQ_DEFAULT_PASS` (overrides `default_pass`), `RABBITMQ_DEFAULT_VHOST` (overrides `default_vhost`) and `RABBITMQ_ERLANG_COOKIE` (sets [shared authentication secret value](https://www.rabbitmq.com/clustering.html#erlang-cookie)).
These variables **are not recommended to be used in production** but can be the only realistic option in some environments, such as service containers, ECS, and so on.
Most users should continue using `rabbitmq.conf` and a securely generated local cookie file.