summary | refs | log | tree | commit | diff
path: root/monitoring
diff options
context:
space:
mode:
author: Aleksei Zakharov <zaharov@selectel.ru> 2020-01-21 13:44:50 +0300
committer: Aleksei Zakharov <zaharov@selectel.ru> 2020-01-29 17:28:36 +0300
commit: 4eb58f7cccbdd6454db1e1d1171c27a30f80ba17 (patch)
tree: b54b73199822989dd1363c23c372c1816f11e0e9 /monitoring
parent: 96bd77d50916e1fbcbdab5191675b17146c420ec (diff)
download: ceph-4eb58f7cccbdd6454db1e1d1171c27a30f80ba17.tar.gz
monitoring/grafana,prometheus: add per-pool pg states support
Signed-off-by: Aleksei Zakharov <zaharov@selectel.ru>
Diffstat (limited to 'monitoring')
-rw-r--r--  monitoring/grafana/dashboards/ceph-cluster.json  29
-rw-r--r--  monitoring/prometheus/alerts/ceph_default_alerts.yml  8
2 files changed, 19 insertions, 18 deletions
diff --git a/monitoring/grafana/dashboards/ceph-cluster.json b/monitoring/grafana/dashboards/ceph-cluster.json
index 2fcee528d24..93fe3372c6c 100644
--- a/monitoring/grafana/dashboards/ceph-cluster.json
+++ b/monitoring/grafana/dashboards/ceph-cluster.json
@@ -346,6 +346,7 @@
},
"id": 53,
"legend": {
+ "alignAsTable": true,
"avg": false,
"current": false,
"max": false,
@@ -402,52 +403,52 @@
"steppedLine": false,
"targets": [
{
- "expr": "ceph_pg_total",
+ "expr": "ceph_pool_metadata * on(pool_id,instance) group_left() (ceph_pg_total)",
"format": "time_series",
"intervalFactor": 1,
- "legendFormat": "Total",
+ "legendFormat": "{{name}} Total",
"refId": "A"
},
{
- "expr": "ceph_pg_active",
+ "expr": "ceph_pool_metadata * on(pool_id,instance) group_left() (ceph_pg_active)",
"format": "time_series",
"intervalFactor": 1,
- "legendFormat": "Active",
+ "legendFormat": "{{name}} Active",
"refId": "B"
},
{
- "expr": "ceph_pg_total - ceph_pg_active",
+ "expr": "ceph_pool_metadata * on(pool_id,instance) group_left() (ceph_pg_total - ceph_pg_active)",
"format": "time_series",
"intervalFactor": 1,
- "legendFormat": "Inactive",
+ "legendFormat": "{{name}} Inactive",
"refId": "G"
},
{
- "expr": "ceph_pg_undersized",
+ "expr": "ceph_pool_metadata * on(pool_id,instance) group_left() (ceph_pg_undersized)",
"format": "time_series",
"intervalFactor": 1,
- "legendFormat": "Undersized",
+ "legendFormat": "{{name}} Undersized",
"refId": "F"
},
{
- "expr": "ceph_pg_degraded",
+ "expr": "ceph_pool_metadata * on(pool_id,instance) group_left() (ceph_pg_degraded)",
"format": "time_series",
"intervalFactor": 1,
- "legendFormat": "Degraded",
+ "legendFormat": "{{name}} Degraded",
"refId": "C"
},
{
- "expr": "ceph_pg_inconsistent",
+ "expr": "ceph_pool_metadata * on(pool_id,instance) group_left() (ceph_pg_inconsistent)",
"format": "time_series",
"intervalFactor": 1,
- "legendFormat": "Inconsistent",
+ "legendFormat": "{{name}} Inconsistent",
"refId": "D"
},
{
- "expr": "ceph_pg_down",
+ "expr": "ceph_pool_metadata * on(pool_id,instance) group_left() (ceph_pg_down)",
"format": "time_series",
"intervalFactor": 1,
- "legendFormat": "Down",
+ "legendFormat": "{{name}} Down",
"refId": "E"
}
],
diff --git a/monitoring/prometheus/alerts/ceph_default_alerts.yml b/monitoring/prometheus/alerts/ceph_default_alerts.yml
index 716ccc935df..3f58aeeaeba 100644
--- a/monitoring/prometheus/alerts/ceph_default_alerts.yml
+++ b/monitoring/prometheus/alerts/ceph_default_alerts.yml
@@ -139,7 +139,7 @@ groups:
- name: pgs
rules:
- alert: pgs inactive
- expr: ceph_pg_total - ceph_pg_active > 0
+ expr: ceph_pool_metadata * on(pool_id,instance) group_left() (ceph_pg_total - ceph_pg_active) > 0
for: 5m
labels:
severity: critical
@@ -147,11 +147,11 @@ groups:
oid: 1.3.6.1.4.1.50495.15.1.2.7.1
annotations:
description: >
- {{ $value }} PGs have been inactive for more than 5 minutes.
+ {{ $value }} PGs have been inactive for more than 5 minutes in pool {{ $labels.name }}.
Inactive placement groups aren't able to serve read/write
requests.
- alert: pgs unclean
- expr: ceph_pg_total - ceph_pg_clean > 0
+ expr: ceph_pool_metadata * on(pool_id,instance) group_left() (ceph_pg_total - ceph_pg_clean) > 0
for: 15m
labels:
severity: warning
@@ -159,7 +159,7 @@ groups:
oid: 1.3.6.1.4.1.50495.15.1.2.7.2
annotations:
description: >
- {{ $value }} PGs haven't been clean for more than 15 minutes.
+ {{ $value }} PGs haven't been clean for more than 15 minutes in pool {{ $labels.name }}.
Unclean PGs haven't been able to completely recover from a
previous failure.
- name: nodes