diff options
author | Aleksei Zakharov <zaharov@selectel.ru> | 2020-01-21 13:44:50 +0300 |
---|---|---|
committer | Aleksei Zakharov <zaharov@selectel.ru> | 2020-01-29 17:28:36 +0300 |
commit | 4eb58f7cccbdd6454db1e1d1171c27a30f80ba17 (patch) | |
tree | b54b73199822989dd1363c23c372c1816f11e0e9 /monitoring | |
parent | 96bd77d50916e1fbcbdab5191675b17146c420ec (diff) | |
download | ceph-4eb58f7cccbdd6454db1e1d1171c27a30f80ba17.tar.gz |
monitoring/grafana,prometheus: add per-pool pg states support
Signed-off-by: Aleksei Zakharov <zaharov@selectel.ru>
Diffstat (limited to 'monitoring')
-rw-r--r-- | monitoring/grafana/dashboards/ceph-cluster.json | 29 | ||||
-rw-r--r-- | monitoring/prometheus/alerts/ceph_default_alerts.yml | 8 |
2 files changed, 19 insertions, 18 deletions
```diff
diff --git a/monitoring/grafana/dashboards/ceph-cluster.json b/monitoring/grafana/dashboards/ceph-cluster.json
index 2fcee528d24..93fe3372c6c 100644
--- a/monitoring/grafana/dashboards/ceph-cluster.json
+++ b/monitoring/grafana/dashboards/ceph-cluster.json
@@ -346,6 +346,7 @@
       },
       "id": 53,
       "legend": {
+        "alignAsTable": true,
         "avg": false,
         "current": false,
         "max": false,
@@ -402,52 +403,52 @@
       "steppedLine": false,
       "targets": [
         {
-          "expr": "ceph_pg_total",
+          "expr": "ceph_pool_metadata * on(pool_id,instance) group_left() (ceph_pg_total)",
           "format": "time_series",
           "intervalFactor": 1,
-          "legendFormat": "Total",
+          "legendFormat": "{{name}} Total",
           "refId": "A"
         },
         {
-          "expr": "ceph_pg_active",
+          "expr": "ceph_pool_metadata * on(pool_id,instance) group_left() (ceph_pg_active)",
           "format": "time_series",
           "intervalFactor": 1,
-          "legendFormat": "Active",
+          "legendFormat": "{{name}} Active",
           "refId": "B"
         },
         {
-          "expr": "ceph_pg_total - ceph_pg_active",
+          "expr": "ceph_pool_metadata * on(pool_id,instance) group_left() (ceph_pg_total - ceph_pg_active)",
           "format": "time_series",
           "intervalFactor": 1,
-          "legendFormat": "Inactive",
+          "legendFormat": "{{name}} Inactive",
           "refId": "G"
         },
         {
-          "expr": "ceph_pg_undersized",
+          "expr": "ceph_pool_metadata * on(pool_id,instance) group_left() (ceph_pg_undersized)",
           "format": "time_series",
           "intervalFactor": 1,
-          "legendFormat": "Undersized",
+          "legendFormat": "{{name}} Undersized",
           "refId": "F"
         },
         {
-          "expr": "ceph_pg_degraded",
+          "expr": "ceph_pool_metadata * on(pool_id,instance) group_left() (ceph_pg_degraded)",
           "format": "time_series",
           "intervalFactor": 1,
-          "legendFormat": "Degraded",
+          "legendFormat": "{{name}} Degraded",
           "refId": "C"
         },
         {
-          "expr": "ceph_pg_inconsistent",
+          "expr": "ceph_pool_metadata * on(pool_id,instance) group_left() (ceph_pg_inconsistent)",
           "format": "time_series",
           "intervalFactor": 1,
-          "legendFormat": "Inconsistent",
+          "legendFormat": "{{name}} Inconsistent",
           "refId": "D"
         },
         {
-          "expr": "ceph_pg_down",
+          "expr": "ceph_pool_metadata * on(pool_id,instance) group_left() (ceph_pg_down)",
           "format": "time_series",
           "intervalFactor": 1,
-          "legendFormat": "Down",
+          "legendFormat": "{{name}} Down",
           "refId": "E"
         }
       ],
diff --git a/monitoring/prometheus/alerts/ceph_default_alerts.yml b/monitoring/prometheus/alerts/ceph_default_alerts.yml
index 716ccc935df..3f58aeeaeba 100644
--- a/monitoring/prometheus/alerts/ceph_default_alerts.yml
+++ b/monitoring/prometheus/alerts/ceph_default_alerts.yml
@@ -139,7 +139,7 @@ groups:
   - name: pgs
     rules:
       - alert: pgs inactive
-        expr: ceph_pg_total - ceph_pg_active > 0
+        expr: ceph_pool_metadata * on(pool_id,instance) group_left() (ceph_pg_total - ceph_pg_active) > 0
         for: 5m
         labels:
           severity: critical
@@ -147,11 +147,11 @@ groups:
           oid: 1.3.6.1.4.1.50495.15.1.2.7.1
         annotations:
           description: >
-            {{ $value }} PGs have been inactive for more than 5 minutes.
+            {{ $value }} PGs have been inactive for more than 5 minutes in pool {{ $labels.name }}.
             Inactive placement groups aren't able to serve read/write requests.
 
       - alert: pgs unclean
-        expr: ceph_pg_total - ceph_pg_clean > 0
+        expr: ceph_pool_metadata * on(pool_id,instance) group_left() (ceph_pg_total - ceph_pg_clean) > 0
         for: 15m
         labels:
           severity: warning
@@ -159,7 +159,7 @@ groups:
           oid: 1.3.6.1.4.1.50495.15.1.2.7.2
         annotations:
           description: >
-            {{ $value }} PGs haven't been clean for more than 15 minutes.
+            {{ $value }} PGs haven't been clean for more than 15 minutes in pool {{ $labels.name }}.
             Unclean PGs haven't been able to completely recover from a previous failure.
 
   - name: nodes
```