# Source path: config/prometheus/self_monitoring_default.yml
# Blob: 024733bf2f0de36c5de2c8d3fd088fcc2d27fa42
# Top-level dashboard metadata.
dashboard: "Overview"
# NOTE(review): presumably orders this dashboard relative to others;
# confirm against the consumer of this file.
priority: 1

# Dashboard variables. The "Resource usage" queries in this file reference
# the `instance` variable as {{instance}} inside an instance=~"..." matcher.
templating:
  variables:
    instance:
      type: "text"
      label: "Instance label regex"
      options:
        # As a regex, '.+' matches every non-empty instance label value.
        default_value: ".+"

# Each entry under panel_groups is a named group holding a list of panels.
panel_groups:

# Memory and CPU panels built from node_* metrics, filtered by the
# {{instance}} placeholder.
- group: 'Resource usage'
  panels:
    # Memory in use as a percentage of MemTotal. The query uses
    # MemAvailable where that series exists and otherwise falls back to
    # MemFree + Buffers + Cached + Slab.
    - title: 'Memory usage'
      type: 'line-chart'
      y_label: '% memory used'
      metrics:
        - id: node_memory_usage_percentage
          query_range: '(1 - (node_memory_MemAvailable_bytes{instance=~"{{instance}}"} or (node_memory_MemFree_bytes{instance=~"{{instance}}"} + node_memory_Buffers_bytes{instance=~"{{instance}}"} + node_memory_Cached_bytes{instance=~"{{instance}}"} + node_memory_Slab_bytes{instance=~"{{instance}}"})) / node_memory_MemTotal_bytes{instance=~"{{instance}}"}) * 100'
          unit: '%'
          label: instance

    # Non-idle CPU share as a percentage: 1 minus the irate of the
    # mode="idle" counter over a 5m window, averaged with the `mode` and
    # `cpu` labels dropped.
    - title: 'CPU usage'
      type: 'line-chart'
      y_label: '% CPU used'
      metrics:
        - id: node_cpu_usage_percentage
          query_range: '(avg without (mode,cpu) (1 - irate(node_cpu_seconds_total{mode="idle",instance=~"{{instance}}"}[5m]))) * 100'
          unit: '%'
          label: instance

# Error ratio of the type="web" service plotted against its SLO thresholds.
# All three series are scaled by 100 so they share the "%" unit.
- group: Web Service
  panels:
    - title: Web Service - Error Ratio
      type: line-chart
      y_label: "Unhandled Exceptions (%)"
      metrics:
        # Peak error ratio over the last minute for the current environment.
        - id: wser_web_service
          query_range: 'max(max_over_time(gitlab_service_errors:ratio{environment="{{ci_environment_slug}}", type="web", stage="main"}[1m])) by (type) * 100'
          unit: "%"
          label: "Error Ratio"
        # Environment-scoped SLO, falling back to the SLO without an
        # environment filter. The parentheses around the `or` expression are
        # required: `or` has the lowest PromQL precedence, so without them
        # only the fallback branch would be multiplied by 100.
        - id: wser_degradation_slo
          query_range: '(avg(slo:max:gitlab_service_errors:ratio{environment="{{ci_environment_slug}}", type="web", stage="main"}) or avg(slo:max:gitlab_service_errors:ratio{type="web"})) * 100'
          unit: "%"
          label: "Degradation SLO"
        # Twice the degradation SLO threshold.
        - id: wser_outage_slo
          query_range: '2 * (avg(slo:max:gitlab_service_errors:ratio{environment="{{ci_environment_slug}}", type="web", stage="main"}) or avg(slo:max:gitlab_service_errors:ratio{type="web"})) * 100'
          unit: "%"
          label: "Outage SLO"
# Error ratio of the type="api" service plotted against its SLO thresholds.
# All three series are scaled by 100 so they share the "%" unit.
- group: API Service
  panels:
    - title: API Service - Error Ratio
      type: line-chart
      y_label: "Unhandled Exceptions (%)"
      metrics:
        # Peak error ratio over the last minute for the current environment.
        - id: aser_web_service
          query_range: 'max(max_over_time(gitlab_service_errors:ratio{environment="{{ci_environment_slug}}", type="api", stage="main"}[1m])) by (type) * 100'
          unit: "%"
          label: "Error Ratio"
        # Environment-scoped SLO with a fallback branch. The parentheses
        # around the `or` expression are required: `or` has the lowest PromQL
        # precedence, so without them only the fallback branch would be
        # multiplied by 100.
        # NOTE(review): the fallback selects type="web" inside the API panel,
        # which looks like a copy-paste from the Web Service group; confirm
        # whether type="api" was intended.
        - id: aser_degradation_slo
          query_range: '(avg(slo:max:gitlab_service_errors:ratio{environment="{{ci_environment_slug}}", type="api", stage="main"}) or avg(slo:max:gitlab_service_errors:ratio{type="web"})) * 100'
          unit: "%"
          label: "Degradation SLO"
        # Twice the degradation SLO threshold.
        # NOTE(review): same type="web" fallback as above; confirm.
        - id: aser_outage_slo
          query_range: '2 * (avg(slo:max:gitlab_service_errors:ratio{environment="{{ci_environment_slug}}", type="api", stage="main"}) or avg(slo:max:gitlab_service_errors:ratio{type="web"})) * 100'
          unit: "%"
          label: "Outage SLO"