diff options
Diffstat (limited to 'config/prometheus/common_metrics.yml')
-rw-r--r-- | config/prometheus/common_metrics.yml | 215 |
1 files changed, 215 insertions, 0 deletions
diff --git a/config/prometheus/common_metrics.yml b/config/prometheus/common_metrics.yml new file mode 100644 index 00000000000..52023a2e3cb --- /dev/null +++ b/config/prometheus/common_metrics.yml @@ -0,0 +1,215 @@ +- group: Response metrics (NGINX Ingress) + priority: 10 + metrics: + - title: "Throughput" + y_label: "Requests / Sec" + required_metrics: + - nginx_upstream_responses_total + weight: 1 + queries: + - id: response_metrics_nginx_ingress_throughput_status_code + query_range: 'sum(rate(nginx_upstream_responses_total{upstream=~"%{kube_namespace}-%{ci_environment_slug}-.*"}[2m])) by (status_code)' + unit: req / sec + label: Status Code + series: + - label: status_code + when: + - value: 2xx + color: green + - value: 4xx + color: orange + - value: 5xx + color: red + - title: "Latency" + y_label: "Latency (ms)" + required_metrics: + - nginx_upstream_response_msecs_avg + weight: 1 + queries: + - id: response_metrics_nginx_ingress_latency_pod_average + query_range: 'avg(nginx_upstream_response_msecs_avg{upstream=~"%{kube_namespace}-%{ci_environment_slug}-.*"})' + label: Pod average + unit: ms + - title: "HTTP Error Rate" + y_label: "HTTP Errors" + required_metrics: + - nginx_upstream_responses_total + weight: 1 + queries: + - id: response_metrics_nginx_ingress_http_error_rate + query_range: 'sum(rate(nginx_upstream_responses_total{status_code="5xx", upstream=~"%{kube_namespace}-%{ci_environment_slug}-.*"}[2m])) / sum(rate(nginx_upstream_responses_total{upstream=~"%{kube_namespace}-%{ci_environment_slug}-.*"}[2m])) * 100' + label: 5xx Errors + unit: "%" +- group: Response metrics (HA Proxy) + priority: 10 + metrics: + - title: "Throughput" + y_label: "Requests / Sec" + required_metrics: + - haproxy_frontend_http_requests_total + weight: 1 + queries: + - id: response_metrics_ha_proxy_throughput_status_code + query_range: 'sum(rate(haproxy_frontend_http_requests_total{%{environment_filter}}[2m])) by (code)' + unit: req / sec + label: Status Code + series: + - label: status_code + when: + - value: 2xx + color: green + - value: 4xx + color: yellow + - value: 5xx + color: red + - title: "HTTP Error Rate" + y_label: "Error Rate (%)" + required_metrics: + - haproxy_frontend_http_responses_total + weight: 1 + queries: + - id: response_metrics_ha_proxy_http_error_rate + query_range: 'sum(rate(haproxy_frontend_http_responses_total{code="5xx",%{environment_filter}}[2m])) / sum(rate(haproxy_frontend_http_responses_total{%{environment_filter}}[2m]))' + label: HTTP Errors + unit: "%" +- group: Response metrics (AWS ELB) + priority: 10 + metrics: + - title: "Throughput" + y_label: "Requests / Sec" + required_metrics: + - aws_elb_request_count_sum + weight: 1 + queries: + - id: response_metrics_aws_elb_throughput_requests + query_range: 'sum(aws_elb_request_count_sum{%{environment_filter}}) / 60' + label: Total + unit: req / sec + - title: "Latency" + y_label: "Latency (ms)" + required_metrics: + - aws_elb_latency_average + weight: 1 + queries: + - id: response_metrics_aws_elb_latency_average + query_range: 'avg(aws_elb_latency_average{%{environment_filter}}) * 1000' + label: Average + unit: ms + - title: "HTTP Error Rate" + y_label: "Error Rate (%)" + required_metrics: + - aws_elb_request_count_sum + - aws_elb_httpcode_backend_5_xx_sum + weight: 1 + queries: + - id: response_metrics_aws_elb_http_error_rate + query_range: 'sum(aws_elb_httpcode_backend_5_xx_sum{%{environment_filter}}) / sum(aws_elb_request_count_sum{%{environment_filter}})' + label: HTTP Errors + unit: "%" +- group: Response metrics (NGINX) + priority: 10 + metrics: + - title: "Throughput" + y_label: "Requests / Sec" + required_metrics: + - nginx_server_requests + weight: 1 + queries: + - id: response_metrics_nginx_throughput_status_code + query_range: 'sum(rate(nginx_server_requests{server_zone!="*", server_zone!="_", %{environment_filter}}[2m])) by (code)' + unit: req / sec + label: Status Code + series: + - label: status_code + when: + - value: 2xx + color: green + - value: 4xx + color: orange + - value: 5xx + color: red + - title: "Latency" + y_label: "Latency (ms)" + required_metrics: + - nginx_server_requestMsec + weight: 1 + queries: + - id: response_metrics_nginx_latency + query_range: 'avg(nginx_server_requestMsec{%{environment_filter}})' + label: Upstream + unit: ms + - title: "HTTP Error Rate" + y_label: "HTTP 500 Errors / Sec" + required_metrics: + - nginx_server_requests + weight: 1 + queries: + - id: response_metrics_nginx_http_error_rate + query_range: 'sum(rate(nginx_server_requests{code="5xx", %{environment_filter}}[2m]))' + label: HTTP Errors + unit: "errors / sec" +- group: System metrics (Kubernetes) + priority: 5 + metrics: + - title: "Memory Usage (Total)" + y_label: "Total Memory Used" + required_metrics: + - container_memory_usage_bytes + weight: 4 + queries: + - id: system_metrics_kubernetes_container_memory_total + query_range: 'avg(sum(container_memory_usage_bytes{container_name!="POD",pod_name=~"^%{ci_environment_slug}-(.*)",namespace="%{kube_namespace}"}) by (job)) without (job) /1024/1024/1024' + label: Total + unit: GB + - title: "Core Usage (Total)" + y_label: "Total Cores" + required_metrics: + - container_cpu_usage_seconds_total + weight: 3 + queries: + - id: system_metrics_kubernetes_container_cores_total + query_range: 'avg(sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name=~"^%{ci_environment_slug}-(.*)",namespace="%{kube_namespace}"}[15m])) by (job)) without (job)' + label: Total + unit: "cores" + - title: "Memory Usage (Pod average)" + y_label: "Memory Used per Pod" + required_metrics: + - container_memory_usage_bytes + weight: 2 + queries: + - id: system_metrics_kubernetes_container_memory_average + query_range: 'avg(sum(container_memory_usage_bytes{container_name!="POD",pod_name=~"^%{ci_environment_slug}-(.*)",namespace="%{kube_namespace}"}) by (job)) without (job) / count(avg(container_memory_usage_bytes{container_name!="POD",pod_name=~"^%{ci_environment_slug}-(.*)",namespace="%{kube_namespace}"}) without (job)) /1024/1024' + label: Pod average + unit: MB + - title: "Canary: Memory Usage (Pod Average)" + y_label: "Memory Used per Pod" + required_metrics: + - container_memory_usage_bytes + weight: 2 + queries: + - id: system_metrics_kubernetes_container_memory_average_canary + query_range: 'avg(sum(container_memory_usage_bytes{container_name!="POD",pod_name=~"^%{ci_environment_slug}-canary-(.*)",namespace="%{kube_namespace}"}) by (job)) without (job) / count(avg(container_memory_usage_bytes{container_name!="POD",pod_name=~"^%{ci_environment_slug}-canary-(.*)",namespace="%{kube_namespace}"}) without (job)) /1024/1024' + label: Pod average + unit: MB + track: canary + - title: "Core Usage (Pod Average)" + y_label: "Cores per Pod" + required_metrics: + - container_cpu_usage_seconds_total + weight: 1 + queries: + - id: system_metrics_kubernetes_container_core_usage + query_range: 'avg(sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name=~"^%{ci_environment_slug}-(.*)",namespace="%{kube_namespace}"}[15m])) by (job)) without (job) / count(sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name=~"^%{ci_environment_slug}-(.*)",namespace="%{kube_namespace}"}[15m])) by (pod_name))' + label: Pod average + unit: "cores" + - title: "Canary: Core Usage (Pod Average)" + y_label: "Cores per Pod" + required_metrics: + - container_cpu_usage_seconds_total + weight: 1 + queries: + - id: system_metrics_kubernetes_container_core_usage_canary + query_range: 'avg(sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name=~"^%{ci_environment_slug}-canary-(.*)",namespace="%{kube_namespace}"}[15m])) by (job)) without (job) / count(sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name=~"^%{ci_environment_slug}-canary-(.*)",namespace="%{kube_namespace}"}[15m])) by (pod_name))' + label: Pod average + unit: "cores" + track: canary |