diff options
author | syasonik <syasonik@gitlab.com> | 2019-04-11 13:58:18 +0800 |
---|---|---|
committer | syasonik <syasonik@gitlab.com> | 2019-04-24 18:23:03 +0800 |
commit | a2920682ec307b9aa830903014139948cdbb9b1f (patch) | |
tree | 0538f32dafcb794e2fc6ead9a45a6b576862f80c /config | |
parent | 6735f4c705ac6a08ac0b5e847c9f197b81c8502a (diff) | |
download | gitlab-ce-a2920682ec307b9aa830903014139948cdbb9b1f.tar.gz |
Add initial dashboard endpoint support
Diffstat (limited to 'config')
-rw-r--r-- | config/prometheus/system_dashboard.yml | 274 | ||||
-rw-r--r-- | config/routes/project.rb | 1 |
2 files changed, 275 insertions, 0 deletions
diff --git a/config/prometheus/system_dashboard.yml b/config/prometheus/system_dashboard.yml new file mode 100644 index 00000000000..694d6531034 --- /dev/null +++ b/config/prometheus/system_dashboard.yml @@ -0,0 +1,274 @@ +dashboard: 'System Metrics' +order: 0 +panel_groups: + # NGINX Ingress metrics for pre-0.16.0 versions + - group: Response metrics (NGINX Ingress VTS) + priority: 10 + panels: + - type: area-chart + title: "Throughput" + y_label: "Requests / Sec" + metrics: + - id: response_metrics_nginx_ingress_throughput_status_code + query_range: 'sum(rate(nginx_upstream_responses_total{upstream=~"%{kube_namespace}-%{ci_environment_slug}-.*"}[2m])) by (status_code)' + unit: req / sec + label: Status Code + required_metrics: + - nginx_upstream_responses_total + series: + - label: status_code + when: + - value: 2xx + color: green + - value: 4xx + color: orange + - value: 5xx + color: red + - type: area-chart + title: "Latency" + y_label: "Latency (ms)" + metrics: + - id: response_metrics_nginx_ingress_latency_pod_average + query_range: 'avg(nginx_upstream_response_msecs_avg{upstream=~"%{kube_namespace}-%{ci_environment_slug}-.*"})' + label: Pod average + unit: ms + required_metrics: + - nginx_upstream_response_msecs_avg + - type: area-chart + title: "HTTP Error Rate" + y_label: "HTTP Errors" + metrics: + - id: response_metrics_nginx_ingress_http_error_rate + query_range: 'sum(rate(nginx_upstream_responses_total{status_code="5xx", upstream=~"%{kube_namespace}-%{ci_environment_slug}-.*"}[2m])) / sum(rate(nginx_upstream_responses_total{upstream=~"%{kube_namespace}-%{ci_environment_slug}-.*"}[2m])) * 100' + label: 5xx Errors + unit: "%" + required_metrics: + - nginx_upstream_responses_total + # NGINX Ingress metrics for post-0.16.0 versions + - group: Response metrics (NGINX Ingress) + priority: 10 + panels: + - type: area-chart + title: "Throughput" + y_label: "Requests / Sec" + metrics: + - id: response_metrics_nginx_ingress_16_throughput_status_code + 
query_range: 'sum(label_replace(rate(nginx_ingress_controller_requests{namespace="%{kube_namespace}",ingress=~".*%{ci_environment_slug}.*"}[2m]), "status_code", "${1}xx", "status", "(.)..")) by (status_code)' + unit: req / sec + required_metrics: + - nginx_ingress_controller_requests + label: Status Code + series: + - label: status_code + when: + - value: 2xx + color: green + - value: 3xx + color: blue + - value: 4xx + color: orange + - value: 5xx + color: red + - type: area-chart + title: "Latency" + y_label: "Latency (ms)" + metrics: + - id: response_metrics_nginx_ingress_16_latency_pod_average + query_range: 'sum(rate(nginx_ingress_controller_ingress_upstream_latency_seconds_sum{namespace="%{kube_namespace}",ingress=~".*%{ci_environment_slug}.*"}[2m])) / sum(rate(nginx_ingress_controller_ingress_upstream_latency_seconds_count{namespace="%{kube_namespace}",ingress=~".*%{ci_environment_slug}.*"}[2m])) * 1000' + label: Pod average + unit: ms + required_metrics: + - nginx_ingress_controller_ingress_upstream_latency_seconds_sum + - type: area-chart + title: "HTTP Error Rate" + y_label: "HTTP Errors" + metrics: + - id: response_metrics_nginx_ingress_16_http_error_rate + query_range: 'sum(rate(nginx_ingress_controller_requests{status=~"5.*",namespace="%{kube_namespace}",ingress=~".*%{ci_environment_slug}.*"}[2m])) / sum(rate(nginx_ingress_controller_requests{namespace="%{kube_namespace}",ingress=~".*%{ci_environment_slug}.*"}[2m])) * 100' + label: 5xx Errors + unit: "%" + required_metrics: + - nginx_ingress_controller_requests + - group: Response metrics (HA Proxy) + priority: 10 + panels: + - type: area-chart + title: "Throughput" + y_label: "Requests / Sec" + metrics: + - id: response_metrics_ha_proxy_throughput_status_code + query_range: 'sum(rate(haproxy_frontend_http_requests_total{%{environment_filter}}[2m])) by (code)' + unit: req / sec + label: Status Code + required_metrics: + - haproxy_frontend_http_requests_total + series: + - label: status_code + when: + - 
value: 2xx + color: green + - value: 4xx + color: yellow + - value: 5xx + color: red + - type: area-chart + title: "HTTP Error Rate" + y_label: "Error Rate (%)" + metrics: + - id: response_metrics_ha_proxy_http_error_rate + query_range: 'sum(rate(haproxy_frontend_http_responses_total{code="5xx",%{environment_filter}}[2m])) / sum(rate(haproxy_frontend_http_responses_total{%{environment_filter}}[2m]))' + label: HTTP Errors + unit: "%" + required_metrics: + - haproxy_frontend_http_responses_total + - group: Response metrics (AWS ELB) + priority: 10 + panels: + - type: area-chart + title: "Throughput" + y_label: "Requests / Sec" + metrics: + - id: response_metrics_aws_elb_throughput_requests + query_range: 'sum(aws_elb_request_count_sum{%{environment_filter}}) / 60' + label: Total + unit: req / sec + required_metrics: + - aws_elb_request_count_sum + - type: area-chart + title: "Latency" + y_label: "Latency (ms)" + metrics: + - id: response_metrics_aws_elb_latency_average + query_range: 'avg(aws_elb_latency_average{%{environment_filter}}) * 1000' + label: Average + unit: ms + required_metrics: + - aws_elb_latency_average + - type: area-chart + title: "HTTP Error Rate" + y_label: "Error Rate (%)" + metrics: + - id: response_metrics_aws_elb_http_error_rate + query_range: 'sum(aws_elb_httpcode_backend_5_xx_sum{%{environment_filter}}) / sum(aws_elb_request_count_sum{%{environment_filter}})' + label: HTTP Errors + unit: "%" + required_metrics: + - aws_elb_request_count_sum + - aws_elb_httpcode_backend_5_xx_sum + - group: Response metrics (NGINX) + priority: 10 + panels: + - type: area-chart + title: "Throughput" + y_label: "Requests / Sec" + metrics: + - id: response_metrics_nginx_throughput_status_code + query_range: 'sum(rate(nginx_server_requests{server_zone!="*", server_zone!="_", %{environment_filter}}[2m])) by (code)' + unit: req / sec + required_metrics: + - nginx_server_requests + label: Status Code + series: + - label: status_code + when: + - value: 2xx + color: 
green + - value: 4xx + color: orange + - value: 5xx + color: red + - type: area-chart + title: "Latency" + y_label: "Latency (ms)" + metrics: + - id: response_metrics_nginx_latency + query_range: 'avg(nginx_server_requestMsec{%{environment_filter}})' + label: Upstream + unit: ms + required_metrics: + - nginx_server_requestMsec + - type: area-chart + title: "HTTP Error Rate" + y_label: "HTTP 500 Errors / Sec" + metrics: + - id: response_metrics_nginx_http_error_rate + query_range: 'sum(rate(nginx_server_requests{code="5xx", %{environment_filter}}[2m]))' + label: HTTP Errors + unit: "errors / sec" + required_metrics: + - nginx_server_requests + - group: System metrics (Kubernetes) + priority: 5 + panels: + - type: area-chart + title: "Memory Usage (Total)" + y_label: "Total Memory Used" + metrics: + - id: system_metrics_kubernetes_container_memory_total + query_range: 'avg(sum(container_memory_usage_bytes{container_name!="POD",pod_name=~"^%{ci_environment_slug}-(.*)",namespace="%{kube_namespace}"}) by (job)) without (job) /1024/1024/1024' + label: Total + unit: GB + required_metrics: + - container_memory_usage_bytes + - type: area-chart + title: "Core Usage (Total)" + y_label: "Total Cores" + metrics: + - id: system_metrics_kubernetes_container_cores_total + query_range: 'avg(sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name=~"^%{ci_environment_slug}-(.*)",namespace="%{kube_namespace}"}[15m])) by (job)) without (job)' + label: Total + unit: "cores" + required_metrics: + - container_cpu_usage_seconds_total + - type: area-chart + title: "Memory Usage (Pod Average)" + y_label: "Memory Used per Pod" + metrics: + - id: system_metrics_kubernetes_container_memory_average + query_range: 'avg(sum(container_memory_usage_bytes{container_name!="POD",pod_name=~"^%{ci_environment_slug}-([^c].*|c([^a]|a([^n]|n([^a]|a([^r]|r[^y])))).*|)-(.*)",namespace="%{kube_namespace}"}) by (job)) without (job) / 
count(avg(container_memory_usage_bytes{container_name!="POD",pod_name=~"^%{ci_environment_slug}-([^c].*|c([^a]|a([^n]|n([^a]|a([^r]|r[^y])))).*|)-(.*)",namespace="%{kube_namespace}"}) without (job)) /1024/1024' + label: Pod average + unit: MB + required_metrics: + - container_memory_usage_bytes + - type: area-chart + title: "Canary: Memory Usage (Pod Average)" + y_label: "Memory Used per Pod" + metrics: + - id: system_metrics_kubernetes_container_memory_average_canary + query_range: 'avg(sum(container_memory_usage_bytes{container_name!="POD",pod_name=~"^%{ci_environment_slug}-canary-(.*)",namespace="%{kube_namespace}"}) by (job)) without (job) / count(avg(container_memory_usage_bytes{container_name!="POD",pod_name=~"^%{ci_environment_slug}-canary-(.*)",namespace="%{kube_namespace}"}) without (job)) /1024/1024' + label: Pod average + unit: MB + required_metrics: + - container_memory_usage_bytes + track: canary + - type: area-chart + title: "Core Usage (Pod Average)" + y_label: "Cores per Pod" + metrics: + - id: system_metrics_kubernetes_container_core_usage + query_range: 'avg(sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name=~"^%{ci_environment_slug}-([^c].*|c([^a]|a([^n]|n([^a]|a([^r]|r[^y])))).*|)-(.*)",namespace="%{kube_namespace}"}[15m])) by (job)) without (job) / count(sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name=~"^%{ci_environment_slug}-([^c].*|c([^a]|a([^n]|n([^a]|a([^r]|r[^y])))).*|)-(.*)",namespace="%{kube_namespace}"}[15m])) by (pod_name))' + label: Pod average + unit: "cores" + required_metrics: + - container_cpu_usage_seconds_total + - type: area-chart + title: "Canary: Core Usage (Pod Average)" + y_label: "Cores per Pod" + metrics: + - id: system_metrics_kubernetes_container_core_usage_canary + query_range: 'avg(sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name=~"^%{ci_environment_slug}-canary-(.*)",namespace="%{kube_namespace}"}[15m])) by (job)) without (job) / 
count(sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name=~"^%{ci_environment_slug}-canary-(.*)",namespace="%{kube_namespace}"}[15m])) by (pod_name))' + label: Pod average + unit: "cores" + track: canary + required_metrics: + - container_cpu_usage_seconds_total + - type: area-chart + title: "Knative function invocations" + y_label: "Invocations" + metrics: + - id: system_metrics_knative_function_invocation_count + query_range: 'floor(sum(rate(istio_revision_request_count{destination_configuration="%{function_name}", destination_namespace="%{kube_namespace}"}[1m])*30))' + label: invocations / minute + unit: requests + required_metrics: + - istio_revision_request_count diff --git a/config/routes/project.rb b/config/routes/project.rb index 93d168fc595..f7841bbe595 100644 --- a/config/routes/project.rb +++ b/config/routes/project.rb @@ -218,6 +218,7 @@ constraints(::Constraints::ProjectUrlConstrainer.new) do get :terminal get :metrics get :additional_metrics + get :metrics_dashboard get '/terminal.ws/authorize', to: 'environments#terminal_websocket_authorize', constraints: { format: nil } get '/prometheus/api/v1/*proxy_path', to: 'environments/prometheus_api#proxy' |