diff --git a/monitoring/Makefile b/monitoring/Makefile index 3174ac4..923a0a4 100644 --- a/monitoring/Makefile +++ b/monitoring/Makefile @@ -4,4 +4,6 @@ build: update: docker run --rm -v $(shell pwd):$(shell pwd) --workdir $(shell pwd) quay.io/coreos/jsonnet-ci jb update + sudo chown -R tobru. vendor/ + make build .PHONY: update diff --git a/monitoring/jsonnetfile.lock.json b/monitoring/jsonnetfile.lock.json index 750c3eb..2c9cc78 100644 --- a/monitoring/jsonnetfile.lock.json +++ b/monitoring/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "Documentation/etcd-mixin" } }, - "version": "52fba431b686f6a5c30d60a0bbaf9fafc14bae35", + "version": "cb633418a2a67a41cd2f30d556f19e995ed8f274", "sum": "Ko3qhNfC2vN/houLh6C0Ryacjv70gl0DVPGU/PQ4OD0=" }, { @@ -30,7 +30,7 @@ "subdir": "grafana-builder" } }, - "version": "7ac7da1a0fe165b68cdb718b2521b560d51bd1f4", + "version": "66eb3af2bd87c4ee18b97d5b2d366b234eef89cc", "sum": "slxrtftVDiTlQK22ertdfrg4Epnq97gdrLI63ftUfaE=" }, { @@ -74,7 +74,7 @@ "subdir": "jsonnet/kube-state-metrics" } }, - "version": "22d195f20a20b51cf14b5ff01bb4a200c65196da", + "version": "89ede10b19d7ef0145777717351cabe14b113c01", "sum": "cJjGZaLBjcIGrLHZLjRPU9c3KL+ep9rZTb9dbALSKqA=" }, { @@ -85,7 +85,7 @@ "subdir": "jsonnet/kube-state-metrics-mixin" } }, - "version": "22d195f20a20b51cf14b5ff01bb4a200c65196da", + "version": "89ede10b19d7ef0145777717351cabe14b113c01", "sum": "E1GGavnf9PCWBm4WVrxWnc0FIj72UcbcweqGioWrOdU=" }, { @@ -96,8 +96,8 @@ "subdir": "" } }, - "version": "3cf851b2c8ff8bf98c12eac7f37d97f086cd0fc9", - "sum": "CydKHxWA9LG9w1+sjlqREHXPQTdbiTwy40rnyXfHfGE=" + "version": "02b62082e3feb271b8fd476603dceaa1fd2054c0", + "sum": "h+ZL4TFVFbSdlsY25mi5x1nRts3PY3JmKz3QXUgnXJk=" }, { "name": "node-mixin", @@ -107,7 +107,7 @@ "subdir": "docs/node-mixin" } }, - "version": "ef7c05816adcb0e8923defe34e97f6afcce0a939", + "version": "a7c31ff7ed0990545ed4cc62690fc53563ee8860", "sum": "7vEamDTP9AApeiF4Zu9ZyXzDIs3rYHzwf9k7g8X+wsg=" }, { @@ -118,7 +118,7 @@ "subdir": "documentation/prometheus-mixin" } }, - "version": "65a19421a42c69e16241eec24c66b98e4c8fa5da", + "version": "babadf13e852654cfc87c06fc8ff0b843586a00e", "sum": "5EUgr6Spr1zNR8Y2/NevjvEkGV9WMvKo6nEScNER1Lc=" }, { @@ -151,7 +151,7 @@ "subdir": "lib/promgrafonnet" } }, - "version": "3cf851b2c8ff8bf98c12eac7f37d97f086cd0fc9", + "version": "02b62082e3feb271b8fd476603dceaa1fd2054c0", "sum": "VhgBM39yv0f4bKv8VfGg4FXkg573evGDRalip9ypKbc=" }, { diff --git a/monitoring/manifests/grafana-dashboardDefinitions.yaml b/monitoring/manifests/grafana-dashboardDefinitions.yaml index d58b9ad..b7011ef 100644 --- a/monitoring/manifests/grafana-dashboardDefinitions.yaml +++ b/monitoring/manifests/grafana-dashboardDefinitions.yaml @@ -23,7 +23,7 @@ items: "links": [ ], - "refresh": "", + "refresh": "10s", "rows": [ { "collapse": false, @@ -2980,7 +2980,7 @@ items: "type": "row" } ], - "refresh": "30s", + "refresh": "10s", "rows": [ ], @@ -3150,7 +3150,7 @@ items: "links": [ ], - "refresh": "", + "refresh": "10s", "rows": [ { "collapse": false, @@ -4305,7 +4305,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "1 - avg(rate(node_cpu_seconds_total{mode=\"idle\", cluster=\"$cluster\"}[1m]))", + "expr": "1 - avg(rate(node_cpu_seconds_total{mode=\"idle\", cluster=\"$cluster\"}[$interval]))", "format": "time_series", "instant": true, "intervalFactor": 2, @@ -6763,11 +6763,11 @@ items: "options": [ { "selected": true, - "text": "4h", - "value": "4h" + "text": "$__interval", + "value": "$__interval" } ], - "query": "4h", + "query": "$__interval", "refresh": 2, "regex": "", "skipUrlSync": false, @@ -6779,6 +6779,33 @@ items: "tagsQuery": "", "type": "interval", "useTags": false + }, + { + "allValue": null, + "current": { + "text": "", + "value": "" + }, + "datasource": "$datasource", + "hide": 2, + "includeAll": false, + "label": null, + "multi": false, + "name": "cluster", + "options": [ + + ], + "query": "label_values(node_cpu_seconds_total, cluster)", + "refresh": 2, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false } ] }, @@ -6838,6 +6865,354 @@ items: ], "refresh": "10s", "rows": [ + { + "collapse": false, + "height": "100px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "format": "percentunit", + "id": 1, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 3, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}) / sum(kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"})", + "format": "time_series", + "instant": true, + "intervalFactor": 2, + "refId": "A" + } + ], + "thresholds": "70,80", + "timeFrom": null, + "timeShift": null, + "title": "CPU Utilisation (from requests)", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "singlestat", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "format": "percentunit", + "id": 2, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 3, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", namespace=\"$namespace\"}) / sum(kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\"})", + "format": "time_series", + "instant": true, + "intervalFactor": 2, + "refId": "A" + } + ], + "thresholds": "70,80", + "timeFrom": null, + "timeShift": null, + "title": "CPU Utilisation (from limits)", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "singlestat", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "format": "percentunit", + "id": 3, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 3, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\",container!=\"\"}) / sum(kube_pod_container_resource_requests_memory_bytes{namespace=\"$namespace\"})", + "format": "time_series", + "instant": true, + "intervalFactor": 2, + "refId": "A" + } + ], + "thresholds": "70,80", + "timeFrom": null, + "timeShift": null, + "title": "Memory Utilization (from requests)", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "singlestat", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "format": "percentunit", + "id": 4, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 3, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\",container!=\"\"}) / sum(kube_pod_container_resource_limits_memory_bytes{namespace=\"$namespace\"})", + "format": "time_series", + "instant": true, + "intervalFactor": 2, + "refId": "A" + } + ], + "thresholds": "70,80", + "timeFrom": null, + "timeShift": null, + "title": "Memory Utilisation (from limits)", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "singlestat", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Headlines", + "titleSize": "h6" + }, { "collapse": false, "height": "250px", @@ -6851,7 +7226,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 1, + "id": 5, "legend": { "avg": false, "current": false, @@ -6872,7 +7247,26 @@ items: "points": false, "renderer": "flot", "seriesOverrides": [ - + { + "alias": "quota - requests", + "color": "#F2495C", + "dashes": true, + "fill": 0, + "hideTooltip": true, + "legend": false, + "linewidth": 2, + "stack": false + }, + { + "alias": "quota - limits", + "color": "#FF9830", + "dashes": true, + "fill": 0, + "hideTooltip": true, + "legend": false, + "linewidth": 2, + "stack": false + } ], "spaceLength": 10, "span": 12, @@ -6886,6 +7280,22 @@ items: "legendFormat": "{{pod}}", "legendLink": null, "step": 10 + }, + { + "expr": "scalar(kube_resourcequota{cluster=\"$cluster\", namespace=\"$namespace\", type=\"hard\",resource=\"requests.cpu\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "quota - requests", + "legendLink": null, + "step": 10 + }, + { + "expr": "scalar(kube_resourcequota{cluster=\"$cluster\", namespace=\"$namespace\", type=\"hard\",resource=\"limits.cpu\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "quota - limits", + "legendLink": null, + "step": 10 } ], "thresholds": [ @@ -6949,7 +7359,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 2, + "id": 6, "legend": { "avg": false, "current": false, @@ -7216,7 +7626,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 3, + "id": 7, "legend": { "avg": false, "current": false, @@ -7237,7 +7647,26 @@ items: "points": false, "renderer": "flot", "seriesOverrides": [ - + { + "alias": "quota - requests", + "color": "#F2495C", + "dashes": true, + "fill": 0, + "hideTooltip": true, + "legend": false, + "linewidth": 2, + "stack": false + }, + { + "alias": "quota - limits", + "color": "#FF9830", + "dashes": true, + "fill": 0, + "hideTooltip": true, + "legend": false, + "linewidth": 2, + "stack": false + } ], "spaceLength": 10, "span": 12, @@ -7251,6 +7680,22 @@ items: "legendFormat": "{{pod}}", "legendLink": null, "step": 10 + }, + { + "expr": "scalar(kube_resourcequota{cluster=\"$cluster\", namespace=\"$namespace\", type=\"hard\",resource=\"requests.memory\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "quota - requests", + "legendLink": null, + "step": 10 + }, + { + "expr": "scalar(kube_resourcequota{cluster=\"$cluster\", namespace=\"$namespace\", type=\"hard\",resource=\"limits.memory\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "quota - limits", + "legendLink": null, + "step": 10 } ], "thresholds": [ @@ -7314,7 +7759,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 4, + "id": 8, "legend": { "avg": false, "current": false, @@ -7662,7 +8107,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 5, + "id": 9, "legend": { "avg": false, "current": false, @@ -7956,7 +8401,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 6, + "id": 10, "legend": { "avg": false, "current": false, @@ -8054,7 +8499,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 7, + "id": 11, "legend": { "avg": false, "current": false, @@ -8152,7 +8597,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 8, + "id": 12, "legend": { "avg": false, "current": false, @@ -8250,7 +8695,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 9, + "id": 13, "legend": { "avg": false, "current": false, @@ -8348,7 +8793,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 10, + "id": 14, "legend": { "avg": false, "current": false, @@ -8446,7 +8891,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 11, + "id": 15, "legend": { "avg": false, "current": false, @@ -8555,60 +9000,6 @@ items: "regex": "", "type": "datasource" }, - { - "allValue": null, - "current": { - "text": "prod", - "value": "prod" - }, - "datasource": "$datasource", - "hide": 2, - "includeAll": false, - "label": "cluster", - "multi": false, - "name": "cluster", - "options": [ - - ], - "query": "label_values(kube_pod_info, cluster)", - "refresh": 1, - "regex": "", - "sort": 2, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": null, - "current": { - "text": "prod", - "value": "prod" - }, - "datasource": "$datasource", - "hide": 0, - "includeAll": false, - "label": "namespace", - "multi": false, - "name": "namespace", - "options": [ - - ], - "query": "label_values(kube_pod_info{cluster=\"$cluster\"}, namespace)", - "refresh": 1, - "regex": "", - "sort": 2, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false - }, { "allValue": null, "auto": false, @@ -8627,11 +9018,11 @@ items: "options": [ { "selected": true, - "text": "4h", - "value": "4h" + "text": "$__interval", + "value": "$__interval" } ], - "query": "4h", + "query": "$__interval", "refresh": 2, "regex": "", "skipUrlSync": false, @@ -8643,6 +9034,60 @@ items: "tagsQuery": "", "type": "interval", "useTags": false + }, + { + "allValue": null, + "current": { + "text": "", + "value": "" + }, + "datasource": "$datasource", + "hide": 2, + "includeAll": false, + "label": null, + "multi": false, + "name": "cluster", + "options": [ + + ], + "query": "label_values(kube_pod_info, cluster)", + "refresh": 1, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + "text": "", + "value": "" + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "namespace", + "options": [ + + ], + "query": "label_values(kube_pod_info{cluster=\"$cluster\"}, namespace)", + "refresh": 1, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false } ] }, @@ -9539,14 +9984,49 @@ items: }, { "allValue": null, + "auto": false, + "auto_count": 30, + "auto_min": "10s", "current": { - "text": "prod", - "value": "prod" + "text": "5m", + "value": "5m" }, "datasource": "$datasource", "hide": 2, "includeAll": false, - "label": "cluster", + "label": null, + "multi": false, + "name": "interval", + "options": [ + { + "selected": true, + "text": "$__interval", + "value": "$__interval" + } + ], + "query": "$__interval", + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "interval", + "useTags": false + }, + { + "allValue": null, + "current": { + "text": "", + "value": "" + }, + "datasource": "$datasource", + "hide": 2, + "includeAll": false, + "label": null, "multi": false, "name": "cluster", "options": [ @@ -9555,7 +10035,7 @@ items: "query": "label_values(kube_pod_info, cluster)", "refresh": 1, "regex": "", - "sort": 2, + "sort": 1, "tagValuesQuery": "", "tags": [ @@ -9567,13 +10047,13 @@ items: { "allValue": null, "current": { - "text": "prod", - "value": "prod" + "text": "", + "value": "" }, "datasource": "$datasource", "hide": 0, "includeAll": false, - "label": "node", + "label": null, "multi": false, "name": "node", "options": [ @@ -9582,7 +10062,7 @@ items: "query": "label_values(kube_pod_info{cluster=\"$cluster\"}, node)", "refresh": 1, "regex": "", - "sort": 2, + "sort": 1, "tagValuesQuery": "", "tags": [ @@ -9683,7 +10163,24 @@ items: "points": false, "renderer": "flot", "seriesOverrides": [ - + { + "alias": "requests", + "color": "#F2495C", + "fill": 0, + "hideTooltip": true, + "legend": true, + "linewidth": 2, + "stack": false + }, + { + "alias": "limits", + "color": "#FF9830", + "fill": 0, + "hideTooltip": true, + "legend": true, + "linewidth": 2, + "stack": false + } ], "spaceLength": 10, "span": 12, @@ -9697,6 +10194,22 @@ items: "legendFormat": "{{container}}", "legendLink": null, "step": 10 + }, + { + "expr": "sum(\n kube_pod_container_resource_requests_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"})\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "requests", + "legendLink": null, + "step": 10 + }, + { + "expr": "sum(\n kube_pod_container_resource_limits_cpu_cores{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"})\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "limits", + "legendLink": null, + "step": 10 } ], "thresholds": [ @@ -9759,8 +10272,113 @@ items: "dashLength": 10, "dashes": false, "datasource": "$datasource", - "fill": 1, + "fill": 10, "id": 2, + "legend": { + "avg": false, + "current": true, + "max": true, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 12, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(increase(container_cpu_cfs_throttled_periods_total{namespace=\"$namespace\", pod=\"$pod\", container!=\"POD\", cluster=\"$cluster\"}[5m])) by (container) /sum(increase(container_cpu_cfs_periods_total{namespace=\"$namespace\", pod=\"$pod\", container!=\"POD\", cluster=\"$cluster\"}[5m])) by (container)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{container}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + { + "colorMode": "critical", + "fill": true, + "line": true, + "op": "gt", + "value": 0.25, + "yaxis": "left" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "CPU Throttling", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": 1, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "CPU Throttling", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 3, "legend": { "avg": false, "current": false, @@ -10027,7 +10645,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 3, + "id": 4, "legend": { "avg": false, "current": false, @@ -10048,7 +10666,26 @@ items: "points": false, "renderer": "flot", "seriesOverrides": [ - + { + "alias": "requests", + "color": "#F2495C", + "dashes": true, + "fill": 0, + "hideTooltip": true, + "legend": false, + "linewidth": 2, + "stack": false + }, + { + "alias": "limits", + "color": "#FF9830", + "dashes": true, + "fill": 0, + "hideTooltip": true, + "legend": false, + "linewidth": 2, + "stack": false + } ], "spaceLength": 10, "span": 12, @@ -10056,26 +10693,26 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum(container_memory_rss{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container!=\"POD\", container!=\"\"}) by (container)", + "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container!=\"POD\", container!=\"\"}) by (container)", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{container}} (RSS)", + "legendFormat": "{{container}}", "legendLink": null, "step": 10 }, { - "expr": "sum(container_memory_cache{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container!=\"POD\", container!=\"\"}) by (container)", + "expr": "sum(\n kube_pod_container_resource_requests_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"})\n", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{container}} (Cache)", + "legendFormat": "requests", "legendLink": null, "step": 10 }, { - "expr": "sum(container_memory_swap{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container!=\"POD\", container!=\"\"}) by (container)", + "expr": "sum(\n kube_pod_container_resource_limits_memory_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\"})\n", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{container}} (Swap)", + "legendFormat": "limits", "legendLink": null, "step": 10 } @@ -10141,7 +10778,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 4, + "id": 5, "legend": { "avg": false, "current": false, @@ -10489,7 +11126,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 5, + "id": 6, "legend": { "avg": false, "current": false, @@ -10587,7 +11224,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 6, + "id": 7, "legend": { "avg": false, "current": false, @@ -10685,7 +11322,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 7, + "id": 8, "legend": { "avg": false, "current": false, @@ -10783,7 +11420,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 8, + "id": 9, "legend": { "avg": false, "current": false, @@ -10881,7 +11518,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 9, + "id": 10, "legend": { "avg": false, "current": false, @@ -10979,7 +11616,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 10, + "id": 11, "legend": { "avg": false, "current": false, @@ -11088,87 +11725,6 @@ items: "regex": "", "type": "datasource" }, - { - "allValue": null, - "current": { - "text": "prod", - "value": "prod" - }, - "datasource": "$datasource", - "hide": 2, - "includeAll": false, - "label": "cluster", - "multi": false, - "name": "cluster", - "options": [ - - ], - "query": "label_values(kube_pod_info, cluster)", - "refresh": 1, - "regex": "", - "sort": 2, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": null, - "current": { - "text": "prod", - "value": "prod" - }, - "datasource": "$datasource", - "hide": 0, - "includeAll": false, - "label": "namespace", - "multi": false, - "name": "namespace", - "options": [ - - ], - "query": "label_values(kube_pod_info{cluster=\"$cluster\"}, namespace)", - "refresh": 1, - "regex": "", - "sort": 2, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": null, - "current": { - "text": "prod", - "value": "prod" - }, - "datasource": "$datasource", - "hide": 0, - "includeAll": false, - "label": "pod", - "multi": false, - "name": "pod", - "options": [ - - ], - "query": "label_values(kube_pod_info{cluster=\"$cluster\", namespace=\"$namespace\"}, pod)", - "refresh": 1, - "regex": "", - "sort": 2, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false - }, { "allValue": null, "auto": false, @@ -11187,11 +11743,11 @@ items: "options": [ { "selected": true, - "text": "4h", - "value": "4h" + "text": "$__interval", + "value": "$__interval" } ], - "query": "4h", + "query": "$__interval", "refresh": 2, "regex": "", "skipUrlSync": false, @@ -11203,6 +11759,87 @@ items: "tagsQuery": "", "type": "interval", "useTags": false + }, + { + "allValue": null, + "current": { + "text": "", + "value": "" + }, + "datasource": "$datasource", + "hide": 2, + "includeAll": false, + "label": null, + "multi": false, + "name": "cluster", + "options": [ + + ], + "query": "label_values(kube_pod_info, cluster)", + "refresh": 1, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + "text": "", + "value": "" + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "namespace", + "options": [ + + ], + "query": "label_values(kube_pod_info{cluster=\"$cluster\"}, namespace)", + "refresh": 1, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + "text": "", + "value": "" + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "pod", + "options": [ + + ], + "query": "label_values(kube_pod_info{cluster=\"$cluster\", namespace=\"$namespace\"}, pod)", + "refresh": 2, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false } ] }, @@ -13094,114 +13731,6 @@ items: "regex": "", "type": "datasource" }, - { - "allValue": null, - "current": { - "text": "prod", - "value": "prod" - }, - "datasource": "$datasource", - "hide": 2, - "includeAll": false, - "label": "cluster", - "multi": false, - "name": "cluster", - "options": [ - - ], - "query": "label_values(kube_pod_info, cluster)", - "refresh": 1, - "regex": "", - "sort": 2, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": null, - "current": { - "text": "prod", - "value": "prod" - }, - "datasource": "$datasource", - "hide": 0, - "includeAll": false, - "label": "namespace", - "multi": false, - "name": "namespace", - "options": [ - - ], - "query": "label_values(kube_pod_info{cluster=\"$cluster\"}, namespace)", - "refresh": 1, - "regex": "", - "sort": 2, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": null, - "current": { - "text": "prod", - "value": "prod" - }, - "datasource": "$datasource", - "hide": 0, - "includeAll": false, - "label": "workload", - "multi": false, - "name": "workload", - "options": [ - - ], - "query": "label_values(mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\"}, workload)", - "refresh": 1, - "regex": "", - "sort": 2, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": null, - "current": { - "text": "prod", - "value": "prod" - }, - "datasource": "$datasource", - "hide": 0, - "includeAll": false, - "label": "type", - "multi": false, - "name": "type", - "options": [ - - ], - "query": "label_values(mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\"}, workload_type)", - "refresh": 1, - "regex": "", - "sort": 2, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false - }, { "allValue": null, "auto": false, @@ -13220,11 +13749,11 @@ items: "options": [ { "selected": true, - "text": "4h", - "value": "4h" + "text": "$__interval", + "value": "$__interval" } ], - "query": "4h", + "query": "$__interval", "refresh": 2, "regex": "", "skipUrlSync": false, @@ -13236,6 +13765,114 @@ items: "tagsQuery": "", "type": "interval", "useTags": false + }, + { + "allValue": null, + "current": { + "text": "", + "value": "" + }, + "datasource": "$datasource", + "hide": 2, + "includeAll": false, + "label": null, + "multi": false, + "name": "cluster", + "options": [ + + ], + "query": "label_values(kube_pod_info, cluster)", + "refresh": 1, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + "text": "", + "value": "" + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "namespace", + "options": [ + + ], + "query": "label_values(kube_pod_info{cluster=\"$cluster\"}, namespace)", + "refresh": 1, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + "text": "", + "value": "" + }, + "datasource": "$datasource", + "hide": 2, + "includeAll": false, + "label": null, + "multi": false, + "name": "workload", + "options": [ + + ], + "query": "label_values(mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\"}, workload)", + "refresh": 1, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + "text": "", + "value": "" + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "type", + "options": [ + + ], + "query": "label_values(mixin_pod_workload{cluster=\"$cluster\", namespace=\"$namespace\", workload=\"$workload\"}, workload_type)", + "refresh": 1, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false } ] }, @@ -13329,7 +13966,26 @@ items: "points": false, "renderer": "flot", "seriesOverrides": [ - + { + "alias": "quota - requests", + "color": "#F2495C", + "dashes": true, + "fill": 0, + "hideTooltip": true, + "legend": false, + "linewidth": 2, + "stack": false + }, + { + "alias": "quota - limits", + "color": "#FF9830", + "dashes": true, + "fill": 0, + "hideTooltip": true, + "legend": false, + "linewidth": 2, + "stack": false + } ], "spaceLength": 10, "span": 12, @@ -13343,6 +13999,22 @@ items: "legendFormat": "{{workload}} - {{workload_type}}", "legendLink": null, "step": 10 + }, + { + "expr": "scalar(kube_resourcequota{cluster=\"$cluster\", namespace=\"$namespace\", type=\"hard\",resource=\"requests.cpu\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "quota - requests", + "legendLink": null, + "step": 10 + }, + { + "expr": "scalar(kube_resourcequota{cluster=\"$cluster\", namespace=\"$namespace\", type=\"hard\",resource=\"limits.cpu\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "quota - limits", + "legendLink": null, + "step": 10 } ], "thresholds": [ @@ -13739,7 +14411,26 @@ items: "points": false, "renderer": "flot", "seriesOverrides": [ - + { + "alias": "quota - requests", + "color": "#F2495C", + "dashes": true, + "fill": 0, + "hideTooltip": true, + "legend": false, + "linewidth": 2, + "stack": false + }, + { + "alias": "quota - limits", + "color": "#FF9830", + "dashes": true, + "fill": 0, + "hideTooltip": true, + "legend": false, + "linewidth": 2, + "stack": false + } ], "spaceLength": 10, "span": 12, @@ -13753,6 +14444,22 @@ items: "legendFormat": "{{workload}} - {{workload_type}}", "legendLink": null, "step": 10 + }, + { + "expr": "scalar(kube_resourcequota{cluster=\"$cluster\", namespace=\"$namespace\", type=\"hard\",resource=\"requests.memory\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "quota - requests", + "legendLink": null, + "step": 10 + }, + { + "expr": "scalar(kube_resourcequota{cluster=\"$cluster\", namespace=\"$namespace\", type=\"hard\",resource=\"limits.memory\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "quota - limits", + "legendLink": null, + "step": 10 } ], "thresholds": [ @@ -15235,60 +15942,6 @@ items: "regex": "", "type": "datasource" }, - { - "allValue": null, - "current": { - "text": "prod", - "value": "prod" - }, - "datasource": "$datasource", - "hide": 2, - "includeAll": false, - "label": "cluster", - "multi": false, - "name": "cluster", - "options": [ - - ], - "query": "label_values(kube_pod_info, cluster)", - "refresh": 1, - "regex": "", - "sort": 2, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": null, - "current": { - "text": "prod", - "value": "prod" - }, - "datasource": "$datasource", - "hide": 0, - "includeAll": false, - "label": "namespace", - "multi": false, - "name": "namespace", - "options": [ - - ], - "query": "label_values(kube_pod_info{cluster=\"$cluster\"}, namespace)", - "refresh": 1, - "regex": "", - "sort": 2, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false - }, { "allValue": null, "auto": false, @@ -15307,11 +15960,11 @@ items: "options": [ { "selected": true, - "text": "4h", - "value": "4h" + "text": "$__interval", + "value": "$__interval" } ], - "query": "4h", + "query": "$__interval", "refresh": 2, "regex": "", "skipUrlSync": false, @@ -15351,6 +16004,60 @@ items: "tagValuesQuery": "", "tags": [ + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + "text": "", + "value": "" + }, + "datasource": "$datasource", + "hide": 2, + "includeAll": false, + "label": null, + "multi": false, + "name": "cluster", + "options": [ + + ], + "query": "label_values(kube_pod_info, cluster)", + "refresh": 1, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + "text": "", + "value": "" + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "namespace", + "options": [ + + ], + "query": "label_values(kube_pod_info{cluster=\"$cluster\"}, namespace)", + "refresh": 1, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ + ], "tagsQuery": "", "type": "query", @@ -15419,7 +16126,7 @@ items: "links": [ ], - "refresh": "", + "refresh": "10s", "rows": [ { "collapse": false, @@ -19105,7 +19812,7 @@ items: "type": "row" } ], - "refresh": "30s", + "refresh": "10s", "rows": [ ], @@ -20744,7 +21451,7 @@ items: "type": "row" } ], - "refresh": "30s", + "refresh": "10s", "rows": [ ], @@ -23872,7 +24579,7 @@ items: "links": [ ], - "refresh": "", + "refresh": "10s", "rows": [ { "collapse": false, @@ -25369,7 +26076,7 @@ items: "type": "row" } ], - "refresh": "30s", + "refresh": "10s", "rows": [ ], @@ -28433,7 +29140,7 @@ items: "links": [ ], - "refresh": "", + "refresh": "10s", "rows": [ { "collapse": false, @@ -29628,7 +30335,7 @@ items: "links": [ ], - "refresh": "", + "refresh": "10s", "rows": [ { "collapse": false, @@ -31780,7 +32487,7 @@ items: "type": "row" } ], - "refresh": "30s", + "refresh": "10s", "rows": [ ], diff --git a/monitoring/manifests/prometheus-rules.yaml b/monitoring/manifests/prometheus-rules.yaml index 2c64e88..51fcf6e 100644 --- a/monitoring/manifests/prometheus-rules.yaml +++ b/monitoring/manifests/prometheus-rules.yaml @@ -209,23 +209,33 @@ spec: - expr: | sum by (cluster, namespace, pod, container) ( rate(container_cpu_usage_seconds_total{job="kubelet", metrics_path="/metrics/cadvisor", image!="", container!="POD"}[5m]) - ) * on (cluster, namespace, pod) group_left(node) max by(cluster, namespace, pod, node) (kube_pod_info) + ) * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) ( + 1, max by(cluster, namespace, pod, node) (kube_pod_info) + ) record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate - expr: | container_memory_working_set_bytes{job="kubelet", metrics_path="/metrics/cadvisor", image!=""} - * on (namespace, pod) group_left(node) max by(namespace, pod, node) (kube_pod_info) + * on (namespace, pod) group_left(node) topk by(namespace, pod) (1, + max by(namespace, pod, node) (kube_pod_info) + ) record: node_namespace_pod_container:container_memory_working_set_bytes - expr: | container_memory_rss{job="kubelet", metrics_path="/metrics/cadvisor", image!=""} - * on (namespace, pod) group_left(node) max by(namespace, pod, node) (kube_pod_info) + * on (namespace, pod) group_left(node) topk by(namespace, pod) (1, + max by(namespace, pod, node) (kube_pod_info) + ) record: node_namespace_pod_container:container_memory_rss - expr: | container_memory_cache{job="kubelet", metrics_path="/metrics/cadvisor", image!=""} - * on (namespace, pod) group_left(node) max by(namespace, pod, node) (kube_pod_info) + * on (namespace, pod) group_left(node) topk by(namespace, pod) (1, + max by(namespace, pod, node) (kube_pod_info) + ) record: node_namespace_pod_container:container_memory_cache - expr: | container_memory_swap{job="kubelet", metrics_path="/metrics/cadvisor", image!=""} - * on (namespace, pod) group_left(node) max by(namespace, pod, node) (kube_pod_info) + * on (namespace, pod) group_left(node) topk by(namespace, pod) (1, + max by(namespace, pod, node) (kube_pod_info) + ) record: node_namespace_pod_container:container_memory_swap - expr: | sum(container_memory_usage_bytes{job="kubelet", metrics_path="/metrics/cadvisor", image!="", container!="POD"}) by (namespace) @@ -253,35 +263,39 @@ spec: ) record: namespace:kube_pod_container_resource_requests_cpu_cores:sum - expr: | - sum( + max by (cluster, namespace, workload, pod) ( label_replace( label_replace( kube_pod_owner{job="kube-state-metrics", owner_kind="ReplicaSet"}, "replicaset", "$1", "owner_name", "(.*)" - ) * on(replicaset, namespace) group_left(owner_name) kube_replicaset_owner{job="kube-state-metrics"}, + ) * on(replicaset, namespace) group_left(owner_name) topk by(replicaset, namespace) ( + 1, max by (replicaset, namespace, owner_name) ( + kube_replicaset_owner{job="kube-state-metrics"} + ) + ), "workload", "$1", "owner_name", "(.*)" ) - ) by (cluster, namespace, workload, pod) + ) labels: workload_type: deployment record: mixin_pod_workload - expr: | - sum( + max by (cluster, namespace, workload, pod) ( label_replace( kube_pod_owner{job="kube-state-metrics", owner_kind="DaemonSet"}, "workload", "$1", "owner_name", "(.*)" ) - ) by (cluster, namespace, workload, pod) + ) labels: workload_type: daemonset record: mixin_pod_workload - expr: | - sum( + max by (cluster, namespace, workload, pod) ( label_replace( kube_pod_owner{job="kube-state-metrics", owner_kind="StatefulSet"}, "workload", "$1", "owner_name", "(.*)" ) - ) by (cluster, namespace, workload, pod) + ) labels: workload_type: statefulset record: mixin_pod_workload @@ -338,7 +352,10 @@ spec: sum(min(kube_pod_info) by (cluster, node)) record: ':kube_pod_info_node_count:' - expr: | - max(label_replace(kube_pod_info{job="kube-state-metrics"}, "pod", "$1", "pod", "(.*)")) by (node, namespace, pod) + topk by(namespace, pod) (1, + max by (node, namespace, pod) ( + label_replace(kube_pod_info{job="kube-state-metrics"}, "pod", "$1", "pod", "(.*)") + )) record: 'node_namespace_pod:kube_pod_info:' - expr: | count by (cluster, node) (sum by (node, cpu) ( @@ -1164,6 +1181,16 @@ spec: for: 5m labels: severity: warning + - alert: KubeletPodStartUpLatencyHigh + annotations: + message: Kubelet Pod startup 99th percentile latency is {{ $value }} seconds + on node {{ $labels.node }}. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletpodstartuplatencyhigh + expr: | + histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (instance, le)) * on(instance) group_left(node) kubelet_node_name > 5 + for: 15m + labels: + severity: warning - alert: KubeletDown annotations: message: Kubelet has disappeared from Prometheus target discovery. diff --git a/monitoring/vendor/kubernetes-mixin/alerts/kubelet.libsonnet b/monitoring/vendor/kubernetes-mixin/alerts/kubelet.libsonnet index 58532e9..c6b19e1 100644 --- a/monitoring/vendor/kubernetes-mixin/alerts/kubelet.libsonnet +++ b/monitoring/vendor/kubernetes-mixin/alerts/kubelet.libsonnet @@ -74,6 +74,19 @@ message: 'The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration of {{ $value }} seconds on node {{ $labels.node }}.', }, }, + { + alert: 'KubeletPodStartUpLatencyHigh', + expr: ||| + histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{%(kubeletSelector)s}[5m])) by (instance, le)) * on(instance) group_left(node) kubelet_node_name > 5 + ||| % $._config, + 'for': '15m', + labels: { + severity: 'warning', + }, + annotations: { + message: 'Kubelet Pod startup 99th percentile latency is {{ $value }} seconds on node {{ $labels.node }}.', + }, + }, (import '../lib/absent_alert.libsonnet') { componentName:: 'Kubelet', selector:: $._config.kubeletSelector, diff --git a/monitoring/vendor/kubernetes-mixin/config.libsonnet b/monitoring/vendor/kubernetes-mixin/config.libsonnet index f66bc45..2bfd5f3 100644 --- a/monitoring/vendor/kubernetes-mixin/config.libsonnet +++ b/monitoring/vendor/kubernetes-mixin/config.libsonnet @@ -64,6 +64,9 @@ local slo = import 'slo-libsonnet/slo.libsonnet'; // For links between grafana dashboards, you need to tell us if your grafana // servers under some non-root path. linkPrefix: '.', + + // The default refresh time for all dashboards, default to 10s + refresh: '10s', }, // Opt-in to multiCluster dashboards by overriding this and the clusterLabel. diff --git a/monitoring/vendor/kubernetes-mixin/dashboards/apiserver.libsonnet b/monitoring/vendor/kubernetes-mixin/dashboards/apiserver.libsonnet index 6aa177f..90d13de 100644 --- a/monitoring/vendor/kubernetes-mixin/dashboards/apiserver.libsonnet +++ b/monitoring/vendor/kubernetes-mixin/dashboards/apiserver.libsonnet @@ -208,6 +208,6 @@ local singlestat = grafana.singlestat; .addPanel(memory) .addPanel(cpu) .addPanel(goroutines) - ), + ) + { refresh: $._config.grafanaK8s.refresh }, }, } diff --git a/monitoring/vendor/kubernetes-mixin/dashboards/controller-manager.libsonnet b/monitoring/vendor/kubernetes-mixin/dashboards/controller-manager.libsonnet index 24b96f0..905cf3a 100644 --- a/monitoring/vendor/kubernetes-mixin/dashboards/controller-manager.libsonnet +++ b/monitoring/vendor/kubernetes-mixin/dashboards/controller-manager.libsonnet @@ -180,6 +180,6 @@ local singlestat = grafana.singlestat; .addPanel(memory) .addPanel(cpu) .addPanel(goroutines) - ), + ) + { refresh: $._config.grafanaK8s.refresh }, }, } diff --git a/monitoring/vendor/kubernetes-mixin/dashboards/kubelet.libsonnet b/monitoring/vendor/kubernetes-mixin/dashboards/kubelet.libsonnet index 7ce550d..fd50b6b 100644 --- a/monitoring/vendor/kubernetes-mixin/dashboards/kubelet.libsonnet +++ b/monitoring/vendor/kubernetes-mixin/dashboards/kubelet.libsonnet @@ -413,6 +413,6 @@ local singlestat = grafana.singlestat; .addPanel(memory) .addPanel(cpu) .addPanel(goroutines) - ), + ) + { refresh: $._config.grafanaK8s.refresh }, }, } diff --git a/monitoring/vendor/kubernetes-mixin/dashboards/network-usage/cluster-total.libsonnet b/monitoring/vendor/kubernetes-mixin/dashboards/network-usage/cluster-total.libsonnet index f5fd099..73786f9 100644 --- a/monitoring/vendor/kubernetes-mixin/dashboards/network-usage/cluster-total.libsonnet +++ b/monitoring/vendor/kubernetes-mixin/dashboards/network-usage/cluster-total.libsonnet @@ -343,7 +343,7 @@ local gauge = promgrafonnet.gauge; tags=($._config.grafanaK8s.dashboardTags), editable=true, schemaVersion=18, - refresh='30s', + refresh=($._config.grafanaK8s.refresh), time_from='now-1h', time_to='now', ) diff --git a/monitoring/vendor/kubernetes-mixin/dashboards/network-usage/namespace-by-pod.libsonnet b/monitoring/vendor/kubernetes-mixin/dashboards/network-usage/namespace-by-pod.libsonnet index fce3a06..ca0b89a 100644 --- a/monitoring/vendor/kubernetes-mixin/dashboards/network-usage/namespace-by-pod.libsonnet +++ b/monitoring/vendor/kubernetes-mixin/dashboards/network-usage/namespace-by-pod.libsonnet @@ -345,7 +345,7 @@ local gauge = promgrafonnet.gauge; tags=($._config.grafanaK8s.dashboardTags), editable=true, schemaVersion=18, - refresh='30s', + refresh=($._config.grafanaK8s.refresh), time_from='now-1h', time_to='now', ) diff --git a/monitoring/vendor/kubernetes-mixin/dashboards/network-usage/namespace-by-workload.libsonnet b/monitoring/vendor/kubernetes-mixin/dashboards/network-usage/namespace-by-workload.libsonnet index 802d914..1bd5e36 100644 --- a/monitoring/vendor/kubernetes-mixin/dashboards/network-usage/namespace-by-workload.libsonnet +++ b/monitoring/vendor/kubernetes-mixin/dashboards/network-usage/namespace-by-workload.libsonnet @@ -373,7 +373,7 @@ local gauge = promgrafonnet.gauge; tags=($._config.grafanaK8s.dashboardTags), editable=true, schemaVersion=18, - refresh='30s', + refresh=($._config.grafanaK8s.refresh), time_from='now-1h', time_to='now', ) diff --git a/monitoring/vendor/kubernetes-mixin/dashboards/network-usage/pod-total.libsonnet b/monitoring/vendor/kubernetes-mixin/dashboards/network-usage/pod-total.libsonnet index c5c7715..913c054 100644 --- a/monitoring/vendor/kubernetes-mixin/dashboards/network-usage/pod-total.libsonnet +++ b/monitoring/vendor/kubernetes-mixin/dashboards/network-usage/pod-total.libsonnet @@ -242,7 +242,7 @@ local gauge = promgrafonnet.gauge; tags=($._config.grafanaK8s.dashboardTags), editable=true, schemaVersion=18, - refresh='30s', + refresh=($._config.grafanaK8s.refresh), time_from='now-1h', time_to='now', ) diff --git a/monitoring/vendor/kubernetes-mixin/dashboards/network-usage/workload-total.libsonnet b/monitoring/vendor/kubernetes-mixin/dashboards/network-usage/workload-total.libsonnet index 0ac399e..7575574 100644 --- a/monitoring/vendor/kubernetes-mixin/dashboards/network-usage/workload-total.libsonnet +++ b/monitoring/vendor/kubernetes-mixin/dashboards/network-usage/workload-total.libsonnet @@ -257,7 +257,7 @@ local gauge = promgrafonnet.gauge; tags=($._config.grafanaK8s.dashboardTags), editable=true, schemaVersion=18, - refresh='30s', + refresh=($._config.grafanaK8s.refresh), time_from='now-1h', time_to='now', ) diff --git a/monitoring/vendor/kubernetes-mixin/dashboards/persistentvolumesusage.libsonnet b/monitoring/vendor/kubernetes-mixin/dashboards/persistentvolumesusage.libsonnet index 0e98bf6..1b8c4da 100644 --- a/monitoring/vendor/kubernetes-mixin/dashboards/persistentvolumesusage.libsonnet +++ b/monitoring/vendor/kubernetes-mixin/dashboards/persistentvolumesusage.libsonnet @@ -166,6 +166,6 @@ local gauge = promgrafonnet.gauge; row.new() .addPanel(inodesGraph) .addPanel(inodeGauge) - ), + ) + { refresh: $._config.grafanaK8s.refresh }, }, } diff --git a/monitoring/vendor/kubernetes-mixin/dashboards/proxy.libsonnet b/monitoring/vendor/kubernetes-mixin/dashboards/proxy.libsonnet index 0739f25..80b6c63 100644 --- a/monitoring/vendor/kubernetes-mixin/dashboards/proxy.libsonnet +++ b/monitoring/vendor/kubernetes-mixin/dashboards/proxy.libsonnet @@ -186,6 +186,6 @@ local singlestat = grafana.singlestat; .addPanel(memory) .addPanel(cpu) .addPanel(goroutines) - ), + ) + { refresh: $._config.grafanaK8s.refresh }, }, } diff --git a/monitoring/vendor/kubernetes-mixin/dashboards/resources.libsonnet b/monitoring/vendor/kubernetes-mixin/dashboards/resources.libsonnet index 10f9f7e..484d2f1 100644 --- a/monitoring/vendor/kubernetes-mixin/dashboards/resources.libsonnet +++ b/monitoring/vendor/kubernetes-mixin/dashboards/resources.libsonnet @@ -4,4 +4,4 @@ (import 'resources/node.libsonnet') + (import 'resources/pod.libsonnet') + (import 'resources/workload-namespace.libsonnet') + -(import 'resources/workload.libsonnet') \ No newline at end of file +(import 'resources/workload.libsonnet') diff --git a/monitoring/vendor/kubernetes-mixin/dashboards/resources/cluster.libsonnet b/monitoring/vendor/kubernetes-mixin/dashboards/resources/cluster.libsonnet index 18143d2..21b2aa1 100644 --- a/monitoring/vendor/kubernetes-mixin/dashboards/resources/cluster.libsonnet +++ b/monitoring/vendor/kubernetes-mixin/dashboards/resources/cluster.libsonnet @@ -3,272 +3,289 @@ local grafana = import 'grafonnet/grafana.libsonnet'; local template = grafana.template; { - grafanaDashboards+:: { - local intervalTemplate = - template.new( - name='interval', - datasource='$datasource', - query='4h', - current='5m', - hide=2, - refresh=2, - includeAll=false, - sort=1 - ) + { - auto: false, - auto_count: 30, - auto_min: '10s', - skipUrlSync: false, - type: 'interval', - options: [ - { - selected: true, - text: '4h', - value: '4h', - }, - ], + grafanaDashboards+:: { + local intervalTemplate = + template.new( + name='interval', + datasource='$datasource', + query='$__interval', + current='5m', + hide=2, + refresh=2, + includeAll=false, + sort=1 + ) + { + auto: false, + auto_count: 30, + auto_min: '10s', + skipUrlSync: false, + type: 'interval', + options: [ + { + selected: true, + text: '$__interval', + value: '$__interval', + }, + ], + }, + + local clusterTemplate = + template.new( + name='cluster', + datasource='$datasource', + query='label_values(node_cpu_seconds_total, %s)' % $._config.clusterLabel, + current='', + hide=if $._config.showMultiCluster then '' else '2', + refresh=2, + includeAll=false, + sort=1 + ), + + 'k8s-resources-cluster.json': + local tableStyles = { + namespace: { + alias: 'Namespace', + link: '%(prefix)s/d/%(uid)s/k8s-resources-namespace?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$__cell' % { prefix: $._config.grafanaK8s.linkPrefix, uid: std.md5('k8s-resources-namespace.json') }, + linkTooltip: 'Drill down to pods', }, + 'Value #A': { + alias: 'Pods', + linkTooltip: 'Drill down to pods', + link: '%(prefix)s/d/%(uid)s/k8s-resources-namespace?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$__cell_1' % { prefix: $._config.grafanaK8s.linkPrefix, uid: std.md5('k8s-resources-namespace.json') }, + decimals: 0, + }, + 'Value #B': { + alias: 'Workloads', + linkTooltip: 'Drill down to workloads', + link: '%(prefix)s/d/%(uid)s/k8s-resources-workloads-namespace?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$__cell_1' % { prefix: $._config.grafanaK8s.linkPrefix, uid: std.md5('k8s-resources-workloads-namespace.json') }, + decimals: 0, + }, + }; - 'k8s-resources-cluster.json': - local tableStyles = { - namespace: { - alias: 'Namespace', - link: '%(prefix)s/d/%(uid)s/k8s-resources-namespace?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$__cell' % { prefix: $._config.grafanaK8s.linkPrefix, uid: std.md5('k8s-resources-namespace.json') }, - linkTooltip: 'Drill down to pods', - }, - 'Value #A': { - alias: 'Pods', - linkTooltip: 'Drill down to pods', - link: '%(prefix)s/d/%(uid)s/k8s-resources-namespace?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$__cell_1' % { prefix: $._config.grafanaK8s.linkPrefix, uid: std.md5('k8s-resources-namespace.json') }, - decimals: 0, - }, - 'Value #B': { - alias: 'Workloads', - linkTooltip: 'Drill down to workloads', - link: '%(prefix)s/d/%(uid)s/k8s-resources-workloads-namespace?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$__cell_1' % { prefix: $._config.grafanaK8s.linkPrefix, uid: std.md5('k8s-resources-workloads-namespace.json') }, - decimals: 0, - }, - }; - local podWorkloadColumns = [ - 'count(mixin_pod_workload{%(clusterLabel)s="$cluster"}) by (namespace)' % $._config, - 'count(avg(mixin_pod_workload{%(clusterLabel)s="$cluster"}) by (workload, namespace)) by (namespace)' % $._config, - ]; + local podWorkloadColumns = [ + 'count(mixin_pod_workload{%(clusterLabel)s="$cluster"}) by (namespace)' % $._config, + 'count(avg(mixin_pod_workload{%(clusterLabel)s="$cluster"}) by (workload, namespace)) by (namespace)' % $._config, + ]; - local networkColumns = [ - 'sum(irate(container_network_receive_bytes_total{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~".+"}[$interval])) by (namespace)' % $._config, - 'sum(irate(container_network_transmit_bytes_total{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~".+"}[$interval])) by (namespace)' % $._config, - 'sum(irate(container_network_receive_packets_total{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~".+"}[$interval])) by (namespace)' % $._config, - 'sum(irate(container_network_transmit_packets_total{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~".+"}[$interval])) by (namespace)' % $._config, - 'sum(irate(container_network_receive_packets_dropped_total{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~".+"}[$interval])) by (namespace)' % $._config, - 'sum(irate(container_network_transmit_packets_dropped_total{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~".+"}[$interval])) by (namespace)' % $._config, - ]; + local networkColumns = [ + 'sum(irate(container_network_receive_bytes_total{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~".+"}[$interval])) by (namespace)' % $._config, + 'sum(irate(container_network_transmit_bytes_total{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~".+"}[$interval])) by (namespace)' % $._config, + 'sum(irate(container_network_receive_packets_total{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~".+"}[$interval])) by (namespace)' % $._config, + 'sum(irate(container_network_transmit_packets_total{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~".+"}[$interval])) by (namespace)' % $._config, + 'sum(irate(container_network_receive_packets_dropped_total{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~".+"}[$interval])) by (namespace)' % $._config, + 'sum(irate(container_network_transmit_packets_dropped_total{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~".+"}[$interval])) by (namespace)' % $._config, + ]; - local networkTableStyles = { - namespace: { - alias: 'Namespace', - link: '%(prefix)s/d/%(uid)s/k8s-resources-namespace?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$__cell' % { prefix: $._config.grafanaK8s.linkPrefix, uid: std.md5('k8s-resources-namespace.json') }, - linkTooltip: 'Drill down to pods', - }, - 'Value #A': { - alias: 'Current Receive Bandwidth', - unit: 'Bps', - }, - 'Value #B': { - alias: 'Current Transmit Bandwidth', - unit: 'Bps', - }, - 'Value #C': { - alias: 'Rate of Received Packets', - unit: 'pps', - }, - 'Value #D': { - alias: 'Rate of Transmitted Packets', - unit: 'pps', - }, - 'Value #E': { - alias: 'Rate of Received Packets Dropped', - unit: 'pps', - }, - 'Value #F': { - alias: 'Rate of Transmitted Packets Dropped', - unit: 'pps', - }, - }; + local networkTableStyles = { + namespace: { + alias: 'Namespace', + link: '%(prefix)s/d/%(uid)s/k8s-resources-namespace?var-datasource=$datasource&var-cluster=$cluster&var-namespace=$__cell' % { prefix: $._config.grafanaK8s.linkPrefix, uid: std.md5('k8s-resources-namespace.json') }, + linkTooltip: 'Drill down to pods', + }, + 'Value #A': { + alias: 'Current Receive Bandwidth', + unit: 'Bps', + }, + 'Value #B': { + alias: 'Current Transmit Bandwidth', + unit: 'Bps', + }, + 'Value #C': { + alias: 'Rate of Received Packets', + unit: 'pps', + }, + 'Value #D': { + alias: 'Rate of Transmitted Packets', + unit: 'pps', + }, + 'Value #E': { + alias: 'Rate of Received Packets Dropped', + unit: 'pps', + }, + 'Value #F': { + alias: 'Rate of Transmitted Packets Dropped', + unit: 'pps', + }, + }; - g.dashboard( - '%(dashboardNamePrefix)sCompute Resources / Cluster' % $._config.grafanaK8s, - uid=($._config.grafanaDashboardIDs['k8s-resources-cluster.json']), - ).addTemplate('cluster', 'node_cpu_seconds_total', $._config.clusterLabel, hide=if $._config.showMultiCluster then 0 else 2) - .addRow( - (g.row('Headlines') + - { - height: '100px', - showTitle: false, - }) - .addPanel( - g.panel('CPU Utilisation') + - g.statPanel('1 - avg(rate(node_cpu_seconds_total{mode="idle", %(clusterLabel)s="$cluster"}[1m]))' % $._config) - ) - .addPanel( - g.panel('CPU Requests Commitment') + - g.statPanel('sum(kube_pod_container_resource_requests_cpu_cores{%(clusterLabel)s="$cluster"}) / sum(kube_node_status_allocatable_cpu_cores{%(clusterLabel)s="$cluster"})' % $._config) - ) - .addPanel( - g.panel('CPU Limits Commitment') + - g.statPanel('sum(kube_pod_container_resource_limits_cpu_cores{%(clusterLabel)s="$cluster"}) / sum(kube_node_status_allocatable_cpu_cores{%(clusterLabel)s="$cluster"})' % $._config) - ) - .addPanel( - g.panel('Memory Utilisation') + - g.statPanel('1 - sum(:node_memory_MemAvailable_bytes:sum{%(clusterLabel)s="$cluster"}) / sum(kube_node_status_allocatable_memory_bytes{%(clusterLabel)s="$cluster"})' % $._config) - ) - .addPanel( - g.panel('Memory Requests Commitment') + - g.statPanel('sum(kube_pod_container_resource_requests_memory_bytes{%(clusterLabel)s="$cluster"}) / sum(kube_node_status_allocatable_memory_bytes{%(clusterLabel)s="$cluster"})' % $._config) - ) - .addPanel( - g.panel('Memory Limits Commitment') + - g.statPanel('sum(kube_pod_container_resource_limits_memory_bytes{%(clusterLabel)s="$cluster"}) / sum(kube_node_status_allocatable_memory_bytes{%(clusterLabel)s="$cluster"})' % $._config) - ) + g.dashboard( + '%(dashboardNamePrefix)sCompute Resources / Cluster' % $._config.grafanaK8s, + uid=($._config.grafanaDashboardIDs['k8s-resources-cluster.json']), + ).addTemplate('cluster', 'node_cpu_seconds_total', $._config.clusterLabel, hide=if $._config.showMultiCluster then 0 else 2) + .addRow( + (g.row('Headlines') + + { + height: '100px', + showTitle: false, + }) + .addPanel( + g.panel('CPU Utilisation') + + g.statPanel('1 - avg(rate(node_cpu_seconds_total{mode="idle", %(clusterLabel)s="$cluster"}[$interval]))' % $._config) ) - .addRow( - g.row('CPU') - .addPanel( - g.panel('CPU Usage') + - g.queryPanel('sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{%(clusterLabel)s="$cluster"}) by (namespace)' % $._config, '{{namespace}}') + - g.stack - ) + .addPanel( + g.panel('CPU Requests Commitment') + + g.statPanel('sum(kube_pod_container_resource_requests_cpu_cores{%(clusterLabel)s="$cluster"}) / sum(kube_node_status_allocatable_cpu_cores{%(clusterLabel)s="$cluster"})' % $._config) ) - .addRow( - g.row('CPU Quota') - .addPanel( - g.panel('CPU Quota') + - g.tablePanel(podWorkloadColumns + [ - 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{%(clusterLabel)s="$cluster"}) by (namespace)' % $._config, - 'sum(kube_pod_container_resource_requests_cpu_cores{%(clusterLabel)s="$cluster"}) by (namespace)' % $._config, - 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{%(clusterLabel)s="$cluster"}) by (namespace) / sum(kube_pod_container_resource_requests_cpu_cores{%(clusterLabel)s="$cluster"}) by (namespace)' % $._config, - 'sum(kube_pod_container_resource_limits_cpu_cores{%(clusterLabel)s="$cluster"}) by (namespace)' % $._config, - 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{%(clusterLabel)s="$cluster"}) by (namespace) / sum(kube_pod_container_resource_limits_cpu_cores{%(clusterLabel)s="$cluster"}) by (namespace)' % $._config, - ], tableStyles { - 'Value #C': { alias: 'CPU Usage' }, - 'Value #D': { alias: 'CPU Requests' }, - 'Value #E': { alias: 'CPU Requests %', unit: 'percentunit' }, - 'Value #F': { alias: 'CPU Limits' }, - 'Value #G': { alias: 'CPU Limits %', unit: 'percentunit' }, - }) - ) + .addPanel( + g.panel('CPU Limits Commitment') + + g.statPanel('sum(kube_pod_container_resource_limits_cpu_cores{%(clusterLabel)s="$cluster"}) / sum(kube_node_status_allocatable_cpu_cores{%(clusterLabel)s="$cluster"})' % $._config) ) - .addRow( - g.row('Memory') - .addPanel( - g.panel('Memory Usage (w/o cache)') + + .addPanel( + g.panel('Memory Utilisation') + + g.statPanel('1 - sum(:node_memory_MemAvailable_bytes:sum{%(clusterLabel)s="$cluster"}) / sum(kube_node_status_allocatable_memory_bytes{%(clusterLabel)s="$cluster"})' % $._config) + ) + .addPanel( + g.panel('Memory Requests Commitment') + + g.statPanel('sum(kube_pod_container_resource_requests_memory_bytes{%(clusterLabel)s="$cluster"}) / sum(kube_node_status_allocatable_memory_bytes{%(clusterLabel)s="$cluster"})' % $._config) + ) + .addPanel( + g.panel('Memory Limits Commitment') + + g.statPanel('sum(kube_pod_container_resource_limits_memory_bytes{%(clusterLabel)s="$cluster"}) / sum(kube_node_status_allocatable_memory_bytes{%(clusterLabel)s="$cluster"})' % $._config) + ) + ) + .addRow( + g.row('CPU') + .addPanel( + g.panel('CPU Usage') + + g.queryPanel('sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{%(clusterLabel)s="$cluster"}) by (namespace)' % $._config, '{{namespace}}') + + g.stack + ) + ) + .addRow( + g.row('CPU Quota') + .addPanel( + g.panel('CPU Quota') + + g.tablePanel(podWorkloadColumns + [ + 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{%(clusterLabel)s="$cluster"}) by (namespace)' % $._config, + 'sum(kube_pod_container_resource_requests_cpu_cores{%(clusterLabel)s="$cluster"}) by (namespace)' % $._config, + 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{%(clusterLabel)s="$cluster"}) by (namespace) / sum(kube_pod_container_resource_requests_cpu_cores{%(clusterLabel)s="$cluster"}) by (namespace)' % $._config, + 'sum(kube_pod_container_resource_limits_cpu_cores{%(clusterLabel)s="$cluster"}) by (namespace)' % $._config, + 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{%(clusterLabel)s="$cluster"}) by (namespace) / sum(kube_pod_container_resource_limits_cpu_cores{%(clusterLabel)s="$cluster"}) by (namespace)' % $._config, + ], tableStyles { + 'Value #C': { alias: 'CPU Usage' }, + 'Value #D': { alias: 'CPU Requests' }, + 'Value #E': { alias: 'CPU Requests %', unit: 'percentunit' }, + 'Value #F': { alias: 'CPU Limits' }, + 'Value #G': { alias: 'CPU Limits %', unit: 'percentunit' }, + }) + ) + ) + .addRow( + g.row('Memory') + .addPanel( + g.panel('Memory Usage (w/o cache)') + + // Not using container_memory_usage_bytes here because that includes page cache + g.queryPanel('sum(container_memory_rss{%(clusterLabel)s="$cluster", container!=""}) by (namespace)' % $._config, '{{namespace}}') + + g.stack + + { yaxes: g.yaxes('bytes') }, + ) + ) + .addRow( + g.row('Memory Requests') + .addPanel( + g.panel('Requests by Namespace') + + g.tablePanel(podWorkloadColumns + [ // Not using container_memory_usage_bytes here because that includes page cache - g.queryPanel('sum(container_memory_rss{%(clusterLabel)s="$cluster", container!=""}) by (namespace)' % $._config, '{{namespace}}') + - g.stack + - { yaxes: g.yaxes('bytes') }, - ) + 'sum(container_memory_rss{%(clusterLabel)s="$cluster", container!=""}) by (namespace)' % $._config, + 'sum(kube_pod_container_resource_requests_memory_bytes{%(clusterLabel)s="$cluster"}) by (namespace)' % $._config, + 'sum(container_memory_rss{%(clusterLabel)s="$cluster", container!=""}) by (namespace) / sum(kube_pod_container_resource_requests_memory_bytes{%(clusterLabel)s="$cluster"}) by (namespace)' % $._config, + 'sum(kube_pod_container_resource_limits_memory_bytes{%(clusterLabel)s="$cluster"}) by (namespace)' % $._config, + 'sum(container_memory_rss{%(clusterLabel)s="$cluster", container!=""}) by (namespace) / sum(kube_pod_container_resource_limits_memory_bytes{%(clusterLabel)s="$cluster"}) by (namespace)' % $._config, + ], tableStyles { + 'Value #C': { alias: 'Memory Usage', unit: 'bytes' }, + 'Value #D': { alias: 'Memory Requests', unit: 'bytes' }, + 'Value #E': { alias: 'Memory Requests %', unit: 'percentunit' }, + 'Value #F': { alias: 'Memory Limits', unit: 'bytes' }, + 'Value #G': { alias: 'Memory Limits %', unit: 'percentunit' }, + }) ) - .addRow( - g.row('Memory Requests') - .addPanel( - g.panel('Requests by Namespace') + - g.tablePanel(podWorkloadColumns + [ - // Not using container_memory_usage_bytes here because that includes page cache - 'sum(container_memory_rss{%(clusterLabel)s="$cluster", container!=""}) by (namespace)' % $._config, - 'sum(kube_pod_container_resource_requests_memory_bytes{%(clusterLabel)s="$cluster"}) by (namespace)' % $._config, - 'sum(container_memory_rss{%(clusterLabel)s="$cluster", container!=""}) by (namespace) / sum(kube_pod_container_resource_requests_memory_bytes{%(clusterLabel)s="$cluster"}) by (namespace)' % $._config, - 'sum(kube_pod_container_resource_limits_memory_bytes{%(clusterLabel)s="$cluster"}) by (namespace)' % $._config, - 'sum(container_memory_rss{%(clusterLabel)s="$cluster", container!=""}) by (namespace) / sum(kube_pod_container_resource_limits_memory_bytes{%(clusterLabel)s="$cluster"}) by (namespace)' % $._config, - ], tableStyles { - 'Value #C': { alias: 'Memory Usage', unit: 'bytes' }, - 'Value #D': { alias: 'Memory Requests', unit: 'bytes' }, - 'Value #E': { alias: 'Memory Requests %', unit: 'percentunit' }, - 'Value #F': { alias: 'Memory Limits', unit: 'bytes' }, - 'Value #G': { alias: 'Memory Limits %', unit: 'percentunit' }, - }) - ) + ) + .addRow( + g.row('Network') + .addPanel( + g.panel('Current Network Usage') + + g.tablePanel( + networkColumns, + networkTableStyles + ), ) - .addRow( - g.row('Network') - .addPanel( - g.panel('Current Network Usage') + - g.tablePanel( - networkColumns, - networkTableStyles - ), - ) + ) + .addRow( + g.row('Network') + .addPanel( + g.panel('Receive Bandwidth') + + g.queryPanel('sum(irate(container_network_receive_bytes_total{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~".+"}[$interval])) by (namespace)' % $._config, '{{namespace}}') + + g.stack + + { yaxes: g.yaxes('Bps') }, ) - .addRow( - g.row('Network') - .addPanel( - g.panel('Receive Bandwidth') + - g.queryPanel('sum(irate(container_network_receive_bytes_total{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~".+"}[$interval])) by (namespace)' % $._config, '{{namespace}}') + - g.stack + - { yaxes: g.yaxes('Bps') }, - ) + ) + .addRow( + g.row('Network') + .addPanel( + g.panel('Transmit Bandwidth') + + g.queryPanel('sum(irate(container_network_transmit_bytes_total{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~".+"}[$interval])) by (namespace)' % $._config, '{{namespace}}') + + g.stack + + { yaxes: g.yaxes('Bps') }, ) - .addRow( - g.row('Network') - .addPanel( - g.panel('Transmit Bandwidth') + - g.queryPanel('sum(irate(container_network_transmit_bytes_total{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~".+"}[$interval])) by (namespace)' % $._config, '{{namespace}}') + - g.stack + - { yaxes: g.yaxes('Bps') }, - ) + ) + .addRow( + g.row('Network') + .addPanel( + g.panel('Average Container Bandwidth by Namespace: Received') + + g.queryPanel('avg(irate(container_network_receive_bytes_total{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~".+"}[$interval])) by (namespace)' % $._config, '{{namespace}}') + + g.stack + + { yaxes: g.yaxes('Bps') }, ) - .addRow( - g.row('Network') - .addPanel( - g.panel('Average Container Bandwidth by Namespace: Received') + - g.queryPanel('avg(irate(container_network_receive_bytes_total{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~".+"}[$interval])) by (namespace)' % $._config, '{{namespace}}') + - g.stack + - { yaxes: g.yaxes('Bps') }, - ) + ) + .addRow( + g.row('Network') + .addPanel( + g.panel('Average Container Bandwidth by Namespace: Transmitted') + + g.queryPanel('avg(irate(container_network_transmit_bytes_total{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~".+"}[$interval])) by (namespace)' % $._config, '{{namespace}}') + + g.stack + + { yaxes: g.yaxes('Bps') }, ) - .addRow( - g.row('Network') - .addPanel( - g.panel('Average Container Bandwidth by Namespace: Transmitted') + - g.queryPanel('avg(irate(container_network_transmit_bytes_total{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~".+"}[$interval])) by (namespace)' % $._config, '{{namespace}}') + - g.stack + - { yaxes: g.yaxes('Bps') }, - ) + ) + .addRow( + g.row('Network') + .addPanel( + g.panel('Rate of Received Packets') + + g.queryPanel('sum(irate(container_network_receive_packets_total{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~".+"}[$interval])) by (namespace)' % $._config, '{{namespace}}') + + g.stack + + { yaxes: g.yaxes('Bps') }, ) - .addRow( - g.row('Network') - .addPanel( - g.panel('Rate of Received Packets') + - g.queryPanel('sum(irate(container_network_receive_packets_total{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~".+"}[$interval])) by (namespace)' % $._config, '{{namespace}}') + - g.stack + - { yaxes: g.yaxes('Bps') }, - ) + ) + .addRow( + g.row('Network') + .addPanel( + g.panel('Rate of Transmitted Packets') + + g.queryPanel('sum(irate(container_network_receive_packets_total{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~".+"}[$interval])) by (namespace)' % $._config, '{{namespace}}') + + g.stack + + { yaxes: g.yaxes('Bps') }, ) - .addRow( - g.row('Network') - .addPanel( - g.panel('Rate of Transmitted Packets') + - g.queryPanel('sum(irate(container_network_receive_packets_total{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~".+"}[$interval])) by (namespace)' % $._config, '{{namespace}}') + - g.stack + - { yaxes: g.yaxes('Bps') }, - ) + ) + .addRow( + g.row('Network') + .addPanel( + g.panel('Rate of Received Packets Dropped') + + g.queryPanel('sum(irate(container_network_receive_packets_dropped_total{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~".+"}[$interval])) by (namespace)' % $._config, '{{namespace}}') + + g.stack + + { yaxes: g.yaxes('Bps') }, ) - .addRow( - g.row('Network') - .addPanel( - g.panel('Rate of Received Packets Dropped') + - g.queryPanel('sum(irate(container_network_receive_packets_dropped_total{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~".+"}[$interval])) by (namespace)' % $._config, '{{namespace}}') + - g.stack + - { yaxes: g.yaxes('Bps') }, - ) + ) + .addRow( + g.row('Network') + .addPanel( + g.panel('Rate of Transmitted Packets Dropped') + + g.queryPanel('sum(irate(container_network_transmit_packets_dropped_total{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~".+"}[$interval])) by (namespace)' % $._config, '{{namespace}}') + + g.stack + + { yaxes: g.yaxes('Bps') }, ) - .addRow( - g.row('Network') - .addPanel( - g.panel('Rate of Transmitted Packets Dropped') + - g.queryPanel('sum(irate(container_network_transmit_packets_dropped_total{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~".+"}[$interval])) by (namespace)' % $._config, '{{namespace}}') + - g.stack + - { yaxes: g.yaxes('Bps') }, - ) - ) + { tags: $._config.grafanaK8s.dashboardTags, templating+: { list+: [intervalTemplate] } }, - } -} \ No newline at end of file + ) + { + tags: $._config.grafanaK8s.dashboardTags, + templating+: { list+: [intervalTemplate, clusterTemplate] }, + refresh: $._config.grafanaK8s.refresh, + }, + }, +} diff --git a/monitoring/vendor/kubernetes-mixin/dashboards/resources/multi-cluster.libsonnet b/monitoring/vendor/kubernetes-mixin/dashboards/resources/multi-cluster.libsonnet index ebeb340..09d53ad 100644 --- a/monitoring/vendor/kubernetes-mixin/dashboards/resources/multi-cluster.libsonnet +++ b/monitoring/vendor/kubernetes-mixin/dashboards/resources/multi-cluster.libsonnet @@ -3,105 +3,105 @@ local grafana = import 'grafonnet/grafana.libsonnet'; local template = grafana.template; { - grafanaDashboards+:: - if $._config.showMultiCluster then { - 'k8s-resources-multicluster.json': - local tableStyles = { - [$._config.clusterLabel]: { - alias: 'Cluster', - link: '%(prefix)s/d/%(uid)s/k8s-resources-cluster?var-datasource=$datasource&var-cluster=$__cell' % { prefix: $._config.grafanaK8s.linkPrefix, uid: std.md5('k8s-resources-cluster.json') }, - }, - }; + grafanaDashboards+:: + if $._config.showMultiCluster then { + 'k8s-resources-multicluster.json': + local tableStyles = { + [$._config.clusterLabel]: { + alias: 'Cluster', + link: '%(prefix)s/d/%(uid)s/k8s-resources-cluster?var-datasource=$datasource&var-cluster=$__cell' % { prefix: $._config.grafanaK8s.linkPrefix, uid: std.md5('k8s-resources-cluster.json') }, + }, + }; - g.dashboard( - '%(dashboardNamePrefix)sCompute Resources / Multi-Cluster' % $._config.grafanaK8s, - uid=($._config.grafanaDashboardIDs['k8s-resources-multicluster.json']), - ).addRow( - (g.row('Headlines') + - { - height: '100px', - showTitle: false, - }) - .addPanel( - g.panel('CPU Utilisation') + - g.statPanel('1 - avg(rate(node_cpu_seconds_total{mode="idle"}[1m]))' % $._config) + g.dashboard( + '%(dashboardNamePrefix)sCompute Resources / Multi-Cluster' % $._config.grafanaK8s, + uid=($._config.grafanaDashboardIDs['k8s-resources-multicluster.json']), + ).addRow( + (g.row('Headlines') + + { + height: '100px', + showTitle: false, + }) + .addPanel( + g.panel('CPU Utilisation') + + g.statPanel('1 - avg(rate(node_cpu_seconds_total{mode="idle"}[$__interval]))' % $._config) + ) + .addPanel( + g.panel('CPU Requests Commitment') + + g.statPanel('sum(kube_pod_container_resource_requests_cpu_cores) / sum(kube_node_status_allocatable_cpu_cores)' % $._config) + ) + .addPanel( + g.panel('CPU Limits Commitment') + + g.statPanel('sum(kube_pod_container_resource_limits_cpu_cores) / sum(kube_node_status_allocatable_cpu_cores)' % $._config) + ) + .addPanel( + g.panel('Memory Utilisation') + + g.statPanel('1 - sum(:node_memory_MemAvailable_bytes:sum) / sum(kube_node_status_allocatable_memory_bytes)' % $._config) + ) + .addPanel( + g.panel('Memory Requests Commitment') + + g.statPanel('sum(kube_pod_container_resource_requests_memory_bytes) / sum(kube_node_status_allocatable_memory_bytes)' % $._config) + ) + .addPanel( + g.panel('Memory Limits Commitment') + + g.statPanel('sum(kube_pod_container_resource_limits_memory_bytes) / sum(kube_node_status_allocatable_memory_bytes)' % $._config) + ) ) - .addPanel( - g.panel('CPU Requests Commitment') + - g.statPanel('sum(kube_pod_container_resource_requests_cpu_cores) / sum(kube_node_status_allocatable_cpu_cores)' % $._config) + .addRow( + g.row('CPU') + .addPanel( + g.panel('CPU Usage') + + g.queryPanel('sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate) by (%(clusterLabel)s)' % $._config, '{{%(clusterLabel)s}}' % $._config) + + { fill: 0, linewidth: 2 }, + ) ) - .addPanel( - g.panel('CPU Limits Commitment') + - g.statPanel('sum(kube_pod_container_resource_limits_cpu_cores) / sum(kube_node_status_allocatable_cpu_cores)' % $._config) + .addRow( + g.row('CPU Quota') + .addPanel( + g.panel('CPU Quota') + + g.tablePanel([ + 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate) by (%(clusterLabel)s)' % $._config, + 'sum(kube_pod_container_resource_requests_cpu_cores) by (%(clusterLabel)s)' % $._config, + 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate) by (%(clusterLabel)s) / sum(kube_pod_container_resource_requests_cpu_cores) by (%(clusterLabel)s)' % $._config, + 'sum(kube_pod_container_resource_limits_cpu_cores) by (%(clusterLabel)s)' % $._config, + 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate) by (%(clusterLabel)s) / sum(kube_pod_container_resource_limits_cpu_cores) by (%(clusterLabel)s)' % $._config, + ], tableStyles { + 'Value #A': { alias: 'CPU Usage' }, + 'Value #B': { alias: 'CPU Requests' }, + 'Value #C': { alias: 'CPU Requests %', unit: 'percentunit' }, + 'Value #D': { alias: 'CPU Limits' }, + 'Value #E': { alias: 'CPU Limits %', unit: 'percentunit' }, + }) + ) ) - .addPanel( - g.panel('Memory Utilisation') + - g.statPanel('1 - sum(:node_memory_MemAvailable_bytes:sum) / sum(kube_node_status_allocatable_memory_bytes)' % $._config) - ) - .addPanel( - g.panel('Memory Requests Commitment') + - g.statPanel('sum(kube_pod_container_resource_requests_memory_bytes) / sum(kube_node_status_allocatable_memory_bytes)' % $._config) - ) - .addPanel( - g.panel('Memory Limits Commitment') + - g.statPanel('sum(kube_pod_container_resource_limits_memory_bytes) / sum(kube_node_status_allocatable_memory_bytes)' % $._config) - ) - ) - .addRow( - g.row('CPU') - .addPanel( - g.panel('CPU Usage') + - g.queryPanel('sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate) by (%(clusterLabel)s)' % $._config, '{{%(clusterLabel)s}}' % $._config) - + { fill: 0, linewidth: 2 }, - ) - ) - .addRow( - g.row('CPU Quota') - .addPanel( - g.panel('CPU Quota') + - g.tablePanel([ - 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate) by (%(clusterLabel)s)' % $._config, - 'sum(kube_pod_container_resource_requests_cpu_cores) by (%(clusterLabel)s)' % $._config, - 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate) by (%(clusterLabel)s) / sum(kube_pod_container_resource_requests_cpu_cores) by (%(clusterLabel)s)' % $._config, - 'sum(kube_pod_container_resource_limits_cpu_cores) by (%(clusterLabel)s)' % $._config, - 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate) by (%(clusterLabel)s) / sum(kube_pod_container_resource_limits_cpu_cores) by (%(clusterLabel)s)' % $._config, - ], tableStyles { - 'Value #A': { alias: 'CPU Usage' }, - 'Value #B': { alias: 'CPU Requests' }, - 'Value #C': { alias: 'CPU Requests %', unit: 'percentunit' }, - 'Value #D': { alias: 'CPU Limits' }, - 'Value #E': { alias: 'CPU Limits %', unit: 'percentunit' }, - }) - ) - ) - .addRow( - g.row('Memory') - .addPanel( - g.panel('Memory Usage (w/o cache)') + - // Not using container_memory_usage_bytes here because that includes page cache - g.queryPanel('sum(container_memory_rss{container!=""}) by (%(clusterLabel)s)' % $._config, '{{%(clusterLabel)s}}' % $._config) + - { fill: 0, linewidth: 2, yaxes: g.yaxes('bytes') }, - ) - ) - .addRow( - g.row('Memory Requests') - .addPanel( - g.panel('Requests by Namespace') + - g.tablePanel([ + .addRow( + g.row('Memory') + .addPanel( + g.panel('Memory Usage (w/o cache)') + // Not using container_memory_usage_bytes here because that includes page cache - 'sum(container_memory_rss{container!=""}) by (%(clusterLabel)s)' % $._config, - 'sum(kube_pod_container_resource_requests_memory_bytes) by (%(clusterLabel)s)' % $._config, - 'sum(container_memory_rss{container!=""}) by (%(clusterLabel)s) / sum(kube_pod_container_resource_requests_memory_bytes) by (%(clusterLabel)s)' % $._config, - 'sum(kube_pod_container_resource_limits_memory_bytes) by (%(clusterLabel)s)' % $._config, - 'sum(container_memory_rss{container!=""}) by (%(clusterLabel)s) / sum(kube_pod_container_resource_limits_memory_bytes) by (%(clusterLabel)s)' % $._config, - ], tableStyles { - 'Value #A': { alias: 'Memory Usage', unit: 'bytes' }, - 'Value #B': { alias: 'Memory Requests', unit: 'bytes' }, - 'Value #C': { alias: 'Memory Requests %', unit: 'percentunit' }, - 'Value #D': { alias: 'Memory Limits', unit: 'bytes' }, - 'Value #E': { alias: 'Memory Limits %', unit: 'percentunit' }, - }) + g.queryPanel('sum(container_memory_rss{container!=""}) by (%(clusterLabel)s)' % $._config, '{{%(clusterLabel)s}}' % $._config) + + { fill: 0, linewidth: 2, yaxes: g.yaxes('bytes') }, + ) ) - ) + { tags: $._config.grafanaK8s.dashboardTags }, - } else {}, -} \ No newline at end of file + .addRow( + g.row('Memory Requests') + .addPanel( + g.panel('Requests by Namespace') + + g.tablePanel([ + // Not using container_memory_usage_bytes here because that includes page cache + 'sum(container_memory_rss{container!=""}) by (%(clusterLabel)s)' % $._config, + 'sum(kube_pod_container_resource_requests_memory_bytes) by (%(clusterLabel)s)' % $._config, + 'sum(container_memory_rss{container!=""}) by (%(clusterLabel)s) / sum(kube_pod_container_resource_requests_memory_bytes) by (%(clusterLabel)s)' % $._config, + 'sum(kube_pod_container_resource_limits_memory_bytes) by (%(clusterLabel)s)' % $._config, + 'sum(container_memory_rss{container!=""}) by (%(clusterLabel)s) / sum(kube_pod_container_resource_limits_memory_bytes) by (%(clusterLabel)s)' % $._config, + ], tableStyles { + 'Value #A': { alias: 'Memory Usage', unit: 'bytes' }, + 'Value #B': { alias: 'Memory Requests', unit: 'bytes' }, + 'Value #C': { alias: 'Memory Requests %', unit: 'percentunit' }, + 'Value #D': { alias: 'Memory Limits', unit: 'bytes' }, + 'Value #E': { alias: 'Memory Limits %', unit: 'percentunit' }, + }) + ) + ) + { tags: $._config.grafanaK8s.dashboardTags, refresh: $._config.grafanaK8s.refresh }, + } else {}, +} diff --git a/monitoring/vendor/kubernetes-mixin/dashboards/resources/namespace.libsonnet b/monitoring/vendor/kubernetes-mixin/dashboards/resources/namespace.libsonnet index a0ad253..4440c73 100644 --- a/monitoring/vendor/kubernetes-mixin/dashboards/resources/namespace.libsonnet +++ b/monitoring/vendor/kubernetes-mixin/dashboards/resources/namespace.libsonnet @@ -3,33 +3,56 @@ local grafana = import 'grafonnet/grafana.libsonnet'; local template = grafana.template; { - grafanaDashboards+:: { - local intervalTemplate = - template.new( - name='interval', - datasource='$datasource', - query='4h', - current='5m', - hide=2, - refresh=2, - includeAll=false, - sort=1 - ) + { - auto: false, - auto_count: 30, - auto_min: '10s', - skipUrlSync: false, - type: 'interval', - options: [ - { - selected: true, - text: '4h', - value: '4h', - }, - ], - }, + grafanaDashboards+:: { + local intervalTemplate = + template.new( + name='interval', + datasource='$datasource', + query='$__interval', + current='5m', + hide=2, + refresh=2, + includeAll=false, + sort=1 + ) + { + auto: false, + auto_count: 30, + auto_min: '10s', + skipUrlSync: false, + type: 'interval', + options: [ + { + selected: true, + text: '$__interval', + value: '$__interval', + }, + ], + }, - 'k8s-resources-namespace.json': + local clusterTemplate = + template.new( + name='cluster', + datasource='$datasource', + query='label_values(kube_pod_info, %s)' % $._config.clusterLabel, + current='', + hide=if $._config.showMultiCluster then '' else '2', + refresh=1, + includeAll=false, + sort=1 + ), + + local namespaceTemplate = + template.new( + name='namespace', + datasource='$datasource', + query='label_values(kube_pod_info{%(clusterLabel)s="$cluster"}, namespace)' % $._config.clusterLabel, + current='', + hide='', + refresh=1, + includeAll=false, + sort=1 + ), + 'k8s-resources-namespace.json': local tableStyles = { pod: { alias: 'Pod', @@ -78,17 +101,75 @@ local template = grafana.template; }, }; + local cpuUsageQuery = 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{%(clusterLabel)s="$cluster", namespace="$namespace"}) by (pod)' % $._config; + + local memoryUsageQuery = 'sum(container_memory_working_set_bytes{%(clusterLabel)s="$cluster", namespace="$namespace", container!=""}) by (pod)' % $._config; + + local cpuQuotaRequestsQuery = 'scalar(kube_resourcequota{%(clusterLabel)s="$cluster", namespace="$namespace", type="hard",resource="requests.cpu"})' % $._config; + local cpuQuotaLimitsQuery = std.strReplace(cpuQuotaRequestsQuery, 'requests.cpu', 'limits.cpu'); + local memoryQuotaRequestsQuery = std.strReplace(cpuQuotaRequestsQuery, 'requests.cpu', 'requests.memory'); + local memoryQuotaLimitsQuery = std.strReplace(cpuQuotaRequestsQuery, 'requests.cpu', 'limits.memory'); + g.dashboard( '%(dashboardNamePrefix)sCompute Resources / Namespace (Pods)' % $._config.grafanaK8s, uid=($._config.grafanaDashboardIDs['k8s-resources-namespace.json']), - ).addTemplate('cluster', 'kube_pod_info', $._config.clusterLabel, hide=if $._config.showMultiCluster then 0 else 2) - .addTemplate('namespace', 'kube_pod_info{%(clusterLabel)s="$cluster"}' % $._config, 'namespace') + ) + .addRow( + (g.row('Headlines') + + { + height: '100px', + showTitle: false, + }) + .addPanel( + g.panel('CPU Utilisation (from requests)') + + g.statPanel('sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{%(clusterLabel)s="$cluster", namespace="$namespace"}) / sum(kube_pod_container_resource_requests_cpu_cores{%(clusterLabel)s="$cluster", namespace="$namespace"})' % $._config) + ) + .addPanel( + g.panel('CPU Utilisation (from limits)') + + g.statPanel('sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{%(clusterLabel)s="$cluster", namespace="$namespace"}) / sum(kube_pod_container_resource_limits_cpu_cores{%(clusterLabel)s="$cluster", namespace="$namespace"})' % $._config) + ) + .addPanel( + g.panel('Memory Utilization (from requests)') + + g.statPanel('sum(container_memory_working_set_bytes{%(clusterLabel)s="$cluster", namespace="$namespace",container!=""}) / sum(kube_pod_container_resource_requests_memory_bytes{namespace="$namespace"})' % $._config) + ) + .addPanel( + g.panel('Memory Utilisation (from limits)') + + g.statPanel('sum(container_memory_working_set_bytes{%(clusterLabel)s="$cluster", namespace="$namespace",container!=""}) / sum(kube_pod_container_resource_limits_memory_bytes{namespace="$namespace"})' % $._config) + ) + ) .addRow( g.row('CPU Usage') .addPanel( g.panel('CPU Usage') + - g.queryPanel('sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{%(clusterLabel)s="$cluster", namespace="$namespace"}) by (pod)' % $._config, '{{pod}}') + - g.stack, + g.queryPanel([ + cpuUsageQuery, + cpuQuotaRequestsQuery, + cpuQuotaLimitsQuery, + ], ['{{pod}}', 'quota - requests', 'quota - limits']) + + g.stack + { + seriesOverrides: [ + { + alias: 'quota - requests', + color: '#F2495C', + dashes: true, + fill: 0, + hideTooltip: true, + legend: false, + linewidth: 2, + stack: false, + }, + { + alias: 'quota - limits', + color: '#FF9830', + dashes: true, + fill: 0, + hideTooltip: true, + legend: false, + linewidth: 2, + stack: false, + }, + ], + }, ) ) .addRow( @@ -115,9 +196,37 @@ local template = grafana.template; .addPanel( g.panel('Memory Usage (w/o cache)') + // Like above, without page cache - g.queryPanel('sum(container_memory_working_set_bytes{%(clusterLabel)s="$cluster", namespace="$namespace", container!=""}) by (pod)' % $._config, '{{pod}}') + + g.queryPanel([ + memoryUsageQuery, + memoryQuotaRequestsQuery, + memoryQuotaLimitsQuery, + ], ['{{pod}}', 'quota - requests', 'quota - limits']) + g.stack + - { yaxes: g.yaxes('bytes') }, + { + yaxes: g.yaxes('bytes'), + seriesOverrides: [ + { + alias: 'quota - requests', + color: '#F2495C', + dashes: true, + fill: 0, + hideTooltip: true, + legend: false, + linewidth: 2, + stack: false, + }, + { + alias: 'quota - limits', + color: '#FF9830', + dashes: true, + fill: 0, + hideTooltip: true, + legend: false, + linewidth: 2, + stack: false, + }, + ], + }, ) ) .addRow( @@ -208,6 +317,6 @@ local template = grafana.template; g.stack + { yaxes: g.yaxes('Bps') }, ) - ) + { tags: $._config.grafanaK8s.dashboardTags, templating+: { list+: [intervalTemplate] } }, - } -} \ No newline at end of file + ) + { tags: $._config.grafanaK8s.dashboardTags, templating+: { list+: [intervalTemplate, clusterTemplate, namespaceTemplate] }, refresh: $._config.grafanaK8s.refresh }, + }, +} diff --git a/monitoring/vendor/kubernetes-mixin/dashboards/resources/node.libsonnet b/monitoring/vendor/kubernetes-mixin/dashboards/resources/node.libsonnet index 1fb0dd5..6cfc991 100644 --- a/monitoring/vendor/kubernetes-mixin/dashboards/resources/node.libsonnet +++ b/monitoring/vendor/kubernetes-mixin/dashboards/resources/node.libsonnet @@ -3,33 +3,57 @@ local grafana = import 'grafonnet/grafana.libsonnet'; local template = grafana.template; { - grafanaDashboards+:: { - local intervalTemplate = - template.new( - name='interval', - datasource='$datasource', - query='4h', - current='5m', - hide=2, - refresh=2, - includeAll=false, - sort=1 - ) + { - auto: false, - auto_count: 30, - auto_min: '10s', - skipUrlSync: false, - type: 'interval', - options: [ - { - selected: true, - text: '4h', - value: '4h', - }, - ], - }, + grafanaDashboards+:: { + local intervalTemplate = + template.new( + name='interval', + datasource='$datasource', + query='$__interval', + current='5m', + hide=2, + refresh=2, + includeAll=false, + sort=1 + ) + { + auto: false, + auto_count: 30, + auto_min: '10s', + skipUrlSync: false, + type: 'interval', + options: [ + { + selected: true, + text: '$__interval', + value: '$__interval', + }, + ], + }, - 'k8s-resources-node.json': + local clusterTemplate = + template.new( + name='cluster', + datasource='$datasource', + query='label_values(kube_pod_info, %s)' % $._config.clusterLabel, + current='', + hide=if $._config.showMultiCluster then '' else '2', + refresh=1, + includeAll=false, + sort=1 + ), + + local nodeTemplate = + template.new( + name='node', + datasource='$datasource', + query='label_values(kube_pod_info{%(clusterLabel)s="$cluster"}, node)' % $._config.clusterLabel, + current='', + hide='', + refresh=1, + includeAll=false, + sort=1 + ), + + 'k8s-resources-node.json': local tableStyles = { pod: { alias: 'Pod', @@ -39,8 +63,7 @@ local template = grafana.template; g.dashboard( '%(dashboardNamePrefix)sCompute Resources / Node (Pods)' % $._config.grafanaK8s, uid=($._config.grafanaDashboardIDs['k8s-resources-node.json']), - ).addTemplate('cluster', 'kube_pod_info', $._config.clusterLabel, hide=if $._config.showMultiCluster then 0 else 2) - .addTemplate('node', 'kube_pod_info{%(clusterLabel)s="$cluster"}' % $._config, 'node') + ) .addRow( g.row('CPU Usage') .addPanel( @@ -102,6 +125,6 @@ local template = grafana.template; 'Value #H': { alias: 'Memory Usage (Swap)', unit: 'bytes' }, }) ) - ) + { tags: $._config.grafanaK8s.dashboardTags }, - } -} \ No newline at end of file + ) + { tags: $._config.grafanaK8s.dashboardTags, refresh: $._config.grafanaK8s.refresh, templating+: { list+: [intervalTemplate, clusterTemplate, nodeTemplate] } }, + }, +} diff --git a/monitoring/vendor/kubernetes-mixin/dashboards/resources/pod.libsonnet b/monitoring/vendor/kubernetes-mixin/dashboards/resources/pod.libsonnet index fb32b04..8c9ab07 100644 --- a/monitoring/vendor/kubernetes-mixin/dashboards/resources/pod.libsonnet +++ b/monitoring/vendor/kubernetes-mixin/dashboards/resources/pod.libsonnet @@ -3,51 +3,150 @@ local grafana = import 'grafonnet/grafana.libsonnet'; local template = grafana.template; { - grafanaDashboards+:: { - local intervalTemplate = - template.new( - name='interval', - datasource='$datasource', - query='4h', - current='5m', - hide=2, - refresh=2, - includeAll=false, - sort=1 - ) + { - auto: false, - auto_count: 30, - auto_min: '10s', - skipUrlSync: false, - type: 'interval', - options: [ - { - selected: true, - text: '4h', - value: '4h', - }, - ], - }, + grafanaDashboards+:: { + local intervalTemplate = + template.new( + name='interval', + datasource='$datasource', + query='$__interval', + current='5m', + hide=2, + refresh=2, + includeAll=false, + sort=1 + ) + { + auto: false, + auto_count: 30, + auto_min: '10s', + skipUrlSync: false, + type: 'interval', + options: [ + { + selected: true, + text: '$__interval', + value: '$__interval', + }, + ], + }, - 'k8s-resources-pod.json': + local clusterTemplate = + template.new( + name='cluster', + datasource='$datasource', + query='label_values(kube_pod_info, %s)' % $._config.clusterLabel, + current='', + hide=if $._config.showMultiCluster then '' else '2', + refresh=1, + includeAll=false, + sort=1 + ), + + local namespaceTemplate = + template.new( + name='namespace', + datasource='$datasource', + query='label_values(kube_pod_info{%(clusterLabel)s="$cluster"}, namespace)' % $._config.clusterLabel, + current='', + hide='', + refresh=1, + includeAll=false, + sort=1 + ), + + local podTemplate = + template.new( + name='pod', + datasource='$datasource', + query='label_values(kube_pod_info{%(clusterLabel)s="$cluster", namespace="$namespace"}, pod)' % $._config.clusterLabel, + current='', + hide='', + refresh=2, + includeAll=false, + sort=1 + ), + + 'k8s-resources-pod.json': local tableStyles = { container: { alias: 'Container', }, }; + local cpuRequestsQuery = ||| + sum( + kube_pod_container_resource_requests_cpu_cores{%(clusterLabel)s="$cluster", namespace="$namespace", pod="$pod"}) + ||| % $._config; + + local cpuLimitsQuery = std.strReplace(cpuRequestsQuery, 'requests', 'limits'); + local memRequestsQuery = std.strReplace(cpuRequestsQuery, 'cpu_cores', 'memory_bytes'); + local memLimitsQuery = std.strReplace(cpuLimitsQuery, 'cpu_cores', 'memory_bytes'); + g.dashboard( '%(dashboardNamePrefix)sCompute Resources / Pod' % $._config.grafanaK8s, uid=($._config.grafanaDashboardIDs['k8s-resources-pod.json']), - ).addTemplate('cluster', 'kube_pod_info', $._config.clusterLabel, hide=if $._config.showMultiCluster then 0 else 2) - .addTemplate('namespace', 'kube_pod_info{%(clusterLabel)s="$cluster"}' % $._config, 'namespace') - .addTemplate('pod', 'kube_pod_info{%(clusterLabel)s="$cluster", namespace="$namespace"}' % $._config, 'pod') + ) .addRow( g.row('CPU Usage') .addPanel( g.panel('CPU Usage') + - g.queryPanel('sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{namespace="$namespace", pod="$pod", container!="POD", %(clusterLabel)s="$cluster"}) by (container)' % $._config, '{{container}}') + - g.stack, + g.queryPanel( + [ + 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{namespace="$namespace", pod="$pod", container!="POD", %(clusterLabel)s="$cluster"}) by (container)' % $._config, + cpuRequestsQuery, + cpuLimitsQuery, + ], [ + '{{container}}', + 'requests', + 'limits', + ], + ) + + g.stack + { + seriesOverrides: [ + { + alias: 'requests', + color: '#F2495C', + fill: 0, + hideTooltip: true, + legend: true, + linewidth: 2, + stack: false, + }, + { + alias: 'limits', + color: '#FF9830', + fill: 0, + hideTooltip: true, + legend: true, + linewidth: 2, + stack: false, + }, + ], + }, + ) + ) + .addRow( + g.row('CPU Throttling') + .addPanel( + g.panel('CPU Throttling') + + g.queryPanel('sum(increase(container_cpu_cfs_throttled_periods_total{namespace="$namespace", pod="$pod", container!="POD", %(clusterLabel)s="$cluster"}[5m])) by (container) /sum(increase(container_cpu_cfs_periods_total{namespace="$namespace", pod="$pod", container!="POD", %(clusterLabel)s="$cluster"}[5m])) by (container)' % $._config, '{{container}}') + + g.stack + + { + yaxes: g.yaxes({ format: 'percentunit', max: 1 }), + legend+: { + current: true, + max: true, + }, + thresholds: [ + { + value: $._config.cpuThrottlingPercent / 100, + colorMode: 'critical', + op: 'gt', + fill: true, + line: true, + yaxis: 'left', + }, + ], + }, ) ) .addRow( @@ -74,16 +173,40 @@ local template = grafana.template; .addPanel( g.panel('Memory Usage') + g.queryPanel([ - 'sum(container_memory_rss{%(clusterLabel)s="$cluster", namespace="$namespace", pod="$pod", container!="POD", container!=""}) by (container)' % $._config, - 'sum(container_memory_cache{%(clusterLabel)s="$cluster", namespace="$namespace", pod="$pod", container!="POD", container!=""}) by (container)' % $._config, - 'sum(container_memory_swap{%(clusterLabel)s="$cluster", namespace="$namespace", pod="$pod", container!="POD", container!=""}) by (container)' % $._config, + 'sum(container_memory_working_set_bytes{%(clusterLabel)s="$cluster", namespace="$namespace", pod="$pod", container!="POD", container!=""}) by (container)' % $._config, + memRequestsQuery, + memLimitsQuery, ], [ - '{{container}} (RSS)', - '{{container}} (Cache)', - '{{container}} (Swap)', + '{{container}}', + 'requests', + 'limits', ]) + g.stack + - { yaxes: g.yaxes('bytes') }, + { + yaxes: g.yaxes('bytes'), + seriesOverrides: [ + { + alias: 'requests', + color: '#F2495C', + dashes: true, + fill: 0, + hideTooltip: true, + legend: false, + linewidth: 2, + stack: false, + }, + { + alias: 'limits', + color: '#FF9830', + dashes: true, + fill: 0, + hideTooltip: true, + legend: false, + linewidth: 2, + stack: false, + }, + ], + } ) ) .addRow( @@ -164,6 +287,6 @@ local template = grafana.template; g.stack + { yaxes: g.yaxes('Bps') }, ) - ) + { tags: $._config.grafanaK8s.dashboardTags, templating+: { list+: [intervalTemplate] } }, - } -} \ No newline at end of file + ) + { tags: $._config.grafanaK8s.dashboardTags, templating+: { list+: [intervalTemplate, clusterTemplate, namespaceTemplate, podTemplate] }, refresh: $._config.grafanaK8s.refresh }, + }, +} diff --git a/monitoring/vendor/kubernetes-mixin/dashboards/resources/workload-namespace.libsonnet b/monitoring/vendor/kubernetes-mixin/dashboards/resources/workload-namespace.libsonnet index c9cbb38..26ab665 100644 --- a/monitoring/vendor/kubernetes-mixin/dashboards/resources/workload-namespace.libsonnet +++ b/monitoring/vendor/kubernetes-mixin/dashboards/resources/workload-namespace.libsonnet @@ -3,51 +3,75 @@ local grafana = import 'grafonnet/grafana.libsonnet'; local template = grafana.template; { - grafanaDashboards+:: { - local intervalTemplate = - template.new( - name='interval', - datasource='$datasource', - query='4h', - current='5m', - hide=2, - refresh=2, - includeAll=false, - sort=1 - ) + { - auto: false, - auto_count: 30, - auto_min: '10s', - skipUrlSync: false, - type: 'interval', - options: [ - { - selected: true, - text: '4h', - value: '4h', - }, - ], - }, + grafanaDashboards+:: { + local intervalTemplate = + template.new( + name='interval', + datasource='$datasource', + query='$__interval', + current='5m', + hide=2, + refresh=2, + includeAll=false, + sort=1 + ) + { + auto: false, + auto_count: 30, + auto_min: '10s', + skipUrlSync: false, + type: 'interval', + options: [ + { + selected: true, + text: '$__interval', + value: '$__interval', + }, + ], + }, - local typeTemplate = - template.new( - name='type', - datasource='$datasource', - query='label_values(mixin_pod_workload{namespace=~"$namespace", workload=~".+"}, workload_type)', - current='deployment', - hide='', - refresh=1, - includeAll=false, - sort=0 - ) + { - auto: false, - auto_count: 30, - auto_min: '10s', - definition: 'label_values(mixin_pod_workload{namespace=~"$namespace", workload=~".+"}, workload_type)', - skipUrlSync: false, - }, + local typeTemplate = + template.new( + name='type', + datasource='$datasource', + query='label_values(mixin_pod_workload{namespace=~"$namespace", workload=~".+"}, workload_type)', + current='deployment', + hide='', + refresh=1, + includeAll=false, + sort=0 + ) + { + auto: false, + auto_count: 30, + auto_min: '10s', + definition: 'label_values(mixin_pod_workload{namespace=~"$namespace", workload=~".+"}, workload_type)', + skipUrlSync: false, + }, - 'k8s-resources-workloads-namespace.json': + local clusterTemplate = + template.new( + name='cluster', + datasource='$datasource', + query='label_values(kube_pod_info, %s)' % $._config.clusterLabel, + current='', + hide=if $._config.showMultiCluster then '' else '2', + refresh=1, + includeAll=false, + sort=1 + ), + + local namespaceTemplate = + template.new( + name='namespace', + datasource='$datasource', + query='label_values(kube_pod_info{%(clusterLabel)s="$cluster"}, namespace)' % $._config.clusterLabel, + current='', + hide='', + refresh=1, + includeAll=false, + sort=1 + ), + + 'k8s-resources-workloads-namespace.json': local tableStyles = { workload: { alias: 'Workload', @@ -155,17 +179,44 @@ local template = grafana.template; local memRequestsQuery = std.strReplace(cpuRequestsQuery, 'cpu_cores', 'memory_bytes'); local memLimitsQuery = std.strReplace(cpuLimitsQuery, 'cpu_cores', 'memory_bytes'); + local cpuQuotaRequestsQuery = 'scalar(kube_resourcequota{%(clusterLabel)s="$cluster", namespace="$namespace", type="hard",resource="requests.cpu"})' % $._config; + local cpuQuotaLimitsQuery = std.strReplace(cpuQuotaRequestsQuery, 'requests.cpu', 'limits.cpu'); + local memoryQuotaRequestsQuery = std.strReplace(cpuQuotaRequestsQuery, 'requests.cpu', 'requests.memory'); + local memoryQuotaLimitsQuery = std.strReplace(cpuQuotaRequestsQuery, 'requests.cpu', 'limits.memory'); + g.dashboard( '%(dashboardNamePrefix)sCompute Resources / Namespace (Workloads)' % $._config.grafanaK8s, uid=($._config.grafanaDashboardIDs['k8s-resources-workloads-namespace.json']), - ).addTemplate('cluster', 'kube_pod_info', $._config.clusterLabel, hide=if $._config.showMultiCluster then 0 else 2) - .addTemplate('namespace', 'kube_pod_info{%(clusterLabel)s="$cluster"}' % $._config, 'namespace') + ) .addRow( g.row('CPU Usage') .addPanel( g.panel('CPU Usage') + - g.queryPanel(cpuUsageQuery, '{{workload}} - {{workload_type}}') + - g.stack, + g.queryPanel([cpuUsageQuery, cpuQuotaRequestsQuery, cpuQuotaLimitsQuery], ['{{workload}} - {{workload_type}}', 'quota - requests', 'quota - limits']) + + g.stack + { + seriesOverrides: [ + { + alias: 'quota - requests', + color: '#F2495C', + dashes: true, + fill: 0, + hideTooltip: true, + legend: false, + linewidth: 2, + stack: false, + }, + { + alias: 'quota - limits', + color: '#FF9830', + dashes: true, + fill: 0, + hideTooltip: true, + legend: false, + linewidth: 2, + stack: false, + }, + ], + }, ) ) .addRow( @@ -193,9 +244,33 @@ local template = grafana.template; g.row('Memory Usage') .addPanel( g.panel('Memory Usage') + - g.queryPanel(memUsageQuery, '{{workload}} - {{workload_type}}') + + g.queryPanel([memUsageQuery, memoryQuotaRequestsQuery, memoryQuotaLimitsQuery], ['{{workload}} - {{workload_type}}', 'quota - requests', 'quota - limits']) + g.stack + - { yaxes: g.yaxes('bytes') }, + { + yaxes: g.yaxes('bytes'), + seriesOverrides: [ + { + alias: 'quota - requests', + color: '#F2495C', + dashes: true, + fill: 0, + hideTooltip: true, + legend: false, + linewidth: 2, + stack: false, + }, + { + alias: 'quota - limits', + color: '#FF9830', + dashes: true, + fill: 0, + hideTooltip: true, + legend: false, + linewidth: 2, + stack: false, + }, + ], + }, ) ) .addRow( @@ -332,7 +407,7 @@ local template = grafana.template; g.stack + { yaxes: g.yaxes('Bps') }, ) - ) + { tags: $._config.grafanaK8s.dashboardTags, templating+: { list+: [intervalTemplate, typeTemplate] } }, + ) + { tags: $._config.grafanaK8s.dashboardTags, templating+: { list+: [intervalTemplate, typeTemplate, clusterTemplate, namespaceTemplate] }, refresh: $._config.grafanaK8s.refresh }, - } -} \ No newline at end of file + }, +} diff --git a/monitoring/vendor/kubernetes-mixin/dashboards/resources/workload.libsonnet b/monitoring/vendor/kubernetes-mixin/dashboards/resources/workload.libsonnet index aed215e..db5ccab 100644 --- a/monitoring/vendor/kubernetes-mixin/dashboards/resources/workload.libsonnet +++ b/monitoring/vendor/kubernetes-mixin/dashboards/resources/workload.libsonnet @@ -3,32 +3,79 @@ local grafana = import 'grafonnet/grafana.libsonnet'; local template = grafana.template; { - grafanaDashboards+:: { - local intervalTemplate = - template.new( - name='interval', - datasource='$datasource', - query='4h', - current='5m', - hide=2, - refresh=2, - includeAll=false, - sort=1 - ) + { - auto: false, - auto_count: 30, - auto_min: '10s', - skipUrlSync: false, - type: 'interval', - options: [ - { - selected: true, - text: '4h', - value: '4h', - }, - ], - }, - + grafanaDashboards+:: { + local intervalTemplate = + template.new( + name='interval', + datasource='$datasource', + query='$__interval', + current='5m', + hide=2, + refresh=2, + includeAll=false, + sort=1 + ) + { + auto: false, + auto_count: 30, + auto_min: '10s', + skipUrlSync: false, + type: 'interval', + options: [ + { + selected: true, + text: '$__interval', + value: '$__interval', + }, + ], + }, + + local clusterTemplate = + template.new( + name='cluster', + datasource='$datasource', + query='label_values(kube_pod_info, %s)' % $._config.clusterLabel, + current='', + hide=if $._config.showMultiCluster then '' else '2', + refresh=1, + includeAll=false, + sort=1 + ), + + local namespaceTemplate = + template.new( + name='namespace', + datasource='$datasource', + query='label_values(kube_pod_info{%(clusterLabel)s="$cluster"}, namespace)' % $._config.clusterLabel, + current='', + hide='', + refresh=1, + includeAll=false, + sort=1 + ), + + local workloadTemplate = + template.new( + name='workload', + datasource='$datasource', + query='label_values(mixin_pod_workload{%(clusterLabel)s="$cluster", namespace="$namespace"}, workload)' % $._config.clusterLabel, + current='', + hide=if $._config.showMultiCluster then '' else '2', + refresh=1, + includeAll=false, + sort=1 + ), + + local workloadTypeTemplate = + template.new( + name='type', + datasource='$datasource', + query='label_values(mixin_pod_workload{%(clusterLabel)s="$cluster", namespace="$namespace", workload="$workload"}, workload_type)' % $._config.clusterLabel, + current='', + hide='', + refresh=1, + includeAll=false, + sort=1 + ), 'k8s-resources-workload.json': local tableStyles = { pod: { @@ -133,10 +180,7 @@ local template = grafana.template; g.dashboard( '%(dashboardNamePrefix)sCompute Resources / Workload' % $._config.grafanaK8s, uid=($._config.grafanaDashboardIDs['k8s-resources-workload.json']), - ).addTemplate('cluster', 'kube_pod_info', $._config.clusterLabel, hide=if $._config.showMultiCluster then 0 else 2) - .addTemplate('namespace', 'kube_pod_info{%(clusterLabel)s="$cluster"}' % $._config, 'namespace') - .addTemplate('workload', 'mixin_pod_workload{%(clusterLabel)s="$cluster", namespace="$namespace"}' % $._config, 'workload') - .addTemplate('type', 'mixin_pod_workload{%(clusterLabel)s="$cluster", namespace="$namespace", workload="$workload"}' % $._config, 'workload_type') + ) .addRow( g.row('CPU Usage') .addPanel( @@ -305,6 +349,6 @@ local template = grafana.template; g.stack + { yaxes: g.yaxes('Bps') }, ) - ) + { tags: $._config.grafanaK8s.dashboardTags, templating+: { list+: [intervalTemplate] } }, - } -} \ No newline at end of file + ) + { tags: $._config.grafanaK8s.dashboardTags, templating+: { list+: [intervalTemplate, clusterTemplate, namespaceTemplate, workloadTemplate, workloadTypeTemplate] }, refresh: $._config.grafanaK8s.refresh }, + }, +} diff --git a/monitoring/vendor/kubernetes-mixin/dashboards/scheduler.libsonnet b/monitoring/vendor/kubernetes-mixin/dashboards/scheduler.libsonnet index 823ec3e..17d62e5 100644 --- a/monitoring/vendor/kubernetes-mixin/dashboards/scheduler.libsonnet +++ b/monitoring/vendor/kubernetes-mixin/dashboards/scheduler.libsonnet @@ -170,6 +170,6 @@ local singlestat = grafana.singlestat; .addPanel(memory) .addPanel(cpu) .addPanel(goroutines) - ), + ) + { refresh: $._config.grafanaK8s.refresh }, }, } diff --git a/monitoring/vendor/kubernetes-mixin/dashboards/windows.libsonnet b/monitoring/vendor/kubernetes-mixin/dashboards/windows.libsonnet index 4d91687..0c65622 100644 --- a/monitoring/vendor/kubernetes-mixin/dashboards/windows.libsonnet +++ b/monitoring/vendor/kubernetes-mixin/dashboards/windows.libsonnet @@ -561,6 +561,6 @@ local g = import 'grafana-builder/grafana.libsonnet'; ) + { yaxes: g.yaxes('percentunit') }, ), - ), + ) + { refresh: $._config.grafanaK8s.refresh }, }, } diff --git a/monitoring/vendor/kubernetes-mixin/rules/apps.libsonnet b/monitoring/vendor/kubernetes-mixin/rules/apps.libsonnet index fca2e83..2fc6371 100644 --- a/monitoring/vendor/kubernetes-mixin/rules/apps.libsonnet +++ b/monitoring/vendor/kubernetes-mixin/rules/apps.libsonnet @@ -23,35 +23,45 @@ expr: ||| sum by (%(clusterLabel)s, namespace, pod, container) ( rate(container_cpu_usage_seconds_total{%(cadvisorSelector)s, image!="", container!="POD"}[5m]) - ) * on (%(clusterLabel)s, namespace, pod) group_left(node) max by(%(clusterLabel)s, namespace, pod, node) (kube_pod_info) + ) * on (%(clusterLabel)s, namespace, pod) group_left(node) topk by (%(clusterLabel)s, namespace, pod) ( + 1, max by(%(clusterLabel)s, namespace, pod, node) (kube_pod_info) + ) ||| % $._config, }, { record: 'node_namespace_pod_container:container_memory_working_set_bytes', expr: ||| container_memory_working_set_bytes{%(cadvisorSelector)s, image!=""} - * on (namespace, pod) group_left(node) max by(namespace, pod, node) (kube_pod_info) + * on (namespace, pod) group_left(node) topk by(namespace, pod) (1, + max by(namespace, pod, node) (kube_pod_info) + ) ||| % $._config, }, { record: 'node_namespace_pod_container:container_memory_rss', expr: ||| container_memory_rss{%(cadvisorSelector)s, image!=""} - * on (namespace, pod) group_left(node) max by(namespace, pod, node) (kube_pod_info) + * on (namespace, pod) group_left(node) topk by(namespace, pod) (1, + max by(namespace, pod, node) (kube_pod_info) + ) ||| % $._config, }, { record: 'node_namespace_pod_container:container_memory_cache', expr: ||| container_memory_cache{%(cadvisorSelector)s, image!=""} - * on (namespace, pod) group_left(node) max by(namespace, pod, node) (kube_pod_info) + * on (namespace, pod) group_left(node) topk by(namespace, pod) (1, + max by(namespace, pod, node) (kube_pod_info) + ) ||| % $._config, }, { record: 'node_namespace_pod_container:container_memory_swap', expr: ||| container_memory_swap{%(cadvisorSelector)s, image!=""} - * on (namespace, pod) group_left(node) max by(namespace, pod, node) (kube_pod_info) + * on (namespace, pod) group_left(node) topk by(namespace, pod) (1, + max by(namespace, pod, node) (kube_pod_info) + ) ||| % $._config, }, { @@ -92,15 +102,19 @@ { record: 'mixin_pod_workload', expr: ||| - sum( + max by (%(clusterLabel)s, namespace, workload, pod) ( label_replace( label_replace( kube_pod_owner{%(kubeStateMetricsSelector)s, owner_kind="ReplicaSet"}, "replicaset", "$1", "owner_name", "(.*)" - ) * on(replicaset, namespace) group_left(owner_name) kube_replicaset_owner{%(kubeStateMetricsSelector)s}, + ) * on(replicaset, namespace) group_left(owner_name) topk by(replicaset, namespace) ( + 1, max by (replicaset, namespace, owner_name) ( + kube_replicaset_owner{%(kubeStateMetricsSelector)s} + ) + ), "workload", "$1", "owner_name", "(.*)" ) - ) by (%(clusterLabel)s, namespace, workload, pod) + ) ||| % $._config, labels: { workload_type: 'deployment', @@ -109,12 +123,12 @@ { record: 'mixin_pod_workload', expr: ||| - sum( + max by (%(clusterLabel)s, namespace, workload, pod) ( label_replace( kube_pod_owner{%(kubeStateMetricsSelector)s, owner_kind="DaemonSet"}, "workload", "$1", "owner_name", "(.*)" ) - ) by (%(clusterLabel)s, namespace, workload, pod) + ) ||| % $._config, labels: { workload_type: 'daemonset', @@ -123,12 +137,12 @@ { record: 'mixin_pod_workload', expr: ||| - sum( + max by (%(clusterLabel)s, namespace, workload, pod) ( label_replace( kube_pod_owner{%(kubeStateMetricsSelector)s, owner_kind="StatefulSet"}, "workload", "$1", "owner_name", "(.*)" ) - ) by (%(clusterLabel)s, namespace, workload, pod) + ) ||| % $._config, labels: { workload_type: 'statefulset', diff --git a/monitoring/vendor/kubernetes-mixin/rules/node.libsonnet b/monitoring/vendor/kubernetes-mixin/rules/node.libsonnet index ad8fa70..ef7554c 100644 --- a/monitoring/vendor/kubernetes-mixin/rules/node.libsonnet +++ b/monitoring/vendor/kubernetes-mixin/rules/node.libsonnet @@ -15,15 +15,23 @@ // SINCE 2018-02-08 record: ':kube_pod_info_node_count:', expr: ||| - sum(min(kube_pod_info) by (%(clusterLabel)s, node)) - ||| % $._config, + sum(min(kube_pod_info) by (%(clusterLabel)s, node)) + ||| % $._config, }, { - // This rule results in the tuples (node, namespace, instance) => 1; - // it is used to calculate per-node metrics, given namespace & instance. + // This rule results in the tuples (node, namespace, instance) => 1. + // It is used to calculate per-node metrics, given namespace & instance. + // We use the topk() aggregator to ensure that each (namespace, + // instance) tuple is only associated to one node and thus avoid + // "many-to-many matching not allowed" errors when joining with + // other timeseries on (namespace, instance). See node:node_num_cpu:sum + // below for instance. record: 'node_namespace_pod:kube_pod_info:', expr: ||| - max(label_replace(kube_pod_info{%(kubeStateMetricsSelector)s}, "%(podLabel)s", "$1", "pod", "(.*)")) by (node, namespace, %(podLabel)s) + topk by(namespace, %(podLabel)s) (1, + max by (node, namespace, %(podLabel)s) ( + label_replace(kube_pod_info{%(kubeStateMetricsSelector)s}, "%(podLabel)s", "$1", "pod", "(.*)") + )) ||| % $._config, }, {