From dd18c911bc945eb5d04f78606ccfd0d8c913fb67 Mon Sep 17 00:00:00 2001 From: Tobias Brunner Date: Thu, 15 Oct 2020 21:02:11 +0200 Subject: [PATCH] update monitoring --- monitoring/jsonnetfile.lock.json | 44 ++-- .../alertmanager-tbrnt-config-secret.yaml | 17 -- .../grafana-dashboardDefinitions.yaml | 129 +++++++++- .../manifests/healthchecks-io-cronjob.yaml | 31 --- .../manifests/healthchecks-io-secret.yaml | 17 -- monitoring/manifests/k8rules.yaml | 40 --- .../kube-state-metrics-clusterRole.yaml | 2 +- .../manifests/node-exporter-daemonset.yaml | 12 +- .../manifests/node-exporter-service.yaml | 2 +- .../node-exporter-serviceMonitor.yaml | 2 +- .../prometheus-operator-serviceMonitor.yaml | 4 +- monitoring/manifests/prometheus-rules.yaml | 110 ++++++--- .../prometheus-serviceMonitorKubelet.yaml | 1 + .../prometheus-operator-clusterRole.yaml | 2 +- ...rometheus-operator-clusterRoleBinding.yaml | 2 +- .../setup/prometheus-operator-deployment.yaml | 8 +- .../setup/prometheus-operator-service.yaml | 2 +- .../prometheus-operator-serviceAccount.yaml | 2 +- .../manifests/traefik-service-monitor.yaml | 17 -- .../Documentation/etcd-mixin/mixin.libsonnet | 80 +++++- .../etcd/Documentation/etcd-mixin/test.yaml | 36 ++- .../grafonnet/graph_panel.libsonnet | 5 + .../grafonnet-lib/grafonnet/loki.libsonnet | 3 + .../alerts/apps_alerts.libsonnet | 2 +- .../dashboards/windows.libsonnet | 4 +- .../kubernetes-mixin/rules/windows.libsonnet | 4 +- .../kube-state-metrics.libsonnet | 3 +- .../kube-prometheus/alerts/alerts.libsonnet | 3 +- .../alerts/prometheus-operator.libsonnet | 63 ----- .../jsonnet/kube-prometheus/jsonnetfile.json | 9 + .../kube-prometheus/kube-prometheus.libsonnet | 4 +- .../kube-state-metrics.libsonnet | 233 +++++++++--------- .../node-exporter/node-exporter.libsonnet | 10 +- .../prometheus/prometheus.libsonnet | 1 + .../jsonnet/mixin/alerts.jsonnet | 3 + .../jsonnet/mixin/alerts/alerts.libsonnet | 95 +++++++ .../jsonnet/mixin/config.libsonnet | 5 + .../jsonnet/mixin/mixin.libsonnet | 2 + .../prometheus-operator.libsonnet | 2 +- .../docs/node-mixin/alerts/alerts.libsonnet | 10 +- .../docs/node-mixin/config.libsonnet | 5 + .../docs/node-mixin/dashboards/node.libsonnet | 5 +- monitoring/vendor/mixin | 1 + 43 files changed, 619 insertions(+), 413 deletions(-) delete mode 100644 monitoring/manifests/healthchecks-io-cronjob.yaml delete mode 100644 monitoring/manifests/healthchecks-io-secret.yaml delete mode 100644 monitoring/manifests/k8rules.yaml delete mode 100644 monitoring/manifests/traefik-service-monitor.yaml delete mode 100644 monitoring/vendor/github.com/prometheus-operator/kube-prometheus/jsonnet/kube-prometheus/alerts/prometheus-operator.libsonnet create mode 100644 monitoring/vendor/github.com/prometheus-operator/prometheus-operator/jsonnet/mixin/alerts.jsonnet create mode 100644 monitoring/vendor/github.com/prometheus-operator/prometheus-operator/jsonnet/mixin/alerts/alerts.libsonnet create mode 100644 monitoring/vendor/github.com/prometheus-operator/prometheus-operator/jsonnet/mixin/config.libsonnet create mode 100644 monitoring/vendor/github.com/prometheus-operator/prometheus-operator/jsonnet/mixin/mixin.libsonnet create mode 120000 monitoring/vendor/mixin diff --git a/monitoring/jsonnetfile.lock.json b/monitoring/jsonnetfile.lock.json index ef31195..23d2fb7 100644 --- a/monitoring/jsonnetfile.lock.json +++ b/monitoring/jsonnetfile.lock.json @@ -18,8 +18,8 @@ "subdir": "Documentation/etcd-mixin" } }, - "version": "528b01c327ee4abfd4afea29de9066c7f4b247fa", - "sum": "NhOkJWkO7ZO2DSE8Fvipcs7Hh2/GOCS0WjPPZU8OiaQ=" + "version": "e42127658c910d91e7902be958f12d41ac33d54f", + "sum": "L+PGlPK9mykGCJ9TIoEWdhMBjz+9lKuQ4YZ8fOeP9sk=" }, { "source": { @@ -28,8 +28,8 @@ "subdir": "grafonnet" } }, - "version": "cc1626a1b4dee45c99b78ddd9714dfd5f5d7816e", - "sum": "nkgrtMYPCq/YB4r3mKyToepaLhicwWnxDdGIodPpzz0=" + "version": "8d382c732dbdc839ff07549a3f42d25828f1b268", + "sum": "DRSRw4luAXlBXblo19/T1Jrv+9hyV8ivlS0KEtNANec=" }, { "source": { @@ -38,7 +38,7 @@ "subdir": "grafana-builder" } }, - "version": "2cc8d1dcb943eb3ff1dcb85bc9a3933afb36b730", + "version": "b5e45051995755ea373ea67642f8e5f54fcb8dd7", "sum": "mD0zEP9FVFXeag7EaeS5OvUr2A9D6DQhGemoNn6+PLc=" }, { @@ -59,8 +59,8 @@ "subdir": "" } }, - "version": "0bbe890539df0c1477000322c73977af71ef71e9", - "sum": "h48bpWnNFX9iN9Uqc9y0NTlKQu8sA1izvNyAHzsMIX8=" + "version": "aa2adbcf39884fd9c85d7c3e0ff338b1d61ea1ba", + "sum": "ttkPUnv/5bqlOFcZ8fvp2wi/S7ZLKiqAZ4ZdTolX77M=" }, { "source": { @@ -69,7 +69,7 @@ "subdir": "lib/promgrafonnet" } }, - "version": "0bbe890539df0c1477000322c73977af71ef71e9", + "version": "aa2adbcf39884fd9c85d7c3e0ff338b1d61ea1ba", "sum": "zv7hXGui6BfHzE9wPatHI/AGZa4A2WKo6pq7ZdqBsps=" }, { @@ -79,8 +79,8 @@ "subdir": "jsonnet/kube-state-metrics" } }, - "version": "daf555f1e11ad6aa37852653e63baede5f99367e", - "sum": "cJjGZaLBjcIGrLHZLjRPU9c3KL+ep9rZTb9dbALSKqA=" + "version": "35ef70bb74520a78cc8dc7cf364e1ff4e0c45063", + "sum": "ySP+bI2ZMLPt/sguSh9WrwI5H5dasaNFRE8Uo9PcZrI=" }, { "source": { @@ -89,7 +89,7 @@ "subdir": "jsonnet/kube-state-metrics-mixin" } }, - "version": "daf555f1e11ad6aa37852653e63baede5f99367e", + "version": "35ef70bb74520a78cc8dc7cf364e1ff4e0c45063", "sum": "Yf8mNAHrV1YWzrdV8Ry5dJ8YblepTGw3C0Zp10XIYLo=" }, { @@ -99,8 +99,18 @@ "subdir": "jsonnet/kube-prometheus" } }, - "version": "5fe45c57b60f17568001fd04a7dc2bb754fdf152", - "sum": "6Qrn74pNRqJNKYdsmcBu8ergYbMEH48qG1VDVm9FKak=" + "version": "980e95de011319b88a3b9c0787a81dcdf338a898", + "sum": "BxOXyWCSc9KkgWJXDau2Xtsy3aOYZDHz2VqOSLga7VU=" + }, + { + "source": { + "git": { + "remote": "https://github.com/prometheus-operator/prometheus-operator", + "subdir": "jsonnet/mixin" + } + }, + "version": "55baf034c431ed2c78d950b187f7d8b34dd06860", + "sum": "+Q45oBC7O8g7KQOaiKhGglwndAMWRlLTR94KUI8Q1Ko=" }, { "source": { @@ -109,8 +119,8 @@ "subdir": "jsonnet/prometheus-operator" } }, - "version": "96094ad1ab039950537df448b95bbcc04c57bfc4", - "sum": "ReamRYoS2C39Of7KtXGqkSWdfHw5Fy/Ix6ujOmBLFAg=" + "version": "cd331ce9bb58bb926e391c6ae807621cb12cc29e", + "sum": "nM1eDP5vftqAeQSmVYzSBAh+lG0SN6zu46QiocQiVhk=" }, { "source": { @@ -119,8 +129,8 @@ "subdir": "docs/node-mixin" } }, - "version": "d8a1585f59ef1169837d08979ecc92dcea8aa58a", - "sum": "EE+C+Krf518EGLjA/x3ZvKfenCI0J7YuwFJVBscypRw=" + "version": "f81747e608ea85ae44e76454eb63f9cb6484fb9e", + "sum": "VyMzZPxQIjiKQYGjZjXeKNWfLJ9vOl3emp84PWfsrUc=" }, { "source": { diff --git a/monitoring/manifests/alertmanager-tbrnt-config-secret.yaml b/monitoring/manifests/alertmanager-tbrnt-config-secret.yaml index 683019d..e69de29 100644 --- a/monitoring/manifests/alertmanager-tbrnt-config-secret.yaml +++ b/monitoring/manifests/alertmanager-tbrnt-config-secret.yaml @@ -1,17 +0,0 @@ -apiVersion: bitnami.com/v1alpha1 -kind: SealedSecret -metadata: - creationTimestamp: null - name: alertmanager-tbrnt-config - namespace: monitoring -spec: - encryptedData: - alertmanager.yaml: AgATnPnwlvlfGVJeWhjcFHyp/am3nmguqi9PsWgEhxtVLuyDA6OB1G+BdJZ7dGdCViZGeDJD5mHxESSLDMTfxg5DxMDIG18XCzojMtRFTPJMZPLjbETNuSZqtrkbscp/qQom4z+igVuLlkaihdYRcCNV+B0vm+1h6BUPV8Utv1RN1dy9XUvdrvhPRNFvqhCpVcpcLwNP5cli5SNYgVc/ty6a45Fl5h+KLv7rFBJexLhUXoR0jamQpQWoH7oNHcS4ONHxLDMKXqE9jFpKzlQJBNgiRQEEotwCYTodoALmkcIs37Ai+trQxEMZZYtD5vFzbehfTtNLT1bPhLiX91rv6Q9n9wuIw951Qk11L6cF93zDl2mZ9dAQHSAglVHEriXKXBZ3Df4DSyh5qkr+/7lFBdFTQVMS5+YTgM1eCmG1yfsvU33IWKh5wrNhpkUqGLiq9f+4k3xPQVysVY3jJjVhINM/A9OsTPfFzm7aAAklBxuROXiZgZ/6L4Oc/c0Tv3EN/02rhinSGr1hIMmcaSxdOVQxXPU+pbx4JcSmyQFXIY37n/2ya/UbJW/o901MtigCukUvgMedkxPSGhedvHOygKKXPKNSl5U1Emhza7c6vP9cSiiaHpRm7EyUQvjWpJRUP7tSRgDlZyBM9Ud0PRBRdYWLG5YlZB4STOX6cDyYOcFJvnAyiZpDuwOPKMOrhWQSbCgcMcuS/RCgCnYJ4YBfm1cSxcqxsA65PRhXbRmiY9b/Mqs7s1xpJo3RySO27JiffbY+vYRIrFv4G4ak0ug9AQJvrvEA/ZgSs9xpASXSsr42pB63exhlZP+D9JEDGFLgzGQVnVFRFDrlYLFQieqWDgBc0pkcxHHwGBTp3H6PP5RRPLKzNoypTbNrdLlaNAsAlb8VuPopPBHqLcpU+DPuxxBCQ/P8ezYXE8RmpH7x4A2rPLbV902zYVwfnWMrUdPZif7oPnn/xM+VDQMLIlKA/CQySudzAYf621N450V2zE0akOFQNATGEUZT+8HtjNKMcAxjojP/pJpo486t02KXHDw+i04R1kdGPVkLnZDz+UShh4eLwovBL3zopxHxSFnonI1Ez+IetemM+aCJhadU4YNC4zY7x+blNa51ZAGEGoXuSJB3fszLd7wSz4owIhQnb+StxVNcwNgircvFjhauLVLwEynO4WTm+YKzosf2GxJAaNriveyLj1L+DwBUOeWnvmL4QsHX3nriZgVR89KlIQI1d3+lcf2jw8VqfHj3tbpMxb98qZFWw2pczrDeE2t4UQTPTj+4VK4htKGhnIHlNmdeSN39GJCWfilnzAsznrkxZsr1wIqrP+ayRO+NxljBzUN7xspByJeJrBpzW1pukvNg74MK1K7g0/fh/zmqyduQYmJnCDDqfN0PB1YoXFPZn3o4kzGnLXetgJHyJG4tsinediVXrZJb+6KOL31hEbZVArP/gWYHQv1MltEZj2yunuLeEy+Oo35oS/IkW17qF3gkF0sLavZFhJe6XqVvO2BFF1V8S15cBApXXPvFJQWyFLOTVqhYicYfJqQEgz7sKH1uYK0zLeyBkRIfUXk3vs7X9X/CzQo9J9oH5FCgNEI6GdROhitGUihCxwAVa6lsteVxZZ4USkMGZtJPG3Pi5RQuWpSMFX9nWm/LHv8wpgFItx12ZkuaIDB30wYfp4SqcBr1ZvuptKtwJpTUjVx5HSUKnEye2g== - template: - metadata: - creationTimestamp: null - name: alertmanager-tbrnt-config - namespace: monitoring - type: Opaque -status: {} - diff --git a/monitoring/manifests/grafana-dashboardDefinitions.yaml b/monitoring/manifests/grafana-dashboardDefinitions.yaml index 0a8abe7..99bd69a 100644 --- a/monitoring/manifests/grafana-dashboardDefinitions.yaml +++ b/monitoring/manifests/grafana-dashboardDefinitions.yaml @@ -144,6 +144,7 @@ items: "decimals": 3, "description": "How much error budget is left looking at our 0.990% availability gurantees?", "fill": 10, + "fillGradient": 0, "gridPos": { }, @@ -338,6 +339,7 @@ items: "datasource": "$datasource", "description": "How many read requests (LIST,GET) per second do the apiservers get by code?", "fill": 10, + "fillGradient": 0, "gridPos": { }, @@ -446,6 +448,7 @@ items: "datasource": "$datasource", "description": "How many percent of read requests (LIST,GET) per second are returned with errors (5xx)?", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -539,6 +542,7 @@ items: "datasource": "$datasource", "description": "How many seconds is the 99th percentile for reading (LIST|GET) a given resource?", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -731,6 +735,7 @@ items: "datasource": "$datasource", "description": "How many write requests (POST|PUT|PATCH|DELETE) per second do the apiservers get by code?", "fill": 10, + "fillGradient": 0, "gridPos": { }, @@ -839,6 +844,7 @@ items: "datasource": "$datasource", "description": "How many percent of write requests (POST|PUT|PATCH|DELETE) per second are returned with errors (5xx)?", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -932,6 +938,7 @@ items: "datasource": "$datasource", "description": "How many seconds is the 99th percentile for writing (POST|PUT|PATCH|DELETE) a given resource?", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -1037,6 +1044,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -1129,6 +1137,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -1221,6 +1230,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -1326,6 +1336,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -1418,6 +1429,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -1510,6 +1522,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -1780,6 +1793,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 2, + "fillGradient": 0, "gridPos": { "h": 9, "w": 12, @@ -1882,6 +1896,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 2, + "fillGradient": 0, "gridPos": { "h": 9, "w": 12, @@ -2325,6 +2340,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 2, + "fillGradient": 0, "gridPos": { "h": 9, "w": 12, @@ -2427,6 +2443,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 2, + "fillGradient": 0, "gridPos": { "h": 9, "w": 12, @@ -2559,6 +2576,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 2, + "fillGradient": 0, "gridPos": { "h": 9, "w": 24, @@ -2659,6 +2677,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 2, + "fillGradient": 0, "gridPos": { "h": 9, "w": 24, @@ -2770,6 +2789,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 2, + "fillGradient": 0, "gridPos": { "h": 9, "w": 24, @@ -2870,6 +2890,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 2, + "fillGradient": 0, "gridPos": { "h": 9, "w": 24, @@ -2990,6 +3011,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 2, + "fillGradient": 0, "gridPos": { "h": 9, "w": 24, @@ -3090,6 +3112,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 2, + "fillGradient": 0, "gridPos": { "h": 9, "w": 24, @@ -3190,6 +3213,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 2, + "fillGradient": 0, "gridPos": { "h": 9, "w": 24, @@ -3294,6 +3318,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 2, + "fillGradient": 0, "gridPos": { "h": 9, "w": 24, @@ -3668,6 +3693,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -3773,6 +3799,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -3878,6 +3905,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -3983,6 +4011,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -4096,6 +4125,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -4201,6 +4231,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -4306,6 +4337,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -4398,6 +4430,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -4490,6 +4523,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -16978,6 +17012,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -17070,6 +17105,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -17175,6 +17211,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -17280,6 +17317,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -17379,6 +17417,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -17491,6 +17530,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -17585,6 +17625,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -17692,6 +17733,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -17799,6 +17841,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -17891,6 +17934,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -17997,6 +18041,7 @@ items: "datasource": "$datasource", "description": "Pod lifecycle event generator", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -18089,6 +18134,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -18194,6 +18240,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -18299,6 +18346,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -18425,6 +18473,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -18530,6 +18579,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -18622,6 +18672,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -18714,6 +18765,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -19527,6 +19579,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 2, + "fillGradient": 0, "gridPos": { "h": 9, "w": 12, @@ -19627,6 +19680,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 2, + "fillGradient": 0, "gridPos": { "h": 9, "w": 12, @@ -19738,6 +19792,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 2, + "fillGradient": 0, "gridPos": { "h": 10, "w": 12, @@ -19838,6 +19893,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 2, + "fillGradient": 0, "gridPos": { "h": 10, "w": 12, @@ -19958,6 +20014,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 2, + "fillGradient": 0, "gridPos": { "h": 10, "w": 12, @@ -20058,6 +20115,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 2, + "fillGradient": 0, "gridPos": { "h": 10, "w": 12, @@ -20400,6 +20458,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 2, + "fillGradient": 0, "gridPos": { "h": 9, "w": 12, @@ -20502,6 +20561,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 2, + "fillGradient": 0, "gridPos": { "h": 9, "w": 12, @@ -20945,6 +21005,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 2, + "fillGradient": 0, "gridPos": { "h": 9, "w": 12, @@ -21047,6 +21108,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 2, + "fillGradient": 0, "gridPos": { "h": 9, "w": 12, @@ -21179,6 +21241,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 2, + "fillGradient": 0, "gridPos": { "h": 9, "w": 12, @@ -21279,6 +21342,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 2, + "fillGradient": 0, "gridPos": { "h": 9, "w": 12, @@ -21390,6 +21454,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 2, + "fillGradient": 0, "gridPos": { "h": 9, "w": 12, @@ -21490,6 +21555,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 2, + "fillGradient": 0, "gridPos": { "h": 9, "w": 12, @@ -21610,6 +21676,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 2, + "fillGradient": 0, "gridPos": { "h": 9, "w": 12, @@ -21710,6 +21777,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 2, + "fillGradient": 0, "gridPos": { "h": 9, "w": 12, @@ -23983,6 +24051,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -24076,6 +24145,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 0, + "fillGradient": 0, "gridPos": { }, @@ -24202,6 +24272,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -24367,7 +24438,7 @@ items: "tableColumn": "", "targets": [ { - "expr": "100 -\n(\n node_memory_MemAvailable_bytes{job=\"node-exporter\", instance=\"$instance\"}\n/\n node_memory_MemTotal_bytes{job=\"node-exporter\", instance=\"$instance\"}\n* 100\n)\n", + "expr": "100 -\n(\n avg(node_memory_MemAvailable_bytes{job=\"node-exporter\", instance=\"$instance\"})\n/\n avg(node_memory_MemTotal_bytes{job=\"node-exporter\", instance=\"$instance\"})\n* 100\n)\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "", @@ -24412,6 +24483,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 0, + "fillGradient": 0, "gridPos": { }, @@ -24528,6 +24600,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -24647,6 +24720,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 0, + "fillGradient": 0, "gridPos": { }, @@ -24740,6 +24814,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 0, + "fillGradient": 0, "gridPos": { }, @@ -24961,6 +25036,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -25157,6 +25233,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -25819,6 +25896,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 2, + "fillGradient": 0, "gridPos": { "h": 9, "w": 12, @@ -25919,6 +25997,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 2, + "fillGradient": 0, "gridPos": { "h": 9, "w": 12, @@ -26030,6 +26109,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 2, + "fillGradient": 0, "gridPos": { "h": 10, "w": 12, @@ -26130,6 +26210,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 2, + "fillGradient": 0, "gridPos": { "h": 10, "w": 12, @@ -26250,6 +26331,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 2, + "fillGradient": 0, "gridPos": { "h": 10, "w": 12, @@ -26350,6 +26432,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 2, + "fillGradient": 0, "gridPos": { "h": 10, "w": 12, @@ -26700,6 +26783,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -26792,6 +26876,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -26897,6 +26982,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -27002,6 +27088,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -27095,6 +27182,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -27187,6 +27275,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -27279,6 +27368,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -27384,6 +27474,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -27476,6 +27567,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -27581,6 +27673,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -27673,6 +27766,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -27778,6 +27872,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -27870,6 +27965,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -27962,6 +28058,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -28054,6 +28151,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -29634,6 +29732,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -29726,6 +29825,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -29831,6 +29931,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -29923,6 +30024,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -30028,6 +30130,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -30141,6 +30244,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -30246,6 +30350,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -30351,6 +30456,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -30443,6 +30549,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -30535,6 +30642,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -30839,6 +30947,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -30952,6 +31061,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -31078,6 +31188,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -31191,6 +31302,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -31296,6 +31408,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -31401,6 +31514,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -31493,6 +31607,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -31585,6 +31700,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -32417,6 +32533,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -33529,6 +33646,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 2, + "fillGradient": 0, "gridPos": { "h": 9, "w": 12, @@ -33631,6 +33749,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 2, + "fillGradient": 0, "gridPos": { "h": 9, "w": 12, @@ -33744,6 +33863,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 2, + "fillGradient": 0, "gridPos": { "h": 9, "w": 12, @@ -33846,6 +33966,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 2, + "fillGradient": 0, "gridPos": { "h": 9, "w": 12, @@ -33978,6 +34099,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 2, + "fillGradient": 0, "gridPos": { "h": 9, "w": 12, @@ -34078,6 +34200,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 2, + "fillGradient": 0, "gridPos": { "h": 9, "w": 12, @@ -34189,6 +34312,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 2, + "fillGradient": 0, "gridPos": { "h": 9, "w": 12, @@ -34289,6 +34413,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 2, + "fillGradient": 0, "gridPos": { "h": 9, "w": 12, @@ -34409,6 +34534,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 2, + "fillGradient": 0, "gridPos": { "h": 9, "w": 12, @@ -34509,6 +34635,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 2, + "fillGradient": 0, "gridPos": { "h": 9, "w": 12, diff --git a/monitoring/manifests/healthchecks-io-cronjob.yaml b/monitoring/manifests/healthchecks-io-cronjob.yaml deleted file mode 100644 index d9417bd..0000000 --- a/monitoring/manifests/healthchecks-io-cronjob.yaml +++ /dev/null @@ -1,31 +0,0 @@ -apiVersion: batch/v1beta1 -kind: CronJob -metadata: - name: healthchecks-io - namespace: monitoring -spec: - schedule: "*/1 * * * *" - concurrencyPolicy: Forbid - successfulJobsHistoryLimit: 1 - failedJobsHistoryLimit: 1 - startingDeadlineSeconds: 200 - jobTemplate: - spec: - template: - spec: - containers: - - name: pinghc - env: - - name: HCURL - valueFrom: - secretKeyRef: - name: healthchecks-io - key: HCURL - image: busybox - args: - - /bin/sh - - -c - - "date && echo $HCURL && /bin/wget -q -O - --no-check-certificate $HCURL" - restartPolicy: OnFailure - - diff --git a/monitoring/manifests/healthchecks-io-secret.yaml b/monitoring/manifests/healthchecks-io-secret.yaml deleted file mode 100644 index 948d736..0000000 --- a/monitoring/manifests/healthchecks-io-secret.yaml +++ /dev/null @@ -1,17 +0,0 @@ -apiVersion: bitnami.com/v1alpha1 -kind: SealedSecret -metadata: - creationTimestamp: null - name: healthchecks-io - namespace: monitoring -spec: - encryptedData: - HCURL: AgBEpwET1Qa1hQqAmwrNGBv4sL0ml8pGYPwgq9Aps3tYhBVqsXjV7U5RQa/txldg1umw2Zqx8MfvZTN2kmFk6bJTROCWqTxmxd4rHgnJYqRR0+Opn/BtDhVx4WTnehyM/il9ymddhMD+WRQDr/Wfxq/0UQdsy+IEYyVMQuOKEihZabxmXRyNeAl5ZBeQ0W1T29biJPx3rifS37RbGlJtCIYuNPh82d0KAMu1dszDnkln8k5CBv6mPD8BVHg+Z/y1v1jFhTIE3YOlGzCIjb8RrJj6MVm7zlauj8zrl30JvF2OAWDGGZDOL3b0G3IKd0Qp/eagT33Sx7vbppY/l1Vci6UQcVpde3u2+ATMbysRej04Mvcodq5OgkBFqbgCzx0UFTIq0wER/GuCoYbt+k8b3TouK5ChQet8EP0W/c7rLHcMY3c0UR00N7m5UeKZAzAkXSGV+u3M9K6PMp8pl0VuDo+IVgEIY7ku9rtzL7SPIfXS4u5w7fte13fOtKB/2sa11dNqAbHmidF+IO6ycjm8SZibC7NKyCxgIKWPfsFXhNUT2Nx7eBRrzR1QlqThIGRsDpX1RVplTwe/OLsBz0K99AyGDUkSBJdOZLaRT/b3T0nS8DE5x/e8MvFsbbDdGE2U/YhVrbfn072u/X979/RIm0oCjipvByZXhFmobRj9SP9RcK2UfjBSY7xyKnd2rjj1mnIs2S0CmwGFdJqoywHckJJOu3YP2oN2Q1U7+Fe4yciupAshgdszY2okHMtd4aDDJJKeKKFHpjpsuA== - template: - metadata: - creationTimestamp: null - name: healthchecks-io - namespace: monitoring - type: Opaque -status: {} - diff --git a/monitoring/manifests/k8rules.yaml b/monitoring/manifests/k8rules.yaml deleted file mode 100644 index bef9f77..0000000 --- a/monitoring/manifests/k8rules.yaml +++ /dev/null @@ -1,40 +0,0 @@ -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - name: k8up - labels: - prometheus: k8s - role: alert-rules -spec: - groups: - - name: k8up.rules - rules: - - alert: baas_last_errors - expr: baas_backup_restic_last_errors > 0 - for: 1m - labels: - severity: critical - annotations: - summary: Amount of errors of last restic backup - description: This alert is fired when error number is > 0 - - alert: K8upBackupFailed - expr: rate(k8up_jobs_failed_counter[1d]) > 0 - for: 1m - labels: - severity: critical - annotations: - summary: "Job in {{ $labels.namespace }} of type {{ $labels.jobType }} failed" - - alert: K8upBackupNotRunning - expr: sum(rate(k8up_jobs_total[25h])) == 0 and on(namespace) k8up_schedules_gauge > 0 - for: 1m - labels: - severity: critical - annotations: - summary: "No K8up jobs were run in {{ $labels.namespace }} within the last 24 hours. Check the operator, there might be a deadlock" - - alert: K8upJobStuck - expr: k8up_jobs_queued_gauge{jobType="backup"} > 0 and on(namespace) k8up_schedules_gauge > 0 - for: 24h - labels: - severity: critical - annotations: - summary: "K8up jobs are stuck in {{ $labels.namespace }} for the last 24 hours." diff --git a/monitoring/manifests/kube-state-metrics-clusterRole.yaml b/monitoring/manifests/kube-state-metrics-clusterRole.yaml index d4d4c22..9b330bd 100644 --- a/monitoring/manifests/kube-state-metrics-clusterRole.yaml +++ b/monitoring/manifests/kube-state-metrics-clusterRole.yaml @@ -30,7 +30,6 @@ rules: - daemonsets - deployments - replicasets - - ingresses verbs: - list - watch @@ -105,6 +104,7 @@ rules: - networking.k8s.io resources: - networkpolicies + - ingresses verbs: - list - watch diff --git a/monitoring/manifests/node-exporter-daemonset.yaml b/monitoring/manifests/node-exporter-daemonset.yaml index f99a1e5..c659215 100644 --- a/monitoring/manifests/node-exporter-daemonset.yaml +++ b/monitoring/manifests/node-exporter-daemonset.yaml @@ -3,7 +3,7 @@ kind: DaemonSet metadata: labels: app.kubernetes.io/name: node-exporter - app.kubernetes.io/version: v0.18.1 + app.kubernetes.io/version: v1.0.1 name: node-exporter namespace: monitoring spec: @@ -14,7 +14,7 @@ spec: metadata: labels: app.kubernetes.io/name: node-exporter - app.kubernetes.io/version: v0.18.1 + app.kubernetes.io/version: v1.0.1 spec: containers: - args: @@ -25,7 +25,7 @@ spec: - --no-collector.wifi - --no-collector.hwmon - --collector.filesystem.ignored-mount-points=^/(dev|proc|sys|var/lib/docker/.+|var/lib/kubelet/pods/.+)($|/) - image: quay.io/prometheus/node-exporter:v0.18.1 + image: quay.io/prometheus/node-exporter:v1.0.1 name: node-exporter resources: limits: @@ -36,11 +36,13 @@ spec: memory: 180Mi volumeMounts: - mountPath: /host/proc + mountPropagation: HostToContainer name: proc - readOnly: false + readOnly: true - mountPath: /host/sys + mountPropagation: HostToContainer name: sys - readOnly: false + readOnly: true - mountPath: /host/root mountPropagation: HostToContainer name: root diff --git a/monitoring/manifests/node-exporter-service.yaml b/monitoring/manifests/node-exporter-service.yaml index 7dfbef6..cb96660 100644 --- a/monitoring/manifests/node-exporter-service.yaml +++ b/monitoring/manifests/node-exporter-service.yaml @@ -3,7 +3,7 @@ kind: Service metadata: labels: app.kubernetes.io/name: node-exporter - app.kubernetes.io/version: v0.18.1 + app.kubernetes.io/version: v1.0.1 name: node-exporter namespace: monitoring spec: diff --git a/monitoring/manifests/node-exporter-serviceMonitor.yaml b/monitoring/manifests/node-exporter-serviceMonitor.yaml index 357164d..8e5a97c 100644 --- a/monitoring/manifests/node-exporter-serviceMonitor.yaml +++ b/monitoring/manifests/node-exporter-serviceMonitor.yaml @@ -3,7 +3,7 @@ kind: ServiceMonitor metadata: labels: app.kubernetes.io/name: node-exporter - app.kubernetes.io/version: v0.18.1 + app.kubernetes.io/version: v1.0.1 name: node-exporter namespace: monitoring spec: diff --git a/monitoring/manifests/prometheus-operator-serviceMonitor.yaml b/monitoring/manifests/prometheus-operator-serviceMonitor.yaml index a8b977d..52ee0e3 100644 --- a/monitoring/manifests/prometheus-operator-serviceMonitor.yaml +++ b/monitoring/manifests/prometheus-operator-serviceMonitor.yaml @@ -4,7 +4,7 @@ metadata: labels: app.kubernetes.io/component: controller app.kubernetes.io/name: prometheus-operator - app.kubernetes.io/version: v0.42.0 + app.kubernetes.io/version: v0.42.1 name: prometheus-operator namespace: monitoring spec: @@ -19,4 +19,4 @@ spec: matchLabels: app.kubernetes.io/component: controller app.kubernetes.io/name: prometheus-operator - app.kubernetes.io/version: v0.42.0 + app.kubernetes.io/version: v0.42.1 diff --git a/monitoring/manifests/prometheus-rules.yaml b/monitoring/manifests/prometheus-rules.yaml index 166f215..43b1055 100644 --- a/monitoring/manifests/prometheus-rules.yaml +++ b/monitoring/manifests/prometheus-rules.yaml @@ -1019,6 +1019,8 @@ spec: summary: Clock not synchronising. expr: | min_over_time(node_timex_sync_status[5m]) == 0 + and + node_timex_maxerror_seconds >= 16 for: 10m labels: severity: warning @@ -1044,6 +1046,75 @@ spec: node_md_disks{state="fail"} > 0 labels: severity: warning + - name: prometheus-operator + rules: + - alert: PrometheusOperatorListErrors + annotations: + description: Errors while performing List operations in controller {{$labels.controller}} + in {{$labels.namespace}} namespace. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatorlisterrors + summary: Errors while performing list operations in controller. + expr: | + (sum by (controller,namespace) (rate(prometheus_operator_list_operations_failed_total{job="prometheus-operator",namespace="monitoring"}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_list_operations_total{job="prometheus-operator",namespace="monitoring"}[10m]))) > 0.4 + for: 15m + labels: + severity: warning + - alert: PrometheusOperatorWatchErrors + annotations: + description: Errors while performing watch operations in controller {{$labels.controller}} + in {{$labels.namespace}} namespace. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatorwatcherrors + summary: Errors while performing watch operations in controller. + expr: | + (sum by (controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job="prometheus-operator",namespace="monitoring"}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_watch_operations_total{job="prometheus-operator",namespace="monitoring"}[10m]))) > 0.4 + for: 15m + labels: + severity: warning + - alert: PrometheusOperatorSyncFailed + annotations: + description: Controller {{ $labels.controller }} in {{ $labels.namespace }} + namespace fails to reconcile {{ $value }} objects. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatorsyncfailed + summary: Last controller reconciliation failed + expr: | + min_over_time(prometheus_operator_syncs{status="failed",job="prometheus-operator",namespace="monitoring"}[5m]) > 0 + for: 10m + labels: + severity: warning + - alert: PrometheusOperatorReconcileErrors + annotations: + description: '{{ $value | humanizePercentage }} of reconciling operations + failed for {{ $labels.controller }} controller in {{ $labels.namespace }} + namespace.' + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatorreconcileerrors + summary: Errors while reconciling controller. + expr: | + (sum by (controller,namespace) (rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator",namespace="monitoring"}[5m]))) / (sum by (controller,namespace) (rate(prometheus_operator_reconcile_operations_total{job="prometheus-operator",namespace="monitoring"}[5m]))) > 0.1 + for: 10m + labels: + severity: warning + - alert: PrometheusOperatorNodeLookupErrors + annotations: + description: Errors while reconciling Prometheus in {{ $labels.namespace }} + Namespace. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatornodelookuperrors + summary: Errors while reconciling Prometheus. + expr: | + rate(prometheus_operator_node_address_lookup_errors_total{job="prometheus-operator",namespace="monitoring"}[5m]) > 0.1 + for: 10m + labels: + severity: warning + - alert: PrometheusOperatorNotReady + annotations: + description: Prometheus operator in {{ $labels.namespace }} namespace isn't + ready to reconcile {{ $labels.controller }} resources. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatornotready + summary: Prometheus operator not ready + expr: | + min by(namespace, controller) (max_over_time(prometheus_operator_ready{job="prometheus-operator",namespace="monitoring"}[5m]) == 0) + for: 5m + labels: + severity: warning - name: kubernetes-apps rules: - alert: KubePodCrashLooping @@ -1249,7 +1320,7 @@ spec: - alert: KubeJobFailed annotations: description: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to - complete. + complete. Removing failed job after investigation should clear this alert. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobfailed summary: Job failed to complete. expr: | @@ -2031,40 +2102,3 @@ spec: for: 2m labels: severity: warning - - name: prometheus-operator - rules: - - alert: PrometheusOperatorListErrors - annotations: - message: Errors while performing List operations in controller {{$labels.controller}} - in {{$labels.namespace}} namespace. - expr: | - (sum by (controller,namespace) (rate(prometheus_operator_list_operations_failed_total{job="prometheus-operator",namespace="monitoring"}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_list_operations_total{job="prometheus-operator",namespace="monitoring"}[10m]))) > 0.4 - for: 15m - labels: - severity: warning - - alert: PrometheusOperatorWatchErrors - annotations: - message: Errors while performing Watch operations in controller {{$labels.controller}} - in {{$labels.namespace}} namespace. - expr: | - (sum by (controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job="prometheus-operator",namespace="monitoring"}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_watch_operations_total{job="prometheus-operator",namespace="monitoring"}[10m]))) > 0.4 - for: 15m - labels: - severity: warning - - alert: PrometheusOperatorReconcileErrors - annotations: - message: Errors while reconciling {{ $labels.controller }} in {{ $labels.namespace - }} Namespace. - expr: | - rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator",namespace="monitoring"}[5m]) > 0.1 - for: 10m - labels: - severity: warning - - alert: PrometheusOperatorNodeLookupErrors - annotations: - message: Errors while reconciling Prometheus in {{ $labels.namespace }} Namespace. - expr: | - rate(prometheus_operator_node_address_lookup_errors_total{job="prometheus-operator",namespace="monitoring"}[5m]) > 0.1 - for: 10m - labels: - severity: warning diff --git a/monitoring/manifests/prometheus-serviceMonitorKubelet.yaml b/monitoring/manifests/prometheus-serviceMonitorKubelet.yaml index afa853a..7db47ef 100644 --- a/monitoring/manifests/prometheus-serviceMonitorKubelet.yaml +++ b/monitoring/manifests/prometheus-serviceMonitorKubelet.yaml @@ -53,6 +53,7 @@ spec: insecureSkipVerify: true - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token honorLabels: true + honorTimestamps: false interval: 30s metricRelabelings: - action: drop diff --git a/monitoring/manifests/setup/prometheus-operator-clusterRole.yaml b/monitoring/manifests/setup/prometheus-operator-clusterRole.yaml index a3de43b..ed89587 100644 --- a/monitoring/manifests/setup/prometheus-operator-clusterRole.yaml +++ b/monitoring/manifests/setup/prometheus-operator-clusterRole.yaml @@ -4,7 +4,7 @@ metadata: labels: app.kubernetes.io/component: controller app.kubernetes.io/name: prometheus-operator - app.kubernetes.io/version: v0.42.0 + app.kubernetes.io/version: v0.42.1 name: prometheus-operator rules: - apiGroups: diff --git a/monitoring/manifests/setup/prometheus-operator-clusterRoleBinding.yaml b/monitoring/manifests/setup/prometheus-operator-clusterRoleBinding.yaml index 9001430..38e9826 100644 --- a/monitoring/manifests/setup/prometheus-operator-clusterRoleBinding.yaml +++ b/monitoring/manifests/setup/prometheus-operator-clusterRoleBinding.yaml @@ -4,7 +4,7 @@ metadata: labels: app.kubernetes.io/component: controller app.kubernetes.io/name: prometheus-operator - app.kubernetes.io/version: v0.42.0 + app.kubernetes.io/version: v0.42.1 name: prometheus-operator roleRef: apiGroup: rbac.authorization.k8s.io diff --git a/monitoring/manifests/setup/prometheus-operator-deployment.yaml b/monitoring/manifests/setup/prometheus-operator-deployment.yaml index e0aaf06..32cbd4a 100644 --- a/monitoring/manifests/setup/prometheus-operator-deployment.yaml +++ b/monitoring/manifests/setup/prometheus-operator-deployment.yaml @@ -4,7 +4,7 @@ metadata: labels: app.kubernetes.io/component: controller app.kubernetes.io/name: prometheus-operator - app.kubernetes.io/version: v0.42.0 + app.kubernetes.io/version: v0.42.1 name: prometheus-operator namespace: monitoring spec: @@ -18,15 +18,15 @@ spec: labels: app.kubernetes.io/component: controller app.kubernetes.io/name: prometheus-operator - app.kubernetes.io/version: v0.42.0 + app.kubernetes.io/version: v0.42.1 spec: containers: - args: - --kubelet-service=kube-system/kubelet - --logtostderr=true - --config-reloader-image=jimmidyson/configmap-reload:v0.4.0 - - --prometheus-config-reloader=quay.io/prometheus-operator/prometheus-config-reloader:v0.42.0 - image: quay.io/prometheus-operator/prometheus-operator:v0.42.0 + - --prometheus-config-reloader=quay.io/prometheus-operator/prometheus-config-reloader:v0.42.1 + image: quay.io/prometheus-operator/prometheus-operator:v0.42.1 name: prometheus-operator ports: - containerPort: 8080 diff --git a/monitoring/manifests/setup/prometheus-operator-service.yaml b/monitoring/manifests/setup/prometheus-operator-service.yaml index 91d1bbe..a1543b1 100644 --- a/monitoring/manifests/setup/prometheus-operator-service.yaml +++ b/monitoring/manifests/setup/prometheus-operator-service.yaml @@ -4,7 +4,7 @@ metadata: labels: app.kubernetes.io/component: controller app.kubernetes.io/name: prometheus-operator - app.kubernetes.io/version: v0.42.0 + app.kubernetes.io/version: v0.42.1 name: prometheus-operator namespace: monitoring spec: diff --git a/monitoring/manifests/setup/prometheus-operator-serviceAccount.yaml b/monitoring/manifests/setup/prometheus-operator-serviceAccount.yaml index 2a98d4d..37f53fe 100644 --- a/monitoring/manifests/setup/prometheus-operator-serviceAccount.yaml +++ b/monitoring/manifests/setup/prometheus-operator-serviceAccount.yaml @@ -4,6 +4,6 @@ metadata: labels: app.kubernetes.io/component: controller app.kubernetes.io/name: prometheus-operator - app.kubernetes.io/version: v0.42.0 + app.kubernetes.io/version: v0.42.1 name: prometheus-operator namespace: monitoring diff --git a/monitoring/manifests/traefik-service-monitor.yaml b/monitoring/manifests/traefik-service-monitor.yaml deleted file mode 100644 index be736ba..0000000 --- a/monitoring/manifests/traefik-service-monitor.yaml +++ /dev/null @@ -1,17 +0,0 @@ ---- -apiVersion: monitoring.coreos.com/v1 -kind: ServiceMonitor -metadata: - name: traefik - namespace: monitoring -spec: - endpoints: - - interval: 30s - path: /metrics - port: metrics - namespaceSelector: - matchNames: - - kube-system - selector: - matchLabels: - app: traefik diff --git a/monitoring/vendor/github.com/etcd-io/etcd/Documentation/etcd-mixin/mixin.libsonnet b/monitoring/vendor/github.com/etcd-io/etcd/Documentation/etcd-mixin/mixin.libsonnet index cf74da8..6c146fd 100644 --- a/monitoring/vendor/github.com/etcd-io/etcd/Documentation/etcd-mixin/mixin.libsonnet +++ b/monitoring/vendor/github.com/etcd-io/etcd/Documentation/etcd-mixin/mixin.libsonnet @@ -34,7 +34,8 @@ severity: 'critical', }, annotations: { - message: 'etcd cluster "{{ $labels.job }}": members are down ({{ $value }}).', + description: 'etcd cluster "{{ $labels.job }}": members are down ({{ $value }}).', + summary: 'etcd cluster members are down.', }, }, { @@ -47,7 +48,8 @@ severity: 'critical', }, annotations: { - message: 'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value }}).', + description: 'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value }}).', + summary: 'etcd cluster has insufficient number of members.', }, }, { @@ -60,7 +62,8 @@ severity: 'critical', }, annotations: { - message: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance }} has no leader.', + description: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance }} has no leader.', + summary: 'etcd cluster has no leader.', }, }, { @@ -73,7 +76,8 @@ severity: 'warning', }, annotations: { - message: 'etcd cluster "{{ $labels.job }}": {{ $value }} leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.', + description: 'etcd cluster "{{ $labels.job }}": {{ $value }} leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.', + summary: 'etcd cluster has high number of leader changes.', }, }, { @@ -89,7 +93,8 @@ severity: 'warning', }, annotations: { - message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.', + description: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.', + summary: 'etcd cluster has high number of failed grpc requests.', }, }, { @@ -105,7 +110,8 @@ severity: 'critical', }, annotations: { - message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.', + description: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.', + summary: 'etcd cluster has high number of failed grpc requests.', }, }, { @@ -119,7 +125,8 @@ severity: 'critical', }, annotations: { - message: 'etcd cluster "{{ $labels.job }}": gRPC requests to {{ $labels.grpc_method }} are taking {{ $value }}s on etcd instance {{ $labels.instance }}.', + description: 'etcd cluster "{{ $labels.job }}": gRPC requests to {{ $labels.grpc_method }} are taking {{ $value }}s on etcd instance {{ $labels.instance }}.', + summary: 'etcd grpc requests are slow', }, }, { @@ -133,7 +140,8 @@ severity: 'warning', }, annotations: { - message: 'etcd cluster "{{ $labels.job }}": member communication with {{ $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance }}.', + description: 'etcd cluster "{{ $labels.job }}": member communication with {{ $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance }}.', + summary: 'etcd cluster member communication is slow.', }, }, { @@ -146,7 +154,8 @@ severity: 'warning', }, annotations: { - message: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures within the last 30 minutes on etcd instance {{ $labels.instance }}.', + description: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures within the last 30 minutes on etcd instance {{ $labels.instance }}.', + summary: 'etcd cluster has high number of proposal failures.', }, }, { @@ -159,6 +168,21 @@ labels: { severity: 'warning', }, + annotations: { + description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.', + summary: 'etcd cluster 99th percentile fsync durations are too high.', + }, + }, + { + alert: 'etcdHighFsyncDurations', + expr: ||| + histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{%(etcd_selector)s}[5m])) + > 1 + ||| % $._config, + 'for': '10m', + labels: { + severity: 'critical', + }, annotations: { message: 'etcd cluster "{{ $labels.job }}": 99th percentile fync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.', }, @@ -174,7 +198,8 @@ severity: 'warning', }, annotations: { - message: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations {{ $value }}s on etcd instance {{ $labels.instance }}.', + description: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations {{ $value }}s on etcd instance {{ $labels.instance }}.', + summary: 'etcd cluster 99th percentile commit durations are too high.', }, }, { @@ -188,7 +213,8 @@ severity: 'warning', }, annotations: { - message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}', + description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}', + summary: 'etcd has high number of failed HTTP requests.', }, }, { @@ -202,7 +228,8 @@ severity: 'critical', }, annotations: { - message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}.', + description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}.', + summary: 'etcd has high number of failed HTTP requests.', }, }, { @@ -216,9 +243,36 @@ severity: 'warning', }, annotations: { - message: 'etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow.', + description: 'etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow.', + summary: 'etcd instance HTTP requests are slow.', }, }, + { + alert: 'etcdBackendQuotaLowSpace', + expr: ||| + (etcd_mvcc_db_total_size_in_bytes/etcd_server_quota_backend_bytes)*100 > 95 + ||| % $._config, + 'for': '10m', + labels: { + severity: 'critical', + }, + annotations: { + message: 'etcd cluster "{{ $labels.job }}": database size exceeds the defined quota on etcd instance {{ $labels.instance }}, please defrag or increase the quota as the writes to etcd will be disabled when it is full.', + }, + }, + { + alert: 'etcdExcessiveDatabaseGrowth', + expr: ||| + increase(((etcd_mvcc_db_total_size_in_bytes/etcd_server_quota_backend_bytes)*100)[240m:1m]) > 50 + ||| % $._config, + 'for': '10m', + labels: { + severity: 'warning', + }, + annotations: { + message: 'etcd cluster "{{ $labels.job }}": Observed surge in etcd writes leading to 50% increase in database size over the past four hours on etcd instance {{ $labels.instance }}, please check as it might be disruptive.', + }, + }, ], }, ], diff --git a/monitoring/vendor/github.com/etcd-io/etcd/Documentation/etcd-mixin/test.yaml b/monitoring/vendor/github.com/etcd-io/etcd/Documentation/etcd-mixin/test.yaml index 3f11ca8..24162bd 100644 --- a/monitoring/vendor/github.com/etcd-io/etcd/Documentation/etcd-mixin/test.yaml +++ b/monitoring/vendor/github.com/etcd-io/etcd/Documentation/etcd-mixin/test.yaml @@ -26,7 +26,8 @@ tests: job: etcd severity: critical exp_annotations: - message: 'etcd cluster "etcd": members are down (3).' + description: 'etcd cluster "etcd": members are down (3).' + summary: 'etcd cluster members are down.' - eval_time: 7m alertname: etcdInsufficientMembers - eval_time: 11m @@ -36,7 +37,8 @@ tests: job: etcd severity: critical exp_annotations: - message: 'etcd cluster "etcd": insufficient members (1).' + description: 'etcd cluster "etcd": insufficient members (1).' + summary: 'etcd cluster has insufficient number of members.' - eval_time: 15m alertname: etcdInsufficientMembers exp_alerts: @@ -44,7 +46,8 @@ tests: job: etcd severity: critical exp_annotations: - message: 'etcd cluster "etcd": insufficient members (0).' + description: 'etcd cluster "etcd": insufficient members (0).' + summary: 'etcd cluster has insufficient number of members.' - interval: 1m input_series: @@ -62,7 +65,8 @@ tests: job: etcd severity: critical exp_annotations: - message: 'etcd cluster "etcd": members are down (3).' + description: 'etcd cluster "etcd": members are down (3).' + summary: 'etcd cluster members are down.' - interval: 1m input_series: @@ -80,7 +84,8 @@ tests: job: etcd severity: critical exp_annotations: - message: 'etcd cluster "etcd": members are down (1).' + description: 'etcd cluster "etcd": members are down (1).' + summary: 'etcd cluster members are down.' - interval: 1m input_series: - series: 'etcd_server_leader_changes_seen_total{job="etcd",instance="10.10.10.0"}' @@ -97,7 +102,8 @@ tests: job: etcd severity: warning exp_annotations: - message: 'etcd cluster "etcd": 4 leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.' + description: 'etcd cluster "etcd": 4 leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.' + summary: 'etcd cluster has high number of leader changes.' - interval: 1m input_series: - series: 'etcd_server_leader_changes_seen_total{job="etcd",instance="10.10.10.0"}' @@ -110,4 +116,20 @@ tests: - eval_time: 10m alertname: etcdHighNumberOfLeaderChanges exp_alerts: - + - interval: 1m + input_series: + - series: '((etcd_mvcc_db_total_size_in_bytes/etcd_server_quota_backend_bytes)*100){job="etcd",instance="10.10.10.0"}' + values: '0 10 20 0 0 10 0 0 30 0 0 0 0 0 0 0' + - series: '((etcd_mvcc_db_total_size_in_bytes/etcd_server_quota_backend_bytes)*100){job="etcd",instance="10.10.10.1"}' + values: '0 0 10 0 20 0 0 0 0 0 0 0 0 0 0 0' + - series: '((etcd_mvcc_db_total_size_in_bytes/etcd_server_quota_backend_bytes)*100){job="etcd",instance="10.10.10.2"}' + values: '0 0 0 0 0 0 0 0' + alert_rule_test: + - eval_time: 10m + alertname: etcdExcessiveDatabaseGrowth + exp_alerts: + - exp_labels: + job: etcd + severity: warning + exp_annotations: + message: 'etcd cluster "etcd": Observed surge in etcd writes leading to 50% increase in database size over the past four hours, please check as it might be disruptive.' diff --git a/monitoring/vendor/github.com/grafana/grafonnet-lib/grafonnet/graph_panel.libsonnet b/monitoring/vendor/github.com/grafana/grafonnet-lib/grafonnet/graph_panel.libsonnet index b2b3405..3a39235 100644 --- a/monitoring/vendor/github.com/grafana/grafonnet-lib/grafonnet/graph_panel.libsonnet +++ b/monitoring/vendor/github.com/grafana/grafonnet-lib/grafonnet/graph_panel.libsonnet @@ -10,6 +10,7 @@ * @param span (optional) Width of the panel * @param datasource (optional) Datasource * @param fill (default `1`) , integer from 0 to 10 + * @param fillGradient (default `0`) , integer from 0 to 10 * @param linewidth (default `1`) Line Width, integer from 0 to 10 * @param decimals (optional) Override automatic decimal precision for legend and tooltip. If null, not added to the json output. * @param decimalsY1 (optional) Override automatic decimal precision for the first Y axis. If null, use decimals parameter. @@ -63,11 +64,13 @@ * @method addYaxis(format,min,max,label,show,logBase,decimals) Adds a Y axis to the graph * @method addAlert(alert) Adds an alert * @method addLink(link) Adds a [panel link](https://grafana.com/docs/grafana/latest/linking/panel-links/) + * @method addLinks(links) Adds an array of links. */ new( title, span=null, fill=1, + fillGradient=0, linewidth=1, decimals=null, decimalsY1=null, @@ -166,6 +169,7 @@ }, lines: lines, fill: fill, + fillGradient: fillGradient, linewidth: linewidth, dashes: dashes, dashLength: 10, @@ -283,5 +287,6 @@ addLink(link):: self { links+: [link], }, + addLinks(links):: std.foldl(function(p, t) p.addLink(t), links, self), }, } diff --git a/monitoring/vendor/github.com/grafana/grafonnet-lib/grafonnet/loki.libsonnet b/monitoring/vendor/github.com/grafana/grafonnet-lib/grafonnet/loki.libsonnet index 2151abb..32d7267 100644 --- a/monitoring/vendor/github.com/grafana/grafonnet-lib/grafonnet/loki.libsonnet +++ b/monitoring/vendor/github.com/grafana/grafonnet-lib/grafonnet/loki.libsonnet @@ -6,12 +6,15 @@ * * @param expr * @param hide (optional) Disable query on graph. + * @param legendFormat (optional) Defines the legend. Defaults to ''. */ target( expr, hide=null, + legendFormat='', ):: { [if hide != null then 'hide']: hide, expr: expr, + legendFormat: legendFormat, }, } diff --git a/monitoring/vendor/github.com/kubernetes-monitoring/kubernetes-mixin/alerts/apps_alerts.libsonnet b/monitoring/vendor/github.com/kubernetes-monitoring/kubernetes-mixin/alerts/apps_alerts.libsonnet index 7c4454f..afbb246 100644 --- a/monitoring/vendor/github.com/kubernetes-monitoring/kubernetes-mixin/alerts/apps_alerts.libsonnet +++ b/monitoring/vendor/github.com/kubernetes-monitoring/kubernetes-mixin/alerts/apps_alerts.libsonnet @@ -258,7 +258,7 @@ severity: 'warning', }, annotations: { - description: 'Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete.', + description: 'Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete. Removing failed job after investigation should clear this alert.', summary: 'Job failed to complete.', }, }, diff --git a/monitoring/vendor/github.com/kubernetes-monitoring/kubernetes-mixin/dashboards/windows.libsonnet b/monitoring/vendor/github.com/kubernetes-monitoring/kubernetes-mixin/dashboards/windows.libsonnet index cb9d8ef..c3c8a89 100644 --- a/monitoring/vendor/github.com/kubernetes-monitoring/kubernetes-mixin/dashboards/windows.libsonnet +++ b/monitoring/vendor/github.com/kubernetes-monitoring/kubernetes-mixin/dashboards/windows.libsonnet @@ -337,11 +337,11 @@ local g = import 'github.com/grafana/jsonnet-libs/grafana-builder/grafana.libson legend_avg=true, ) .addTarget(prometheus.target( - 'sort_desc(sum by (container) (rate(windows_container_network_receive_bytes_total{namespace="$namespace", pod="$pod"}[1m])))' % $._config, + 'sort_desc(sum by (container) (rate(windows_container_network_received_bytes_total{namespace="$namespace", pod="$pod"}[1m])))' % $._config, legendFormat='Received : {{ container }}', )) .addTarget(prometheus.target( - 'sort_desc(sum by (container) (rate(windows_container_network_transmit_bytes_total{namespace="$namespace", pod="$pod"}[1m])))' % $._config, + 'sort_desc(sum by (container) (rate(windows_container_network_transmitted_bytes_total{namespace="$namespace", pod="$pod"}[1m])))' % $._config, legendFormat='Transmitted : {{ container }}', )) ) diff --git a/monitoring/vendor/github.com/kubernetes-monitoring/kubernetes-mixin/rules/windows.libsonnet b/monitoring/vendor/github.com/kubernetes-monitoring/kubernetes-mixin/rules/windows.libsonnet index 72535cc..b3964e0 100644 --- a/monitoring/vendor/github.com/kubernetes-monitoring/kubernetes-mixin/rules/windows.libsonnet +++ b/monitoring/vendor/github.com/kubernetes-monitoring/kubernetes-mixin/rules/windows.libsonnet @@ -202,13 +202,13 @@ ||| % $._config, }, { - record: 'windows_container_network_receive_bytes_total', + record: 'windows_container_network_received_bytes_total', expr: ||| windows_container_network_receive_bytes_total{%(wmiExporterSelector)s} * on(container_id) group_left(container, pod, namespace) max(kube_pod_container_info{%(kubeStateMetricsSelector)s}) by(container, container_id, pod, namespace) ||| % $._config, }, { - record: 'windows_container_network_transmit_bytes_total', + record: 'windows_container_network_transmitted_bytes_total', expr: ||| windows_container_network_transmit_bytes_total{%(wmiExporterSelector)s} * on(container_id) group_left(container, pod, namespace) max(kube_pod_container_info{%(kubeStateMetricsSelector)s}) by(container, container_id, pod, namespace) ||| % $._config, diff --git a/monitoring/vendor/github.com/kubernetes/kube-state-metrics/jsonnet/kube-state-metrics/kube-state-metrics.libsonnet b/monitoring/vendor/github.com/kubernetes/kube-state-metrics/jsonnet/kube-state-metrics/kube-state-metrics.libsonnet index 296657a..60efd5b 100644 --- a/monitoring/vendor/github.com/kubernetes/kube-state-metrics/jsonnet/kube-state-metrics/kube-state-metrics.libsonnet +++ b/monitoring/vendor/github.com/kubernetes/kube-state-metrics/jsonnet/kube-state-metrics/kube-state-metrics.libsonnet @@ -58,7 +58,6 @@ local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet'; 'daemonsets', 'deployments', 'replicasets', - 'ingresses', ]) + rulesType.withVerbs(['list', 'watch']), @@ -135,6 +134,7 @@ local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet'; rulesType.withApiGroups(['networking.k8s.io']) + rulesType.withResources([ 'networkpolicies', + 'ingresses', ]) + rulesType.withVerbs(['list', 'watch']), @@ -228,6 +228,7 @@ local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet'; roleBinding.new() + roleBinding.mixin.metadata.withName(ksm.name) + + roleBinding.mixin.metadata.withNamespace(ksm.namespace) + roleBinding.mixin.metadata.withLabels(ksm.commonLabels) + roleBinding.mixin.roleRef.withApiGroup('rbac.authorization.k8s.io') + roleBinding.mixin.roleRef.withName(ksm.name) + diff --git a/monitoring/vendor/github.com/prometheus-operator/kube-prometheus/jsonnet/kube-prometheus/alerts/alerts.libsonnet b/monitoring/vendor/github.com/prometheus-operator/kube-prometheus/jsonnet/kube-prometheus/alerts/alerts.libsonnet index 3521aa8..adc4613 100644 --- a/monitoring/vendor/github.com/prometheus-operator/kube-prometheus/jsonnet/kube-prometheus/alerts/alerts.libsonnet +++ b/monitoring/vendor/github.com/prometheus-operator/kube-prometheus/jsonnet/kube-prometheus/alerts/alerts.libsonnet @@ -1,4 +1,3 @@ (import 'alertmanager.libsonnet') + (import 'general.libsonnet') + -(import 'node.libsonnet') + -(import 'prometheus-operator.libsonnet') +(import 'node.libsonnet') diff --git a/monitoring/vendor/github.com/prometheus-operator/kube-prometheus/jsonnet/kube-prometheus/alerts/prometheus-operator.libsonnet b/monitoring/vendor/github.com/prometheus-operator/kube-prometheus/jsonnet/kube-prometheus/alerts/prometheus-operator.libsonnet deleted file mode 100644 index 731994a..0000000 --- a/monitoring/vendor/github.com/prometheus-operator/kube-prometheus/jsonnet/kube-prometheus/alerts/prometheus-operator.libsonnet +++ /dev/null @@ -1,63 +0,0 @@ -{ - prometheusAlerts+:: { - groups+: [ - { - name: 'prometheus-operator', - rules: [ - { - alert: 'PrometheusOperatorListErrors', - expr: ||| - (sum by (controller,namespace) (rate(prometheus_operator_list_operations_failed_total{%(prometheusOperatorSelector)s}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_list_operations_total{%(prometheusOperatorSelector)s}[10m]))) > 0.4 - ||| % $._config, - labels: { - severity: 'warning', - }, - annotations: { - message: 'Errors while performing List operations in controller {{$labels.controller}} in {{$labels.namespace}} namespace.', - }, - 'for': '15m', - }, - { - alert: 'PrometheusOperatorWatchErrors', - expr: ||| - (sum by (controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{%(prometheusOperatorSelector)s}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_watch_operations_total{%(prometheusOperatorSelector)s}[10m]))) > 0.4 - ||| % $._config, - labels: { - severity: 'warning', - }, - annotations: { - message: 'Errors while performing Watch operations in controller {{$labels.controller}} in {{$labels.namespace}} namespace.', - }, - 'for': '15m', - }, - { - alert: 'PrometheusOperatorReconcileErrors', - expr: ||| - rate(prometheus_operator_reconcile_errors_total{%(prometheusOperatorSelector)s}[5m]) > 0.1 - ||| % $._config, - labels: { - severity: 'warning', - }, - annotations: { - message: 'Errors while reconciling {{ $labels.controller }} in {{ $labels.namespace }} Namespace.', - }, - 'for': '10m', - }, - { - alert: 'PrometheusOperatorNodeLookupErrors', - expr: ||| - rate(prometheus_operator_node_address_lookup_errors_total{%(prometheusOperatorSelector)s}[5m]) > 0.1 - ||| % $._config, - labels: { - severity: 'warning', - }, - annotations: { - message: 'Errors while reconciling Prometheus in {{ $labels.namespace }} Namespace.', - }, - 'for': '10m', - }, - ], - }, - ], - }, -} diff --git a/monitoring/vendor/github.com/prometheus-operator/kube-prometheus/jsonnet/kube-prometheus/jsonnetfile.json b/monitoring/vendor/github.com/prometheus-operator/kube-prometheus/jsonnet/kube-prometheus/jsonnetfile.json index 30e7f66..4da9452 100644 --- a/monitoring/vendor/github.com/prometheus-operator/kube-prometheus/jsonnet/kube-prometheus/jsonnetfile.json +++ b/monitoring/vendor/github.com/prometheus-operator/kube-prometheus/jsonnet/kube-prometheus/jsonnetfile.json @@ -28,6 +28,15 @@ }, "version": "release-0.42" }, + { + "source": { + "git": { + "remote": "https://github.com/prometheus-operator/prometheus-operator", + "subdir": "jsonnet/mixin" + } + }, + "version": "master" + }, { "source": { "git": { diff --git a/monitoring/vendor/github.com/prometheus-operator/kube-prometheus/jsonnet/kube-prometheus/kube-prometheus.libsonnet b/monitoring/vendor/github.com/prometheus-operator/kube-prometheus/jsonnet/kube-prometheus/kube-prometheus.libsonnet index 71369ac..1bdcf4f 100644 --- a/monitoring/vendor/github.com/prometheus-operator/kube-prometheus/jsonnet/kube-prometheus/kube-prometheus.libsonnet +++ b/monitoring/vendor/github.com/prometheus-operator/kube-prometheus/jsonnet/kube-prometheus/kube-prometheus.libsonnet @@ -1,6 +1,7 @@ local k = import 'github.com/ksonnet/ksonnet-lib/ksonnet.beta.4/k.libsonnet'; local k3 = import 'github.com/ksonnet/ksonnet-lib/ksonnet.beta.3/k.libsonnet'; local configMapList = k3.core.v1.configMapList; +local kubeRbacProxyContainer = import './kube-rbac-proxy/container.libsonnet'; (import 'github.com/brancz/kubernetes-grafana/grafana/grafana.libsonnet') + (import './kube-state-metrics/kube-state-metrics.libsonnet') + @@ -9,6 +10,7 @@ local configMapList = k3.core.v1.configMapList; (import 'github.com/prometheus/node_exporter/docs/node-mixin/mixin.libsonnet') + (import './alertmanager/alertmanager.libsonnet') + (import 'github.com/prometheus-operator/prometheus-operator/jsonnet/prometheus-operator/prometheus-operator.libsonnet') + +(import 'github.com/prometheus-operator/prometheus-operator/jsonnet/mixin/mixin.libsonnet') + (import './prometheus/prometheus.libsonnet') + (import './prometheus-adapter/prometheus-adapter.libsonnet') + (import 'github.com/kubernetes-monitoring/kubernetes-mixin/mixin.libsonnet') + @@ -60,7 +62,7 @@ local configMapList = k3.core.v1.configMapList; ], }, } + - ((import 'kube-prometheus/kube-rbac-proxy/container.libsonnet') { + (kubeRbacProxyContainer { config+:: { kubeRbacProxy: { local cfg = self, diff --git a/monitoring/vendor/github.com/prometheus-operator/kube-prometheus/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet b/monitoring/vendor/github.com/prometheus-operator/kube-prometheus/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet index a313e7b..7fae2be 100644 --- a/monitoring/vendor/github.com/prometheus-operator/kube-prometheus/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet +++ b/monitoring/vendor/github.com/prometheus-operator/kube-prometheus/jsonnet/kube-prometheus/kube-state-metrics/kube-state-metrics.libsonnet @@ -1,3 +1,6 @@ +local kubeRbacProxyContainer = import '../kube-rbac-proxy/container.libsonnet'; +local ksm = import 'github.com/kubernetes/kube-state-metrics/jsonnet/kube-state-metrics/kube-state-metrics.libsonnet'; + { _config+:: { versions+:: { @@ -11,119 +14,119 @@ scrapeTimeout: '30s', }, }, - kubeStateMetrics+:: (import 'github.com/kubernetes/kube-state-metrics/jsonnet/kube-state-metrics/kube-state-metrics.libsonnet') + - { - local ksm = self, - name:: 'kube-state-metrics', - namespace:: $._config.namespace, - version:: $._config.versions.kubeStateMetrics, - image:: $._config.imageRepos.kubeStateMetrics + ':v' + $._config.versions.kubeStateMetrics, - service+: { - spec+: { - ports: [ - { - name: 'https-main', - port: 8443, - targetPort: 'https-main', - }, - { - name: 'https-self', - port: 9443, - targetPort: 'https-self', - }, - ], - }, - }, - deployment+: { - spec+: { - template+: { - spec+: { - containers: std.map(function(c) c { - ports:: null, - livenessProbe:: null, - readinessProbe:: null, - args: ['--host=127.0.0.1', '--port=8081', '--telemetry-host=127.0.0.1', '--telemetry-port=8082'], - }, super.containers), - }, - }, - }, - }, - serviceMonitor: - { - apiVersion: 'monitoring.coreos.com/v1', - kind: 'ServiceMonitor', - metadata: { - name: 'kube-state-metrics', - namespace: $._config.namespace, - labels: { - 'app.kubernetes.io/name': 'kube-state-metrics', - 'app.kubernetes.io/version': ksm.version, - }, - }, - spec: { - jobLabel: 'app.kubernetes.io/name', - selector: { - matchLabels: { - 'app.kubernetes.io/name': 'kube-state-metrics', - }, - }, - endpoints: [ - { - port: 'https-main', - scheme: 'https', - interval: $._config.kubeStateMetrics.scrapeInterval, - scrapeTimeout: $._config.kubeStateMetrics.scrapeTimeout, - honorLabels: true, - bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token', - relabelings: [ - { - regex: '(pod|service|endpoint|namespace)', - action: 'labeldrop', - }, - ], - tlsConfig: { - insecureSkipVerify: true, - }, - }, - { - port: 'https-self', - scheme: 'https', - interval: $._config.kubeStateMetrics.scrapeInterval, - bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token', - tlsConfig: { - insecureSkipVerify: true, - }, - }, - ], - }, - }, - } + - ((import 'kube-prometheus/kube-rbac-proxy/container.libsonnet') { - config+:: { - kubeRbacProxy: { - local cfg = self, - image: $._config.imageRepos.kubeRbacProxy + ':' + $._config.versions.kubeRbacProxy, - name: 'kube-rbac-proxy-main', - securePortName: 'https-main', - securePort: 8443, - secureListenAddress: ':%d' % self.securePort, - upstream: 'http://127.0.0.1:8081/', - tlsCipherSuites: $._config.tlsCipherSuites, - }, - }, - }).deploymentMixin + - ((import 'kube-prometheus/kube-rbac-proxy/container.libsonnet') { - config+:: { - kubeRbacProxy: { - local cfg = self, - image: $._config.imageRepos.kubeRbacProxy + ':' + $._config.versions.kubeRbacProxy, - name: 'kube-rbac-proxy-self', - securePortName: 'https-self', - securePort: 9443, - secureListenAddress: ':%d' % self.securePort, - upstream: 'http://127.0.0.1:8082/', - tlsCipherSuites: $._config.tlsCipherSuites, - }, - }, - }).deploymentMixin, + kubeStateMetrics+:: + ksm + { + local version = self.version, + name:: 'kube-state-metrics', + namespace:: $._config.namespace, + version:: $._config.versions.kubeStateMetrics, + image:: $._config.imageRepos.kubeStateMetrics + ':v' + $._config.versions.kubeStateMetrics, + service+: { + spec+: { + ports: [ + { + name: 'https-main', + port: 8443, + targetPort: 'https-main', + }, + { + name: 'https-self', + port: 9443, + targetPort: 'https-self', + }, + ], + }, + }, + deployment+: { + spec+: { + template+: { + spec+: { + containers: std.map(function(c) c { + ports:: null, + livenessProbe:: null, + readinessProbe:: null, + args: ['--host=127.0.0.1', '--port=8081', '--telemetry-host=127.0.0.1', '--telemetry-port=8082'], + }, super.containers), + }, + }, + }, + }, + serviceMonitor: + { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'ServiceMonitor', + metadata: { + name: 'kube-state-metrics', + namespace: $._config.namespace, + labels: { + 'app.kubernetes.io/name': 'kube-state-metrics', + 'app.kubernetes.io/version': version, + }, + }, + spec: { + jobLabel: 'app.kubernetes.io/name', + selector: { + matchLabels: { + 'app.kubernetes.io/name': 'kube-state-metrics', + }, + }, + endpoints: [ + { + port: 'https-main', + scheme: 'https', + interval: $._config.kubeStateMetrics.scrapeInterval, + scrapeTimeout: $._config.kubeStateMetrics.scrapeTimeout, + honorLabels: true, + bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token', + relabelings: [ + { + regex: '(pod|service|endpoint|namespace)', + action: 'labeldrop', + }, + ], + tlsConfig: { + insecureSkipVerify: true, + }, + }, + { + port: 'https-self', + scheme: 'https', + interval: $._config.kubeStateMetrics.scrapeInterval, + bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token', + tlsConfig: { + insecureSkipVerify: true, + }, + }, + ], + }, + }, + } + + (kubeRbacProxyContainer { + config+:: { + kubeRbacProxy: { + local cfg = self, + image: $._config.imageRepos.kubeRbacProxy + ':' + $._config.versions.kubeRbacProxy, + name: 'kube-rbac-proxy-main', + securePortName: 'https-main', + securePort: 8443, + secureListenAddress: ':%d' % self.securePort, + upstream: 'http://127.0.0.1:8081/', + tlsCipherSuites: $._config.tlsCipherSuites, + }, + }, + }).deploymentMixin + + (kubeRbacProxyContainer { + config+:: { + kubeRbacProxy: { + local cfg = self, + image: $._config.imageRepos.kubeRbacProxy + ':' + $._config.versions.kubeRbacProxy, + name: 'kube-rbac-proxy-self', + securePortName: 'https-self', + securePort: 9443, + secureListenAddress: ':%d' % self.securePort, + upstream: 'http://127.0.0.1:8082/', + tlsCipherSuites: $._config.tlsCipherSuites, + }, + }, + }).deploymentMixin, } diff --git a/monitoring/vendor/github.com/prometheus-operator/kube-prometheus/jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet b/monitoring/vendor/github.com/prometheus-operator/kube-prometheus/jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet index a6fb058..e0326b8 100644 --- a/monitoring/vendor/github.com/prometheus-operator/kube-prometheus/jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet +++ b/monitoring/vendor/github.com/prometheus-operator/kube-prometheus/jsonnet/kube-prometheus/node-exporter/node-exporter.libsonnet @@ -5,7 +5,7 @@ local k = import 'github.com/ksonnet/ksonnet-lib/ksonnet.beta.4/k.libsonnet'; namespace: 'default', versions+:: { - nodeExporter: 'v0.18.1', + nodeExporter: 'v1.0.1', }, imageRepos+:: { @@ -79,11 +79,15 @@ local k = import 'github.com/ksonnet/ksonnet-lib/ksonnet.beta.4/k.libsonnet'; toleration.withOperator('Exists'); local procVolumeName = 'proc'; local procVolume = volume.fromHostPath(procVolumeName, '/proc'); - local procVolumeMount = containerVolumeMount.new(procVolumeName, '/host/proc'); + local procVolumeMount = containerVolumeMount.new(procVolumeName, '/host/proc'). + withMountPropagation('HostToContainer'). + withReadOnly(true); local sysVolumeName = 'sys'; local sysVolume = volume.fromHostPath(sysVolumeName, '/sys'); - local sysVolumeMount = containerVolumeMount.new(sysVolumeName, '/host/sys'); + local sysVolumeMount = containerVolumeMount.new(sysVolumeName, '/host/sys'). + withMountPropagation('HostToContainer'). + withReadOnly(true); local rootVolumeName = 'root'; local rootVolume = volume.fromHostPath(rootVolumeName, '/'); diff --git a/monitoring/vendor/github.com/prometheus-operator/kube-prometheus/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet b/monitoring/vendor/github.com/prometheus-operator/kube-prometheus/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet index a1a52d8..fa2bca0 100644 --- a/monitoring/vendor/github.com/prometheus-operator/kube-prometheus/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet +++ b/monitoring/vendor/github.com/prometheus-operator/kube-prometheus/jsonnet/kube-prometheus/prometheus/prometheus.libsonnet @@ -312,6 +312,7 @@ local k = import 'github.com/ksonnet/ksonnet-lib/ksonnet.beta.4/k.libsonnet'; path: '/metrics/cadvisor', interval: '30s', honorLabels: true, + honorTimestamps: false, tlsConfig: { insecureSkipVerify: true, }, diff --git a/monitoring/vendor/github.com/prometheus-operator/prometheus-operator/jsonnet/mixin/alerts.jsonnet b/monitoring/vendor/github.com/prometheus-operator/prometheus-operator/jsonnet/mixin/alerts.jsonnet new file mode 100644 index 0000000..23f4afe --- /dev/null +++ b/monitoring/vendor/github.com/prometheus-operator/prometheus-operator/jsonnet/mixin/alerts.jsonnet @@ -0,0 +1,3 @@ +( + import 'mixin.libsonnet' +).prometheusAlerts diff --git a/monitoring/vendor/github.com/prometheus-operator/prometheus-operator/jsonnet/mixin/alerts/alerts.libsonnet b/monitoring/vendor/github.com/prometheus-operator/prometheus-operator/jsonnet/mixin/alerts/alerts.libsonnet new file mode 100644 index 0000000..aa53638 --- /dev/null +++ b/monitoring/vendor/github.com/prometheus-operator/prometheus-operator/jsonnet/mixin/alerts/alerts.libsonnet @@ -0,0 +1,95 @@ +{ + prometheusAlerts+:: { + groups+: [ + { + name: 'prometheus-operator', + rules: [ + { + alert: 'PrometheusOperatorListErrors', + expr: ||| + (sum by (controller,namespace) (rate(prometheus_operator_list_operations_failed_total{%(prometheusOperatorSelector)s}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_list_operations_total{%(prometheusOperatorSelector)s}[10m]))) > 0.4 + ||| % $._config, + labels: { + severity: 'warning', + }, + annotations: { + description: 'Errors while performing List operations in controller {{$labels.controller}} in {{$labels.namespace}} namespace.', + summary: 'Errors while performing list operations in controller.', + }, + 'for': '15m', + }, + { + alert: 'PrometheusOperatorWatchErrors', + expr: ||| + (sum by (controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{%(prometheusOperatorSelector)s}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_watch_operations_total{%(prometheusOperatorSelector)s}[10m]))) > 0.4 + ||| % $._config, + labels: { + severity: 'warning', + }, + annotations: { + description: 'Errors while performing watch operations in controller {{$labels.controller}} in {{$labels.namespace}} namespace.', + summary: 'Errors while performing watch operations in controller.', + }, + 'for': '15m', + }, + { + alert: 'PrometheusOperatorSyncFailed', + expr: ||| + min_over_time(prometheus_operator_syncs{status="failed",%(prometheusOperatorSelector)s}[5m]) > 0 + ||| % $._config, + labels: { + severity: 'warning', + }, + annotations: { + description: 'Controller {{ $labels.controller }} in {{ $labels.namespace }} namespace fails to reconcile {{ $value }} objects.', + summary: 'Last controller reconciliation failed', + }, + 'for': '10m', + }, + { + alert: 'PrometheusOperatorReconcileErrors', + expr: ||| + (sum by (controller,namespace) (rate(prometheus_operator_reconcile_errors_total{%(prometheusOperatorSelector)s}[5m]))) / (sum by (controller,namespace) (rate(prometheus_operator_reconcile_operations_total{%(prometheusOperatorSelector)s}[5m]))) > 0.1 + ||| % $._config, + labels: { + severity: 'warning', + }, + annotations: { + description: '{{ $value | humanizePercentage }} of reconciling operations failed for {{ $labels.controller }} controller in {{ $labels.namespace }} namespace.', + summary: 'Errors while reconciling controller.', + }, + 'for': '10m', + }, + { + alert: 'PrometheusOperatorNodeLookupErrors', + expr: ||| + rate(prometheus_operator_node_address_lookup_errors_total{%(prometheusOperatorSelector)s}[5m]) > 0.1 + ||| % $._config, + labels: { + severity: 'warning', + }, + annotations: { + description: 'Errors while reconciling Prometheus in {{ $labels.namespace }} Namespace.', + summary: 'Errors while reconciling Prometheus.', + }, + 'for': '10m', + }, + { + alert: 'PrometheusOperatorNotReady', + expr: ||| + min by(namespace, controller) (max_over_time(prometheus_operator_ready{%(prometheusOperatorSelector)s}[5m]) == 0) + ||| % $._config, + labels: { + severity: 'warning', + }, + annotations: { + description: "Prometheus operator in {{ $labels.namespace }} namespace isn't ready to reconcile {{ $labels.controller }} resources.", + summary: 'Prometheus operator not ready', + }, + 'for': '5m', + }, + ], + }, + ], + }, +} diff --git a/monitoring/vendor/github.com/prometheus-operator/prometheus-operator/jsonnet/mixin/config.libsonnet b/monitoring/vendor/github.com/prometheus-operator/prometheus-operator/jsonnet/mixin/config.libsonnet new file mode 100644 index 0000000..e16fdf8 --- /dev/null +++ b/monitoring/vendor/github.com/prometheus-operator/prometheus-operator/jsonnet/mixin/config.libsonnet @@ -0,0 +1,5 @@ +{ + _config+:: { + prometheusOperatorSelector: 'job="prometheus-operator"', + }, +} diff --git a/monitoring/vendor/github.com/prometheus-operator/prometheus-operator/jsonnet/mixin/mixin.libsonnet b/monitoring/vendor/github.com/prometheus-operator/prometheus-operator/jsonnet/mixin/mixin.libsonnet new file mode 100644 index 0000000..7cb7c3d --- /dev/null +++ b/monitoring/vendor/github.com/prometheus-operator/prometheus-operator/jsonnet/mixin/mixin.libsonnet @@ -0,0 +1,2 @@ +(import 'config.libsonnet') + +(import 'alerts/alerts.libsonnet') diff --git a/monitoring/vendor/github.com/prometheus-operator/prometheus-operator/jsonnet/prometheus-operator/prometheus-operator.libsonnet b/monitoring/vendor/github.com/prometheus-operator/prometheus-operator/jsonnet/prometheus-operator/prometheus-operator.libsonnet index f035c02..fa51d72 100644 --- a/monitoring/vendor/github.com/prometheus-operator/prometheus-operator/jsonnet/prometheus-operator/prometheus-operator.libsonnet +++ b/monitoring/vendor/github.com/prometheus-operator/prometheus-operator/jsonnet/prometheus-operator/prometheus-operator.libsonnet @@ -15,7 +15,7 @@ local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet'; }, versions+:: { - prometheusOperator: 'v0.42.0', + prometheusOperator: 'v0.42.1', prometheusConfigReloader: self.prometheusOperator, configmapReloader: 'v0.4.0', }, diff --git a/monitoring/vendor/github.com/prometheus/node_exporter/docs/node-mixin/alerts/alerts.libsonnet b/monitoring/vendor/github.com/prometheus/node_exporter/docs/node-mixin/alerts/alerts.libsonnet index 5ed86d3..a37dd72 100644 --- a/monitoring/vendor/github.com/prometheus/node_exporter/docs/node-mixin/alerts/alerts.libsonnet +++ b/monitoring/vendor/github.com/prometheus/node_exporter/docs/node-mixin/alerts/alerts.libsonnet @@ -48,7 +48,7 @@ alert: 'NodeFilesystemAlmostOutOfSpace', expr: ||| ( - node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelector)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelector)s} * 100 < 5 + node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelector)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelector)s} * 100 < %(fsSpaceAvailableCriticalThreshold)d and node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s} == 0 ) @@ -58,7 +58,7 @@ severity: 'warning', }, annotations: { - summary: 'Filesystem has less than 5% space left.', + summary: 'Filesystem has less than %(fsSpaceAvailableCriticalThreshold)d%% space left.' % $._config, description: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.', }, }, @@ -66,7 +66,7 @@ alert: 'NodeFilesystemAlmostOutOfSpace', expr: ||| ( - node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelector)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelector)s} * 100 < 3 + node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelector)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelector)s} * 100 < %(fsSpaceAvailableWarningThreshold)d and node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s} == 0 ) @@ -76,7 +76,7 @@ severity: '%(nodeCriticalSeverity)s' % $._config, }, annotations: { - summary: 'Filesystem has less than 3% space left.', + summary: 'Filesystem has less than %(fsSpaceAvailableWarningThreshold)d%% space left.' % $._config, description: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.', }, }, @@ -238,6 +238,8 @@ alert: 'NodeClockNotSynchronising', expr: ||| min_over_time(node_timex_sync_status[5m]) == 0 + and + node_timex_maxerror_seconds >= 16 ||| % $._config, 'for': '10m', labels: { diff --git a/monitoring/vendor/github.com/prometheus/node_exporter/docs/node-mixin/config.libsonnet b/monitoring/vendor/github.com/prometheus/node_exporter/docs/node-mixin/config.libsonnet index c06252c..47e741e 100644 --- a/monitoring/vendor/github.com/prometheus/node_exporter/docs/node-mixin/config.libsonnet +++ b/monitoring/vendor/github.com/prometheus/node_exporter/docs/node-mixin/config.libsonnet @@ -47,6 +47,11 @@ fsSpaceFillingUpWarningThreshold: 40, fsSpaceFillingUpCriticalThreshold: 20, + // Available disk space (%) thresholds on which to trigger the + // 'NodeFilesystemAlmostOutOfSpace' alerts. + fsSpaceAvailableCriticalThreshold: 5, + fsSpaceAvailableWarningThreshold: 3, + grafana_prefix: '', }, } diff --git a/monitoring/vendor/github.com/prometheus/node_exporter/docs/node-mixin/dashboards/node.libsonnet b/monitoring/vendor/github.com/prometheus/node_exporter/docs/node-mixin/dashboards/node.libsonnet index 78241ed..07dd188 100644 --- a/monitoring/vendor/github.com/prometheus/node_exporter/docs/node-mixin/dashboards/node.libsonnet +++ b/monitoring/vendor/github.com/prometheus/node_exporter/docs/node-mixin/dashboards/node.libsonnet @@ -75,14 +75,15 @@ local gauge = promgrafonnet.gauge; // TODO: It would be nicer to have a gauge that gets a 0-1 range and displays it as a percentage 0%-100%. // This needs to be added upstream in the promgrafonnet library and then changed here. + // NOTE: avg() is used to circumvent a label change caused by a node_exporter rollout. local memoryGauge = gauge.new( 'Memory Usage', ||| 100 - ( - node_memory_MemAvailable_bytes{%(nodeExporterSelector)s, instance="$instance"} + avg(node_memory_MemAvailable_bytes{%(nodeExporterSelector)s, instance="$instance"}) / - node_memory_MemTotal_bytes{%(nodeExporterSelector)s, instance="$instance"} + avg(node_memory_MemTotal_bytes{%(nodeExporterSelector)s, instance="$instance"}) * 100 ) ||| % $._config, diff --git a/monitoring/vendor/mixin b/monitoring/vendor/mixin new file mode 120000 index 0000000..b78ed5a --- /dev/null +++ b/monitoring/vendor/mixin @@ -0,0 +1 @@ +github.com/prometheus-operator/prometheus-operator/jsonnet/mixin \ No newline at end of file