update monitoring
continuous-integration/drone/push Build is passing Details

This commit is contained in:
Tobias Brunner 2020-10-15 21:02:11 +02:00
parent 14d548bc23
commit dd18c911bc
43 changed files with 619 additions and 413 deletions

View File

@ -18,8 +18,8 @@
"subdir": "Documentation/etcd-mixin" "subdir": "Documentation/etcd-mixin"
} }
}, },
"version": "528b01c327ee4abfd4afea29de9066c7f4b247fa", "version": "e42127658c910d91e7902be958f12d41ac33d54f",
"sum": "NhOkJWkO7ZO2DSE8Fvipcs7Hh2/GOCS0WjPPZU8OiaQ=" "sum": "L+PGlPK9mykGCJ9TIoEWdhMBjz+9lKuQ4YZ8fOeP9sk="
}, },
{ {
"source": { "source": {
@ -28,8 +28,8 @@
"subdir": "grafonnet" "subdir": "grafonnet"
} }
}, },
"version": "cc1626a1b4dee45c99b78ddd9714dfd5f5d7816e", "version": "8d382c732dbdc839ff07549a3f42d25828f1b268",
"sum": "nkgrtMYPCq/YB4r3mKyToepaLhicwWnxDdGIodPpzz0=" "sum": "DRSRw4luAXlBXblo19/T1Jrv+9hyV8ivlS0KEtNANec="
}, },
{ {
"source": { "source": {
@ -38,7 +38,7 @@
"subdir": "grafana-builder" "subdir": "grafana-builder"
} }
}, },
"version": "2cc8d1dcb943eb3ff1dcb85bc9a3933afb36b730", "version": "b5e45051995755ea373ea67642f8e5f54fcb8dd7",
"sum": "mD0zEP9FVFXeag7EaeS5OvUr2A9D6DQhGemoNn6+PLc=" "sum": "mD0zEP9FVFXeag7EaeS5OvUr2A9D6DQhGemoNn6+PLc="
}, },
{ {
@ -59,8 +59,8 @@
"subdir": "" "subdir": ""
} }
}, },
"version": "0bbe890539df0c1477000322c73977af71ef71e9", "version": "aa2adbcf39884fd9c85d7c3e0ff338b1d61ea1ba",
"sum": "h48bpWnNFX9iN9Uqc9y0NTlKQu8sA1izvNyAHzsMIX8=" "sum": "ttkPUnv/5bqlOFcZ8fvp2wi/S7ZLKiqAZ4ZdTolX77M="
}, },
{ {
"source": { "source": {
@ -69,7 +69,7 @@
"subdir": "lib/promgrafonnet" "subdir": "lib/promgrafonnet"
} }
}, },
"version": "0bbe890539df0c1477000322c73977af71ef71e9", "version": "aa2adbcf39884fd9c85d7c3e0ff338b1d61ea1ba",
"sum": "zv7hXGui6BfHzE9wPatHI/AGZa4A2WKo6pq7ZdqBsps=" "sum": "zv7hXGui6BfHzE9wPatHI/AGZa4A2WKo6pq7ZdqBsps="
}, },
{ {
@ -79,8 +79,8 @@
"subdir": "jsonnet/kube-state-metrics" "subdir": "jsonnet/kube-state-metrics"
} }
}, },
"version": "daf555f1e11ad6aa37852653e63baede5f99367e", "version": "35ef70bb74520a78cc8dc7cf364e1ff4e0c45063",
"sum": "cJjGZaLBjcIGrLHZLjRPU9c3KL+ep9rZTb9dbALSKqA=" "sum": "ySP+bI2ZMLPt/sguSh9WrwI5H5dasaNFRE8Uo9PcZrI="
}, },
{ {
"source": { "source": {
@ -89,7 +89,7 @@
"subdir": "jsonnet/kube-state-metrics-mixin" "subdir": "jsonnet/kube-state-metrics-mixin"
} }
}, },
"version": "daf555f1e11ad6aa37852653e63baede5f99367e", "version": "35ef70bb74520a78cc8dc7cf364e1ff4e0c45063",
"sum": "Yf8mNAHrV1YWzrdV8Ry5dJ8YblepTGw3C0Zp10XIYLo=" "sum": "Yf8mNAHrV1YWzrdV8Ry5dJ8YblepTGw3C0Zp10XIYLo="
}, },
{ {
@ -99,8 +99,18 @@
"subdir": "jsonnet/kube-prometheus" "subdir": "jsonnet/kube-prometheus"
} }
}, },
"version": "5fe45c57b60f17568001fd04a7dc2bb754fdf152", "version": "980e95de011319b88a3b9c0787a81dcdf338a898",
"sum": "6Qrn74pNRqJNKYdsmcBu8ergYbMEH48qG1VDVm9FKak=" "sum": "BxOXyWCSc9KkgWJXDau2Xtsy3aOYZDHz2VqOSLga7VU="
},
{
"source": {
"git": {
"remote": "https://github.com/prometheus-operator/prometheus-operator",
"subdir": "jsonnet/mixin"
}
},
"version": "55baf034c431ed2c78d950b187f7d8b34dd06860",
"sum": "+Q45oBC7O8g7KQOaiKhGglwndAMWRlLTR94KUI8Q1Ko="
}, },
{ {
"source": { "source": {
@ -109,8 +119,8 @@
"subdir": "jsonnet/prometheus-operator" "subdir": "jsonnet/prometheus-operator"
} }
}, },
"version": "96094ad1ab039950537df448b95bbcc04c57bfc4", "version": "cd331ce9bb58bb926e391c6ae807621cb12cc29e",
"sum": "ReamRYoS2C39Of7KtXGqkSWdfHw5Fy/Ix6ujOmBLFAg=" "sum": "nM1eDP5vftqAeQSmVYzSBAh+lG0SN6zu46QiocQiVhk="
}, },
{ {
"source": { "source": {
@ -119,8 +129,8 @@
"subdir": "docs/node-mixin" "subdir": "docs/node-mixin"
} }
}, },
"version": "d8a1585f59ef1169837d08979ecc92dcea8aa58a", "version": "f81747e608ea85ae44e76454eb63f9cb6484fb9e",
"sum": "EE+C+Krf518EGLjA/x3ZvKfenCI0J7YuwFJVBscypRw=" "sum": "VyMzZPxQIjiKQYGjZjXeKNWfLJ9vOl3emp84PWfsrUc="
}, },
{ {
"source": { "source": {

View File

@ -1,17 +0,0 @@
apiVersion: bitnami.com/v1alpha1
kind: SealedSecret
metadata:
creationTimestamp: null
name: alertmanager-tbrnt-config
namespace: monitoring
spec:
encryptedData:
alertmanager.yaml: AgATnPnwlvlfGVJeWhjcFHyp/am3nmguqi9PsWgEhxtVLuyDA6OB1G+BdJZ7dGdCViZGeDJD5mHxESSLDMTfxg5DxMDIG18XCzojMtRFTPJMZPLjbETNuSZqtrkbscp/qQom4z+igVuLlkaihdYRcCNV+B0vm+1h6BUPV8Utv1RN1dy9XUvdrvhPRNFvqhCpVcpcLwNP5cli5SNYgVc/ty6a45Fl5h+KLv7rFBJexLhUXoR0jamQpQWoH7oNHcS4ONHxLDMKXqE9jFpKzlQJBNgiRQEEotwCYTodoALmkcIs37Ai+trQxEMZZYtD5vFzbehfTtNLT1bPhLiX91rv6Q9n9wuIw951Qk11L6cF93zDl2mZ9dAQHSAglVHEriXKXBZ3Df4DSyh5qkr+/7lFBdFTQVMS5+YTgM1eCmG1yfsvU33IWKh5wrNhpkUqGLiq9f+4k3xPQVysVY3jJjVhINM/A9OsTPfFzm7aAAklBxuROXiZgZ/6L4Oc/c0Tv3EN/02rhinSGr1hIMmcaSxdOVQxXPU+pbx4JcSmyQFXIY37n/2ya/UbJW/o901MtigCukUvgMedkxPSGhedvHOygKKXPKNSl5U1Emhza7c6vP9cSiiaHpRm7EyUQvjWpJRUP7tSRgDlZyBM9Ud0PRBRdYWLG5YlZB4STOX6cDyYOcFJvnAyiZpDuwOPKMOrhWQSbCgcMcuS/RCgCnYJ4YBfm1cSxcqxsA65PRhXbRmiY9b/Mqs7s1xpJo3RySO27JiffbY+vYRIrFv4G4ak0ug9AQJvrvEA/ZgSs9xpASXSsr42pB63exhlZP+D9JEDGFLgzGQVnVFRFDrlYLFQieqWDgBc0pkcxHHwGBTp3H6PP5RRPLKzNoypTbNrdLlaNAsAlb8VuPopPBHqLcpU+DPuxxBCQ/P8ezYXE8RmpH7x4A2rPLbV902zYVwfnWMrUdPZif7oPnn/xM+VDQMLIlKA/CQySudzAYf621N450V2zE0akOFQNATGEUZT+8HtjNKMcAxjojP/pJpo486t02KXHDw+i04R1kdGPVkLnZDz+UShh4eLwovBL3zopxHxSFnonI1Ez+IetemM+aCJhadU4YNC4zY7x+blNa51ZAGEGoXuSJB3fszLd7wSz4owIhQnb+StxVNcwNgircvFjhauLVLwEynO4WTm+YKzosf2GxJAaNriveyLj1L+DwBUOeWnvmL4QsHX3nriZgVR89KlIQI1d3+lcf2jw8VqfHj3tbpMxb98qZFWw2pczrDeE2t4UQTPTj+4VK4htKGhnIHlNmdeSN39GJCWfilnzAsznrkxZsr1wIqrP+ayRO+NxljBzUN7xspByJeJrBpzW1pukvNg74MK1K7g0/fh/zmqyduQYmJnCDDqfN0PB1YoXFPZn3o4kzGnLXetgJHyJG4tsinediVXrZJb+6KOL31hEbZVArP/gWYHQv1MltEZj2yunuLeEy+Oo35oS/IkW17qF3gkF0sLavZFhJe6XqVvO2BFF1V8S15cBApXXPvFJQWyFLOTVqhYicYfJqQEgz7sKH1uYK0zLeyBkRIfUXk3vs7X9X/CzQo9J9oH5FCgNEI6GdROhitGUihCxwAVa6lsteVxZZ4USkMGZtJPG3Pi5RQuWpSMFX9nWm/LHv8wpgFItx12ZkuaIDB30wYfp4SqcBr1ZvuptKtwJpTUjVx5HSUKnEye2g==
template:
metadata:
creationTimestamp: null
name: alertmanager-tbrnt-config
namespace: monitoring
type: Opaque
status: {}

View File

@ -144,6 +144,7 @@ items:
"decimals": 3, "decimals": 3,
"description": "How much error budget is left looking at our 0.990% availability gurantees?", "description": "How much error budget is left looking at our 0.990% availability gurantees?",
"fill": 10, "fill": 10,
"fillGradient": 0,
"gridPos": { "gridPos": {
}, },
@ -338,6 +339,7 @@ items:
"datasource": "$datasource", "datasource": "$datasource",
"description": "How many read requests (LIST,GET) per second do the apiservers get by code?", "description": "How many read requests (LIST,GET) per second do the apiservers get by code?",
"fill": 10, "fill": 10,
"fillGradient": 0,
"gridPos": { "gridPos": {
}, },
@ -446,6 +448,7 @@ items:
"datasource": "$datasource", "datasource": "$datasource",
"description": "How many percent of read requests (LIST,GET) per second are returned with errors (5xx)?", "description": "How many percent of read requests (LIST,GET) per second are returned with errors (5xx)?",
"fill": 1, "fill": 1,
"fillGradient": 0,
"gridPos": { "gridPos": {
}, },
@ -539,6 +542,7 @@ items:
"datasource": "$datasource", "datasource": "$datasource",
"description": "How many seconds is the 99th percentile for reading (LIST|GET) a given resource?", "description": "How many seconds is the 99th percentile for reading (LIST|GET) a given resource?",
"fill": 1, "fill": 1,
"fillGradient": 0,
"gridPos": { "gridPos": {
}, },
@ -731,6 +735,7 @@ items:
"datasource": "$datasource", "datasource": "$datasource",
"description": "How many write requests (POST|PUT|PATCH|DELETE) per second do the apiservers get by code?", "description": "How many write requests (POST|PUT|PATCH|DELETE) per second do the apiservers get by code?",
"fill": 10, "fill": 10,
"fillGradient": 0,
"gridPos": { "gridPos": {
}, },
@ -839,6 +844,7 @@ items:
"datasource": "$datasource", "datasource": "$datasource",
"description": "How many percent of write requests (POST|PUT|PATCH|DELETE) per second are returned with errors (5xx)?", "description": "How many percent of write requests (POST|PUT|PATCH|DELETE) per second are returned with errors (5xx)?",
"fill": 1, "fill": 1,
"fillGradient": 0,
"gridPos": { "gridPos": {
}, },
@ -932,6 +938,7 @@ items:
"datasource": "$datasource", "datasource": "$datasource",
"description": "How many seconds is the 99th percentile for writing (POST|PUT|PATCH|DELETE) a given resource?", "description": "How many seconds is the 99th percentile for writing (POST|PUT|PATCH|DELETE) a given resource?",
"fill": 1, "fill": 1,
"fillGradient": 0,
"gridPos": { "gridPos": {
}, },
@ -1037,6 +1044,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 1, "fill": 1,
"fillGradient": 0,
"gridPos": { "gridPos": {
}, },
@ -1129,6 +1137,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 1, "fill": 1,
"fillGradient": 0,
"gridPos": { "gridPos": {
}, },
@ -1221,6 +1230,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 1, "fill": 1,
"fillGradient": 0,
"gridPos": { "gridPos": {
}, },
@ -1326,6 +1336,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 1, "fill": 1,
"fillGradient": 0,
"gridPos": { "gridPos": {
}, },
@ -1418,6 +1429,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 1, "fill": 1,
"fillGradient": 0,
"gridPos": { "gridPos": {
}, },
@ -1510,6 +1522,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 1, "fill": 1,
"fillGradient": 0,
"gridPos": { "gridPos": {
}, },
@ -1780,6 +1793,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 2, "fill": 2,
"fillGradient": 0,
"gridPos": { "gridPos": {
"h": 9, "h": 9,
"w": 12, "w": 12,
@ -1882,6 +1896,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 2, "fill": 2,
"fillGradient": 0,
"gridPos": { "gridPos": {
"h": 9, "h": 9,
"w": 12, "w": 12,
@ -2325,6 +2340,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 2, "fill": 2,
"fillGradient": 0,
"gridPos": { "gridPos": {
"h": 9, "h": 9,
"w": 12, "w": 12,
@ -2427,6 +2443,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 2, "fill": 2,
"fillGradient": 0,
"gridPos": { "gridPos": {
"h": 9, "h": 9,
"w": 12, "w": 12,
@ -2559,6 +2576,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 2, "fill": 2,
"fillGradient": 0,
"gridPos": { "gridPos": {
"h": 9, "h": 9,
"w": 24, "w": 24,
@ -2659,6 +2677,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 2, "fill": 2,
"fillGradient": 0,
"gridPos": { "gridPos": {
"h": 9, "h": 9,
"w": 24, "w": 24,
@ -2770,6 +2789,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 2, "fill": 2,
"fillGradient": 0,
"gridPos": { "gridPos": {
"h": 9, "h": 9,
"w": 24, "w": 24,
@ -2870,6 +2890,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 2, "fill": 2,
"fillGradient": 0,
"gridPos": { "gridPos": {
"h": 9, "h": 9,
"w": 24, "w": 24,
@ -2990,6 +3011,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 2, "fill": 2,
"fillGradient": 0,
"gridPos": { "gridPos": {
"h": 9, "h": 9,
"w": 24, "w": 24,
@ -3090,6 +3112,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 2, "fill": 2,
"fillGradient": 0,
"gridPos": { "gridPos": {
"h": 9, "h": 9,
"w": 24, "w": 24,
@ -3190,6 +3213,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 2, "fill": 2,
"fillGradient": 0,
"gridPos": { "gridPos": {
"h": 9, "h": 9,
"w": 24, "w": 24,
@ -3294,6 +3318,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 2, "fill": 2,
"fillGradient": 0,
"gridPos": { "gridPos": {
"h": 9, "h": 9,
"w": 24, "w": 24,
@ -3668,6 +3693,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 1, "fill": 1,
"fillGradient": 0,
"gridPos": { "gridPos": {
}, },
@ -3773,6 +3799,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 1, "fill": 1,
"fillGradient": 0,
"gridPos": { "gridPos": {
}, },
@ -3878,6 +3905,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 1, "fill": 1,
"fillGradient": 0,
"gridPos": { "gridPos": {
}, },
@ -3983,6 +4011,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 1, "fill": 1,
"fillGradient": 0,
"gridPos": { "gridPos": {
}, },
@ -4096,6 +4125,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 1, "fill": 1,
"fillGradient": 0,
"gridPos": { "gridPos": {
}, },
@ -4201,6 +4231,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 1, "fill": 1,
"fillGradient": 0,
"gridPos": { "gridPos": {
}, },
@ -4306,6 +4337,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 1, "fill": 1,
"fillGradient": 0,
"gridPos": { "gridPos": {
}, },
@ -4398,6 +4430,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 1, "fill": 1,
"fillGradient": 0,
"gridPos": { "gridPos": {
}, },
@ -4490,6 +4523,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 1, "fill": 1,
"fillGradient": 0,
"gridPos": { "gridPos": {
}, },
@ -16978,6 +17012,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 1, "fill": 1,
"fillGradient": 0,
"gridPos": { "gridPos": {
}, },
@ -17070,6 +17105,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 1, "fill": 1,
"fillGradient": 0,
"gridPos": { "gridPos": {
}, },
@ -17175,6 +17211,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 1, "fill": 1,
"fillGradient": 0,
"gridPos": { "gridPos": {
}, },
@ -17280,6 +17317,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 1, "fill": 1,
"fillGradient": 0,
"gridPos": { "gridPos": {
}, },
@ -17379,6 +17417,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 1, "fill": 1,
"fillGradient": 0,
"gridPos": { "gridPos": {
}, },
@ -17491,6 +17530,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 1, "fill": 1,
"fillGradient": 0,
"gridPos": { "gridPos": {
}, },
@ -17585,6 +17625,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 1, "fill": 1,
"fillGradient": 0,
"gridPos": { "gridPos": {
}, },
@ -17692,6 +17733,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 1, "fill": 1,
"fillGradient": 0,
"gridPos": { "gridPos": {
}, },
@ -17799,6 +17841,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 1, "fill": 1,
"fillGradient": 0,
"gridPos": { "gridPos": {
}, },
@ -17891,6 +17934,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 1, "fill": 1,
"fillGradient": 0,
"gridPos": { "gridPos": {
}, },
@ -17997,6 +18041,7 @@ items:
"datasource": "$datasource", "datasource": "$datasource",
"description": "Pod lifecycle event generator", "description": "Pod lifecycle event generator",
"fill": 1, "fill": 1,
"fillGradient": 0,
"gridPos": { "gridPos": {
}, },
@ -18089,6 +18134,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 1, "fill": 1,
"fillGradient": 0,
"gridPos": { "gridPos": {
}, },
@ -18194,6 +18240,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 1, "fill": 1,
"fillGradient": 0,
"gridPos": { "gridPos": {
}, },
@ -18299,6 +18346,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 1, "fill": 1,
"fillGradient": 0,
"gridPos": { "gridPos": {
}, },
@ -18425,6 +18473,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 1, "fill": 1,
"fillGradient": 0,
"gridPos": { "gridPos": {
}, },
@ -18530,6 +18579,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 1, "fill": 1,
"fillGradient": 0,
"gridPos": { "gridPos": {
}, },
@ -18622,6 +18672,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 1, "fill": 1,
"fillGradient": 0,
"gridPos": { "gridPos": {
}, },
@ -18714,6 +18765,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 1, "fill": 1,
"fillGradient": 0,
"gridPos": { "gridPos": {
}, },
@ -19527,6 +19579,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 2, "fill": 2,
"fillGradient": 0,
"gridPos": { "gridPos": {
"h": 9, "h": 9,
"w": 12, "w": 12,
@ -19627,6 +19680,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 2, "fill": 2,
"fillGradient": 0,
"gridPos": { "gridPos": {
"h": 9, "h": 9,
"w": 12, "w": 12,
@ -19738,6 +19792,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 2, "fill": 2,
"fillGradient": 0,
"gridPos": { "gridPos": {
"h": 10, "h": 10,
"w": 12, "w": 12,
@ -19838,6 +19893,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 2, "fill": 2,
"fillGradient": 0,
"gridPos": { "gridPos": {
"h": 10, "h": 10,
"w": 12, "w": 12,
@ -19958,6 +20014,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 2, "fill": 2,
"fillGradient": 0,
"gridPos": { "gridPos": {
"h": 10, "h": 10,
"w": 12, "w": 12,
@ -20058,6 +20115,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 2, "fill": 2,
"fillGradient": 0,
"gridPos": { "gridPos": {
"h": 10, "h": 10,
"w": 12, "w": 12,
@ -20400,6 +20458,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 2, "fill": 2,
"fillGradient": 0,
"gridPos": { "gridPos": {
"h": 9, "h": 9,
"w": 12, "w": 12,
@ -20502,6 +20561,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 2, "fill": 2,
"fillGradient": 0,
"gridPos": { "gridPos": {
"h": 9, "h": 9,
"w": 12, "w": 12,
@ -20945,6 +21005,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 2, "fill": 2,
"fillGradient": 0,
"gridPos": { "gridPos": {
"h": 9, "h": 9,
"w": 12, "w": 12,
@ -21047,6 +21108,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 2, "fill": 2,
"fillGradient": 0,
"gridPos": { "gridPos": {
"h": 9, "h": 9,
"w": 12, "w": 12,
@ -21179,6 +21241,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 2, "fill": 2,
"fillGradient": 0,
"gridPos": { "gridPos": {
"h": 9, "h": 9,
"w": 12, "w": 12,
@ -21279,6 +21342,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 2, "fill": 2,
"fillGradient": 0,
"gridPos": { "gridPos": {
"h": 9, "h": 9,
"w": 12, "w": 12,
@ -21390,6 +21454,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 2, "fill": 2,
"fillGradient": 0,
"gridPos": { "gridPos": {
"h": 9, "h": 9,
"w": 12, "w": 12,
@ -21490,6 +21555,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 2, "fill": 2,
"fillGradient": 0,
"gridPos": { "gridPos": {
"h": 9, "h": 9,
"w": 12, "w": 12,
@ -21610,6 +21676,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 2, "fill": 2,
"fillGradient": 0,
"gridPos": { "gridPos": {
"h": 9, "h": 9,
"w": 12, "w": 12,
@ -21710,6 +21777,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 2, "fill": 2,
"fillGradient": 0,
"gridPos": { "gridPos": {
"h": 9, "h": 9,
"w": 12, "w": 12,
@ -23983,6 +24051,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 1, "fill": 1,
"fillGradient": 0,
"gridPos": { "gridPos": {
}, },
@ -24076,6 +24145,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 0, "fill": 0,
"fillGradient": 0,
"gridPos": { "gridPos": {
}, },
@ -24202,6 +24272,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 1, "fill": 1,
"fillGradient": 0,
"gridPos": { "gridPos": {
}, },
@ -24367,7 +24438,7 @@ items:
"tableColumn": "", "tableColumn": "",
"targets": [ "targets": [
{ {
"expr": "100 -\n(\n node_memory_MemAvailable_bytes{job=\"node-exporter\", instance=\"$instance\"}\n/\n node_memory_MemTotal_bytes{job=\"node-exporter\", instance=\"$instance\"}\n* 100\n)\n", "expr": "100 -\n(\n avg(node_memory_MemAvailable_bytes{job=\"node-exporter\", instance=\"$instance\"})\n/\n avg(node_memory_MemTotal_bytes{job=\"node-exporter\", instance=\"$instance\"})\n* 100\n)\n",
"format": "time_series", "format": "time_series",
"intervalFactor": 2, "intervalFactor": 2,
"legendFormat": "", "legendFormat": "",
@ -24412,6 +24483,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 0, "fill": 0,
"fillGradient": 0,
"gridPos": { "gridPos": {
}, },
@ -24528,6 +24600,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 1, "fill": 1,
"fillGradient": 0,
"gridPos": { "gridPos": {
}, },
@ -24647,6 +24720,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 0, "fill": 0,
"fillGradient": 0,
"gridPos": { "gridPos": {
}, },
@ -24740,6 +24814,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 0, "fill": 0,
"fillGradient": 0,
"gridPos": { "gridPos": {
}, },
@ -24961,6 +25036,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 1, "fill": 1,
"fillGradient": 0,
"gridPos": { "gridPos": {
}, },
@ -25157,6 +25233,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 1, "fill": 1,
"fillGradient": 0,
"gridPos": { "gridPos": {
}, },
@ -25819,6 +25896,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 2, "fill": 2,
"fillGradient": 0,
"gridPos": { "gridPos": {
"h": 9, "h": 9,
"w": 12, "w": 12,
@ -25919,6 +25997,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 2, "fill": 2,
"fillGradient": 0,
"gridPos": { "gridPos": {
"h": 9, "h": 9,
"w": 12, "w": 12,
@ -26030,6 +26109,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 2, "fill": 2,
"fillGradient": 0,
"gridPos": { "gridPos": {
"h": 10, "h": 10,
"w": 12, "w": 12,
@ -26130,6 +26210,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 2, "fill": 2,
"fillGradient": 0,
"gridPos": { "gridPos": {
"h": 10, "h": 10,
"w": 12, "w": 12,
@ -26250,6 +26331,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 2, "fill": 2,
"fillGradient": 0,
"gridPos": { "gridPos": {
"h": 10, "h": 10,
"w": 12, "w": 12,
@ -26350,6 +26432,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 2, "fill": 2,
"fillGradient": 0,
"gridPos": { "gridPos": {
"h": 10, "h": 10,
"w": 12, "w": 12,
@ -26700,6 +26783,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 1, "fill": 1,
"fillGradient": 0,
"gridPos": { "gridPos": {
}, },
@ -26792,6 +26876,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 1, "fill": 1,
"fillGradient": 0,
"gridPos": { "gridPos": {
}, },
@ -26897,6 +26982,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 1, "fill": 1,
"fillGradient": 0,
"gridPos": { "gridPos": {
}, },
@ -27002,6 +27088,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 1, "fill": 1,
"fillGradient": 0,
"gridPos": { "gridPos": {
}, },
@ -27095,6 +27182,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 1, "fill": 1,
"fillGradient": 0,
"gridPos": { "gridPos": {
}, },
@ -27187,6 +27275,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 1, "fill": 1,
"fillGradient": 0,
"gridPos": { "gridPos": {
}, },
@ -27279,6 +27368,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 1, "fill": 1,
"fillGradient": 0,
"gridPos": { "gridPos": {
}, },
@ -27384,6 +27474,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 1, "fill": 1,
"fillGradient": 0,
"gridPos": { "gridPos": {
}, },
@ -27476,6 +27567,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 1, "fill": 1,
"fillGradient": 0,
"gridPos": { "gridPos": {
}, },
@ -27581,6 +27673,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 1, "fill": 1,
"fillGradient": 0,
"gridPos": { "gridPos": {
}, },
@ -27673,6 +27766,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 1, "fill": 1,
"fillGradient": 0,
"gridPos": { "gridPos": {
}, },
@ -27778,6 +27872,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 1, "fill": 1,
"fillGradient": 0,
"gridPos": { "gridPos": {
}, },
@ -27870,6 +27965,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 1, "fill": 1,
"fillGradient": 0,
"gridPos": { "gridPos": {
}, },
@ -27962,6 +28058,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 1, "fill": 1,
"fillGradient": 0,
"gridPos": { "gridPos": {
}, },
@ -28054,6 +28151,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 1, "fill": 1,
"fillGradient": 0,
"gridPos": { "gridPos": {
}, },
@ -29634,6 +29732,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 1, "fill": 1,
"fillGradient": 0,
"gridPos": { "gridPos": {
}, },
@ -29726,6 +29825,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 1, "fill": 1,
"fillGradient": 0,
"gridPos": { "gridPos": {
}, },
@ -29831,6 +29931,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 1, "fill": 1,
"fillGradient": 0,
"gridPos": { "gridPos": {
}, },
@ -29923,6 +30024,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 1, "fill": 1,
"fillGradient": 0,
"gridPos": { "gridPos": {
}, },
@ -30028,6 +30130,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 1, "fill": 1,
"fillGradient": 0,
"gridPos": { "gridPos": {
}, },
@ -30141,6 +30244,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 1, "fill": 1,
"fillGradient": 0,
"gridPos": { "gridPos": {
}, },
@ -30246,6 +30350,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 1, "fill": 1,
"fillGradient": 0,
"gridPos": { "gridPos": {
}, },
@ -30351,6 +30456,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 1, "fill": 1,
"fillGradient": 0,
"gridPos": { "gridPos": {
}, },
@ -30443,6 +30549,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 1, "fill": 1,
"fillGradient": 0,
"gridPos": { "gridPos": {
}, },
@ -30535,6 +30642,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 1, "fill": 1,
"fillGradient": 0,
"gridPos": { "gridPos": {
}, },
@ -30839,6 +30947,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 1, "fill": 1,
"fillGradient": 0,
"gridPos": { "gridPos": {
}, },
@ -30952,6 +31061,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 1, "fill": 1,
"fillGradient": 0,
"gridPos": { "gridPos": {
}, },
@ -31078,6 +31188,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 1, "fill": 1,
"fillGradient": 0,
"gridPos": { "gridPos": {
}, },
@ -31191,6 +31302,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 1, "fill": 1,
"fillGradient": 0,
"gridPos": { "gridPos": {
}, },
@ -31296,6 +31408,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 1, "fill": 1,
"fillGradient": 0,
"gridPos": { "gridPos": {
}, },
@ -31401,6 +31514,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 1, "fill": 1,
"fillGradient": 0,
"gridPos": { "gridPos": {
}, },
@ -31493,6 +31607,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 1, "fill": 1,
"fillGradient": 0,
"gridPos": { "gridPos": {
}, },
@ -31585,6 +31700,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 1, "fill": 1,
"fillGradient": 0,
"gridPos": { "gridPos": {
}, },
@ -32417,6 +32533,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 1, "fill": 1,
"fillGradient": 0,
"gridPos": { "gridPos": {
}, },
@ -33529,6 +33646,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 2, "fill": 2,
"fillGradient": 0,
"gridPos": { "gridPos": {
"h": 9, "h": 9,
"w": 12, "w": 12,
@ -33631,6 +33749,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 2, "fill": 2,
"fillGradient": 0,
"gridPos": { "gridPos": {
"h": 9, "h": 9,
"w": 12, "w": 12,
@ -33744,6 +33863,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 2, "fill": 2,
"fillGradient": 0,
"gridPos": { "gridPos": {
"h": 9, "h": 9,
"w": 12, "w": 12,
@ -33846,6 +33966,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 2, "fill": 2,
"fillGradient": 0,
"gridPos": { "gridPos": {
"h": 9, "h": 9,
"w": 12, "w": 12,
@ -33978,6 +34099,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 2, "fill": 2,
"fillGradient": 0,
"gridPos": { "gridPos": {
"h": 9, "h": 9,
"w": 12, "w": 12,
@ -34078,6 +34200,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 2, "fill": 2,
"fillGradient": 0,
"gridPos": { "gridPos": {
"h": 9, "h": 9,
"w": 12, "w": 12,
@ -34189,6 +34312,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 2, "fill": 2,
"fillGradient": 0,
"gridPos": { "gridPos": {
"h": 9, "h": 9,
"w": 12, "w": 12,
@ -34289,6 +34413,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 2, "fill": 2,
"fillGradient": 0,
"gridPos": { "gridPos": {
"h": 9, "h": 9,
"w": 12, "w": 12,
@ -34409,6 +34534,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 2, "fill": 2,
"fillGradient": 0,
"gridPos": { "gridPos": {
"h": 9, "h": 9,
"w": 12, "w": 12,
@ -34509,6 +34635,7 @@ items:
"dashes": false, "dashes": false,
"datasource": "$datasource", "datasource": "$datasource",
"fill": 2, "fill": 2,
"fillGradient": 0,
"gridPos": { "gridPos": {
"h": 9, "h": 9,
"w": 12, "w": 12,

View File

@ -1,31 +0,0 @@
apiVersion: batch/v1beta1
kind: CronJob
metadata:
name: healthchecks-io
namespace: monitoring
spec:
schedule: "*/1 * * * *"
concurrencyPolicy: Forbid
successfulJobsHistoryLimit: 1
failedJobsHistoryLimit: 1
startingDeadlineSeconds: 200
jobTemplate:
spec:
template:
spec:
containers:
- name: pinghc
env:
- name: HCURL
valueFrom:
secretKeyRef:
name: healthchecks-io
key: HCURL
image: busybox
args:
- /bin/sh
- -c
- "date && echo $HCURL && /bin/wget -q -O - --no-check-certificate $HCURL"
restartPolicy: OnFailure

View File

@ -1,17 +0,0 @@
apiVersion: bitnami.com/v1alpha1
kind: SealedSecret
metadata:
creationTimestamp: null
name: healthchecks-io
namespace: monitoring
spec:
encryptedData:
HCURL: AgBEpwET1Qa1hQqAmwrNGBv4sL0ml8pGYPwgq9Aps3tYhBVqsXjV7U5RQa/txldg1umw2Zqx8MfvZTN2kmFk6bJTROCWqTxmxd4rHgnJYqRR0+Opn/BtDhVx4WTnehyM/il9ymddhMD+WRQDr/Wfxq/0UQdsy+IEYyVMQuOKEihZabxmXRyNeAl5ZBeQ0W1T29biJPx3rifS37RbGlJtCIYuNPh82d0KAMu1dszDnkln8k5CBv6mPD8BVHg+Z/y1v1jFhTIE3YOlGzCIjb8RrJj6MVm7zlauj8zrl30JvF2OAWDGGZDOL3b0G3IKd0Qp/eagT33Sx7vbppY/l1Vci6UQcVpde3u2+ATMbysRej04Mvcodq5OgkBFqbgCzx0UFTIq0wER/GuCoYbt+k8b3TouK5ChQet8EP0W/c7rLHcMY3c0UR00N7m5UeKZAzAkXSGV+u3M9K6PMp8pl0VuDo+IVgEIY7ku9rtzL7SPIfXS4u5w7fte13fOtKB/2sa11dNqAbHmidF+IO6ycjm8SZibC7NKyCxgIKWPfsFXhNUT2Nx7eBRrzR1QlqThIGRsDpX1RVplTwe/OLsBz0K99AyGDUkSBJdOZLaRT/b3T0nS8DE5x/e8MvFsbbDdGE2U/YhVrbfn072u/X979/RIm0oCjipvByZXhFmobRj9SP9RcK2UfjBSY7xyKnd2rjj1mnIs2S0CmwGFdJqoywHckJJOu3YP2oN2Q1U7+Fe4yciupAshgdszY2okHMtd4aDDJJKeKKFHpjpsuA==
template:
metadata:
creationTimestamp: null
name: healthchecks-io
namespace: monitoring
type: Opaque
status: {}

View File

@ -1,40 +0,0 @@
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: k8up
labels:
prometheus: k8s
role: alert-rules
spec:
groups:
- name: k8up.rules
rules:
- alert: baas_last_errors
expr: baas_backup_restic_last_errors > 0
for: 1m
labels:
severity: critical
annotations:
summary: Amount of errors of last restic backup
description: This alert is fired when error number is > 0
- alert: K8upBackupFailed
expr: rate(k8up_jobs_failed_counter[1d]) > 0
for: 1m
labels:
severity: critical
annotations:
summary: "Job in {{ $labels.namespace }} of type {{ $labels.jobType }} failed"
- alert: K8upBackupNotRunning
expr: sum(rate(k8up_jobs_total[25h])) == 0 and on(namespace) k8up_schedules_gauge > 0
for: 1m
labels:
severity: critical
annotations:
summary: "No K8up jobs were run in {{ $labels.namespace }} within the last 24 hours. Check the operator, there might be a deadlock"
- alert: K8upJobStuck
expr: k8up_jobs_queued_gauge{jobType="backup"} > 0 and on(namespace) k8up_schedules_gauge > 0
for: 24h
labels:
severity: critical
annotations:
summary: "K8up jobs are stuck in {{ $labels.namespace }} for the last 24 hours."

View File

@ -30,7 +30,6 @@ rules:
- daemonsets - daemonsets
- deployments - deployments
- replicasets - replicasets
- ingresses
verbs: verbs:
- list - list
- watch - watch
@ -105,6 +104,7 @@ rules:
- networking.k8s.io - networking.k8s.io
resources: resources:
- networkpolicies - networkpolicies
- ingresses
verbs: verbs:
- list - list
- watch - watch

View File

@ -3,7 +3,7 @@ kind: DaemonSet
metadata: metadata:
labels: labels:
app.kubernetes.io/name: node-exporter app.kubernetes.io/name: node-exporter
app.kubernetes.io/version: v0.18.1 app.kubernetes.io/version: v1.0.1
name: node-exporter name: node-exporter
namespace: monitoring namespace: monitoring
spec: spec:
@ -14,7 +14,7 @@ spec:
metadata: metadata:
labels: labels:
app.kubernetes.io/name: node-exporter app.kubernetes.io/name: node-exporter
app.kubernetes.io/version: v0.18.1 app.kubernetes.io/version: v1.0.1
spec: spec:
containers: containers:
- args: - args:
@ -25,7 +25,7 @@ spec:
- --no-collector.wifi - --no-collector.wifi
- --no-collector.hwmon - --no-collector.hwmon
- --collector.filesystem.ignored-mount-points=^/(dev|proc|sys|var/lib/docker/.+|var/lib/kubelet/pods/.+)($|/) - --collector.filesystem.ignored-mount-points=^/(dev|proc|sys|var/lib/docker/.+|var/lib/kubelet/pods/.+)($|/)
image: quay.io/prometheus/node-exporter:v0.18.1 image: quay.io/prometheus/node-exporter:v1.0.1
name: node-exporter name: node-exporter
resources: resources:
limits: limits:
@ -36,11 +36,13 @@ spec:
memory: 180Mi memory: 180Mi
volumeMounts: volumeMounts:
- mountPath: /host/proc - mountPath: /host/proc
mountPropagation: HostToContainer
name: proc name: proc
readOnly: false readOnly: true
- mountPath: /host/sys - mountPath: /host/sys
mountPropagation: HostToContainer
name: sys name: sys
readOnly: false readOnly: true
- mountPath: /host/root - mountPath: /host/root
mountPropagation: HostToContainer mountPropagation: HostToContainer
name: root name: root

View File

@ -3,7 +3,7 @@ kind: Service
metadata: metadata:
labels: labels:
app.kubernetes.io/name: node-exporter app.kubernetes.io/name: node-exporter
app.kubernetes.io/version: v0.18.1 app.kubernetes.io/version: v1.0.1
name: node-exporter name: node-exporter
namespace: monitoring namespace: monitoring
spec: spec:

View File

@ -3,7 +3,7 @@ kind: ServiceMonitor
metadata: metadata:
labels: labels:
app.kubernetes.io/name: node-exporter app.kubernetes.io/name: node-exporter
app.kubernetes.io/version: v0.18.1 app.kubernetes.io/version: v1.0.1
name: node-exporter name: node-exporter
namespace: monitoring namespace: monitoring
spec: spec:

View File

@ -4,7 +4,7 @@ metadata:
labels: labels:
app.kubernetes.io/component: controller app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/version: v0.42.0 app.kubernetes.io/version: v0.42.1
name: prometheus-operator name: prometheus-operator
namespace: monitoring namespace: monitoring
spec: spec:
@ -19,4 +19,4 @@ spec:
matchLabels: matchLabels:
app.kubernetes.io/component: controller app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/version: v0.42.0 app.kubernetes.io/version: v0.42.1

View File

@ -1019,6 +1019,8 @@ spec:
summary: Clock not synchronising. summary: Clock not synchronising.
expr: | expr: |
min_over_time(node_timex_sync_status[5m]) == 0 min_over_time(node_timex_sync_status[5m]) == 0
and
node_timex_maxerror_seconds >= 16
for: 10m for: 10m
labels: labels:
severity: warning severity: warning
@ -1044,6 +1046,75 @@ spec:
node_md_disks{state="fail"} > 0 node_md_disks{state="fail"} > 0
labels: labels:
severity: warning severity: warning
- name: prometheus-operator
rules:
- alert: PrometheusOperatorListErrors
annotations:
description: Errors while performing List operations in controller {{$labels.controller}}
in {{$labels.namespace}} namespace.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatorlisterrors
summary: Errors while performing list operations in controller.
expr: |
(sum by (controller,namespace) (rate(prometheus_operator_list_operations_failed_total{job="prometheus-operator",namespace="monitoring"}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_list_operations_total{job="prometheus-operator",namespace="monitoring"}[10m]))) > 0.4
for: 15m
labels:
severity: warning
- alert: PrometheusOperatorWatchErrors
annotations:
description: Errors while performing watch operations in controller {{$labels.controller}}
in {{$labels.namespace}} namespace.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatorwatcherrors
summary: Errors while performing watch operations in controller.
expr: |
(sum by (controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job="prometheus-operator",namespace="monitoring"}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_watch_operations_total{job="prometheus-operator",namespace="monitoring"}[10m]))) > 0.4
for: 15m
labels:
severity: warning
- alert: PrometheusOperatorSyncFailed
annotations:
description: Controller {{ $labels.controller }} in {{ $labels.namespace }}
namespace fails to reconcile {{ $value }} objects.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatorsyncfailed
summary: Last controller reconciliation failed
expr: |
min_over_time(prometheus_operator_syncs{status="failed",job="prometheus-operator",namespace="monitoring"}[5m]) > 0
for: 10m
labels:
severity: warning
- alert: PrometheusOperatorReconcileErrors
annotations:
description: '{{ $value | humanizePercentage }} of reconciling operations
failed for {{ $labels.controller }} controller in {{ $labels.namespace }}
namespace.'
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatorreconcileerrors
summary: Errors while reconciling controller.
expr: |
(sum by (controller,namespace) (rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator",namespace="monitoring"}[5m]))) / (sum by (controller,namespace) (rate(prometheus_operator_reconcile_operations_total{job="prometheus-operator",namespace="monitoring"}[5m]))) > 0.1
for: 10m
labels:
severity: warning
- alert: PrometheusOperatorNodeLookupErrors
annotations:
description: Errors while reconciling Prometheus in {{ $labels.namespace }}
Namespace.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatornodelookuperrors
summary: Errors while reconciling Prometheus.
expr: |
rate(prometheus_operator_node_address_lookup_errors_total{job="prometheus-operator",namespace="monitoring"}[5m]) > 0.1
for: 10m
labels:
severity: warning
- alert: PrometheusOperatorNotReady
annotations:
description: Prometheus operator in {{ $labels.namespace }} namespace isn't
ready to reconcile {{ $labels.controller }} resources.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatornotready
summary: Prometheus operator not ready
expr: |
min by(namespace, controller) (max_over_time(prometheus_operator_ready{job="prometheus-operator",namespace="monitoring"}[5m]) == 0)
for: 5m
labels:
severity: warning
- name: kubernetes-apps - name: kubernetes-apps
rules: rules:
- alert: KubePodCrashLooping - alert: KubePodCrashLooping
@ -1249,7 +1320,7 @@ spec:
- alert: KubeJobFailed - alert: KubeJobFailed
annotations: annotations:
description: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to description: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to
complete. complete. Removing failed job after investigation should clear this alert.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobfailed runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobfailed
summary: Job failed to complete. summary: Job failed to complete.
expr: | expr: |
@ -2031,40 +2102,3 @@ spec:
for: 2m for: 2m
labels: labels:
severity: warning severity: warning
- name: prometheus-operator
rules:
- alert: PrometheusOperatorListErrors
annotations:
message: Errors while performing List operations in controller {{$labels.controller}}
in {{$labels.namespace}} namespace.
expr: |
(sum by (controller,namespace) (rate(prometheus_operator_list_operations_failed_total{job="prometheus-operator",namespace="monitoring"}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_list_operations_total{job="prometheus-operator",namespace="monitoring"}[10m]))) > 0.4
for: 15m
labels:
severity: warning
- alert: PrometheusOperatorWatchErrors
annotations:
message: Errors while performing Watch operations in controller {{$labels.controller}}
in {{$labels.namespace}} namespace.
expr: |
(sum by (controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job="prometheus-operator",namespace="monitoring"}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_watch_operations_total{job="prometheus-operator",namespace="monitoring"}[10m]))) > 0.4
for: 15m
labels:
severity: warning
- alert: PrometheusOperatorReconcileErrors
annotations:
message: Errors while reconciling {{ $labels.controller }} in {{ $labels.namespace
}} Namespace.
expr: |
rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator",namespace="monitoring"}[5m]) > 0.1
for: 10m
labels:
severity: warning
- alert: PrometheusOperatorNodeLookupErrors
annotations:
message: Errors while reconciling Prometheus in {{ $labels.namespace }} Namespace.
expr: |
rate(prometheus_operator_node_address_lookup_errors_total{job="prometheus-operator",namespace="monitoring"}[5m]) > 0.1
for: 10m
labels:
severity: warning

View File

@ -53,6 +53,7 @@ spec:
insecureSkipVerify: true insecureSkipVerify: true
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
honorLabels: true honorLabels: true
honorTimestamps: false
interval: 30s interval: 30s
metricRelabelings: metricRelabelings:
- action: drop - action: drop

View File

@ -4,7 +4,7 @@ metadata:
labels: labels:
app.kubernetes.io/component: controller app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/version: v0.42.0 app.kubernetes.io/version: v0.42.1
name: prometheus-operator name: prometheus-operator
rules: rules:
- apiGroups: - apiGroups:

View File

@ -4,7 +4,7 @@ metadata:
labels: labels:
app.kubernetes.io/component: controller app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/version: v0.42.0 app.kubernetes.io/version: v0.42.1
name: prometheus-operator name: prometheus-operator
roleRef: roleRef:
apiGroup: rbac.authorization.k8s.io apiGroup: rbac.authorization.k8s.io

View File

@ -4,7 +4,7 @@ metadata:
labels: labels:
app.kubernetes.io/component: controller app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/version: v0.42.0 app.kubernetes.io/version: v0.42.1
name: prometheus-operator name: prometheus-operator
namespace: monitoring namespace: monitoring
spec: spec:
@ -18,15 +18,15 @@ spec:
labels: labels:
app.kubernetes.io/component: controller app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/version: v0.42.0 app.kubernetes.io/version: v0.42.1
spec: spec:
containers: containers:
- args: - args:
- --kubelet-service=kube-system/kubelet - --kubelet-service=kube-system/kubelet
- --logtostderr=true - --logtostderr=true
- --config-reloader-image=jimmidyson/configmap-reload:v0.4.0 - --config-reloader-image=jimmidyson/configmap-reload:v0.4.0
- --prometheus-config-reloader=quay.io/prometheus-operator/prometheus-config-reloader:v0.42.0 - --prometheus-config-reloader=quay.io/prometheus-operator/prometheus-config-reloader:v0.42.1
image: quay.io/prometheus-operator/prometheus-operator:v0.42.0 image: quay.io/prometheus-operator/prometheus-operator:v0.42.1
name: prometheus-operator name: prometheus-operator
ports: ports:
- containerPort: 8080 - containerPort: 8080

View File

@ -4,7 +4,7 @@ metadata:
labels: labels:
app.kubernetes.io/component: controller app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/version: v0.42.0 app.kubernetes.io/version: v0.42.1
name: prometheus-operator name: prometheus-operator
namespace: monitoring namespace: monitoring
spec: spec:

View File

@ -4,6 +4,6 @@ metadata:
labels: labels:
app.kubernetes.io/component: controller app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/version: v0.42.0 app.kubernetes.io/version: v0.42.1
name: prometheus-operator name: prometheus-operator
namespace: monitoring namespace: monitoring

View File

@ -1,17 +0,0 @@
---
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: traefik
namespace: monitoring
spec:
endpoints:
- interval: 30s
path: /metrics
port: metrics
namespaceSelector:
matchNames:
- kube-system
selector:
matchLabels:
app: traefik

View File

@ -34,7 +34,8 @@
severity: 'critical', severity: 'critical',
}, },
annotations: { annotations: {
message: 'etcd cluster "{{ $labels.job }}": members are down ({{ $value }}).', description: 'etcd cluster "{{ $labels.job }}": members are down ({{ $value }}).',
summary: 'etcd cluster members are down.',
}, },
}, },
{ {
@ -47,7 +48,8 @@
severity: 'critical', severity: 'critical',
}, },
annotations: { annotations: {
message: 'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value }}).', description: 'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value }}).',
summary: 'etcd cluster has insufficient number of members.',
}, },
}, },
{ {
@ -60,7 +62,8 @@
severity: 'critical', severity: 'critical',
}, },
annotations: { annotations: {
message: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance }} has no leader.', description: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance }} has no leader.',
summary: 'etcd cluster has no leader.',
}, },
}, },
{ {
@ -73,7 +76,8 @@
severity: 'warning', severity: 'warning',
}, },
annotations: { annotations: {
message: 'etcd cluster "{{ $labels.job }}": {{ $value }} leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.', description: 'etcd cluster "{{ $labels.job }}": {{ $value }} leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.',
summary: 'etcd cluster has high number of leader changes.',
}, },
}, },
{ {
@ -89,7 +93,8 @@
severity: 'warning', severity: 'warning',
}, },
annotations: { annotations: {
message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.', description: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.',
summary: 'etcd cluster has high number of failed grpc requests.',
}, },
}, },
{ {
@ -105,7 +110,8 @@
severity: 'critical', severity: 'critical',
}, },
annotations: { annotations: {
message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.', description: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.',
summary: 'etcd cluster has high number of failed grpc requests.',
}, },
}, },
{ {
@ -119,7 +125,8 @@
severity: 'critical', severity: 'critical',
}, },
annotations: { annotations: {
message: 'etcd cluster "{{ $labels.job }}": gRPC requests to {{ $labels.grpc_method }} are taking {{ $value }}s on etcd instance {{ $labels.instance }}.', description: 'etcd cluster "{{ $labels.job }}": gRPC requests to {{ $labels.grpc_method }} are taking {{ $value }}s on etcd instance {{ $labels.instance }}.',
summary: 'etcd grpc requests are slow',
}, },
}, },
{ {
@ -133,7 +140,8 @@
severity: 'warning', severity: 'warning',
}, },
annotations: { annotations: {
message: 'etcd cluster "{{ $labels.job }}": member communication with {{ $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance }}.', description: 'etcd cluster "{{ $labels.job }}": member communication with {{ $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance }}.',
summary: 'etcd cluster member communication is slow.',
}, },
}, },
{ {
@ -146,7 +154,8 @@
severity: 'warning', severity: 'warning',
}, },
annotations: { annotations: {
message: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures within the last 30 minutes on etcd instance {{ $labels.instance }}.', description: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures within the last 30 minutes on etcd instance {{ $labels.instance }}.',
summary: 'etcd cluster has high number of proposal failures.',
}, },
}, },
{ {
@ -159,6 +168,21 @@
labels: { labels: {
severity: 'warning', severity: 'warning',
}, },
annotations: {
description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.',
summary: 'etcd cluster 99th percentile fsync durations are too high.',
},
},
{
alert: 'etcdHighFsyncDurations',
expr: |||
histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{%(etcd_selector)s}[5m]))
> 1
||| % $._config,
'for': '10m',
labels: {
severity: 'critical',
},
annotations: { annotations: {
message: 'etcd cluster "{{ $labels.job }}": 99th percentile fync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.', message: 'etcd cluster "{{ $labels.job }}": 99th percentile fync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.',
}, },
@ -174,7 +198,8 @@
severity: 'warning', severity: 'warning',
}, },
annotations: { annotations: {
message: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations {{ $value }}s on etcd instance {{ $labels.instance }}.', description: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations {{ $value }}s on etcd instance {{ $labels.instance }}.',
summary: 'etcd cluster 99th percentile commit durations are too high.',
}, },
}, },
{ {
@ -188,7 +213,8 @@
severity: 'warning', severity: 'warning',
}, },
annotations: { annotations: {
message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}', description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}',
summary: 'etcd has high number of failed HTTP requests.',
}, },
}, },
{ {
@ -202,7 +228,8 @@
severity: 'critical', severity: 'critical',
}, },
annotations: { annotations: {
message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}.', description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}.',
summary: 'etcd has high number of failed HTTP requests.',
}, },
}, },
{ {
@ -216,9 +243,36 @@
severity: 'warning', severity: 'warning',
}, },
annotations: { annotations: {
message: 'etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow.', description: 'etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow.',
summary: 'etcd instance HTTP requests are slow.',
}, },
}, },
{
alert: 'etcdBackendQuotaLowSpace',
expr: |||
(etcd_mvcc_db_total_size_in_bytes/etcd_server_quota_backend_bytes)*100 > 95
||| % $._config,
'for': '10m',
labels: {
severity: 'critical',
},
annotations: {
message: 'etcd cluster "{{ $labels.job }}": database size exceeds the defined quota on etcd instance {{ $labels.instance }}, please defrag or increase the quota as the writes to etcd will be disabled when it is full.',
},
},
{
alert: 'etcdExcessiveDatabaseGrowth',
expr: |||
increase(((etcd_mvcc_db_total_size_in_bytes/etcd_server_quota_backend_bytes)*100)[240m:1m]) > 50
||| % $._config,
'for': '10m',
labels: {
severity: 'warning',
},
annotations: {
message: 'etcd cluster "{{ $labels.job }}": Observed surge in etcd writes leading to 50% increase in database size over the past four hours on etcd instance {{ $labels.instance }}, please check as it might be disruptive.',
},
},
], ],
}, },
], ],

View File

@ -26,7 +26,8 @@ tests:
job: etcd job: etcd
severity: critical severity: critical
exp_annotations: exp_annotations:
message: 'etcd cluster "etcd": members are down (3).' description: 'etcd cluster "etcd": members are down (3).'
summary: 'etcd cluster members are down.'
- eval_time: 7m - eval_time: 7m
alertname: etcdInsufficientMembers alertname: etcdInsufficientMembers
- eval_time: 11m - eval_time: 11m
@ -36,7 +37,8 @@ tests:
job: etcd job: etcd
severity: critical severity: critical
exp_annotations: exp_annotations:
message: 'etcd cluster "etcd": insufficient members (1).' description: 'etcd cluster "etcd": insufficient members (1).'
summary: 'etcd cluster has insufficient number of members.'
- eval_time: 15m - eval_time: 15m
alertname: etcdInsufficientMembers alertname: etcdInsufficientMembers
exp_alerts: exp_alerts:
@ -44,7 +46,8 @@ tests:
job: etcd job: etcd
severity: critical severity: critical
exp_annotations: exp_annotations:
message: 'etcd cluster "etcd": insufficient members (0).' description: 'etcd cluster "etcd": insufficient members (0).'
summary: 'etcd cluster has insufficient number of members.'
- interval: 1m - interval: 1m
input_series: input_series:
@ -62,7 +65,8 @@ tests:
job: etcd job: etcd
severity: critical severity: critical
exp_annotations: exp_annotations:
message: 'etcd cluster "etcd": members are down (3).' description: 'etcd cluster "etcd": members are down (3).'
summary: 'etcd cluster members are down.'
- interval: 1m - interval: 1m
input_series: input_series:
@ -80,7 +84,8 @@ tests:
job: etcd job: etcd
severity: critical severity: critical
exp_annotations: exp_annotations:
message: 'etcd cluster "etcd": members are down (1).' description: 'etcd cluster "etcd": members are down (1).'
summary: 'etcd cluster members are down.'
- interval: 1m - interval: 1m
input_series: input_series:
- series: 'etcd_server_leader_changes_seen_total{job="etcd",instance="10.10.10.0"}' - series: 'etcd_server_leader_changes_seen_total{job="etcd",instance="10.10.10.0"}'
@ -97,7 +102,8 @@ tests:
job: etcd job: etcd
severity: warning severity: warning
exp_annotations: exp_annotations:
message: 'etcd cluster "etcd": 4 leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.' description: 'etcd cluster "etcd": 4 leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.'
summary: 'etcd cluster has high number of leader changes.'
- interval: 1m - interval: 1m
input_series: input_series:
- series: 'etcd_server_leader_changes_seen_total{job="etcd",instance="10.10.10.0"}' - series: 'etcd_server_leader_changes_seen_total{job="etcd",instance="10.10.10.0"}'
@ -110,4 +116,20 @@ tests:
- eval_time: 10m - eval_time: 10m
alertname: etcdHighNumberOfLeaderChanges alertname: etcdHighNumberOfLeaderChanges
exp_alerts: exp_alerts:
- interval: 1m
input_series:
- series: '((etcd_mvcc_db_total_size_in_bytes/etcd_server_quota_backend_bytes)*100){job="etcd",instance="10.10.10.0"}'
values: '0 10 20 0 0 10 0 0 30 0 0 0 0 0 0 0'
- series: '((etcd_mvcc_db_total_size_in_bytes/etcd_server_quota_backend_bytes)*100){job="etcd",instance="10.10.10.1"}'
values: '0 0 10 0 20 0 0 0 0 0 0 0 0 0 0 0'
- series: '((etcd_mvcc_db_total_size_in_bytes/etcd_server_quota_backend_bytes)*100){job="etcd",instance="10.10.10.2"}'
values: '0 0 0 0 0 0 0 0'
alert_rule_test:
- eval_time: 10m
alertname: etcdExcessiveDatabaseGrowth
exp_alerts:
- exp_labels:
job: etcd
severity: warning
exp_annotations:
message: 'etcd cluster "etcd": Observed surge in etcd writes leading to 50% increase in database size over the past four hours, please check as it might be disruptive.'

View File

@ -10,6 +10,7 @@
* @param span (optional) Width of the panel * @param span (optional) Width of the panel
* @param datasource (optional) Datasource * @param datasource (optional) Datasource
* @param fill (default `1`) , integer from 0 to 10 * @param fill (default `1`) , integer from 0 to 10
* @param fillGradient (default `0`) , integer from 0 to 10
* @param linewidth (default `1`) Line Width, integer from 0 to 10 * @param linewidth (default `1`) Line Width, integer from 0 to 10
* @param decimals (optional) Override automatic decimal precision for legend and tooltip. If null, not added to the json output. * @param decimals (optional) Override automatic decimal precision for legend and tooltip. If null, not added to the json output.
* @param decimalsY1 (optional) Override automatic decimal precision for the first Y axis. If null, use decimals parameter. * @param decimalsY1 (optional) Override automatic decimal precision for the first Y axis. If null, use decimals parameter.
@ -63,11 +64,13 @@
* @method addYaxis(format,min,max,label,show,logBase,decimals) Adds a Y axis to the graph * @method addYaxis(format,min,max,label,show,logBase,decimals) Adds a Y axis to the graph
* @method addAlert(alert) Adds an alert * @method addAlert(alert) Adds an alert
* @method addLink(link) Adds a [panel link](https://grafana.com/docs/grafana/latest/linking/panel-links/) * @method addLink(link) Adds a [panel link](https://grafana.com/docs/grafana/latest/linking/panel-links/)
* @method addLinks(links) Adds an array of links.
*/ */
new( new(
title, title,
span=null, span=null,
fill=1, fill=1,
fillGradient=0,
linewidth=1, linewidth=1,
decimals=null, decimals=null,
decimalsY1=null, decimalsY1=null,
@ -166,6 +169,7 @@
}, },
lines: lines, lines: lines,
fill: fill, fill: fill,
fillGradient: fillGradient,
linewidth: linewidth, linewidth: linewidth,
dashes: dashes, dashes: dashes,
dashLength: 10, dashLength: 10,
@ -283,5 +287,6 @@
addLink(link):: self { addLink(link):: self {
links+: [link], links+: [link],
}, },
addLinks(links):: std.foldl(function(p, t) p.addLink(t), links, self),
}, },
} }

View File

@ -6,12 +6,15 @@
* *
* @param expr * @param expr
* @param hide (optional) Disable query on graph. * @param hide (optional) Disable query on graph.
* @param legendFormat (optional) Defines the legend. Defaults to ''.
*/ */
target( target(
expr, expr,
hide=null, hide=null,
legendFormat='',
):: { ):: {
[if hide != null then 'hide']: hide, [if hide != null then 'hide']: hide,
expr: expr, expr: expr,
legendFormat: legendFormat,
}, },
} }

View File

@ -258,7 +258,7 @@
severity: 'warning', severity: 'warning',
}, },
annotations: { annotations: {
description: 'Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete.', description: 'Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete. Removing failed job after investigation should clear this alert.',
summary: 'Job failed to complete.', summary: 'Job failed to complete.',
}, },
}, },

View File

@ -337,11 +337,11 @@ local g = import 'github.com/grafana/jsonnet-libs/grafana-builder/grafana.libson
legend_avg=true, legend_avg=true,
) )
.addTarget(prometheus.target( .addTarget(prometheus.target(
'sort_desc(sum by (container) (rate(windows_container_network_receive_bytes_total{namespace="$namespace", pod="$pod"}[1m])))' % $._config, 'sort_desc(sum by (container) (rate(windows_container_network_received_bytes_total{namespace="$namespace", pod="$pod"}[1m])))' % $._config,
legendFormat='Received : {{ container }}', legendFormat='Received : {{ container }}',
)) ))
.addTarget(prometheus.target( .addTarget(prometheus.target(
'sort_desc(sum by (container) (rate(windows_container_network_transmit_bytes_total{namespace="$namespace", pod="$pod"}[1m])))' % $._config, 'sort_desc(sum by (container) (rate(windows_container_network_transmitted_bytes_total{namespace="$namespace", pod="$pod"}[1m])))' % $._config,
legendFormat='Transmitted : {{ container }}', legendFormat='Transmitted : {{ container }}',
)) ))
) )

View File

@ -202,13 +202,13 @@
||| % $._config, ||| % $._config,
}, },
{ {
record: 'windows_container_network_receive_bytes_total', record: 'windows_container_network_received_bytes_total',
expr: ||| expr: |||
windows_container_network_receive_bytes_total{%(wmiExporterSelector)s} * on(container_id) group_left(container, pod, namespace) max(kube_pod_container_info{%(kubeStateMetricsSelector)s}) by(container, container_id, pod, namespace) windows_container_network_receive_bytes_total{%(wmiExporterSelector)s} * on(container_id) group_left(container, pod, namespace) max(kube_pod_container_info{%(kubeStateMetricsSelector)s}) by(container, container_id, pod, namespace)
||| % $._config, ||| % $._config,
}, },
{ {
record: 'windows_container_network_transmit_bytes_total', record: 'windows_container_network_transmitted_bytes_total',
expr: ||| expr: |||
windows_container_network_transmit_bytes_total{%(wmiExporterSelector)s} * on(container_id) group_left(container, pod, namespace) max(kube_pod_container_info{%(kubeStateMetricsSelector)s}) by(container, container_id, pod, namespace) windows_container_network_transmit_bytes_total{%(wmiExporterSelector)s} * on(container_id) group_left(container, pod, namespace) max(kube_pod_container_info{%(kubeStateMetricsSelector)s}) by(container, container_id, pod, namespace)
||| % $._config, ||| % $._config,

View File

@ -58,7 +58,6 @@ local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
'daemonsets', 'daemonsets',
'deployments', 'deployments',
'replicasets', 'replicasets',
'ingresses',
]) + ]) +
rulesType.withVerbs(['list', 'watch']), rulesType.withVerbs(['list', 'watch']),
@ -135,6 +134,7 @@ local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
rulesType.withApiGroups(['networking.k8s.io']) + rulesType.withApiGroups(['networking.k8s.io']) +
rulesType.withResources([ rulesType.withResources([
'networkpolicies', 'networkpolicies',
'ingresses',
]) + ]) +
rulesType.withVerbs(['list', 'watch']), rulesType.withVerbs(['list', 'watch']),
@ -228,6 +228,7 @@ local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
roleBinding.new() + roleBinding.new() +
roleBinding.mixin.metadata.withName(ksm.name) + roleBinding.mixin.metadata.withName(ksm.name) +
roleBinding.mixin.metadata.withNamespace(ksm.namespace) +
roleBinding.mixin.metadata.withLabels(ksm.commonLabels) + roleBinding.mixin.metadata.withLabels(ksm.commonLabels) +
roleBinding.mixin.roleRef.withApiGroup('rbac.authorization.k8s.io') + roleBinding.mixin.roleRef.withApiGroup('rbac.authorization.k8s.io') +
roleBinding.mixin.roleRef.withName(ksm.name) + roleBinding.mixin.roleRef.withName(ksm.name) +

View File

@ -1,4 +1,3 @@
(import 'alertmanager.libsonnet') + (import 'alertmanager.libsonnet') +
(import 'general.libsonnet') + (import 'general.libsonnet') +
(import 'node.libsonnet') + (import 'node.libsonnet')
(import 'prometheus-operator.libsonnet')

View File

@ -1,63 +0,0 @@
{
prometheusAlerts+:: {
groups+: [
{
name: 'prometheus-operator',
rules: [
{
alert: 'PrometheusOperatorListErrors',
expr: |||
(sum by (controller,namespace) (rate(prometheus_operator_list_operations_failed_total{%(prometheusOperatorSelector)s}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_list_operations_total{%(prometheusOperatorSelector)s}[10m]))) > 0.4
||| % $._config,
labels: {
severity: 'warning',
},
annotations: {
message: 'Errors while performing List operations in controller {{$labels.controller}} in {{$labels.namespace}} namespace.',
},
'for': '15m',
},
{
alert: 'PrometheusOperatorWatchErrors',
expr: |||
(sum by (controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{%(prometheusOperatorSelector)s}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_watch_operations_total{%(prometheusOperatorSelector)s}[10m]))) > 0.4
||| % $._config,
labels: {
severity: 'warning',
},
annotations: {
message: 'Errors while performing Watch operations in controller {{$labels.controller}} in {{$labels.namespace}} namespace.',
},
'for': '15m',
},
{
alert: 'PrometheusOperatorReconcileErrors',
expr: |||
rate(prometheus_operator_reconcile_errors_total{%(prometheusOperatorSelector)s}[5m]) > 0.1
||| % $._config,
labels: {
severity: 'warning',
},
annotations: {
message: 'Errors while reconciling {{ $labels.controller }} in {{ $labels.namespace }} Namespace.',
},
'for': '10m',
},
{
alert: 'PrometheusOperatorNodeLookupErrors',
expr: |||
rate(prometheus_operator_node_address_lookup_errors_total{%(prometheusOperatorSelector)s}[5m]) > 0.1
||| % $._config,
labels: {
severity: 'warning',
},
annotations: {
message: 'Errors while reconciling Prometheus in {{ $labels.namespace }} Namespace.',
},
'for': '10m',
},
],
},
],
},
}

View File

@ -28,6 +28,15 @@
}, },
"version": "release-0.42" "version": "release-0.42"
}, },
{
"source": {
"git": {
"remote": "https://github.com/prometheus-operator/prometheus-operator",
"subdir": "jsonnet/mixin"
}
},
"version": "master"
},
{ {
"source": { "source": {
"git": { "git": {

View File

@ -1,6 +1,7 @@
local k = import 'github.com/ksonnet/ksonnet-lib/ksonnet.beta.4/k.libsonnet'; local k = import 'github.com/ksonnet/ksonnet-lib/ksonnet.beta.4/k.libsonnet';
local k3 = import 'github.com/ksonnet/ksonnet-lib/ksonnet.beta.3/k.libsonnet'; local k3 = import 'github.com/ksonnet/ksonnet-lib/ksonnet.beta.3/k.libsonnet';
local configMapList = k3.core.v1.configMapList; local configMapList = k3.core.v1.configMapList;
local kubeRbacProxyContainer = import './kube-rbac-proxy/container.libsonnet';
(import 'github.com/brancz/kubernetes-grafana/grafana/grafana.libsonnet') + (import 'github.com/brancz/kubernetes-grafana/grafana/grafana.libsonnet') +
(import './kube-state-metrics/kube-state-metrics.libsonnet') + (import './kube-state-metrics/kube-state-metrics.libsonnet') +
@ -9,6 +10,7 @@ local configMapList = k3.core.v1.configMapList;
(import 'github.com/prometheus/node_exporter/docs/node-mixin/mixin.libsonnet') + (import 'github.com/prometheus/node_exporter/docs/node-mixin/mixin.libsonnet') +
(import './alertmanager/alertmanager.libsonnet') + (import './alertmanager/alertmanager.libsonnet') +
(import 'github.com/prometheus-operator/prometheus-operator/jsonnet/prometheus-operator/prometheus-operator.libsonnet') + (import 'github.com/prometheus-operator/prometheus-operator/jsonnet/prometheus-operator/prometheus-operator.libsonnet') +
(import 'github.com/prometheus-operator/prometheus-operator/jsonnet/mixin/mixin.libsonnet') +
(import './prometheus/prometheus.libsonnet') + (import './prometheus/prometheus.libsonnet') +
(import './prometheus-adapter/prometheus-adapter.libsonnet') + (import './prometheus-adapter/prometheus-adapter.libsonnet') +
(import 'github.com/kubernetes-monitoring/kubernetes-mixin/mixin.libsonnet') + (import 'github.com/kubernetes-monitoring/kubernetes-mixin/mixin.libsonnet') +
@ -60,7 +62,7 @@ local configMapList = k3.core.v1.configMapList;
], ],
}, },
} + } +
((import 'kube-prometheus/kube-rbac-proxy/container.libsonnet') { (kubeRbacProxyContainer {
config+:: { config+:: {
kubeRbacProxy: { kubeRbacProxy: {
local cfg = self, local cfg = self,

View File

@ -1,3 +1,6 @@
local kubeRbacProxyContainer = import '../kube-rbac-proxy/container.libsonnet';
local ksm = import 'github.com/kubernetes/kube-state-metrics/jsonnet/kube-state-metrics/kube-state-metrics.libsonnet';
{ {
_config+:: { _config+:: {
versions+:: { versions+:: {
@ -11,119 +14,119 @@
scrapeTimeout: '30s', scrapeTimeout: '30s',
}, },
}, },
kubeStateMetrics+:: (import 'github.com/kubernetes/kube-state-metrics/jsonnet/kube-state-metrics/kube-state-metrics.libsonnet') + kubeStateMetrics+::
{ ksm + {
local ksm = self, local version = self.version,
name:: 'kube-state-metrics', name:: 'kube-state-metrics',
namespace:: $._config.namespace, namespace:: $._config.namespace,
version:: $._config.versions.kubeStateMetrics, version:: $._config.versions.kubeStateMetrics,
image:: $._config.imageRepos.kubeStateMetrics + ':v' + $._config.versions.kubeStateMetrics, image:: $._config.imageRepos.kubeStateMetrics + ':v' + $._config.versions.kubeStateMetrics,
service+: { service+: {
spec+: { spec+: {
ports: [ ports: [
{ {
name: 'https-main', name: 'https-main',
port: 8443, port: 8443,
targetPort: 'https-main', targetPort: 'https-main',
}, },
{ {
name: 'https-self', name: 'https-self',
port: 9443, port: 9443,
targetPort: 'https-self', targetPort: 'https-self',
}, },
], ],
}, },
}, },
deployment+: { deployment+: {
spec+: { spec+: {
template+: { template+: {
spec+: { spec+: {
containers: std.map(function(c) c { containers: std.map(function(c) c {
ports:: null, ports:: null,
livenessProbe:: null, livenessProbe:: null,
readinessProbe:: null, readinessProbe:: null,
args: ['--host=127.0.0.1', '--port=8081', '--telemetry-host=127.0.0.1', '--telemetry-port=8082'], args: ['--host=127.0.0.1', '--port=8081', '--telemetry-host=127.0.0.1', '--telemetry-port=8082'],
}, super.containers), }, super.containers),
}, },
}, },
}, },
}, },
serviceMonitor: serviceMonitor:
{ {
apiVersion: 'monitoring.coreos.com/v1', apiVersion: 'monitoring.coreos.com/v1',
kind: 'ServiceMonitor', kind: 'ServiceMonitor',
metadata: { metadata: {
name: 'kube-state-metrics', name: 'kube-state-metrics',
namespace: $._config.namespace, namespace: $._config.namespace,
labels: { labels: {
'app.kubernetes.io/name': 'kube-state-metrics', 'app.kubernetes.io/name': 'kube-state-metrics',
'app.kubernetes.io/version': ksm.version, 'app.kubernetes.io/version': version,
}, },
}, },
spec: { spec: {
jobLabel: 'app.kubernetes.io/name', jobLabel: 'app.kubernetes.io/name',
selector: { selector: {
matchLabels: { matchLabels: {
'app.kubernetes.io/name': 'kube-state-metrics', 'app.kubernetes.io/name': 'kube-state-metrics',
}, },
}, },
endpoints: [ endpoints: [
{ {
port: 'https-main', port: 'https-main',
scheme: 'https', scheme: 'https',
interval: $._config.kubeStateMetrics.scrapeInterval, interval: $._config.kubeStateMetrics.scrapeInterval,
scrapeTimeout: $._config.kubeStateMetrics.scrapeTimeout, scrapeTimeout: $._config.kubeStateMetrics.scrapeTimeout,
honorLabels: true, honorLabels: true,
bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token', bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token',
relabelings: [ relabelings: [
{ {
regex: '(pod|service|endpoint|namespace)', regex: '(pod|service|endpoint|namespace)',
action: 'labeldrop', action: 'labeldrop',
}, },
], ],
tlsConfig: { tlsConfig: {
insecureSkipVerify: true, insecureSkipVerify: true,
}, },
}, },
{ {
port: 'https-self', port: 'https-self',
scheme: 'https', scheme: 'https',
interval: $._config.kubeStateMetrics.scrapeInterval, interval: $._config.kubeStateMetrics.scrapeInterval,
bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token', bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token',
tlsConfig: { tlsConfig: {
insecureSkipVerify: true, insecureSkipVerify: true,
}, },
}, },
], ],
}, },
}, },
} + } +
((import 'kube-prometheus/kube-rbac-proxy/container.libsonnet') { (kubeRbacProxyContainer {
config+:: { config+:: {
kubeRbacProxy: { kubeRbacProxy: {
local cfg = self, local cfg = self,
image: $._config.imageRepos.kubeRbacProxy + ':' + $._config.versions.kubeRbacProxy, image: $._config.imageRepos.kubeRbacProxy + ':' + $._config.versions.kubeRbacProxy,
name: 'kube-rbac-proxy-main', name: 'kube-rbac-proxy-main',
securePortName: 'https-main', securePortName: 'https-main',
securePort: 8443, securePort: 8443,
secureListenAddress: ':%d' % self.securePort, secureListenAddress: ':%d' % self.securePort,
upstream: 'http://127.0.0.1:8081/', upstream: 'http://127.0.0.1:8081/',
tlsCipherSuites: $._config.tlsCipherSuites, tlsCipherSuites: $._config.tlsCipherSuites,
}, },
}, },
}).deploymentMixin + }).deploymentMixin +
((import 'kube-prometheus/kube-rbac-proxy/container.libsonnet') { (kubeRbacProxyContainer {
config+:: { config+:: {
kubeRbacProxy: { kubeRbacProxy: {
local cfg = self, local cfg = self,
image: $._config.imageRepos.kubeRbacProxy + ':' + $._config.versions.kubeRbacProxy, image: $._config.imageRepos.kubeRbacProxy + ':' + $._config.versions.kubeRbacProxy,
name: 'kube-rbac-proxy-self', name: 'kube-rbac-proxy-self',
securePortName: 'https-self', securePortName: 'https-self',
securePort: 9443, securePort: 9443,
secureListenAddress: ':%d' % self.securePort, secureListenAddress: ':%d' % self.securePort,
upstream: 'http://127.0.0.1:8082/', upstream: 'http://127.0.0.1:8082/',
tlsCipherSuites: $._config.tlsCipherSuites, tlsCipherSuites: $._config.tlsCipherSuites,
}, },
}, },
}).deploymentMixin, }).deploymentMixin,
} }

View File

@ -5,7 +5,7 @@ local k = import 'github.com/ksonnet/ksonnet-lib/ksonnet.beta.4/k.libsonnet';
namespace: 'default', namespace: 'default',
versions+:: { versions+:: {
nodeExporter: 'v0.18.1', nodeExporter: 'v1.0.1',
}, },
imageRepos+:: { imageRepos+:: {
@ -79,11 +79,15 @@ local k = import 'github.com/ksonnet/ksonnet-lib/ksonnet.beta.4/k.libsonnet';
toleration.withOperator('Exists'); toleration.withOperator('Exists');
local procVolumeName = 'proc'; local procVolumeName = 'proc';
local procVolume = volume.fromHostPath(procVolumeName, '/proc'); local procVolume = volume.fromHostPath(procVolumeName, '/proc');
local procVolumeMount = containerVolumeMount.new(procVolumeName, '/host/proc'); local procVolumeMount = containerVolumeMount.new(procVolumeName, '/host/proc').
withMountPropagation('HostToContainer').
withReadOnly(true);
local sysVolumeName = 'sys'; local sysVolumeName = 'sys';
local sysVolume = volume.fromHostPath(sysVolumeName, '/sys'); local sysVolume = volume.fromHostPath(sysVolumeName, '/sys');
local sysVolumeMount = containerVolumeMount.new(sysVolumeName, '/host/sys'); local sysVolumeMount = containerVolumeMount.new(sysVolumeName, '/host/sys').
withMountPropagation('HostToContainer').
withReadOnly(true);
local rootVolumeName = 'root'; local rootVolumeName = 'root';
local rootVolume = volume.fromHostPath(rootVolumeName, '/'); local rootVolume = volume.fromHostPath(rootVolumeName, '/');

View File

@ -312,6 +312,7 @@ local k = import 'github.com/ksonnet/ksonnet-lib/ksonnet.beta.4/k.libsonnet';
path: '/metrics/cadvisor', path: '/metrics/cadvisor',
interval: '30s', interval: '30s',
honorLabels: true, honorLabels: true,
honorTimestamps: false,
tlsConfig: { tlsConfig: {
insecureSkipVerify: true, insecureSkipVerify: true,
}, },

View File

@ -0,0 +1,3 @@
(
import 'mixin.libsonnet'
).prometheusAlerts

View File

@ -0,0 +1,95 @@
{
prometheusAlerts+:: {
groups+: [
{
name: 'prometheus-operator',
rules: [
{
alert: 'PrometheusOperatorListErrors',
expr: |||
(sum by (controller,namespace) (rate(prometheus_operator_list_operations_failed_total{%(prometheusOperatorSelector)s}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_list_operations_total{%(prometheusOperatorSelector)s}[10m]))) > 0.4
||| % $._config,
labels: {
severity: 'warning',
},
annotations: {
description: 'Errors while performing List operations in controller {{$labels.controller}} in {{$labels.namespace}} namespace.',
summary: 'Errors while performing list operations in controller.',
},
'for': '15m',
},
{
alert: 'PrometheusOperatorWatchErrors',
expr: |||
(sum by (controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{%(prometheusOperatorSelector)s}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_watch_operations_total{%(prometheusOperatorSelector)s}[10m]))) > 0.4
||| % $._config,
labels: {
severity: 'warning',
},
annotations: {
description: 'Errors while performing watch operations in controller {{$labels.controller}} in {{$labels.namespace}} namespace.',
summary: 'Errors while performing watch operations in controller.',
},
'for': '15m',
},
{
alert: 'PrometheusOperatorSyncFailed',
expr: |||
min_over_time(prometheus_operator_syncs{status="failed",%(prometheusOperatorSelector)s}[5m]) > 0
||| % $._config,
labels: {
severity: 'warning',
},
annotations: {
description: 'Controller {{ $labels.controller }} in {{ $labels.namespace }} namespace fails to reconcile {{ $value }} objects.',
summary: 'Last controller reconciliation failed',
},
'for': '10m',
},
{
alert: 'PrometheusOperatorReconcileErrors',
expr: |||
(sum by (controller,namespace) (rate(prometheus_operator_reconcile_errors_total{%(prometheusOperatorSelector)s}[5m]))) / (sum by (controller,namespace) (rate(prometheus_operator_reconcile_operations_total{%(prometheusOperatorSelector)s}[5m]))) > 0.1
||| % $._config,
labels: {
severity: 'warning',
},
annotations: {
description: '{{ $value | humanizePercentage }} of reconciling operations failed for {{ $labels.controller }} controller in {{ $labels.namespace }} namespace.',
summary: 'Errors while reconciling controller.',
},
'for': '10m',
},
{
alert: 'PrometheusOperatorNodeLookupErrors',
expr: |||
rate(prometheus_operator_node_address_lookup_errors_total{%(prometheusOperatorSelector)s}[5m]) > 0.1
||| % $._config,
labels: {
severity: 'warning',
},
annotations: {
description: 'Errors while reconciling Prometheus in {{ $labels.namespace }} Namespace.',
summary: 'Errors while reconciling Prometheus.',
},
'for': '10m',
},
{
alert: 'PrometheusOperatorNotReady',
expr: |||
min by(namespace, controller) (max_over_time(prometheus_operator_ready{%(prometheusOperatorSelector)s}[5m]) == 0)
||| % $._config,
labels: {
severity: 'warning',
},
annotations: {
description: "Prometheus operator in {{ $labels.namespace }} namespace isn't ready to reconcile {{ $labels.controller }} resources.",
summary: 'Prometheus operator not ready',
},
'for': '5m',
},
],
},
],
},
}

View File

@ -0,0 +1,5 @@
{
_config+:: {
prometheusOperatorSelector: 'job="prometheus-operator"',
},
}

View File

@ -0,0 +1,2 @@
(import 'config.libsonnet') +
(import 'alerts/alerts.libsonnet')

View File

@ -15,7 +15,7 @@ local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
}, },
versions+:: { versions+:: {
prometheusOperator: 'v0.42.0', prometheusOperator: 'v0.42.1',
prometheusConfigReloader: self.prometheusOperator, prometheusConfigReloader: self.prometheusOperator,
configmapReloader: 'v0.4.0', configmapReloader: 'v0.4.0',
}, },

View File

@ -48,7 +48,7 @@
alert: 'NodeFilesystemAlmostOutOfSpace', alert: 'NodeFilesystemAlmostOutOfSpace',
expr: ||| expr: |||
( (
node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelector)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelector)s} * 100 < 5 node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelector)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelector)s} * 100 < %(fsSpaceAvailableCriticalThreshold)d
and and
node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s} == 0 node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s} == 0
) )
@ -58,7 +58,7 @@
severity: 'warning', severity: 'warning',
}, },
annotations: { annotations: {
summary: 'Filesystem has less than 5% space left.', summary: 'Filesystem has less than %(fsSpaceAvailableCriticalThreshold)d%% space left.' % $._config,
description: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.', description: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.',
}, },
}, },
@ -66,7 +66,7 @@
alert: 'NodeFilesystemAlmostOutOfSpace', alert: 'NodeFilesystemAlmostOutOfSpace',
expr: ||| expr: |||
( (
node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelector)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelector)s} * 100 < 3 node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelector)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelector)s} * 100 < %(fsSpaceAvailableWarningThreshold)d
and and
node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s} == 0 node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s} == 0
) )
@ -76,7 +76,7 @@
severity: '%(nodeCriticalSeverity)s' % $._config, severity: '%(nodeCriticalSeverity)s' % $._config,
}, },
annotations: { annotations: {
summary: 'Filesystem has less than 3% space left.', summary: 'Filesystem has less than %(fsSpaceAvailableWarningThreshold)d%% space left.' % $._config,
description: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.', description: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.',
}, },
}, },
@ -238,6 +238,8 @@
alert: 'NodeClockNotSynchronising', alert: 'NodeClockNotSynchronising',
expr: ||| expr: |||
min_over_time(node_timex_sync_status[5m]) == 0 min_over_time(node_timex_sync_status[5m]) == 0
and
node_timex_maxerror_seconds >= 16
||| % $._config, ||| % $._config,
'for': '10m', 'for': '10m',
labels: { labels: {

View File

@ -47,6 +47,11 @@
fsSpaceFillingUpWarningThreshold: 40, fsSpaceFillingUpWarningThreshold: 40,
fsSpaceFillingUpCriticalThreshold: 20, fsSpaceFillingUpCriticalThreshold: 20,
// Available disk space (%) thresholds on which to trigger the
// 'NodeFilesystemAlmostOutOfSpace' alerts.
fsSpaceAvailableCriticalThreshold: 5,
fsSpaceAvailableWarningThreshold: 3,
grafana_prefix: '', grafana_prefix: '',
}, },
} }

View File

@ -75,14 +75,15 @@ local gauge = promgrafonnet.gauge;
// TODO: It would be nicer to have a gauge that gets a 0-1 range and displays it as a percentage 0%-100%. // TODO: It would be nicer to have a gauge that gets a 0-1 range and displays it as a percentage 0%-100%.
// This needs to be added upstream in the promgrafonnet library and then changed here. // This needs to be added upstream in the promgrafonnet library and then changed here.
// NOTE: avg() is used to circumvent a label change caused by a node_exporter rollout.
local memoryGauge = gauge.new( local memoryGauge = gauge.new(
'Memory Usage', 'Memory Usage',
||| |||
100 - 100 -
( (
node_memory_MemAvailable_bytes{%(nodeExporterSelector)s, instance="$instance"} avg(node_memory_MemAvailable_bytes{%(nodeExporterSelector)s, instance="$instance"})
/ /
node_memory_MemTotal_bytes{%(nodeExporterSelector)s, instance="$instance"} avg(node_memory_MemTotal_bytes{%(nodeExporterSelector)s, instance="$instance"})
* 100 * 100
) )
||| % $._config, ||| % $._config,

1
monitoring/vendor/mixin vendored Symbolic link
View File

@ -0,0 +1 @@
github.com/prometheus-operator/prometheus-operator/jsonnet/mixin