update monitoring
continuous-integration/drone/push Build is passing Details

This commit is contained in:
Tobias Brunner 2020-10-15 21:02:11 +02:00
parent 14d548bc23
commit dd18c911bc
43 changed files with 619 additions and 413 deletions

View File

@ -18,8 +18,8 @@
"subdir": "Documentation/etcd-mixin"
}
},
"version": "528b01c327ee4abfd4afea29de9066c7f4b247fa",
"sum": "NhOkJWkO7ZO2DSE8Fvipcs7Hh2/GOCS0WjPPZU8OiaQ="
"version": "e42127658c910d91e7902be958f12d41ac33d54f",
"sum": "L+PGlPK9mykGCJ9TIoEWdhMBjz+9lKuQ4YZ8fOeP9sk="
},
{
"source": {
@ -28,8 +28,8 @@
"subdir": "grafonnet"
}
},
"version": "cc1626a1b4dee45c99b78ddd9714dfd5f5d7816e",
"sum": "nkgrtMYPCq/YB4r3mKyToepaLhicwWnxDdGIodPpzz0="
"version": "8d382c732dbdc839ff07549a3f42d25828f1b268",
"sum": "DRSRw4luAXlBXblo19/T1Jrv+9hyV8ivlS0KEtNANec="
},
{
"source": {
@ -38,7 +38,7 @@
"subdir": "grafana-builder"
}
},
"version": "2cc8d1dcb943eb3ff1dcb85bc9a3933afb36b730",
"version": "b5e45051995755ea373ea67642f8e5f54fcb8dd7",
"sum": "mD0zEP9FVFXeag7EaeS5OvUr2A9D6DQhGemoNn6+PLc="
},
{
@ -59,8 +59,8 @@
"subdir": ""
}
},
"version": "0bbe890539df0c1477000322c73977af71ef71e9",
"sum": "h48bpWnNFX9iN9Uqc9y0NTlKQu8sA1izvNyAHzsMIX8="
"version": "aa2adbcf39884fd9c85d7c3e0ff338b1d61ea1ba",
"sum": "ttkPUnv/5bqlOFcZ8fvp2wi/S7ZLKiqAZ4ZdTolX77M="
},
{
"source": {
@ -69,7 +69,7 @@
"subdir": "lib/promgrafonnet"
}
},
"version": "0bbe890539df0c1477000322c73977af71ef71e9",
"version": "aa2adbcf39884fd9c85d7c3e0ff338b1d61ea1ba",
"sum": "zv7hXGui6BfHzE9wPatHI/AGZa4A2WKo6pq7ZdqBsps="
},
{
@ -79,8 +79,8 @@
"subdir": "jsonnet/kube-state-metrics"
}
},
"version": "daf555f1e11ad6aa37852653e63baede5f99367e",
"sum": "cJjGZaLBjcIGrLHZLjRPU9c3KL+ep9rZTb9dbALSKqA="
"version": "35ef70bb74520a78cc8dc7cf364e1ff4e0c45063",
"sum": "ySP+bI2ZMLPt/sguSh9WrwI5H5dasaNFRE8Uo9PcZrI="
},
{
"source": {
@ -89,7 +89,7 @@
"subdir": "jsonnet/kube-state-metrics-mixin"
}
},
"version": "daf555f1e11ad6aa37852653e63baede5f99367e",
"version": "35ef70bb74520a78cc8dc7cf364e1ff4e0c45063",
"sum": "Yf8mNAHrV1YWzrdV8Ry5dJ8YblepTGw3C0Zp10XIYLo="
},
{
@ -99,8 +99,18 @@
"subdir": "jsonnet/kube-prometheus"
}
},
"version": "5fe45c57b60f17568001fd04a7dc2bb754fdf152",
"sum": "6Qrn74pNRqJNKYdsmcBu8ergYbMEH48qG1VDVm9FKak="
"version": "980e95de011319b88a3b9c0787a81dcdf338a898",
"sum": "BxOXyWCSc9KkgWJXDau2Xtsy3aOYZDHz2VqOSLga7VU="
},
{
"source": {
"git": {
"remote": "https://github.com/prometheus-operator/prometheus-operator",
"subdir": "jsonnet/mixin"
}
},
"version": "55baf034c431ed2c78d950b187f7d8b34dd06860",
"sum": "+Q45oBC7O8g7KQOaiKhGglwndAMWRlLTR94KUI8Q1Ko="
},
{
"source": {
@ -109,8 +119,8 @@
"subdir": "jsonnet/prometheus-operator"
}
},
"version": "96094ad1ab039950537df448b95bbcc04c57bfc4",
"sum": "ReamRYoS2C39Of7KtXGqkSWdfHw5Fy/Ix6ujOmBLFAg="
"version": "cd331ce9bb58bb926e391c6ae807621cb12cc29e",
"sum": "nM1eDP5vftqAeQSmVYzSBAh+lG0SN6zu46QiocQiVhk="
},
{
"source": {
@ -119,8 +129,8 @@
"subdir": "docs/node-mixin"
}
},
"version": "d8a1585f59ef1169837d08979ecc92dcea8aa58a",
"sum": "EE+C+Krf518EGLjA/x3ZvKfenCI0J7YuwFJVBscypRw="
"version": "f81747e608ea85ae44e76454eb63f9cb6484fb9e",
"sum": "VyMzZPxQIjiKQYGjZjXeKNWfLJ9vOl3emp84PWfsrUc="
},
{
"source": {

View File

@ -1,17 +0,0 @@
apiVersion: bitnami.com/v1alpha1
kind: SealedSecret
metadata:
creationTimestamp: null
name: alertmanager-tbrnt-config
namespace: monitoring
spec:
encryptedData:
alertmanager.yaml: AgATnPnwlvlfGVJeWhjcFHyp/am3nmguqi9PsWgEhxtVLuyDA6OB1G+BdJZ7dGdCViZGeDJD5mHxESSLDMTfxg5DxMDIG18XCzojMtRFTPJMZPLjbETNuSZqtrkbscp/qQom4z+igVuLlkaihdYRcCNV+B0vm+1h6BUPV8Utv1RN1dy9XUvdrvhPRNFvqhCpVcpcLwNP5cli5SNYgVc/ty6a45Fl5h+KLv7rFBJexLhUXoR0jamQpQWoH7oNHcS4ONHxLDMKXqE9jFpKzlQJBNgiRQEEotwCYTodoALmkcIs37Ai+trQxEMZZYtD5vFzbehfTtNLT1bPhLiX91rv6Q9n9wuIw951Qk11L6cF93zDl2mZ9dAQHSAglVHEriXKXBZ3Df4DSyh5qkr+/7lFBdFTQVMS5+YTgM1eCmG1yfsvU33IWKh5wrNhpkUqGLiq9f+4k3xPQVysVY3jJjVhINM/A9OsTPfFzm7aAAklBxuROXiZgZ/6L4Oc/c0Tv3EN/02rhinSGr1hIMmcaSxdOVQxXPU+pbx4JcSmyQFXIY37n/2ya/UbJW/o901MtigCukUvgMedkxPSGhedvHOygKKXPKNSl5U1Emhza7c6vP9cSiiaHpRm7EyUQvjWpJRUP7tSRgDlZyBM9Ud0PRBRdYWLG5YlZB4STOX6cDyYOcFJvnAyiZpDuwOPKMOrhWQSbCgcMcuS/RCgCnYJ4YBfm1cSxcqxsA65PRhXbRmiY9b/Mqs7s1xpJo3RySO27JiffbY+vYRIrFv4G4ak0ug9AQJvrvEA/ZgSs9xpASXSsr42pB63exhlZP+D9JEDGFLgzGQVnVFRFDrlYLFQieqWDgBc0pkcxHHwGBTp3H6PP5RRPLKzNoypTbNrdLlaNAsAlb8VuPopPBHqLcpU+DPuxxBCQ/P8ezYXE8RmpH7x4A2rPLbV902zYVwfnWMrUdPZif7oPnn/xM+VDQMLIlKA/CQySudzAYf621N450V2zE0akOFQNATGEUZT+8HtjNKMcAxjojP/pJpo486t02KXHDw+i04R1kdGPVkLnZDz+UShh4eLwovBL3zopxHxSFnonI1Ez+IetemM+aCJhadU4YNC4zY7x+blNa51ZAGEGoXuSJB3fszLd7wSz4owIhQnb+StxVNcwNgircvFjhauLVLwEynO4WTm+YKzosf2GxJAaNriveyLj1L+DwBUOeWnvmL4QsHX3nriZgVR89KlIQI1d3+lcf2jw8VqfHj3tbpMxb98qZFWw2pczrDeE2t4UQTPTj+4VK4htKGhnIHlNmdeSN39GJCWfilnzAsznrkxZsr1wIqrP+ayRO+NxljBzUN7xspByJeJrBpzW1pukvNg74MK1K7g0/fh/zmqyduQYmJnCDDqfN0PB1YoXFPZn3o4kzGnLXetgJHyJG4tsinediVXrZJb+6KOL31hEbZVArP/gWYHQv1MltEZj2yunuLeEy+Oo35oS/IkW17qF3gkF0sLavZFhJe6XqVvO2BFF1V8S15cBApXXPvFJQWyFLOTVqhYicYfJqQEgz7sKH1uYK0zLeyBkRIfUXk3vs7X9X/CzQo9J9oH5FCgNEI6GdROhitGUihCxwAVa6lsteVxZZ4USkMGZtJPG3Pi5RQuWpSMFX9nWm/LHv8wpgFItx12ZkuaIDB30wYfp4SqcBr1ZvuptKtwJpTUjVx5HSUKnEye2g==
template:
metadata:
creationTimestamp: null
name: alertmanager-tbrnt-config
namespace: monitoring
type: Opaque
status: {}

View File

@ -144,6 +144,7 @@ items:
"decimals": 3,
"description": "How much error budget is left looking at our 0.990% availability guarantees?",
"fill": 10,
"fillGradient": 0,
"gridPos": {
},
@ -338,6 +339,7 @@ items:
"datasource": "$datasource",
"description": "How many read requests (LIST,GET) per second do the apiservers get by code?",
"fill": 10,
"fillGradient": 0,
"gridPos": {
},
@ -446,6 +448,7 @@ items:
"datasource": "$datasource",
"description": "How many percent of read requests (LIST,GET) per second are returned with errors (5xx)?",
"fill": 1,
"fillGradient": 0,
"gridPos": {
},
@ -539,6 +542,7 @@ items:
"datasource": "$datasource",
"description": "How many seconds is the 99th percentile for reading (LIST|GET) a given resource?",
"fill": 1,
"fillGradient": 0,
"gridPos": {
},
@ -731,6 +735,7 @@ items:
"datasource": "$datasource",
"description": "How many write requests (POST|PUT|PATCH|DELETE) per second do the apiservers get by code?",
"fill": 10,
"fillGradient": 0,
"gridPos": {
},
@ -839,6 +844,7 @@ items:
"datasource": "$datasource",
"description": "How many percent of write requests (POST|PUT|PATCH|DELETE) per second are returned with errors (5xx)?",
"fill": 1,
"fillGradient": 0,
"gridPos": {
},
@ -932,6 +938,7 @@ items:
"datasource": "$datasource",
"description": "How many seconds is the 99th percentile for writing (POST|PUT|PATCH|DELETE) a given resource?",
"fill": 1,
"fillGradient": 0,
"gridPos": {
},
@ -1037,6 +1044,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"fillGradient": 0,
"gridPos": {
},
@ -1129,6 +1137,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"fillGradient": 0,
"gridPos": {
},
@ -1221,6 +1230,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"fillGradient": 0,
"gridPos": {
},
@ -1326,6 +1336,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"fillGradient": 0,
"gridPos": {
},
@ -1418,6 +1429,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"fillGradient": 0,
"gridPos": {
},
@ -1510,6 +1522,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"fillGradient": 0,
"gridPos": {
},
@ -1780,6 +1793,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 2,
"fillGradient": 0,
"gridPos": {
"h": 9,
"w": 12,
@ -1882,6 +1896,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 2,
"fillGradient": 0,
"gridPos": {
"h": 9,
"w": 12,
@ -2325,6 +2340,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 2,
"fillGradient": 0,
"gridPos": {
"h": 9,
"w": 12,
@ -2427,6 +2443,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 2,
"fillGradient": 0,
"gridPos": {
"h": 9,
"w": 12,
@ -2559,6 +2576,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 2,
"fillGradient": 0,
"gridPos": {
"h": 9,
"w": 24,
@ -2659,6 +2677,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 2,
"fillGradient": 0,
"gridPos": {
"h": 9,
"w": 24,
@ -2770,6 +2789,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 2,
"fillGradient": 0,
"gridPos": {
"h": 9,
"w": 24,
@ -2870,6 +2890,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 2,
"fillGradient": 0,
"gridPos": {
"h": 9,
"w": 24,
@ -2990,6 +3011,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 2,
"fillGradient": 0,
"gridPos": {
"h": 9,
"w": 24,
@ -3090,6 +3112,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 2,
"fillGradient": 0,
"gridPos": {
"h": 9,
"w": 24,
@ -3190,6 +3213,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 2,
"fillGradient": 0,
"gridPos": {
"h": 9,
"w": 24,
@ -3294,6 +3318,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 2,
"fillGradient": 0,
"gridPos": {
"h": 9,
"w": 24,
@ -3668,6 +3693,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"fillGradient": 0,
"gridPos": {
},
@ -3773,6 +3799,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"fillGradient": 0,
"gridPos": {
},
@ -3878,6 +3905,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"fillGradient": 0,
"gridPos": {
},
@ -3983,6 +4011,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"fillGradient": 0,
"gridPos": {
},
@ -4096,6 +4125,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"fillGradient": 0,
"gridPos": {
},
@ -4201,6 +4231,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"fillGradient": 0,
"gridPos": {
},
@ -4306,6 +4337,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"fillGradient": 0,
"gridPos": {
},
@ -4398,6 +4430,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"fillGradient": 0,
"gridPos": {
},
@ -4490,6 +4523,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"fillGradient": 0,
"gridPos": {
},
@ -16978,6 +17012,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"fillGradient": 0,
"gridPos": {
},
@ -17070,6 +17105,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"fillGradient": 0,
"gridPos": {
},
@ -17175,6 +17211,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"fillGradient": 0,
"gridPos": {
},
@ -17280,6 +17317,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"fillGradient": 0,
"gridPos": {
},
@ -17379,6 +17417,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"fillGradient": 0,
"gridPos": {
},
@ -17491,6 +17530,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"fillGradient": 0,
"gridPos": {
},
@ -17585,6 +17625,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"fillGradient": 0,
"gridPos": {
},
@ -17692,6 +17733,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"fillGradient": 0,
"gridPos": {
},
@ -17799,6 +17841,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"fillGradient": 0,
"gridPos": {
},
@ -17891,6 +17934,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"fillGradient": 0,
"gridPos": {
},
@ -17997,6 +18041,7 @@ items:
"datasource": "$datasource",
"description": "Pod lifecycle event generator",
"fill": 1,
"fillGradient": 0,
"gridPos": {
},
@ -18089,6 +18134,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"fillGradient": 0,
"gridPos": {
},
@ -18194,6 +18240,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"fillGradient": 0,
"gridPos": {
},
@ -18299,6 +18346,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"fillGradient": 0,
"gridPos": {
},
@ -18425,6 +18473,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"fillGradient": 0,
"gridPos": {
},
@ -18530,6 +18579,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"fillGradient": 0,
"gridPos": {
},
@ -18622,6 +18672,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"fillGradient": 0,
"gridPos": {
},
@ -18714,6 +18765,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"fillGradient": 0,
"gridPos": {
},
@ -19527,6 +19579,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 2,
"fillGradient": 0,
"gridPos": {
"h": 9,
"w": 12,
@ -19627,6 +19680,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 2,
"fillGradient": 0,
"gridPos": {
"h": 9,
"w": 12,
@ -19738,6 +19792,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 2,
"fillGradient": 0,
"gridPos": {
"h": 10,
"w": 12,
@ -19838,6 +19893,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 2,
"fillGradient": 0,
"gridPos": {
"h": 10,
"w": 12,
@ -19958,6 +20014,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 2,
"fillGradient": 0,
"gridPos": {
"h": 10,
"w": 12,
@ -20058,6 +20115,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 2,
"fillGradient": 0,
"gridPos": {
"h": 10,
"w": 12,
@ -20400,6 +20458,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 2,
"fillGradient": 0,
"gridPos": {
"h": 9,
"w": 12,
@ -20502,6 +20561,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 2,
"fillGradient": 0,
"gridPos": {
"h": 9,
"w": 12,
@ -20945,6 +21005,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 2,
"fillGradient": 0,
"gridPos": {
"h": 9,
"w": 12,
@ -21047,6 +21108,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 2,
"fillGradient": 0,
"gridPos": {
"h": 9,
"w": 12,
@ -21179,6 +21241,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 2,
"fillGradient": 0,
"gridPos": {
"h": 9,
"w": 12,
@ -21279,6 +21342,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 2,
"fillGradient": 0,
"gridPos": {
"h": 9,
"w": 12,
@ -21390,6 +21454,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 2,
"fillGradient": 0,
"gridPos": {
"h": 9,
"w": 12,
@ -21490,6 +21555,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 2,
"fillGradient": 0,
"gridPos": {
"h": 9,
"w": 12,
@ -21610,6 +21676,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 2,
"fillGradient": 0,
"gridPos": {
"h": 9,
"w": 12,
@ -21710,6 +21777,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 2,
"fillGradient": 0,
"gridPos": {
"h": 9,
"w": 12,
@ -23983,6 +24051,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"fillGradient": 0,
"gridPos": {
},
@ -24076,6 +24145,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 0,
"fillGradient": 0,
"gridPos": {
},
@ -24202,6 +24272,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"fillGradient": 0,
"gridPos": {
},
@ -24367,7 +24438,7 @@ items:
"tableColumn": "",
"targets": [
{
"expr": "100 -\n(\n node_memory_MemAvailable_bytes{job=\"node-exporter\", instance=\"$instance\"}\n/\n node_memory_MemTotal_bytes{job=\"node-exporter\", instance=\"$instance\"}\n* 100\n)\n",
"expr": "100 -\n(\n avg(node_memory_MemAvailable_bytes{job=\"node-exporter\", instance=\"$instance\"})\n/\n avg(node_memory_MemTotal_bytes{job=\"node-exporter\", instance=\"$instance\"})\n* 100\n)\n",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "",
@ -24412,6 +24483,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 0,
"fillGradient": 0,
"gridPos": {
},
@ -24528,6 +24600,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"fillGradient": 0,
"gridPos": {
},
@ -24647,6 +24720,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 0,
"fillGradient": 0,
"gridPos": {
},
@ -24740,6 +24814,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 0,
"fillGradient": 0,
"gridPos": {
},
@ -24961,6 +25036,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"fillGradient": 0,
"gridPos": {
},
@ -25157,6 +25233,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"fillGradient": 0,
"gridPos": {
},
@ -25819,6 +25896,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 2,
"fillGradient": 0,
"gridPos": {
"h": 9,
"w": 12,
@ -25919,6 +25997,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 2,
"fillGradient": 0,
"gridPos": {
"h": 9,
"w": 12,
@ -26030,6 +26109,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 2,
"fillGradient": 0,
"gridPos": {
"h": 10,
"w": 12,
@ -26130,6 +26210,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 2,
"fillGradient": 0,
"gridPos": {
"h": 10,
"w": 12,
@ -26250,6 +26331,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 2,
"fillGradient": 0,
"gridPos": {
"h": 10,
"w": 12,
@ -26350,6 +26432,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 2,
"fillGradient": 0,
"gridPos": {
"h": 10,
"w": 12,
@ -26700,6 +26783,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"fillGradient": 0,
"gridPos": {
},
@ -26792,6 +26876,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"fillGradient": 0,
"gridPos": {
},
@ -26897,6 +26982,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"fillGradient": 0,
"gridPos": {
},
@ -27002,6 +27088,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"fillGradient": 0,
"gridPos": {
},
@ -27095,6 +27182,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"fillGradient": 0,
"gridPos": {
},
@ -27187,6 +27275,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"fillGradient": 0,
"gridPos": {
},
@ -27279,6 +27368,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"fillGradient": 0,
"gridPos": {
},
@ -27384,6 +27474,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"fillGradient": 0,
"gridPos": {
},
@ -27476,6 +27567,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"fillGradient": 0,
"gridPos": {
},
@ -27581,6 +27673,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"fillGradient": 0,
"gridPos": {
},
@ -27673,6 +27766,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"fillGradient": 0,
"gridPos": {
},
@ -27778,6 +27872,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"fillGradient": 0,
"gridPos": {
},
@ -27870,6 +27965,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"fillGradient": 0,
"gridPos": {
},
@ -27962,6 +28058,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"fillGradient": 0,
"gridPos": {
},
@ -28054,6 +28151,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"fillGradient": 0,
"gridPos": {
},
@ -29634,6 +29732,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"fillGradient": 0,
"gridPos": {
},
@ -29726,6 +29825,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"fillGradient": 0,
"gridPos": {
},
@ -29831,6 +29931,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"fillGradient": 0,
"gridPos": {
},
@ -29923,6 +30024,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"fillGradient": 0,
"gridPos": {
},
@ -30028,6 +30130,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"fillGradient": 0,
"gridPos": {
},
@ -30141,6 +30244,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"fillGradient": 0,
"gridPos": {
},
@ -30246,6 +30350,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"fillGradient": 0,
"gridPos": {
},
@ -30351,6 +30456,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"fillGradient": 0,
"gridPos": {
},
@ -30443,6 +30549,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"fillGradient": 0,
"gridPos": {
},
@ -30535,6 +30642,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"fillGradient": 0,
"gridPos": {
},
@ -30839,6 +30947,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"fillGradient": 0,
"gridPos": {
},
@ -30952,6 +31061,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"fillGradient": 0,
"gridPos": {
},
@ -31078,6 +31188,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"fillGradient": 0,
"gridPos": {
},
@ -31191,6 +31302,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"fillGradient": 0,
"gridPos": {
},
@ -31296,6 +31408,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"fillGradient": 0,
"gridPos": {
},
@ -31401,6 +31514,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"fillGradient": 0,
"gridPos": {
},
@ -31493,6 +31607,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"fillGradient": 0,
"gridPos": {
},
@ -31585,6 +31700,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"fillGradient": 0,
"gridPos": {
},
@ -32417,6 +32533,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"fillGradient": 0,
"gridPos": {
},
@ -33529,6 +33646,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 2,
"fillGradient": 0,
"gridPos": {
"h": 9,
"w": 12,
@ -33631,6 +33749,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 2,
"fillGradient": 0,
"gridPos": {
"h": 9,
"w": 12,
@ -33744,6 +33863,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 2,
"fillGradient": 0,
"gridPos": {
"h": 9,
"w": 12,
@ -33846,6 +33966,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 2,
"fillGradient": 0,
"gridPos": {
"h": 9,
"w": 12,
@ -33978,6 +34099,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 2,
"fillGradient": 0,
"gridPos": {
"h": 9,
"w": 12,
@ -34078,6 +34200,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 2,
"fillGradient": 0,
"gridPos": {
"h": 9,
"w": 12,
@ -34189,6 +34312,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 2,
"fillGradient": 0,
"gridPos": {
"h": 9,
"w": 12,
@ -34289,6 +34413,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 2,
"fillGradient": 0,
"gridPos": {
"h": 9,
"w": 12,
@ -34409,6 +34534,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 2,
"fillGradient": 0,
"gridPos": {
"h": 9,
"w": 12,
@ -34509,6 +34635,7 @@ items:
"dashes": false,
"datasource": "$datasource",
"fill": 2,
"fillGradient": 0,
"gridPos": {
"h": 9,
"w": 12,

View File

@ -1,31 +0,0 @@
apiVersion: batch/v1beta1
kind: CronJob
metadata:
name: healthchecks-io
namespace: monitoring
spec:
schedule: "*/1 * * * *"
concurrencyPolicy: Forbid
successfulJobsHistoryLimit: 1
failedJobsHistoryLimit: 1
startingDeadlineSeconds: 200
jobTemplate:
spec:
template:
spec:
containers:
- name: pinghc
env:
- name: HCURL
valueFrom:
secretKeyRef:
name: healthchecks-io
key: HCURL
image: busybox
args:
- /bin/sh
- -c
- "date && echo $HCURL && /bin/wget -q -O - --no-check-certificate $HCURL"
restartPolicy: OnFailure

View File

@ -1,17 +0,0 @@
apiVersion: bitnami.com/v1alpha1
kind: SealedSecret
metadata:
creationTimestamp: null
name: healthchecks-io
namespace: monitoring
spec:
encryptedData:
HCURL: AgBEpwET1Qa1hQqAmwrNGBv4sL0ml8pGYPwgq9Aps3tYhBVqsXjV7U5RQa/txldg1umw2Zqx8MfvZTN2kmFk6bJTROCWqTxmxd4rHgnJYqRR0+Opn/BtDhVx4WTnehyM/il9ymddhMD+WRQDr/Wfxq/0UQdsy+IEYyVMQuOKEihZabxmXRyNeAl5ZBeQ0W1T29biJPx3rifS37RbGlJtCIYuNPh82d0KAMu1dszDnkln8k5CBv6mPD8BVHg+Z/y1v1jFhTIE3YOlGzCIjb8RrJj6MVm7zlauj8zrl30JvF2OAWDGGZDOL3b0G3IKd0Qp/eagT33Sx7vbppY/l1Vci6UQcVpde3u2+ATMbysRej04Mvcodq5OgkBFqbgCzx0UFTIq0wER/GuCoYbt+k8b3TouK5ChQet8EP0W/c7rLHcMY3c0UR00N7m5UeKZAzAkXSGV+u3M9K6PMp8pl0VuDo+IVgEIY7ku9rtzL7SPIfXS4u5w7fte13fOtKB/2sa11dNqAbHmidF+IO6ycjm8SZibC7NKyCxgIKWPfsFXhNUT2Nx7eBRrzR1QlqThIGRsDpX1RVplTwe/OLsBz0K99AyGDUkSBJdOZLaRT/b3T0nS8DE5x/e8MvFsbbDdGE2U/YhVrbfn072u/X979/RIm0oCjipvByZXhFmobRj9SP9RcK2UfjBSY7xyKnd2rjj1mnIs2S0CmwGFdJqoywHckJJOu3YP2oN2Q1U7+Fe4yciupAshgdszY2okHMtd4aDDJJKeKKFHpjpsuA==
template:
metadata:
creationTimestamp: null
name: healthchecks-io
namespace: monitoring
type: Opaque
status: {}

View File

@ -1,40 +0,0 @@
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: k8up
labels:
prometheus: k8s
role: alert-rules
spec:
groups:
- name: k8up.rules
rules:
- alert: baas_last_errors
expr: baas_backup_restic_last_errors > 0
for: 1m
labels:
severity: critical
annotations:
summary: Amount of errors of last restic backup
description: This alert is fired when error number is > 0
- alert: K8upBackupFailed
expr: rate(k8up_jobs_failed_counter[1d]) > 0
for: 1m
labels:
severity: critical
annotations:
summary: "Job in {{ $labels.namespace }} of type {{ $labels.jobType }} failed"
- alert: K8upBackupNotRunning
expr: sum(rate(k8up_jobs_total[25h])) == 0 and on(namespace) k8up_schedules_gauge > 0
for: 1m
labels:
severity: critical
annotations:
summary: "No K8up jobs were run in {{ $labels.namespace }} within the last 24 hours. Check the operator, there might be a deadlock"
- alert: K8upJobStuck
expr: k8up_jobs_queued_gauge{jobType="backup"} > 0 and on(namespace) k8up_schedules_gauge > 0
for: 24h
labels:
severity: critical
annotations:
summary: "K8up jobs are stuck in {{ $labels.namespace }} for the last 24 hours."

View File

@ -30,7 +30,6 @@ rules:
- daemonsets
- deployments
- replicasets
- ingresses
verbs:
- list
- watch
@ -105,6 +104,7 @@ rules:
- networking.k8s.io
resources:
- networkpolicies
- ingresses
verbs:
- list
- watch

View File

@ -3,7 +3,7 @@ kind: DaemonSet
metadata:
labels:
app.kubernetes.io/name: node-exporter
app.kubernetes.io/version: v0.18.1
app.kubernetes.io/version: v1.0.1
name: node-exporter
namespace: monitoring
spec:
@ -14,7 +14,7 @@ spec:
metadata:
labels:
app.kubernetes.io/name: node-exporter
app.kubernetes.io/version: v0.18.1
app.kubernetes.io/version: v1.0.1
spec:
containers:
- args:
@ -25,7 +25,7 @@ spec:
- --no-collector.wifi
- --no-collector.hwmon
- --collector.filesystem.ignored-mount-points=^/(dev|proc|sys|var/lib/docker/.+|var/lib/kubelet/pods/.+)($|/)
image: quay.io/prometheus/node-exporter:v0.18.1
image: quay.io/prometheus/node-exporter:v1.0.1
name: node-exporter
resources:
limits:
@ -36,11 +36,13 @@ spec:
memory: 180Mi
volumeMounts:
- mountPath: /host/proc
mountPropagation: HostToContainer
name: proc
readOnly: false
readOnly: true
- mountPath: /host/sys
mountPropagation: HostToContainer
name: sys
readOnly: false
readOnly: true
- mountPath: /host/root
mountPropagation: HostToContainer
name: root

View File

@ -3,7 +3,7 @@ kind: Service
metadata:
labels:
app.kubernetes.io/name: node-exporter
app.kubernetes.io/version: v0.18.1
app.kubernetes.io/version: v1.0.1
name: node-exporter
namespace: monitoring
spec:

View File

@ -3,7 +3,7 @@ kind: ServiceMonitor
metadata:
labels:
app.kubernetes.io/name: node-exporter
app.kubernetes.io/version: v0.18.1
app.kubernetes.io/version: v1.0.1
name: node-exporter
namespace: monitoring
spec:

View File

@ -4,7 +4,7 @@ metadata:
labels:
app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/version: v0.42.0
app.kubernetes.io/version: v0.42.1
name: prometheus-operator
namespace: monitoring
spec:
@ -19,4 +19,4 @@ spec:
matchLabels:
app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/version: v0.42.0
app.kubernetes.io/version: v0.42.1

View File

@ -1019,6 +1019,8 @@ spec:
summary: Clock not synchronising.
expr: |
min_over_time(node_timex_sync_status[5m]) == 0
and
node_timex_maxerror_seconds >= 16
for: 10m
labels:
severity: warning
@ -1044,6 +1046,75 @@ spec:
node_md_disks{state="fail"} > 0
labels:
severity: warning
- name: prometheus-operator
rules:
- alert: PrometheusOperatorListErrors
annotations:
description: Errors while performing List operations in controller {{$labels.controller}}
in {{$labels.namespace}} namespace.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatorlisterrors
summary: Errors while performing list operations in controller.
expr: |
(sum by (controller,namespace) (rate(prometheus_operator_list_operations_failed_total{job="prometheus-operator",namespace="monitoring"}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_list_operations_total{job="prometheus-operator",namespace="monitoring"}[10m]))) > 0.4
for: 15m
labels:
severity: warning
- alert: PrometheusOperatorWatchErrors
annotations:
description: Errors while performing watch operations in controller {{$labels.controller}}
in {{$labels.namespace}} namespace.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatorwatcherrors
summary: Errors while performing watch operations in controller.
expr: |
(sum by (controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job="prometheus-operator",namespace="monitoring"}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_watch_operations_total{job="prometheus-operator",namespace="monitoring"}[10m]))) > 0.4
for: 15m
labels:
severity: warning
- alert: PrometheusOperatorSyncFailed
annotations:
description: Controller {{ $labels.controller }} in {{ $labels.namespace }}
namespace fails to reconcile {{ $value }} objects.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatorsyncfailed
summary: Last controller reconciliation failed
expr: |
min_over_time(prometheus_operator_syncs{status="failed",job="prometheus-operator",namespace="monitoring"}[5m]) > 0
for: 10m
labels:
severity: warning
- alert: PrometheusOperatorReconcileErrors
annotations:
description: '{{ $value | humanizePercentage }} of reconciling operations
failed for {{ $labels.controller }} controller in {{ $labels.namespace }}
namespace.'
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatorreconcileerrors
summary: Errors while reconciling controller.
expr: |
(sum by (controller,namespace) (rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator",namespace="monitoring"}[5m]))) / (sum by (controller,namespace) (rate(prometheus_operator_reconcile_operations_total{job="prometheus-operator",namespace="monitoring"}[5m]))) > 0.1
for: 10m
labels:
severity: warning
- alert: PrometheusOperatorNodeLookupErrors
annotations:
description: Errors while reconciling Prometheus in {{ $labels.namespace }}
Namespace.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatornodelookuperrors
summary: Errors while reconciling Prometheus.
expr: |
rate(prometheus_operator_node_address_lookup_errors_total{job="prometheus-operator",namespace="monitoring"}[5m]) > 0.1
for: 10m
labels:
severity: warning
- alert: PrometheusOperatorNotReady
annotations:
description: Prometheus operator in {{ $labels.namespace }} namespace isn't
ready to reconcile {{ $labels.controller }} resources.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatornotready
summary: Prometheus operator not ready
expr: |
min by(namespace, controller) (max_over_time(prometheus_operator_ready{job="prometheus-operator",namespace="monitoring"}[5m]) == 0)
for: 5m
labels:
severity: warning
- name: kubernetes-apps
rules:
- alert: KubePodCrashLooping
@ -1249,7 +1320,7 @@ spec:
- alert: KubeJobFailed
annotations:
description: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to
complete.
complete. Removing failed job after investigation should clear this alert.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobfailed
summary: Job failed to complete.
expr: |
@ -2031,40 +2102,3 @@ spec:
for: 2m
labels:
severity: warning
- name: prometheus-operator
rules:
- alert: PrometheusOperatorListErrors
annotations:
message: Errors while performing List operations in controller {{$labels.controller}}
in {{$labels.namespace}} namespace.
expr: |
(sum by (controller,namespace) (rate(prometheus_operator_list_operations_failed_total{job="prometheus-operator",namespace="monitoring"}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_list_operations_total{job="prometheus-operator",namespace="monitoring"}[10m]))) > 0.4
for: 15m
labels:
severity: warning
- alert: PrometheusOperatorWatchErrors
annotations:
message: Errors while performing Watch operations in controller {{$labels.controller}}
in {{$labels.namespace}} namespace.
expr: |
(sum by (controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job="prometheus-operator",namespace="monitoring"}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_watch_operations_total{job="prometheus-operator",namespace="monitoring"}[10m]))) > 0.4
for: 15m
labels:
severity: warning
- alert: PrometheusOperatorReconcileErrors
annotations:
message: Errors while reconciling {{ $labels.controller }} in {{ $labels.namespace
}} Namespace.
expr: |
rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator",namespace="monitoring"}[5m]) > 0.1
for: 10m
labels:
severity: warning
- alert: PrometheusOperatorNodeLookupErrors
annotations:
message: Errors while reconciling Prometheus in {{ $labels.namespace }} Namespace.
expr: |
rate(prometheus_operator_node_address_lookup_errors_total{job="prometheus-operator",namespace="monitoring"}[5m]) > 0.1
for: 10m
labels:
severity: warning

View File

@ -53,6 +53,7 @@ spec:
insecureSkipVerify: true
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
honorLabels: true
honorTimestamps: false
interval: 30s
metricRelabelings:
- action: drop

View File

@ -4,7 +4,7 @@ metadata:
labels:
app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/version: v0.42.0
app.kubernetes.io/version: v0.42.1
name: prometheus-operator
rules:
- apiGroups:

View File

@ -4,7 +4,7 @@ metadata:
labels:
app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/version: v0.42.0
app.kubernetes.io/version: v0.42.1
name: prometheus-operator
roleRef:
apiGroup: rbac.authorization.k8s.io

View File

@ -4,7 +4,7 @@ metadata:
labels:
app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/version: v0.42.0
app.kubernetes.io/version: v0.42.1
name: prometheus-operator
namespace: monitoring
spec:
@ -18,15 +18,15 @@ spec:
labels:
app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/version: v0.42.0
app.kubernetes.io/version: v0.42.1
spec:
containers:
- args:
- --kubelet-service=kube-system/kubelet
- --logtostderr=true
- --config-reloader-image=jimmidyson/configmap-reload:v0.4.0
- --prometheus-config-reloader=quay.io/prometheus-operator/prometheus-config-reloader:v0.42.0
image: quay.io/prometheus-operator/prometheus-operator:v0.42.0
- --prometheus-config-reloader=quay.io/prometheus-operator/prometheus-config-reloader:v0.42.1
image: quay.io/prometheus-operator/prometheus-operator:v0.42.1
name: prometheus-operator
ports:
- containerPort: 8080

View File

@ -4,7 +4,7 @@ metadata:
labels:
app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/version: v0.42.0
app.kubernetes.io/version: v0.42.1
name: prometheus-operator
namespace: monitoring
spec:

View File

@ -4,6 +4,6 @@ metadata:
labels:
app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/version: v0.42.0
app.kubernetes.io/version: v0.42.1
name: prometheus-operator
namespace: monitoring

View File

@ -1,17 +0,0 @@
---
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: traefik
namespace: monitoring
spec:
endpoints:
- interval: 30s
path: /metrics
port: metrics
namespaceSelector:
matchNames:
- kube-system
selector:
matchLabels:
app: traefik

View File

@ -34,7 +34,8 @@
severity: 'critical',
},
annotations: {
message: 'etcd cluster "{{ $labels.job }}": members are down ({{ $value }}).',
description: 'etcd cluster "{{ $labels.job }}": members are down ({{ $value }}).',
summary: 'etcd cluster members are down.',
},
},
{
@ -47,7 +48,8 @@
severity: 'critical',
},
annotations: {
message: 'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value }}).',
description: 'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value }}).',
summary: 'etcd cluster has insufficient number of members.',
},
},
{
@ -60,7 +62,8 @@
severity: 'critical',
},
annotations: {
message: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance }} has no leader.',
description: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance }} has no leader.',
summary: 'etcd cluster has no leader.',
},
},
{
@ -73,7 +76,8 @@
severity: 'warning',
},
annotations: {
message: 'etcd cluster "{{ $labels.job }}": {{ $value }} leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.',
description: 'etcd cluster "{{ $labels.job }}": {{ $value }} leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.',
summary: 'etcd cluster has high number of leader changes.',
},
},
{
@ -89,7 +93,8 @@
severity: 'warning',
},
annotations: {
message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.',
description: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.',
summary: 'etcd cluster has high number of failed grpc requests.',
},
},
{
@ -105,7 +110,8 @@
severity: 'critical',
},
annotations: {
message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.',
description: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.',
summary: 'etcd cluster has high number of failed grpc requests.',
},
},
{
@ -119,7 +125,8 @@
severity: 'critical',
},
annotations: {
message: 'etcd cluster "{{ $labels.job }}": gRPC requests to {{ $labels.grpc_method }} are taking {{ $value }}s on etcd instance {{ $labels.instance }}.',
description: 'etcd cluster "{{ $labels.job }}": gRPC requests to {{ $labels.grpc_method }} are taking {{ $value }}s on etcd instance {{ $labels.instance }}.',
summary: 'etcd grpc requests are slow',
},
},
{
@ -133,7 +140,8 @@
severity: 'warning',
},
annotations: {
message: 'etcd cluster "{{ $labels.job }}": member communication with {{ $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance }}.',
description: 'etcd cluster "{{ $labels.job }}": member communication with {{ $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance }}.',
summary: 'etcd cluster member communication is slow.',
},
},
{
@ -146,7 +154,8 @@
severity: 'warning',
},
annotations: {
message: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures within the last 30 minutes on etcd instance {{ $labels.instance }}.',
description: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures within the last 30 minutes on etcd instance {{ $labels.instance }}.',
summary: 'etcd cluster has high number of proposal failures.',
},
},
{
@ -159,6 +168,21 @@
labels: {
severity: 'warning',
},
annotations: {
description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.',
summary: 'etcd cluster 99th percentile fsync durations are too high.',
},
},
{
alert: 'etcdHighFsyncDurations',
expr: |||
histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{%(etcd_selector)s}[5m]))
> 1
||| % $._config,
'for': '10m',
labels: {
severity: 'critical',
},
annotations: {
message: 'etcd cluster "{{ $labels.job }}": 99th percentile fync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.',
},
@ -174,7 +198,8 @@
severity: 'warning',
},
annotations: {
message: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations {{ $value }}s on etcd instance {{ $labels.instance }}.',
description: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations {{ $value }}s on etcd instance {{ $labels.instance }}.',
summary: 'etcd cluster 99th percentile commit durations are too high.',
},
},
{
@ -188,7 +213,8 @@
severity: 'warning',
},
annotations: {
message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}',
description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}',
summary: 'etcd has high number of failed HTTP requests.',
},
},
{
@ -202,7 +228,8 @@
severity: 'critical',
},
annotations: {
message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}.',
description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}.',
summary: 'etcd has high number of failed HTTP requests.',
},
},
{
@ -216,9 +243,36 @@
severity: 'warning',
},
annotations: {
message: 'etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow.',
description: 'etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow.',
summary: 'etcd instance HTTP requests are slow.',
},
},
// Critical alert: etcd database size, expressed as a percentage of the
// configured backend quota, has been above 95 for 10 minutes.
// NOTE(review): the expression contains no %(...)s placeholder, so the
// `% $._config` formatting substitutes nothing; unlike etcdHighFsyncDurations
// in this file it is not restricted by etcd_selector - confirm this is intended.
// NOTE(review): this rule still uses the `message` annotation while the
// neighbouring rules use `description` + `summary` - confirm before renaming,
// since alert templates may read `message`.
{
alert: 'etcdBackendQuotaLowSpace',
expr: |||
(etcd_mvcc_db_total_size_in_bytes/etcd_server_quota_backend_bytes)*100 > 95
||| % $._config,
'for': '10m',
labels: {
severity: 'critical',
},
annotations: {
message: 'etcd cluster "{{ $labels.job }}": database size exceeds the defined quota on etcd instance {{ $labels.instance }}, please defrag or increase the quota as the writes to etcd will be disabled when it is full.',
},
},
// Warning alert: the quota-usage percentage (db size / backend quota * 100)
// has increased by more than 50 over a 240-minute subquery window sampled at
// 1-minute resolution, sustained for 10 minutes.
// NOTE(review): the expression contains no %(...)s placeholder, so the
// `% $._config` formatting substitutes nothing - confirm this is intended.
// NOTE(review): this rule uses the `message` annotation while the neighbouring
// rules use `description` + `summary`; the unit-test expectation for this alert
// also appears to omit the "on etcd instance ..." clause - verify both.
{
alert: 'etcdExcessiveDatabaseGrowth',
expr: |||
increase(((etcd_mvcc_db_total_size_in_bytes/etcd_server_quota_backend_bytes)*100)[240m:1m]) > 50
||| % $._config,
'for': '10m',
labels: {
severity: 'warning',
},
annotations: {
message: 'etcd cluster "{{ $labels.job }}": Observed surge in etcd writes leading to 50% increase in database size over the past four hours on etcd instance {{ $labels.instance }}, please check as it might be disruptive.',
},
},
],
},
],

View File

@ -26,7 +26,8 @@ tests:
job: etcd
severity: critical
exp_annotations:
message: 'etcd cluster "etcd": members are down (3).'
description: 'etcd cluster "etcd": members are down (3).'
summary: 'etcd cluster members are down.'
- eval_time: 7m
alertname: etcdInsufficientMembers
- eval_time: 11m
@ -36,7 +37,8 @@ tests:
job: etcd
severity: critical
exp_annotations:
message: 'etcd cluster "etcd": insufficient members (1).'
description: 'etcd cluster "etcd": insufficient members (1).'
summary: 'etcd cluster has insufficient number of members.'
- eval_time: 15m
alertname: etcdInsufficientMembers
exp_alerts:
@ -44,7 +46,8 @@ tests:
job: etcd
severity: critical
exp_annotations:
message: 'etcd cluster "etcd": insufficient members (0).'
description: 'etcd cluster "etcd": insufficient members (0).'
summary: 'etcd cluster has insufficient number of members.'
- interval: 1m
input_series:
@ -62,7 +65,8 @@ tests:
job: etcd
severity: critical
exp_annotations:
message: 'etcd cluster "etcd": members are down (3).'
description: 'etcd cluster "etcd": members are down (3).'
summary: 'etcd cluster members are down.'
- interval: 1m
input_series:
@ -80,7 +84,8 @@ tests:
job: etcd
severity: critical
exp_annotations:
message: 'etcd cluster "etcd": members are down (1).'
description: 'etcd cluster "etcd": members are down (1).'
summary: 'etcd cluster members are down.'
- interval: 1m
input_series:
- series: 'etcd_server_leader_changes_seen_total{job="etcd",instance="10.10.10.0"}'
@ -97,7 +102,8 @@ tests:
job: etcd
severity: warning
exp_annotations:
message: 'etcd cluster "etcd": 4 leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.'
description: 'etcd cluster "etcd": 4 leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.'
summary: 'etcd cluster has high number of leader changes.'
- interval: 1m
input_series:
- series: 'etcd_server_leader_changes_seen_total{job="etcd",instance="10.10.10.0"}'
@ -110,4 +116,20 @@ tests:
- eval_time: 10m
alertname: etcdHighNumberOfLeaderChanges
exp_alerts:
- interval: 1m
input_series:
- series: '((etcd_mvcc_db_total_size_in_bytes/etcd_server_quota_backend_bytes)*100){job="etcd",instance="10.10.10.0"}'
values: '0 10 20 0 0 10 0 0 30 0 0 0 0 0 0 0'
- series: '((etcd_mvcc_db_total_size_in_bytes/etcd_server_quota_backend_bytes)*100){job="etcd",instance="10.10.10.1"}'
values: '0 0 10 0 20 0 0 0 0 0 0 0 0 0 0 0'
- series: '((etcd_mvcc_db_total_size_in_bytes/etcd_server_quota_backend_bytes)*100){job="etcd",instance="10.10.10.2"}'
values: '0 0 0 0 0 0 0 0'
alert_rule_test:
- eval_time: 10m
alertname: etcdExcessiveDatabaseGrowth
exp_alerts:
- exp_labels:
job: etcd
severity: warning
exp_annotations:
message: 'etcd cluster "etcd": Observed surge in etcd writes leading to 50% increase in database size over the past four hours, please check as it might be disruptive.'

View File

@ -10,6 +10,7 @@
* @param span (optional) Width of the panel
* @param datasource (optional) Datasource
* @param fill (default `1`) , integer from 0 to 10
* @param fillGradient (default `0`) , integer from 0 to 10
* @param linewidth (default `1`) Line Width, integer from 0 to 10
* @param decimals (optional) Override automatic decimal precision for legend and tooltip. If null, not added to the json output.
* @param decimalsY1 (optional) Override automatic decimal precision for the first Y axis. If null, use decimals parameter.
@ -63,11 +64,13 @@
* @method addYaxis(format,min,max,label,show,logBase,decimals) Adds a Y axis to the graph
* @method addAlert(alert) Adds an alert
* @method addLink(link) Adds a [panel link](https://grafana.com/docs/grafana/latest/linking/panel-links/)
* @method addLinks(links) Adds an array of links.
*/
new(
title,
span=null,
fill=1,
fillGradient=0,
linewidth=1,
decimals=null,
decimalsY1=null,
@ -166,6 +169,7 @@
},
lines: lines,
fill: fill,
fillGradient: fillGradient,
linewidth: linewidth,
dashes: dashes,
dashLength: 10,
@ -283,5 +287,6 @@
addLink(link):: self {
links+: [link],
},
addLinks(links):: std.foldl(function(p, t) p.addLink(t), links, self),
},
}

View File

@ -6,12 +6,15 @@
*
* @param expr
* @param hide (optional) Disable query on graph.
* @param legendFormat (optional) Defines the legend. Defaults to ''.
*/
target(
expr,
hide=null,
legendFormat='',
):: {
[if hide != null then 'hide']: hide,
expr: expr,
legendFormat: legendFormat,
},
}

View File

@ -258,7 +258,7 @@
severity: 'warning',
},
annotations: {
description: 'Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete.',
description: 'Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete. Removing failed job after investigation should clear this alert.',
summary: 'Job failed to complete.',
},
},

View File

@ -337,11 +337,11 @@ local g = import 'github.com/grafana/jsonnet-libs/grafana-builder/grafana.libson
legend_avg=true,
)
.addTarget(prometheus.target(
'sort_desc(sum by (container) (rate(windows_container_network_receive_bytes_total{namespace="$namespace", pod="$pod"}[1m])))' % $._config,
'sort_desc(sum by (container) (rate(windows_container_network_received_bytes_total{namespace="$namespace", pod="$pod"}[1m])))' % $._config,
legendFormat='Received : {{ container }}',
))
.addTarget(prometheus.target(
'sort_desc(sum by (container) (rate(windows_container_network_transmit_bytes_total{namespace="$namespace", pod="$pod"}[1m])))' % $._config,
'sort_desc(sum by (container) (rate(windows_container_network_transmitted_bytes_total{namespace="$namespace", pod="$pod"}[1m])))' % $._config,
legendFormat='Transmitted : {{ container }}',
))
)

View File

@ -202,13 +202,13 @@
||| % $._config,
},
{
record: 'windows_container_network_receive_bytes_total',
record: 'windows_container_network_received_bytes_total',
expr: |||
windows_container_network_receive_bytes_total{%(wmiExporterSelector)s} * on(container_id) group_left(container, pod, namespace) max(kube_pod_container_info{%(kubeStateMetricsSelector)s}) by(container, container_id, pod, namespace)
||| % $._config,
},
{
record: 'windows_container_network_transmit_bytes_total',
record: 'windows_container_network_transmitted_bytes_total',
expr: |||
windows_container_network_transmit_bytes_total{%(wmiExporterSelector)s} * on(container_id) group_left(container, pod, namespace) max(kube_pod_container_info{%(kubeStateMetricsSelector)s}) by(container, container_id, pod, namespace)
||| % $._config,

View File

@ -58,7 +58,6 @@ local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
'daemonsets',
'deployments',
'replicasets',
'ingresses',
]) +
rulesType.withVerbs(['list', 'watch']),
@ -135,6 +134,7 @@ local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
rulesType.withApiGroups(['networking.k8s.io']) +
rulesType.withResources([
'networkpolicies',
'ingresses',
]) +
rulesType.withVerbs(['list', 'watch']),
@ -228,6 +228,7 @@ local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
roleBinding.new() +
roleBinding.mixin.metadata.withName(ksm.name) +
roleBinding.mixin.metadata.withNamespace(ksm.namespace) +
roleBinding.mixin.metadata.withLabels(ksm.commonLabels) +
roleBinding.mixin.roleRef.withApiGroup('rbac.authorization.k8s.io') +
roleBinding.mixin.roleRef.withName(ksm.name) +

View File

@ -1,4 +1,3 @@
(import 'alertmanager.libsonnet') +
(import 'general.libsonnet') +
(import 'node.libsonnet') +
(import 'prometheus-operator.libsonnet')
(import 'node.libsonnet')

View File

@ -1,63 +0,0 @@
{
prometheusAlerts+:: {
groups+: [
{
name: 'prometheus-operator',
rules: [
{
alert: 'PrometheusOperatorListErrors',
expr: |||
(sum by (controller,namespace) (rate(prometheus_operator_list_operations_failed_total{%(prometheusOperatorSelector)s}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_list_operations_total{%(prometheusOperatorSelector)s}[10m]))) > 0.4
||| % $._config,
labels: {
severity: 'warning',
},
annotations: {
message: 'Errors while performing List operations in controller {{$labels.controller}} in {{$labels.namespace}} namespace.',
},
'for': '15m',
},
{
alert: 'PrometheusOperatorWatchErrors',
expr: |||
(sum by (controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{%(prometheusOperatorSelector)s}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_watch_operations_total{%(prometheusOperatorSelector)s}[10m]))) > 0.4
||| % $._config,
labels: {
severity: 'warning',
},
annotations: {
message: 'Errors while performing Watch operations in controller {{$labels.controller}} in {{$labels.namespace}} namespace.',
},
'for': '15m',
},
{
alert: 'PrometheusOperatorReconcileErrors',
expr: |||
rate(prometheus_operator_reconcile_errors_total{%(prometheusOperatorSelector)s}[5m]) > 0.1
||| % $._config,
labels: {
severity: 'warning',
},
annotations: {
message: 'Errors while reconciling {{ $labels.controller }} in {{ $labels.namespace }} Namespace.',
},
'for': '10m',
},
{
alert: 'PrometheusOperatorNodeLookupErrors',
expr: |||
rate(prometheus_operator_node_address_lookup_errors_total{%(prometheusOperatorSelector)s}[5m]) > 0.1
||| % $._config,
labels: {
severity: 'warning',
},
annotations: {
message: 'Errors while reconciling Prometheus in {{ $labels.namespace }} Namespace.',
},
'for': '10m',
},
],
},
],
},
}

View File

@ -28,6 +28,15 @@
},
"version": "release-0.42"
},
{
"source": {
"git": {
"remote": "https://github.com/prometheus-operator/prometheus-operator",
"subdir": "jsonnet/mixin"
}
},
"version": "master"
},
{
"source": {
"git": {

View File

@ -1,6 +1,7 @@
local k = import 'github.com/ksonnet/ksonnet-lib/ksonnet.beta.4/k.libsonnet';
local k3 = import 'github.com/ksonnet/ksonnet-lib/ksonnet.beta.3/k.libsonnet';
local configMapList = k3.core.v1.configMapList;
local kubeRbacProxyContainer = import './kube-rbac-proxy/container.libsonnet';
(import 'github.com/brancz/kubernetes-grafana/grafana/grafana.libsonnet') +
(import './kube-state-metrics/kube-state-metrics.libsonnet') +
@ -9,6 +10,7 @@ local configMapList = k3.core.v1.configMapList;
(import 'github.com/prometheus/node_exporter/docs/node-mixin/mixin.libsonnet') +
(import './alertmanager/alertmanager.libsonnet') +
(import 'github.com/prometheus-operator/prometheus-operator/jsonnet/prometheus-operator/prometheus-operator.libsonnet') +
(import 'github.com/prometheus-operator/prometheus-operator/jsonnet/mixin/mixin.libsonnet') +
(import './prometheus/prometheus.libsonnet') +
(import './prometheus-adapter/prometheus-adapter.libsonnet') +
(import 'github.com/kubernetes-monitoring/kubernetes-mixin/mixin.libsonnet') +
@ -60,7 +62,7 @@ local configMapList = k3.core.v1.configMapList;
],
},
} +
((import 'kube-prometheus/kube-rbac-proxy/container.libsonnet') {
(kubeRbacProxyContainer {
config+:: {
kubeRbacProxy: {
local cfg = self,

View File

@ -1,3 +1,6 @@
local kubeRbacProxyContainer = import '../kube-rbac-proxy/container.libsonnet';
local ksm = import 'github.com/kubernetes/kube-state-metrics/jsonnet/kube-state-metrics/kube-state-metrics.libsonnet';
{
_config+:: {
versions+:: {
@ -11,119 +14,119 @@
scrapeTimeout: '30s',
},
},
kubeStateMetrics+:: (import 'github.com/kubernetes/kube-state-metrics/jsonnet/kube-state-metrics/kube-state-metrics.libsonnet') +
{
local ksm = self,
name:: 'kube-state-metrics',
namespace:: $._config.namespace,
version:: $._config.versions.kubeStateMetrics,
image:: $._config.imageRepos.kubeStateMetrics + ':v' + $._config.versions.kubeStateMetrics,
service+: {
spec+: {
ports: [
{
name: 'https-main',
port: 8443,
targetPort: 'https-main',
},
{
name: 'https-self',
port: 9443,
targetPort: 'https-self',
},
],
},
},
deployment+: {
spec+: {
template+: {
spec+: {
containers: std.map(function(c) c {
ports:: null,
livenessProbe:: null,
readinessProbe:: null,
args: ['--host=127.0.0.1', '--port=8081', '--telemetry-host=127.0.0.1', '--telemetry-port=8082'],
}, super.containers),
},
},
},
},
serviceMonitor:
{
apiVersion: 'monitoring.coreos.com/v1',
kind: 'ServiceMonitor',
metadata: {
name: 'kube-state-metrics',
namespace: $._config.namespace,
labels: {
'app.kubernetes.io/name': 'kube-state-metrics',
'app.kubernetes.io/version': ksm.version,
},
},
spec: {
jobLabel: 'app.kubernetes.io/name',
selector: {
matchLabels: {
'app.kubernetes.io/name': 'kube-state-metrics',
},
},
endpoints: [
{
port: 'https-main',
scheme: 'https',
interval: $._config.kubeStateMetrics.scrapeInterval,
scrapeTimeout: $._config.kubeStateMetrics.scrapeTimeout,
honorLabels: true,
bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token',
relabelings: [
{
regex: '(pod|service|endpoint|namespace)',
action: 'labeldrop',
},
],
tlsConfig: {
insecureSkipVerify: true,
},
},
{
port: 'https-self',
scheme: 'https',
interval: $._config.kubeStateMetrics.scrapeInterval,
bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token',
tlsConfig: {
insecureSkipVerify: true,
},
},
],
},
},
} +
((import 'kube-prometheus/kube-rbac-proxy/container.libsonnet') {
config+:: {
kubeRbacProxy: {
local cfg = self,
image: $._config.imageRepos.kubeRbacProxy + ':' + $._config.versions.kubeRbacProxy,
name: 'kube-rbac-proxy-main',
securePortName: 'https-main',
securePort: 8443,
secureListenAddress: ':%d' % self.securePort,
upstream: 'http://127.0.0.1:8081/',
tlsCipherSuites: $._config.tlsCipherSuites,
},
},
}).deploymentMixin +
((import 'kube-prometheus/kube-rbac-proxy/container.libsonnet') {
config+:: {
kubeRbacProxy: {
local cfg = self,
image: $._config.imageRepos.kubeRbacProxy + ':' + $._config.versions.kubeRbacProxy,
name: 'kube-rbac-proxy-self',
securePortName: 'https-self',
securePort: 9443,
secureListenAddress: ':%d' % self.securePort,
upstream: 'http://127.0.0.1:8082/',
tlsCipherSuites: $._config.tlsCipherSuites,
},
},
}).deploymentMixin,
// kube-state-metrics component: the `ksm` object (bound outside this block)
// extended with project-specific overrides, a ServiceMonitor, and two
// kube-rbac-proxy deployment mixins (one per upstream port).
kubeStateMetrics+::
ksm + {
// Capture the version once so the nested serviceMonitor labels can use it
// without reaching back through `self`.
local version = self.version,
// Hidden (`::`) fields: name, namespace, version and image, all taken from
// $._config except the fixed name.
name:: 'kube-state-metrics',
namespace:: $._config.namespace,
version:: $._config.versions.kubeStateMetrics,
image:: $._config.imageRepos.kubeStateMetrics + ':v' + $._config.versions.kubeStateMetrics,
// Replace the inherited service ports with the two HTTPS ports; the port
// names match the securePortName values of the proxies declared below.
service+: {
spec+: {
ports: [
{
name: 'https-main',
port: 8443,
targetPort: 'https-main',
},
{
name: 'https-self',
port: 9443,
targetPort: 'https-self',
},
],
},
},
deployment+: {
spec+: {
template+: {
spec+: {
// Rewrite every inherited container: ports and probes are overridden with
// hidden null fields, and the args bind metrics (8081) and telemetry (8082)
// to 127.0.0.1 - presumably so they are reachable only through the
// kube-rbac-proxy sidecars; confirm.
containers: std.map(function(c) c {
ports:: null,
livenessProbe:: null,
readinessProbe:: null,
args: ['--host=127.0.0.1', '--port=8081', '--telemetry-host=127.0.0.1', '--telemetry-port=8082'],
}, super.containers),
},
},
},
},
// ServiceMonitor selecting the service by its app.kubernetes.io/name label
// and scraping both HTTPS ports.
serviceMonitor:
{
apiVersion: 'monitoring.coreos.com/v1',
kind: 'ServiceMonitor',
metadata: {
name: 'kube-state-metrics',
namespace: $._config.namespace,
labels: {
'app.kubernetes.io/name': 'kube-state-metrics',
'app.kubernetes.io/version': version,
},
},
spec: {
jobLabel: 'app.kubernetes.io/name',
selector: {
matchLabels: {
'app.kubernetes.io/name': 'kube-state-metrics',
},
},
endpoints: [
// Main metrics endpoint: honours labels from the target and drops the
// pod/service/endpoint/namespace labels via a labeldrop relabeling.
// TLS certificate verification is skipped (insecureSkipVerify).
{
port: 'https-main',
scheme: 'https',
interval: $._config.kubeStateMetrics.scrapeInterval,
scrapeTimeout: $._config.kubeStateMetrics.scrapeTimeout,
honorLabels: true,
bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token',
relabelings: [
{
regex: '(pod|service|endpoint|namespace)',
action: 'labeldrop',
},
],
tlsConfig: {
insecureSkipVerify: true,
},
},
// Self/telemetry endpoint: no scrapeTimeout, honorLabels or relabelings set.
{
port: 'https-self',
scheme: 'https',
interval: $._config.kubeStateMetrics.scrapeInterval,
bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token',
tlsConfig: {
insecureSkipVerify: true,
},
},
],
},
},
} +
// Proxy configured for the main metrics port: listens on :8443 and forwards
// to http://127.0.0.1:8081/. What deploymentMixin adds is defined in
// kubeRbacProxyContainer, outside this block.
// NOTE(review): `local cfg` is not referenced anywhere in this block.
(kubeRbacProxyContainer {
config+:: {
kubeRbacProxy: {
local cfg = self,
image: $._config.imageRepos.kubeRbacProxy + ':' + $._config.versions.kubeRbacProxy,
name: 'kube-rbac-proxy-main',
securePortName: 'https-main',
securePort: 8443,
secureListenAddress: ':%d' % self.securePort,
upstream: 'http://127.0.0.1:8081/',
tlsCipherSuites: $._config.tlsCipherSuites,
},
},
}).deploymentMixin +
// Proxy configured for the telemetry port: listens on :9443 and forwards to
// http://127.0.0.1:8082/.
// NOTE(review): `local cfg` is not referenced anywhere in this block.
(kubeRbacProxyContainer {
config+:: {
kubeRbacProxy: {
local cfg = self,
image: $._config.imageRepos.kubeRbacProxy + ':' + $._config.versions.kubeRbacProxy,
name: 'kube-rbac-proxy-self',
securePortName: 'https-self',
securePort: 9443,
secureListenAddress: ':%d' % self.securePort,
upstream: 'http://127.0.0.1:8082/',
tlsCipherSuites: $._config.tlsCipherSuites,
},
},
}).deploymentMixin,
}

View File

@ -5,7 +5,7 @@ local k = import 'github.com/ksonnet/ksonnet-lib/ksonnet.beta.4/k.libsonnet';
namespace: 'default',
versions+:: {
nodeExporter: 'v0.18.1',
nodeExporter: 'v1.0.1',
},
imageRepos+:: {
@ -79,11 +79,15 @@ local k = import 'github.com/ksonnet/ksonnet-lib/ksonnet.beta.4/k.libsonnet';
toleration.withOperator('Exists');
local procVolumeName = 'proc';
local procVolume = volume.fromHostPath(procVolumeName, '/proc');
local procVolumeMount = containerVolumeMount.new(procVolumeName, '/host/proc');
local procVolumeMount = containerVolumeMount.new(procVolumeName, '/host/proc').
withMountPropagation('HostToContainer').
withReadOnly(true);
local sysVolumeName = 'sys';
local sysVolume = volume.fromHostPath(sysVolumeName, '/sys');
local sysVolumeMount = containerVolumeMount.new(sysVolumeName, '/host/sys');
local sysVolumeMount = containerVolumeMount.new(sysVolumeName, '/host/sys').
withMountPropagation('HostToContainer').
withReadOnly(true);
local rootVolumeName = 'root';
local rootVolume = volume.fromHostPath(rootVolumeName, '/');

View File

@ -312,6 +312,7 @@ local k = import 'github.com/ksonnet/ksonnet-lib/ksonnet.beta.4/k.libsonnet';
path: '/metrics/cadvisor',
interval: '30s',
honorLabels: true,
honorTimestamps: false,
tlsConfig: {
insecureSkipVerify: true,
},

View File

@ -0,0 +1,3 @@
// Entry point: evaluates to the `prometheusAlerts` field of mixin.libsonnet.
(
import 'mixin.libsonnet'
).prometheusAlerts

View File

@ -0,0 +1,95 @@
// Alerting rules that watch the Prometheus Operator itself. Every rule in the
// 'prometheus-operator' group carries severity "warning" and narrows its query
// with the %(prometheusOperatorSelector)s value taken from $._config.

// Assembles one warning-severity alerting rule from the parts that vary
// between rules: name, PromQL query, hold duration and the two annotations.
local warningAlert(name, query, holdFor, description, summary) = {
  alert: name,
  expr: query,
  'for': holdFor,
  labels: { severity: 'warning' },
  annotations: {
    description: description,
    summary: summary,
  },
};

{
  prometheusAlerts+:: {
    groups+: [
      {
        name: 'prometheus-operator',
        rules: [
          // Share of failed List operations per controller/namespace above 0.4.
          warningAlert(
            name='PrometheusOperatorListErrors',
            query=|||
              (sum by (controller,namespace) (rate(prometheus_operator_list_operations_failed_total{%(prometheusOperatorSelector)s}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_list_operations_total{%(prometheusOperatorSelector)s}[10m]))) > 0.4
            ||| % $._config,
            holdFor='15m',
            description='Errors while performing List operations in controller {{$labels.controller}} in {{$labels.namespace}} namespace.',
            summary='Errors while performing list operations in controller.',
          ),
          // Share of failed Watch operations per controller/namespace above 0.4.
          warningAlert(
            name='PrometheusOperatorWatchErrors',
            query=|||
              (sum by (controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{%(prometheusOperatorSelector)s}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_watch_operations_total{%(prometheusOperatorSelector)s}[10m]))) > 0.4
            ||| % $._config,
            holdFor='15m',
            description='Errors while performing watch operations in controller {{$labels.controller}} in {{$labels.namespace}} namespace.',
            summary='Errors while performing watch operations in controller.',
          ),
          // The failed-sync gauge never dropped to zero during the last 5 minutes.
          warningAlert(
            name='PrometheusOperatorSyncFailed',
            query=|||
              min_over_time(prometheus_operator_syncs{status="failed",%(prometheusOperatorSelector)s}[5m]) > 0
            ||| % $._config,
            holdFor='10m',
            description='Controller {{ $labels.controller }} in {{ $labels.namespace }} namespace fails to reconcile {{ $value }} objects.',
            summary='Last controller reconciliation failed',
          ),
          // Ratio of reconcile errors to reconcile operations above 0.1.
          warningAlert(
            name='PrometheusOperatorReconcileErrors',
            query=|||
              (sum by (controller,namespace) (rate(prometheus_operator_reconcile_errors_total{%(prometheusOperatorSelector)s}[5m]))) / (sum by (controller,namespace) (rate(prometheus_operator_reconcile_operations_total{%(prometheusOperatorSelector)s}[5m]))) > 0.1
            ||| % $._config,
            holdFor='10m',
            description='{{ $value | humanizePercentage }} of reconciling operations failed for {{ $labels.controller }} controller in {{ $labels.namespace }} namespace.',
            summary='Errors while reconciling controller.',
          ),
          // Per-second rate of node address lookup errors above 0.1.
          warningAlert(
            name='PrometheusOperatorNodeLookupErrors',
            query=|||
              rate(prometheus_operator_node_address_lookup_errors_total{%(prometheusOperatorSelector)s}[5m]) > 0.1
            ||| % $._config,
            holdFor='10m',
            description='Errors while reconciling Prometheus in {{ $labels.namespace }} Namespace.',
            summary='Errors while reconciling Prometheus.',
          ),
          // The ready gauge stayed at 0 for the whole 5-minute window.
          warningAlert(
            name='PrometheusOperatorNotReady',
            query=|||
              min by(namespace, controller) (max_over_time(prometheus_operator_ready{%(prometheusOperatorSelector)s}[5m]) == 0)
            ||| % $._config,
            holdFor='5m',
            description="Prometheus operator in {{ $labels.namespace }} namespace isn't ready to reconcile {{ $labels.controller }} resources.",
            summary='Prometheus operator not ready',
          ),
        ],
      },
    ],
  },
}

View File

@ -0,0 +1,5 @@
// Configuration for the prometheus-operator mixin. `_config+::` is a hidden
// field that merges with any `_config` inherited from other mixed-in objects.
{
_config+:: {
// PromQL label matcher substituted wherever an alert expression contains
// %(prometheusOperatorSelector)s.
prometheusOperatorSelector: 'job="prometheus-operator"',
},
}

View File

@ -0,0 +1,2 @@
(import 'config.libsonnet') +
(import 'alerts/alerts.libsonnet')

View File

@ -15,7 +15,7 @@ local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
},
versions+:: {
prometheusOperator: 'v0.42.0',
prometheusOperator: 'v0.42.1',
prometheusConfigReloader: self.prometheusOperator,
configmapReloader: 'v0.4.0',
},

View File

@ -48,7 +48,7 @@
alert: 'NodeFilesystemAlmostOutOfSpace',
expr: |||
(
node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelector)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelector)s} * 100 < 5
node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelector)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelector)s} * 100 < %(fsSpaceAvailableCriticalThreshold)d
and
node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s} == 0
)
@ -58,7 +58,7 @@
severity: 'warning',
},
annotations: {
summary: 'Filesystem has less than 5% space left.',
summary: 'Filesystem has less than %(fsSpaceAvailableCriticalThreshold)d%% space left.' % $._config,
description: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.',
},
},
@ -66,7 +66,7 @@
alert: 'NodeFilesystemAlmostOutOfSpace',
expr: |||
(
node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelector)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelector)s} * 100 < 3
node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelector)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelector)s} * 100 < %(fsSpaceAvailableWarningThreshold)d
and
node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s} == 0
)
@ -76,7 +76,7 @@
severity: '%(nodeCriticalSeverity)s' % $._config,
},
annotations: {
summary: 'Filesystem has less than 3% space left.',
summary: 'Filesystem has less than %(fsSpaceAvailableWarningThreshold)d%% space left.' % $._config,
description: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.',
},
},
@ -238,6 +238,8 @@
alert: 'NodeClockNotSynchronising',
expr: |||
min_over_time(node_timex_sync_status[5m]) == 0
and
node_timex_maxerror_seconds >= 16
||| % $._config,
'for': '10m',
labels: {

View File

@ -47,6 +47,11 @@
fsSpaceFillingUpWarningThreshold: 40,
fsSpaceFillingUpCriticalThreshold: 20,
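// Available disk space (%) thresholds below which the
// 'NodeFilesystemAlmostOutOfSpace' alerts fire.
// NOTE(review): as wired in the alert rules, the "Critical" threshold is used by the
// warning-severity alert and the "Warning" threshold by the critical-severity alert;
// the names look swapped relative to the severities -- confirm before overriding them.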
fsSpaceAvailableCriticalThreshold: 5,
fsSpaceAvailableWarningThreshold: 3,
grafana_prefix: '',
},
}

View File

@ -75,14 +75,15 @@ local gauge = promgrafonnet.gauge;
// TODO: It would be nicer to have a gauge that gets a 0-1 range and displays it as a percentage 0%-100%.
// This needs to be added upstream in the promgrafonnet library and then changed here.
// NOTE: avg() is used to circumvent a label change caused by a node_exporter rollout.
local memoryGauge = gauge.new(
'Memory Usage',
|||
100 -
(
node_memory_MemAvailable_bytes{%(nodeExporterSelector)s, instance="$instance"}
avg(node_memory_MemAvailable_bytes{%(nodeExporterSelector)s, instance="$instance"})
/
node_memory_MemTotal_bytes{%(nodeExporterSelector)s, instance="$instance"}
avg(node_memory_MemTotal_bytes{%(nodeExporterSelector)s, instance="$instance"})
* 100
)
||| % $._config,

1
monitoring/vendor/mixin vendored Symbolic link
View File

@ -0,0 +1 @@
github.com/prometheus-operator/prometheus-operator/jsonnet/mixin