upgrade monitoring - switch to prometheus-operator/kube-prometheus
All checks were successful
continuous-integration/drone/push Build is passing
All checks were successful
continuous-integration/drone/push Build is passing
This commit is contained in:
parent
df9c32f59c
commit
0108ac6084
|
@ -4,7 +4,7 @@
|
||||||
{
|
{
|
||||||
"source": {
|
"source": {
|
||||||
"git": {
|
"git": {
|
||||||
"remote": "https://github.com/coreos/kube-prometheus",
|
"remote": "https://github.com/prometheus-operator/kube-prometheus",
|
||||||
"subdir": "jsonnet/kube-prometheus"
|
"subdir": "jsonnet/kube-prometheus"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
|
|
@ -8,8 +8,8 @@
|
||||||
"subdir": "grafana"
|
"subdir": "grafana"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"version": "014301fd5f71d8305a395b2fb437089a7b1a3999",
|
"version": "18c50c83ea49291b0aa00067e4b2b386556ba0e3",
|
||||||
"sum": "RHtpk2c0CcliWyt6F4DIgwpi4cEfHADK7nAxIw6RTGs="
|
"sum": "GEVLrcKGvUuvjq6bDhaWr4fOwkG5QMDpnUhHxUUywwg="
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"source": {
|
"source": {
|
||||||
|
@ -18,28 +18,8 @@
|
||||||
"subdir": "Documentation/etcd-mixin"
|
"subdir": "Documentation/etcd-mixin"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"version": "9006d8d4f9d82f6cce6eb93d6f2dfe7c154fa05d",
|
"version": "44dea5df0364a46234bc9f6fd90bdb960f7a1ee8",
|
||||||
"sum": "Uv8ysXlEACF7BafoCkHnrBmJ2AHh/VldI5mm3BuMiy0="
|
"sum": "NhOkJWkO7ZO2DSE8Fvipcs7Hh2/GOCS0WjPPZU8OiaQ="
|
||||||
},
|
|
||||||
{
|
|
||||||
"source": {
|
|
||||||
"git": {
|
|
||||||
"remote": "https://github.com/coreos/kube-prometheus",
|
|
||||||
"subdir": "jsonnet/kube-prometheus"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"version": "6771c9bcc287e8047510207a4ab60fa5e63e48fe",
|
|
||||||
"sum": "52ukcsyazUhdJWb48PPGQQurdFrGE0xgKYE++yWO7aI="
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"source": {
|
|
||||||
"git": {
|
|
||||||
"remote": "https://github.com/coreos/prometheus-operator",
|
|
||||||
"subdir": "jsonnet/prometheus-operator"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"version": "0dca0f21ffff72a063db8855b5d515e15ab0dccb",
|
|
||||||
"sum": "WggWVWZ+CBEUThQCztSaRELbtqdXf9s3OFzf06HbYNA="
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"source": {
|
"source": {
|
||||||
|
@ -48,8 +28,8 @@
|
||||||
"subdir": "grafonnet"
|
"subdir": "grafonnet"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"version": "ad85aec356b4544a41f62ac8c32f8042c0ffc42e",
|
"version": "0cfef6b1e666316d40a255c93e1d98d2a2610009",
|
||||||
"sum": "JHhSwlCa9A+AwG4o+YEXXFDbQ91iwwd9G/FoYnGhObw="
|
"sum": "BVozpBqmTw67JSEDX9DRLEzCLMt0+aQjDBKGaTLS+Bc="
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"source": {
|
"source": {
|
||||||
|
@ -58,8 +38,8 @@
|
||||||
"subdir": "grafana-builder"
|
"subdir": "grafana-builder"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"version": "f2a35172b97a0c944c4a167bb1f6e688624602e4",
|
"version": "0d5c2119373cb53d9f02fb82b94857412f7540c8",
|
||||||
"sum": "N65Fv0M2JvFE3GN8ZxP5xh1U5a314ey8geLAioJLzF8="
|
"sum": "R5WJe6wW0R9vMpOAHaGFwcK8q4NmGZ0aLhdZGKDHeMU="
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"source": {
|
"source": {
|
||||||
|
@ -79,8 +59,8 @@
|
||||||
"subdir": ""
|
"subdir": ""
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"version": "6eab5fe2dfde77494c0452ce7ec3ed3ff21d9631",
|
"version": "b1005adad5940eee9366272ab4c85cf077e547c2",
|
||||||
"sum": "skD7Rm0m5lOQOn8IrnGEdJyhWUI7qsKPXwcci7Hjn0E="
|
"sum": "FVMHGdPPdJ3OYoR2b2ZjqQq/cxmS7/Y5dYkA9oYQ3Vg="
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"source": {
|
"source": {
|
||||||
|
@ -89,7 +69,7 @@
|
||||||
"subdir": "lib/promgrafonnet"
|
"subdir": "lib/promgrafonnet"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"version": "6eab5fe2dfde77494c0452ce7ec3ed3ff21d9631",
|
"version": "b1005adad5940eee9366272ab4c85cf077e547c2",
|
||||||
"sum": "VhgBM39yv0f4bKv8VfGg4FXkg573evGDRalip9ypKbc="
|
"sum": "VhgBM39yv0f4bKv8VfGg4FXkg573evGDRalip9ypKbc="
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -99,7 +79,7 @@
|
||||||
"subdir": "jsonnet/kube-state-metrics"
|
"subdir": "jsonnet/kube-state-metrics"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"version": "e43aaa6d6e3554d050ead73b4814566b771377d1",
|
"version": "f1166b40298a6846a329f567d0d509d32ee45a81",
|
||||||
"sum": "cJjGZaLBjcIGrLHZLjRPU9c3KL+ep9rZTb9dbALSKqA="
|
"sum": "cJjGZaLBjcIGrLHZLjRPU9c3KL+ep9rZTb9dbALSKqA="
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -109,8 +89,28 @@
|
||||||
"subdir": "jsonnet/kube-state-metrics-mixin"
|
"subdir": "jsonnet/kube-state-metrics-mixin"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"version": "e43aaa6d6e3554d050ead73b4814566b771377d1",
|
"version": "f1166b40298a6846a329f567d0d509d32ee45a81",
|
||||||
"sum": "o5avaguRsfFwYFNen00ZEsub1x4i8Z/ZZ2QoEjFMff8="
|
"sum": "Yf8mNAHrV1YWzrdV8Ry5dJ8YblepTGw3C0Zp10XIYLo="
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"source": {
|
||||||
|
"git": {
|
||||||
|
"remote": "https://github.com/prometheus-operator/kube-prometheus",
|
||||||
|
"subdir": "jsonnet/kube-prometheus"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"version": "4f872f1e3187509e4c8d53d7e1f654b5fa422977",
|
||||||
|
"sum": "8pGMGEIp8YC6bpvrxhkSdi/mBbexogdFnvHPlZJmfg4="
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"source": {
|
||||||
|
"git": {
|
||||||
|
"remote": "https://github.com/prometheus-operator/prometheus-operator",
|
||||||
|
"subdir": "jsonnet/prometheus-operator"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"version": "312d675008306b13c24d241bf4f0a882dbfa90d8",
|
||||||
|
"sum": "NPuLvqEmYZ+dCQ/9U4wXtobBD6hYreEx3jPpLQKS/ig="
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"source": {
|
"source": {
|
||||||
|
@ -119,7 +119,7 @@
|
||||||
"subdir": "docs/node-mixin"
|
"subdir": "docs/node-mixin"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"version": "503e4fc8486c0082d6bd8c53fad646bcfafeedf6",
|
"version": "3b035c8fa1f75c4c00e57acc14fb71dfd62e31ee",
|
||||||
"sum": "3jFV2qsc/GZe2GADswTYqxxP2zGOiANTj73W/VNFGqc="
|
"sum": "3jFV2qsc/GZe2GADswTYqxxP2zGOiANTj73W/VNFGqc="
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -129,7 +129,7 @@
|
||||||
"subdir": "documentation/prometheus-mixin"
|
"subdir": "documentation/prometheus-mixin"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"version": "348ff4285ffa59907c9d7fd7eb3cb7d748f42758",
|
"version": "983ebb4a513302315a8117932ab832815f85e3d2",
|
||||||
"sum": "TBq4SL7YsPInARbJqwz25JaBvvAegcnRCsuz3K9niWc=",
|
"sum": "TBq4SL7YsPInARbJqwz25JaBvvAegcnRCsuz3K9niWc=",
|
||||||
"name": "prometheus"
|
"name": "prometheus"
|
||||||
},
|
},
|
||||||
|
|
|
@ -6,7 +6,7 @@ metadata:
|
||||||
namespace: monitoring
|
namespace: monitoring
|
||||||
spec:
|
spec:
|
||||||
encryptedData:
|
encryptedData:
|
||||||
alertmanager.yaml: AgAXarLIP1IirGmTPrtvSSP9TvqIaj/af+j98QCp9+OXhqFtVmjCcKnv5gDRV+kRtGdX8p+GetDWq14uY0dq1uHMTgy/TV9cZ9FCsjEi8+Knk4h01FI/D8vjGA2en/tDkM8/Jz4XcqZ+2yU0Mh7yKhc0pg0/wg/tVLuI04ZzYkjYFPb2DZJyqQKOYZQaIhaf6NpLe4oFAb9OKbvIUT2hu5wGz0Q2aqjSiW0JyhVje367x6oYe/TNN6Qri8ZDeDKlq155XpXpSa3HcQLWuxIvizlTjEbnPr0fa8kFl+v1iRPtB5hNc6dB1w/oyj1YPpVk00JaIGeDy9FXmwDc6MyFjoLWfDWY95mGCN1NJ7q1see/OLZevwLYi+zQYuOH/SIhNHee9GAWTiqqt7RE0GnzEC1kp9J+8caVbtv6aUqpNCaUByFjmeE3gzFNGEZXVQEQpj+IeOSF2f46TOL/F7TS4EKlzMZZVapLLSyuHiq8MNW3xPNnpJ47fDPiJoLGTiYYQgnRrTXzCmsgAHwNBagRJug+y3g+kZbxdlvdjkzqJ3LEoMLxbh9xuHMuiosqodVjIWNT60R/k/Is5DeM9I1Jvfavx7bBeXAJEkkkgeQDNbzqHqlCEbda3dDpbtpxa19FcHvJuRrNlQ7t/eW0FZBviMl2ETI1j8k2HDJeHqJB94QxuPEtMFknE0elIFdJHG5aOmrGECa5u4Oucn4GnmAOLsrVKPgaIQiFka/BJBqmt5kliiDdB3R+1n2nL7X5iXQyD8Wt5j7CRstqb9VX1q4Hr/Devu0IebaGgE7Y2ledhlzsqlriphlyTLUSU4/55WH5hRvhRcakbwtswLUmHA+JeYDN0ojX8roGtEEedCQ9BBn0J6VADF8JyAfNKaliX1/lBW1hImAAUR16EY0fVRUaXWCCrF7n8uPjWAbbwEBkZeaI8j5yagrspD8XPTFPO2qny2ma54DF/sEPJ9cgPmK3P0U4L488G8io8mIBXTEv6XxZ41vV6fAXuWB4/OX8UWUmak7aRURssfAbYuT30XBvUl2dc4Hp8IVwhhpr8sDAhZS9BLBr1+SYn9GPR1NBO5GF8ow8pGr0b6382OinVEtquUO5K80UBsbLDg2OPW7sWIdzXEneoHiFFPNx0Vo8ljquP1qpcneHUl9V5p6yJPKo2iMj0dB8RwCm3ueoldydGq0HrK1ZY1D9gnyJmF/dq496woI8Ccv/EdYkpPw+E7bm3nnR8J+E9tuwhfUZQVQoaWz4HdWufk+FyfOOqo9x1KamekteMdw1ZcqV3a1UfE6M0+YjZtIy7511tcdKYwdHahcAmYGciTfQb5REyjHWEFlMalph+LPtfUzPlqGgN7CKptUhEsSOxMRll3z8LmeT9JiejWcdRtFI6lqJYk2GWKj0rVscZo3GUiqkvKMsDQGqCs4/r7OzWIPfDjkpH5SNCHjX5eMUI3LfuIDfNo2uEWLzabhWpbfw97dvH52/B56D9AnxqXdy0qUjU3ZGeS3yCDvhi29wPgaDgS9T7nao7PgHfPUFUO54NhKR6iY+rwWRyBa+2OMfe/YUvxslqAr758NKVPQM4iV5bUat1UzBy/qSspyig7GF2Dkc3FvqoxlJnWCyDFRCoWZJ1msR+CZdGlGOreNwpmrYVQnJmoxImKIS5chEfzGIDKJu+/4eO+g/ysE/vyHAyC9YG2bv2ueD5c+KYiG4pA==
|
alertmanager.yaml: AgCboDZWf4XxlcxeQlbcdqr32zZfxDW2NlpWo57B7tIoomuTwY8aprGZiFtrXOLJvSGZPh3JRjVoLxKayaO01Hkcok3kQ/DbxwJBQsH+4GPmk/7uXdM3Ph++IVtnVDLi5E6fv1GrmSlJFxvc4MaEUOGSbrSG+eubVQmwJAKxy1ODz4oL6M+1o8YkIBLUZBlD10yZDoh+NODzRJniVQKlyqiHTk9w+aI6KaZOtY+4HZBnO6jGAW4Jdk46V45vHwGkQrMDecNwqQiymCk1dAAcWrkn/FSEygMH7+SyVYi8Ej7QKEn835TKxAiQsOezhu/o3SNyYbBGnrBDH7JHI4ckXop2iVh5hEvprXmNemXrEzCwILsOF4A4NItbrZ88FQYitc6ppMvz5YIyF083IrPJgn8jkn6RYQL53nupfvlQECgh3au53IRqahiDJZRNz8QolOwozNASLOKrk3erK21az+vxoKB29yfVMBPE8+Z/1Yk6RNueliLnF2KpCMyK5Cz/WFsy3aiFE5T5QTDounFghL9p7wtPPJgoFQ5uGDeV4k705c/FQLXS8gCNkc+LrjdHpzWOnucjacfgepZmm5T3JMxTixu2CtuYbiNuinJSEJeyRilYpEDFdNW3NhP/oExDNh6FITz+qX/K8x7I++f6Os0LcUmDOfdU4ZleL6dMe5GLjERd7a7I4LVmMya6cZRvGaP4770K7ItQnCZecgl61KmQWlpPMwpjStsj9hCm+o6PRK+9Cf5LEujbP1maMpJvzonaLEAk9EHoYnXpHWXbCj7PGkqctRbHbLAv/WQoKL2NJdXmTqxMJzE+5CRKlWAIqGtaCVd7RzkwGArZtPy+Q1x7nK4JPpd28zoBQZEoLXQvgxbocmjEuaLEvbEy8neCd7FVTLsQnvrJUBCkuRaUm41ZtrjWzAslpSFWEDo/WnomF1aM/vs5zj7eiwHu/Q7nzInz9EKKlo9O3/r0EnhGP6M3bEHfr608fNzVLkdOlcrJLzliJ85h9c+8KrvrU75hzFylQ/7UWw3rnC+c1axMvCxRFvUFwdtn6DSkfL3ZxvK0J0S/nuH3FI/iTzYa5wO6ACihNPc1TbmSqqT1qolbIHq9iXGvDwA9WrcjCIAuwNpGPzw5WfUuskY4wDiJNJzVOGi8thcXH9y2epyiY+OTNGnkelJDnfhUwL0phlFgCgNI4e6TaJ5V3O5L2Psq6z8X0Zme4uR2AQl67NHhybneCa4eK+8XGNRmNRr78BQDW3tZ/i3Vk/lYDYMW+QzCqMmXlbBx5KemJdx8NDOwQnzEKnWmACqSP9ljZ8c0RCGlIjOXNwu/AZqUlQFhtn0BqPcifI8GGEZQWj5xGvZSwnn3Y32DmFBokSqcS1huaQ/q2BtUFU3pkNoI7A0pW7f/W1RxW8cvxr9oXkSijWjC763rno1BndOZtyedLKCdCORIf1PUjMKwKIq38xfHyGqNO+Hsi2WwIodJ6w1mQQfn+QBDVQHLeoc9fZ85lzw65WSPwSnGUtlZIQD/RktgzY8inGR74VE1wYXGy5AuvsLnn3GZ06rQgsho58JdR5rqKFGR8kVS7kTXrQpn2SOJ1AJrykFDdxKhx/ecsFBqDp/QsRIrUqhjCREGoI274rtTy2U9lduVhHfwJ6P/I68NdxXyOig8o1fZ4NnrxFDmV0X58Xd1yAWVIFGDFJW+VaQqxCUQH951F1GFXQ==
|
||||||
template:
|
template:
|
||||||
metadata:
|
metadata:
|
||||||
creationTimestamp: null
|
creationTimestamp: null
|
||||||
|
|
|
@ -2030,6 +2030,9 @@ items:
|
||||||
"id": 5,
|
"id": 5,
|
||||||
"lines": true,
|
"lines": true,
|
||||||
"linewidth": 1,
|
"linewidth": 1,
|
||||||
|
"links": [
|
||||||
|
|
||||||
|
],
|
||||||
"minSpan": 24,
|
"minSpan": 24,
|
||||||
"nullPointMode": "null as zero",
|
"nullPointMode": "null as zero",
|
||||||
"renderer": "flot",
|
"renderer": "flot",
|
||||||
|
@ -19273,6 +19276,9 @@ items:
|
||||||
"id": 5,
|
"id": 5,
|
||||||
"lines": true,
|
"lines": true,
|
||||||
"linewidth": 1,
|
"linewidth": 1,
|
||||||
|
"links": [
|
||||||
|
|
||||||
|
],
|
||||||
"minSpan": 24,
|
"minSpan": 24,
|
||||||
"nullPointMode": "null as zero",
|
"nullPointMode": "null as zero",
|
||||||
"renderer": "flot",
|
"renderer": "flot",
|
||||||
|
@ -20644,6 +20650,9 @@ items:
|
||||||
"id": 5,
|
"id": 5,
|
||||||
"lines": true,
|
"lines": true,
|
||||||
"linewidth": 1,
|
"linewidth": 1,
|
||||||
|
"links": [
|
||||||
|
|
||||||
|
],
|
||||||
"minSpan": 24,
|
"minSpan": 24,
|
||||||
"nullPointMode": "null as zero",
|
"nullPointMode": "null as zero",
|
||||||
"renderer": "flot",
|
"renderer": "flot",
|
||||||
|
|
|
@ -122,6 +122,7 @@ spec:
|
||||||
nodeSelector:
|
nodeSelector:
|
||||||
beta.kubernetes.io/os: linux
|
beta.kubernetes.io/os: linux
|
||||||
securityContext:
|
securityContext:
|
||||||
|
fsGroup: 65534
|
||||||
runAsNonRoot: true
|
runAsNonRoot: true
|
||||||
runAsUser: 65534
|
runAsUser: 65534
|
||||||
serviceAccountName: grafana
|
serviceAccountName: grafana
|
||||||
|
|
|
@ -3,7 +3,7 @@ kind: ClusterRole
|
||||||
metadata:
|
metadata:
|
||||||
labels:
|
labels:
|
||||||
app.kubernetes.io/name: kube-state-metrics
|
app.kubernetes.io/name: kube-state-metrics
|
||||||
app.kubernetes.io/version: 1.9.5
|
app.kubernetes.io/version: 1.9.7
|
||||||
name: kube-state-metrics
|
name: kube-state-metrics
|
||||||
rules:
|
rules:
|
||||||
- apiGroups:
|
- apiGroups:
|
||||||
|
|
|
@ -3,7 +3,7 @@ kind: ClusterRoleBinding
|
||||||
metadata:
|
metadata:
|
||||||
labels:
|
labels:
|
||||||
app.kubernetes.io/name: kube-state-metrics
|
app.kubernetes.io/name: kube-state-metrics
|
||||||
app.kubernetes.io/version: 1.9.5
|
app.kubernetes.io/version: 1.9.7
|
||||||
name: kube-state-metrics
|
name: kube-state-metrics
|
||||||
roleRef:
|
roleRef:
|
||||||
apiGroup: rbac.authorization.k8s.io
|
apiGroup: rbac.authorization.k8s.io
|
||||||
|
|
|
@ -3,7 +3,7 @@ kind: Deployment
|
||||||
metadata:
|
metadata:
|
||||||
labels:
|
labels:
|
||||||
app.kubernetes.io/name: kube-state-metrics
|
app.kubernetes.io/name: kube-state-metrics
|
||||||
app.kubernetes.io/version: 1.9.5
|
app.kubernetes.io/version: 1.9.7
|
||||||
name: kube-state-metrics
|
name: kube-state-metrics
|
||||||
namespace: monitoring
|
namespace: monitoring
|
||||||
spec:
|
spec:
|
||||||
|
@ -15,7 +15,7 @@ spec:
|
||||||
metadata:
|
metadata:
|
||||||
labels:
|
labels:
|
||||||
app.kubernetes.io/name: kube-state-metrics
|
app.kubernetes.io/name: kube-state-metrics
|
||||||
app.kubernetes.io/version: 1.9.5
|
app.kubernetes.io/version: 1.9.7
|
||||||
spec:
|
spec:
|
||||||
containers:
|
containers:
|
||||||
- args:
|
- args:
|
||||||
|
@ -23,7 +23,7 @@ spec:
|
||||||
- --port=8081
|
- --port=8081
|
||||||
- --telemetry-host=127.0.0.1
|
- --telemetry-host=127.0.0.1
|
||||||
- --telemetry-port=8082
|
- --telemetry-port=8082
|
||||||
image: quay.io/coreos/kube-state-metrics:v1.9.5
|
image: quay.io/coreos/kube-state-metrics:v1.9.7
|
||||||
name: kube-state-metrics
|
name: kube-state-metrics
|
||||||
securityContext:
|
securityContext:
|
||||||
runAsUser: 65534
|
runAsUser: 65534
|
||||||
|
|
|
@ -3,7 +3,7 @@ kind: Service
|
||||||
metadata:
|
metadata:
|
||||||
labels:
|
labels:
|
||||||
app.kubernetes.io/name: kube-state-metrics
|
app.kubernetes.io/name: kube-state-metrics
|
||||||
app.kubernetes.io/version: 1.9.5
|
app.kubernetes.io/version: 1.9.7
|
||||||
name: kube-state-metrics
|
name: kube-state-metrics
|
||||||
namespace: monitoring
|
namespace: monitoring
|
||||||
spec:
|
spec:
|
||||||
|
|
|
@ -3,6 +3,6 @@ kind: ServiceAccount
|
||||||
metadata:
|
metadata:
|
||||||
labels:
|
labels:
|
||||||
app.kubernetes.io/name: kube-state-metrics
|
app.kubernetes.io/name: kube-state-metrics
|
||||||
app.kubernetes.io/version: 1.9.5
|
app.kubernetes.io/version: 1.9.7
|
||||||
name: kube-state-metrics
|
name: kube-state-metrics
|
||||||
namespace: monitoring
|
namespace: monitoring
|
||||||
|
|
|
@ -3,7 +3,7 @@ kind: ServiceMonitor
|
||||||
metadata:
|
metadata:
|
||||||
labels:
|
labels:
|
||||||
app.kubernetes.io/name: kube-state-metrics
|
app.kubernetes.io/name: kube-state-metrics
|
||||||
app.kubernetes.io/version: 1.9.5
|
app.kubernetes.io/version: 1.9.7
|
||||||
name: kube-state-metrics
|
name: kube-state-metrics
|
||||||
namespace: monitoring
|
namespace: monitoring
|
||||||
spec:
|
spec:
|
||||||
|
|
|
@ -4,7 +4,7 @@ metadata:
|
||||||
labels:
|
labels:
|
||||||
app.kubernetes.io/component: controller
|
app.kubernetes.io/component: controller
|
||||||
app.kubernetes.io/name: prometheus-operator
|
app.kubernetes.io/name: prometheus-operator
|
||||||
app.kubernetes.io/version: v0.40.0
|
app.kubernetes.io/version: v0.41.1
|
||||||
name: prometheus-operator
|
name: prometheus-operator
|
||||||
namespace: monitoring
|
namespace: monitoring
|
||||||
spec:
|
spec:
|
||||||
|
@ -19,4 +19,4 @@ spec:
|
||||||
matchLabels:
|
matchLabels:
|
||||||
app.kubernetes.io/component: controller
|
app.kubernetes.io/component: controller
|
||||||
app.kubernetes.io/name: prometheus-operator
|
app.kubernetes.io/name: prometheus-operator
|
||||||
app.kubernetes.io/version: v0.40.0
|
app.kubernetes.io/version: v0.41.1
|
||||||
|
|
|
@ -788,10 +788,11 @@ spec:
|
||||||
rules:
|
rules:
|
||||||
- alert: KubeStateMetricsListErrors
|
- alert: KubeStateMetricsListErrors
|
||||||
annotations:
|
annotations:
|
||||||
message: kube-state-metrics is experiencing errors at an elevated rate in
|
description: kube-state-metrics is experiencing errors at an elevated rate
|
||||||
list operations. This is likely causing it to not be able to expose metrics
|
in list operations. This is likely causing it to not be able to expose metrics
|
||||||
about Kubernetes objects correctly or at all.
|
about Kubernetes objects correctly or at all.
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatemetricslisterrors
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatemetricslisterrors
|
||||||
|
summary: kube-state-metrics is experiencing errors in list operations.
|
||||||
expr: |
|
expr: |
|
||||||
(sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m]))
|
(sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m]))
|
||||||
/
|
/
|
||||||
|
@ -802,10 +803,11 @@ spec:
|
||||||
severity: critical
|
severity: critical
|
||||||
- alert: KubeStateMetricsWatchErrors
|
- alert: KubeStateMetricsWatchErrors
|
||||||
annotations:
|
annotations:
|
||||||
message: kube-state-metrics is experiencing errors at an elevated rate in
|
description: kube-state-metrics is experiencing errors at an elevated rate
|
||||||
watch operations. This is likely causing it to not be able to expose metrics
|
in watch operations. This is likely causing it to not be able to expose
|
||||||
about Kubernetes objects correctly or at all.
|
metrics about Kubernetes objects correctly or at all.
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatemetricswatcherrors
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatemetricswatcherrors
|
||||||
|
summary: kube-state-metrics is experiencing errors in watch operations.
|
||||||
expr: |
|
expr: |
|
||||||
(sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m]))
|
(sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m]))
|
||||||
/
|
/
|
||||||
|
@ -1024,9 +1026,10 @@ spec:
|
||||||
rules:
|
rules:
|
||||||
- alert: KubePodCrashLooping
|
- alert: KubePodCrashLooping
|
||||||
annotations:
|
annotations:
|
||||||
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container
|
description: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container
|
||||||
}}) is restarting {{ printf "%.2f" $value }} times / 5 minutes.
|
}}) is restarting {{ printf "%.2f" $value }} times / 5 minutes.
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping
|
||||||
|
summary: Pod is crash looping.
|
||||||
expr: |
|
expr: |
|
||||||
rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[5m]) * 60 * 5 > 0
|
rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[5m]) * 60 * 5 > 0
|
||||||
for: 15m
|
for: 15m
|
||||||
|
@ -1034,9 +1037,10 @@ spec:
|
||||||
severity: warning
|
severity: warning
|
||||||
- alert: KubePodNotReady
|
- alert: KubePodNotReady
|
||||||
annotations:
|
annotations:
|
||||||
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready
|
description: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready
|
||||||
state for longer than 15 minutes.
|
state for longer than 15 minutes.
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready
|
||||||
|
summary: Pod has been in a non-ready state for more than 15 minutes.
|
||||||
expr: |
|
expr: |
|
||||||
sum by (namespace, pod) (
|
sum by (namespace, pod) (
|
||||||
max by(namespace, pod) (
|
max by(namespace, pod) (
|
||||||
|
@ -1050,10 +1054,11 @@ spec:
|
||||||
severity: warning
|
severity: warning
|
||||||
- alert: KubeDeploymentGenerationMismatch
|
- alert: KubeDeploymentGenerationMismatch
|
||||||
annotations:
|
annotations:
|
||||||
message: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment
|
description: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment
|
||||||
}} does not match, this indicates that the Deployment has failed but has
|
}} does not match, this indicates that the Deployment has failed but has
|
||||||
not been rolled back.
|
not been rolled back.
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentgenerationmismatch
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentgenerationmismatch
|
||||||
|
summary: Deployment generation mismatch due to possible roll-back
|
||||||
expr: |
|
expr: |
|
||||||
kube_deployment_status_observed_generation{job="kube-state-metrics"}
|
kube_deployment_status_observed_generation{job="kube-state-metrics"}
|
||||||
!=
|
!=
|
||||||
|
@ -1063,9 +1068,10 @@ spec:
|
||||||
severity: warning
|
severity: warning
|
||||||
- alert: KubeDeploymentReplicasMismatch
|
- alert: KubeDeploymentReplicasMismatch
|
||||||
annotations:
|
annotations:
|
||||||
message: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not
|
description: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has
|
||||||
matched the expected number of replicas for longer than 15 minutes.
|
not matched the expected number of replicas for longer than 15 minutes.
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentreplicasmismatch
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentreplicasmismatch
|
||||||
|
summary: Deployment has not matched the expected number of replicas.
|
||||||
expr: |
|
expr: |
|
||||||
(
|
(
|
||||||
kube_deployment_spec_replicas{job="kube-state-metrics"}
|
kube_deployment_spec_replicas{job="kube-state-metrics"}
|
||||||
|
@ -1081,9 +1087,10 @@ spec:
|
||||||
severity: warning
|
severity: warning
|
||||||
- alert: KubeStatefulSetReplicasMismatch
|
- alert: KubeStatefulSetReplicasMismatch
|
||||||
annotations:
|
annotations:
|
||||||
message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has
|
description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }}
|
||||||
not matched the expected number of replicas for longer than 15 minutes.
|
has not matched the expected number of replicas for longer than 15 minutes.
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetreplicasmismatch
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetreplicasmismatch
|
||||||
|
summary: StatefulSet has not matched the expected number of replicas.
|
||||||
expr: |
|
expr: |
|
||||||
(
|
(
|
||||||
kube_statefulset_status_replicas_ready{job="kube-state-metrics"}
|
kube_statefulset_status_replicas_ready{job="kube-state-metrics"}
|
||||||
|
@ -1099,10 +1106,11 @@ spec:
|
||||||
severity: warning
|
severity: warning
|
||||||
- alert: KubeStatefulSetGenerationMismatch
|
- alert: KubeStatefulSetGenerationMismatch
|
||||||
annotations:
|
annotations:
|
||||||
message: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset
|
description: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset
|
||||||
}} does not match, this indicates that the StatefulSet has failed but has
|
}} does not match, this indicates that the StatefulSet has failed but has
|
||||||
not been rolled back.
|
not been rolled back.
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetgenerationmismatch
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetgenerationmismatch
|
||||||
|
summary: StatefulSet generation mismatch due to possible roll-back
|
||||||
expr: |
|
expr: |
|
||||||
kube_statefulset_status_observed_generation{job="kube-state-metrics"}
|
kube_statefulset_status_observed_generation{job="kube-state-metrics"}
|
||||||
!=
|
!=
|
||||||
|
@ -1112,9 +1120,10 @@ spec:
|
||||||
severity: warning
|
severity: warning
|
||||||
- alert: KubeStatefulSetUpdateNotRolledOut
|
- alert: KubeStatefulSetUpdateNotRolledOut
|
||||||
annotations:
|
annotations:
|
||||||
message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update
|
description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }}
|
||||||
has not been rolled out.
|
update has not been rolled out.
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetupdatenotrolledout
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetupdatenotrolledout
|
||||||
|
summary: StatefulSet update has not been rolled out.
|
||||||
expr: |
|
expr: |
|
||||||
(
|
(
|
||||||
max without (revision) (
|
max without (revision) (
|
||||||
|
@ -1138,9 +1147,10 @@ spec:
|
||||||
severity: warning
|
severity: warning
|
||||||
- alert: KubeDaemonSetRolloutStuck
|
- alert: KubeDaemonSetRolloutStuck
|
||||||
annotations:
|
annotations:
|
||||||
message: DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not
|
description: DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has
|
||||||
finished or progressed for at least 15 minutes.
|
not finished or progressed for at least 15 minutes.
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck
|
||||||
|
summary: DaemonSet rollout is stuck.
|
||||||
expr: |
|
expr: |
|
||||||
(
|
(
|
||||||
(
|
(
|
||||||
|
@ -1170,9 +1180,10 @@ spec:
|
||||||
severity: warning
|
severity: warning
|
||||||
- alert: KubeContainerWaiting
|
- alert: KubeContainerWaiting
|
||||||
annotations:
|
annotations:
|
||||||
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} container {{ $labels.container}}
|
description: Pod {{ $labels.namespace }}/{{ $labels.pod }} container {{ $labels.container}}
|
||||||
has been in waiting state for longer than 1 hour.
|
has been in waiting state for longer than 1 hour.
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontainerwaiting
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontainerwaiting
|
||||||
|
summary: Pod container waiting longer than 1 hour
|
||||||
expr: |
|
expr: |
|
||||||
sum by (namespace, pod, container) (kube_pod_container_status_waiting_reason{job="kube-state-metrics"}) > 0
|
sum by (namespace, pod, container) (kube_pod_container_status_waiting_reason{job="kube-state-metrics"}) > 0
|
||||||
for: 1h
|
for: 1h
|
||||||
|
@ -1180,9 +1191,10 @@ spec:
|
||||||
severity: warning
|
severity: warning
|
||||||
- alert: KubeDaemonSetNotScheduled
|
- alert: KubeDaemonSetNotScheduled
|
||||||
annotations:
|
annotations:
|
||||||
message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
|
description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
|
||||||
}} are not scheduled.'
|
}} are not scheduled.'
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetnotscheduled
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetnotscheduled
|
||||||
|
summary: DaemonSet pods are not scheduled.
|
||||||
expr: |
|
expr: |
|
||||||
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"}
|
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"}
|
||||||
-
|
-
|
||||||
|
@ -1192,9 +1204,10 @@ spec:
|
||||||
severity: warning
|
severity: warning
|
||||||
- alert: KubeDaemonSetMisScheduled
|
- alert: KubeDaemonSetMisScheduled
|
||||||
annotations:
|
annotations:
|
||||||
message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
|
description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
|
||||||
}} are running where they are not supposed to run.'
|
}} are running where they are not supposed to run.'
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetmisscheduled
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetmisscheduled
|
||||||
|
summary: DaemonSet pods are misscheduled.
|
||||||
expr: |
|
expr: |
|
||||||
kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} > 0
|
kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} > 0
|
||||||
for: 15m
|
for: 15m
|
||||||
|
@ -1202,9 +1215,10 @@ spec:
|
||||||
severity: warning
|
severity: warning
|
||||||
- alert: KubeJobCompletion
|
- alert: KubeJobCompletion
|
||||||
annotations:
|
annotations:
|
||||||
message: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more
|
description: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking
|
||||||
than 12 hours to complete.
|
more than 12 hours to complete.
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobcompletion
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobcompletion
|
||||||
|
summary: Job did not complete in time
|
||||||
expr: |
|
expr: |
|
||||||
kube_job_spec_completions{job="kube-state-metrics"} - kube_job_status_succeeded{job="kube-state-metrics"} > 0
|
kube_job_spec_completions{job="kube-state-metrics"} - kube_job_status_succeeded{job="kube-state-metrics"} > 0
|
||||||
for: 12h
|
for: 12h
|
||||||
|
@ -1212,8 +1226,10 @@ spec:
|
||||||
severity: warning
|
severity: warning
|
||||||
- alert: KubeJobFailed
|
- alert: KubeJobFailed
|
||||||
annotations:
|
annotations:
|
||||||
message: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete.
|
description: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to
|
||||||
|
complete.
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobfailed
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobfailed
|
||||||
|
summary: Job failed to complete.
|
||||||
expr: |
|
expr: |
|
||||||
kube_job_failed{job="kube-state-metrics"} > 0
|
kube_job_failed{job="kube-state-metrics"} > 0
|
||||||
for: 15m
|
for: 15m
|
||||||
|
@ -1221,9 +1237,10 @@ spec:
|
||||||
severity: warning
|
severity: warning
|
||||||
- alert: KubeHpaReplicasMismatch
|
- alert: KubeHpaReplicasMismatch
|
||||||
annotations:
|
annotations:
|
||||||
message: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has not matched the
|
description: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has not matched
|
||||||
desired number of replicas for longer than 15 minutes.
|
the desired number of replicas for longer than 15 minutes.
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubehpareplicasmismatch
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubehpareplicasmismatch
|
||||||
|
summary: HPA has not matched desired number of replicas.
|
||||||
expr: |
|
expr: |
|
||||||
(kube_hpa_status_desired_replicas{job="kube-state-metrics"}
|
(kube_hpa_status_desired_replicas{job="kube-state-metrics"}
|
||||||
!=
|
!=
|
||||||
|
@ -1235,9 +1252,10 @@ spec:
|
||||||
severity: warning
|
severity: warning
|
||||||
- alert: KubeHpaMaxedOut
|
- alert: KubeHpaMaxedOut
|
||||||
annotations:
|
annotations:
|
||||||
message: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has been running at
|
description: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has been running
|
||||||
max replicas for longer than 15 minutes.
|
at max replicas for longer than 15 minutes.
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubehpamaxedout
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubehpamaxedout
|
||||||
|
summary: HPA is running at max replicas
|
||||||
expr: |
|
expr: |
|
||||||
kube_hpa_status_current_replicas{job="kube-state-metrics"}
|
kube_hpa_status_current_replicas{job="kube-state-metrics"}
|
||||||
==
|
==
|
||||||
|
@ -1249,9 +1267,10 @@ spec:
|
||||||
rules:
|
rules:
|
||||||
- alert: KubeCPUOvercommit
|
- alert: KubeCPUOvercommit
|
||||||
annotations:
|
annotations:
|
||||||
message: Cluster has overcommitted CPU resource requests for Pods and cannot
|
description: Cluster has overcommitted CPU resource requests for Pods and
|
||||||
tolerate node failure.
|
cannot tolerate node failure.
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit
|
||||||
|
summary: Cluster has overcommitted CPU resource requests.
|
||||||
expr: |
|
expr: |
|
||||||
sum(namespace:kube_pod_container_resource_requests_cpu_cores:sum{})
|
sum(namespace:kube_pod_container_resource_requests_cpu_cores:sum{})
|
||||||
/
|
/
|
||||||
|
@ -1263,9 +1282,10 @@ spec:
|
||||||
severity: warning
|
severity: warning
|
||||||
- alert: KubeMemoryOvercommit
|
- alert: KubeMemoryOvercommit
|
||||||
annotations:
|
annotations:
|
||||||
message: Cluster has overcommitted memory resource requests for Pods and cannot
|
description: Cluster has overcommitted memory resource requests for Pods and
|
||||||
tolerate node failure.
|
cannot tolerate node failure.
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememoryovercommit
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememoryovercommit
|
||||||
|
summary: Cluster has overcommitted memory resource requests.
|
||||||
expr: |
|
expr: |
|
||||||
sum(namespace:kube_pod_container_resource_requests_memory_bytes:sum{})
|
sum(namespace:kube_pod_container_resource_requests_memory_bytes:sum{})
|
||||||
/
|
/
|
||||||
|
@ -1279,8 +1299,9 @@ spec:
|
||||||
severity: warning
|
severity: warning
|
||||||
- alert: KubeCPUQuotaOvercommit
|
- alert: KubeCPUQuotaOvercommit
|
||||||
annotations:
|
annotations:
|
||||||
message: Cluster has overcommitted CPU resource requests for Namespaces.
|
description: Cluster has overcommitted CPU resource requests for Namespaces.
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuquotaovercommit
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuquotaovercommit
|
||||||
|
summary: Cluster has overcommitted CPU resource requests.
|
||||||
expr: |
|
expr: |
|
||||||
sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="cpu"})
|
sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="cpu"})
|
||||||
/
|
/
|
||||||
|
@ -1291,8 +1312,9 @@ spec:
|
||||||
severity: warning
|
severity: warning
|
||||||
- alert: KubeMemoryQuotaOvercommit
|
- alert: KubeMemoryQuotaOvercommit
|
||||||
annotations:
|
annotations:
|
||||||
message: Cluster has overcommitted memory resource requests for Namespaces.
|
description: Cluster has overcommitted memory resource requests for Namespaces.
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememoryquotaovercommit
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememoryquotaovercommit
|
||||||
|
summary: Cluster has overcommitted memory resource requests.
|
||||||
expr: |
|
expr: |
|
||||||
sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="memory"})
|
sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="memory"})
|
||||||
/
|
/
|
||||||
|
@ -1303,9 +1325,10 @@ spec:
|
||||||
severity: warning
|
severity: warning
|
||||||
- alert: KubeQuotaFullyUsed
|
- alert: KubeQuotaFullyUsed
|
||||||
annotations:
|
annotations:
|
||||||
message: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage
|
description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage
|
||||||
}} of its {{ $labels.resource }} quota.
|
}} of its {{ $labels.resource }} quota.
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotafullyused
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotafullyused
|
||||||
|
summary: Namespace quota is fully used.
|
||||||
expr: |
|
expr: |
|
||||||
kube_resourcequota{job="kube-state-metrics", type="used"}
|
kube_resourcequota{job="kube-state-metrics", type="used"}
|
||||||
/ ignoring(instance, job, type)
|
/ ignoring(instance, job, type)
|
||||||
|
@ -1316,10 +1339,11 @@ spec:
|
||||||
severity: info
|
severity: info
|
||||||
- alert: CPUThrottlingHigh
|
- alert: CPUThrottlingHigh
|
||||||
annotations:
|
annotations:
|
||||||
message: '{{ $value | humanizePercentage }} throttling of CPU in namespace
|
description: '{{ $value | humanizePercentage }} throttling of CPU in namespace
|
||||||
{{ $labels.namespace }} for container {{ $labels.container }} in pod {{
|
{{ $labels.namespace }} for container {{ $labels.container }} in pod {{
|
||||||
$labels.pod }}.'
|
$labels.pod }}.'
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-cputhrottlinghigh
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-cputhrottlinghigh
|
||||||
|
summary: Processes experience elevated CPU throttling.
|
||||||
expr: |
|
expr: |
|
||||||
sum(increase(container_cpu_cfs_throttled_periods_total{container!="", }[5m])) by (container, pod, namespace)
|
sum(increase(container_cpu_cfs_throttled_periods_total{container!="", }[5m])) by (container, pod, namespace)
|
||||||
/
|
/
|
||||||
|
@ -1332,10 +1356,11 @@ spec:
|
||||||
rules:
|
rules:
|
||||||
- alert: KubePersistentVolumeFillingUp
|
- alert: KubePersistentVolumeFillingUp
|
||||||
annotations:
|
annotations:
|
||||||
message: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim
|
description: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim
|
||||||
}} in Namespace {{ $labels.namespace }} is only {{ $value | humanizePercentage
|
}} in Namespace {{ $labels.namespace }} is only {{ $value | humanizePercentage
|
||||||
}} free.
|
}} free.
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefillingup
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefillingup
|
||||||
|
summary: PersistentVolume is filling up.
|
||||||
expr: |
|
expr: |
|
||||||
kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"}
|
kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"}
|
||||||
/
|
/
|
||||||
|
@ -1346,10 +1371,12 @@ spec:
|
||||||
severity: critical
|
severity: critical
|
||||||
- alert: KubePersistentVolumeFillingUp
|
- alert: KubePersistentVolumeFillingUp
|
||||||
annotations:
|
annotations:
|
||||||
message: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim
|
description: Based on recent sampling, the PersistentVolume claimed by {{
|
||||||
}} in Namespace {{ $labels.namespace }} is expected to fill up within four
|
$labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is
|
||||||
days. Currently {{ $value | humanizePercentage }} is available.
|
expected to fill up within four days. Currently {{ $value | humanizePercentage
|
||||||
|
}} is available.
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefillingup
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefillingup
|
||||||
|
summary: PersistentVolume is filling up.
|
||||||
expr: |
|
expr: |
|
||||||
(
|
(
|
||||||
kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"}
|
kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"}
|
||||||
|
@ -1363,9 +1390,10 @@ spec:
|
||||||
severity: warning
|
severity: warning
|
||||||
- alert: KubePersistentVolumeErrors
|
- alert: KubePersistentVolumeErrors
|
||||||
annotations:
|
annotations:
|
||||||
message: The persistent volume {{ $labels.persistentvolume }} has status {{
|
description: The persistent volume {{ $labels.persistentvolume }} has status
|
||||||
$labels.phase }}.
|
{{ $labels.phase }}.
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeerrors
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeerrors
|
||||||
|
summary: PersistentVolume is having issues with provisioning.
|
||||||
expr: |
|
expr: |
|
||||||
kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0
|
kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0
|
||||||
for: 5m
|
for: 5m
|
||||||
|
@ -1375,9 +1403,10 @@ spec:
|
||||||
rules:
|
rules:
|
||||||
- alert: KubeVersionMismatch
|
- alert: KubeVersionMismatch
|
||||||
annotations:
|
annotations:
|
||||||
message: There are {{ $value }} different semantic versions of Kubernetes
|
description: There are {{ $value }} different semantic versions of Kubernetes
|
||||||
components running.
|
components running.
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeversionmismatch
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeversionmismatch
|
||||||
|
summary: Different semantic versions of Kubernetes components running.
|
||||||
expr: |
|
expr: |
|
||||||
count(count by (gitVersion) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"gitVersion","$1","gitVersion","(v[0-9]*.[0-9]*).*"))) > 1
|
count(count by (gitVersion) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"gitVersion","$1","gitVersion","(v[0-9]*.[0-9]*).*"))) > 1
|
||||||
for: 15m
|
for: 15m
|
||||||
|
@ -1385,9 +1414,10 @@ spec:
|
||||||
severity: warning
|
severity: warning
|
||||||
- alert: KubeClientErrors
|
- alert: KubeClientErrors
|
||||||
annotations:
|
annotations:
|
||||||
message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance
|
description: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance
|
||||||
}}' is experiencing {{ $value | humanizePercentage }} errors.'
|
}}' is experiencing {{ $value | humanizePercentage }} errors.
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors
|
||||||
|
summary: Kubernetes API server client is experiencing errors.
|
||||||
expr: |
|
expr: |
|
||||||
(sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (instance, job)
|
(sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (instance, job)
|
||||||
/
|
/
|
||||||
|
@ -1400,8 +1430,9 @@ spec:
|
||||||
rules:
|
rules:
|
||||||
- alert: KubeAPIErrorBudgetBurn
|
- alert: KubeAPIErrorBudgetBurn
|
||||||
annotations:
|
annotations:
|
||||||
message: The API server is burning too much error budget
|
description: The API server is burning too much error budget.
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn
|
||||||
|
summary: The API server is burning too much error budget.
|
||||||
expr: |
|
expr: |
|
||||||
sum(apiserver_request:burnrate1h) > (14.40 * 0.01000)
|
sum(apiserver_request:burnrate1h) > (14.40 * 0.01000)
|
||||||
and
|
and
|
||||||
|
@ -1413,8 +1444,9 @@ spec:
|
||||||
short: 5m
|
short: 5m
|
||||||
- alert: KubeAPIErrorBudgetBurn
|
- alert: KubeAPIErrorBudgetBurn
|
||||||
annotations:
|
annotations:
|
||||||
message: The API server is burning too much error budget
|
description: The API server is burning too much error budget.
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn
|
||||||
|
summary: The API server is burning too much error budget.
|
||||||
expr: |
|
expr: |
|
||||||
sum(apiserver_request:burnrate6h) > (6.00 * 0.01000)
|
sum(apiserver_request:burnrate6h) > (6.00 * 0.01000)
|
||||||
and
|
and
|
||||||
|
@ -1426,8 +1458,9 @@ spec:
|
||||||
short: 30m
|
short: 30m
|
||||||
- alert: KubeAPIErrorBudgetBurn
|
- alert: KubeAPIErrorBudgetBurn
|
||||||
annotations:
|
annotations:
|
||||||
message: The API server is burning too much error budget
|
description: The API server is burning too much error budget.
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn
|
||||||
|
summary: The API server is burning too much error budget.
|
||||||
expr: |
|
expr: |
|
||||||
sum(apiserver_request:burnrate1d) > (3.00 * 0.01000)
|
sum(apiserver_request:burnrate1d) > (3.00 * 0.01000)
|
||||||
and
|
and
|
||||||
|
@ -1439,8 +1472,9 @@ spec:
|
||||||
short: 2h
|
short: 2h
|
||||||
- alert: KubeAPIErrorBudgetBurn
|
- alert: KubeAPIErrorBudgetBurn
|
||||||
annotations:
|
annotations:
|
||||||
message: The API server is burning too much error budget
|
description: The API server is burning too much error budget.
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn
|
||||||
|
summary: The API server is burning too much error budget.
|
||||||
expr: |
|
expr: |
|
||||||
sum(apiserver_request:burnrate3d) > (1.00 * 0.01000)
|
sum(apiserver_request:burnrate3d) > (1.00 * 0.01000)
|
||||||
and
|
and
|
||||||
|
@ -1454,38 +1488,42 @@ spec:
|
||||||
rules:
|
rules:
|
||||||
- alert: KubeClientCertificateExpiration
|
- alert: KubeClientCertificateExpiration
|
||||||
annotations:
|
annotations:
|
||||||
message: A client certificate used to authenticate to the apiserver is expiring
|
description: A client certificate used to authenticate to the apiserver is
|
||||||
in less than 7.0 days.
|
expiring in less than 7.0 days.
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
|
||||||
|
summary: Client certificate is about to expire.
|
||||||
expr: |
|
expr: |
|
||||||
apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800
|
apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
- alert: KubeClientCertificateExpiration
|
- alert: KubeClientCertificateExpiration
|
||||||
annotations:
|
annotations:
|
||||||
message: A client certificate used to authenticate to the apiserver is expiring
|
description: A client certificate used to authenticate to the apiserver is
|
||||||
in less than 24.0 hours.
|
expiring in less than 24.0 hours.
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
|
||||||
|
summary: Client certificate is about to expire.
|
||||||
expr: |
|
expr: |
|
||||||
apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400
|
apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
- alert: AggregatedAPIErrors
|
- alert: AggregatedAPIErrors
|
||||||
annotations:
|
annotations:
|
||||||
message: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has
|
description: An aggregated API {{ $labels.name }}/{{ $labels.namespace }}
|
||||||
reported errors. The number of errors have increased for it in the past
|
has reported errors. The number of errors has increased for it in the past
|
||||||
five minutes. High values indicate that the availability of the service
|
five minutes. High values indicate that the availability of the service
|
||||||
changes too often.
|
changes too often.
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-aggregatedapierrors
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-aggregatedapierrors
|
||||||
|
summary: An aggregated API has reported errors.
|
||||||
expr: |
|
expr: |
|
||||||
sum by(name, namespace)(increase(aggregator_unavailable_apiservice_count[5m])) > 2
|
sum by(name, namespace)(increase(aggregator_unavailable_apiservice_count[5m])) > 2
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
- alert: AggregatedAPIDown
|
- alert: AggregatedAPIDown
|
||||||
annotations:
|
annotations:
|
||||||
message: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has
|
description: An aggregated API {{ $labels.name }}/{{ $labels.namespace }}
|
||||||
been only {{ $value | humanize }}% available over the last 5m.
|
has been only {{ $value | humanize }}% available over the last 5m.
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-aggregatedapidown
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-aggregatedapidown
|
||||||
|
summary: An aggregated API is down.
|
||||||
expr: |
|
expr: |
|
||||||
(1 - max by(name, namespace)(avg_over_time(aggregator_unavailable_apiservice[5m]))) * 100 < 90
|
(1 - max by(name, namespace)(avg_over_time(aggregator_unavailable_apiservice[5m]))) * 100 < 90
|
||||||
for: 5m
|
for: 5m
|
||||||
|
@ -1493,8 +1531,9 @@ spec:
|
||||||
severity: warning
|
severity: warning
|
||||||
- alert: KubeAPIDown
|
- alert: KubeAPIDown
|
||||||
annotations:
|
annotations:
|
||||||
message: KubeAPI has disappeared from Prometheus target discovery.
|
description: KubeAPI has disappeared from Prometheus target discovery.
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapidown
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapidown
|
||||||
|
summary: Target disappeared from Prometheus target discovery.
|
||||||
expr: |
|
expr: |
|
||||||
absent(up{job="apiserver"} == 1)
|
absent(up{job="apiserver"} == 1)
|
||||||
for: 15m
|
for: 15m
|
||||||
|
@ -1504,8 +1543,9 @@ spec:
|
||||||
rules:
|
rules:
|
||||||
- alert: KubeNodeNotReady
|
- alert: KubeNodeNotReady
|
||||||
annotations:
|
annotations:
|
||||||
message: '{{ $labels.node }} has been unready for more than 15 minutes.'
|
description: '{{ $labels.node }} has been unready for more than 15 minutes.'
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodenotready
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodenotready
|
||||||
|
summary: Node is not ready.
|
||||||
expr: |
|
expr: |
|
||||||
kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0
|
kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0
|
||||||
for: 15m
|
for: 15m
|
||||||
|
@ -1513,17 +1553,20 @@ spec:
|
||||||
severity: warning
|
severity: warning
|
||||||
- alert: KubeNodeUnreachable
|
- alert: KubeNodeUnreachable
|
||||||
annotations:
|
annotations:
|
||||||
message: '{{ $labels.node }} is unreachable and some workloads may be rescheduled.'
|
description: '{{ $labels.node }} is unreachable and some workloads may be
|
||||||
|
rescheduled.'
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodeunreachable
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodeunreachable
|
||||||
|
summary: Node is unreachable.
|
||||||
expr: |
|
expr: |
|
||||||
(kube_node_spec_taint{job="kube-state-metrics",key="node.kubernetes.io/unreachable",effect="NoSchedule"} unless ignoring(key,value) kube_node_spec_taint{job="kube-state-metrics",key=~"ToBeDeletedByClusterAutoscaler|cloud.google.com/impending-node-termination|aws-node-termination-handler/spot-itn"}) == 1
|
(kube_node_spec_taint{job="kube-state-metrics",key="node.kubernetes.io/unreachable",effect="NoSchedule"} unless ignoring(key,value) kube_node_spec_taint{job="kube-state-metrics",key=~"ToBeDeletedByClusterAutoscaler|cloud.google.com/impending-node-termination|aws-node-termination-handler/spot-itn"}) == 1
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
- alert: KubeletTooManyPods
|
- alert: KubeletTooManyPods
|
||||||
annotations:
|
annotations:
|
||||||
message: Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage
|
description: Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage
|
||||||
}} of its Pod capacity.
|
}} of its Pod capacity.
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods
|
||||||
|
summary: Kubelet is running at capacity.
|
||||||
expr: |
|
expr: |
|
||||||
count by(node) (
|
count by(node) (
|
||||||
(kube_pod_status_phase{job="kube-state-metrics",phase="Running"} == 1) * on(instance,pod,namespace,cluster) group_left(node) topk by(instance,pod,namespace,cluster) (1, kube_pod_info{job="kube-state-metrics"})
|
(kube_pod_status_phase{job="kube-state-metrics",phase="Running"} == 1) * on(instance,pod,namespace,cluster) group_left(node) topk by(instance,pod,namespace,cluster) (1, kube_pod_info{job="kube-state-metrics"})
|
||||||
|
@ -1537,9 +1580,10 @@ spec:
|
||||||
severity: warning
|
severity: warning
|
||||||
- alert: KubeNodeReadinessFlapping
|
- alert: KubeNodeReadinessFlapping
|
||||||
annotations:
|
annotations:
|
||||||
message: The readiness status of node {{ $labels.node }} has changed {{ $value
|
description: The readiness status of node {{ $labels.node }} has changed {{
|
||||||
}} times in the last 15 minutes.
|
$value }} times in the last 15 minutes.
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodereadinessflapping
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodereadinessflapping
|
||||||
|
summary: Node readiness status is flapping.
|
||||||
expr: |
|
expr: |
|
||||||
sum(changes(kube_node_status_condition{status="true",condition="Ready"}[15m])) by (node) > 2
|
sum(changes(kube_node_status_condition{status="true",condition="Ready"}[15m])) by (node) > 2
|
||||||
for: 15m
|
for: 15m
|
||||||
|
@ -1547,9 +1591,10 @@ spec:
|
||||||
severity: warning
|
severity: warning
|
||||||
- alert: KubeletPlegDurationHigh
|
- alert: KubeletPlegDurationHigh
|
||||||
annotations:
|
annotations:
|
||||||
message: The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration
|
description: The Kubelet Pod Lifecycle Event Generator has a 99th percentile
|
||||||
of {{ $value }} seconds on node {{ $labels.node }}.
|
duration of {{ $value }} seconds on node {{ $labels.node }}.
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletplegdurationhigh
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletplegdurationhigh
|
||||||
|
summary: Kubelet Pod Lifecycle Event Generator is taking too long to relist.
|
||||||
expr: |
|
expr: |
|
||||||
node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"} >= 10
|
node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"} >= 10
|
||||||
for: 5m
|
for: 5m
|
||||||
|
@ -1557,9 +1602,10 @@ spec:
|
||||||
severity: warning
|
severity: warning
|
||||||
- alert: KubeletPodStartUpLatencyHigh
|
- alert: KubeletPodStartUpLatencyHigh
|
||||||
annotations:
|
annotations:
|
||||||
message: Kubelet Pod startup 99th percentile latency is {{ $value }} seconds
|
description: Kubelet Pod startup 99th percentile latency is {{ $value }} seconds
|
||||||
on node {{ $labels.node }}.
|
on node {{ $labels.node }}.
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletpodstartuplatencyhigh
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletpodstartuplatencyhigh
|
||||||
|
summary: Kubelet Pod startup latency is too high.
|
||||||
expr: |
|
expr: |
|
||||||
histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (instance, le)) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"} > 60
|
histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (instance, le)) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"} > 60
|
||||||
for: 15m
|
for: 15m
|
||||||
|
@ -1567,8 +1613,9 @@ spec:
|
||||||
severity: warning
|
severity: warning
|
||||||
- alert: KubeletDown
|
- alert: KubeletDown
|
||||||
annotations:
|
annotations:
|
||||||
message: Kubelet has disappeared from Prometheus target discovery.
|
description: Kubelet has disappeared from Prometheus target discovery.
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletdown
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletdown
|
||||||
|
summary: Target disappeared from Prometheus target discovery.
|
||||||
expr: |
|
expr: |
|
||||||
absent(up{job="kubelet", metrics_path="/metrics"} == 1)
|
absent(up{job="kubelet", metrics_path="/metrics"} == 1)
|
||||||
for: 15m
|
for: 15m
|
||||||
|
@ -1578,8 +1625,9 @@ spec:
|
||||||
rules:
|
rules:
|
||||||
- alert: KubeSchedulerDown
|
- alert: KubeSchedulerDown
|
||||||
annotations:
|
annotations:
|
||||||
message: KubeScheduler has disappeared from Prometheus target discovery.
|
description: KubeScheduler has disappeared from Prometheus target discovery.
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeschedulerdown
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeschedulerdown
|
||||||
|
summary: Target disappeared from Prometheus target discovery.
|
||||||
expr: |
|
expr: |
|
||||||
absent(up{job="kube-scheduler"} == 1)
|
absent(up{job="kube-scheduler"} == 1)
|
||||||
for: 15m
|
for: 15m
|
||||||
|
@ -1589,8 +1637,10 @@ spec:
|
||||||
rules:
|
rules:
|
||||||
- alert: KubeControllerManagerDown
|
- alert: KubeControllerManagerDown
|
||||||
annotations:
|
annotations:
|
||||||
message: KubeControllerManager has disappeared from Prometheus target discovery.
|
description: KubeControllerManager has disappeared from Prometheus target
|
||||||
|
discovery.
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontrollermanagerdown
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontrollermanagerdown
|
||||||
|
summary: Target disappeared from Prometheus target discovery.
|
||||||
expr: |
|
expr: |
|
||||||
absent(up{job="kube-controller-manager"} == 1)
|
absent(up{job="kube-controller-manager"} == 1)
|
||||||
for: 15m
|
for: 15m
|
||||||
|
@ -1875,7 +1925,7 @@ spec:
|
||||||
message: Errors while performing List operations in controller {{$labels.controller}}
|
message: Errors while performing List operations in controller {{$labels.controller}}
|
||||||
in {{$labels.namespace}} namespace.
|
in {{$labels.namespace}} namespace.
|
||||||
expr: |
|
expr: |
|
||||||
(sum by (controller,namespace) (rate(prometheus_operator_list_operations_failed_total{job="prometheus-operator",namespace="monitoring"}[1h])) / sum by (controller,namespace) (rate(prometheus_operator_list_operations_total{job="prometheus-operator",namespace="monitoring"}[1h]))) > 0.4
|
(sum by (controller,namespace) (rate(prometheus_operator_list_operations_failed_total{job="prometheus-operator",namespace="monitoring"}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_list_operations_total{job="prometheus-operator",namespace="monitoring"}[10m]))) > 0.4
|
||||||
for: 15m
|
for: 15m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
|
@ -1884,7 +1934,7 @@ spec:
|
||||||
message: Errors while performing Watch operations in controller {{$labels.controller}}
|
message: Errors while performing Watch operations in controller {{$labels.controller}}
|
||||||
in {{$labels.namespace}} namespace.
|
in {{$labels.namespace}} namespace.
|
||||||
expr: |
|
expr: |
|
||||||
(sum by (controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job="prometheus-operator",namespace="monitoring"}[1h])) / sum by (controller,namespace) (rate(prometheus_operator_watch_operations_total{job="prometheus-operator",namespace="monitoring"}[1h]))) > 0.4
|
(sum by (controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job="prometheus-operator",namespace="monitoring"}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_watch_operations_total{job="prometheus-operator",namespace="monitoring"}[10m]))) > 0.4
|
||||||
for: 15m
|
for: 15m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
|
|
|
@ -68,6 +68,18 @@ spec:
|
||||||
scheme: https
|
scheme: https
|
||||||
tlsConfig:
|
tlsConfig:
|
||||||
insecureSkipVerify: true
|
insecureSkipVerify: true
|
||||||
|
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
|
||||||
|
honorLabels: true
|
||||||
|
interval: 30s
|
||||||
|
path: /metrics/probes
|
||||||
|
port: https-metrics
|
||||||
|
relabelings:
|
||||||
|
- sourceLabels:
|
||||||
|
- __metrics_path__
|
||||||
|
targetLabel: metrics_path
|
||||||
|
scheme: https
|
||||||
|
tlsConfig:
|
||||||
|
insecureSkipVerify: true
|
||||||
jobLabel: k8s-app
|
jobLabel: k8s-app
|
||||||
namespaceSelector:
|
namespaceSelector:
|
||||||
matchNames:
|
matchNames:
|
||||||
|
|
|
@ -645,7 +645,8 @@ spec:
|
||||||
type: object
|
type: object
|
||||||
type: object
|
type: object
|
||||||
baseImage:
|
baseImage:
|
||||||
description: Base image that is used to deploy pods, without tag.
|
description: 'Base image that is used to deploy pods, without tag.
|
||||||
|
Deprecated: use ''image'' instead'
|
||||||
type: string
|
type: string
|
||||||
clusterAdvertiseAddress:
|
clusterAdvertiseAddress:
|
||||||
description: 'ClusterAdvertiseAddress is the explicit address to advertise
|
description: 'ClusterAdvertiseAddress is the explicit address to advertise
|
||||||
|
@ -3014,10 +3015,11 @@ spec:
|
||||||
to use to run the Prometheus Pods.
|
to use to run the Prometheus Pods.
|
||||||
type: string
|
type: string
|
||||||
sha:
|
sha:
|
||||||
description: SHA of Alertmanager container image to be deployed. Defaults
|
description: 'SHA of Alertmanager container image to be deployed.
|
||||||
to the value of `version`. Similar to a tag, but the SHA explicitly
|
Defaults to the value of `version`. Similar to a tag, but the SHA
|
||||||
deploys an immutable container image. Version and Tag are ignored
|
explicitly deploys an immutable container image. Version and Tag
|
||||||
if SHA is set.
|
are ignored if SHA is set. Deprecated: use ''image'' instead. The
|
||||||
|
image digest can be specified as part of the image URL.'
|
||||||
type: string
|
type: string
|
||||||
storage:
|
storage:
|
||||||
description: Storage is the definition of how storage will be used
|
description: Storage is the definition of how storage will be used
|
||||||
|
@ -3279,8 +3281,10 @@ spec:
|
||||||
type: object
|
type: object
|
||||||
type: object
|
type: object
|
||||||
tag:
|
tag:
|
||||||
description: Tag of Alertmanager container image to be deployed. Defaults
|
description: 'Tag of Alertmanager container image to be deployed.
|
||||||
to the value of `version`. Version is ignored if Tag is set.
|
Defaults to the value of `version`. Version is ignored if Tag is
|
||||||
|
set. Deprecated: use ''image'' instead. The image tag can be specified
|
||||||
|
as part of the image URL.'
|
||||||
type: string
|
type: string
|
||||||
tolerations:
|
tolerations:
|
||||||
description: If specified, the pod's tolerations.
|
description: If specified, the pod's tolerations.
|
||||||
|
|
|
@ -900,6 +900,11 @@ spec:
|
||||||
required:
|
required:
|
||||||
- alertmanagers
|
- alertmanagers
|
||||||
type: object
|
type: object
|
||||||
|
allowOverlappingBlocks:
|
||||||
|
description: AllowOverlappingBlocks enables vertical compaction and
|
||||||
|
vertical query merge in Prometheus. This is still experimental in
|
||||||
|
Prometheus so it may change in any upcoming release.
|
||||||
|
type: boolean
|
||||||
apiserverConfig:
|
apiserverConfig:
|
||||||
description: APIServerConfig allows specifying a host and auth methods
|
description: APIServerConfig allows specifying a host and auth methods
|
||||||
to access apiserver. If left empty, Prometheus is assumed to run
|
to access apiserver. If left empty, Prometheus is assumed to run
|
||||||
|
@ -1097,7 +1102,8 @@ spec:
|
||||||
type: boolean
|
type: boolean
|
||||||
type: object
|
type: object
|
||||||
baseImage:
|
baseImage:
|
||||||
description: Base image to use for a Prometheus deployment.
|
description: 'Base image to use for a Prometheus deployment. Deprecated:
|
||||||
|
use ''image'' instead'
|
||||||
type: string
|
type: string
|
||||||
configMaps:
|
configMaps:
|
||||||
description: ConfigMaps is a list of ConfigMaps in the same namespace
|
description: ConfigMaps is a list of ConfigMaps in the same namespace
|
||||||
|
@ -3432,6 +3438,95 @@ spec:
|
||||||
priorityClassName:
|
priorityClassName:
|
||||||
description: Priority class assigned to the Pods
|
description: Priority class assigned to the Pods
|
||||||
type: string
|
type: string
|
||||||
|
probeNamespaceSelector:
|
||||||
|
description: '*Experimental* Namespaces to be selected for Probe discovery.
|
||||||
|
If nil, only check own namespace.'
|
||||||
|
properties:
|
||||||
|
matchExpressions:
|
||||||
|
description: matchExpressions is a list of label selector requirements.
|
||||||
|
The requirements are ANDed.
|
||||||
|
items:
|
||||||
|
description: A label selector requirement is a selector that
|
||||||
|
contains values, a key, and an operator that relates the key
|
||||||
|
and values.
|
||||||
|
properties:
|
||||||
|
key:
|
||||||
|
description: key is the label key that the selector applies
|
||||||
|
to.
|
||||||
|
type: string
|
||||||
|
operator:
|
||||||
|
description: operator represents a key's relationship to
|
||||||
|
a set of values. Valid operators are In, NotIn, Exists
|
||||||
|
and DoesNotExist.
|
||||||
|
type: string
|
||||||
|
values:
|
||||||
|
description: values is an array of string values. If the
|
||||||
|
operator is In or NotIn, the values array must be non-empty.
|
||||||
|
If the operator is Exists or DoesNotExist, the values
|
||||||
|
array must be empty. This array is replaced during a strategic
|
||||||
|
merge patch.
|
||||||
|
items:
|
||||||
|
type: string
|
||||||
|
type: array
|
||||||
|
required:
|
||||||
|
- key
|
||||||
|
- operator
|
||||||
|
type: object
|
||||||
|
type: array
|
||||||
|
matchLabels:
|
||||||
|
additionalProperties:
|
||||||
|
type: string
|
||||||
|
description: matchLabels is a map of {key,value} pairs. A single
|
||||||
|
{key,value} in the matchLabels map is equivalent to an element
|
||||||
|
of matchExpressions, whose key field is "key", the operator
|
||||||
|
is "In", and the values array contains only "value". The requirements
|
||||||
|
are ANDed.
|
||||||
|
type: object
|
||||||
|
type: object
|
||||||
|
probeSelector:
|
||||||
|
description: '*Experimental* Probes to be selected for target discovery.'
|
||||||
|
properties:
|
||||||
|
matchExpressions:
|
||||||
|
description: matchExpressions is a list of label selector requirements.
|
||||||
|
The requirements are ANDed.
|
||||||
|
items:
|
||||||
|
description: A label selector requirement is a selector that
|
||||||
|
contains values, a key, and an operator that relates the key
|
||||||
|
and values.
|
||||||
|
properties:
|
||||||
|
key:
|
||||||
|
description: key is the label key that the selector applies
|
||||||
|
to.
|
||||||
|
type: string
|
||||||
|
operator:
|
||||||
|
description: operator represents a key's relationship to
|
||||||
|
a set of values. Valid operators are In, NotIn, Exists
|
||||||
|
and DoesNotExist.
|
||||||
|
type: string
|
||||||
|
values:
|
||||||
|
description: values is an array of string values. If the
|
||||||
|
operator is In or NotIn, the values array must be non-empty.
|
||||||
|
If the operator is Exists or DoesNotExist, the values
|
||||||
|
array must be empty. This array is replaced during a strategic
|
||||||
|
merge patch.
|
||||||
|
items:
|
||||||
|
type: string
|
||||||
|
type: array
|
||||||
|
required:
|
||||||
|
- key
|
||||||
|
- operator
|
||||||
|
type: object
|
||||||
|
type: array
|
||||||
|
matchLabels:
|
||||||
|
additionalProperties:
|
||||||
|
type: string
|
||||||
|
description: matchLabels is a map of {key,value} pairs. A single
|
||||||
|
{key,value} in the matchLabels map is equivalent to an element
|
||||||
|
of matchExpressions, whose key field is "key", the operator
|
||||||
|
is "In", and the values array contains only "value". The requirements
|
||||||
|
are ANDed.
|
||||||
|
type: object
|
||||||
|
type: object
|
||||||
prometheusExternalLabelName:
|
prometheusExternalLabelName:
|
||||||
description: Name of Prometheus external label used to denote Prometheus
|
description: Name of Prometheus external label used to denote Prometheus
|
||||||
instance name. Defaults to the value of `prometheus`. External label
|
instance name. Defaults to the value of `prometheus`. External label
|
||||||
|
@ -4374,10 +4469,11 @@ spec:
|
||||||
type: object
|
type: object
|
||||||
type: object
|
type: object
|
||||||
sha:
|
sha:
|
||||||
description: SHA of Prometheus container image to be deployed. Defaults
|
description: 'SHA of Prometheus container image to be deployed. Defaults
|
||||||
to the value of `version`. Similar to a tag, but the SHA explicitly
|
to the value of `version`. Similar to a tag, but the SHA explicitly
|
||||||
deploys an immutable container image. Version and Tag are ignored
|
deploys an immutable container image. Version and Tag are ignored
|
||||||
if SHA is set.
|
if SHA is set. Deprecated: use ''image'' instead. The image digest
|
||||||
|
can be specified as part of the image URL.'
|
||||||
type: string
|
type: string
|
||||||
storage:
|
storage:
|
||||||
description: Storage spec to specify how storage shall be used.
|
description: Storage spec to specify how storage shall be used.
|
||||||
|
@ -4638,8 +4734,10 @@ spec:
|
||||||
type: object
|
type: object
|
||||||
type: object
|
type: object
|
||||||
tag:
|
tag:
|
||||||
description: Tag of Prometheus container image to be deployed. Defaults
|
description: 'Tag of Prometheus container image to be deployed. Defaults
|
||||||
to the value of `version`. Version is ignored if Tag is set.
|
to the value of `version`. Version is ignored if Tag is set. Deprecated:
|
||||||
|
use ''image'' instead. The image tag can be specified as part of
|
||||||
|
the image URL.'
|
||||||
type: string
|
type: string
|
||||||
thanos:
|
thanos:
|
||||||
description: "Thanos configuration allows configuring various aspects
|
description: "Thanos configuration allows configuring various aspects
|
||||||
|
@ -4649,7 +4747,8 @@ spec:
|
||||||
without backward compatibility in any release."
|
without backward compatibility in any release."
|
||||||
properties:
|
properties:
|
||||||
baseImage:
|
baseImage:
|
||||||
description: Thanos base image if other than default.
|
description: 'Thanos base image if other than default. Deprecated:
|
||||||
|
use ''image'' instead'
|
||||||
type: string
|
type: string
|
||||||
grpcServerTlsConfig:
|
grpcServerTlsConfig:
|
||||||
description: 'GRPCServerTLSConfig configures the gRPC server from
|
description: 'GRPCServerTLSConfig configures the gRPC server from
|
||||||
|
@ -4842,15 +4941,17 @@ spec:
|
||||||
type: object
|
type: object
|
||||||
type: object
|
type: object
|
||||||
sha:
|
sha:
|
||||||
description: SHA of Thanos container image to be deployed. Defaults
|
description: 'SHA of Thanos container image to be deployed. Defaults
|
||||||
to the value of `version`. Similar to a tag, but the SHA explicitly
|
to the value of `version`. Similar to a tag, but the SHA explicitly
|
||||||
deploys an immutable container image. Version and Tag are ignored
|
deploys an immutable container image. Version and Tag are ignored
|
||||||
if SHA is set.
|
if SHA is set. Deprecated: use ''image'' instead. The image
|
||||||
|
digest can be specified as part of the image URL.'
|
||||||
type: string
|
type: string
|
||||||
tag:
|
tag:
|
||||||
description: Tag of Thanos sidecar container image to be deployed.
|
description: 'Tag of Thanos sidecar container image to be deployed.
|
||||||
Defaults to the value of `version`. Version is ignored if Tag
|
Defaults to the value of `version`. Version is ignored if Tag
|
||||||
is set.
|
is set. Deprecated: use ''image'' instead. The image tag can
|
||||||
|
be specified as part of the image URL.'
|
||||||
type: string
|
type: string
|
||||||
tracingConfig:
|
tracingConfig:
|
||||||
description: TracingConfig configures tracing in Thanos. This
|
description: TracingConfig configures tracing in Thanos. This
|
||||||
|
|
|
@ -4,7 +4,7 @@ metadata:
|
||||||
labels:
|
labels:
|
||||||
app.kubernetes.io/component: controller
|
app.kubernetes.io/component: controller
|
||||||
app.kubernetes.io/name: prometheus-operator
|
app.kubernetes.io/name: prometheus-operator
|
||||||
app.kubernetes.io/version: v0.40.0
|
app.kubernetes.io/version: v0.41.1
|
||||||
name: prometheus-operator
|
name: prometheus-operator
|
||||||
rules:
|
rules:
|
||||||
- apiGroups:
|
- apiGroups:
|
||||||
|
@ -18,6 +18,7 @@ rules:
|
||||||
- thanosrulers/finalizers
|
- thanosrulers/finalizers
|
||||||
- servicemonitors
|
- servicemonitors
|
||||||
- podmonitors
|
- podmonitors
|
||||||
|
- probes
|
||||||
- prometheusrules
|
- prometheusrules
|
||||||
verbs:
|
verbs:
|
||||||
- '*'
|
- '*'
|
||||||
|
|
|
@ -4,7 +4,7 @@ metadata:
|
||||||
labels:
|
labels:
|
||||||
app.kubernetes.io/component: controller
|
app.kubernetes.io/component: controller
|
||||||
app.kubernetes.io/name: prometheus-operator
|
app.kubernetes.io/name: prometheus-operator
|
||||||
app.kubernetes.io/version: v0.40.0
|
app.kubernetes.io/version: v0.41.1
|
||||||
name: prometheus-operator
|
name: prometheus-operator
|
||||||
roleRef:
|
roleRef:
|
||||||
apiGroup: rbac.authorization.k8s.io
|
apiGroup: rbac.authorization.k8s.io
|
||||||
|
|
|
@ -4,7 +4,7 @@ metadata:
|
||||||
labels:
|
labels:
|
||||||
app.kubernetes.io/component: controller
|
app.kubernetes.io/component: controller
|
||||||
app.kubernetes.io/name: prometheus-operator
|
app.kubernetes.io/name: prometheus-operator
|
||||||
app.kubernetes.io/version: v0.40.0
|
app.kubernetes.io/version: v0.41.1
|
||||||
name: prometheus-operator
|
name: prometheus-operator
|
||||||
namespace: monitoring
|
namespace: monitoring
|
||||||
spec:
|
spec:
|
||||||
|
@ -18,15 +18,15 @@ spec:
|
||||||
labels:
|
labels:
|
||||||
app.kubernetes.io/component: controller
|
app.kubernetes.io/component: controller
|
||||||
app.kubernetes.io/name: prometheus-operator
|
app.kubernetes.io/name: prometheus-operator
|
||||||
app.kubernetes.io/version: v0.40.0
|
app.kubernetes.io/version: v0.41.1
|
||||||
spec:
|
spec:
|
||||||
containers:
|
containers:
|
||||||
- args:
|
- args:
|
||||||
- --kubelet-service=kube-system/kubelet
|
- --kubelet-service=kube-system/kubelet
|
||||||
- --logtostderr=true
|
- --logtostderr=true
|
||||||
- --config-reloader-image=jimmidyson/configmap-reload:v0.3.0
|
- --config-reloader-image=jimmidyson/configmap-reload:v0.4.0
|
||||||
- --prometheus-config-reloader=quay.io/coreos/prometheus-config-reloader:v0.40.0
|
- --prometheus-config-reloader=quay.io/coreos/prometheus-config-reloader:v0.41.1
|
||||||
image: quay.io/coreos/prometheus-operator:v0.40.0
|
image: quay.io/coreos/prometheus-operator:v0.41.1
|
||||||
name: prometheus-operator
|
name: prometheus-operator
|
||||||
ports:
|
ports:
|
||||||
- containerPort: 8080
|
- containerPort: 8080
|
||||||
|
|
|
@ -4,7 +4,7 @@ metadata:
|
||||||
labels:
|
labels:
|
||||||
app.kubernetes.io/component: controller
|
app.kubernetes.io/component: controller
|
||||||
app.kubernetes.io/name: prometheus-operator
|
app.kubernetes.io/name: prometheus-operator
|
||||||
app.kubernetes.io/version: v0.40.0
|
app.kubernetes.io/version: v0.41.1
|
||||||
name: prometheus-operator
|
name: prometheus-operator
|
||||||
namespace: monitoring
|
namespace: monitoring
|
||||||
spec:
|
spec:
|
||||||
|
|
|
@ -4,6 +4,6 @@ metadata:
|
||||||
labels:
|
labels:
|
||||||
app.kubernetes.io/component: controller
|
app.kubernetes.io/component: controller
|
||||||
app.kubernetes.io/name: prometheus-operator
|
app.kubernetes.io/name: prometheus-operator
|
||||||
app.kubernetes.io/version: v0.40.0
|
app.kubernetes.io/version: v0.41.1
|
||||||
name: prometheus-operator
|
name: prometheus-operator
|
||||||
namespace: monitoring
|
namespace: monitoring
|
||||||
|
|
1
monitoring/vendor/github.com/brancz/kubernetes-grafana/grafana/grafana.libsonnet
generated
vendored
1
monitoring/vendor/github.com/brancz/kubernetes-grafana/grafana/grafana.libsonnet
generated
vendored
|
@ -237,6 +237,7 @@ local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
|
||||||
deployment.mixin.spec.template.spec.withVolumes(volumes) +
|
deployment.mixin.spec.template.spec.withVolumes(volumes) +
|
||||||
deployment.mixin.spec.template.spec.securityContext.withRunAsNonRoot(true) +
|
deployment.mixin.spec.template.spec.securityContext.withRunAsNonRoot(true) +
|
||||||
deployment.mixin.spec.template.spec.securityContext.withRunAsUser(65534) +
|
deployment.mixin.spec.template.spec.securityContext.withRunAsUser(65534) +
|
||||||
|
deployment.mixin.spec.template.spec.securityContext.withFsGroup(65534) +
|
||||||
deployment.mixin.spec.template.spec.withServiceAccountName('grafana'),
|
deployment.mixin.spec.template.spec.withServiceAccountName('grafana'),
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
9
monitoring/vendor/github.com/coreos/etcd/Documentation/etcd-mixin/mixin.libsonnet
generated
vendored
9
monitoring/vendor/github.com/coreos/etcd/Documentation/etcd-mixin/mixin.libsonnet
generated
vendored
|
@ -7,6 +7,9 @@
|
||||||
// instances are deployed on K8s, you will likely want to change
|
// instances are deployed on K8s, you will likely want to change
|
||||||
// this to 'instance, pod'.
|
// this to 'instance, pod'.
|
||||||
etcd_instance_labels: 'instance',
|
etcd_instance_labels: 'instance',
|
||||||
|
// scrape_interval_seconds is the global scrape interval which can be
|
||||||
|
// used to dynamically adjust rate windows as a function of the interval.
|
||||||
|
scrape_interval_seconds: 30,
|
||||||
},
|
},
|
||||||
|
|
||||||
prometheusAlerts+:: {
|
prometheusAlerts+:: {
|
||||||
|
@ -21,12 +24,12 @@
|
||||||
sum without (%(etcd_instance_labels)s) (up{%(etcd_selector)s} == bool 0)
|
sum without (%(etcd_instance_labels)s) (up{%(etcd_selector)s} == bool 0)
|
||||||
or
|
or
|
||||||
count without (To) (
|
count without (To) (
|
||||||
sum without (%(etcd_instance_labels)s) (rate(etcd_network_peer_sent_failures_total{%(etcd_selector)s}[1m])) > 0.01
|
sum without (%(etcd_instance_labels)s) (rate(etcd_network_peer_sent_failures_total{%(etcd_selector)s}[%(network_failure_range)ss])) > 0.01
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
> 0
|
> 0
|
||||||
||| % $._config,
|
||| % {etcd_instance_labels: $._config.etcd_instance_labels, etcd_selector: $._config.etcd_selector, network_failure_range: $._config.scrape_interval_seconds*4},
|
||||||
'for': '3m',
|
'for': '10m',
|
||||||
labels: {
|
labels: {
|
||||||
severity: 'critical',
|
severity: 'critical',
|
||||||
},
|
},
|
||||||
|
|
26
monitoring/vendor/github.com/coreos/etcd/Documentation/etcd-mixin/test.yaml
generated
vendored
26
monitoring/vendor/github.com/coreos/etcd/Documentation/etcd-mixin/test.yaml
generated
vendored
|
@ -17,16 +17,16 @@ tests:
|
||||||
alertname: etcdInsufficientMembers
|
alertname: etcdInsufficientMembers
|
||||||
- eval_time: 5m
|
- eval_time: 5m
|
||||||
alertname: etcdInsufficientMembers
|
alertname: etcdInsufficientMembers
|
||||||
- eval_time: 5m
|
- eval_time: 12m
|
||||||
alertname: etcdMembersDown
|
alertname: etcdMembersDown
|
||||||
- eval_time: 7m
|
- eval_time: 14m
|
||||||
alertname: etcdMembersDown
|
alertname: etcdMembersDown
|
||||||
exp_alerts:
|
exp_alerts:
|
||||||
- exp_labels:
|
- exp_labels:
|
||||||
job: etcd
|
job: etcd
|
||||||
severity: critical
|
severity: critical
|
||||||
exp_annotations:
|
exp_annotations:
|
||||||
message: 'etcd cluster "etcd": members are down (1).'
|
message: 'etcd cluster "etcd": members are down (3).'
|
||||||
- eval_time: 7m
|
- eval_time: 7m
|
||||||
alertname: etcdInsufficientMembers
|
alertname: etcdInsufficientMembers
|
||||||
- eval_time: 11m
|
- eval_time: 11m
|
||||||
|
@ -49,33 +49,31 @@ tests:
|
||||||
- interval: 1m
|
- interval: 1m
|
||||||
input_series:
|
input_series:
|
||||||
- series: 'up{job="etcd",instance="10.10.10.0"}'
|
- series: 'up{job="etcd",instance="10.10.10.0"}'
|
||||||
values: '1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0'
|
values: '1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0'
|
||||||
- series: 'up{job="etcd",instance="10.10.10.1"}'
|
- series: 'up{job="etcd",instance="10.10.10.1"}'
|
||||||
values: '1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0'
|
values: '1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0'
|
||||||
- series: 'up{job="etcd",instance="10.10.10.2"}'
|
- series: 'up{job="etcd",instance="10.10.10.2"}'
|
||||||
values: '1 1 1 1 0 0 0 0'
|
values: '1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0'
|
||||||
alert_rule_test:
|
alert_rule_test:
|
||||||
- eval_time: 10m
|
- eval_time: 14m
|
||||||
alertname: etcdMembersDown
|
alertname: etcdMembersDown
|
||||||
exp_alerts:
|
exp_alerts:
|
||||||
- exp_labels:
|
- exp_labels:
|
||||||
job: etcd
|
job: etcd
|
||||||
severity: critical
|
severity: critical
|
||||||
exp_annotations:
|
exp_annotations:
|
||||||
message: 'etcd cluster "etcd": members are down (2).'
|
message: 'etcd cluster "etcd": members are down (3).'
|
||||||
|
|
||||||
- interval: 1m
|
- interval: 1m
|
||||||
input_series:
|
input_series:
|
||||||
- series: 'up{job="etcd",instance="10.10.10.0"}'
|
- series: 'up{job="etcd",instance="10.10.10.0"}'
|
||||||
values: '1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0'
|
values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0'
|
||||||
- series: 'up{job="etcd",instance="10.10.10.1"}'
|
- series: 'up{job="etcd",instance="10.10.10.1"}'
|
||||||
values: '1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0'
|
values: '1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0'
|
||||||
- series: 'etcd_network_peer_sent_failures_total{To="member-1",job="etcd",endpoint="test"}'
|
- series: 'etcd_network_peer_sent_failures_total{To="member-1",job="etcd",endpoint="test"}'
|
||||||
values: '0 0 1 2 3 4 5 6 7 8 9 10'
|
values: '0 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18'
|
||||||
alert_rule_test:
|
alert_rule_test:
|
||||||
- eval_time: 4m
|
- eval_time: 13m
|
||||||
alertname: etcdMembersDown
|
|
||||||
- eval_time: 6m
|
|
||||||
alertname: etcdMembersDown
|
alertname: etcdMembersDown
|
||||||
exp_alerts:
|
exp_alerts:
|
||||||
- exp_labels:
|
- exp_labels:
|
||||||
|
|
|
@ -1,2 +0,0 @@
|
||||||
jsonnetfile.lock.json
|
|
||||||
vendor/
|
|
|
@ -1,155 +0,0 @@
|
||||||
local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
|
|
||||||
|
|
||||||
{
|
|
||||||
_config+:: {
|
|
||||||
namespace: 'default',
|
|
||||||
|
|
||||||
versions+:: {
|
|
||||||
alertmanager: 'v0.21.0',
|
|
||||||
},
|
|
||||||
|
|
||||||
imageRepos+:: {
|
|
||||||
alertmanager: 'quay.io/prometheus/alertmanager',
|
|
||||||
},
|
|
||||||
|
|
||||||
alertmanager+:: {
|
|
||||||
name: 'main',
|
|
||||||
config: {
|
|
||||||
global: {
|
|
||||||
resolve_timeout: '5m',
|
|
||||||
},
|
|
||||||
inhibit_rules: [{
|
|
||||||
source_match: {
|
|
||||||
severity: 'critical',
|
|
||||||
},
|
|
||||||
target_match_re: {
|
|
||||||
severity: 'warning|info',
|
|
||||||
},
|
|
||||||
equal: ['namespace', 'alertname'],
|
|
||||||
}, {
|
|
||||||
source_match: {
|
|
||||||
severity: 'warning',
|
|
||||||
},
|
|
||||||
target_match_re: {
|
|
||||||
severity: 'info',
|
|
||||||
},
|
|
||||||
equal: ['namespace', 'alertname'],
|
|
||||||
}],
|
|
||||||
route: {
|
|
||||||
group_by: ['namespace'],
|
|
||||||
group_wait: '30s',
|
|
||||||
group_interval: '5m',
|
|
||||||
repeat_interval: '12h',
|
|
||||||
receiver: 'Default',
|
|
||||||
routes: [
|
|
||||||
{
|
|
||||||
receiver: 'Watchdog',
|
|
||||||
match: {
|
|
||||||
alertname: 'Watchdog',
|
|
||||||
},
|
|
||||||
},
|
|
||||||
{
|
|
||||||
receiver: 'Critical',
|
|
||||||
match: {
|
|
||||||
severity: 'critical',
|
|
||||||
},
|
|
||||||
},
|
|
||||||
],
|
|
||||||
},
|
|
||||||
receivers: [
|
|
||||||
{
|
|
||||||
name: 'Default',
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: 'Watchdog',
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: 'Critical',
|
|
||||||
},
|
|
||||||
],
|
|
||||||
},
|
|
||||||
replicas: 3,
|
|
||||||
},
|
|
||||||
},
|
|
||||||
|
|
||||||
alertmanager+:: {
|
|
||||||
secret:
|
|
||||||
local secret = k.core.v1.secret;
|
|
||||||
|
|
||||||
if std.type($._config.alertmanager.config) == 'object' then
|
|
||||||
secret.new('alertmanager-' + $._config.alertmanager.name, {})
|
|
||||||
.withStringData({ 'alertmanager.yaml': std.manifestYamlDoc($._config.alertmanager.config) }) +
|
|
||||||
secret.mixin.metadata.withNamespace($._config.namespace)
|
|
||||||
else
|
|
||||||
secret.new('alertmanager-' + $._config.alertmanager.name, { 'alertmanager.yaml': std.base64($._config.alertmanager.config) }) +
|
|
||||||
secret.mixin.metadata.withNamespace($._config.namespace),
|
|
||||||
|
|
||||||
serviceAccount:
|
|
||||||
local serviceAccount = k.core.v1.serviceAccount;
|
|
||||||
|
|
||||||
serviceAccount.new('alertmanager-' + $._config.alertmanager.name) +
|
|
||||||
serviceAccount.mixin.metadata.withNamespace($._config.namespace),
|
|
||||||
|
|
||||||
service:
|
|
||||||
local service = k.core.v1.service;
|
|
||||||
local servicePort = k.core.v1.service.mixin.spec.portsType;
|
|
||||||
|
|
||||||
local alertmanagerPort = servicePort.newNamed('web', 9093, 'web');
|
|
||||||
|
|
||||||
service.new('alertmanager-' + $._config.alertmanager.name, { app: 'alertmanager', alertmanager: $._config.alertmanager.name }, alertmanagerPort) +
|
|
||||||
service.mixin.spec.withSessionAffinity('ClientIP') +
|
|
||||||
service.mixin.metadata.withNamespace($._config.namespace) +
|
|
||||||
service.mixin.metadata.withLabels({ alertmanager: $._config.alertmanager.name }),
|
|
||||||
|
|
||||||
serviceMonitor:
|
|
||||||
{
|
|
||||||
apiVersion: 'monitoring.coreos.com/v1',
|
|
||||||
kind: 'ServiceMonitor',
|
|
||||||
metadata: {
|
|
||||||
name: 'alertmanager',
|
|
||||||
namespace: $._config.namespace,
|
|
||||||
labels: {
|
|
||||||
'k8s-app': 'alertmanager',
|
|
||||||
},
|
|
||||||
},
|
|
||||||
spec: {
|
|
||||||
selector: {
|
|
||||||
matchLabels: {
|
|
||||||
alertmanager: $._config.alertmanager.name,
|
|
||||||
},
|
|
||||||
},
|
|
||||||
endpoints: [
|
|
||||||
{
|
|
||||||
port: 'web',
|
|
||||||
interval: '30s',
|
|
||||||
},
|
|
||||||
],
|
|
||||||
},
|
|
||||||
},
|
|
||||||
|
|
||||||
alertmanager:
|
|
||||||
{
|
|
||||||
apiVersion: 'monitoring.coreos.com/v1',
|
|
||||||
kind: 'Alertmanager',
|
|
||||||
metadata: {
|
|
||||||
name: $._config.alertmanager.name,
|
|
||||||
namespace: $._config.namespace,
|
|
||||||
labels: {
|
|
||||||
alertmanager: $._config.alertmanager.name,
|
|
||||||
},
|
|
||||||
},
|
|
||||||
spec: {
|
|
||||||
replicas: $._config.alertmanager.replicas,
|
|
||||||
version: $._config.versions.alertmanager,
|
|
||||||
image: $._config.imageRepos.alertmanager + ':' + $._config.versions.alertmanager,
|
|
||||||
nodeSelector: { 'kubernetes.io/os': 'linux' },
|
|
||||||
serviceAccountName: 'alertmanager-' + $._config.alertmanager.name,
|
|
||||||
securityContext: {
|
|
||||||
runAsUser: 1000,
|
|
||||||
runAsNonRoot: true,
|
|
||||||
fsGroup: 2000,
|
|
||||||
},
|
|
||||||
},
|
|
||||||
},
|
|
||||||
},
|
|
||||||
}
|
|
|
@ -1,57 +0,0 @@
|
||||||
{
|
|
||||||
prometheusAlerts+:: {
|
|
||||||
groups+: [
|
|
||||||
{
|
|
||||||
name: 'alertmanager.rules',
|
|
||||||
rules: [
|
|
||||||
{
|
|
||||||
alert: 'AlertmanagerConfigInconsistent',
|
|
||||||
annotations: {
|
|
||||||
message: |||
|
|
||||||
The configuration of the instances of the Alertmanager cluster `{{ $labels.namespace }}/{{ $labels.service }}` are out of sync.
|
|
||||||
{{ range printf "alertmanager_config_hash{namespace=\"%s\",service=\"%s\"}" $labels.namespace $labels.service | query }}
|
|
||||||
Configuration hash for pod {{ .Labels.pod }} is "{{ printf "%.f" .Value }}"
|
|
||||||
{{ end }}
|
|
||||||
|||,
|
|
||||||
},
|
|
||||||
expr: |||
|
|
||||||
count by(namespace,service) (count_values by(namespace,service) ("config_hash", alertmanager_config_hash{%(alertmanagerSelector)s})) != 1
|
|
||||||
||| % $._config,
|
|
||||||
'for': '5m',
|
|
||||||
labels: {
|
|
||||||
severity: 'critical',
|
|
||||||
},
|
|
||||||
},
|
|
||||||
{
|
|
||||||
alert: 'AlertmanagerFailedReload',
|
|
||||||
annotations: {
|
|
||||||
message: "Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}.",
|
|
||||||
},
|
|
||||||
expr: |||
|
|
||||||
alertmanager_config_last_reload_successful{%(alertmanagerSelector)s} == 0
|
|
||||||
||| % $._config,
|
|
||||||
'for': '10m',
|
|
||||||
labels: {
|
|
||||||
severity: 'warning',
|
|
||||||
},
|
|
||||||
},
|
|
||||||
{
|
|
||||||
alert: 'AlertmanagerMembersInconsistent',
|
|
||||||
annotations: {
|
|
||||||
message: 'Alertmanager has not found all other members of the cluster.',
|
|
||||||
},
|
|
||||||
expr: |||
|
|
||||||
alertmanager_cluster_members{%(alertmanagerSelector)s}
|
|
||||||
!= on (service) GROUP_LEFT()
|
|
||||||
count by (service) (alertmanager_cluster_members{%(alertmanagerSelector)s})
|
|
||||||
||| % $._config,
|
|
||||||
'for': '5m',
|
|
||||||
labels: {
|
|
||||||
severity: 'critical',
|
|
||||||
},
|
|
||||||
},
|
|
||||||
],
|
|
||||||
},
|
|
||||||
],
|
|
||||||
},
|
|
||||||
}
|
|
|
@ -1,4 +0,0 @@
|
||||||
(import 'alertmanager.libsonnet') +
|
|
||||||
(import 'general.libsonnet') +
|
|
||||||
(import 'node.libsonnet') +
|
|
||||||
(import 'prometheus-operator.libsonnet')
|
|
|
@ -1,38 +0,0 @@
|
||||||
{
|
|
||||||
prometheusAlerts+:: {
|
|
||||||
groups+: [
|
|
||||||
{
|
|
||||||
name: 'general.rules',
|
|
||||||
rules: [
|
|
||||||
{
|
|
||||||
alert: 'TargetDown',
|
|
||||||
annotations: {
|
|
||||||
message: '{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{ $labels.service }} targets in {{ $labels.namespace }} namespace are down.',
|
|
||||||
},
|
|
||||||
expr: '100 * (count(up == 0) BY (job, namespace, service) / count(up) BY (job, namespace, service)) > 10',
|
|
||||||
'for': '10m',
|
|
||||||
labels: {
|
|
||||||
severity: 'warning',
|
|
||||||
},
|
|
||||||
},
|
|
||||||
{
|
|
||||||
alert: 'Watchdog',
|
|
||||||
annotations: {
|
|
||||||
message: |||
|
|
||||||
This is an alert meant to ensure that the entire alerting pipeline is functional.
|
|
||||||
This alert is always firing, therefore it should always be firing in Alertmanager
|
|
||||||
and always fire against a receiver. There are integrations with various notification
|
|
||||||
mechanisms that send a notification when this alert is not firing. For example the
|
|
||||||
"DeadMansSnitch" integration in PagerDuty.
|
|
||||||
|||,
|
|
||||||
},
|
|
||||||
expr: 'vector(1)',
|
|
||||||
labels: {
|
|
||||||
severity: 'none',
|
|
||||||
},
|
|
||||||
},
|
|
||||||
],
|
|
||||||
},
|
|
||||||
],
|
|
||||||
},
|
|
||||||
}
|
|
|
@ -1,24 +0,0 @@
|
||||||
{
|
|
||||||
prometheusAlerts+:: {
|
|
||||||
groups+: [
|
|
||||||
{
|
|
||||||
name: 'node-network',
|
|
||||||
rules: [
|
|
||||||
{
|
|
||||||
alert: 'NodeNetworkInterfaceFlapping',
|
|
||||||
annotations: {
|
|
||||||
message: 'Network interface "{{ $labels.device }}" changing it\'s up status often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}"',
|
|
||||||
},
|
|
||||||
expr: |||
|
|
||||||
changes(node_network_up{%(nodeExporterSelector)s,%(hostNetworkInterfaceSelector)s}[2m]) > 2
|
|
||||||
||| % $._config,
|
|
||||||
'for': '2m',
|
|
||||||
labels: {
|
|
||||||
severity: 'warning',
|
|
||||||
},
|
|
||||||
},
|
|
||||||
],
|
|
||||||
},
|
|
||||||
],
|
|
||||||
},
|
|
||||||
}
|
|
|
@ -1,63 +0,0 @@
|
||||||
{
|
|
||||||
prometheusAlerts+:: {
|
|
||||||
groups+: [
|
|
||||||
{
|
|
||||||
name: 'prometheus-operator',
|
|
||||||
rules: [
|
|
||||||
{
|
|
||||||
alert: 'PrometheusOperatorListErrors',
|
|
||||||
expr: |||
|
|
||||||
(sum by (controller,namespace) (rate(prometheus_operator_list_operations_failed_total{%(prometheusOperatorSelector)s}[1h])) / sum by (controller,namespace) (rate(prometheus_operator_list_operations_total{%(prometheusOperatorSelector)s}[1h]))) > 0.4
|
|
||||||
||| % $._config,
|
|
||||||
labels: {
|
|
||||||
severity: 'warning',
|
|
||||||
},
|
|
||||||
annotations: {
|
|
||||||
message: 'Errors while performing List operations in controller {{$labels.controller}} in {{$labels.namespace}} namespace.',
|
|
||||||
},
|
|
||||||
'for': '15m',
|
|
||||||
},
|
|
||||||
{
|
|
||||||
alert: 'PrometheusOperatorWatchErrors',
|
|
||||||
expr: |||
|
|
||||||
(sum by (controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{%(prometheusOperatorSelector)s}[1h])) / sum by (controller,namespace) (rate(prometheus_operator_watch_operations_total{%(prometheusOperatorSelector)s}[1h]))) > 0.4
|
|
||||||
||| % $._config,
|
|
||||||
labels: {
|
|
||||||
severity: 'warning',
|
|
||||||
},
|
|
||||||
annotations: {
|
|
||||||
message: 'Errors while performing Watch operations in controller {{$labels.controller}} in {{$labels.namespace}} namespace.',
|
|
||||||
},
|
|
||||||
'for': '15m',
|
|
||||||
},
|
|
||||||
{
|
|
||||||
alert: 'PrometheusOperatorReconcileErrors',
|
|
||||||
expr: |||
|
|
||||||
rate(prometheus_operator_reconcile_errors_total{%(prometheusOperatorSelector)s}[5m]) > 0.1
|
|
||||||
||| % $._config,
|
|
||||||
labels: {
|
|
||||||
severity: 'warning',
|
|
||||||
},
|
|
||||||
annotations: {
|
|
||||||
message: 'Errors while reconciling {{ $labels.controller }} in {{ $labels.namespace }} Namespace.',
|
|
||||||
},
|
|
||||||
'for': '10m',
|
|
||||||
},
|
|
||||||
{
|
|
||||||
alert: 'PrometheusOperatorNodeLookupErrors',
|
|
||||||
expr: |||
|
|
||||||
rate(prometheus_operator_node_address_lookup_errors_total{%(prometheusOperatorSelector)s}[5m]) > 0.1
|
|
||||||
||| % $._config,
|
|
||||||
labels: {
|
|
||||||
severity: 'warning',
|
|
||||||
},
|
|
||||||
annotations: {
|
|
||||||
message: 'Errors while reconciling Prometheus in {{ $labels.namespace }} Namespace.',
|
|
||||||
},
|
|
||||||
'for': '10m',
|
|
||||||
},
|
|
||||||
],
|
|
||||||
},
|
|
||||||
],
|
|
||||||
},
|
|
||||||
}
|
|
|
@ -1,157 +0,0 @@
|
||||||
# TODO(metalmatze): This file is temporarily saved here for later reference
|
|
||||||
# until we find out how to integrate the tests into our jsonnet stack.
|
|
||||||
|
|
||||||
rule_files:
|
|
||||||
- rules.yaml
|
|
||||||
|
|
||||||
evaluation_interval: 1m
|
|
||||||
|
|
||||||
tests:
|
|
||||||
- interval: 1m
|
|
||||||
input_series:
|
|
||||||
- series: 'alertmanager_cluster_members{job="alertmanager-main",instance="10.10.10.0",namespace="monitoring",pod="alertmanager-main-0",service="alertmanager-main"}'
|
|
||||||
values: '3 3 3 3 3 2 2 2 2 2 2 1 1 1 1 1 1 0 0 0 0 0 0'
|
|
||||||
- series: 'alertmanager_cluster_members{job="alertmanager-main",instance="10.10.10.1",namespace="monitoring",pod="alertmanager-main-1",service="alertmanager-main"}'
|
|
||||||
values: '3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3'
|
|
||||||
- series: 'alertmanager_cluster_members{job="alertmanager-main",instance="10.10.10.2",namespace="monitoring",pod="alertmanager-main-2",service="alertmanager-main"}'
|
|
||||||
values: '3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3'
|
|
||||||
alert_rule_test:
|
|
||||||
- eval_time: 5m
|
|
||||||
alertname: AlertmanagerMembersInconsistent
|
|
||||||
- eval_time: 11m
|
|
||||||
alertname: AlertmanagerMembersInconsistent
|
|
||||||
exp_alerts:
|
|
||||||
- exp_labels:
|
|
||||||
service: 'alertmanager-main'
|
|
||||||
severity: critical
|
|
||||||
job: 'alertmanager-main'
|
|
||||||
instance: 10.10.10.0
|
|
||||||
namespace: monitoring
|
|
||||||
pod: alertmanager-main-0
|
|
||||||
exp_annotations:
|
|
||||||
message: 'Alertmanager has not found all other members of the cluster.'
|
|
||||||
- eval_time: 17m
|
|
||||||
alertname: AlertmanagerMembersInconsistent
|
|
||||||
exp_alerts:
|
|
||||||
- exp_labels:
|
|
||||||
service: 'alertmanager-main'
|
|
||||||
severity: critical
|
|
||||||
job: 'alertmanager-main'
|
|
||||||
instance: 10.10.10.0
|
|
||||||
namespace: monitoring
|
|
||||||
pod: alertmanager-main-0
|
|
||||||
exp_annotations:
|
|
||||||
message: 'Alertmanager has not found all other members of the cluster.'
|
|
||||||
- eval_time: 23m
|
|
||||||
alertname: AlertmanagerMembersInconsistent
|
|
||||||
exp_alerts:
|
|
||||||
- exp_labels:
|
|
||||||
service: 'alertmanager-main'
|
|
||||||
severity: critical
|
|
||||||
job: 'alertmanager-main'
|
|
||||||
instance: 10.10.10.0
|
|
||||||
namespace: monitoring
|
|
||||||
pod: alertmanager-main-0
|
|
||||||
exp_annotations:
|
|
||||||
message: 'Alertmanager has not found all other members of the cluster.'
|
|
||||||
- interval: 1m
|
|
||||||
input_series:
|
|
||||||
- series: 'alertmanager_cluster_members{job="alertmanager-main",instance="10.10.10.0",namespace="monitoring",pod="alertmanager-main-0",service="alertmanager-main"}'
|
|
||||||
values: '3 3 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
|
|
||||||
- series: 'alertmanager_cluster_members{job="alertmanager-main",instance="10.10.10.1",namespace="monitoring",pod="alertmanager-main-1",service="alertmanager-main"}'
|
|
||||||
values: '3 3 3 3 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2'
|
|
||||||
- series: 'alertmanager_cluster_members{job="alertmanager-main",instance="10.10.10.2",namespace="monitoring",pod="alertmanager-main-2",service="alertmanager-main"}'
|
|
||||||
values: '3 3 3 3 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2'
|
|
||||||
alert_rule_test:
|
|
||||||
- eval_time: 5m
|
|
||||||
alertname: AlertmanagerMembersInconsistent
|
|
||||||
- eval_time: 11m
|
|
||||||
alertname: AlertmanagerMembersInconsistent
|
|
||||||
exp_alerts:
|
|
||||||
- exp_labels:
|
|
||||||
service: 'alertmanager-main'
|
|
||||||
severity: critical
|
|
||||||
job: 'alertmanager-main'
|
|
||||||
instance: 10.10.10.0
|
|
||||||
namespace: monitoring
|
|
||||||
pod: alertmanager-main-0
|
|
||||||
exp_annotations:
|
|
||||||
message: 'Alertmanager has not found all other members of the cluster.'
|
|
||||||
- exp_labels:
|
|
||||||
service: 'alertmanager-main'
|
|
||||||
severity: critical
|
|
||||||
job: 'alertmanager-main'
|
|
||||||
instance: 10.10.10.1
|
|
||||||
namespace: monitoring
|
|
||||||
pod: alertmanager-main-1
|
|
||||||
exp_annotations:
|
|
||||||
message: 'Alertmanager has not found all other members of the cluster.'
|
|
||||||
- exp_labels:
|
|
||||||
service: 'alertmanager-main'
|
|
||||||
severity: critical
|
|
||||||
job: 'alertmanager-main'
|
|
||||||
instance: 10.10.10.2
|
|
||||||
namespace: monitoring
|
|
||||||
pod: alertmanager-main-2
|
|
||||||
exp_annotations:
|
|
||||||
message: 'Alertmanager has not found all other members of the cluster.'
|
|
||||||
- eval_time: 17m
|
|
||||||
alertname: AlertmanagerMembersInconsistent
|
|
||||||
exp_alerts:
|
|
||||||
- exp_labels:
|
|
||||||
service: 'alertmanager-main'
|
|
||||||
severity: critical
|
|
||||||
job: 'alertmanager-main'
|
|
||||||
instance: 10.10.10.0
|
|
||||||
namespace: monitoring
|
|
||||||
pod: alertmanager-main-0
|
|
||||||
exp_annotations:
|
|
||||||
message: 'Alertmanager has not found all other members of the cluster.'
|
|
||||||
- exp_labels:
|
|
||||||
service: 'alertmanager-main'
|
|
||||||
severity: critical
|
|
||||||
job: 'alertmanager-main'
|
|
||||||
instance: 10.10.10.1
|
|
||||||
namespace: monitoring
|
|
||||||
pod: alertmanager-main-1
|
|
||||||
exp_annotations:
|
|
||||||
message: 'Alertmanager has not found all other members of the cluster.'
|
|
||||||
- exp_labels:
|
|
||||||
service: 'alertmanager-main'
|
|
||||||
severity: critical
|
|
||||||
job: 'alertmanager-main'
|
|
||||||
instance: 10.10.10.2
|
|
||||||
namespace: monitoring
|
|
||||||
pod: alertmanager-main-2
|
|
||||||
exp_annotations:
|
|
||||||
message: 'Alertmanager has not found all other members of the cluster.'
|
|
||||||
- eval_time: 23m
|
|
||||||
alertname: AlertmanagerMembersInconsistent
|
|
||||||
exp_alerts:
|
|
||||||
- exp_labels:
|
|
||||||
service: 'alertmanager-main'
|
|
||||||
severity: critical
|
|
||||||
job: 'alertmanager-main'
|
|
||||||
instance: 10.10.10.0
|
|
||||||
namespace: monitoring
|
|
||||||
pod: alertmanager-main-0
|
|
||||||
exp_annotations:
|
|
||||||
message: 'Alertmanager has not found all other members of the cluster.'
|
|
||||||
- exp_labels:
|
|
||||||
service: 'alertmanager-main'
|
|
||||||
severity: critical
|
|
||||||
job: 'alertmanager-main'
|
|
||||||
instance: 10.10.10.1
|
|
||||||
namespace: monitoring
|
|
||||||
pod: alertmanager-main-1
|
|
||||||
exp_annotations:
|
|
||||||
message: 'Alertmanager has not found all other members of the cluster.'
|
|
||||||
- exp_labels:
|
|
||||||
service: 'alertmanager-main'
|
|
||||||
severity: critical
|
|
||||||
job: 'alertmanager-main'
|
|
||||||
instance: 10.10.10.2
|
|
||||||
namespace: monitoring
|
|
||||||
pod: alertmanager-main-2
|
|
||||||
exp_annotations:
|
|
||||||
message: 'Alertmanager has not found all other members of the cluster.'
|
|
|
@ -1,50 +0,0 @@
|
||||||
[
|
|
||||||
// Drop all kubelet metrics which are deprecated in kubernetes.
|
|
||||||
{
|
|
||||||
sourceLabels: ['__name__'],
|
|
||||||
regex: 'kubelet_(pod_worker_latency_microseconds|pod_start_latency_microseconds|cgroup_manager_latency_microseconds|pod_worker_start_latency_microseconds|pleg_relist_latency_microseconds|pleg_relist_interval_microseconds|runtime_operations|runtime_operations_latency_microseconds|runtime_operations_errors|eviction_stats_age_microseconds|device_plugin_registration_count|device_plugin_alloc_latency_microseconds|network_plugin_operations_latency_microseconds)',
|
|
||||||
action: 'drop',
|
|
||||||
},
|
|
||||||
// Drop all scheduler metrics which are deprecated in kubernetes.
|
|
||||||
{
|
|
||||||
sourceLabels: ['__name__'],
|
|
||||||
regex: 'scheduler_(e2e_scheduling_latency_microseconds|scheduling_algorithm_predicate_evaluation|scheduling_algorithm_priority_evaluation|scheduling_algorithm_preemption_evaluation|scheduling_algorithm_latency_microseconds|binding_latency_microseconds|scheduling_latency_seconds)',
|
|
||||||
action: 'drop',
|
|
||||||
},
|
|
||||||
// Drop all apiserver metrics which are deprecated in kubernetes.
|
|
||||||
{
|
|
||||||
sourceLabels: ['__name__'],
|
|
||||||
regex: 'apiserver_(request_count|request_latencies|request_latencies_summary|dropped_requests|storage_data_key_generation_latencies_microseconds|storage_transformation_failures_total|storage_transformation_latencies_microseconds|proxy_tunnel_sync_latency_secs)',
|
|
||||||
action: 'drop',
|
|
||||||
},
|
|
||||||
// Drop all docker metrics which are deprecated in kubernetes.
|
|
||||||
{
|
|
||||||
sourceLabels: ['__name__'],
|
|
||||||
regex: 'kubelet_docker_(operations|operations_latency_microseconds|operations_errors|operations_timeout)',
|
|
||||||
action: 'drop',
|
|
||||||
},
|
|
||||||
// Drop all reflector metrics which are deprecated in kubernetes.
|
|
||||||
{
|
|
||||||
sourceLabels: ['__name__'],
|
|
||||||
regex: 'reflector_(items_per_list|items_per_watch|list_duration_seconds|lists_total|short_watches_total|watch_duration_seconds|watches_total)',
|
|
||||||
action: 'drop',
|
|
||||||
},
|
|
||||||
// Drop all etcd metrics which are deprecated in kubernetes.
|
|
||||||
{
|
|
||||||
sourceLabels: ['__name__'],
|
|
||||||
regex: 'etcd_(helper_cache_hit_count|helper_cache_miss_count|helper_cache_entry_count|request_cache_get_latencies_summary|request_cache_add_latencies_summary|request_latencies_summary)',
|
|
||||||
action: 'drop',
|
|
||||||
},
|
|
||||||
// Drop all transformation metrics which are deprecated in kubernetes.
|
|
||||||
{
|
|
||||||
sourceLabels: ['__name__'],
|
|
||||||
regex: 'transformation_(transformation_latencies_microseconds|failures_total)',
|
|
||||||
action: 'drop',
|
|
||||||
},
|
|
||||||
// Drop all other metrics which are deprecated in kubernetes.
|
|
||||||
{
|
|
||||||
sourceLabels: ['__name__'],
|
|
||||||
regex: '(admission_quota_controller_adds|crd_autoregistration_controller_work_duration|APIServiceOpenAPIAggregationControllerQueue1_adds|AvailableConditionController_retries|crd_openapi_controller_unfinished_work_seconds|APIServiceRegistrationController_retries|admission_quota_controller_longest_running_processor_microseconds|crdEstablishing_longest_running_processor_microseconds|crdEstablishing_unfinished_work_seconds|crd_openapi_controller_adds|crd_autoregistration_controller_retries|crd_finalizer_queue_latency|AvailableConditionController_work_duration|non_structural_schema_condition_controller_depth|crd_autoregistration_controller_unfinished_work_seconds|AvailableConditionController_adds|DiscoveryController_longest_running_processor_microseconds|autoregister_queue_latency|crd_autoregistration_controller_adds|non_structural_schema_condition_controller_work_duration|APIServiceRegistrationController_adds|crd_finalizer_work_duration|crd_naming_condition_controller_unfinished_work_seconds|crd_openapi_controller_longest_running_processor_microseconds|DiscoveryController_adds|crd_autoregistration_controller_longest_running_processor_microseconds|autoregister_unfinished_work_seconds|crd_naming_condition_controller_queue_latency|crd_naming_condition_controller_retries|non_structural_schema_condition_controller_queue_latency|crd_naming_condition_controller_depth|AvailableConditionController_longest_running_processor_microseconds|crdEstablishing_depth|crd_finalizer_longest_running_processor_microseconds|crd_naming_condition_controller_adds|APIServiceOpenAPIAggregationControllerQueue1_longest_running_processor_microseconds|DiscoveryController_queue_latency|DiscoveryController_unfinished_work_seconds|crd_openapi_controller_depth|APIServiceOpenAPIAggregationControllerQueue1_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_unfinished_work_seconds|DiscoveryController_work_duration|autoregister_adds|crd_autoregistration_controller_queue_latency|crd_finalizer_retries|Av
ailableConditionController_unfinished_work_seconds|autoregister_longest_running_processor_microseconds|non_structural_schema_condition_controller_unfinished_work_seconds|APIServiceOpenAPIAggregationControllerQueue1_depth|AvailableConditionController_depth|DiscoveryController_retries|admission_quota_controller_depth|crdEstablishing_adds|APIServiceOpenAPIAggregationControllerQueue1_retries|crdEstablishing_queue_latency|non_structural_schema_condition_controller_longest_running_processor_microseconds|autoregister_work_duration|crd_openapi_controller_retries|APIServiceRegistrationController_work_duration|crdEstablishing_work_duration|crd_finalizer_adds|crd_finalizer_depth|crd_openapi_controller_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_work_duration|APIServiceRegistrationController_queue_latency|crd_autoregistration_controller_depth|AvailableConditionController_queue_latency|admission_quota_controller_queue_latency|crd_naming_condition_controller_work_duration|crd_openapi_controller_work_duration|DiscoveryController_depth|crd_naming_condition_controller_longest_running_processor_microseconds|APIServiceRegistrationController_depth|APIServiceRegistrationController_longest_running_processor_microseconds|crd_finalizer_unfinished_work_seconds|crdEstablishing_retries|admission_quota_controller_unfinished_work_seconds|non_structural_schema_condition_controller_adds|APIServiceRegistrationController_unfinished_work_seconds|admission_quota_controller_work_duration|autoregister_depth|autoregister_retries|kubeproxy_sync_proxy_rules_latency_microseconds|rest_client_request_latency_seconds|non_structural_schema_condition_controller_retries)',
|
|
||||||
action: 'drop',
|
|
||||||
},
|
|
||||||
]
|
|
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
|
@ -1,89 +0,0 @@
|
||||||
{
|
|
||||||
"version": 1,
|
|
||||||
"dependencies": [
|
|
||||||
{
|
|
||||||
"source": {
|
|
||||||
"git": {
|
|
||||||
"remote": "https://github.com/brancz/kubernetes-grafana",
|
|
||||||
"subdir": "grafana"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"version": "master"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"source": {
|
|
||||||
"git": {
|
|
||||||
"remote": "https://github.com/coreos/etcd",
|
|
||||||
"subdir": "Documentation/etcd-mixin"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"version": "master"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"source": {
|
|
||||||
"git": {
|
|
||||||
"remote": "https://github.com/coreos/prometheus-operator",
|
|
||||||
"subdir": "jsonnet/prometheus-operator"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"version": "release-0.40"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"source": {
|
|
||||||
"git": {
|
|
||||||
"remote": "https://github.com/ksonnet/ksonnet-lib",
|
|
||||||
"subdir": ""
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"version": "master",
|
|
||||||
"name": "ksonnet"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"source": {
|
|
||||||
"git": {
|
|
||||||
"remote": "https://github.com/kubernetes-monitoring/kubernetes-mixin",
|
|
||||||
"subdir": ""
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"version": "master"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"source": {
|
|
||||||
"git": {
|
|
||||||
"remote": "https://github.com/kubernetes/kube-state-metrics",
|
|
||||||
"subdir": "jsonnet/kube-state-metrics"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"version": "master"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"source": {
|
|
||||||
"git": {
|
|
||||||
"remote": "https://github.com/kubernetes/kube-state-metrics",
|
|
||||||
"subdir": "jsonnet/kube-state-metrics-mixin"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"version": "master"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"source": {
|
|
||||||
"git": {
|
|
||||||
"remote": "https://github.com/prometheus/node_exporter",
|
|
||||||
"subdir": "docs/node-mixin"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"version": "master"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"source": {
|
|
||||||
"git": {
|
|
||||||
"remote": "https://github.com/prometheus/prometheus",
|
|
||||||
"subdir": "documentation/prometheus-mixin"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"version": "master",
|
|
||||||
"name": "prometheus"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"legacyImports": true
|
|
||||||
}
|
|
|
@ -1,118 +0,0 @@
|
||||||
local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
|
|
||||||
|
|
||||||
{
|
|
||||||
_config+:: {
|
|
||||||
versions+:: {
|
|
||||||
clusterVerticalAutoscaler: "v0.8.1"
|
|
||||||
},
|
|
||||||
|
|
||||||
imageRepos+:: {
|
|
||||||
clusterVerticalAutoscaler: 'gcr.io/google_containers/cpvpa-amd64'
|
|
||||||
},
|
|
||||||
|
|
||||||
kubeStateMetrics+:: {
|
|
||||||
stepCPU: '1m',
|
|
||||||
stepMemory: '2Mi',
|
|
||||||
},
|
|
||||||
},
|
|
||||||
ksmAutoscaler+:: {
|
|
||||||
clusterRole:
|
|
||||||
local clusterRole = k.rbac.v1.clusterRole;
|
|
||||||
local rulesType = clusterRole.rulesType;
|
|
||||||
|
|
||||||
local rules = [
|
|
||||||
rulesType.new() +
|
|
||||||
rulesType.withApiGroups(['']) +
|
|
||||||
rulesType.withResources([
|
|
||||||
'nodes',
|
|
||||||
]) +
|
|
||||||
rulesType.withVerbs(['list', 'watch']),
|
|
||||||
];
|
|
||||||
|
|
||||||
clusterRole.new() +
|
|
||||||
clusterRole.mixin.metadata.withName('ksm-autoscaler') +
|
|
||||||
clusterRole.withRules(rules),
|
|
||||||
|
|
||||||
clusterRoleBinding:
|
|
||||||
local clusterRoleBinding = k.rbac.v1.clusterRoleBinding;
|
|
||||||
|
|
||||||
clusterRoleBinding.new() +
|
|
||||||
clusterRoleBinding.mixin.metadata.withName('ksm-autoscaler') +
|
|
||||||
clusterRoleBinding.mixin.roleRef.withApiGroup('rbac.authorization.k8s.io') +
|
|
||||||
clusterRoleBinding.mixin.roleRef.withName('ksm-autoscaler') +
|
|
||||||
clusterRoleBinding.mixin.roleRef.mixinInstance({ kind: 'ClusterRole' }) +
|
|
||||||
clusterRoleBinding.withSubjects([{ kind: 'ServiceAccount', name: 'ksm-autoscaler', namespace: $._config.namespace }]),
|
|
||||||
|
|
||||||
roleBinding:
|
|
||||||
local roleBinding = k.rbac.v1.roleBinding;
|
|
||||||
|
|
||||||
roleBinding.new() +
|
|
||||||
roleBinding.mixin.metadata.withName('ksm-autoscaler') +
|
|
||||||
roleBinding.mixin.metadata.withNamespace($._config.namespace) +
|
|
||||||
roleBinding.mixin.roleRef.withApiGroup('rbac.authorization.k8s.io') +
|
|
||||||
roleBinding.mixin.roleRef.withName('ksm-autoscaler') +
|
|
||||||
roleBinding.mixin.roleRef.mixinInstance({ kind: 'Role' }) +
|
|
||||||
roleBinding.withSubjects([{ kind: 'ServiceAccount', name: 'ksm-autoscaler' }]),
|
|
||||||
|
|
||||||
role:
|
|
||||||
local role = k.rbac.v1.role;
|
|
||||||
local rulesType = role.rulesType;
|
|
||||||
|
|
||||||
local extensionsRule = rulesType.new() +
|
|
||||||
rulesType.withApiGroups(['extensions']) +
|
|
||||||
rulesType.withResources([
|
|
||||||
'deployments',
|
|
||||||
]) +
|
|
||||||
rulesType.withVerbs(['patch']) +
|
|
||||||
rulesType.withResourceNames(['kube-state-metrics']);
|
|
||||||
|
|
||||||
local appsRule = rulesType.new() +
|
|
||||||
rulesType.withApiGroups(['apps']) +
|
|
||||||
rulesType.withResources([
|
|
||||||
'deployments',
|
|
||||||
]) +
|
|
||||||
rulesType.withVerbs(['patch']) +
|
|
||||||
rulesType.withResourceNames(['kube-state-metrics']);
|
|
||||||
|
|
||||||
local rules = [extensionsRule, appsRule];
|
|
||||||
|
|
||||||
role.new() +
|
|
||||||
role.mixin.metadata.withName('ksm-autoscaler') +
|
|
||||||
role.mixin.metadata.withNamespace($._config.namespace) +
|
|
||||||
role.withRules(rules),
|
|
||||||
|
|
||||||
serviceAccount:
|
|
||||||
local serviceAccount = k.core.v1.serviceAccount;
|
|
||||||
|
|
||||||
serviceAccount.new('ksm-autoscaler') +
|
|
||||||
serviceAccount.mixin.metadata.withNamespace($._config.namespace),
|
|
||||||
deployment:
|
|
||||||
local deployment = k.apps.v1.deployment;
|
|
||||||
local container = deployment.mixin.spec.template.spec.containersType;
|
|
||||||
local podSelector = deployment.mixin.spec.template.spec.selectorType;
|
|
||||||
local podLabels = { app: 'ksm-autoscaler' };
|
|
||||||
|
|
||||||
local kubeStateMetricsAutoscaler =
|
|
||||||
container.new('ksm-autoscaler', $._config.imageRepos.clusterVerticalAutoscaler + ':' + $._config.versions.clusterVerticalAutoscaler) +
|
|
||||||
container.withArgs([
|
|
||||||
'/cpvpa',
|
|
||||||
'--target=deployment/kube-state-metrics',
|
|
||||||
'--namespace=' + $._config.namespace,
|
|
||||||
'--logtostderr=true',
|
|
||||||
'--poll-period-seconds=10',
|
|
||||||
'--default-config={"kube-state-metrics":{"requests":{"cpu":{"base":"' + $._config.kubeStateMetrics.baseCPU + '","step":"' + $._config.kubeStateMetrics.stepCPU + '","nodesPerStep":1},"memory":{"base":"' + $._config.kubeStateMetrics.baseMemory + '","step":"' + $._config.kubeStateMetrics.stepMemory + '","nodesPerStep":1}},"limits":{"cpu":{"base":"' + $._config.kubeStateMetrics.baseCPU + '","step":"' + $._config.kubeStateMetrics.stepCPU + '","nodesPerStep":1},"memory":{"base":"' + $._config.kubeStateMetrics.baseMemory + '","step":"' + $._config.kubeStateMetrics.stepMemory + '","nodesPerStep":1}}}}'
|
|
||||||
]) +
|
|
||||||
container.mixin.resources.withRequests({cpu: '20m', memory: '10Mi'});
|
|
||||||
|
|
||||||
local c = [kubeStateMetricsAutoscaler];
|
|
||||||
|
|
||||||
deployment.new('ksm-autoscaler', 1, c, podLabels) +
|
|
||||||
deployment.mixin.metadata.withNamespace($._config.namespace) +
|
|
||||||
deployment.mixin.metadata.withLabels(podLabels) +
|
|
||||||
deployment.mixin.spec.selector.withMatchLabels(podLabels) +
|
|
||||||
deployment.mixin.spec.template.spec.withNodeSelector({ 'kubernetes.io/os': 'linux' }) +
|
|
||||||
deployment.mixin.spec.template.spec.securityContext.withRunAsNonRoot(true) +
|
|
||||||
deployment.mixin.spec.template.spec.securityContext.withRunAsUser(65534) +
|
|
||||||
deployment.mixin.spec.template.spec.withServiceAccountName('ksm-autoscaler'),
|
|
||||||
},
|
|
||||||
}
|
|
|
@ -1,20 +0,0 @@
|
||||||
local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
|
|
||||||
|
|
||||||
{
|
|
||||||
prometheus+:: {
|
|
||||||
clusterRole+: {
|
|
||||||
rules+:
|
|
||||||
local role = k.rbac.v1.role;
|
|
||||||
local policyRule = role.rulesType;
|
|
||||||
local rule = policyRule.new() +
|
|
||||||
policyRule.withApiGroups(['']) +
|
|
||||||
policyRule.withResources([
|
|
||||||
'services',
|
|
||||||
'endpoints',
|
|
||||||
'pods',
|
|
||||||
]) +
|
|
||||||
policyRule.withVerbs(['get', 'list', 'watch']);
|
|
||||||
[rule]
|
|
||||||
},
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -1,41 +0,0 @@
|
||||||
local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
|
|
||||||
local statefulSet = k.apps.v1.statefulSet;
|
|
||||||
local affinity = statefulSet.mixin.spec.template.spec.affinity.podAntiAffinity.preferredDuringSchedulingIgnoredDuringExecutionType;
|
|
||||||
local matchExpression = affinity.mixin.podAffinityTerm.labelSelector.matchExpressionsType;
|
|
||||||
|
|
||||||
{
|
|
||||||
local antiaffinity(key, values, namespace) = {
|
|
||||||
affinity: {
|
|
||||||
podAntiAffinity: {
|
|
||||||
preferredDuringSchedulingIgnoredDuringExecution: [
|
|
||||||
affinity.new() +
|
|
||||||
affinity.withWeight(100) +
|
|
||||||
affinity.mixin.podAffinityTerm.withNamespaces(namespace) +
|
|
||||||
affinity.mixin.podAffinityTerm.withTopologyKey('kubernetes.io/hostname') +
|
|
||||||
affinity.mixin.podAffinityTerm.labelSelector.withMatchExpressions([
|
|
||||||
matchExpression.new() +
|
|
||||||
matchExpression.withKey(key) +
|
|
||||||
matchExpression.withOperator('In') +
|
|
||||||
matchExpression.withValues(values),
|
|
||||||
]),
|
|
||||||
],
|
|
||||||
},
|
|
||||||
},
|
|
||||||
},
|
|
||||||
|
|
||||||
alertmanager+:: {
|
|
||||||
alertmanager+: {
|
|
||||||
spec+:
|
|
||||||
antiaffinity('alertmanager', [$._config.alertmanager.name], $._config.namespace),
|
|
||||||
},
|
|
||||||
},
|
|
||||||
|
|
||||||
prometheus+: {
|
|
||||||
local p = self,
|
|
||||||
|
|
||||||
prometheus+: {
|
|
||||||
spec+:
|
|
||||||
antiaffinity('prometheus', [p.name], p.namespace),
|
|
||||||
},
|
|
||||||
},
|
|
||||||
}
|
|
|
@ -1,23 +0,0 @@
|
||||||
local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
|
|
||||||
local service = k.core.v1.service;
|
|
||||||
local servicePort = k.core.v1.service.mixin.spec.portsType;
|
|
||||||
|
|
||||||
{
|
|
||||||
prometheus+:: {
|
|
||||||
kubeControllerManagerPrometheusDiscoveryService:
|
|
||||||
service.new('kube-controller-manager-prometheus-discovery', { 'k8s-app': 'kube-controller-manager' }, servicePort.newNamed('https-metrics', 10257, 10257)) +
|
|
||||||
service.mixin.metadata.withNamespace('kube-system') +
|
|
||||||
service.mixin.metadata.withLabels({ 'k8s-app': 'kube-controller-manager' }) +
|
|
||||||
service.mixin.spec.withClusterIp('None'),
|
|
||||||
kubeSchedulerPrometheusDiscoveryService:
|
|
||||||
service.new('kube-scheduler-prometheus-discovery', { 'k8s-app': 'kube-scheduler' }, servicePort.newNamed('https-metrics', 10259, 10259)) +
|
|
||||||
service.mixin.metadata.withNamespace('kube-system') +
|
|
||||||
service.mixin.metadata.withLabels({ 'k8s-app': 'kube-scheduler' }) +
|
|
||||||
service.mixin.spec.withClusterIp('None'),
|
|
||||||
kubeDnsPrometheusDiscoveryService:
|
|
||||||
service.new('kube-dns-prometheus-discovery', { 'k8s-app': 'kube-dns' }, [servicePort.newNamed('http-metrics-skydns', 10055, 10055), servicePort.newNamed('http-metrics-dnsmasq', 10054, 10054)]) +
|
|
||||||
service.mixin.metadata.withNamespace('kube-system') +
|
|
||||||
service.mixin.metadata.withLabels({ 'k8s-app': 'kube-dns' }) +
|
|
||||||
service.mixin.spec.withClusterIp('None'),
|
|
||||||
},
|
|
||||||
}
|
|
|
@ -1,20 +0,0 @@
|
||||||
local l = import 'lib/lib.libsonnet';
|
|
||||||
|
|
||||||
// withImageRepository is a mixin that replaces the prefix of every image with the given repository, e.g.
|
|
||||||
// quay.io/coreos/addon-resizer -> $repository/addon-resizer
|
|
||||||
// grafana/grafana -> $repository/grafana
|
|
||||||
local withImageRepository(repository) = {
|
|
||||||
local oldRepos = super._config.imageRepos,
|
|
||||||
local substituteRepository(image, repository) =
|
|
||||||
if repository == null then image else repository + '/' + l.imageName(image),
|
|
||||||
_config+:: {
|
|
||||||
imageRepos:: {
|
|
||||||
[field]: substituteRepository(oldRepos[field], repository),
|
|
||||||
for field in std.objectFields(oldRepos)
|
|
||||||
}
|
|
||||||
},
|
|
||||||
};
|
|
||||||
|
|
||||||
{
|
|
||||||
withImageRepository:: withImageRepository,
|
|
||||||
}
|
|
|
@ -1,197 +0,0 @@
|
||||||
local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
|
|
||||||
|
|
||||||
// Custom metrics API allows the HPA v2 to scale based on arbitrary metrics.
|
|
||||||
// For more details on usage visit https://github.com/DirectXMan12/k8s-prometheus-adapter#quick-links
|
|
||||||
|
|
||||||
{
|
|
||||||
_config+:: {
|
|
||||||
prometheusAdapter+:: {
|
|
||||||
// Rules for custom-metrics
|
|
||||||
config+:: {
|
|
||||||
rules+: [
|
|
||||||
{
|
|
||||||
seriesQuery: '{__name__=~"^container_.*",container!="POD",namespace!="",pod!=""}',
|
|
||||||
seriesFilters: [],
|
|
||||||
resources: {
|
|
||||||
overrides: {
|
|
||||||
namespace: {
|
|
||||||
resource: 'namespace'
|
|
||||||
},
|
|
||||||
pod: {
|
|
||||||
resource: 'pod'
|
|
||||||
}
|
|
||||||
},
|
|
||||||
},
|
|
||||||
name: {
|
|
||||||
matches: '^container_(.*)_seconds_total$',
|
|
||||||
as: ""
|
|
||||||
},
|
|
||||||
metricsQuery: 'sum(rate(<<.Series>>{<<.LabelMatchers>>,container!="POD"}[1m])) by (<<.GroupBy>>)'
|
|
||||||
},
|
|
||||||
{
|
|
||||||
seriesQuery: '{__name__=~"^container_.*",container!="POD",namespace!="",pod!=""}',
|
|
||||||
seriesFilters: [
|
|
||||||
{ isNot: '^container_.*_seconds_total$' },
|
|
||||||
],
|
|
||||||
resources: {
|
|
||||||
overrides: {
|
|
||||||
namespace: {
|
|
||||||
resource: 'namespace'
|
|
||||||
},
|
|
||||||
pod: {
|
|
||||||
resource: 'pod'
|
|
||||||
}
|
|
||||||
},
|
|
||||||
},
|
|
||||||
name: {
|
|
||||||
matches: '^container_(.*)_total$',
|
|
||||||
as: ''
|
|
||||||
},
|
|
||||||
metricsQuery: 'sum(rate(<<.Series>>{<<.LabelMatchers>>,container!="POD"}[1m])) by (<<.GroupBy>>)'
|
|
||||||
},
|
|
||||||
{
|
|
||||||
seriesQuery: '{__name__=~"^container_.*",container!="POD",namespace!="",pod!=""}',
|
|
||||||
seriesFilters: [
|
|
||||||
{ isNot: '^container_.*_total$' },
|
|
||||||
],
|
|
||||||
resources: {
|
|
||||||
overrides: {
|
|
||||||
namespace: {
|
|
||||||
resource: 'namespace'
|
|
||||||
},
|
|
||||||
pod: {
|
|
||||||
resource: 'pod'
|
|
||||||
}
|
|
||||||
},
|
|
||||||
},
|
|
||||||
name: {
|
|
||||||
matches: '^container_(.*)$',
|
|
||||||
as: ''
|
|
||||||
},
|
|
||||||
metricsQuery: 'sum(<<.Series>>{<<.LabelMatchers>>,container!="POD"}) by (<<.GroupBy>>)'
|
|
||||||
},
|
|
||||||
{
|
|
||||||
seriesQuery: '{namespace!="",__name__!~"^container_.*"}',
|
|
||||||
seriesFilters: [
|
|
||||||
{ isNot: '.*_total$' },
|
|
||||||
],
|
|
||||||
resources: {
|
|
||||||
template: '<<.Resource>>'
|
|
||||||
},
|
|
||||||
name: {
|
|
||||||
matches: '',
|
|
||||||
as: ''
|
|
||||||
},
|
|
||||||
metricsQuery: 'sum(<<.Series>>{<<.LabelMatchers>>}) by (<<.GroupBy>>)'
|
|
||||||
},
|
|
||||||
{
|
|
||||||
seriesQuery: '{namespace!="",__name__!~"^container_.*"}',
|
|
||||||
seriesFilters: [
|
|
||||||
{ isNot: '.*_seconds_total' },
|
|
||||||
],
|
|
||||||
resources: {
|
|
||||||
template: '<<.Resource>>'
|
|
||||||
},
|
|
||||||
name: {
|
|
||||||
matches: '^(.*)_total$',
|
|
||||||
as: ''
|
|
||||||
},
|
|
||||||
metricsQuery: 'sum(rate(<<.Series>>{<<.LabelMatchers>>}[1m])) by (<<.GroupBy>>)'
|
|
||||||
},
|
|
||||||
{
|
|
||||||
seriesQuery: '{namespace!="",__name__!~"^container_.*"}',
|
|
||||||
seriesFilters: [],
|
|
||||||
resources: {
|
|
||||||
template: '<<.Resource>>'
|
|
||||||
},
|
|
||||||
name: {
|
|
||||||
matches: '^(.*)_seconds_total$',
|
|
||||||
as: ''
|
|
||||||
},
|
|
||||||
metricsQuery: 'sum(rate(<<.Series>>{<<.LabelMatchers>>}[1m])) by (<<.GroupBy>>)'
|
|
||||||
}
|
|
||||||
],
|
|
||||||
},
|
|
||||||
},
|
|
||||||
},
|
|
||||||
|
|
||||||
// Extra prometheus-adapter manifests: APIService registrations for the
// custom.metrics.k8s.io group plus the ClusterRole and bindings that go with it.
prometheusAdapter+:: {
  // Common shape of both APIService registrations; only the version string
  // and its versionPriority differ.
  local customMetricsRegistration(apiVersionName, priority) = {
    kind: 'APIService',
    apiVersion: 'apiregistration.k8s.io/v1',
    metadata: {
      name: apiVersionName + '.custom.metrics.k8s.io',
    },
    spec: {
      group: 'custom.metrics.k8s.io',
      version: apiVersionName,
      versionPriority: priority,
      groupPriorityMinimum: 100,
      insecureSkipTLSVerify: true,
      service: {
        namespace: $._config.namespace,
        name: $.prometheusAdapter.service.metadata.name,
      },
    },
  },

  // Name of the ClusterRole defined below; both bindings reference it.
  local serverResourcesRole = 'custom-metrics-server-resources',

  // Binds one ServiceAccount to the ClusterRole named above.
  local bindToServerResources(bindingName, account, accountNamespace) =
    local binding = k.rbac.v1.clusterRoleBinding;
    binding.new() +
    binding.mixin.metadata.withName(bindingName) +
    binding.mixin.roleRef.withApiGroup('rbac.authorization.k8s.io') +
    binding.mixin.roleRef.withName(serverResourcesRole) +
    binding.mixin.roleRef.mixinInstance({ kind: 'ClusterRole' }) +
    binding.withSubjects([{
      kind: 'ServiceAccount',
      name: account,
      namespace: accountNamespace,
    }]),

  customMetricsApiService: customMetricsRegistration('v1beta1', 100),

  customMetricsApiServiceV1Beta2: customMetricsRegistration('v1beta2', 200),

  // Grants every verb on every resource of the custom.metrics.k8s.io group.
  customMetricsClusterRoleServerResources:
    local role = k.rbac.v1.clusterRole;
    local rule = role.rulesType;

    local allCustomMetrics =
      rule.new() +
      rule.withApiGroups(['custom.metrics.k8s.io']) +
      rule.withResources(['*']) +
      rule.withVerbs(['*']);

    role.new() +
    role.mixin.metadata.withName(serverResourcesRole) +
    role.withRules(allCustomMetrics),

  // Subject: the adapter's own ServiceAccount in the configured namespace.
  customMetricsClusterRoleBindingServerResources:
    bindToServerResources(
      'custom-metrics-server-resources',
      $.prometheusAdapter.serviceAccount.metadata.name,
      $._config.namespace
    ),

  // Subject: the horizontal-pod-autoscaler ServiceAccount in kube-system.
  customMetricsClusterRoleBindingHPA:
    bindToServerResources(
      'hpa-controller-custom-metrics',
      'horizontal-pod-autoscaler',
      'kube-system'
    ),
}
|
|
||||||
}
|
|
|
@ -1,82 +0,0 @@
|
||||||
local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
|
|
||||||
local service = k.core.v1.service;
|
|
||||||
local servicePort = k.core.v1.service.mixin.spec.portsType;
|
|
||||||
|
|
||||||
// EKS addon. Overrides the CoreDNS ServiceMonitor endpoints, adds a headless
// Service and ServiceMonitor for the aws-node CNI metrics port, and defines
// the EksAvailableIPs alert.
{
  _config+:: {
    eks: {
      // EksAvailableIPs fires when (total - assigned) IPs on an instance
      // drops below this number ...
      minimumAvailableIPs: 10,
      // ... and stays there for this long (used as the rule's 'for').
      minimumAvailableIPsTime: '10m',
    },
  },
  prometheus+: {
    // Replaces the endpoint list with a single endpoint on targetPort 9153.
    serviceMonitorCoreDNS+: {
      spec+: {
        endpoints: [
          {
            bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token',
            interval: '15s',
            targetPort: 9153,
          },
        ],
      },
    },
    // Headless Service (clusterIP 'None') selecting k8s-app=aws-node pods
    // and exposing port 61678 under the name 'cni-metrics-port'.
    AwsEksCniMetricService:
      service.new('aws-node', { 'k8s-app': 'aws-node' }, servicePort.newNamed('cni-metrics-port', 61678, 61678)) +
      service.mixin.metadata.withNamespace('kube-system') +
      service.mixin.metadata.withLabels({ 'k8s-app': 'aws-node' }) +
      service.mixin.spec.withClusterIp('None'),
    // ServiceMonitor that matches the Service above by label, in kube-system.
    serviceMonitorAwsEksCNI:
      {
        apiVersion: 'monitoring.coreos.com/v1',
        kind: 'ServiceMonitor',
        metadata: {
          name: 'awsekscni',
          namespace: $._config.namespace,
          labels: {
            'k8s-app': 'eks-cni',
          },
        },
        spec: {
          jobLabel: 'k8s-app',
          selector: {
            matchLabels: {
              'k8s-app': 'aws-node',
            },
          },
          namespaceSelector: {
            matchNames: [
              'kube-system',
            ],
          },
          endpoints: [
            {
              port: 'cni-metrics-port',
              interval: '30s',
              path: '/metrics',
            },
          ],
        },
      },
  },
  prometheusRules+: {
    groups+: [
      {
        name: 'kube-prometheus-eks.rules',
        rules: [
          {
            alert: 'EksAvailableIPs',
            expr: 'sum by(instance) (awscni_total_ip_addresses) - sum by(instance) (awscni_assigned_ip_addresses) < %s' % $._config.eks.minimumAvailableIPs,
            'for': $._config.eks.minimumAvailableIPsTime,
            labels: {
              severity: 'critical',
            },
            annotations: {
              // The threshold is interpolated from the same config value the
              // expression uses, so the text stays correct when it is overridden
              // (previously the message always said "10").
              message: 'Instance {{ $labels.instance }} has less than %s IPs available.' % $._config.eks.minimumAvailableIPs,
            },
          },
        ],
      },
    ],
  },
}
|
|
|
@ -1,46 +0,0 @@
|
||||||
// Replaces the kubelet ServiceMonitor endpoints with two plain-HTTP endpoints
// on the 'http-metrics' port: the default path and /metrics/cadvisor.
{
  local saTokenPath = '/var/run/secrets/kubernetes.io/serviceaccount/token',

  // Relabeling applied to both endpoints.
  local metricsPathRelabeling = {
    targetLabel: 'metrics_path',
    sourceLabels: ['__metrics_path__'],
  },

  // Settings common to both endpoints.
  local httpEndpoint = {
    scheme: 'http',
    port: 'http-metrics',
    interval: '30s',
    bearerTokenFile: saTokenPath,
    relabelings: [metricsPathRelabeling],
  },

  prometheus+:: {
    serviceMonitorKubelet+: {
      spec+: {
        endpoints: [
          httpEndpoint,
          httpEndpoint {
            path: '/metrics/cadvisor',
            honorLabels: true,
            metricRelabelings: [
              // Drop a bunch of metrics which are disabled but still sent, see
              // https://github.com/google/cadvisor/issues/1925.
              {
                action: 'drop',
                sourceLabels: ['__name__'],
                regex: 'container_(network_tcp_usage_total|network_udp_usage_total|tasks_state|cpu_load_average_10s)',
              },
            ],
          },
        ],
      },
    },
  },
}
|
|
|
@ -1,13 +0,0 @@
|
||||||
local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
|
|
||||||
local service = k.core.v1.service;
|
|
||||||
local servicePort = k.core.v1.service.mixin.spec.portsType;
|
|
||||||
|
|
||||||
// Adds a headless Service in kube-system selecting k8s-app=kube-dns pods and
// exposing port 9153 under the name 'metrics'.
{
  prometheus+:: {
    kubeDnsPrometheusDiscoveryService:
      local dnsLabels = { 'k8s-app': 'kube-dns' };
      local metricsPort = servicePort.newNamed('metrics', 9153, 9153);

      service.new('kube-dns-prometheus-discovery', dnsLabels, [metricsPort]) +
      service.mixin.metadata.withNamespace('kube-system') +
      service.mixin.metadata.withLabels(dnsLabels) +
      service.mixin.spec.withClusterIp('None'),
  },
}
|
|
|
@ -1,23 +0,0 @@
|
||||||
local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
|
|
||||||
local service = k.core.v1.service;
|
|
||||||
local servicePort = k.core.v1.service.mixin.spec.portsType;
|
|
||||||
|
|
||||||
// Headless discovery Services in kube-system for the controller manager,
// the scheduler and kube-dns. Each is selected and labelled by 'k8s-app'.
{
  // Builds '<app>-prometheus-discovery'; `ports` is passed to service.new
  // unchanged (a single port or a list of ports).
  local discoveryService(app, ports) =
    local appLabel = { 'k8s-app': app };
    service.new(app + '-prometheus-discovery', appLabel, ports) +
    service.mixin.metadata.withNamespace('kube-system') +
    service.mixin.metadata.withLabels(appLabel) +
    service.mixin.spec.withClusterIp('None'),

  prometheus+:: {
    kubeControllerManagerPrometheusDiscoveryService:
      discoveryService('kube-controller-manager', servicePort.newNamed('https-metrics', 10257, 10257)),

    kubeSchedulerPrometheusDiscoveryService:
      discoveryService('kube-scheduler', servicePort.newNamed('https-metrics', 10259, 10259)),

    kubeDnsPrometheusDiscoveryService:
      discoveryService('kube-dns', [
        servicePort.newNamed('metrics', 10055, 10055),
        servicePort.newNamed('http-metrics-dnsmasq', 10054, 10054),
      ]),
  },
}
|
|
|
@ -1,8 +0,0 @@
|
||||||
local kp = (import 'kube-prometheus/kube-prometheus.libsonnet');

// Re-keys every visible field of `component` as prefix + field name.
local withPrefix(prefix, component) = {
  [prefix + name]: component[name]
  for name in std.objectFields(component)
};

// One flat object holding every component's manifests under prefixed keys.
withPrefix('0prometheus-operator-', kp.prometheusOperator) +
withPrefix('node-exporter-', kp.nodeExporter) +
withPrefix('kube-state-metrics-', kp.kubeStateMetrics) +
withPrefix('alertmanager-', kp.alertmanager) +
withPrefix('prometheus-', kp.prometheus) +
withPrefix('grafana-', kp.grafana)
|
|
|
@ -1,18 +0,0 @@
|
||||||
local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
|
|
||||||
local service = k.core.v1.service;
|
|
||||||
local servicePort = k.core.v1.service.mixin.spec.portsType;
|
|
||||||
|
|
||||||
// Headless discovery Services in kube-system for the controller manager and
// the scheduler, selecting pods by their 'k8s-app' label.
{
  local secureMetricsPort(number) = servicePort.newNamed('https-metrics', number, number),

  local headlessFor(app, port) =
    local byApp = { 'k8s-app': app };
    service.new(app + '-prometheus-discovery', byApp, port) +
    service.mixin.metadata.withNamespace('kube-system') +
    service.mixin.metadata.withLabels(byApp) +
    service.mixin.spec.withClusterIp('None'),

  prometheus+: {
    kubeControllerManagerPrometheusDiscoveryService:
      headlessFor('kube-controller-manager', secureMetricsPort(10257)),
    kubeSchedulerPrometheusDiscoveryService:
      headlessFor('kube-scheduler', secureMetricsPort(10259)),
  },
}
|
|
|
@ -1,18 +0,0 @@
|
||||||
local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
|
|
||||||
local service = k.core.v1.service;
|
|
||||||
local servicePort = k.core.v1.service.mixin.spec.portsType;
|
|
||||||
|
|
||||||
// Headless discovery Services in kube-system for the controller manager and
// the scheduler. Pods are selected by their 'component' label, while the
// Service itself carries a 'k8s-app' label.
{
  local componentDiscovery(component, metricsPort) =
    service.new(
      component + '-prometheus-discovery',
      { component: component },
      servicePort.newNamed('https-metrics', metricsPort, metricsPort)
    ) +
    service.mixin.metadata.withNamespace('kube-system') +
    service.mixin.metadata.withLabels({ 'k8s-app': component }) +
    service.mixin.spec.withClusterIp('None'),

  prometheus+: {
    kubeControllerManagerPrometheusDiscoveryService: componentDiscovery('kube-controller-manager', 10257),
    kubeSchedulerPrometheusDiscoveryService: componentDiscovery('kube-scheduler', 10259),
  },
}
|
|
|
@ -1,40 +0,0 @@
|
||||||
local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
|
|
||||||
local service = k.core.v1.service;
|
|
||||||
local servicePort = k.core.v1.service.mixin.spec.portsType;
|
|
||||||
|
|
||||||
// Headless discovery Services for the controller manager and the scheduler
// (pods selected by 'component'), plus ServiceMonitor selector overrides that
// match those Services by their 'k8s-app' label.
{
  local controlPlaneService(app, securePort) =
    service.new(
      app + '-prometheus-discovery',
      { component: app },
      servicePort.newNamed('https-metrics', securePort, securePort)
    ) +
    service.mixin.metadata.withNamespace('kube-system') +
    service.mixin.metadata.withLabels({ 'k8s-app': app }) +
    service.mixin.spec.withClusterIp('None'),

  // Patch that sets spec.selector.matchLabels to { 'k8s-app': app }.
  local selectByApp(app) = {
    spec+: {
      selector+: {
        matchLabels: {
          'k8s-app': app,
        },
      },
    },
  },

  prometheus+: {
    kubeControllerManagerPrometheusDiscoveryService: controlPlaneService('kube-controller-manager', 10257),
    kubeSchedulerPrometheusDiscoveryService: controlPlaneService('kube-scheduler', 10259),

    serviceMonitorKubeScheduler+: selectByApp('kube-scheduler'),
    serviceMonitorKubeControllerManager+: selectByApp('kube-controller-manager'),
  },
}
|
|
|
@ -1,35 +0,0 @@
|
||||||
// On managed Kubernetes clusters some of the control plane components are not exposed to customers.
|
|
||||||
// Disable scrape jobs, service monitors, and alert groups for these components by overwriting 'kube-prometheus.libsonnet' defaults
|
|
||||||
|
|
||||||
{
  _config+:: {
    // Copy the inherited jobs map, leaving out the controller-manager and
    // scheduler entries. The exclusion lists in this file are kept in sorted
    // order because std.setMember expects a sorted set.
    local inheritedJobs = super.jobs,
    local excludedJobs = ['KubeControllerManager', 'KubeScheduler'],
    jobs: {
      [jobName]: inheritedJobs[jobName]
      for jobName in std.objectFields(inheritedJobs)
      if !std.setMember(jobName, excludedJobs)
    },

    // Drop the matching alerting rule groups as well.
    prometheus+:: {
      rules+:: {
        local inheritedGroups = super.groups,
        local excludedGroups = ['kubernetes-system-controller-manager', 'kubernetes-system-scheduler'],
        groups: std.filter(
          function(group) !std.setMember(group.name, excludedGroups),
          inheritedGroups
        ),
      },
    },
  },

  // Same approach for the ServiceMonitors: rebuild `prometheus` from its
  // visible fields without the two control-plane monitors.
  local inheritedPrometheus = super.prometheus,
  local excludedMonitors = ['serviceMonitorKubeControllerManager', 'serviceMonitorKubeScheduler'],
  prometheus: {
    [member]: inheritedPrometheus[member]
    for member in std.objectFields(inheritedPrometheus)
    if !std.setMember(member, excludedMonitors)
  },
}
|
|
|
@ -1,21 +0,0 @@
|
||||||
local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
|
|
||||||
local service = k.core.v1.service;
|
|
||||||
local servicePort = k.core.v1.service.mixin.spec.portsType;
|
|
||||||
|
|
||||||
// Switches the Prometheus, Alertmanager and Grafana Services to type NodePort
// with fixed node ports.
{
  // Service patch: one named port (the target port is referenced by the same
  // name) pinned to `nodePort`, plus the NodePort service type.
  local asNodePort(portName, port, nodePort) =
    local exposedPort =
      servicePort.newNamed(portName, port, portName) +
      servicePort.withNodePort(nodePort);
    service.mixin.spec.withPorts(exposedPort) +
    service.mixin.spec.withType('NodePort'),

  prometheus+: {
    service+: asNodePort('web', 9090, 30900),
  },
  alertmanager+: {
    service+: asNodePort('web', 9093, 30903),
  },
  grafana+: {
    service+: asNodePort('http', 3000, 30902),
  },
}
|
|
|
@ -1,99 +0,0 @@
|
||||||
local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
|
|
||||||
|
|
||||||
// Monitoring for an etcd cluster reachable at a fixed list of IPs: a selector-less
// Service with hand-written Endpoints, a TLS ServiceMonitor, and a Secret
// carrying the client certificates.
(import 'etcd-mixin/mixin.libsonnet') + {
  _config+:: {
    etcd: {
      ips: [],
      clientCA: null,
      clientKey: null,
      clientCert: null,
      // Optional tlsConfig settings; each is emitted only when non-null.
      serverName: null,
      insecureSkipVerify: null,
    },
  },
  prometheus+:: {
    local etcdSettings = $._config.etcd,
    local etcdLabels = { 'k8s-app': 'etcd' },

    // Service without a selector (null is passed for it).
    serviceEtcd:
      local svc = k.core.v1.service;
      local svcPort = svc.mixin.spec.portsType;

      svc.new('etcd', null, svcPort.newNamed('metrics', 2379, 2379)) +
      svc.mixin.metadata.withNamespace('kube-system') +
      svc.mixin.metadata.withLabels(etcdLabels) +
      svc.mixin.spec.withClusterIp('None'),

    // Endpoints listing every configured etcd IP on TCP port 2379.
    endpointsEtcd:
      local ep = k.core.v1.endpoints;
      local epSubset = ep.subsetsType;
      local epPort = epSubset.portsType;

      local metricsPort =
        epPort.new() +
        epPort.withName('metrics') +
        epPort.withPort(2379) +
        epPort.withProtocol('TCP');

      local addresses = std.map(function(address) { ip: address }, etcdSettings.ips);

      local etcdSubset =
        epSubset.new() +
        epSubset.withAddresses(addresses) +
        epSubset.withPorts(metricsPort);

      ep.new() +
      ep.mixin.metadata.withName('etcd') +
      ep.mixin.metadata.withNamespace('kube-system') +
      ep.mixin.metadata.withLabels(etcdLabels) +
      ep.withSubsets(etcdSubset),

    serviceMonitorEtcd:
      {
        apiVersion: 'monitoring.coreos.com/v1',
        kind: 'ServiceMonitor',
        metadata: {
          name: 'etcd',
          namespace: 'kube-system',
          labels: etcdLabels,
        },
        spec: {
          jobLabel: 'k8s-app',
          selector: {
            matchLabels: etcdLabels,
          },
          endpoints: [
            {
              port: 'metrics',
              scheme: 'https',
              interval: '30s',
              // Prometheus Operator (and Prometheus) allow us to specify a tlsConfig.
              // This is required because the etcd metrics endpoint is most likely secured.
              tlsConfig: {
                caFile: '/etc/prometheus/secrets/kube-etcd-client-certs/etcd-client-ca.crt',
                keyFile: '/etc/prometheus/secrets/kube-etcd-client-certs/etcd-client.key',
                certFile: '/etc/prometheus/secrets/kube-etcd-client-certs/etcd-client.crt',
                [if etcdSettings.serverName != null then 'serverName']: etcdSettings.serverName,
                [if etcdSettings.insecureSkipVerify != null then 'insecureSkipVerify']: etcdSettings.insecureSkipVerify,
              },
            },
          ],
        },
      },

    secretEtcdCerts:
      // Prometheus Operator can mount secrets into the Prometheus pod; storing
      // the certificates as secret entries makes them available there as files.
      local certSecret = k.core.v1.secret;
      certSecret.new('kube-etcd-client-certs', {
        'etcd-client-ca.crt': std.base64(etcdSettings.clientCA),
        'etcd-client.key': std.base64(etcdSettings.clientKey),
        'etcd-client.crt': std.base64(etcdSettings.clientCert),
      }) +
      certSecret.mixin.metadata.withNamespace($._config.namespace),

    prometheus+:
      {
        // Reference info: https://coreos.com/operators/prometheus/docs/latest/api.html#prometheusspec
        spec+: {
          // Appends the certificate secret's name to the Prometheus spec's secrets list.
          secrets+: [$.prometheus.secretEtcdCerts.metadata.name],
        },
      },
  },
}
|
|
|
@ -1,35 +0,0 @@
|
||||||
// Strips spec.containers[].limits for certain containers
|
|
||||||
// https://github.com/coreos/kube-prometheus/issues/72
|
|
||||||
{
  // Override that sets a container's resource limits to an empty object.
  local withoutLimits = { limits: {} },

  _config+:: {
    resources+:: {
      'addon-resizer'+: withoutLimits,
      'kube-rbac-proxy'+: withoutLimits,
      'kube-state-metrics'+: withoutLimits,
      'node-exporter'+: withoutLimits,
    },
  },
  prometheusOperator+: {
    deployment+: {
      spec+: {
        template+: {
          spec+: {
            // Appends '--config-reloader-cpu=0' to the prometheus-operator
            // container's args; every other container is passed through as is.
            containers: [
              (
                if container.name == 'prometheus-operator'
                then container { args+: ['--config-reloader-cpu=0'] }
                else container
              )
              for container in super.containers
            ],
          },
        },
      },
    },
  },
}
|
|
|
@ -1,39 +0,0 @@
|
||||||
local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
|
|
||||||
local service = k.core.v1.service;
|
|
||||||
local servicePort = k.core.v1.service.mixin.spec.portsType;
|
|
||||||
|
|
||||||
// Thanos sidecar addon: declares the Thanos version, image and object-storage
// secret reference, adds a grpc port to the Prometheus Service, and fills in
// the `thanos` section of the Prometheus spec.
{
  _config+:: {
    versions+:: {
      thanos: 'v0.14.0',
    },
    imageRepos+:: {
      thanos: 'quay.io/thanos/thanos',
    },
    thanos+:: {
      objectStorageConfig: {
        // Name of the Kubernetes secret that holds the config.
        name: 'thanos-objectstorage',
        // Name of the file (key) inside that secret.
        key: 'thanos.yaml',
      },
    },
  },
  prometheus+:: {
    local thanosVersion = $._config.versions.thanos,
    local thanosRepo = $._config.imageRepos.thanos,

    // Add the grpc port to the Prometheus service to be able to query it with the Thanos Querier.
    service+: {
      spec+: {
        ports+: [
          servicePort.newNamed('grpc', 10901, 10901),
        ],
      },
    },
    prometheus+: {
      spec+: {
        thanos+: {
          image: thanosRepo + ':' + thanosVersion,
          version: thanosVersion,
          objectStorageConfig: $._config.thanos.objectStorageConfig,
        },
      },
    },
  },
}
|
|
|
@ -1,189 +0,0 @@
|
||||||
local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
|
|
||||||
local service = k.core.v1.service;
|
|
||||||
local servicePort = k.core.v1.service.mixin.spec.portsType;
|
|
||||||
|
|
||||||
// Weave Net addon: a headless Service and ServiceMonitor for the weave-net
// metrics port, a 'weave-net' alert group, and two Grafana dashboards.
{
  local weaveLabels = { 'k8s-app': 'weave-net' },

  // Every alert in the group has severity 'critical' and the same field layout.
  local criticalAlert(name, expression, duration, summary, description) = {
    alert: name,
    expr: expression,
    'for': duration,
    labels: {
      severity: 'critical',
    },
    annotations: {
      summary: summary,
      description: description,
    },
  },

  prometheus+: {
    // Selects pods labelled name=weave-net; the Service itself is labelled k8s-app=weave-net.
    serviceWeaveNet:
      local metricsPort = servicePort.newNamed('weave-net-metrics', 6782, 6782);
      service.new('weave-net', { name: 'weave-net' }, metricsPort) +
      service.mixin.metadata.withNamespace('kube-system') +
      service.mixin.metadata.withLabels(weaveLabels) +
      service.mixin.spec.withClusterIp('None'),

    serviceMonitorWeaveNet: {
      apiVersion: 'monitoring.coreos.com/v1',
      kind: 'ServiceMonitor',
      metadata: {
        name: 'weave-net',
        // NOTE(review): fixed namespace rather than $._config.namespace; confirm this is intended.
        namespace: 'monitoring',
        labels: weaveLabels,
      },
      spec: {
        jobLabel: 'k8s-app',
        selector: {
          matchLabels: weaveLabels,
        },
        namespaceSelector: {
          matchNames: [
            'kube-system',
          ],
        },
        endpoints: [
          {
            port: 'weave-net-metrics',
            path: '/metrics',
            interval: '15s',
          },
        ],
      },
    },
  },

  prometheusRules+: {
    groups+: [
      {
        name: 'weave-net',
        rules: [
          criticalAlert(
            'WeaveNetIPAMSplitBrain',
            'max(weave_ipam_unreachable_percentage) - min(weave_ipam_unreachable_percentage) > 0',
            '3m',
            'Percentage of all IP addresses owned by unreachable peers is not same for every node.',
            'actionable: Weave Net network has a split brain problem. Please find the problem and fix it.'
          ),
          criticalAlert(
            'WeaveNetIPAMUnreachable',
            'weave_ipam_unreachable_percentage > 25',
            '10m',
            'Percentage of all IP addresses owned by unreachable peers is above threshold.',
            'actionable: Please find the problem and fix it.'
          ),
          criticalAlert(
            'WeaveNetIPAMPendingAllocates',
            'sum(weave_ipam_pending_allocates) > 0',
            '3m',
            'Number of pending allocates is above the threshold.',
            'actionable: Please find the problem and fix it.'
          ),
          criticalAlert(
            'WeaveNetIPAMPendingClaims',
            'sum(weave_ipam_pending_claims) > 0',
            '3m',
            'Number of pending claims is above the threshold.',
            'actionable: Please find the problem and fix it.'
          ),
          criticalAlert(
            'WeaveNetFastDPFlowsLow',
            'sum(weave_flows) < 15000',
            '3m',
            'Number of FastDP flows is below the threshold.',
            'actionable: Please find the reason for FastDP flows to go below the threshold and fix it.'
          ),
          criticalAlert(
            'WeaveNetFastDPFlowsOff',
            'sum(weave_flows == bool 0) > 0',
            '3m',
            'FastDP flows is zero.',
            'actionable: Please find the reason for FastDP flows to be off and fix it.'
          ),
          criticalAlert(
            'WeaveNetHighConnectionTerminationRate',
            'rate(weave_connection_terminations_total[5m]) > 0.1',
            '5m',
            'A lot of connections are getting terminated.',
            'actionable: Please find the reason for the high connection termination rate and fix it.'
          ),
          criticalAlert(
            'WeaveNetConnectionsConnecting',
            'sum(weave_connections{state="connecting"}) > 0',
            '3m',
            'A lot of connections are in connecting state.',
            'actionable: Please find the reason for this and fix it.'
          ),
          // The alert name's spelling ("Retying") is kept as is so existing
          // routes and silences keyed on it continue to match.
          criticalAlert(
            'WeaveNetConnectionsRetying',
            'sum(weave_connections{state="retrying"}) > 0',
            '3m',
            'A lot of connections are in retrying state.',
            'actionable: Please find the reason for this and fix it.'
          ),
          criticalAlert(
            'WeaveNetConnectionsPending',
            'sum(weave_connections{state="pending"}) > 0',
            '3m',
            'A lot of connections are in pending state.',
            'actionable: Please find the reason for this and fix it.'
          ),
          criticalAlert(
            'WeaveNetConnectionsFailed',
            'sum(weave_connections{state="failed"}) > 0',
            '3m',
            'A lot of connections are in failed state.',
            'actionable: Please find the reason and fix it.'
          ),
        ],
      },
    ],
  },

  grafanaDashboards+:: {
    'weave-net.json': (import 'grafana-weave-net.json'),
    'weave-net-cluster.json': (import 'grafana-weave-net-cluster.json'),
  },
}
|
|
|
@ -1,196 +0,0 @@
|
||||||
local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
|
|
||||||
local k3 = import 'ksonnet/ksonnet.beta.3/k.libsonnet';
|
|
||||||
local configMapList = k3.core.v1.configMapList;
|
|
||||||
|
|
||||||
(import 'grafana/grafana.libsonnet') +
|
|
||||||
(import 'kube-state-metrics/kube-state-metrics.libsonnet') +
|
|
||||||
(import 'kube-state-metrics-mixin/mixin.libsonnet') +
|
|
||||||
(import 'node-exporter/node-exporter.libsonnet') +
|
|
||||||
(import 'node-mixin/mixin.libsonnet') +
|
|
||||||
(import 'alertmanager/alertmanager.libsonnet') +
|
|
||||||
(import 'prometheus-operator/prometheus-operator.libsonnet') +
|
|
||||||
(import 'prometheus/prometheus.libsonnet') +
|
|
||||||
(import 'prometheus-adapter/prometheus-adapter.libsonnet') +
|
|
||||||
(import 'kubernetes-mixin/mixin.libsonnet') +
|
|
||||||
(import 'prometheus/mixin.libsonnet') +
|
|
||||||
(import 'alerts/alerts.libsonnet') +
|
|
||||||
(import 'rules/rules.libsonnet') + {
|
|
||||||
kubePrometheus+:: {
|
|
||||||
namespace: k.core.v1.namespace.new($._config.namespace),
|
|
||||||
},
|
|
||||||
prometheusOperator+:: {
|
|
||||||
service+: {
|
|
||||||
spec+: {
|
|
||||||
ports: [
|
|
||||||
{
|
|
||||||
name: 'https',
|
|
||||||
port: 8443,
|
|
||||||
targetPort: 'https',
|
|
||||||
},
|
|
||||||
],
|
|
||||||
},
|
|
||||||
},
|
|
||||||
serviceMonitor+: {
|
|
||||||
spec+: {
|
|
||||||
endpoints: [
|
|
||||||
{
|
|
||||||
port: 'https',
|
|
||||||
scheme: 'https',
|
|
||||||
honorLabels: true,
|
|
||||||
bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token',
|
|
||||||
tlsConfig: {
|
|
||||||
insecureSkipVerify: true,
|
|
||||||
},
|
|
||||||
},
|
|
||||||
]
|
|
||||||
},
|
|
||||||
},
|
|
||||||
clusterRole+: {
|
|
||||||
rules+: [
|
|
||||||
{
|
|
||||||
apiGroups: ['authentication.k8s.io'],
|
|
||||||
resources: ['tokenreviews'],
|
|
||||||
verbs: ['create'],
|
|
||||||
},
|
|
||||||
{
|
|
||||||
apiGroups: ['authorization.k8s.io'],
|
|
||||||
resources: ['subjectaccessreviews'],
|
|
||||||
verbs: ['create'],
|
|
||||||
},
|
|
||||||
],
|
|
||||||
},
|
|
||||||
} +
|
|
||||||
((import 'kube-prometheus/kube-rbac-proxy/container.libsonnet') {
|
|
||||||
config+:: {
|
|
||||||
kubeRbacProxy: {
|
|
||||||
local cfg = self,
|
|
||||||
image: $._config.imageRepos.kubeRbacProxy + ':' + $._config.versions.kubeRbacProxy,
|
|
||||||
name: 'kube-rbac-proxy',
|
|
||||||
securePortName: 'https',
|
|
||||||
securePort: 8443,
|
|
||||||
secureListenAddress: ':%d' % self.securePort,
|
|
||||||
upstream: 'http://127.0.0.1:8080/',
|
|
||||||
tlsCipherSuites: $._config.tlsCipherSuites,
|
|
||||||
},
|
|
||||||
},
|
|
||||||
}).deploymentMixin,
|
|
||||||
|
|
||||||
grafana+:: {
|
|
||||||
dashboardDefinitions: configMapList.new(super.dashboardDefinitions),
|
|
||||||
serviceMonitor: {
|
|
||||||
apiVersion: 'monitoring.coreos.com/v1',
|
|
||||||
kind: 'ServiceMonitor',
|
|
||||||
metadata: {
|
|
||||||
name: 'grafana',
|
|
||||||
namespace: $._config.namespace,
|
|
||||||
},
|
|
||||||
spec: {
|
|
||||||
selector: {
|
|
||||||
matchLabels: {
|
|
||||||
app: 'grafana',
|
|
||||||
},
|
|
||||||
},
|
|
||||||
endpoints: [
|
|
||||||
{
|
|
||||||
port: 'http',
|
|
||||||
interval: '15s',
|
|
||||||
},
|
|
||||||
],
|
|
||||||
},
|
|
||||||
},
|
|
||||||
},
|
|
||||||
} + {
|
|
||||||
_config+:: {
|
|
||||||
namespace: 'default',
|
|
||||||
|
|
||||||
versions+:: {
|
|
||||||
grafana: '7.1.0',
|
|
||||||
},
|
|
||||||
|
|
||||||
tlsCipherSuites: [
|
|
||||||
'TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256', // required by h2: http://golang.org/cl/30721
|
|
||||||
'TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256', // required by h2: http://golang.org/cl/30721
|
|
||||||
|
|
||||||
// 'TLS_RSA_WITH_RC4_128_SHA', // insecure: https://access.redhat.com/security/cve/cve-2013-2566
|
|
||||||
// 'TLS_RSA_WITH_3DES_EDE_CBC_SHA', // insecure: https://access.redhat.com/articles/2548661
|
|
||||||
// 'TLS_RSA_WITH_AES_128_CBC_SHA', // disabled by h2
|
|
||||||
// 'TLS_RSA_WITH_AES_256_CBC_SHA', // disabled by h2
|
|
||||||
// 'TLS_RSA_WITH_AES_128_CBC_SHA256', // insecure: https://access.redhat.com/security/cve/cve-2013-0169
|
|
||||||
// 'TLS_RSA_WITH_AES_128_GCM_SHA256', // disabled by h2
|
|
||||||
// 'TLS_RSA_WITH_AES_256_GCM_SHA384', // disabled by h2
|
|
||||||
// 'TLS_ECDHE_ECDSA_WITH_RC4_128_SHA', // insecure: https://access.redhat.com/security/cve/cve-2013-2566
|
|
||||||
// 'TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA', // disabled by h2
|
|
||||||
// 'TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA', // disabled by h2
|
|
||||||
// 'TLS_ECDHE_RSA_WITH_RC4_128_SHA', // insecure: https://access.redhat.com/security/cve/cve-2013-2566
|
|
||||||
// 'TLS_ECDHE_RSA_WITH_3DES_EDE_CBC_SHA', // insecure: https://access.redhat.com/articles/2548661
|
|
||||||
// 'TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA', // disabled by h2
|
|
||||||
// 'TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA', // disabled by h2
|
|
||||||
// 'TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256', // insecure: https://access.redhat.com/security/cve/cve-2013-0169
|
|
||||||
// 'TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256', // insecure: https://access.redhat.com/security/cve/cve-2013-0169
|
|
||||||
|
|
||||||
// disabled by h2 means: https://github.com/golang/net/blob/e514e69ffb8bc3c76a71ae40de0118d794855992/http2/ciphers.go
|
|
||||||
|
|
||||||
'TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384',
|
|
||||||
'TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384',
|
|
||||||
'TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305',
|
|
||||||
'TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305',
|
|
||||||
],
|
|
||||||
|
|
||||||
cadvisorSelector: 'job="kubelet", metrics_path="/metrics/cadvisor"',
|
|
||||||
kubeletSelector: 'job="kubelet", metrics_path="/metrics"',
|
|
||||||
kubeStateMetricsSelector: 'job="kube-state-metrics"',
|
|
||||||
nodeExporterSelector: 'job="node-exporter"',
|
|
||||||
fsSpaceFillingUpCriticalThreshold: 15,
|
|
||||||
notKubeDnsSelector: 'job!="kube-dns"',
|
|
||||||
kubeSchedulerSelector: 'job="kube-scheduler"',
|
|
||||||
kubeControllerManagerSelector: 'job="kube-controller-manager"',
|
|
||||||
kubeApiserverSelector: 'job="apiserver"',
|
|
||||||
coreDNSSelector: 'job="kube-dns"',
|
|
||||||
podLabel: 'pod',
|
|
||||||
|
|
||||||
alertmanagerSelector: 'job="alertmanager-' + $._config.alertmanager.name + '",namespace="' + $._config.namespace + '"',
|
|
||||||
prometheusSelector: 'job="prometheus-' + $._config.prometheus.name + '",namespace="' + $._config.namespace + '"',
|
|
||||||
prometheusName: '{{$labels.namespace}}/{{$labels.pod}}',
|
|
||||||
prometheusOperatorSelector: 'job="prometheus-operator",namespace="' + $._config.namespace + '"',
|
|
||||||
|
|
||||||
jobs: {
|
|
||||||
Kubelet: $._config.kubeletSelector,
|
|
||||||
KubeScheduler: $._config.kubeSchedulerSelector,
|
|
||||||
KubeControllerManager: $._config.kubeControllerManagerSelector,
|
|
||||||
KubeAPI: $._config.kubeApiserverSelector,
|
|
||||||
KubeStateMetrics: $._config.kubeStateMetricsSelector,
|
|
||||||
NodeExporter: $._config.nodeExporterSelector,
|
|
||||||
Alertmanager: $._config.alertmanagerSelector,
|
|
||||||
Prometheus: $._config.prometheusSelector,
|
|
||||||
PrometheusOperator: $._config.prometheusOperatorSelector,
|
|
||||||
CoreDNS: $._config.coreDNSSelector,
|
|
||||||
},
|
|
||||||
|
|
||||||
resources+:: {
|
|
||||||
'addon-resizer': {
|
|
||||||
requests: { cpu: '10m', memory: '30Mi' },
|
|
||||||
limits: { cpu: '50m', memory: '30Mi' },
|
|
||||||
},
|
|
||||||
'kube-rbac-proxy': {
|
|
||||||
requests: { cpu: '10m', memory: '20Mi' },
|
|
||||||
limits: { cpu: '20m', memory: '40Mi' },
|
|
||||||
},
|
|
||||||
'kube-state-metrics': {
|
|
||||||
requests: { cpu: '100m', memory: '150Mi' },
|
|
||||||
limits: { cpu: '100m', memory: '150Mi' },
|
|
||||||
},
|
|
||||||
'node-exporter': {
|
|
||||||
requests: { cpu: '102m', memory: '180Mi' },
|
|
||||||
limits: { cpu: '250m', memory: '180Mi' },
|
|
||||||
},
|
|
||||||
},
|
|
||||||
prometheus+:: {
|
|
||||||
rules: $.prometheusRules + $.prometheusAlerts,
|
|
||||||
},
|
|
||||||
|
|
||||||
grafana+:: {
|
|
||||||
dashboards: $.grafanaDashboards,
|
|
||||||
},
|
|
||||||
|
|
||||||
},
|
|
||||||
}
|
|
|
@ -1,91 +0,0 @@
|
||||||
local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
|
|
||||||
local deployment = k.apps.v1.deployment;
|
|
||||||
local container = deployment.mixin.spec.template.spec.containersType;
|
|
||||||
local containerPort = container.portsType;
|
|
||||||
|
|
||||||
{
|
|
||||||
local krp = self,
|
|
||||||
config+:: {
|
|
||||||
kubeRbacProxy: {
|
|
||||||
image: error 'must provide image',
|
|
||||||
name: error 'must provide name',
|
|
||||||
securePortName: error 'must provide securePortName',
|
|
||||||
securePort: error 'must provide securePort',
|
|
||||||
secureListenAddress: error 'must provide secureListenAddress',
|
|
||||||
upstream: error 'must provide upstream',
|
|
||||||
tlsCipherSuites: error 'must provide tlsCipherSuites',
|
|
||||||
},
|
|
||||||
},
|
|
||||||
|
|
||||||
specMixin:: {
|
|
||||||
local sm = self,
|
|
||||||
config+:: {
|
|
||||||
kubeRbacProxy: {
|
|
||||||
image: error 'must provide image',
|
|
||||||
name: error 'must provide name',
|
|
||||||
securePortName: error 'must provide securePortName',
|
|
||||||
securePort: error 'must provide securePort',
|
|
||||||
secureListenAddress: error 'must provide secureListenAddress',
|
|
||||||
upstream: error 'must provide upstream',
|
|
||||||
tlsCipherSuites: error 'must provide tlsCipherSuites',
|
|
||||||
},
|
|
||||||
},
|
|
||||||
spec+: {
|
|
||||||
template+: {
|
|
||||||
spec+: {
|
|
||||||
containers+: [
|
|
||||||
container.new(krp.config.kubeRbacProxy.name, krp.config.kubeRbacProxy.image) +
|
|
||||||
container.mixin.securityContext.withRunAsUser(65534) +
|
|
||||||
container.withArgs([
|
|
||||||
'--logtostderr',
|
|
||||||
'--secure-listen-address=' + krp.config.kubeRbacProxy.secureListenAddress,
|
|
||||||
'--tls-cipher-suites=' + std.join(',', krp.config.kubeRbacProxy.tlsCipherSuites),
|
|
||||||
'--upstream=' + krp.config.kubeRbacProxy.upstream,
|
|
||||||
]) +
|
|
||||||
container.withPorts(containerPort.newNamed(krp.config.kubeRbacProxy.securePort, krp.config.kubeRbacProxy.securePortName)),
|
|
||||||
],
|
|
||||||
},
|
|
||||||
},
|
|
||||||
},
|
|
||||||
},
|
|
||||||
|
|
||||||
deploymentMixin:: {
|
|
||||||
local dm = self,
|
|
||||||
config+:: {
|
|
||||||
kubeRbacProxy: {
|
|
||||||
image: error 'must provide image',
|
|
||||||
name: error 'must provide name',
|
|
||||||
securePortName: error 'must provide securePortName',
|
|
||||||
securePort: error 'must provide securePort',
|
|
||||||
secureListenAddress: error 'must provide secureListenAddress',
|
|
||||||
upstream: error 'must provide upstream',
|
|
||||||
tlsCipherSuites: error 'must provide tlsCipherSuites',
|
|
||||||
},
|
|
||||||
},
|
|
||||||
deployment+: krp.specMixin {
|
|
||||||
config+:: {
|
|
||||||
kubeRbacProxy+: dm.config.kubeRbacProxy,
|
|
||||||
},
|
|
||||||
},
|
|
||||||
},
|
|
||||||
|
|
||||||
statefulSetMixin:: {
|
|
||||||
local sm = self,
|
|
||||||
config+:: {
|
|
||||||
kubeRbacProxy: {
|
|
||||||
image: error 'must provide image',
|
|
||||||
name: error 'must provide name',
|
|
||||||
securePortName: error 'must provide securePortName',
|
|
||||||
securePort: error 'must provide securePort',
|
|
||||||
secureListenAddress: error 'must provide secureListenAddress',
|
|
||||||
upstream: error 'must provide upstream',
|
|
||||||
tlsCipherSuites: error 'must provide tlsCipherSuites',
|
|
||||||
},
|
|
||||||
},
|
|
||||||
statefulSet+: krp.specMixin {
|
|
||||||
config+:: {
|
|
||||||
kubeRbacProxy+: sm.config.kubeRbacProxy,
|
|
||||||
},
|
|
||||||
},
|
|
||||||
},
|
|
||||||
}
|
|
|
@ -1,129 +0,0 @@
|
||||||
{
|
|
||||||
_config+:: {
|
|
||||||
versions+:: {
|
|
||||||
kubeStateMetrics: '1.9.5',
|
|
||||||
},
|
|
||||||
imageRepos+:: {
|
|
||||||
kubeStateMetrics: 'quay.io/coreos/kube-state-metrics',
|
|
||||||
},
|
|
||||||
kubeStateMetrics+:: {
|
|
||||||
scrapeInterval: '30s',
|
|
||||||
scrapeTimeout: '30s',
|
|
||||||
},
|
|
||||||
},
|
|
||||||
kubeStateMetrics+:: (import 'kube-state-metrics/kube-state-metrics.libsonnet') +
|
|
||||||
{
|
|
||||||
local ksm = self,
|
|
||||||
name:: 'kube-state-metrics',
|
|
||||||
namespace:: $._config.namespace,
|
|
||||||
version:: $._config.versions.kubeStateMetrics,
|
|
||||||
image:: $._config.imageRepos.kubeStateMetrics + ':v' + $._config.versions.kubeStateMetrics,
|
|
||||||
service+: {
|
|
||||||
spec+: {
|
|
||||||
ports: [
|
|
||||||
{
|
|
||||||
name: 'https-main',
|
|
||||||
port: 8443,
|
|
||||||
targetPort: 'https-main',
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: 'https-self',
|
|
||||||
port: 9443,
|
|
||||||
targetPort: 'https-self',
|
|
||||||
},
|
|
||||||
],
|
|
||||||
},
|
|
||||||
},
|
|
||||||
deployment+: {
|
|
||||||
spec+: {
|
|
||||||
template+: {
|
|
||||||
spec+: {
|
|
||||||
containers: std.map(function(c) c {
|
|
||||||
ports:: null,
|
|
||||||
livenessProbe:: null,
|
|
||||||
readinessProbe:: null,
|
|
||||||
args: ['--host=127.0.0.1', '--port=8081', '--telemetry-host=127.0.0.1', '--telemetry-port=8082'],
|
|
||||||
}, super.containers),
|
|
||||||
},
|
|
||||||
},
|
|
||||||
},
|
|
||||||
},
|
|
||||||
serviceMonitor:
|
|
||||||
{
|
|
||||||
apiVersion: 'monitoring.coreos.com/v1',
|
|
||||||
kind: 'ServiceMonitor',
|
|
||||||
metadata: {
|
|
||||||
name: 'kube-state-metrics',
|
|
||||||
namespace: $._config.namespace,
|
|
||||||
labels: {
|
|
||||||
'app.kubernetes.io/name': 'kube-state-metrics',
|
|
||||||
'app.kubernetes.io/version': ksm.version,
|
|
||||||
},
|
|
||||||
},
|
|
||||||
spec: {
|
|
||||||
jobLabel: 'app.kubernetes.io/name',
|
|
||||||
selector: {
|
|
||||||
matchLabels: {
|
|
||||||
'app.kubernetes.io/name': 'kube-state-metrics',
|
|
||||||
},
|
|
||||||
},
|
|
||||||
endpoints: [
|
|
||||||
{
|
|
||||||
port: 'https-main',
|
|
||||||
scheme: 'https',
|
|
||||||
interval: $._config.kubeStateMetrics.scrapeInterval,
|
|
||||||
scrapeTimeout: $._config.kubeStateMetrics.scrapeTimeout,
|
|
||||||
honorLabels: true,
|
|
||||||
bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token',
|
|
||||||
relabelings: [
|
|
||||||
{
|
|
||||||
regex: '(pod|service|endpoint|namespace)',
|
|
||||||
action: 'labeldrop',
|
|
||||||
},
|
|
||||||
],
|
|
||||||
tlsConfig: {
|
|
||||||
insecureSkipVerify: true,
|
|
||||||
},
|
|
||||||
},
|
|
||||||
{
|
|
||||||
port: 'https-self',
|
|
||||||
scheme: 'https',
|
|
||||||
interval: $._config.kubeStateMetrics.scrapeInterval,
|
|
||||||
bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token',
|
|
||||||
tlsConfig: {
|
|
||||||
insecureSkipVerify: true,
|
|
||||||
},
|
|
||||||
},
|
|
||||||
],
|
|
||||||
},
|
|
||||||
},
|
|
||||||
} +
|
|
||||||
((import 'kube-prometheus/kube-rbac-proxy/container.libsonnet') {
|
|
||||||
config+:: {
|
|
||||||
kubeRbacProxy: {
|
|
||||||
local cfg = self,
|
|
||||||
image: $._config.imageRepos.kubeRbacProxy + ':' + $._config.versions.kubeRbacProxy,
|
|
||||||
name: 'kube-rbac-proxy-main',
|
|
||||||
securePortName: 'https-main',
|
|
||||||
securePort: 8443,
|
|
||||||
secureListenAddress: ':%d' % self.securePort,
|
|
||||||
upstream: 'http://127.0.0.1:8081/',
|
|
||||||
tlsCipherSuites: $._config.tlsCipherSuites,
|
|
||||||
},
|
|
||||||
},
|
|
||||||
}).deploymentMixin +
|
|
||||||
((import 'kube-prometheus/kube-rbac-proxy/container.libsonnet') {
|
|
||||||
config+:: {
|
|
||||||
kubeRbacProxy: {
|
|
||||||
local cfg = self,
|
|
||||||
image: $._config.imageRepos.kubeRbacProxy + ':' + $._config.versions.kubeRbacProxy,
|
|
||||||
name: 'kube-rbac-proxy-self',
|
|
||||||
securePortName: 'https-self',
|
|
||||||
securePort: 9443,
|
|
||||||
secureListenAddress: ':%d' % self.securePort,
|
|
||||||
upstream: 'http://127.0.0.1:8082/',
|
|
||||||
tlsCipherSuites: $._config.tlsCipherSuites,
|
|
||||||
},
|
|
||||||
},
|
|
||||||
}).deploymentMixin,
|
|
||||||
}
|
|
|
@ -1,21 +0,0 @@
|
||||||
// imageName extracts the image name from a fully qualified image string. eg.
|
|
||||||
// quay.io/coreos/addon-resizer -> addon-resizer
|
|
||||||
// grafana/grafana -> grafana
|
|
||||||
local imageName(image) =
|
|
||||||
local parts = std.split(image, '/');
|
|
||||||
local len = std.length(parts);
|
|
||||||
if len == 3 then
|
|
||||||
# registry.com/org/image
|
|
||||||
parts[2]
|
|
||||||
else if len == 2 then
|
|
||||||
# org/image
|
|
||||||
parts[1]
|
|
||||||
else if len == 1 then
|
|
||||||
# image, ie. busybox
|
|
||||||
parts[0]
|
|
||||||
else
|
|
||||||
error 'unknown image format: ' + image;
|
|
||||||
|
|
||||||
{
|
|
||||||
imageName:: imageName,
|
|
||||||
}
|
|
|
@ -1 +0,0 @@
|
||||||
(import 'image.libsonnet')
|
|
|
@ -1,205 +0,0 @@
|
||||||
local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
|
|
||||||
|
|
||||||
{
|
|
||||||
_config+:: {
|
|
||||||
namespace: 'default',
|
|
||||||
|
|
||||||
versions+:: {
|
|
||||||
nodeExporter: 'v0.18.1',
|
|
||||||
kubeRbacProxy: 'v0.4.1',
|
|
||||||
},
|
|
||||||
|
|
||||||
imageRepos+:: {
|
|
||||||
nodeExporter: 'quay.io/prometheus/node-exporter',
|
|
||||||
kubeRbacProxy: 'quay.io/coreos/kube-rbac-proxy',
|
|
||||||
},
|
|
||||||
|
|
||||||
nodeExporter+:: {
|
|
||||||
listenAddress: '127.0.0.1',
|
|
||||||
port: 9100,
|
|
||||||
labels: {
|
|
||||||
'app.kubernetes.io/name': 'node-exporter',
|
|
||||||
'app.kubernetes.io/version': $._config.versions.nodeExporter,
|
|
||||||
},
|
|
||||||
selectorLabels: {
|
|
||||||
[labelName]: $._config.nodeExporter.labels[labelName]
|
|
||||||
for labelName in std.objectFields($._config.nodeExporter.labels)
|
|
||||||
if !std.setMember(labelName, ['app.kubernetes.io/version'])
|
|
||||||
},
|
|
||||||
},
|
|
||||||
},
|
|
||||||
|
|
||||||
nodeExporter+:: {
|
|
||||||
clusterRoleBinding:
|
|
||||||
local clusterRoleBinding = k.rbac.v1.clusterRoleBinding;
|
|
||||||
|
|
||||||
clusterRoleBinding.new() +
|
|
||||||
clusterRoleBinding.mixin.metadata.withName('node-exporter') +
|
|
||||||
clusterRoleBinding.mixin.roleRef.withApiGroup('rbac.authorization.k8s.io') +
|
|
||||||
clusterRoleBinding.mixin.roleRef.withName('node-exporter') +
|
|
||||||
clusterRoleBinding.mixin.roleRef.mixinInstance({ kind: 'ClusterRole' }) +
|
|
||||||
clusterRoleBinding.withSubjects([{ kind: 'ServiceAccount', name: 'node-exporter', namespace: $._config.namespace }]),
|
|
||||||
|
|
||||||
clusterRole:
|
|
||||||
local clusterRole = k.rbac.v1.clusterRole;
|
|
||||||
local policyRule = clusterRole.rulesType;
|
|
||||||
|
|
||||||
local authenticationRole = policyRule.new() +
|
|
||||||
policyRule.withApiGroups(['authentication.k8s.io']) +
|
|
||||||
policyRule.withResources([
|
|
||||||
'tokenreviews',
|
|
||||||
]) +
|
|
||||||
policyRule.withVerbs(['create']);
|
|
||||||
|
|
||||||
local authorizationRole = policyRule.new() +
|
|
||||||
policyRule.withApiGroups(['authorization.k8s.io']) +
|
|
||||||
policyRule.withResources([
|
|
||||||
'subjectaccessreviews',
|
|
||||||
]) +
|
|
||||||
policyRule.withVerbs(['create']);
|
|
||||||
|
|
||||||
local rules = [authenticationRole, authorizationRole];
|
|
||||||
|
|
||||||
clusterRole.new() +
|
|
||||||
clusterRole.mixin.metadata.withName('node-exporter') +
|
|
||||||
clusterRole.withRules(rules),
|
|
||||||
|
|
||||||
daemonset:
|
|
||||||
local daemonset = k.apps.v1.daemonSet;
|
|
||||||
local container = daemonset.mixin.spec.template.spec.containersType;
|
|
||||||
local volume = daemonset.mixin.spec.template.spec.volumesType;
|
|
||||||
local containerPort = container.portsType;
|
|
||||||
local containerVolumeMount = container.volumeMountsType;
|
|
||||||
local podSelector = daemonset.mixin.spec.template.spec.selectorType;
|
|
||||||
local toleration = daemonset.mixin.spec.template.spec.tolerationsType;
|
|
||||||
local containerEnv = container.envType;
|
|
||||||
|
|
||||||
local podLabels = $._config.nodeExporter.labels;
|
|
||||||
local selectorLabels = $._config.nodeExporter.selectorLabels;
|
|
||||||
|
|
||||||
local existsToleration = toleration.new() +
|
|
||||||
toleration.withOperator('Exists');
|
|
||||||
local procVolumeName = 'proc';
|
|
||||||
local procVolume = volume.fromHostPath(procVolumeName, '/proc');
|
|
||||||
local procVolumeMount = containerVolumeMount.new(procVolumeName, '/host/proc');
|
|
||||||
|
|
||||||
local sysVolumeName = 'sys';
|
|
||||||
local sysVolume = volume.fromHostPath(sysVolumeName, '/sys');
|
|
||||||
local sysVolumeMount = containerVolumeMount.new(sysVolumeName, '/host/sys');
|
|
||||||
|
|
||||||
local rootVolumeName = 'root';
|
|
||||||
local rootVolume = volume.fromHostPath(rootVolumeName, '/');
|
|
||||||
local rootVolumeMount = containerVolumeMount.new(rootVolumeName, '/host/root').
|
|
||||||
withMountPropagation('HostToContainer').
|
|
||||||
withReadOnly(true);
|
|
||||||
|
|
||||||
local nodeExporter =
|
|
||||||
container.new('node-exporter', $._config.imageRepos.nodeExporter + ':' + $._config.versions.nodeExporter) +
|
|
||||||
container.withArgs([
|
|
||||||
'--web.listen-address=' + std.join(':', [$._config.nodeExporter.listenAddress, std.toString($._config.nodeExporter.port)]),
|
|
||||||
'--path.procfs=/host/proc',
|
|
||||||
'--path.sysfs=/host/sys',
|
|
||||||
'--path.rootfs=/host/root',
|
|
||||||
'--no-collector.wifi',
|
|
||||||
'--no-collector.hwmon',
|
|
||||||
'--collector.filesystem.ignored-mount-points=^/(dev|proc|sys|var/lib/docker/.+|var/lib/kubelet/pods/.+)($|/)',
|
|
||||||
]) +
|
|
||||||
container.withVolumeMounts([procVolumeMount, sysVolumeMount, rootVolumeMount]) +
|
|
||||||
container.mixin.resources.withRequests($._config.resources['node-exporter'].requests) +
|
|
||||||
container.mixin.resources.withLimits($._config.resources['node-exporter'].limits);
|
|
||||||
|
|
||||||
local ip = containerEnv.fromFieldPath('IP', 'status.podIP');
|
|
||||||
local proxy =
|
|
||||||
container.new('kube-rbac-proxy', $._config.imageRepos.kubeRbacProxy + ':' + $._config.versions.kubeRbacProxy) +
|
|
||||||
container.withArgs([
|
|
||||||
'--logtostderr',
|
|
||||||
'--secure-listen-address=[$(IP)]:' + $._config.nodeExporter.port,
|
|
||||||
'--tls-cipher-suites=' + std.join(',', $._config.tlsCipherSuites),
|
|
||||||
'--upstream=http://127.0.0.1:' + $._config.nodeExporter.port + '/',
|
|
||||||
]) +
|
|
||||||
// Keep `hostPort` here, rather than in the node-exporter container
|
|
||||||
// because Kubernetes mandates that if you define a `hostPort` then
|
|
||||||
// `containerPort` must match. In our case, we are splitting the
|
|
||||||
// host port and container port between the two containers.
|
|
||||||
// We'll keep the port specification here so that the named port
|
|
||||||
// used by the service is tied to the proxy container. We *could*
|
|
||||||
// forgo declaring the host port, however it is important to declare
|
|
||||||
// it so that the scheduler can decide if the pod is schedulable.
|
|
||||||
container.withPorts(containerPort.new($._config.nodeExporter.port) + containerPort.withHostPort($._config.nodeExporter.port) + containerPort.withName('https')) +
|
|
||||||
container.mixin.resources.withRequests($._config.resources['kube-rbac-proxy'].requests) +
|
|
||||||
container.mixin.resources.withLimits($._config.resources['kube-rbac-proxy'].limits) +
|
|
||||||
container.withEnv([ip]);
|
|
||||||
|
|
||||||
local c = [nodeExporter, proxy];
|
|
||||||
|
|
||||||
daemonset.new() +
|
|
||||||
daemonset.mixin.metadata.withName('node-exporter') +
|
|
||||||
daemonset.mixin.metadata.withNamespace($._config.namespace) +
|
|
||||||
daemonset.mixin.metadata.withLabels(podLabels) +
|
|
||||||
daemonset.mixin.spec.selector.withMatchLabels(selectorLabels) +
|
|
||||||
daemonset.mixin.spec.template.metadata.withLabels(podLabels) +
|
|
||||||
daemonset.mixin.spec.template.spec.withTolerations([existsToleration]) +
|
|
||||||
daemonset.mixin.spec.template.spec.withNodeSelector({ 'kubernetes.io/os': 'linux' }) +
|
|
||||||
daemonset.mixin.spec.template.spec.withContainers(c) +
|
|
||||||
daemonset.mixin.spec.template.spec.withVolumes([procVolume, sysVolume, rootVolume]) +
|
|
||||||
daemonset.mixin.spec.template.spec.securityContext.withRunAsNonRoot(true) +
|
|
||||||
daemonset.mixin.spec.template.spec.securityContext.withRunAsUser(65534) +
|
|
||||||
daemonset.mixin.spec.template.spec.withServiceAccountName('node-exporter') +
|
|
||||||
daemonset.mixin.spec.template.spec.withHostPid(true) +
|
|
||||||
daemonset.mixin.spec.template.spec.withHostNetwork(true),
|
|
||||||
|
|
||||||
serviceAccount:
|
|
||||||
local serviceAccount = k.core.v1.serviceAccount;
|
|
||||||
|
|
||||||
serviceAccount.new('node-exporter') +
|
|
||||||
serviceAccount.mixin.metadata.withNamespace($._config.namespace),
|
|
||||||
|
|
||||||
serviceMonitor:
|
|
||||||
{
|
|
||||||
apiVersion: 'monitoring.coreos.com/v1',
|
|
||||||
kind: 'ServiceMonitor',
|
|
||||||
metadata: {
|
|
||||||
name: 'node-exporter',
|
|
||||||
namespace: $._config.namespace,
|
|
||||||
labels: $._config.nodeExporter.labels,
|
|
||||||
},
|
|
||||||
spec: {
|
|
||||||
jobLabel: 'app.kubernetes.io/name',
|
|
||||||
selector: {
|
|
||||||
matchLabels: $._config.nodeExporter.selectorLabels,
|
|
||||||
},
|
|
||||||
endpoints: [
|
|
||||||
{
|
|
||||||
port: 'https',
|
|
||||||
scheme: 'https',
|
|
||||||
interval: '15s',
|
|
||||||
bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token',
|
|
||||||
relabelings: [
|
|
||||||
{
|
|
||||||
action: 'replace',
|
|
||||||
regex: '(.*)',
|
|
||||||
replacement: '$1',
|
|
||||||
sourceLabels: ['__meta_kubernetes_pod_node_name'],
|
|
||||||
targetLabel: 'instance',
|
|
||||||
},
|
|
||||||
],
|
|
||||||
tlsConfig: {
|
|
||||||
insecureSkipVerify: true,
|
|
||||||
},
|
|
||||||
},
|
|
||||||
],
|
|
||||||
},
|
|
||||||
},
|
|
||||||
|
|
||||||
service:
|
|
||||||
local service = k.core.v1.service;
|
|
||||||
local servicePort = k.core.v1.service.mixin.spec.portsType;
|
|
||||||
|
|
||||||
local nodeExporterPort = servicePort.newNamed('https', $._config.nodeExporter.port, 'https');
|
|
||||||
|
|
||||||
service.new('node-exporter', $._config.nodeExporter.selectorLabels, nodeExporterPort) +
|
|
||||||
service.mixin.metadata.withNamespace($._config.namespace) +
|
|
||||||
service.mixin.metadata.withLabels($._config.nodeExporter.labels) +
|
|
||||||
service.mixin.spec.withClusterIp('None'),
|
|
||||||
},
|
|
||||||
}
|
|
|
@ -1,234 +0,0 @@
|
||||||
local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
|
|
||||||
|
|
||||||
{
|
|
||||||
_config+:: {
|
|
||||||
namespace: 'default',
|
|
||||||
|
|
||||||
versions+:: {
|
|
||||||
prometheusAdapter: 'v0.7.0',
|
|
||||||
},
|
|
||||||
|
|
||||||
imageRepos+:: {
|
|
||||||
prometheusAdapter: 'directxman12/k8s-prometheus-adapter',
|
|
||||||
},
|
|
||||||
|
|
||||||
prometheusAdapter+:: {
|
|
||||||
name: 'prometheus-adapter',
|
|
||||||
labels: { name: $._config.prometheusAdapter.name },
|
|
||||||
prometheusURL: 'http://prometheus-' + $._config.prometheus.name + '.' + $._config.namespace + '.svc.cluster.local:9090/',
|
|
||||||
config: {
|
|
||||||
resourceRules: {
|
|
||||||
cpu: {
|
|
||||||
containerQuery: 'sum(irate(container_cpu_usage_seconds_total{<<.LabelMatchers>>,container!="POD",container!="",pod!=""}[5m])) by (<<.GroupBy>>)',
|
|
||||||
nodeQuery: 'sum(1 - irate(node_cpu_seconds_total{mode="idle"}[5m]) * on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:{<<.LabelMatchers>>}) by (<<.GroupBy>>)',
|
|
||||||
resources: {
|
|
||||||
overrides: {
|
|
||||||
node: {
|
|
||||||
resource: 'node'
|
|
||||||
},
|
|
||||||
namespace: {
|
|
||||||
resource: 'namespace'
|
|
||||||
},
|
|
||||||
pod: {
|
|
||||||
resource: 'pod'
|
|
||||||
},
|
|
||||||
},
|
|
||||||
},
|
|
||||||
containerLabel: 'container'
|
|
||||||
},
|
|
||||||
memory: {
|
|
||||||
containerQuery: 'sum(container_memory_working_set_bytes{<<.LabelMatchers>>,container!="POD",container!="",pod!=""}) by (<<.GroupBy>>)',
|
|
||||||
nodeQuery: 'sum(node_memory_MemTotal_bytes{job="node-exporter",<<.LabelMatchers>>} - node_memory_MemAvailable_bytes{job="node-exporter",<<.LabelMatchers>>}) by (<<.GroupBy>>)',
|
|
||||||
resources: {
|
|
||||||
overrides: {
|
|
||||||
instance: {
|
|
||||||
resource: 'node'
|
|
||||||
},
|
|
||||||
namespace: {
|
|
||||||
resource: 'namespace'
|
|
||||||
},
|
|
||||||
pod: {
|
|
||||||
resource: 'pod'
|
|
||||||
},
|
|
||||||
},
|
|
||||||
},
|
|
||||||
containerLabel: 'container'
|
|
||||||
},
|
|
||||||
window: '5m',
|
|
||||||
},
|
|
||||||
}
|
|
||||||
},
|
|
||||||
},
|
|
||||||
|
|
||||||
prometheusAdapter+:: {
|
|
||||||
apiService:
|
|
||||||
{
|
|
||||||
apiVersion: 'apiregistration.k8s.io/v1',
|
|
||||||
kind: 'APIService',
|
|
||||||
metadata: {
|
|
||||||
name: 'v1beta1.metrics.k8s.io',
|
|
||||||
},
|
|
||||||
spec: {
|
|
||||||
service: {
|
|
||||||
name: $.prometheusAdapter.service.metadata.name,
|
|
||||||
namespace: $._config.namespace,
|
|
||||||
},
|
|
||||||
group: 'metrics.k8s.io',
|
|
||||||
version: 'v1beta1',
|
|
||||||
insecureSkipTLSVerify: true,
|
|
||||||
groupPriorityMinimum: 100,
|
|
||||||
versionPriority: 100,
|
|
||||||
},
|
|
||||||
},
|
|
||||||
|
|
||||||
configMap:
|
|
||||||
local configmap = k.core.v1.configMap;
|
|
||||||
configmap.new('adapter-config', { 'config.yaml': std.manifestYamlDoc($._config.prometheusAdapter.config) }) +
|
|
||||||
|
|
||||||
configmap.mixin.metadata.withNamespace($._config.namespace),
|
|
||||||
|
|
||||||
service:
|
|
||||||
local service = k.core.v1.service;
|
|
||||||
local servicePort = k.core.v1.service.mixin.spec.portsType;
|
|
||||||
|
|
||||||
service.new(
|
|
||||||
$._config.prometheusAdapter.name,
|
|
||||||
$._config.prometheusAdapter.labels,
|
|
||||||
servicePort.newNamed('https', 443, 6443),
|
|
||||||
) +
|
|
||||||
service.mixin.metadata.withNamespace($._config.namespace) +
|
|
||||||
service.mixin.metadata.withLabels($._config.prometheusAdapter.labels),
|
|
||||||
|
|
||||||
deployment:
|
|
||||||
local deployment = k.apps.v1.deployment;
|
|
||||||
local volume = deployment.mixin.spec.template.spec.volumesType;
|
|
||||||
local container = deployment.mixin.spec.template.spec.containersType;
|
|
||||||
local containerVolumeMount = container.volumeMountsType;
|
|
||||||
|
|
||||||
local c =
|
|
||||||
container.new($._config.prometheusAdapter.name, $._config.imageRepos.prometheusAdapter + ':' + $._config.versions.prometheusAdapter) +
|
|
||||||
container.withArgs([
|
|
||||||
'--cert-dir=/var/run/serving-cert',
|
|
||||||
'--config=/etc/adapter/config.yaml',
|
|
||||||
'--logtostderr=true',
|
|
||||||
'--metrics-relist-interval=1m',
|
|
||||||
'--prometheus-url=' + $._config.prometheusAdapter.prometheusURL,
|
|
||||||
'--secure-port=6443',
|
|
||||||
]) +
|
|
||||||
container.withPorts([{ containerPort: 6443 }]) +
|
|
||||||
container.withVolumeMounts([
|
|
||||||
containerVolumeMount.new('tmpfs', '/tmp'),
|
|
||||||
containerVolumeMount.new('volume-serving-cert', '/var/run/serving-cert'),
|
|
||||||
containerVolumeMount.new('config', '/etc/adapter'),
|
|
||||||
],);
|
|
||||||
|
|
||||||
deployment.new($._config.prometheusAdapter.name, 1, c, $._config.prometheusAdapter.labels) +
|
|
||||||
deployment.mixin.metadata.withNamespace($._config.namespace) +
|
|
||||||
deployment.mixin.spec.selector.withMatchLabels($._config.prometheusAdapter.labels) +
|
|
||||||
deployment.mixin.spec.template.spec.withServiceAccountName($.prometheusAdapter.serviceAccount.metadata.name) +
|
|
||||||
deployment.mixin.spec.template.spec.withNodeSelector({ 'kubernetes.io/os': 'linux' }) +
|
|
||||||
deployment.mixin.spec.strategy.rollingUpdate.withMaxSurge(1) +
|
|
||||||
deployment.mixin.spec.strategy.rollingUpdate.withMaxUnavailable(0) +
|
|
||||||
deployment.mixin.spec.template.spec.withVolumes([
|
|
||||||
volume.fromEmptyDir(name='tmpfs'),
|
|
||||||
volume.fromEmptyDir(name='volume-serving-cert'),
|
|
||||||
{ name: 'config', configMap: { name: 'adapter-config' } },
|
|
||||||
]),
|
|
||||||
|
|
||||||
serviceAccount:
|
|
||||||
local serviceAccount = k.core.v1.serviceAccount;
|
|
||||||
|
|
||||||
serviceAccount.new($._config.prometheusAdapter.name) +
|
|
||||||
serviceAccount.mixin.metadata.withNamespace($._config.namespace),
|
|
||||||
|
|
||||||
clusterRole:
|
|
||||||
local clusterRole = k.rbac.v1.clusterRole;
|
|
||||||
local policyRule = clusterRole.rulesType;
|
|
||||||
|
|
||||||
local rules =
|
|
||||||
policyRule.new() +
|
|
||||||
policyRule.withApiGroups(['']) +
|
|
||||||
policyRule.withResources(['nodes', 'namespaces', 'pods', 'services']) +
|
|
||||||
policyRule.withVerbs(['get', 'list', 'watch']);
|
|
||||||
|
|
||||||
clusterRole.new() +
|
|
||||||
clusterRole.mixin.metadata.withName($._config.prometheusAdapter.name) +
|
|
||||||
clusterRole.withRules(rules),
|
|
||||||
|
|
||||||
clusterRoleBinding:
|
|
||||||
local clusterRoleBinding = k.rbac.v1.clusterRoleBinding;
|
|
||||||
|
|
||||||
clusterRoleBinding.new() +
|
|
||||||
clusterRoleBinding.mixin.metadata.withName($._config.prometheusAdapter.name) +
|
|
||||||
clusterRoleBinding.mixin.roleRef.withApiGroup('rbac.authorization.k8s.io') +
|
|
||||||
clusterRoleBinding.mixin.roleRef.withName($.prometheusAdapter.clusterRole.metadata.name) +
|
|
||||||
clusterRoleBinding.mixin.roleRef.mixinInstance({ kind: 'ClusterRole' }) +
|
|
||||||
clusterRoleBinding.withSubjects([{
|
|
||||||
kind: 'ServiceAccount',
|
|
||||||
name: $.prometheusAdapter.serviceAccount.metadata.name,
|
|
||||||
namespace: $._config.namespace,
|
|
||||||
}]),
|
|
||||||
|
|
||||||
clusterRoleBindingDelegator:
|
|
||||||
local clusterRoleBinding = k.rbac.v1.clusterRoleBinding;
|
|
||||||
|
|
||||||
clusterRoleBinding.new() +
|
|
||||||
clusterRoleBinding.mixin.metadata.withName('resource-metrics:system:auth-delegator') +
|
|
||||||
clusterRoleBinding.mixin.roleRef.withApiGroup('rbac.authorization.k8s.io') +
|
|
||||||
clusterRoleBinding.mixin.roleRef.withName('system:auth-delegator') +
|
|
||||||
clusterRoleBinding.mixin.roleRef.mixinInstance({ kind: 'ClusterRole' }) +
|
|
||||||
clusterRoleBinding.withSubjects([{
|
|
||||||
kind: 'ServiceAccount',
|
|
||||||
name: $.prometheusAdapter.serviceAccount.metadata.name,
|
|
||||||
namespace: $._config.namespace,
|
|
||||||
}]),
|
|
||||||
|
|
||||||
clusterRoleServerResources:
|
|
||||||
local clusterRole = k.rbac.v1.clusterRole;
|
|
||||||
local policyRule = clusterRole.rulesType;
|
|
||||||
|
|
||||||
local rules =
|
|
||||||
policyRule.new() +
|
|
||||||
policyRule.withApiGroups(['metrics.k8s.io']) +
|
|
||||||
policyRule.withResources(['*']) +
|
|
||||||
policyRule.withVerbs(['*']);
|
|
||||||
|
|
||||||
clusterRole.new() +
|
|
||||||
clusterRole.mixin.metadata.withName('resource-metrics-server-resources') +
|
|
||||||
clusterRole.withRules(rules),
|
|
||||||
|
|
||||||
clusterRoleAggregatedMetricsReader:
|
|
||||||
local clusterRole = k.rbac.v1.clusterRole;
|
|
||||||
local policyRule = clusterRole.rulesType;
|
|
||||||
|
|
||||||
local rules =
|
|
||||||
policyRule.new() +
|
|
||||||
policyRule.withApiGroups(['metrics.k8s.io']) +
|
|
||||||
policyRule.withResources(['pods', 'nodes']) +
|
|
||||||
policyRule.withVerbs(['get', 'list', 'watch']);
|
|
||||||
|
|
||||||
clusterRole.new() +
|
|
||||||
clusterRole.mixin.metadata.withName('system:aggregated-metrics-reader') +
|
|
||||||
clusterRole.mixin.metadata.withLabels({
|
|
||||||
"rbac.authorization.k8s.io/aggregate-to-admin": "true",
|
|
||||||
"rbac.authorization.k8s.io/aggregate-to-edit": "true",
|
|
||||||
"rbac.authorization.k8s.io/aggregate-to-view": "true",
|
|
||||||
}) +
|
|
||||||
clusterRole.withRules(rules),
|
|
||||||
|
|
||||||
roleBindingAuthReader:
|
|
||||||
local roleBinding = k.rbac.v1.roleBinding;
|
|
||||||
|
|
||||||
roleBinding.new() +
|
|
||||||
roleBinding.mixin.metadata.withName('resource-metrics-auth-reader') +
|
|
||||||
roleBinding.mixin.metadata.withNamespace('kube-system') +
|
|
||||||
roleBinding.mixin.roleRef.withApiGroup('rbac.authorization.k8s.io') +
|
|
||||||
roleBinding.mixin.roleRef.withName('extension-apiserver-authentication-reader') +
|
|
||||||
roleBinding.mixin.roleRef.mixinInstance({ kind: 'Role' }) +
|
|
||||||
roleBinding.withSubjects([{
|
|
||||||
kind: 'ServiceAccount',
|
|
||||||
name: $.prometheusAdapter.serviceAccount.metadata.name,
|
|
||||||
namespace: $._config.namespace,
|
|
||||||
}]),
|
|
||||||
},
|
|
||||||
}
|
|
|
@ -1,476 +0,0 @@
|
||||||
local k3 = import 'ksonnet/ksonnet.beta.3/k.libsonnet';
|
|
||||||
local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
|
|
||||||
|
|
||||||
{
|
|
||||||
_config+:: {
|
|
||||||
namespace: 'default',
|
|
||||||
|
|
||||||
versions+:: {
|
|
||||||
prometheus: 'v2.20.0',
|
|
||||||
},
|
|
||||||
|
|
||||||
imageRepos+:: {
|
|
||||||
prometheus: 'quay.io/prometheus/prometheus',
|
|
||||||
},
|
|
||||||
|
|
||||||
alertmanager+:: {
|
|
||||||
name: 'main',
|
|
||||||
},
|
|
||||||
|
|
||||||
prometheus+:: {
|
|
||||||
name: 'k8s',
|
|
||||||
replicas: 2,
|
|
||||||
rules: {},
|
|
||||||
namespaces: ['default', 'kube-system', $._config.namespace],
|
|
||||||
},
|
|
||||||
},
|
|
||||||
|
|
||||||
prometheus+:: {
|
|
||||||
local p = self,
|
|
||||||
|
|
||||||
name:: $._config.prometheus.name,
|
|
||||||
namespace:: $._config.namespace,
|
|
||||||
roleBindingNamespaces:: $._config.prometheus.namespaces,
|
|
||||||
replicas:: $._config.prometheus.replicas,
|
|
||||||
prometheusRules:: $._config.prometheus.rules,
|
|
||||||
alertmanagerName:: $.alertmanager.service.metadata.name,
|
|
||||||
|
|
||||||
serviceAccount:
|
|
||||||
local serviceAccount = k.core.v1.serviceAccount;
|
|
||||||
|
|
||||||
serviceAccount.new('prometheus-' + p.name) +
|
|
||||||
serviceAccount.mixin.metadata.withNamespace(p.namespace),
|
|
||||||
service:
|
|
||||||
local service = k.core.v1.service;
|
|
||||||
local servicePort = k.core.v1.service.mixin.spec.portsType;
|
|
||||||
|
|
||||||
local prometheusPort = servicePort.newNamed('web', 9090, 'web');
|
|
||||||
|
|
||||||
service.new('prometheus-' + p.name, { app: 'prometheus', prometheus: p.name }, prometheusPort) +
|
|
||||||
service.mixin.spec.withSessionAffinity('ClientIP') +
|
|
||||||
service.mixin.metadata.withNamespace(p.namespace) +
|
|
||||||
service.mixin.metadata.withLabels({ prometheus: p.name }),
|
|
||||||
|
|
||||||
rules:
|
|
||||||
{
|
|
||||||
apiVersion: 'monitoring.coreos.com/v1',
|
|
||||||
kind: 'PrometheusRule',
|
|
||||||
metadata: {
|
|
||||||
labels: {
|
|
||||||
prometheus: p.name,
|
|
||||||
role: 'alert-rules',
|
|
||||||
},
|
|
||||||
name: 'prometheus-' + p.name + '-rules',
|
|
||||||
namespace: p.namespace,
|
|
||||||
},
|
|
||||||
spec: {
|
|
||||||
groups: p.prometheusRules.groups,
|
|
||||||
},
|
|
||||||
},
|
|
||||||
|
|
||||||
roleBindingSpecificNamespaces:
|
|
||||||
local roleBinding = k.rbac.v1.roleBinding;
|
|
||||||
|
|
||||||
local newSpecificRoleBinding(namespace) =
|
|
||||||
roleBinding.new() +
|
|
||||||
roleBinding.mixin.metadata.withName('prometheus-' + p.name) +
|
|
||||||
roleBinding.mixin.metadata.withNamespace(namespace) +
|
|
||||||
roleBinding.mixin.roleRef.withApiGroup('rbac.authorization.k8s.io') +
|
|
||||||
roleBinding.mixin.roleRef.withName('prometheus-' + p.name) +
|
|
||||||
roleBinding.mixin.roleRef.mixinInstance({ kind: 'Role' }) +
|
|
||||||
roleBinding.withSubjects([{ kind: 'ServiceAccount', name: 'prometheus-' + p.name, namespace: p.namespace }]);
|
|
||||||
|
|
||||||
local roleBindingList = k3.rbac.v1.roleBindingList;
|
|
||||||
roleBindingList.new([newSpecificRoleBinding(x) for x in p.roleBindingNamespaces]),
|
|
||||||
clusterRole:
|
|
||||||
local clusterRole = k.rbac.v1.clusterRole;
|
|
||||||
local policyRule = clusterRole.rulesType;
|
|
||||||
|
|
||||||
local nodeMetricsRule = policyRule.new() +
|
|
||||||
policyRule.withApiGroups(['']) +
|
|
||||||
policyRule.withResources(['nodes/metrics']) +
|
|
||||||
policyRule.withVerbs(['get']);
|
|
||||||
|
|
||||||
local metricsRule = policyRule.new() +
|
|
||||||
policyRule.withNonResourceUrls('/metrics') +
|
|
||||||
policyRule.withVerbs(['get']);
|
|
||||||
|
|
||||||
local rules = [nodeMetricsRule, metricsRule];
|
|
||||||
|
|
||||||
clusterRole.new() +
|
|
||||||
clusterRole.mixin.metadata.withName('prometheus-' + p.name) +
|
|
||||||
clusterRole.withRules(rules),
|
|
||||||
roleConfig:
|
|
||||||
local role = k.rbac.v1.role;
|
|
||||||
local policyRule = role.rulesType;
|
|
||||||
|
|
||||||
local configmapRule = policyRule.new() +
|
|
||||||
policyRule.withApiGroups(['']) +
|
|
||||||
policyRule.withResources([
|
|
||||||
'configmaps',
|
|
||||||
]) +
|
|
||||||
policyRule.withVerbs(['get']);
|
|
||||||
|
|
||||||
role.new() +
|
|
||||||
role.mixin.metadata.withName('prometheus-' + p.name + '-config') +
|
|
||||||
role.mixin.metadata.withNamespace(p.namespace) +
|
|
||||||
role.withRules(configmapRule),
|
|
||||||
roleBindingConfig:
|
|
||||||
local roleBinding = k.rbac.v1.roleBinding;
|
|
||||||
|
|
||||||
roleBinding.new() +
|
|
||||||
roleBinding.mixin.metadata.withName('prometheus-' + p.name + '-config') +
|
|
||||||
roleBinding.mixin.metadata.withNamespace(p.namespace) +
|
|
||||||
roleBinding.mixin.roleRef.withApiGroup('rbac.authorization.k8s.io') +
|
|
||||||
roleBinding.mixin.roleRef.withName('prometheus-' + p.name + '-config') +
|
|
||||||
roleBinding.mixin.roleRef.mixinInstance({ kind: 'Role' }) +
|
|
||||||
roleBinding.withSubjects([{ kind: 'ServiceAccount', name: 'prometheus-' + p.name, namespace: p.namespace }]),
|
|
||||||
clusterRoleBinding:
|
|
||||||
local clusterRoleBinding = k.rbac.v1.clusterRoleBinding;
|
|
||||||
|
|
||||||
clusterRoleBinding.new() +
|
|
||||||
clusterRoleBinding.mixin.metadata.withName('prometheus-' + p.name) +
|
|
||||||
clusterRoleBinding.mixin.roleRef.withApiGroup('rbac.authorization.k8s.io') +
|
|
||||||
clusterRoleBinding.mixin.roleRef.withName('prometheus-' + p.name) +
|
|
||||||
clusterRoleBinding.mixin.roleRef.mixinInstance({ kind: 'ClusterRole' }) +
|
|
||||||
clusterRoleBinding.withSubjects([{ kind: 'ServiceAccount', name: 'prometheus-' + p.name, namespace: p.namespace }]),
|
|
||||||
roleSpecificNamespaces:
|
|
||||||
local role = k.rbac.v1.role;
|
|
||||||
local policyRule = role.rulesType;
|
|
||||||
local coreRule = policyRule.new() +
|
|
||||||
policyRule.withApiGroups(['']) +
|
|
||||||
policyRule.withResources([
|
|
||||||
'services',
|
|
||||||
'endpoints',
|
|
||||||
'pods',
|
|
||||||
]) +
|
|
||||||
policyRule.withVerbs(['get', 'list', 'watch']);
|
|
||||||
|
|
||||||
local newSpecificRole(namespace) =
|
|
||||||
role.new() +
|
|
||||||
role.mixin.metadata.withName('prometheus-' + p.name) +
|
|
||||||
role.mixin.metadata.withNamespace(namespace) +
|
|
||||||
role.withRules(coreRule);
|
|
||||||
|
|
||||||
local roleList = k3.rbac.v1.roleList;
|
|
||||||
roleList.new([newSpecificRole(x) for x in p.roleBindingNamespaces]),
|
|
||||||
prometheus:
|
|
||||||
local statefulSet = k.apps.v1.statefulSet;
|
|
||||||
local container = statefulSet.mixin.spec.template.spec.containersType;
|
|
||||||
local resourceRequirements = container.mixin.resourcesType;
|
|
||||||
local selector = statefulSet.mixin.spec.selectorType;
|
|
||||||
|
|
||||||
|
|
||||||
local resources =
|
|
||||||
resourceRequirements.new() +
|
|
||||||
resourceRequirements.withRequests({ memory: '400Mi' });
|
|
||||||
|
|
||||||
{
|
|
||||||
apiVersion: 'monitoring.coreos.com/v1',
|
|
||||||
kind: 'Prometheus',
|
|
||||||
metadata: {
|
|
||||||
name: p.name,
|
|
||||||
namespace: p.namespace,
|
|
||||||
labels: {
|
|
||||||
prometheus: p.name,
|
|
||||||
},
|
|
||||||
},
|
|
||||||
spec: {
|
|
||||||
replicas: p.replicas,
|
|
||||||
version: $._config.versions.prometheus,
|
|
||||||
image: $._config.imageRepos.prometheus + ':' + $._config.versions.prometheus,
|
|
||||||
serviceAccountName: 'prometheus-' + p.name,
|
|
||||||
serviceMonitorSelector: {},
|
|
||||||
podMonitorSelector: {},
|
|
||||||
serviceMonitorNamespaceSelector: {},
|
|
||||||
podMonitorNamespaceSelector: {},
|
|
||||||
nodeSelector: { 'kubernetes.io/os': 'linux' },
|
|
||||||
ruleSelector: selector.withMatchLabels({
|
|
||||||
role: 'alert-rules',
|
|
||||||
prometheus: p.name,
|
|
||||||
}),
|
|
||||||
resources: resources,
|
|
||||||
alerting: {
|
|
||||||
alertmanagers: [
|
|
||||||
{
|
|
||||||
namespace: p.namespace,
|
|
||||||
name: p.alertmanagerName,
|
|
||||||
port: 'web',
|
|
||||||
},
|
|
||||||
],
|
|
||||||
},
|
|
||||||
securityContext: {
|
|
||||||
runAsUser: 1000,
|
|
||||||
runAsNonRoot: true,
|
|
||||||
fsGroup: 2000,
|
|
||||||
},
|
|
||||||
},
|
|
||||||
},
|
|
||||||
serviceMonitor:
|
|
||||||
{
|
|
||||||
apiVersion: 'monitoring.coreos.com/v1',
|
|
||||||
kind: 'ServiceMonitor',
|
|
||||||
metadata: {
|
|
||||||
name: 'prometheus',
|
|
||||||
namespace: p.namespace,
|
|
||||||
labels: {
|
|
||||||
'k8s-app': 'prometheus',
|
|
||||||
},
|
|
||||||
},
|
|
||||||
spec: {
|
|
||||||
selector: {
|
|
||||||
matchLabels: {
|
|
||||||
prometheus: p.name,
|
|
||||||
},
|
|
||||||
},
|
|
||||||
endpoints: [
|
|
||||||
{
|
|
||||||
port: 'web',
|
|
||||||
interval: '30s',
|
|
||||||
},
|
|
||||||
],
|
|
||||||
},
|
|
||||||
},
|
|
||||||
serviceMonitorKubeScheduler:
|
|
||||||
{
|
|
||||||
apiVersion: 'monitoring.coreos.com/v1',
|
|
||||||
kind: 'ServiceMonitor',
|
|
||||||
metadata: {
|
|
||||||
name: 'kube-scheduler',
|
|
||||||
namespace: p.namespace,
|
|
||||||
labels: {
|
|
||||||
'k8s-app': 'kube-scheduler',
|
|
||||||
},
|
|
||||||
},
|
|
||||||
spec: {
|
|
||||||
jobLabel: 'k8s-app',
|
|
||||||
endpoints: [
|
|
||||||
{
|
|
||||||
port: 'https-metrics',
|
|
||||||
interval: '30s',
|
|
||||||
scheme: 'https',
|
|
||||||
bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token',
|
|
||||||
tlsConfig: {
|
|
||||||
insecureSkipVerify: true,
|
|
||||||
},
|
|
||||||
},
|
|
||||||
],
|
|
||||||
selector: {
|
|
||||||
matchLabels: {
|
|
||||||
'k8s-app': 'kube-scheduler',
|
|
||||||
},
|
|
||||||
},
|
|
||||||
namespaceSelector: {
|
|
||||||
matchNames: [
|
|
||||||
'kube-system',
|
|
||||||
],
|
|
||||||
},
|
|
||||||
},
|
|
||||||
},
|
|
||||||
serviceMonitorKubelet:
|
|
||||||
{
|
|
||||||
apiVersion: 'monitoring.coreos.com/v1',
|
|
||||||
kind: 'ServiceMonitor',
|
|
||||||
metadata: {
|
|
||||||
name: 'kubelet',
|
|
||||||
namespace: p.namespace,
|
|
||||||
labels: {
|
|
||||||
'k8s-app': 'kubelet',
|
|
||||||
},
|
|
||||||
},
|
|
||||||
spec: {
|
|
||||||
jobLabel: 'k8s-app',
|
|
||||||
endpoints: [
|
|
||||||
{
|
|
||||||
port: 'https-metrics',
|
|
||||||
scheme: 'https',
|
|
||||||
interval: '30s',
|
|
||||||
honorLabels: true,
|
|
||||||
tlsConfig: {
|
|
||||||
insecureSkipVerify: true,
|
|
||||||
},
|
|
||||||
bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token',
|
|
||||||
metricRelabelings: (import 'kube-prometheus/dropping-deprecated-metrics-relabelings.libsonnet'),
|
|
||||||
relabelings: [
|
|
||||||
{
|
|
||||||
sourceLabels: ['__metrics_path__'],
|
|
||||||
targetLabel: 'metrics_path',
|
|
||||||
},
|
|
||||||
],
|
|
||||||
},
|
|
||||||
{
|
|
||||||
port: 'https-metrics',
|
|
||||||
scheme: 'https',
|
|
||||||
path: '/metrics/cadvisor',
|
|
||||||
interval: '30s',
|
|
||||||
honorLabels: true,
|
|
||||||
tlsConfig: {
|
|
||||||
insecureSkipVerify: true,
|
|
||||||
},
|
|
||||||
bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token',
|
|
||||||
relabelings: [
|
|
||||||
{
|
|
||||||
sourceLabels: ['__metrics_path__'],
|
|
||||||
targetLabel: 'metrics_path',
|
|
||||||
},
|
|
||||||
],
|
|
||||||
metricRelabelings: [
|
|
||||||
// Drop a bunch of metrics which are disabled but still sent, see
|
|
||||||
// https://github.com/google/cadvisor/issues/1925.
|
|
||||||
{
|
|
||||||
sourceLabels: ['__name__'],
|
|
||||||
regex: 'container_(network_tcp_usage_total|network_udp_usage_total|tasks_state|cpu_load_average_10s)',
|
|
||||||
action: 'drop',
|
|
||||||
},
|
|
||||||
],
|
|
||||||
},
|
|
||||||
],
|
|
||||||
selector: {
|
|
||||||
matchLabels: {
|
|
||||||
'k8s-app': 'kubelet',
|
|
||||||
},
|
|
||||||
},
|
|
||||||
namespaceSelector: {
|
|
||||||
matchNames: [
|
|
||||||
'kube-system',
|
|
||||||
],
|
|
||||||
},
|
|
||||||
},
|
|
||||||
},
|
|
||||||
serviceMonitorKubeControllerManager:
|
|
||||||
{
|
|
||||||
apiVersion: 'monitoring.coreos.com/v1',
|
|
||||||
kind: 'ServiceMonitor',
|
|
||||||
metadata: {
|
|
||||||
name: 'kube-controller-manager',
|
|
||||||
namespace: p.namespace,
|
|
||||||
labels: {
|
|
||||||
'k8s-app': 'kube-controller-manager',
|
|
||||||
},
|
|
||||||
},
|
|
||||||
spec: {
|
|
||||||
jobLabel: 'k8s-app',
|
|
||||||
endpoints: [
|
|
||||||
{
|
|
||||||
port: 'https-metrics',
|
|
||||||
interval: '30s',
|
|
||||||
scheme: 'https',
|
|
||||||
bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token',
|
|
||||||
tlsConfig: {
|
|
||||||
insecureSkipVerify: true,
|
|
||||||
},
|
|
||||||
metricRelabelings: (import 'kube-prometheus/dropping-deprecated-metrics-relabelings.libsonnet') + [
|
|
||||||
{
|
|
||||||
sourceLabels: ['__name__'],
|
|
||||||
regex: 'etcd_(debugging|disk|request|server).*',
|
|
||||||
action: 'drop',
|
|
||||||
},
|
|
||||||
],
|
|
||||||
},
|
|
||||||
],
|
|
||||||
selector: {
|
|
||||||
matchLabels: {
|
|
||||||
'k8s-app': 'kube-controller-manager',
|
|
||||||
},
|
|
||||||
},
|
|
||||||
namespaceSelector: {
|
|
||||||
matchNames: [
|
|
||||||
'kube-system',
|
|
||||||
],
|
|
||||||
},
|
|
||||||
},
|
|
||||||
},
|
|
||||||
serviceMonitorApiserver:
|
|
||||||
{
|
|
||||||
apiVersion: 'monitoring.coreos.com/v1',
|
|
||||||
kind: 'ServiceMonitor',
|
|
||||||
metadata: {
|
|
||||||
name: 'kube-apiserver',
|
|
||||||
namespace: p.namespace,
|
|
||||||
labels: {
|
|
||||||
'k8s-app': 'apiserver',
|
|
||||||
},
|
|
||||||
},
|
|
||||||
spec: {
|
|
||||||
jobLabel: 'component',
|
|
||||||
selector: {
|
|
||||||
matchLabels: {
|
|
||||||
component: 'apiserver',
|
|
||||||
provider: 'kubernetes',
|
|
||||||
},
|
|
||||||
},
|
|
||||||
namespaceSelector: {
|
|
||||||
matchNames: [
|
|
||||||
'default',
|
|
||||||
],
|
|
||||||
},
|
|
||||||
endpoints: [
|
|
||||||
{
|
|
||||||
port: 'https',
|
|
||||||
interval: '30s',
|
|
||||||
scheme: 'https',
|
|
||||||
tlsConfig: {
|
|
||||||
caFile: '/var/run/secrets/kubernetes.io/serviceaccount/ca.crt',
|
|
||||||
serverName: 'kubernetes',
|
|
||||||
},
|
|
||||||
bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token',
|
|
||||||
metricRelabelings: (import 'kube-prometheus/dropping-deprecated-metrics-relabelings.libsonnet') + [
|
|
||||||
{
|
|
||||||
sourceLabels: ['__name__'],
|
|
||||||
regex: 'etcd_(debugging|disk|server).*',
|
|
||||||
action: 'drop',
|
|
||||||
},
|
|
||||||
{
|
|
||||||
sourceLabels: ['__name__'],
|
|
||||||
regex: 'apiserver_admission_controller_admission_latencies_seconds_.*',
|
|
||||||
action: 'drop',
|
|
||||||
},
|
|
||||||
{
|
|
||||||
sourceLabels: ['__name__'],
|
|
||||||
regex: 'apiserver_admission_step_admission_latencies_seconds_.*',
|
|
||||||
action: 'drop',
|
|
||||||
},
|
|
||||||
{
|
|
||||||
sourceLabels: ['__name__', 'le'],
|
|
||||||
regex: 'apiserver_request_duration_seconds_bucket;(0.15|0.25|0.3|0.35|0.4|0.45|0.6|0.7|0.8|0.9|1.25|1.5|1.75|2.5|3|3.5|4.5|6|7|8|9|15|25|30|50)',
|
|
||||||
action: 'drop',
|
|
||||||
},
|
|
||||||
],
|
|
||||||
},
|
|
||||||
],
|
|
||||||
},
|
|
||||||
},
|
|
||||||
serviceMonitorCoreDNS:
|
|
||||||
{
|
|
||||||
apiVersion: 'monitoring.coreos.com/v1',
|
|
||||||
kind: 'ServiceMonitor',
|
|
||||||
metadata: {
|
|
||||||
name: 'coredns',
|
|
||||||
namespace: p.namespace,
|
|
||||||
labels: {
|
|
||||||
'k8s-app': 'coredns',
|
|
||||||
},
|
|
||||||
},
|
|
||||||
spec: {
|
|
||||||
jobLabel: 'k8s-app',
|
|
||||||
selector: {
|
|
||||||
matchLabels: {
|
|
||||||
'k8s-app': 'kube-dns',
|
|
||||||
},
|
|
||||||
},
|
|
||||||
namespaceSelector: {
|
|
||||||
matchNames: [
|
|
||||||
'kube-system',
|
|
||||||
],
|
|
||||||
},
|
|
||||||
endpoints: [
|
|
||||||
{
|
|
||||||
port: 'metrics',
|
|
||||||
interval: '15s',
|
|
||||||
bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token',
|
|
||||||
},
|
|
||||||
],
|
|
||||||
},
|
|
||||||
},
|
|
||||||
},
|
|
||||||
}
|
|
|
@ -1,19 +0,0 @@
|
||||||
{
|
|
||||||
prometheusRules+:: {
|
|
||||||
groups+: [
|
|
||||||
{
|
|
||||||
name: 'kube-prometheus-general.rules',
|
|
||||||
rules: [
|
|
||||||
{
|
|
||||||
expr: 'count without(instance, pod, node) (up == 1)',
|
|
||||||
record: 'count:up1',
|
|
||||||
},
|
|
||||||
{
|
|
||||||
expr: 'count without(instance, pod, node) (up == 0)',
|
|
||||||
record: 'count:up0',
|
|
||||||
},
|
|
||||||
],
|
|
||||||
},
|
|
||||||
],
|
|
||||||
},
|
|
||||||
}
|
|
|
@ -1,35 +0,0 @@
|
||||||
{
|
|
||||||
prometheusRules+:: {
|
|
||||||
groups+: [
|
|
||||||
{
|
|
||||||
name: 'kube-prometheus-node-recording.rules',
|
|
||||||
rules: [
|
|
||||||
{
|
|
||||||
expr: 'sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[3m])) BY (instance)',
|
|
||||||
record: 'instance:node_cpu:rate:sum',
|
|
||||||
},
|
|
||||||
{
|
|
||||||
expr: 'sum(rate(node_network_receive_bytes_total[3m])) BY (instance)',
|
|
||||||
record: 'instance:node_network_receive_bytes:rate:sum',
|
|
||||||
},
|
|
||||||
{
|
|
||||||
expr: 'sum(rate(node_network_transmit_bytes_total[3m])) BY (instance)',
|
|
||||||
record: 'instance:node_network_transmit_bytes:rate:sum',
|
|
||||||
},
|
|
||||||
{
|
|
||||||
expr: 'sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[5m])) WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu_seconds_total) BY (instance, cpu)) BY (instance)',
|
|
||||||
record: 'instance:node_cpu:ratio',
|
|
||||||
},
|
|
||||||
{
|
|
||||||
expr: 'sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[5m]))',
|
|
||||||
record: 'cluster:node_cpu:sum_rate5m',
|
|
||||||
},
|
|
||||||
{
|
|
||||||
// Divide the cluster-wide CPU rate recorded by the preceding rule
// ('cluster:node_cpu:sum_rate5m') by the number of (instance, cpu) pairs.
// The previously referenced series 'cluster:node_cpu_seconds_total:rate5m'
// is not recorded by any rule in this group, so the ratio was always empty.
expr: 'cluster:node_cpu:sum_rate5m / count(sum(node_cpu_seconds_total) BY (instance, cpu))',
|
|
||||||
record: 'cluster:node_cpu:ratio',
|
|
||||||
},
|
|
||||||
],
|
|
||||||
},
|
|
||||||
],
|
|
||||||
},
|
|
||||||
}
|
|
|
@ -1,2 +0,0 @@
|
||||||
(import 'node-rules.libsonnet') +
|
|
||||||
(import 'general.libsonnet')
|
|
|
@ -1,2 +0,0 @@
|
||||||
vendor/
|
|
||||||
jsonnetfile.lock.json
|
|
File diff suppressed because one or more lines are too long
|
@ -1,14 +0,0 @@
|
||||||
{
|
|
||||||
"dependencies": [
|
|
||||||
{
|
|
||||||
"name": "ksonnet",
|
|
||||||
"source": {
|
|
||||||
"git": {
|
|
||||||
"remote": "https://github.com/ksonnet/ksonnet-lib",
|
|
||||||
"subdir": ""
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"version": "master"
|
|
||||||
}
|
|
||||||
]
|
|
||||||
}
|
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
|
@ -1,204 +0,0 @@
|
||||||
local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
|
|
||||||
|
|
||||||
{
|
|
||||||
_config+:: {
|
|
||||||
namespace: 'default',
|
|
||||||
|
|
||||||
prometheusOperator+:: {
|
|
||||||
deploymentSelectorLabels: {
|
|
||||||
'app.kubernetes.io/name': 'prometheus-operator',
|
|
||||||
'app.kubernetes.io/component': 'controller',
|
|
||||||
},
|
|
||||||
commonLabels:
|
|
||||||
$._config.prometheusOperator.deploymentSelectorLabels
|
|
||||||
{ 'app.kubernetes.io/version': $._config.versions.prometheusOperator },
|
|
||||||
},
|
|
||||||
|
|
||||||
versions+:: {
|
|
||||||
prometheusOperator: 'v0.40.0',
|
|
||||||
prometheusConfigReloader: self.prometheusOperator,
|
|
||||||
configmapReloader: 'v0.3.0',
|
|
||||||
},
|
|
||||||
|
|
||||||
imageRepos+:: {
|
|
||||||
prometheusOperator: 'quay.io/coreos/prometheus-operator',
|
|
||||||
configmapReloader: 'jimmidyson/configmap-reload',
|
|
||||||
prometheusConfigReloader: 'quay.io/coreos/prometheus-config-reloader',
|
|
||||||
},
|
|
||||||
},
|
|
||||||
|
|
||||||
prometheusOperator+:: {
|
|
||||||
local po = self,
|
|
||||||
|
|
||||||
namespace:: $._config.namespace,
|
|
||||||
commonLabels:: $._config.prometheusOperator.commonLabels,
|
|
||||||
deploymentSelectorLabels:: $._config.prometheusOperator.deploymentSelectorLabels,
|
|
||||||
|
|
||||||
image:: $._config.imageRepos.prometheusOperator,
|
|
||||||
version:: $._config.versions.prometheusOperator,
|
|
||||||
configReloaderImage:: $._config.imageRepos.configmapReloader,
|
|
||||||
configReloaderVersion:: $._config.versions.configmapReloader,
|
|
||||||
prometheusConfigReloaderImage:: $._config.imageRepos.prometheusConfigReloader,
|
|
||||||
prometheusConfigReloaderVersion:: $._config.versions.prometheusConfigReloader,
|
|
||||||
|
|
||||||
// Prefixing with 0 to ensure these manifests are listed and therefore created first.
|
|
||||||
'0alertmanagerCustomResourceDefinition': import 'alertmanager-crd.libsonnet',
|
|
||||||
'0prometheusCustomResourceDefinition': import 'prometheus-crd.libsonnet',
|
|
||||||
'0servicemonitorCustomResourceDefinition': import 'servicemonitor-crd.libsonnet',
|
|
||||||
'0podmonitorCustomResourceDefinition': import 'podmonitor-crd.libsonnet',
|
|
||||||
'0prometheusruleCustomResourceDefinition': import 'prometheusrule-crd.libsonnet',
|
|
||||||
'0thanosrulerCustomResourceDefinition': import 'thanosruler-crd.libsonnet',
|
|
||||||
|
|
||||||
clusterRoleBinding:
|
|
||||||
local clusterRoleBinding = k.rbac.v1.clusterRoleBinding;
|
|
||||||
|
|
||||||
clusterRoleBinding.new() +
|
|
||||||
clusterRoleBinding.mixin.metadata.withLabels(po.commonLabels) +
|
|
||||||
clusterRoleBinding.mixin.metadata.withName('prometheus-operator') +
|
|
||||||
clusterRoleBinding.mixin.roleRef.withApiGroup('rbac.authorization.k8s.io') +
|
|
||||||
clusterRoleBinding.mixin.roleRef.withName('prometheus-operator') +
|
|
||||||
clusterRoleBinding.mixin.roleRef.mixinInstance({ kind: 'ClusterRole' }) +
|
|
||||||
clusterRoleBinding.withSubjects([{ kind: 'ServiceAccount', name: 'prometheus-operator', namespace: po.namespace }]),
|
|
||||||
|
|
||||||
clusterRole:
|
|
||||||
local clusterRole = k.rbac.v1.clusterRole;
|
|
||||||
local policyRule = clusterRole.rulesType;
|
|
||||||
|
|
||||||
local monitoringRule = policyRule.new() +
|
|
||||||
policyRule.withApiGroups(['monitoring.coreos.com']) +
|
|
||||||
policyRule.withResources([
|
|
||||||
'alertmanagers',
|
|
||||||
'alertmanagers/finalizers',
|
|
||||||
'prometheuses',
|
|
||||||
'prometheuses/finalizers',
|
|
||||||
'thanosrulers',
|
|
||||||
'thanosrulers/finalizers',
|
|
||||||
'servicemonitors',
|
|
||||||
'podmonitors',
|
|
||||||
'prometheusrules',
|
|
||||||
]) +
|
|
||||||
policyRule.withVerbs(['*']);
|
|
||||||
|
|
||||||
local appsRule = policyRule.new() +
|
|
||||||
policyRule.withApiGroups(['apps']) +
|
|
||||||
policyRule.withResources([
|
|
||||||
'statefulsets',
|
|
||||||
]) +
|
|
||||||
policyRule.withVerbs(['*']);
|
|
||||||
|
|
||||||
local coreRule = policyRule.new() +
|
|
||||||
policyRule.withApiGroups(['']) +
|
|
||||||
policyRule.withResources([
|
|
||||||
'configmaps',
|
|
||||||
'secrets',
|
|
||||||
]) +
|
|
||||||
policyRule.withVerbs(['*']);
|
|
||||||
|
|
||||||
local podRule = policyRule.new() +
|
|
||||||
policyRule.withApiGroups(['']) +
|
|
||||||
policyRule.withResources([
|
|
||||||
'pods',
|
|
||||||
]) +
|
|
||||||
policyRule.withVerbs(['list', 'delete']);
|
|
||||||
|
|
||||||
local routingRule = policyRule.new() +
|
|
||||||
policyRule.withApiGroups(['']) +
|
|
||||||
policyRule.withResources([
|
|
||||||
'services',
|
|
||||||
'services/finalizers',
|
|
||||||
'endpoints',
|
|
||||||
]) +
|
|
||||||
policyRule.withVerbs(['get', 'create', 'update', 'delete']);
|
|
||||||
|
|
||||||
local nodeRule = policyRule.new() +
|
|
||||||
policyRule.withApiGroups(['']) +
|
|
||||||
policyRule.withResources([
|
|
||||||
'nodes',
|
|
||||||
]) +
|
|
||||||
policyRule.withVerbs(['list', 'watch']);
|
|
||||||
|
|
||||||
local namespaceRule = policyRule.new() +
|
|
||||||
policyRule.withApiGroups(['']) +
|
|
||||||
policyRule.withResources([
|
|
||||||
'namespaces',
|
|
||||||
]) +
|
|
||||||
policyRule.withVerbs(['get', 'list', 'watch']);
|
|
||||||
|
|
||||||
local rules = [monitoringRule, appsRule, coreRule, podRule, routingRule, nodeRule, namespaceRule];
|
|
||||||
|
|
||||||
clusterRole.new() +
|
|
||||||
clusterRole.mixin.metadata.withLabels(po.commonLabels) +
|
|
||||||
clusterRole.mixin.metadata.withName('prometheus-operator') +
|
|
||||||
clusterRole.withRules(rules),
|
|
||||||
|
|
||||||
deployment:
|
|
||||||
local deployment = k.apps.v1.deployment;
|
|
||||||
local container = k.apps.v1.deployment.mixin.spec.template.spec.containersType;
|
|
||||||
local containerPort = container.portsType;
|
|
||||||
|
|
||||||
local targetPort = 8080;
|
|
||||||
|
|
||||||
local operatorContainer =
|
|
||||||
container.new('prometheus-operator', po.image + ':' + po.version) +
|
|
||||||
container.withPorts(containerPort.newNamed(targetPort, 'http')) +
|
|
||||||
container.withArgs([
|
|
||||||
'--kubelet-service=kube-system/kubelet',
|
|
||||||
// Prometheus Operator is run with a read-only root file system. By
|
|
||||||
// default glog saves logfiles to /tmp. Make it log to stderr instead.
|
|
||||||
'--logtostderr=true',
|
|
||||||
'--config-reloader-image=' + po.configReloaderImage + ':' + po.configReloaderVersion,
|
|
||||||
'--prometheus-config-reloader=' + po.prometheusConfigReloaderImage + ':' + po.prometheusConfigReloaderVersion,
|
|
||||||
]) +
|
|
||||||
container.mixin.securityContext.withAllowPrivilegeEscalation(false) +
|
|
||||||
container.mixin.resources.withRequests({ cpu: '100m', memory: '100Mi' }) +
|
|
||||||
container.mixin.resources.withLimits({ cpu: '200m', memory: '200Mi' });
|
|
||||||
|
|
||||||
deployment.new('prometheus-operator', 1, operatorContainer, po.commonLabels) +
|
|
||||||
deployment.mixin.metadata.withNamespace(po.namespace) +
|
|
||||||
deployment.mixin.metadata.withLabels(po.commonLabels) +
|
|
||||||
deployment.mixin.spec.selector.withMatchLabels(po.deploymentSelectorLabels) +
|
|
||||||
// Select Linux nodes via the stable 'kubernetes.io/os' label, matching the
// nodeSelector used by the Prometheus and prometheus-adapter manifests;
// 'beta.kubernetes.io/os' is the deprecated spelling of the same label.
deployment.mixin.spec.template.spec.withNodeSelector({ 'kubernetes.io/os': 'linux' }) +
|
|
||||||
deployment.mixin.spec.template.spec.securityContext.withRunAsNonRoot(true) +
|
|
||||||
deployment.mixin.spec.template.spec.securityContext.withRunAsUser(65534) +
|
|
||||||
deployment.mixin.spec.template.spec.withServiceAccountName('prometheus-operator'),
|
|
||||||
|
|
||||||
serviceAccount:
|
|
||||||
local serviceAccount = k.core.v1.serviceAccount;
|
|
||||||
|
|
||||||
serviceAccount.new('prometheus-operator') +
|
|
||||||
serviceAccount.mixin.metadata.withLabels(po.commonLabels) +
|
|
||||||
serviceAccount.mixin.metadata.withNamespace(po.namespace),
|
|
||||||
|
|
||||||
service:
|
|
||||||
local service = k.core.v1.service;
|
|
||||||
local servicePort = k.core.v1.service.mixin.spec.portsType;
|
|
||||||
|
|
||||||
local poServicePort = servicePort.newNamed('http', 8080, 'http');
|
|
||||||
|
|
||||||
service.new('prometheus-operator', po.deployment.spec.selector.matchLabels, [poServicePort]) +
|
|
||||||
service.mixin.metadata.withLabels(po.commonLabels) +
|
|
||||||
service.mixin.metadata.withNamespace(po.namespace) +
|
|
||||||
service.mixin.spec.withClusterIp('None'),
|
|
||||||
serviceMonitor:
|
|
||||||
{
|
|
||||||
apiVersion: 'monitoring.coreos.com/v1',
|
|
||||||
kind: 'ServiceMonitor',
|
|
||||||
metadata: {
|
|
||||||
name: 'prometheus-operator',
|
|
||||||
namespace: po.namespace,
|
|
||||||
labels: po.commonLabels,
|
|
||||||
},
|
|
||||||
spec: {
|
|
||||||
endpoints: [
|
|
||||||
{
|
|
||||||
port: 'http',
|
|
||||||
honorLabels: true,
|
|
||||||
},
|
|
||||||
],
|
|
||||||
selector: {
|
|
||||||
matchLabels: po.commonLabels,
|
|
||||||
},
|
|
||||||
},
|
|
||||||
},
|
|
||||||
},
|
|
||||||
}
|
|
|
@ -1 +0,0 @@
|
||||||
{"apiVersion":"apiextensions.k8s.io/v1","kind":"CustomResourceDefinition","metadata":{"annotations":{"controller-gen.kubebuilder.io/version":"v0.2.4"},"creationTimestamp":null,"name":"prometheusrules.monitoring.coreos.com"},"spec":{"group":"monitoring.coreos.com","names":{"kind":"PrometheusRule","listKind":"PrometheusRuleList","plural":"prometheusrules","singular":"prometheusrule"},"scope":"Namespaced","versions":[{"name":"v1","schema":{"openAPIV3Schema":{"description":"PrometheusRule defines alerting rules for a Prometheus instance","properties":{"apiVersion":{"description":"APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources","type":"string"},"kind":{"description":"Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds","type":"string"},"metadata":{"type":"object"},"spec":{"description":"Specification of desired alerting rule definitions for Prometheus.","properties":{"groups":{"description":"Content of Prometheus rule file","items":{"description":"RuleGroup is a list of sequentially evaluated recording and alerting rules. Note: PartialResponseStrategy is only used by ThanosRuler and will be ignored by Prometheus instances. Valid values for this field are 'warn' or 'abort'. 
More info: https://github.com/thanos-io/thanos/blob/master/docs/components/rule.md#partial-response","properties":{"interval":{"type":"string"},"name":{"type":"string"},"partial_response_strategy":{"type":"string"},"rules":{"items":{"description":"Rule describes an alerting or recording rule.","properties":{"alert":{"type":"string"},"annotations":{"additionalProperties":{"type":"string"},"type":"object"},"expr":{"anyOf":[{"type":"integer"},{"type":"string"}],"x-kubernetes-int-or-string":true},"for":{"type":"string"},"labels":{"additionalProperties":{"type":"string"},"type":"object"},"record":{"type":"string"}},"required":["expr"],"type":"object"},"type":"array"}},"required":["name","rules"],"type":"object"},"type":"array"}},"type":"object"}},"required":["spec"],"type":"object"}},"served":true,"storage":true}]},"status":{"acceptedNames":{"kind":"","plural":""},"conditions":[],"storedVersions":[]}}
|
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
10
monitoring/vendor/github.com/grafana/grafonnet-lib/grafonnet/dashboard.libsonnet
generated
vendored
10
monitoring/vendor/github.com/grafana/grafonnet-lib/grafonnet/dashboard.libsonnet
generated
vendored
|
@ -133,16 +133,18 @@ local timepickerlib = import 'timepicker.libsonnet';
|
||||||
name,
|
name,
|
||||||
label,
|
label,
|
||||||
type,
|
type,
|
||||||
pluginId,
|
pluginId=null,
|
||||||
pluginName,
|
pluginName=null,
|
||||||
description='',
|
description='',
|
||||||
|
value=null,
|
||||||
):: self {
|
):: self {
|
||||||
inputs+: [{
|
inputs+: [{
|
||||||
name: name,
|
name: name,
|
||||||
label: label,
|
label: label,
|
||||||
type: type,
|
type: type,
|
||||||
pluginId: pluginId,
|
[if pluginId != null then 'pluginId']: pluginId,
|
||||||
pluginName: pluginName,
|
[if pluginName != null then 'pluginName']: pluginName,
|
||||||
|
[if value != null then 'value']: value,
|
||||||
description: description,
|
description: description,
|
||||||
}],
|
}],
|
||||||
},
|
},
|
||||||
|
|
1
monitoring/vendor/github.com/grafana/grafonnet-lib/grafonnet/grafana.libsonnet
generated
vendored
1
monitoring/vendor/github.com/grafana/grafonnet-lib/grafonnet/grafana.libsonnet
generated
vendored
|
@ -25,5 +25,6 @@
|
||||||
pluginlist:: import 'pluginlist.libsonnet',
|
pluginlist:: import 'pluginlist.libsonnet',
|
||||||
gauge:: error 'gauge is removed, migrate to gaugePanel',
|
gauge:: error 'gauge is removed, migrate to gaugePanel',
|
||||||
gaugePanel:: import 'gauge_panel.libsonnet',
|
gaugePanel:: import 'gauge_panel.libsonnet',
|
||||||
|
barGaugePanel:: import 'bar_gauge_panel.libsonnet',
|
||||||
statPanel:: import 'stat_panel.libsonnet',
|
statPanel:: import 'stat_panel.libsonnet',
|
||||||
}
|
}
|
||||||
|
|
13
monitoring/vendor/github.com/grafana/grafonnet-lib/grafonnet/pie_chart_panel.libsonnet
generated
vendored
13
monitoring/vendor/github.com/grafana/grafonnet-lib/grafonnet/pie_chart_panel.libsonnet
generated
vendored
|
@ -12,6 +12,13 @@
|
||||||
* @param datasource Datasource
|
* @param datasource Datasource
|
||||||
* @param aliasColors Define color mappings
|
* @param aliasColors Define color mappings
|
||||||
* @param pieType Type of pie chart (one of pie or donut)
|
* @param pieType Type of pie chart (one of pie or donut)
|
||||||
|
* @param showLegend Show legend
|
||||||
|
* @param showLegendPercentage Show percentage values in the legend
|
||||||
|
* @param legendType Type of legend (one of 'Right side', 'Under graph' or 'On graph')
|
||||||
|
* @param valueName Type of tooltip value
|
||||||
|
* @param repeat Variable used to repeat the pie chart
|
||||||
|
* @param repeatDirection Which direction to repeat the panel, 'h' for horizontal and 'v' for vertical
|
||||||
|
* @param maxPerRow Number of panels to display when repeated. Used in combination with repeat.
|
||||||
* @return A json that represents a pie chart panel
|
* @return A json that represents a pie chart panel
|
||||||
*/
|
*/
|
||||||
new(
|
new(
|
||||||
|
@ -27,6 +34,9 @@
|
||||||
showLegend=true,
|
showLegend=true,
|
||||||
showLegendPercentage=true,
|
showLegendPercentage=true,
|
||||||
legendType='Right side',
|
legendType='Right side',
|
||||||
|
repeat=null,
|
||||||
|
repeatDirection=null,
|
||||||
|
maxPerRow=null,
|
||||||
):: {
|
):: {
|
||||||
type: 'grafana-piechart-panel',
|
type: 'grafana-piechart-panel',
|
||||||
[if description != null then 'description']: description,
|
[if description != null then 'description']: description,
|
||||||
|
@ -36,6 +46,9 @@
|
||||||
[if span != null then 'span']: span,
|
[if span != null then 'span']: span,
|
||||||
[if min_span != null then 'minSpan']: min_span,
|
[if min_span != null then 'minSpan']: min_span,
|
||||||
[if height != null then 'height']: height,
|
[if height != null then 'height']: height,
|
||||||
|
[if repeat != null then 'repeat']: repeat,
|
||||||
|
[if repeatDirection != null then 'repeatDirection']: repeatDirection,
|
||||||
|
[if maxPerRow != null then 'maxPerRow']: maxPerRow,
|
||||||
valueName: valueName,
|
valueName: valueName,
|
||||||
datasource: datasource,
|
datasource: datasource,
|
||||||
legend: {
|
legend: {
|
||||||
|
|
6
monitoring/vendor/github.com/grafana/grafonnet-lib/grafonnet/table_panel.libsonnet
generated
vendored
6
monitoring/vendor/github.com/grafana/grafonnet-lib/grafonnet/table_panel.libsonnet
generated
vendored
|
@ -16,6 +16,7 @@
|
||||||
* @param sort Sorting instruction for the panel
|
* @param sort Sorting instruction for the panel
|
||||||
* @param transform allow table manipulation to present data as desired
|
* @param transform allow table manipulation to present data as desired
|
||||||
* @param transparent Boolean (default: false) If set to true the panel will be transparent
|
* @param transparent Boolean (default: false) If set to true the panel will be transparent
|
||||||
|
* @param links Set of links for the panel.
|
||||||
* @return A json that represents a table panel
|
* @return A json that represents a table panel
|
||||||
*/
|
*/
|
||||||
new(
|
new(
|
||||||
|
@ -32,6 +33,7 @@
|
||||||
sort=null,
|
sort=null,
|
||||||
time_from=null,
|
time_from=null,
|
||||||
time_shift=null,
|
time_shift=null,
|
||||||
|
links=[],
|
||||||
):: {
|
):: {
|
||||||
type: 'table',
|
type: 'table',
|
||||||
title: title,
|
title: title,
|
||||||
|
@ -45,6 +47,7 @@
|
||||||
columns: columns,
|
columns: columns,
|
||||||
timeFrom: time_from,
|
timeFrom: time_from,
|
||||||
timeShift: time_shift,
|
timeShift: time_shift,
|
||||||
|
links: links,
|
||||||
[if sort != null then 'sort']: sort,
|
[if sort != null then 'sort']: sort,
|
||||||
[if description != null then 'description']: description,
|
[if description != null then 'description']: description,
|
||||||
[if transform != null then 'transform']: transform,
|
[if transform != null then 'transform']: transform,
|
||||||
|
@ -69,5 +72,8 @@
|
||||||
type: 'hidden',
|
type: 'hidden',
|
||||||
}],
|
}],
|
||||||
},
|
},
|
||||||
|
addLink(link):: self {
|
||||||
|
links+: [link],
|
||||||
|
},
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
90
monitoring/vendor/github.com/grafana/grafonnet-lib/grafonnet/template.libsonnet
generated
vendored
90
monitoring/vendor/github.com/grafana/grafonnet-lib/grafonnet/template.libsonnet
generated
vendored
|
@ -1,6 +1,25 @@
|
||||||
{
|
{
|
||||||
/**
|
/**
|
||||||
|
* Returns a new template that can be added to a dashboard.
|
||||||
|
* See what's a [template](https://grafana.com/docs/grafana/latest/variables/templates-and-variables/#templates).
|
||||||
|
*
|
||||||
* @name template.new
|
* @name template.new
|
||||||
|
*
|
||||||
|
* @param name Name of variable
|
||||||
|
* @param datasource Template [datasource](https://grafana.com/docs/grafana/latest/variables/variable-types/add-data-source-variable/)
|
||||||
|
* @param query [Query expression](https://grafana.com/docs/grafana/latest/variables/variable-types/add-query-variable/) for the datasource.
|
||||||
|
* @param label (optional) Display name of the variable dropdown. If null, then the dropdown label will be the variable name.
|
||||||
|
* @param allValues (optional) Formatting for [multi-value variables](https://grafana.com/docs/grafana/latest/variables/formatting-multi-value-variables/#formatting-multi-value-variables)
|
||||||
|
* @param tagValuesQuery (optional, experimental feature) Group values into [selectable tags](https://grafana.com/docs/grafana/latest/variables/variable-value-tags/)
|
||||||
|
* @param current
|
||||||
|
* @param hide '' (default) the variable dropdown displays the variable Name or Label value. 'label' the variable dropdown only displays the selected variable value and a down arrow. Any other value, no variable dropdown is displayed on the dashboard.
|
||||||
|
* @param regex (optional) Regex expression to filter or capture specific parts of the names returned by your data source query. To see examples, refer to [Filter variables with regex](https://grafana.com/docs/grafana/latest/variables/filter-variables-with-regex/).
|
||||||
|
* @param refresh 'never' (default): Variables queries are cached and values are not updated. This is fine if the values never change, but problematic if they are dynamic and change a lot. 'load': Queries the data source every time the dashboard loads. This slows down dashboard loading, because the variable query needs to be completed before dashboard can be initialized. 'time': Queries the data source when the dashboard time range changes. Only use this option if your variable options query contains a time range filter or is dependent on the dashboard time range.
|
||||||
|
* @param includeAll Whether all value option is available or not. False by default.
|
||||||
|
* @param multi Whether multiple values can be selected or not from variable value list. False by default.
|
||||||
|
* @param sort 0 (default): Without Sort, 1: Alphabetical (asc), 2: Alphabetical (desc), 3: Numerical (asc), 4: Numerical (desc).
|
||||||
|
*
|
||||||
|
* @return A [template](https://grafana.com/docs/grafana/latest/variables/templates-and-variables/#templates)
|
||||||
*/
|
*/
|
||||||
new(
|
new(
|
||||||
name,
|
name,
|
||||||
|
@ -38,7 +57,20 @@
|
||||||
useTags: false,
|
useTags: false,
|
||||||
},
|
},
|
||||||
/**
|
/**
|
||||||
|
* Use an [interval variable](https://grafana.com/docs/grafana/latest/variables/variable-types/add-interval-variable/) to represent time spans such as '1m', '1h', '1d'. You can think of them as a dashboard-wide "group by time" command. Interval variables change how the data is grouped in the visualization. You can also use the Auto Option to return a set number of data points per time span.
|
||||||
|
* You can use an interval variable as a parameter to group by time (for InfluxDB), date histogram interval (for Elasticsearch), or as a summarize function parameter (for Graphite).
|
||||||
|
*
|
||||||
* @name template.interval
|
* @name template.interval
|
||||||
|
*
|
||||||
|
* @param name Variable name
|
||||||
|
* @param query
|
||||||
|
* @param current
|
||||||
|
* @param hide '' (default) the variable dropdown displays the variable Name or Label value. 'label' the variable dropdown only displays the selected variable value and a down arrow. Any other value, no variable dropdown is displayed on the dashboard.
|
||||||
|
* @param label (optional) Display name of the variable dropdown. If null, then the dropdown label will be the variable name.
|
||||||
|
* @param auto_count (default: 300) Valid only if 'auto' is defined in query. Number of times the current time range will be divided to calculate the value, similar to the Max data points query option. For example, if the current visible time range is 30 minutes, then the auto interval groups the data into 30 one-minute increments. The default value is 30 steps.
|
||||||
|
* @param auto_min (default: '10s') Valid only if 'auto' is defined in query. The minimum threshold below which the step count intervals will not divide the time. To continue the 30 minute example, if the minimum interval is set to 2m, then Grafana would group the data into 15 two-minute increments.
|
||||||
|
*
|
||||||
|
* @return A new interval variable for templating.
|
||||||
*/
|
*/
|
||||||
interval(
|
interval(
|
||||||
name,
|
name,
|
||||||
|
@ -122,18 +154,24 @@
|
||||||
hide='',
|
hide='',
|
||||||
)::
|
)::
|
||||||
{
|
{
|
||||||
|
// self has dynamic scope, so self may not be myself below.
|
||||||
|
// '$' can't be used either, as this object is not the top-level object.
|
||||||
|
local custom = self,
|
||||||
|
|
||||||
allValue: allValues,
|
allValue: allValues,
|
||||||
current: {
|
current: {
|
||||||
value: current,
|
// Both 'all' and 'All' are accepted for consistency.
|
||||||
text: if current in valuelabels then valuelabels[current] else current,
|
value: if includeAll && (current == 'All' || current == 'all') then
|
||||||
|
if multi then ['$__all'] else '$__all'
|
||||||
|
else
|
||||||
|
current,
|
||||||
|
text: if std.isArray(current) then
|
||||||
|
std.join(' + ', std.map(custom.valuelabel, current))
|
||||||
|
else
|
||||||
|
custom.valuelabel(current),
|
||||||
|
[if multi then 'selected']: true,
|
||||||
},
|
},
|
||||||
options: std.map(
|
options: std.map(self.option, self.query_array(query)),
|
||||||
function(i)
|
|
||||||
{
|
|
||||||
text: if i in valuelabels then valuelabels[i] else i,
|
|
||||||
value: i,
|
|
||||||
}, std.split(query, ',')
|
|
||||||
),
|
|
||||||
hide: $.hide(hide),
|
hide: $.hide(hide),
|
||||||
includeAll: includeAll,
|
includeAll: includeAll,
|
||||||
label: label,
|
label: label,
|
||||||
|
@ -142,6 +180,24 @@
|
||||||
name: name,
|
name: name,
|
||||||
query: query,
|
query: query,
|
||||||
type: 'custom',
|
type: 'custom',
|
||||||
|
|
||||||
|
valuelabel(value):: if value in valuelabels then
|
||||||
|
valuelabels[value]
|
||||||
|
else value,
|
||||||
|
|
||||||
|
option(option):: {
|
||||||
|
text: custom.valuelabel(option),
|
||||||
|
value: if includeAll && option == 'All' then '$__all' else option,
|
||||||
|
[if multi then 'selected']: if multi && std.isArray(current) then
|
||||||
|
std.member(current, option)
|
||||||
|
else if multi then
|
||||||
|
current == option
|
||||||
|
else
|
||||||
|
null,
|
||||||
|
},
|
||||||
|
query_array(query):: std.split(
|
||||||
|
if includeAll then 'All,' + query else query, ','
|
||||||
|
),
|
||||||
},
|
},
|
||||||
/**
|
/**
|
||||||
* @name template.text
|
* @name template.text
|
||||||
|
@ -161,4 +217,20 @@
|
||||||
query: '',
|
query: '',
|
||||||
type: 'textbox',
|
type: 'textbox',
|
||||||
},
|
},
|
||||||
|
/**
|
||||||
|
* @name template.adhoc
|
||||||
|
*/
|
||||||
|
adhoc(
|
||||||
|
name,
|
||||||
|
datasource,
|
||||||
|
label=null,
|
||||||
|
hide='',
|
||||||
|
)::
|
||||||
|
{
|
||||||
|
datasource: datasource,
|
||||||
|
hide: $.hide(hide),
|
||||||
|
label: label,
|
||||||
|
name: name,
|
||||||
|
type: 'adhoc',
|
||||||
|
},
|
||||||
}
|
}
|
||||||
|
|
6
monitoring/vendor/github.com/grafana/jsonnet-libs/grafana-builder/grafana.libsonnet
generated
vendored
6
monitoring/vendor/github.com/grafana/jsonnet-libs/grafana-builder/grafana.libsonnet
generated
vendored
|
@ -1,5 +1,5 @@
|
||||||
{
|
{
|
||||||
dashboard(title, uid=''):: {
|
dashboard(title, uid='', datasource='default'):: {
|
||||||
// Stuff that isn't materialised.
|
// Stuff that isn't materialised.
|
||||||
_nextPanel:: 1,
|
_nextPanel:: 1,
|
||||||
addRow(row):: self {
|
addRow(row):: self {
|
||||||
|
@ -88,8 +88,8 @@
|
||||||
list: [
|
list: [
|
||||||
{
|
{
|
||||||
current: {
|
current: {
|
||||||
text: 'default',
|
text: datasource,
|
||||||
value: 'default',
|
value: datasource,
|
||||||
},
|
},
|
||||||
hide: 0,
|
hide: 0,
|
||||||
label: null,
|
label: null,
|
||||||
|
|
5
monitoring/vendor/github.com/kubernetes-monitoring/kubernetes-mixin/README.md
generated
vendored
5
monitoring/vendor/github.com/kubernetes-monitoring/kubernetes-mixin/README.md
generated
vendored
|
@ -13,7 +13,8 @@ A set of Grafana dashboards and Prometheus alerts for Kubernetes.
|
||||||
| release-0.2 | v1.14.1 and before | v2.11.0+ |
|
| release-0.2 | v1.14.1 and before | v2.11.0+ |
|
||||||
| release-0.3 | v1.17 and before | v2.11.0+ |
|
| release-0.3 | v1.17 and before | v2.11.0+ |
|
||||||
| release-0.4 | v1.18 | v2.11.0+ |
|
| release-0.4 | v1.18 | v2.11.0+ |
|
||||||
| master | v1.18+ | v2.11.0+ |
|
| release-0.5 | v1.19 | v2.11.0+ |
|
||||||
|
| master | v1.19 | v2.11.0+ |
|
||||||
|
|
||||||
In Kubernetes 1.14 there was a major [metrics overhaul](https://github.com/kubernetes/enhancements/issues/1206) implemented.
|
In Kubernetes 1.14 there was a major [metrics overhaul](https://github.com/kubernetes/enhancements/issues/1206) implemented.
|
||||||
Therefore v0.1.x of this repository is the last release to support Kubernetes 1.13 and previous versions on a best effort basis.
|
Therefore v0.1.x of this repository is the last release to support Kubernetes 1.13 and previous versions on a best effort basis.
|
||||||
|
@ -254,4 +255,4 @@ While the community has not yet fully agreed on alert severities and their to be
|
||||||
## Note
|
## Note
|
||||||
|
|
||||||
You can use the external tool called [prom-metrics-check](https://github.com/ContainerSolutions/prom-metrics-check) to validate the created dashboards. This tool allows you to check if the metrics installed and used in Grafana dashboards exist in the Prometheus instance.
|
You can use the external tool called [prom-metrics-check](https://github.com/ContainerSolutions/prom-metrics-check) to validate the created dashboards. This tool allows you to check if the metrics installed and used in Grafana dashboards exist in the Prometheus instance.
|
||||||
Please have a look at https://github.com/ContainerSolutions/prom-metrics-check.
|
Please have a look at https://github.com/ContainerSolutions/prom-metrics-check.
|
||||||
|
|
|
@ -18,7 +18,8 @@
|
||||||
severity: 'warning',
|
severity: 'warning',
|
||||||
},
|
},
|
||||||
annotations: {
|
annotations: {
|
||||||
message: 'Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is restarting {{ printf "%.2f" $value }} times / 5 minutes.',
|
description: 'Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is restarting {{ printf "%.2f" $value }} times / 5 minutes.',
|
||||||
|
summary: 'Pod is crash looping.',
|
||||||
},
|
},
|
||||||
'for': '15m',
|
'for': '15m',
|
||||||
alert: 'KubePodCrashLooping',
|
alert: 'KubePodCrashLooping',
|
||||||
|
@ -41,7 +42,8 @@
|
||||||
severity: 'warning',
|
severity: 'warning',
|
||||||
},
|
},
|
||||||
annotations: {
|
annotations: {
|
||||||
message: 'Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for longer than 15 minutes.',
|
description: 'Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for longer than 15 minutes.',
|
||||||
|
summary: 'Pod has been in a non-ready state for more than 15 minutes.',
|
||||||
},
|
},
|
||||||
'for': '15m',
|
'for': '15m',
|
||||||
alert: 'KubePodNotReady',
|
alert: 'KubePodNotReady',
|
||||||
|
@ -56,7 +58,8 @@
|
||||||
severity: 'warning',
|
severity: 'warning',
|
||||||
},
|
},
|
||||||
annotations: {
|
annotations: {
|
||||||
message: 'Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment }} does not match, this indicates that the Deployment has failed but has not been rolled back.',
|
description: 'Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment }} does not match, this indicates that the Deployment has failed but has not been rolled back.',
|
||||||
|
summary: 'Deployment generation mismatch due to possible roll-back',
|
||||||
},
|
},
|
||||||
'for': '15m',
|
'for': '15m',
|
||||||
alert: 'KubeDeploymentGenerationMismatch',
|
alert: 'KubeDeploymentGenerationMismatch',
|
||||||
|
@ -77,7 +80,8 @@
|
||||||
severity: 'warning',
|
severity: 'warning',
|
||||||
},
|
},
|
||||||
annotations: {
|
annotations: {
|
||||||
message: 'Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not matched the expected number of replicas for longer than 15 minutes.',
|
description: 'Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not matched the expected number of replicas for longer than 15 minutes.',
|
||||||
|
summary: 'Deployment has not matched the expected number of replicas.',
|
||||||
},
|
},
|
||||||
'for': '15m',
|
'for': '15m',
|
||||||
alert: 'KubeDeploymentReplicasMismatch',
|
alert: 'KubeDeploymentReplicasMismatch',
|
||||||
|
@ -98,7 +102,8 @@
|
||||||
severity: 'warning',
|
severity: 'warning',
|
||||||
},
|
},
|
||||||
annotations: {
|
annotations: {
|
||||||
message: 'StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not matched the expected number of replicas for longer than 15 minutes.',
|
description: 'StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not matched the expected number of replicas for longer than 15 minutes.',
|
||||||
|
summary: 'Deployment has not matched the expected number of replicas.',
|
||||||
},
|
},
|
||||||
'for': '15m',
|
'for': '15m',
|
||||||
alert: 'KubeStatefulSetReplicasMismatch',
|
alert: 'KubeStatefulSetReplicasMismatch',
|
||||||
|
@ -113,7 +118,8 @@
|
||||||
severity: 'warning',
|
severity: 'warning',
|
||||||
},
|
},
|
||||||
annotations: {
|
annotations: {
|
||||||
message: 'StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset }} does not match, this indicates that the StatefulSet has failed but has not been rolled back.',
|
description: 'StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset }} does not match, this indicates that the StatefulSet has failed but has not been rolled back.',
|
||||||
|
summary: 'StatefulSet generation mismatch due to possible roll-back',
|
||||||
},
|
},
|
||||||
'for': '15m',
|
'for': '15m',
|
||||||
alert: 'KubeStatefulSetGenerationMismatch',
|
alert: 'KubeStatefulSetGenerationMismatch',
|
||||||
|
@ -142,7 +148,8 @@
|
||||||
severity: 'warning',
|
severity: 'warning',
|
||||||
},
|
},
|
||||||
annotations: {
|
annotations: {
|
||||||
message: 'StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out.',
|
description: 'StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out.',
|
||||||
|
summary: 'StatefulSet update has not been rolled out.',
|
||||||
},
|
},
|
||||||
'for': '15m',
|
'for': '15m',
|
||||||
alert: 'KubeStatefulSetUpdateNotRolledOut',
|
alert: 'KubeStatefulSetUpdateNotRolledOut',
|
||||||
|
@ -178,7 +185,8 @@
|
||||||
severity: 'warning',
|
severity: 'warning',
|
||||||
},
|
},
|
||||||
annotations: {
|
annotations: {
|
||||||
message: 'DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not finished or progressed for at least 15 minutes.',
|
description: 'DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not finished or progressed for at least 15 minutes.',
|
||||||
|
summary: 'DaemonSet rollout is stuck.',
|
||||||
},
|
},
|
||||||
'for': '15m',
|
'for': '15m',
|
||||||
},
|
},
|
||||||
|
@ -190,7 +198,8 @@
|
||||||
severity: 'warning',
|
severity: 'warning',
|
||||||
},
|
},
|
||||||
annotations: {
|
annotations: {
|
||||||
message: 'Pod {{ $labels.namespace }}/{{ $labels.pod }} container {{ $labels.container}} has been in waiting state for longer than 1 hour.',
|
description: 'Pod {{ $labels.namespace }}/{{ $labels.pod }} container {{ $labels.container}} has been in waiting state for longer than 1 hour.',
|
||||||
|
summary: 'Pod container waiting longer than 1 hour',
|
||||||
},
|
},
|
||||||
'for': '1h',
|
'for': '1h',
|
||||||
alert: 'KubeContainerWaiting',
|
alert: 'KubeContainerWaiting',
|
||||||
|
@ -206,7 +215,8 @@
|
||||||
severity: 'warning',
|
severity: 'warning',
|
||||||
},
|
},
|
||||||
annotations: {
|
annotations: {
|
||||||
message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled.',
|
description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled.',
|
||||||
|
summary: 'DaemonSet pods are not scheduled.',
|
||||||
},
|
},
|
||||||
'for': '10m',
|
'for': '10m',
|
||||||
},
|
},
|
||||||
|
@ -219,7 +229,8 @@
|
||||||
severity: 'warning',
|
severity: 'warning',
|
||||||
},
|
},
|
||||||
annotations: {
|
annotations: {
|
||||||
message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run.',
|
description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run.',
|
||||||
|
summary: 'DaemonSet pods are misscheduled.',
|
||||||
},
|
},
|
||||||
'for': '15m',
|
'for': '15m',
|
||||||
},
|
},
|
||||||
|
@ -233,7 +244,8 @@
|
||||||
severity: 'warning',
|
severity: 'warning',
|
||||||
},
|
},
|
||||||
annotations: {
|
annotations: {
|
||||||
message: 'Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than 12 hours to complete.',
|
description: 'Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than 12 hours to complete.',
|
||||||
|
summary: 'Job did not complete in time',
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -246,7 +258,8 @@
|
||||||
severity: 'warning',
|
severity: 'warning',
|
||||||
},
|
},
|
||||||
annotations: {
|
annotations: {
|
||||||
message: 'Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete.',
|
description: 'Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete.',
|
||||||
|
summary: 'Job failed to complete.',
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -261,7 +274,8 @@
|
||||||
severity: 'warning',
|
severity: 'warning',
|
||||||
},
|
},
|
||||||
annotations: {
|
annotations: {
|
||||||
message: 'HPA {{ $labels.namespace }}/{{ $labels.hpa }} has not matched the desired number of replicas for longer than 15 minutes.',
|
description: 'HPA {{ $labels.namespace }}/{{ $labels.hpa }} has not matched the desired number of replicas for longer than 15 minutes.',
|
||||||
|
summary: 'HPA has not matched descired number of replicas.',
|
||||||
},
|
},
|
||||||
'for': '15m',
|
'for': '15m',
|
||||||
alert: 'KubeHpaReplicasMismatch',
|
alert: 'KubeHpaReplicasMismatch',
|
||||||
|
@ -276,7 +290,8 @@
|
||||||
severity: 'warning',
|
severity: 'warning',
|
||||||
},
|
},
|
||||||
annotations: {
|
annotations: {
|
||||||
message: 'HPA {{ $labels.namespace }}/{{ $labels.hpa }} has been running at max replicas for longer than 15 minutes.',
|
description: 'HPA {{ $labels.namespace }}/{{ $labels.hpa }} has been running at max replicas for longer than 15 minutes.',
|
||||||
|
summary: 'HPA is running at max replicas',
|
||||||
},
|
},
|
||||||
'for': '15m',
|
'for': '15m',
|
||||||
alert: 'KubeHpaMaxedOut',
|
alert: 'KubeHpaMaxedOut',
|
||||||
|
|
|
@ -35,7 +35,8 @@ local utils = import 'utils.libsonnet';
|
||||||
long: '%(long)s' % w,
|
long: '%(long)s' % w,
|
||||||
},
|
},
|
||||||
annotations: {
|
annotations: {
|
||||||
message: 'The API server is burning too much error budget',
|
description: 'The API server is burning too much error budget.',
|
||||||
|
summary: 'The API server is burning too much error budget.',
|
||||||
},
|
},
|
||||||
'for': '%(for)s' % w,
|
'for': '%(for)s' % w,
|
||||||
}
|
}
|
||||||
|
@ -54,7 +55,8 @@ local utils = import 'utils.libsonnet';
|
||||||
severity: 'warning',
|
severity: 'warning',
|
||||||
},
|
},
|
||||||
annotations: {
|
annotations: {
|
||||||
message: 'A client certificate used to authenticate to the apiserver is expiring in less than %s.' % (utils.humanizeSeconds($._config.certExpirationWarningSeconds)),
|
description: 'A client certificate used to authenticate to the apiserver is expiring in less than %s.' % (utils.humanizeSeconds($._config.certExpirationWarningSeconds)),
|
||||||
|
summary: 'Client certificate is about to expire.',
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -66,7 +68,8 @@ local utils = import 'utils.libsonnet';
|
||||||
severity: 'critical',
|
severity: 'critical',
|
||||||
},
|
},
|
||||||
annotations: {
|
annotations: {
|
||||||
message: 'A client certificate used to authenticate to the apiserver is expiring in less than %s.' % (utils.humanizeSeconds($._config.certExpirationCriticalSeconds)),
|
description: 'A client certificate used to authenticate to the apiserver is expiring in less than %s.' % (utils.humanizeSeconds($._config.certExpirationCriticalSeconds)),
|
||||||
|
summary: 'Client certificate is about to expire.',
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -78,7 +81,8 @@ local utils = import 'utils.libsonnet';
|
||||||
severity: 'warning',
|
severity: 'warning',
|
||||||
},
|
},
|
||||||
annotations: {
|
annotations: {
|
||||||
message: 'An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported errors. The number of errors have increased for it in the past five minutes. High values indicate that the availability of the service changes too often.',
|
description: 'An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported errors. The number of errors have increased for it in the past five minutes. High values indicate that the availability of the service changes too often.',
|
||||||
|
summary: 'An aggregated API has reported errors.',
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -91,7 +95,8 @@ local utils = import 'utils.libsonnet';
|
||||||
severity: 'warning',
|
severity: 'warning',
|
||||||
},
|
},
|
||||||
annotations: {
|
annotations: {
|
||||||
message: 'An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}% available over the last 5m.',
|
description: 'An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}% available over the last 5m.',
|
||||||
|
summary: 'An aggregated API is down.',
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
(import '../lib/absent_alert.libsonnet') {
|
(import '../lib/absent_alert.libsonnet') {
|
||||||
|
|
|
@ -22,7 +22,8 @@
|
||||||
severity: 'warning',
|
severity: 'warning',
|
||||||
},
|
},
|
||||||
annotations: {
|
annotations: {
|
||||||
message: '{{ $labels.node }} has been unready for more than 15 minutes.',
|
description: '{{ $labels.node }} has been unready for more than 15 minutes.',
|
||||||
|
summary: 'Node is not ready.',
|
||||||
},
|
},
|
||||||
'for': '15m',
|
'for': '15m',
|
||||||
alert: 'KubeNodeNotReady',
|
alert: 'KubeNodeNotReady',
|
||||||
|
@ -37,7 +38,8 @@
|
||||||
severity: 'warning',
|
severity: 'warning',
|
||||||
},
|
},
|
||||||
annotations: {
|
annotations: {
|
||||||
message: '{{ $labels.node }} is unreachable and some workloads may be rescheduled.',
|
description: '{{ $labels.node }} is unreachable and some workloads may be rescheduled.',
|
||||||
|
summary: 'Node is unreachable.',
|
||||||
},
|
},
|
||||||
alert: 'KubeNodeUnreachable',
|
alert: 'KubeNodeUnreachable',
|
||||||
},
|
},
|
||||||
|
@ -59,7 +61,8 @@
|
||||||
severity: 'warning',
|
severity: 'warning',
|
||||||
},
|
},
|
||||||
annotations: {
|
annotations: {
|
||||||
message: "Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage }} of its Pod capacity.",
|
description: "Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage }} of its Pod capacity.",
|
||||||
|
summary: 'Kubelet is running at capacity.',
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -72,7 +75,8 @@
|
||||||
severity: 'warning',
|
severity: 'warning',
|
||||||
},
|
},
|
||||||
annotations: {
|
annotations: {
|
||||||
message: 'The readiness status of node {{ $labels.node }} has changed {{ $value }} times in the last 15 minutes.',
|
description: 'The readiness status of node {{ $labels.node }} has changed {{ $value }} times in the last 15 minutes.',
|
||||||
|
summary: 'Node readiness status is flapping.',
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -85,7 +89,8 @@
|
||||||
severity: 'warning',
|
severity: 'warning',
|
||||||
},
|
},
|
||||||
annotations: {
|
annotations: {
|
||||||
message: 'The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration of {{ $value }} seconds on node {{ $labels.node }}.',
|
description: 'The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration of {{ $value }} seconds on node {{ $labels.node }}.',
|
||||||
|
summary: 'Kubelet Pod Lifecycle Event Generator is taking too long to relist.',
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -98,7 +103,8 @@
|
||||||
severity: 'warning',
|
severity: 'warning',
|
||||||
},
|
},
|
||||||
annotations: {
|
annotations: {
|
||||||
message: 'Kubelet Pod startup 99th percentile latency is {{ $value }} seconds on node {{ $labels.node }}.',
|
description: 'Kubelet Pod startup 99th percentile latency is {{ $value }} seconds on node {{ $labels.node }}.',
|
||||||
|
summary: 'Kubelet Pod startup latency is too high.',
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
(import '../lib/absent_alert.libsonnet') {
|
(import '../lib/absent_alert.libsonnet') {
|
||||||
|
|
|
@ -35,7 +35,8 @@
|
||||||
severity: 'warning',
|
severity: 'warning',
|
||||||
},
|
},
|
||||||
annotations: {
|
annotations: {
|
||||||
message: 'Cluster has overcommitted CPU resource requests for Pods and cannot tolerate node failure.',
|
description: 'Cluster has overcommitted CPU resource requests for Pods and cannot tolerate node failure.',
|
||||||
|
summary: 'Cluster has overcommitted CPU resource requests.',
|
||||||
},
|
},
|
||||||
'for': '5m',
|
'for': '5m',
|
||||||
},
|
},
|
||||||
|
@ -54,7 +55,8 @@
|
||||||
severity: 'warning',
|
severity: 'warning',
|
||||||
},
|
},
|
||||||
annotations: {
|
annotations: {
|
||||||
message: 'Cluster has overcommitted memory resource requests for Pods and cannot tolerate node failure.',
|
description: 'Cluster has overcommitted memory resource requests for Pods and cannot tolerate node failure.',
|
||||||
|
summary: 'Cluster has overcommitted memory resource requests.',
|
||||||
},
|
},
|
||||||
'for': '5m',
|
'for': '5m',
|
||||||
},
|
},
|
||||||
|
@ -70,7 +72,8 @@
|
||||||
severity: 'warning',
|
severity: 'warning',
|
||||||
},
|
},
|
||||||
annotations: {
|
annotations: {
|
||||||
message: 'Cluster has overcommitted CPU resource requests for Namespaces.',
|
description: 'Cluster has overcommitted CPU resource requests for Namespaces.',
|
||||||
|
summary: 'Cluster has overcommitted CPU resource requests.',
|
||||||
},
|
},
|
||||||
'for': '5m',
|
'for': '5m',
|
||||||
},
|
},
|
||||||
|
@ -86,7 +89,8 @@
|
||||||
severity: 'warning',
|
severity: 'warning',
|
||||||
},
|
},
|
||||||
annotations: {
|
annotations: {
|
||||||
message: 'Cluster has overcommitted memory resource requests for Namespaces.',
|
description: 'Cluster has overcommitted memory resource requests for Namespaces.',
|
||||||
|
summary: 'Cluster has overcommitted memory resource requests.',
|
||||||
},
|
},
|
||||||
'for': '5m',
|
'for': '5m',
|
||||||
},
|
},
|
||||||
|
@ -103,7 +107,8 @@
|
||||||
severity: 'info',
|
severity: 'info',
|
||||||
},
|
},
|
||||||
annotations: {
|
annotations: {
|
||||||
message: 'Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota.',
|
description: 'Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota.',
|
||||||
|
summary: 'Namespace quota is fully used.',
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -119,7 +124,8 @@
|
||||||
severity: 'info',
|
severity: 'info',
|
||||||
},
|
},
|
||||||
annotations: {
|
annotations: {
|
||||||
message: '{{ $value | humanizePercentage }} throttling of CPU in namespace {{ $labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod }}.',
|
description: '{{ $value | humanizePercentage }} throttling of CPU in namespace {{ $labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod }}.',
|
||||||
|
summary: 'Processes experience elevated CPU throttling.',
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
|
|
|
@ -29,7 +29,8 @@
|
||||||
severity: 'critical',
|
severity: 'critical',
|
||||||
},
|
},
|
||||||
annotations: {
|
annotations: {
|
||||||
message: 'The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is only {{ $value | humanizePercentage }} free.',
|
description: 'The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is only {{ $value | humanizePercentage }} free.',
|
||||||
|
summary: 'PersistentVolume is filling up.',
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -48,7 +49,8 @@
|
||||||
severity: 'warning',
|
severity: 'warning',
|
||||||
},
|
},
|
||||||
annotations: {
|
annotations: {
|
||||||
message: 'Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is expected to fill up within four days. Currently {{ $value | humanizePercentage }} is available.',
|
description: 'Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is expected to fill up within four days. Currently {{ $value | humanizePercentage }} is available.',
|
||||||
|
summary: 'PersistentVolume is filling up.',
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -61,7 +63,8 @@
|
||||||
severity: 'critical',
|
severity: 'critical',
|
||||||
},
|
},
|
||||||
annotations: {
|
annotations: {
|
||||||
message: 'The persistent volume {{ $labels.persistentvolume }} has status {{ $labels.phase }}.',
|
description: 'The persistent volume {{ $labels.persistentvolume }} has status {{ $labels.phase }}.',
|
||||||
|
summary: 'PersistentVolume is having issues with provisioning.',
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
|
|
|
@ -18,7 +18,8 @@
|
||||||
severity: 'warning',
|
severity: 'warning',
|
||||||
},
|
},
|
||||||
annotations: {
|
annotations: {
|
||||||
message: 'There are {{ $value }} different semantic versions of Kubernetes components running.',
|
description: 'There are {{ $value }} different semantic versions of Kubernetes components running.',
|
||||||
|
summary: 'Different semantic versions of Kubernetes components running.',
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -37,7 +38,8 @@
|
||||||
severity: 'warning',
|
severity: 'warning',
|
||||||
},
|
},
|
||||||
annotations: {
|
annotations: {
|
||||||
message: "Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ $value | humanizePercentage }} errors.'",
|
description: "Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ $value | humanizePercentage }} errors.'",
|
||||||
|
summary: 'Kubernetes API server client is experiencing errors.',
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
|
|
|
@ -247,10 +247,10 @@ local singlestat = grafana.singlestat;
|
||||||
span=12,
|
span=12,
|
||||||
),
|
),
|
||||||
gridPos={
|
gridPos={
|
||||||
"h": 2,
|
h: 2,
|
||||||
"w": 24,
|
w: 24,
|
||||||
"x": 0,
|
x: 0,
|
||||||
"y": 0
|
y: 0,
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
.addRow(
|
.addRow(
|
||||||
|
|
|
@ -12,6 +12,7 @@
|
||||||
severity: 'critical',
|
severity: 'critical',
|
||||||
},
|
},
|
||||||
annotations: {
|
annotations: {
|
||||||
message: '%s has disappeared from Prometheus target discovery.' % absentAlert.componentName,
|
description: '%s has disappeared from Prometheus target discovery.' % absentAlert.componentName,
|
||||||
|
summary: 'Target disappeared from Prometheus target discovery.',
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
|
@ -17,7 +17,8 @@
|
||||||
severity: 'critical',
|
severity: 'critical',
|
||||||
},
|
},
|
||||||
annotations: {
|
annotations: {
|
||||||
message: 'kube-state-metrics is experiencing errors at an elevated rate in list operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.',
|
summary: 'kube-state-metrics is experiencing errors in list operations.',
|
||||||
|
description: 'kube-state-metrics is experiencing errors at an elevated rate in list operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.',
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -33,7 +34,8 @@
|
||||||
severity: 'critical',
|
severity: 'critical',
|
||||||
},
|
},
|
||||||
annotations: {
|
annotations: {
|
||||||
message: 'kube-state-metrics is experiencing errors at an elevated rate in watch operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.',
|
summary: 'kube-state-metrics is experiencing errors in watch operations.',
|
||||||
|
description: 'kube-state-metrics is experiencing errors at an elevated rate in watch operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.',
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
|
|
2
monitoring/vendor/kube-prometheus
vendored
2
monitoring/vendor/kube-prometheus
vendored
|
@ -1 +1 @@
|
||||||
github.com/coreos/kube-prometheus/jsonnet/kube-prometheus
|
github.com/prometheus-operator/kube-prometheus/jsonnet/kube-prometheus
|
2
monitoring/vendor/prometheus-operator
vendored
2
monitoring/vendor/prometheus-operator
vendored
|
@ -1 +1 @@
|
||||||
github.com/coreos/prometheus-operator/jsonnet/prometheus-operator
|
github.com/prometheus-operator/prometheus-operator/jsonnet/prometheus-operator
|
Reference in a new issue