upgrade monitoring - switch to prometheus-operator/kube-prometheus
continuous-integration/drone/push: Build is passing

Tobias Brunner 2020-08-19 20:44:52 +02:00
parent df9c32f59c
commit 0108ac6084
94 changed files with 539 additions and 9535 deletions
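For context on the mechanics of the switch: kube-prometheus stacks like this one are assembled from a jsonnet entrypoint and re-rendered after the dependencies are re-vendored. Below is a minimal sketch of such an entrypoint, assuming the conventional example.jsonnet layout (the file name and the components rendered are illustrative, not taken from this repository). Note the import path is unaffected by the coreos/ to prometheus-operator/ move, since the dependency name and subdir stay the same:

local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + {
  _config+:: {
    namespace: 'monitoring',
  },
};

// One rendered manifest per component object, here for the operator and Grafana.
{ ['prometheus-operator-' + name]: kp.prometheusOperator[name] for name in std.objectFields(kp.prometheusOperator) } +
{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) }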

View File

@@ -4,7 +4,7 @@
{
"source": {
"git": {
"remote": "https://github.com/coreos/kube-prometheus",
"remote": "https://github.com/prometheus-operator/kube-prometheus",
"subdir": "jsonnet/kube-prometheus"
}
},

View File

@@ -8,8 +8,8 @@
"subdir": "grafana"
}
},
"version": "014301fd5f71d8305a395b2fb437089a7b1a3999",
"sum": "RHtpk2c0CcliWyt6F4DIgwpi4cEfHADK7nAxIw6RTGs="
"version": "18c50c83ea49291b0aa00067e4b2b386556ba0e3",
"sum": "GEVLrcKGvUuvjq6bDhaWr4fOwkG5QMDpnUhHxUUywwg="
},
{
"source": {
@@ -18,28 +18,8 @@
"subdir": "Documentation/etcd-mixin"
}
},
"version": "9006d8d4f9d82f6cce6eb93d6f2dfe7c154fa05d",
"sum": "Uv8ysXlEACF7BafoCkHnrBmJ2AHh/VldI5mm3BuMiy0="
},
{
"source": {
"git": {
"remote": "https://github.com/coreos/kube-prometheus",
"subdir": "jsonnet/kube-prometheus"
}
},
"version": "6771c9bcc287e8047510207a4ab60fa5e63e48fe",
"sum": "52ukcsyazUhdJWb48PPGQQurdFrGE0xgKYE++yWO7aI="
},
{
"source": {
"git": {
"remote": "https://github.com/coreos/prometheus-operator",
"subdir": "jsonnet/prometheus-operator"
}
},
"version": "0dca0f21ffff72a063db8855b5d515e15ab0dccb",
"sum": "WggWVWZ+CBEUThQCztSaRELbtqdXf9s3OFzf06HbYNA="
"version": "44dea5df0364a46234bc9f6fd90bdb960f7a1ee8",
"sum": "NhOkJWkO7ZO2DSE8Fvipcs7Hh2/GOCS0WjPPZU8OiaQ="
},
{
"source": {
@@ -48,8 +28,8 @@
"subdir": "grafonnet"
}
},
"version": "ad85aec356b4544a41f62ac8c32f8042c0ffc42e",
"sum": "JHhSwlCa9A+AwG4o+YEXXFDbQ91iwwd9G/FoYnGhObw="
"version": "0cfef6b1e666316d40a255c93e1d98d2a2610009",
"sum": "BVozpBqmTw67JSEDX9DRLEzCLMt0+aQjDBKGaTLS+Bc="
},
{
"source": {
@@ -58,8 +38,8 @@
"subdir": "grafana-builder"
}
},
"version": "f2a35172b97a0c944c4a167bb1f6e688624602e4",
"sum": "N65Fv0M2JvFE3GN8ZxP5xh1U5a314ey8geLAioJLzF8="
"version": "0d5c2119373cb53d9f02fb82b94857412f7540c8",
"sum": "R5WJe6wW0R9vMpOAHaGFwcK8q4NmGZ0aLhdZGKDHeMU="
},
{
"source": {
@@ -79,8 +59,8 @@
"subdir": ""
}
},
"version": "6eab5fe2dfde77494c0452ce7ec3ed3ff21d9631",
"sum": "skD7Rm0m5lOQOn8IrnGEdJyhWUI7qsKPXwcci7Hjn0E="
"version": "b1005adad5940eee9366272ab4c85cf077e547c2",
"sum": "FVMHGdPPdJ3OYoR2b2ZjqQq/cxmS7/Y5dYkA9oYQ3Vg="
},
{
"source": {
@@ -89,7 +69,7 @@
"subdir": "lib/promgrafonnet"
}
},
"version": "6eab5fe2dfde77494c0452ce7ec3ed3ff21d9631",
"version": "b1005adad5940eee9366272ab4c85cf077e547c2",
"sum": "VhgBM39yv0f4bKv8VfGg4FXkg573evGDRalip9ypKbc="
},
{
@@ -99,7 +79,7 @@
"subdir": "jsonnet/kube-state-metrics"
}
},
"version": "e43aaa6d6e3554d050ead73b4814566b771377d1",
"version": "f1166b40298a6846a329f567d0d509d32ee45a81",
"sum": "cJjGZaLBjcIGrLHZLjRPU9c3KL+ep9rZTb9dbALSKqA="
},
{
@@ -109,8 +89,28 @@
"subdir": "jsonnet/kube-state-metrics-mixin"
}
},
"version": "e43aaa6d6e3554d050ead73b4814566b771377d1",
"sum": "o5avaguRsfFwYFNen00ZEsub1x4i8Z/ZZ2QoEjFMff8="
"version": "f1166b40298a6846a329f567d0d509d32ee45a81",
"sum": "Yf8mNAHrV1YWzrdV8Ry5dJ8YblepTGw3C0Zp10XIYLo="
},
{
"source": {
"git": {
"remote": "https://github.com/prometheus-operator/kube-prometheus",
"subdir": "jsonnet/kube-prometheus"
}
},
"version": "4f872f1e3187509e4c8d53d7e1f654b5fa422977",
"sum": "8pGMGEIp8YC6bpvrxhkSdi/mBbexogdFnvHPlZJmfg4="
},
{
"source": {
"git": {
"remote": "https://github.com/prometheus-operator/prometheus-operator",
"subdir": "jsonnet/prometheus-operator"
}
},
"version": "312d675008306b13c24d241bf4f0a882dbfa90d8",
"sum": "NPuLvqEmYZ+dCQ/9U4wXtobBD6hYreEx3jPpLQKS/ig="
},
{
"source": {
@@ -119,7 +119,7 @@
"subdir": "docs/node-mixin"
}
},
"version": "503e4fc8486c0082d6bd8c53fad646bcfafeedf6",
"version": "3b035c8fa1f75c4c00e57acc14fb71dfd62e31ee",
"sum": "3jFV2qsc/GZe2GADswTYqxxP2zGOiANTj73W/VNFGqc="
},
{
@@ -129,7 +129,7 @@
"subdir": "documentation/prometheus-mixin"
}
},
"version": "348ff4285ffa59907c9d7fd7eb3cb7d748f42758",
"version": "983ebb4a513302315a8117932ab832815f85e3d2",
"sum": "TBq4SL7YsPInARbJqwz25JaBvvAegcnRCsuz3K9niWc=",
"name": "prometheus"
},
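Every entry in this lock file is a jsonnet-bundler pin: version is the vendored git commit and sum an integrity hash over the vendored tree, both rewritten by jb update. The hand-maintained counterpart in jsonnetfile.json (first hunk above) declares only the source and a tracked ref, roughly like this (the "master" ref is an assumption; jb resolves whatever ref is tracked to the pinned commit):

{
  "source": {
    "git": {
      "remote": "https://github.com/prometheus-operator/kube-prometheus",
      "subdir": "jsonnet/kube-prometheus"
    }
  },
  "version": "master"
}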

View File

@@ -6,7 +6,7 @@ metadata:
namespace: monitoring
spec:
encryptedData:
alertmanager.yaml: AgAXarLIP1IirGmTPrtvSSP9TvqIaj/af+j98QCp9+OXhqFtVmjCcKnv5gDRV+kRtGdX8p+GetDWq14uY0dq1uHMTgy/TV9cZ9FCsjEi8+Knk4h01FI/D8vjGA2en/tDkM8/Jz4XcqZ+2yU0Mh7yKhc0pg0/wg/tVLuI04ZzYkjYFPb2DZJyqQKOYZQaIhaf6NpLe4oFAb9OKbvIUT2hu5wGz0Q2aqjSiW0JyhVje367x6oYe/TNN6Qri8ZDeDKlq155XpXpSa3HcQLWuxIvizlTjEbnPr0fa8kFl+v1iRPtB5hNc6dB1w/oyj1YPpVk00JaIGeDy9FXmwDc6MyFjoLWfDWY95mGCN1NJ7q1see/OLZevwLYi+zQYuOH/SIhNHee9GAWTiqqt7RE0GnzEC1kp9J+8caVbtv6aUqpNCaUByFjmeE3gzFNGEZXVQEQpj+IeOSF2f46TOL/F7TS4EKlzMZZVapLLSyuHiq8MNW3xPNnpJ47fDPiJoLGTiYYQgnRrTXzCmsgAHwNBagRJug+y3g+kZbxdlvdjkzqJ3LEoMLxbh9xuHMuiosqodVjIWNT60R/k/Is5DeM9I1Jvfavx7bBeXAJEkkkgeQDNbzqHqlCEbda3dDpbtpxa19FcHvJuRrNlQ7t/eW0FZBviMl2ETI1j8k2HDJeHqJB94QxuPEtMFknE0elIFdJHG5aOmrGECa5u4Oucn4GnmAOLsrVKPgaIQiFka/BJBqmt5kliiDdB3R+1n2nL7X5iXQyD8Wt5j7CRstqb9VX1q4Hr/Devu0IebaGgE7Y2ledhlzsqlriphlyTLUSU4/55WH5hRvhRcakbwtswLUmHA+JeYDN0ojX8roGtEEedCQ9BBn0J6VADF8JyAfNKaliX1/lBW1hImAAUR16EY0fVRUaXWCCrF7n8uPjWAbbwEBkZeaI8j5yagrspD8XPTFPO2qny2ma54DF/sEPJ9cgPmK3P0U4L488G8io8mIBXTEv6XxZ41vV6fAXuWB4/OX8UWUmak7aRURssfAbYuT30XBvUl2dc4Hp8IVwhhpr8sDAhZS9BLBr1+SYn9GPR1NBO5GF8ow8pGr0b6382OinVEtquUO5K80UBsbLDg2OPW7sWIdzXEneoHiFFPNx0Vo8ljquP1qpcneHUl9V5p6yJPKo2iMj0dB8RwCm3ueoldydGq0HrK1ZY1D9gnyJmF/dq496woI8Ccv/EdYkpPw+E7bm3nnR8J+E9tuwhfUZQVQoaWz4HdWufk+FyfOOqo9x1KamekteMdw1ZcqV3a1UfE6M0+YjZtIy7511tcdKYwdHahcAmYGciTfQb5REyjHWEFlMalph+LPtfUzPlqGgN7CKptUhEsSOxMRll3z8LmeT9JiejWcdRtFI6lqJYk2GWKj0rVscZo3GUiqkvKMsDQGqCs4/r7OzWIPfDjkpH5SNCHjX5eMUI3LfuIDfNo2uEWLzabhWpbfw97dvH52/B56D9AnxqXdy0qUjU3ZGeS3yCDvhi29wPgaDgS9T7nao7PgHfPUFUO54NhKR6iY+rwWRyBa+2OMfe/YUvxslqAr758NKVPQM4iV5bUat1UzBy/qSspyig7GF2Dkc3FvqoxlJnWCyDFRCoWZJ1msR+CZdGlGOreNwpmrYVQnJmoxImKIS5chEfzGIDKJu+/4eO+g/ysE/vyHAyC9YG2bv2ueD5c+KYiG4pA==
alertmanager.yaml: AgCboDZWf4XxlcxeQlbcdqr32zZfxDW2NlpWo57B7tIoomuTwY8aprGZiFtrXOLJvSGZPh3JRjVoLxKayaO01Hkcok3kQ/DbxwJBQsH+4GPmk/7uXdM3Ph++IVtnVDLi5E6fv1GrmSlJFxvc4MaEUOGSbrSG+eubVQmwJAKxy1ODz4oL6M+1o8YkIBLUZBlD10yZDoh+NODzRJniVQKlyqiHTk9w+aI6KaZOtY+4HZBnO6jGAW4Jdk46V45vHwGkQrMDecNwqQiymCk1dAAcWrkn/FSEygMH7+SyVYi8Ej7QKEn835TKxAiQsOezhu/o3SNyYbBGnrBDH7JHI4ckXop2iVh5hEvprXmNemXrEzCwILsOF4A4NItbrZ88FQYitc6ppMvz5YIyF083IrPJgn8jkn6RYQL53nupfvlQECgh3au53IRqahiDJZRNz8QolOwozNASLOKrk3erK21az+vxoKB29yfVMBPE8+Z/1Yk6RNueliLnF2KpCMyK5Cz/WFsy3aiFE5T5QTDounFghL9p7wtPPJgoFQ5uGDeV4k705c/FQLXS8gCNkc+LrjdHpzWOnucjacfgepZmm5T3JMxTixu2CtuYbiNuinJSEJeyRilYpEDFdNW3NhP/oExDNh6FITz+qX/K8x7I++f6Os0LcUmDOfdU4ZleL6dMe5GLjERd7a7I4LVmMya6cZRvGaP4770K7ItQnCZecgl61KmQWlpPMwpjStsj9hCm+o6PRK+9Cf5LEujbP1maMpJvzonaLEAk9EHoYnXpHWXbCj7PGkqctRbHbLAv/WQoKL2NJdXmTqxMJzE+5CRKlWAIqGtaCVd7RzkwGArZtPy+Q1x7nK4JPpd28zoBQZEoLXQvgxbocmjEuaLEvbEy8neCd7FVTLsQnvrJUBCkuRaUm41ZtrjWzAslpSFWEDo/WnomF1aM/vs5zj7eiwHu/Q7nzInz9EKKlo9O3/r0EnhGP6M3bEHfr608fNzVLkdOlcrJLzliJ85h9c+8KrvrU75hzFylQ/7UWw3rnC+c1axMvCxRFvUFwdtn6DSkfL3ZxvK0J0S/nuH3FI/iTzYa5wO6ACihNPc1TbmSqqT1qolbIHq9iXGvDwA9WrcjCIAuwNpGPzw5WfUuskY4wDiJNJzVOGi8thcXH9y2epyiY+OTNGnkelJDnfhUwL0phlFgCgNI4e6TaJ5V3O5L2Psq6z8X0Zme4uR2AQl67NHhybneCa4eK+8XGNRmNRr78BQDW3tZ/i3Vk/lYDYMW+QzCqMmXlbBx5KemJdx8NDOwQnzEKnWmACqSP9ljZ8c0RCGlIjOXNwu/AZqUlQFhtn0BqPcifI8GGEZQWj5xGvZSwnn3Y32DmFBokSqcS1huaQ/q2BtUFU3pkNoI7A0pW7f/W1RxW8cvxr9oXkSijWjC763rno1BndOZtyedLKCdCORIf1PUjMKwKIq38xfHyGqNO+Hsi2WwIodJ6w1mQQfn+QBDVQHLeoc9fZ85lzw65WSPwSnGUtlZIQD/RktgzY8inGR74VE1wYXGy5AuvsLnn3GZ06rQgsho58JdR5rqKFGR8kVS7kTXrQpn2SOJ1AJrykFDdxKhx/ecsFBqDp/QsRIrUqhjCREGoI274rtTy2U9lduVhHfwJ6P/I68NdxXyOig8o1fZ4NnrxFDmV0X58Xd1yAWVIFGDFJW+VaQqxCUQH951F1GFXQ==
template:
metadata:
creationTimestamp: null

View File

@@ -2030,6 +2030,9 @@ items:
"id": 5,
"lines": true,
"linewidth": 1,
"links": [
],
"minSpan": 24,
"nullPointMode": "null as zero",
"renderer": "flot",
@@ -19273,6 +19276,9 @@ items:
"id": 5,
"lines": true,
"linewidth": 1,
"links": [
],
"minSpan": 24,
"nullPointMode": "null as zero",
"renderer": "flot",
@@ -20644,6 +20650,9 @@ items:
"id": 5,
"lines": true,
"linewidth": 1,
"links": [
],
"minSpan": 24,
"nullPointMode": "null as zero",
"renderer": "flot",

View File

@@ -122,6 +122,7 @@ spec:
nodeSelector:
beta.kubernetes.io/os: linux
securityContext:
fsGroup: 65534
runAsNonRoot: true
runAsUser: 65534
serviceAccountName: grafana
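The added fsGroup gives volumes mounted into the Grafana pod group ownership 65534, so the container, which runs as that non-root UID, can write to its data directory. Expressed as a standalone jsonnet patch over the Deployment's pod template, the resulting securityContext is roughly (a sketch, not code from this repository):

{
  spec+: {
    template+: {
      spec+: {
        securityContext: {
          fsGroup: 65534,     // group applied to mounted volumes
          runAsNonRoot: true,
          runAsUser: 65534,   // the "nobody" user
        },
      },
    },
  },
}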

View File

@@ -3,7 +3,7 @@ kind: ClusterRole
metadata:
labels:
app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/version: 1.9.5
app.kubernetes.io/version: 1.9.7
name: kube-state-metrics
rules:
- apiGroups:

View File

@@ -3,7 +3,7 @@ kind: ClusterRoleBinding
metadata:
labels:
app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/version: 1.9.5
app.kubernetes.io/version: 1.9.7
name: kube-state-metrics
roleRef:
apiGroup: rbac.authorization.k8s.io

View File

@@ -3,7 +3,7 @@ kind: Deployment
metadata:
labels:
app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/version: 1.9.5
app.kubernetes.io/version: 1.9.7
name: kube-state-metrics
namespace: monitoring
spec:
@@ -15,7 +15,7 @@ spec:
metadata:
labels:
app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/version: 1.9.5
app.kubernetes.io/version: 1.9.7
spec:
containers:
- args:
@@ -23,7 +23,7 @@ spec:
- --port=8081
- --telemetry-host=127.0.0.1
- --telemetry-port=8082
image: quay.io/coreos/kube-state-metrics:v1.9.5
image: quay.io/coreos/kube-state-metrics:v1.9.7
name: kube-state-metrics
securityContext:
runAsUser: 65534
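The 1.9.5 to 1.9.7 bump changes the image tag and every app.kubernetes.io/version label in lockstep across the ClusterRole, ClusterRoleBinding, Deployment, Service, ServiceAccount and ServiceMonitor. With kube-prometheus these files are generated rather than edited by hand; pinning a different kube-state-metrics version would be a small config override along these lines (the versions key name is an assumption based on kube-prometheus _config conventions of that era):

local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + {
  _config+:: {
    versions+:: {
      kubeStateMetrics: '1.9.7',  // assumed key; propagated into the image tag and labels
    },
  },
};
kp.kubeStateMetrics.deployment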

View File

@@ -3,7 +3,7 @@ kind: Service
metadata:
labels:
app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/version: 1.9.5
app.kubernetes.io/version: 1.9.7
name: kube-state-metrics
namespace: monitoring
spec:

View File

@@ -3,6 +3,6 @@ kind: ServiceAccount
metadata:
labels:
app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/version: 1.9.5
app.kubernetes.io/version: 1.9.7
name: kube-state-metrics
namespace: monitoring

View File

@@ -3,7 +3,7 @@ kind: ServiceMonitor
metadata:
labels:
app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/version: 1.9.5
app.kubernetes.io/version: 1.9.7
name: kube-state-metrics
namespace: monitoring
spec:

View File

@@ -4,7 +4,7 @@ metadata:
labels:
app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/version: v0.40.0
app.kubernetes.io/version: v0.41.1
name: prometheus-operator
namespace: monitoring
spec:
@@ -19,4 +19,4 @@ spec:
matchLabels:
app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/version: v0.40.0
app.kubernetes.io/version: v0.41.1

View File

@@ -788,10 +788,11 @@ spec:
rules:
- alert: KubeStateMetricsListErrors
annotations:
message: kube-state-metrics is experiencing errors at an elevated rate in
list operations. This is likely causing it to not be able to expose metrics
description: kube-state-metrics is experiencing errors at an elevated rate
in list operations. This is likely causing it to not be able to expose metrics
about Kubernetes objects correctly or at all.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatemetricslisterrors
summary: kube-state-metrics is experiencing errors in list operations.
expr: |
(sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m]))
/
@@ -802,10 +803,11 @@ spec:
severity: critical
- alert: KubeStateMetricsWatchErrors
annotations:
message: kube-state-metrics is experiencing errors at an elevated rate in
watch operations. This is likely causing it to not be able to expose metrics
about Kubernetes objects correctly or at all.
description: kube-state-metrics is experiencing errors at an elevated rate
in watch operations. This is likely causing it to not be able to expose
metrics about Kubernetes objects correctly or at all.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatemetricswatcherrors
summary: kube-state-metrics is experiencing errors in watch operations.
expr: |
(sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m]))
/
@@ -1024,9 +1026,10 @@ spec:
rules:
- alert: KubePodCrashLooping
annotations:
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container
description: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container
}}) is restarting {{ printf "%.2f" $value }} times / 5 minutes.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping
summary: Pod is crash looping.
expr: |
rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[5m]) * 60 * 5 > 0
for: 15m
@@ -1034,9 +1037,10 @@ spec:
severity: warning
- alert: KubePodNotReady
annotations:
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready
description: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready
state for longer than 15 minutes.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready
summary: Pod has been in a non-ready state for more than 15 minutes.
expr: |
sum by (namespace, pod) (
max by(namespace, pod) (
@@ -1050,10 +1054,11 @@ spec:
severity: warning
- alert: KubeDeploymentGenerationMismatch
annotations:
message: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment
description: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment
}} does not match, this indicates that the Deployment has failed but has
not been rolled back.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentgenerationmismatch
summary: Deployment generation mismatch due to possible roll-back
expr: |
kube_deployment_status_observed_generation{job="kube-state-metrics"}
!=
@@ -1063,9 +1068,10 @@ spec:
severity: warning
- alert: KubeDeploymentReplicasMismatch
annotations:
message: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not
matched the expected number of replicas for longer than 15 minutes.
description: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has
not matched the expected number of replicas for longer than 15 minutes.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentreplicasmismatch
summary: Deployment has not matched the expected number of replicas.
expr: |
(
kube_deployment_spec_replicas{job="kube-state-metrics"}
@@ -1081,9 +1087,10 @@ spec:
severity: warning
- alert: KubeStatefulSetReplicasMismatch
annotations:
message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has
not matched the expected number of replicas for longer than 15 minutes.
description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }}
has not matched the expected number of replicas for longer than 15 minutes.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetreplicasmismatch
summary: Deployment has not matched the expected number of replicas.
expr: |
(
kube_statefulset_status_replicas_ready{job="kube-state-metrics"}
@@ -1099,10 +1106,11 @@ spec:
severity: warning
- alert: KubeStatefulSetGenerationMismatch
annotations:
message: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset
description: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset
}} does not match, this indicates that the StatefulSet has failed but has
not been rolled back.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetgenerationmismatch
summary: StatefulSet generation mismatch due to possible roll-back
expr: |
kube_statefulset_status_observed_generation{job="kube-state-metrics"}
!=
@@ -1112,9 +1120,10 @@ spec:
severity: warning
- alert: KubeStatefulSetUpdateNotRolledOut
annotations:
message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update
has not been rolled out.
description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }}
update has not been rolled out.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetupdatenotrolledout
summary: StatefulSet update has not been rolled out.
expr: |
(
max without (revision) (
@@ -1138,9 +1147,10 @@ spec:
severity: warning
- alert: KubeDaemonSetRolloutStuck
annotations:
message: DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not
finished or progressed for at least 15 minutes.
description: DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has
not finished or progressed for at least 15 minutes.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck
summary: DaemonSet rollout is stuck.
expr: |
(
(
@@ -1170,9 +1180,10 @@ spec:
severity: warning
- alert: KubeContainerWaiting
annotations:
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} container {{ $labels.container}}
description: Pod {{ $labels.namespace }}/{{ $labels.pod }} container {{ $labels.container}}
has been in waiting state for longer than 1 hour.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontainerwaiting
summary: Pod container waiting longer than 1 hour
expr: |
sum by (namespace, pod, container) (kube_pod_container_status_waiting_reason{job="kube-state-metrics"}) > 0
for: 1h
@@ -1180,9 +1191,10 @@ spec:
severity: warning
- alert: KubeDaemonSetNotScheduled
annotations:
message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
}} are not scheduled.'
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetnotscheduled
summary: DaemonSet pods are not scheduled.
expr: |
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"}
-
@@ -1192,9 +1204,10 @@ spec:
severity: warning
- alert: KubeDaemonSetMisScheduled
annotations:
message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
}} are running where they are not supposed to run.'
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetmisscheduled
summary: DaemonSet pods are misscheduled.
expr: |
kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} > 0
for: 15m
@@ -1202,9 +1215,10 @@ spec:
severity: warning
- alert: KubeJobCompletion
annotations:
message: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more
than 12 hours to complete.
description: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking
more than 12 hours to complete.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobcompletion
summary: Job did not complete in time
expr: |
kube_job_spec_completions{job="kube-state-metrics"} - kube_job_status_succeeded{job="kube-state-metrics"} > 0
for: 12h
@@ -1212,8 +1226,10 @@ spec:
severity: warning
- alert: KubeJobFailed
annotations:
message: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete.
description: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to
complete.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobfailed
summary: Job failed to complete.
expr: |
kube_job_failed{job="kube-state-metrics"} > 0
for: 15m
@@ -1221,9 +1237,10 @@ spec:
severity: warning
- alert: KubeHpaReplicasMismatch
annotations:
message: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has not matched the
desired number of replicas for longer than 15 minutes.
description: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has not matched
the desired number of replicas for longer than 15 minutes.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubehpareplicasmismatch
summary: HPA has not matched descired number of replicas.
expr: |
(kube_hpa_status_desired_replicas{job="kube-state-metrics"}
!=
@@ -1235,9 +1252,10 @@ spec:
severity: warning
- alert: KubeHpaMaxedOut
annotations:
message: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has been running at
max replicas for longer than 15 minutes.
description: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has been running
at max replicas for longer than 15 minutes.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubehpamaxedout
summary: HPA is running at max replicas
expr: |
kube_hpa_status_current_replicas{job="kube-state-metrics"}
==
@@ -1249,9 +1267,10 @@ spec:
rules:
- alert: KubeCPUOvercommit
annotations:
message: Cluster has overcommitted CPU resource requests for Pods and cannot
tolerate node failure.
description: Cluster has overcommitted CPU resource requests for Pods and
cannot tolerate node failure.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit
summary: Cluster has overcommitted CPU resource requests.
expr: |
sum(namespace:kube_pod_container_resource_requests_cpu_cores:sum{})
/
@@ -1263,9 +1282,10 @@ spec:
severity: warning
- alert: KubeMemoryOvercommit
annotations:
message: Cluster has overcommitted memory resource requests for Pods and cannot
tolerate node failure.
description: Cluster has overcommitted memory resource requests for Pods and
cannot tolerate node failure.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememoryovercommit
summary: Cluster has overcommitted memory resource requests.
expr: |
sum(namespace:kube_pod_container_resource_requests_memory_bytes:sum{})
/
@@ -1279,8 +1299,9 @@ spec:
severity: warning
- alert: KubeCPUQuotaOvercommit
annotations:
message: Cluster has overcommitted CPU resource requests for Namespaces.
description: Cluster has overcommitted CPU resource requests for Namespaces.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuquotaovercommit
summary: Cluster has overcommitted CPU resource requests.
expr: |
sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="cpu"})
/
@@ -1291,8 +1312,9 @@ spec:
severity: warning
- alert: KubeMemoryQuotaOvercommit
annotations:
message: Cluster has overcommitted memory resource requests for Namespaces.
description: Cluster has overcommitted memory resource requests for Namespaces.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememoryquotaovercommit
summary: Cluster has overcommitted memory resource requests.
expr: |
sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="memory"})
/
@@ -1303,9 +1325,10 @@ spec:
severity: warning
- alert: KubeQuotaFullyUsed
annotations:
message: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage
description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage
}} of its {{ $labels.resource }} quota.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotafullyused
summary: Namespace quota is fully used.
expr: |
kube_resourcequota{job="kube-state-metrics", type="used"}
/ ignoring(instance, job, type)
@@ -1316,10 +1339,11 @@ spec:
severity: info
- alert: CPUThrottlingHigh
annotations:
message: '{{ $value | humanizePercentage }} throttling of CPU in namespace
description: '{{ $value | humanizePercentage }} throttling of CPU in namespace
{{ $labels.namespace }} for container {{ $labels.container }} in pod {{
$labels.pod }}.'
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-cputhrottlinghigh
summary: Processes experience elevated CPU throttling.
expr: |
sum(increase(container_cpu_cfs_throttled_periods_total{container!="", }[5m])) by (container, pod, namespace)
/
@@ -1332,10 +1356,11 @@ spec:
rules:
- alert: KubePersistentVolumeFillingUp
annotations:
message: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim
description: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim
}} in Namespace {{ $labels.namespace }} is only {{ $value | humanizePercentage
}} free.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefillingup
summary: PersistentVolume is filling up.
expr: |
kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"}
/
@@ -1346,10 +1371,12 @@ spec:
severity: critical
- alert: KubePersistentVolumeFillingUp
annotations:
message: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim
}} in Namespace {{ $labels.namespace }} is expected to fill up within four
days. Currently {{ $value | humanizePercentage }} is available.
description: Based on recent sampling, the PersistentVolume claimed by {{
$labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is
expected to fill up within four days. Currently {{ $value | humanizePercentage
}} is available.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefillingup
summary: PersistentVolume is filling up.
expr: |
(
kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"}
@@ -1363,9 +1390,10 @@ spec:
severity: warning
- alert: KubePersistentVolumeErrors
annotations:
message: The persistent volume {{ $labels.persistentvolume }} has status {{
$labels.phase }}.
description: The persistent volume {{ $labels.persistentvolume }} has status
{{ $labels.phase }}.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeerrors
summary: PersistentVolume is having issues with provisioning.
expr: |
kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0
for: 5m
@@ -1375,9 +1403,10 @@ spec:
rules:
- alert: KubeVersionMismatch
annotations:
message: There are {{ $value }} different semantic versions of Kubernetes
description: There are {{ $value }} different semantic versions of Kubernetes
components running.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeversionmismatch
summary: Different semantic versions of Kubernetes components running.
expr: |
count(count by (gitVersion) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"gitVersion","$1","gitVersion","(v[0-9]*.[0-9]*).*"))) > 1
for: 15m
@@ -1385,9 +1414,10 @@ spec:
severity: warning
- alert: KubeClientErrors
annotations:
message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance
description: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance
}}' is experiencing {{ $value | humanizePercentage }} errors.'
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors
summary: Kubernetes API server client is experiencing errors.
expr: |
(sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (instance, job)
/
@@ -1400,8 +1430,9 @@ spec:
rules:
- alert: KubeAPIErrorBudgetBurn
annotations:
message: The API server is burning too much error budget
description: The API server is burning too much error budget.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn
summary: The API server is burning too much error budget.
expr: |
sum(apiserver_request:burnrate1h) > (14.40 * 0.01000)
and
@@ -1413,8 +1444,9 @@ spec:
short: 5m
- alert: KubeAPIErrorBudgetBurn
annotations:
message: The API server is burning too much error budget
description: The API server is burning too much error budget.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn
summary: The API server is burning too much error budget.
expr: |
sum(apiserver_request:burnrate6h) > (6.00 * 0.01000)
and
@@ -1426,8 +1458,9 @@ spec:
short: 30m
- alert: KubeAPIErrorBudgetBurn
annotations:
message: The API server is burning too much error budget
description: The API server is burning too much error budget.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn
summary: The API server is burning too much error budget.
expr: |
sum(apiserver_request:burnrate1d) > (3.00 * 0.01000)
and
@@ -1439,8 +1472,9 @@ spec:
short: 2h
- alert: KubeAPIErrorBudgetBurn
annotations:
message: The API server is burning too much error budget
description: The API server is burning too much error budget.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn
summary: The API server is burning too much error budget.
expr: |
sum(apiserver_request:burnrate3d) > (1.00 * 0.01000)
and
@@ -1454,38 +1488,42 @@ spec:
rules:
- alert: KubeClientCertificateExpiration
annotations:
message: A client certificate used to authenticate to the apiserver is expiring
in less than 7.0 days.
description: A client certificate used to authenticate to the apiserver is
expiring in less than 7.0 days.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
summary: Client certificate is about to expire.
expr: |
apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800
labels:
severity: warning
- alert: KubeClientCertificateExpiration
annotations:
message: A client certificate used to authenticate to the apiserver is expiring
in less than 24.0 hours.
description: A client certificate used to authenticate to the apiserver is
expiring in less than 24.0 hours.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
summary: Client certificate is about to expire.
expr: |
apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400
labels:
severity: critical
- alert: AggregatedAPIErrors
annotations:
message: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has
reported errors. The number of errors have increased for it in the past
description: An aggregated API {{ $labels.name }}/{{ $labels.namespace }}
has reported errors. The number of errors have increased for it in the past
five minutes. High values indicate that the availability of the service
changes too often.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-aggregatedapierrors
summary: An aggregated API has reported errors.
expr: |
sum by(name, namespace)(increase(aggregator_unavailable_apiservice_count[5m])) > 2
labels:
severity: warning
- alert: AggregatedAPIDown
annotations:
message: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has
been only {{ $value | humanize }}% available over the last 5m.
description: An aggregated API {{ $labels.name }}/{{ $labels.namespace }}
has been only {{ $value | humanize }}% available over the last 5m.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-aggregatedapidown
summary: An aggregated API is down.
expr: |
(1 - max by(name, namespace)(avg_over_time(aggregator_unavailable_apiservice[5m]))) * 100 < 90
for: 5m
@@ -1493,8 +1531,9 @@ spec:
severity: warning
- alert: KubeAPIDown
annotations:
message: KubeAPI has disappeared from Prometheus target discovery.
description: KubeAPI has disappeared from Prometheus target discovery.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapidown
summary: Target disappeared from Prometheus target discovery.
expr: |
absent(up{job="apiserver"} == 1)
for: 15m
@@ -1504,8 +1543,9 @@ spec:
rules:
- alert: KubeNodeNotReady
annotations:
message: '{{ $labels.node }} has been unready for more than 15 minutes.'
description: '{{ $labels.node }} has been unready for more than 15 minutes.'
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodenotready
summary: Node is not ready.
expr: |
kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0
for: 15m
@@ -1513,17 +1553,20 @@ spec:
severity: warning
- alert: KubeNodeUnreachable
annotations:
message: '{{ $labels.node }} is unreachable and some workloads may be rescheduled.'
description: '{{ $labels.node }} is unreachable and some workloads may be
rescheduled.'
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodeunreachable
summary: Node is unreachable.
expr: |
(kube_node_spec_taint{job="kube-state-metrics",key="node.kubernetes.io/unreachable",effect="NoSchedule"} unless ignoring(key,value) kube_node_spec_taint{job="kube-state-metrics",key=~"ToBeDeletedByClusterAutoscaler|cloud.google.com/impending-node-termination|aws-node-termination-handler/spot-itn"}) == 1
labels:
severity: warning
- alert: KubeletTooManyPods
annotations:
message: Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage
description: Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage
}} of its Pod capacity.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods
summary: Kubelet is running at capacity.
expr: |
count by(node) (
(kube_pod_status_phase{job="kube-state-metrics",phase="Running"} == 1) * on(instance,pod,namespace,cluster) group_left(node) topk by(instance,pod,namespace,cluster) (1, kube_pod_info{job="kube-state-metrics"})
@@ -1537,9 +1580,10 @@ spec:
severity: warning
- alert: KubeNodeReadinessFlapping
annotations:
message: The readiness status of node {{ $labels.node }} has changed {{ $value
}} times in the last 15 minutes.
description: The readiness status of node {{ $labels.node }} has changed {{
$value }} times in the last 15 minutes.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodereadinessflapping
summary: Node readiness status is flapping.
expr: |
sum(changes(kube_node_status_condition{status="true",condition="Ready"}[15m])) by (node) > 2
for: 15m
@@ -1547,9 +1591,10 @@ spec:
severity: warning
- alert: KubeletPlegDurationHigh
annotations:
message: The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration
of {{ $value }} seconds on node {{ $labels.node }}.
description: The Kubelet Pod Lifecycle Event Generator has a 99th percentile
duration of {{ $value }} seconds on node {{ $labels.node }}.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletplegdurationhigh
summary: Kubelet Pod Lifecycle Event Generator is taking too long to relist.
expr: |
node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"} >= 10
for: 5m
@@ -1557,9 +1602,10 @@ spec:
severity: warning
- alert: KubeletPodStartUpLatencyHigh
annotations:
message: Kubelet Pod startup 99th percentile latency is {{ $value }} seconds
description: Kubelet Pod startup 99th percentile latency is {{ $value }} seconds
on node {{ $labels.node }}.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletpodstartuplatencyhigh
summary: Kubelet Pod startup latency is too high.
expr: |
histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (instance, le)) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"} > 60
for: 15m
@@ -1567,8 +1613,9 @@ spec:
severity: warning
- alert: KubeletDown
annotations:
message: Kubelet has disappeared from Prometheus target discovery.
description: Kubelet has disappeared from Prometheus target discovery.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletdown
summary: Target disappeared from Prometheus target discovery.
expr: |
absent(up{job="kubelet", metrics_path="/metrics"} == 1)
for: 15m
@@ -1578,8 +1625,9 @@ spec:
rules:
- alert: KubeSchedulerDown
annotations:
message: KubeScheduler has disappeared from Prometheus target discovery.
description: KubeScheduler has disappeared from Prometheus target discovery.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeschedulerdown
summary: Target disappeared from Prometheus target discovery.
expr: |
absent(up{job="kube-scheduler"} == 1)
for: 15m
@@ -1589,8 +1637,10 @@ spec:
rules:
- alert: KubeControllerManagerDown
annotations:
message: KubeControllerManager has disappeared from Prometheus target discovery.
description: KubeControllerManager has disappeared from Prometheus target
discovery.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontrollermanagerdown
summary: Target disappeared from Prometheus target discovery.
expr: |
absent(up{job="kube-controller-manager"} == 1)
for: 15m
@@ -1875,7 +1925,7 @@ spec:
message: Errors while performing List operations in controller {{$labels.controller}}
in {{$labels.namespace}} namespace.
expr: |
(sum by (controller,namespace) (rate(prometheus_operator_list_operations_failed_total{job="prometheus-operator",namespace="monitoring"}[1h])) / sum by (controller,namespace) (rate(prometheus_operator_list_operations_total{job="prometheus-operator",namespace="monitoring"}[1h]))) > 0.4
(sum by (controller,namespace) (rate(prometheus_operator_list_operations_failed_total{job="prometheus-operator",namespace="monitoring"}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_list_operations_total{job="prometheus-operator",namespace="monitoring"}[10m]))) > 0.4
for: 15m
labels:
severity: warning
@@ -1884,7 +1934,7 @@ spec:
message: Errors while performing Watch operations in controller {{$labels.controller}}
in {{$labels.namespace}} namespace.
expr: |
(sum by (controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job="prometheus-operator",namespace="monitoring"}[1h])) / sum by (controller,namespace) (rate(prometheus_operator_watch_operations_total{job="prometheus-operator",namespace="monitoring"}[1h]))) > 0.4
(sum by (controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job="prometheus-operator",namespace="monitoring"}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_watch_operations_total{job="prometheus-operator",namespace="monitoring"}[10m]))) > 0.4
for: 15m
labels:
severity: warning
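A pattern worth noting throughout this file: the upstream mixins split the former free-form message annotation into a short summary plus a templated description, keeping runbook_url, so notification templates can address headline and detail separately. In mixin jsonnet, a rule in the new shape looks roughly like this (alert name and wording are illustrative):

{
  alert: 'ExampleAlert',
  expr: 'vector(1)',
  'for': '15m',
  labels: { severity: 'warning' },
  annotations: {
    summary: 'One-line headline shown in notifications.',
    description: 'Longer templated text, e.g. namespace {{ $labels.namespace }}.',
    runbook_url: 'https://example.com/runbooks#examplealert',  // illustrative URL
  },
}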

View File

@@ -68,6 +68,18 @@ spec:
scheme: https
tlsConfig:
insecureSkipVerify: true
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
honorLabels: true
interval: 30s
path: /metrics/probes
port: https-metrics
relabelings:
- sourceLabels:
- __metrics_path__
targetLabel: metrics_path
scheme: https
tlsConfig:
insecureSkipVerify: true
jobLabel: k8s-app
namespaceSelector:
matchNames:
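The added endpoint scrapes the kubelet's /metrics/probes path alongside the existing /metrics and /metrics/cadvisor endpoints; the relabeling copies the internal __metrics_path__ label into a persisted metrics_path label so that series from the different paths on the same target stay distinguishable. As a jsonnet endpoint definition mirroring the YAML above, roughly:

{
  port: 'https-metrics',
  scheme: 'https',
  path: '/metrics/probes',
  interval: '30s',
  honorLabels: true,
  bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token',
  tlsConfig: { insecureSkipVerify: true },
  relabelings: [{
    sourceLabels: ['__metrics_path__'],
    targetLabel: 'metrics_path',  // keeps /metrics vs. /metrics/probes series apart
  }],
}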

View File

@@ -645,7 +645,8 @@ spec:
type: object
type: object
baseImage:
description: Base image that is used to deploy pods, without tag.
description: 'Base image that is used to deploy pods, without tag.
Deprecated: use ''image'' instead'
type: string
clusterAdvertiseAddress:
description: 'ClusterAdvertiseAddress is the explicit address to advertise
@@ -3014,10 +3015,11 @@
to use to run the Prometheus Pods.
type: string
sha:
description: SHA of Alertmanager container image to be deployed. Defaults
to the value of `version`. Similar to a tag, but the SHA explicitly
deploys an immutable container image. Version and Tag are ignored
if SHA is set.
description: 'SHA of Alertmanager container image to be deployed.
Defaults to the value of `version`. Similar to a tag, but the SHA
explicitly deploys an immutable container image. Version and Tag
are ignored if SHA is set. Deprecated: use ''image'' instead. The
image digest can be specified as part of the image URL.'
type: string
storage:
description: Storage is the definition of how storage will be used
@@ -3279,8 +3281,10 @@
type: object
type: object
tag:
description: Tag of Alertmanager container image to be deployed. Defaults
to the value of `version`. Version is ignored if Tag is set.
description: 'Tag of Alertmanager container image to be deployed.
Defaults to the value of `version`. Version is ignored if Tag is
set. Deprecated: use ''image'' instead. The image tag can be specified
as part of the image URL.'
type: string
tolerations:
description: If specified, the pod's tolerations.

View File

@@ -900,6 +900,11 @@ spec:
required:
- alertmanagers
type: object
allowOverlappingBlocks:
description: AllowOverlappingBlocks enables vertical compaction and
vertical query merge in Prometheus. This is still experimental in
Prometheus so it may change in any upcoming release.
type: boolean
apiserverConfig:
description: APIServerConfig allows specifying a host and auth methods
to access apiserver. If left empty, Prometheus is assumed to run
@@ -1097,7 +1102,8 @@ spec:
type: boolean
type: object
baseImage:
description: Base image to use for a Prometheus deployment.
description: 'Base image to use for a Prometheus deployment. Deprecated:
use ''image'' instead'
type: string
configMaps:
description: ConfigMaps is a list of ConfigMaps in the same namespace
@@ -3432,6 +3438,95 @@ spec:
priorityClassName:
description: Priority class assigned to the Pods
type: string
probeNamespaceSelector:
description: '*Experimental* Namespaces to be selected for Probe discovery.
If nil, only check own namespace.'
properties:
matchExpressions:
description: matchExpressions is a list of label selector requirements.
The requirements are ANDed.
items:
description: A label selector requirement is a selector that
contains values, a key, and an operator that relates the key
and values.
properties:
key:
description: key is the label key that the selector applies
to.
type: string
operator:
description: operator represents a key's relationship to
a set of values. Valid operators are In, NotIn, Exists
and DoesNotExist.
type: string
values:
description: values is an array of string values. If the
operator is In or NotIn, the values array must be non-empty.
If the operator is Exists or DoesNotExist, the values
array must be empty. This array is replaced during a strategic
merge patch.
items:
type: string
type: array
required:
- key
- operator
type: object
type: array
matchLabels:
additionalProperties:
type: string
description: matchLabels is a map of {key,value} pairs. A single
{key,value} in the matchLabels map is equivalent to an element
of matchExpressions, whose key field is "key", the operator
is "In", and the values array contains only "value". The requirements
are ANDed.
type: object
type: object
probeSelector:
description: '*Experimental* Probes to be selected for target discovery.'
properties:
matchExpressions:
description: matchExpressions is a list of label selector requirements.
The requirements are ANDed.
items:
description: A label selector requirement is a selector that
contains values, a key, and an operator that relates the key
and values.
properties:
key:
description: key is the label key that the selector applies
to.
type: string
operator:
description: operator represents a key's relationship to
a set of values. Valid operators are In, NotIn, Exists
and DoesNotExist.
type: string
values:
description: values is an array of string values. If the
operator is In or NotIn, the values array must be non-empty.
If the operator is Exists or DoesNotExist, the values
array must be empty. This array is replaced during a strategic
merge patch.
items:
type: string
type: array
required:
- key
- operator
type: object
type: array
matchLabels:
additionalProperties:
type: string
description: matchLabels is a map of {key,value} pairs. A single
{key,value} in the matchLabels map is equivalent to an element
of matchExpressions, whose key field is "key", the operator
is "In", and the values array contains only "value". The requirements
are ANDed.
type: object
type: object
prometheusExternalLabelName:
description: Name of Prometheus external label used to denote Prometheus
instance name. Defaults to the value of `prometheus`. External label
@@ -4374,10 +4469,11 @@
type: object
type: object
sha:
description: SHA of Prometheus container image to be deployed. Defaults
description: 'SHA of Prometheus container image to be deployed. Defaults
to the value of `version`. Similar to a tag, but the SHA explicitly
deploys an immutable container image. Version and Tag are ignored
if SHA is set.
if SHA is set. Deprecated: use ''image'' instead. The image digest
can be specified as part of the image URL.'
type: string
storage:
description: Storage spec to specify how storage shall be used.
@@ -4638,8 +4734,10 @@
type: object
type: object
tag:
description: Tag of Prometheus container image to be deployed. Defaults
to the value of `version`. Version is ignored if Tag is set.
description: 'Tag of Prometheus container image to be deployed. Defaults
to the value of `version`. Version is ignored if Tag is set. Deprecated:
use ''image'' instead. The image tag can be specified as part of
the image URL.'
type: string
thanos:
description: "Thanos configuration allows configuring various aspects
@@ -4649,7 +4747,8 @@
without backward compatibility in any release."
properties:
baseImage:
description: Thanos base image if other than default.
description: 'Thanos base image if other than default. Deprecated:
use ''image'' instead'
type: string
grpcServerTlsConfig:
description: 'GRPCServerTLSConfig configures the gRPC server from
@@ -4842,15 +4941,17 @@
type: object
type: object
sha:
description: SHA of Thanos container image to be deployed. Defaults
description: 'SHA of Thanos container image to be deployed. Defaults
to the value of `version`. Similar to a tag, but the SHA explicitly
deploys an immutable container image. Version and Tag are ignored
if SHA is set.
if SHA is set. Deprecated: use ''image'' instead. The image
digest can be specified as part of the image URL.'
type: string
tag:
description: Tag of Thanos sidecar container image to be deployed.
description: 'Tag of Thanos sidecar container image to be deployed.
Defaults to the value of `version`. Version is ignored if Tag
is set.
is set. Deprecated: use ''image'' instead. The image tag can
be specified as part of the image URL.'
type: string
tracingConfig:
description: TracingConfig configures tracing in Thanos. This
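Besides the deprecation notes on baseImage, sha and tag, the regenerated Prometheus CRD gains the experimental allowOverlappingBlocks switch and the Probe selectors shown above. Opting in through a kube-prometheus overlay could look roughly like this (the spec field names follow the CRD; the overlay shape is the usual kube-prometheus customization pattern, and the empty-selector semantics are an assumption mirrored from the ServiceMonitor selectors):

local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + {
  prometheus+:: {
    prometheus+: {
      spec+: {
        allowOverlappingBlocks: true,  // experimental per the CRD description
        probeSelector: {},             // assumed: {} selects all Probe objects
        probeNamespaceSelector: {},    // assumed: {} means all namespaces, nil only its own
      },
    },
  },
};
kp.prometheus.prometheus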

View File

@@ -4,7 +4,7 @@ metadata:
labels:
app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/version: v0.40.0
app.kubernetes.io/version: v0.41.1
name: prometheus-operator
rules:
- apiGroups:
@@ -18,6 +18,7 @@ rules:
- thanosrulers/finalizers
- servicemonitors
- podmonitors
- probes
- prometheusrules
verbs:
- '*'

View File

@@ -4,7 +4,7 @@ metadata:
labels:
app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/version: v0.40.0
app.kubernetes.io/version: v0.41.1
name: prometheus-operator
roleRef:
apiGroup: rbac.authorization.k8s.io

View File

@@ -4,7 +4,7 @@ metadata:
labels:
app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/version: v0.40.0
app.kubernetes.io/version: v0.41.1
name: prometheus-operator
namespace: monitoring
spec:
@@ -18,15 +18,15 @@ spec:
labels:
app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/version: v0.40.0
app.kubernetes.io/version: v0.41.1
spec:
containers:
- args:
- --kubelet-service=kube-system/kubelet
- --logtostderr=true
- --config-reloader-image=jimmidyson/configmap-reload:v0.3.0
- --prometheus-config-reloader=quay.io/coreos/prometheus-config-reloader:v0.40.0
image: quay.io/coreos/prometheus-operator:v0.40.0
- --config-reloader-image=jimmidyson/configmap-reload:v0.4.0
- --prometheus-config-reloader=quay.io/coreos/prometheus-config-reloader:v0.41.1
image: quay.io/coreos/prometheus-operator:v0.41.1
name: prometheus-operator
ports:
- containerPort: 8080
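The operator upgrade moves three images together: the operator itself, the prometheus-config-reloader sidecar, which must track the operator version, and the configmap-reload helper (v0.3.0 to v0.4.0). Pinning them through the vendored jsonnet would look roughly like this; the versions key names are an assumption based on the kube-prometheus _config of that time:

local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + {
  _config+:: {
    versions+:: {
      prometheusOperator: 'v0.41.1',        // assumed keys
      prometheusConfigReloader: 'v0.41.1',  // kept in lockstep with the operator
      configmapReloader: 'v0.4.0',
    },
  },
};
kp.prometheusOperator.deployment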

View File

@@ -4,7 +4,7 @@ metadata:
labels:
app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/version: v0.40.0
app.kubernetes.io/version: v0.41.1
name: prometheus-operator
namespace: monitoring
spec:

View File

@@ -4,6 +4,6 @@ metadata:
labels:
app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/version: v0.40.0
app.kubernetes.io/version: v0.41.1
name: prometheus-operator
namespace: monitoring

View File

@@ -237,6 +237,7 @@ local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
deployment.mixin.spec.template.spec.withVolumes(volumes) +
deployment.mixin.spec.template.spec.securityContext.withRunAsNonRoot(true) +
deployment.mixin.spec.template.spec.securityContext.withRunAsUser(65534) +
deployment.mixin.spec.template.spec.securityContext.withFsGroup(65534) +
deployment.mixin.spec.template.spec.withServiceAccountName('grafana'),
},
}

View File

@@ -7,6 +7,9 @@
// instances are deployed on K8s, you will likely want to change
// this to 'instance, pod'.
etcd_instance_labels: 'instance',
// scrape_interval_seconds is the global scrape interval which can be
// used to dynamically adjust rate windows as a function of the interval.
scrape_interval_seconds: 30,
},
prometheusAlerts+:: {
@@ -21,12 +24,12 @@
sum without (%(etcd_instance_labels)s) (up{%(etcd_selector)s} == bool 0)
or
count without (To) (
sum without (%(etcd_instance_labels)s) (rate(etcd_network_peer_sent_failures_total{%(etcd_selector)s}[1m])) > 0.01
sum without (%(etcd_instance_labels)s) (rate(etcd_network_peer_sent_failures_total{%(etcd_selector)s}[%(network_failure_range)ss])) > 0.01
)
)
> 0
||| % $._config,
'for': '3m',
||| % {etcd_instance_labels: $._config.etcd_instance_labels, etcd_selector: $._config.etcd_selector, network_failure_range: $._config.scrape_interval_seconds*4},
'for': '10m',
labels: {
severity: 'critical',
},
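The hard-coded [1m] rate window is replaced by one derived from the scrape interval: four scrape periods, so the default scrape_interval_seconds: 30 yields rate(...[120s]). The for: duration also grows from 3m to 10m, presumably to make the alert less flappy on brief peer hiccups. A minimal sketch of the substitution:

local config = { scrape_interval_seconds: 30 };
local window = config.scrape_interval_seconds * 4;  // four scrape periods: 120
'rate(etcd_network_peer_sent_failures_total{job="etcd"}[%ds])' % window
// -> 'rate(etcd_network_peer_sent_failures_total{job="etcd"}[120s])'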

View File

@@ -17,16 +17,16 @@ tests:
alertname: etcdInsufficientMembers
- eval_time: 5m
alertname: etcdInsufficientMembers
- eval_time: 5m
- eval_time: 12m
alertname: etcdMembersDown
- eval_time: 7m
- eval_time: 14m
alertname: etcdMembersDown
exp_alerts:
- exp_labels:
job: etcd
severity: critical
exp_annotations:
message: 'etcd cluster "etcd": members are down (1).'
message: 'etcd cluster "etcd": members are down (3).'
- eval_time: 7m
alertname: etcdInsufficientMembers
- eval_time: 11m
@@ -49,33 +49,31 @@ tests:
- interval: 1m
input_series:
- series: 'up{job="etcd",instance="10.10.10.0"}'
values: '1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0'
values: '1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0'
- series: 'up{job="etcd",instance="10.10.10.1"}'
values: '1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0'
values: '1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0'
- series: 'up{job="etcd",instance="10.10.10.2"}'
values: '1 1 1 1 0 0 0 0'
values: '1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0'
alert_rule_test:
- eval_time: 10m
- eval_time: 14m
alertname: etcdMembersDown
exp_alerts:
- exp_labels:
job: etcd
severity: critical
exp_annotations:
message: 'etcd cluster "etcd": members are down (2).'
message: 'etcd cluster "etcd": members are down (3).'
- interval: 1m
input_series:
- series: 'up{job="etcd",instance="10.10.10.0"}'
values: '1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0'
values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0'
- series: 'up{job="etcd",instance="10.10.10.1"}'
values: '1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0'
values: '1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0'
- series: 'etcd_network_peer_sent_failures_total{To="member-1",job="etcd",endpoint="test"}'
values: '0 0 1 2 3 4 5 6 7 8 9 10'
values: '0 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18'
alert_rule_test:
- eval_time: 4m
alertname: etcdMembersDown
- eval_time: 6m
- eval_time: 13m
alertname: etcdMembersDown
exp_alerts:
- exp_labels:

View File

@@ -1,2 +0,0 @@
jsonnetfile.lock.json
vendor/

View File

@@ -1,155 +0,0 @@
local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
{
_config+:: {
namespace: 'default',
versions+:: {
alertmanager: 'v0.21.0',
},
imageRepos+:: {
alertmanager: 'quay.io/prometheus/alertmanager',
},
alertmanager+:: {
name: 'main',
config: {
global: {
resolve_timeout: '5m',
},
inhibit_rules: [{
source_match: {
severity: 'critical',
},
target_match_re: {
severity: 'warning|info',
},
equal: ['namespace', 'alertname'],
}, {
source_match: {
severity: 'warning',
},
target_match_re: {
severity: 'info',
},
equal: ['namespace', 'alertname'],
}],
route: {
group_by: ['namespace'],
group_wait: '30s',
group_interval: '5m',
repeat_interval: '12h',
receiver: 'Default',
routes: [
{
receiver: 'Watchdog',
match: {
alertname: 'Watchdog',
},
},
{
receiver: 'Critical',
match: {
severity: 'critical',
},
},
],
},
receivers: [
{
name: 'Default',
},
{
name: 'Watchdog',
},
{
name: 'Critical',
},
],
},
replicas: 3,
},
},
alertmanager+:: {
secret:
local secret = k.core.v1.secret;
if std.type($._config.alertmanager.config) == 'object' then
secret.new('alertmanager-' + $._config.alertmanager.name, {})
.withStringData({ 'alertmanager.yaml': std.manifestYamlDoc($._config.alertmanager.config) }) +
secret.mixin.metadata.withNamespace($._config.namespace)
else
secret.new('alertmanager-' + $._config.alertmanager.name, { 'alertmanager.yaml': std.base64($._config.alertmanager.config) }) +
secret.mixin.metadata.withNamespace($._config.namespace),
serviceAccount:
local serviceAccount = k.core.v1.serviceAccount;
serviceAccount.new('alertmanager-' + $._config.alertmanager.name) +
serviceAccount.mixin.metadata.withNamespace($._config.namespace),
service:
local service = k.core.v1.service;
local servicePort = k.core.v1.service.mixin.spec.portsType;
local alertmanagerPort = servicePort.newNamed('web', 9093, 'web');
service.new('alertmanager-' + $._config.alertmanager.name, { app: 'alertmanager', alertmanager: $._config.alertmanager.name }, alertmanagerPort) +
service.mixin.spec.withSessionAffinity('ClientIP') +
service.mixin.metadata.withNamespace($._config.namespace) +
service.mixin.metadata.withLabels({ alertmanager: $._config.alertmanager.name }),
serviceMonitor:
{
apiVersion: 'monitoring.coreos.com/v1',
kind: 'ServiceMonitor',
metadata: {
name: 'alertmanager',
namespace: $._config.namespace,
labels: {
'k8s-app': 'alertmanager',
},
},
spec: {
selector: {
matchLabels: {
alertmanager: $._config.alertmanager.name,
},
},
endpoints: [
{
port: 'web',
interval: '30s',
},
],
},
},
alertmanager:
{
apiVersion: 'monitoring.coreos.com/v1',
kind: 'Alertmanager',
metadata: {
name: $._config.alertmanager.name,
namespace: $._config.namespace,
labels: {
alertmanager: $._config.alertmanager.name,
},
},
spec: {
replicas: $._config.alertmanager.replicas,
version: $._config.versions.alertmanager,
image: $._config.imageRepos.alertmanager + ':' + $._config.versions.alertmanager,
nodeSelector: { 'kubernetes.io/os': 'linux' },
serviceAccountName: 'alertmanager-' + $._config.alertmanager.name,
securityContext: {
runAsUser: 1000,
runAsNonRoot: true,
fsGroup: 2000,
},
},
},
},
}
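This deleted file, like the alert libsonnets removed below, appears to be a local copy of code that the re-vendored kube-prometheus dependency now supplies, which accounts for most of the 9535 deleted lines. Only the Alertmanager configuration itself still has to be provided; against the upstream entrypoint that is a small override using the same _config.alertmanager.config field this file reads (alertmanager.yaml is a hypothetical local file holding the routing tree):

local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + {
  _config+:: {
    alertmanager+:: {
      config: importstr 'alertmanager.yaml',  // hypothetical file; string configs get base64-encoded into the Secret
      replicas: 3,
    },
  },
};
kp.alertmanager.alertmanager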

View File

@@ -1,57 +0,0 @@
{
prometheusAlerts+:: {
groups+: [
{
name: 'alertmanager.rules',
rules: [
{
alert: 'AlertmanagerConfigInconsistent',
annotations: {
message: |||
The configuration of the instances of the Alertmanager cluster `{{ $labels.namespace }}/{{ $labels.service }}` are out of sync.
{{ range printf "alertmanager_config_hash{namespace=\"%s\",service=\"%s\"}" $labels.namespace $labels.service | query }}
Configuration hash for pod {{ .Labels.pod }} is "{{ printf "%.f" .Value }}"
{{ end }}
|||,
},
expr: |||
count by(namespace,service) (count_values by(namespace,service) ("config_hash", alertmanager_config_hash{%(alertmanagerSelector)s})) != 1
||| % $._config,
'for': '5m',
labels: {
severity: 'critical',
},
},
{
alert: 'AlertmanagerFailedReload',
annotations: {
message: "Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}.",
},
expr: |||
alertmanager_config_last_reload_successful{%(alertmanagerSelector)s} == 0
||| % $._config,
'for': '10m',
labels: {
severity: 'warning',
},
},
{
alert: 'AlertmanagerMembersInconsistent',
annotations: {
message: 'Alertmanager has not found all other members of the cluster.',
},
expr: |||
alertmanager_cluster_members{%(alertmanagerSelector)s}
!= on (service) GROUP_LEFT()
count by (service) (alertmanager_cluster_members{%(alertmanagerSelector)s})
||| % $._config,
'for': '5m',
labels: {
severity: 'critical',
},
},
],
},
],
},
}

View File

@ -1,4 +0,0 @@
(import 'alertmanager.libsonnet') +
(import 'general.libsonnet') +
(import 'node.libsonnet') +
(import 'prometheus-operator.libsonnet')

View File

@ -1,38 +0,0 @@
{
prometheusAlerts+:: {
groups+: [
{
name: 'general.rules',
rules: [
{
alert: 'TargetDown',
annotations: {
message: '{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{ $labels.service }} targets in {{ $labels.namespace }} namespace are down.',
},
expr: '100 * (count(up == 0) BY (job, namespace, service) / count(up) BY (job, namespace, service)) > 10',
'for': '10m',
labels: {
severity: 'warning',
},
},
{
alert: 'Watchdog',
annotations: {
message: |||
This is an alert meant to ensure that the entire alerting pipeline is functional.
This alert is always firing, therefore it should always be firing in Alertmanager
and always fire against a receiver. There are integrations with various notification
mechanisms that send a notification when this alert is not firing. For example the
"DeadMansSnitch" integration in PagerDuty.
|||,
},
expr: 'vector(1)',
labels: {
severity: 'none',
},
},
],
},
],
},
}
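
For the Watchdog to actually guard the pipeline, the matching Alertmanager route has to keep re-notifying a heartbeat receiver that pages when notifications stop arriving. A self-contained sketch of such a config fragment (the webhook URL is a placeholder for e.g. a DeadMansSnitch endpoint):

{
  route: {
    receiver: 'Default',
    routes: [{
      receiver: 'Watchdog',
      match: { alertname: 'Watchdog' },
      repeat_interval: '1m',  // keep the heartbeat firing continuously
    }],
  },
  receivers: [
    { name: 'Default' },
    // Hypothetical heartbeat endpoint that alerts when pings stop.
    { name: 'Watchdog', webhook_configs: [{ url: 'https://example.com/heartbeat' }] },
  ],
}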

View File

@ -1,24 +0,0 @@
{
prometheusAlerts+:: {
groups+: [
{
name: 'node-network',
rules: [
{
alert: 'NodeNetworkInterfaceFlapping',
annotations: {
            message: 'Network interface "{{ $labels.device }}" is changing its up status often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}.',
},
expr: |||
changes(node_network_up{%(nodeExporterSelector)s,%(hostNetworkInterfaceSelector)s}[2m]) > 2
||| % $._config,
'for': '2m',
labels: {
severity: 'warning',
},
},
],
},
],
},
}

View File

@ -1,63 +0,0 @@
{
prometheusAlerts+:: {
groups+: [
{
name: 'prometheus-operator',
rules: [
{
alert: 'PrometheusOperatorListErrors',
expr: |||
(sum by (controller,namespace) (rate(prometheus_operator_list_operations_failed_total{%(prometheusOperatorSelector)s}[1h])) / sum by (controller,namespace) (rate(prometheus_operator_list_operations_total{%(prometheusOperatorSelector)s}[1h]))) > 0.4
||| % $._config,
labels: {
severity: 'warning',
},
annotations: {
message: 'Errors while performing List operations in controller {{$labels.controller}} in {{$labels.namespace}} namespace.',
},
'for': '15m',
},
{
alert: 'PrometheusOperatorWatchErrors',
expr: |||
(sum by (controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{%(prometheusOperatorSelector)s}[1h])) / sum by (controller,namespace) (rate(prometheus_operator_watch_operations_total{%(prometheusOperatorSelector)s}[1h]))) > 0.4
||| % $._config,
labels: {
severity: 'warning',
},
annotations: {
message: 'Errors while performing Watch operations in controller {{$labels.controller}} in {{$labels.namespace}} namespace.',
},
'for': '15m',
},
{
alert: 'PrometheusOperatorReconcileErrors',
expr: |||
rate(prometheus_operator_reconcile_errors_total{%(prometheusOperatorSelector)s}[5m]) > 0.1
||| % $._config,
labels: {
severity: 'warning',
},
annotations: {
message: 'Errors while reconciling {{ $labels.controller }} in {{ $labels.namespace }} Namespace.',
},
'for': '10m',
},
{
alert: 'PrometheusOperatorNodeLookupErrors',
expr: |||
rate(prometheus_operator_node_address_lookup_errors_total{%(prometheusOperatorSelector)s}[5m]) > 0.1
||| % $._config,
labels: {
severity: 'warning',
},
annotations: {
message: 'Errors while reconciling Prometheus in {{ $labels.namespace }} Namespace.',
},
'for': '10m',
},
],
},
],
},
}

View File

@ -1,157 +0,0 @@
# TODO(metalmatze): This file is temporarily saved here for later reference
# until we find out how to integrate the tests into our jsonnet stack.
rule_files:
- rules.yaml
evaluation_interval: 1m
tests:
- interval: 1m
input_series:
- series: 'alertmanager_cluster_members{job="alertmanager-main",instance="10.10.10.0",namespace="monitoring",pod="alertmanager-main-0",service="alertmanager-main"}'
values: '3 3 3 3 3 2 2 2 2 2 2 1 1 1 1 1 1 0 0 0 0 0 0'
- series: 'alertmanager_cluster_members{job="alertmanager-main",instance="10.10.10.1",namespace="monitoring",pod="alertmanager-main-1",service="alertmanager-main"}'
values: '3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3'
- series: 'alertmanager_cluster_members{job="alertmanager-main",instance="10.10.10.2",namespace="monitoring",pod="alertmanager-main-2",service="alertmanager-main"}'
values: '3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3'
alert_rule_test:
- eval_time: 5m
alertname: AlertmanagerMembersInconsistent
- eval_time: 11m
alertname: AlertmanagerMembersInconsistent
exp_alerts:
- exp_labels:
service: 'alertmanager-main'
severity: critical
job: 'alertmanager-main'
instance: 10.10.10.0
namespace: monitoring
pod: alertmanager-main-0
exp_annotations:
message: 'Alertmanager has not found all other members of the cluster.'
- eval_time: 17m
alertname: AlertmanagerMembersInconsistent
exp_alerts:
- exp_labels:
service: 'alertmanager-main'
severity: critical
job: 'alertmanager-main'
instance: 10.10.10.0
namespace: monitoring
pod: alertmanager-main-0
exp_annotations:
message: 'Alertmanager has not found all other members of the cluster.'
- eval_time: 23m
alertname: AlertmanagerMembersInconsistent
exp_alerts:
- exp_labels:
service: 'alertmanager-main'
severity: critical
job: 'alertmanager-main'
instance: 10.10.10.0
namespace: monitoring
pod: alertmanager-main-0
exp_annotations:
message: 'Alertmanager has not found all other members of the cluster.'
- interval: 1m
input_series:
- series: 'alertmanager_cluster_members{job="alertmanager-main",instance="10.10.10.0",namespace="monitoring",pod="alertmanager-main-0",service="alertmanager-main"}'
values: '3 3 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
- series: 'alertmanager_cluster_members{job="alertmanager-main",instance="10.10.10.1",namespace="monitoring",pod="alertmanager-main-1",service="alertmanager-main"}'
values: '3 3 3 3 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2'
- series: 'alertmanager_cluster_members{job="alertmanager-main",instance="10.10.10.2",namespace="monitoring",pod="alertmanager-main-2",service="alertmanager-main"}'
values: '3 3 3 3 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2'
alert_rule_test:
- eval_time: 5m
alertname: AlertmanagerMembersInconsistent
- eval_time: 11m
alertname: AlertmanagerMembersInconsistent
exp_alerts:
- exp_labels:
service: 'alertmanager-main'
severity: critical
job: 'alertmanager-main'
instance: 10.10.10.0
namespace: monitoring
pod: alertmanager-main-0
exp_annotations:
message: 'Alertmanager has not found all other members of the cluster.'
- exp_labels:
service: 'alertmanager-main'
severity: critical
job: 'alertmanager-main'
instance: 10.10.10.1
namespace: monitoring
pod: alertmanager-main-1
exp_annotations:
message: 'Alertmanager has not found all other members of the cluster.'
- exp_labels:
service: 'alertmanager-main'
severity: critical
job: 'alertmanager-main'
instance: 10.10.10.2
namespace: monitoring
pod: alertmanager-main-2
exp_annotations:
message: 'Alertmanager has not found all other members of the cluster.'
- eval_time: 17m
alertname: AlertmanagerMembersInconsistent
exp_alerts:
- exp_labels:
service: 'alertmanager-main'
severity: critical
job: 'alertmanager-main'
instance: 10.10.10.0
namespace: monitoring
pod: alertmanager-main-0
exp_annotations:
message: 'Alertmanager has not found all other members of the cluster.'
- exp_labels:
service: 'alertmanager-main'
severity: critical
job: 'alertmanager-main'
instance: 10.10.10.1
namespace: monitoring
pod: alertmanager-main-1
exp_annotations:
message: 'Alertmanager has not found all other members of the cluster.'
- exp_labels:
service: 'alertmanager-main'
severity: critical
job: 'alertmanager-main'
instance: 10.10.10.2
namespace: monitoring
pod: alertmanager-main-2
exp_annotations:
message: 'Alertmanager has not found all other members of the cluster.'
- eval_time: 23m
alertname: AlertmanagerMembersInconsistent
exp_alerts:
- exp_labels:
service: 'alertmanager-main'
severity: critical
job: 'alertmanager-main'
instance: 10.10.10.0
namespace: monitoring
pod: alertmanager-main-0
exp_annotations:
message: 'Alertmanager has not found all other members of the cluster.'
- exp_labels:
service: 'alertmanager-main'
severity: critical
job: 'alertmanager-main'
instance: 10.10.10.1
namespace: monitoring
pod: alertmanager-main-1
exp_annotations:
message: 'Alertmanager has not found all other members of the cluster.'
- exp_labels:
service: 'alertmanager-main'
severity: critical
job: 'alertmanager-main'
instance: 10.10.10.2
namespace: monitoring
pod: alertmanager-main-2
exp_annotations:
message: 'Alertmanager has not found all other members of the cluster.'
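
Until the integration mentioned in the TODO above lands, these tests can still be run by hand: render the alert groups into the rules.yaml referenced by rule_files, place this file next to it, and run promtool test rules against it (the exact invocation assumes a reasonably recent promtool; how the rules are rendered is left to your jsonnet tooling).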

View File

@ -1,50 +0,0 @@
[
// Drop all kubelet metrics which are deprecated in kubernetes.
{
sourceLabels: ['__name__'],
regex: 'kubelet_(pod_worker_latency_microseconds|pod_start_latency_microseconds|cgroup_manager_latency_microseconds|pod_worker_start_latency_microseconds|pleg_relist_latency_microseconds|pleg_relist_interval_microseconds|runtime_operations|runtime_operations_latency_microseconds|runtime_operations_errors|eviction_stats_age_microseconds|device_plugin_registration_count|device_plugin_alloc_latency_microseconds|network_plugin_operations_latency_microseconds)',
action: 'drop',
},
// Drop all scheduler metrics which are deprecated in kubernetes.
{
sourceLabels: ['__name__'],
regex: 'scheduler_(e2e_scheduling_latency_microseconds|scheduling_algorithm_predicate_evaluation|scheduling_algorithm_priority_evaluation|scheduling_algorithm_preemption_evaluation|scheduling_algorithm_latency_microseconds|binding_latency_microseconds|scheduling_latency_seconds)',
action: 'drop',
},
// Drop all apiserver metrics which are deprecated in kubernetes.
{
sourceLabels: ['__name__'],
regex: 'apiserver_(request_count|request_latencies|request_latencies_summary|dropped_requests|storage_data_key_generation_latencies_microseconds|storage_transformation_failures_total|storage_transformation_latencies_microseconds|proxy_tunnel_sync_latency_secs)',
action: 'drop',
},
// Drop all docker metrics which are deprecated in kubernetes.
{
sourceLabels: ['__name__'],
regex: 'kubelet_docker_(operations|operations_latency_microseconds|operations_errors|operations_timeout)',
action: 'drop',
},
// Drop all reflector metrics which are deprecated in kubernetes.
{
sourceLabels: ['__name__'],
regex: 'reflector_(items_per_list|items_per_watch|list_duration_seconds|lists_total|short_watches_total|watch_duration_seconds|watches_total)',
action: 'drop',
},
// Drop all etcd metrics which are deprecated in kubernetes.
{
sourceLabels: ['__name__'],
regex: 'etcd_(helper_cache_hit_count|helper_cache_miss_count|helper_cache_entry_count|request_cache_get_latencies_summary|request_cache_add_latencies_summary|request_latencies_summary)',
action: 'drop',
},
// Drop all transformation metrics which are deprecated in kubernetes.
{
sourceLabels: ['__name__'],
regex: 'transformation_(transformation_latencies_microseconds|failures_total)',
action: 'drop',
},
// Drop all other metrics which are deprecated in kubernetes.
{
sourceLabels: ['__name__'],
    regex: '(admission_quota_controller_adds|crd_autoregistration_controller_work_duration|APIServiceOpenAPIAggregationControllerQueue1_adds|AvailableConditionController_retries|crd_openapi_controller_unfinished_work_seconds|APIServiceRegistrationController_retries|admission_quota_controller_longest_running_processor_microseconds|crdEstablishing_longest_running_processor_microseconds|crdEstablishing_unfinished_work_seconds|crd_openapi_controller_adds|crd_autoregistration_controller_retries|crd_finalizer_queue_latency|AvailableConditionController_work_duration|non_structural_schema_condition_controller_depth|crd_autoregistration_controller_unfinished_work_seconds|AvailableConditionController_adds|DiscoveryController_longest_running_processor_microseconds|autoregister_queue_latency|crd_autoregistration_controller_adds|non_structural_schema_condition_controller_work_duration|APIServiceRegistrationController_adds|crd_finalizer_work_duration|crd_naming_condition_controller_unfinished_work_seconds|crd_openapi_controller_longest_running_processor_microseconds|DiscoveryController_adds|crd_autoregistration_controller_longest_running_processor_microseconds|autoregister_unfinished_work_seconds|crd_naming_condition_controller_queue_latency|crd_naming_condition_controller_retries|non_structural_schema_condition_controller_queue_latency|crd_naming_condition_controller_depth|AvailableConditionController_longest_running_processor_microseconds|crdEstablishing_depth|crd_finalizer_longest_running_processor_microseconds|crd_naming_condition_controller_adds|APIServiceOpenAPIAggregationControllerQueue1_longest_running_processor_microseconds|DiscoveryController_queue_latency|DiscoveryController_unfinished_work_seconds|crd_openapi_controller_depth|APIServiceOpenAPIAggregationControllerQueue1_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_unfinished_work_seconds|DiscoveryController_work_duration|autoregister_adds|crd_autoregistration_controller_queue_latency|crd_finalizer_retries|AvailableConditionController_unfinished_work_seconds|autoregister_longest_running_processor_microseconds|non_structural_schema_condition_controller_unfinished_work_seconds|APIServiceOpenAPIAggregationControllerQueue1_depth|AvailableConditionController_depth|DiscoveryController_retries|admission_quota_controller_depth|crdEstablishing_adds|APIServiceOpenAPIAggregationControllerQueue1_retries|crdEstablishing_queue_latency|non_structural_schema_condition_controller_longest_running_processor_microseconds|autoregister_work_duration|crd_openapi_controller_retries|APIServiceRegistrationController_work_duration|crdEstablishing_work_duration|crd_finalizer_adds|crd_finalizer_depth|crd_openapi_controller_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_work_duration|APIServiceRegistrationController_queue_latency|crd_autoregistration_controller_depth|AvailableConditionController_queue_latency|admission_quota_controller_queue_latency|crd_naming_condition_controller_work_duration|crd_openapi_controller_work_duration|DiscoveryController_depth|crd_naming_condition_controller_longest_running_processor_microseconds|APIServiceRegistrationController_depth|APIServiceRegistrationController_longest_running_processor_microseconds|crd_finalizer_unfinished_work_seconds|crdEstablishing_retries|admission_quota_controller_unfinished_work_seconds|non_structural_schema_condition_controller_adds|APIServiceRegistrationController_unfinished_work_seconds|admission_quota_controller_work_duration|autoregister_depth|autoregister_retries|kubeproxy_sync_proxy_rules_latency_microseconds|rest_client_request_latency_seconds|non_structural_schema_condition_controller_retries)',
action: 'drop',
},
]
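
These relabelings are meant to be spliced into a ServiceMonitor's metricRelabelings. A sketch of applying them to every kubelet endpoint (the import path is an assumption about how this file is vendored):

local dropDeprecated = import 'kube-prometheus/dropping-deprecated-metrics-relabelings.libsonnet';  // assumed path
{
  prometheus+:: {
    serviceMonitorKubelet+: {
      spec+: {
        endpoints: std.map(
          function(e) e { metricRelabelings+: dropDeprecated },
          super.endpoints
        ),
      },
    },
  },
}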

View File

@ -1,89 +0,0 @@
{
"version": 1,
"dependencies": [
{
"source": {
"git": {
"remote": "https://github.com/brancz/kubernetes-grafana",
"subdir": "grafana"
}
},
"version": "master"
},
{
"source": {
"git": {
"remote": "https://github.com/coreos/etcd",
"subdir": "Documentation/etcd-mixin"
}
},
"version": "master"
},
{
"source": {
"git": {
"remote": "https://github.com/coreos/prometheus-operator",
"subdir": "jsonnet/prometheus-operator"
}
},
"version": "release-0.40"
},
{
"source": {
"git": {
"remote": "https://github.com/ksonnet/ksonnet-lib",
"subdir": ""
}
},
"version": "master",
"name": "ksonnet"
},
{
"source": {
"git": {
"remote": "https://github.com/kubernetes-monitoring/kubernetes-mixin",
"subdir": ""
}
},
"version": "master"
},
{
"source": {
"git": {
"remote": "https://github.com/kubernetes/kube-state-metrics",
"subdir": "jsonnet/kube-state-metrics"
}
},
"version": "master"
},
{
"source": {
"git": {
"remote": "https://github.com/kubernetes/kube-state-metrics",
"subdir": "jsonnet/kube-state-metrics-mixin"
}
},
"version": "master"
},
{
"source": {
"git": {
"remote": "https://github.com/prometheus/node_exporter",
"subdir": "docs/node-mixin"
}
},
"version": "master"
},
{
"source": {
"git": {
"remote": "https://github.com/prometheus/prometheus",
"subdir": "documentation/prometheus-mixin"
}
},
"version": "master",
"name": "prometheus"
}
],
"legacyImports": true
}

View File

@ -1,118 +0,0 @@
local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
{
_config+:: {
versions+:: {
clusterVerticalAutoscaler: "v0.8.1"
},
imageRepos+:: {
clusterVerticalAutoscaler: 'gcr.io/google_containers/cpvpa-amd64'
},
    kubeStateMetrics+:: {
      // baseCPU/baseMemory are referenced by the autoscaler config below and
      // must be provided; the values here are assumed defaults.
      baseCPU: '100m',
      baseMemory: '150Mi',
      stepCPU: '1m',
      stepMemory: '2Mi',
    },
},
ksmAutoscaler+:: {
clusterRole:
local clusterRole = k.rbac.v1.clusterRole;
local rulesType = clusterRole.rulesType;
local rules = [
rulesType.new() +
rulesType.withApiGroups(['']) +
rulesType.withResources([
'nodes',
]) +
rulesType.withVerbs(['list', 'watch']),
];
clusterRole.new() +
clusterRole.mixin.metadata.withName('ksm-autoscaler') +
clusterRole.withRules(rules),
clusterRoleBinding:
local clusterRoleBinding = k.rbac.v1.clusterRoleBinding;
clusterRoleBinding.new() +
clusterRoleBinding.mixin.metadata.withName('ksm-autoscaler') +
clusterRoleBinding.mixin.roleRef.withApiGroup('rbac.authorization.k8s.io') +
clusterRoleBinding.mixin.roleRef.withName('ksm-autoscaler') +
clusterRoleBinding.mixin.roleRef.mixinInstance({ kind: 'ClusterRole' }) +
clusterRoleBinding.withSubjects([{ kind: 'ServiceAccount', name: 'ksm-autoscaler', namespace: $._config.namespace }]),
roleBinding:
local roleBinding = k.rbac.v1.roleBinding;
roleBinding.new() +
roleBinding.mixin.metadata.withName('ksm-autoscaler') +
roleBinding.mixin.metadata.withNamespace($._config.namespace) +
roleBinding.mixin.roleRef.withApiGroup('rbac.authorization.k8s.io') +
roleBinding.mixin.roleRef.withName('ksm-autoscaler') +
roleBinding.mixin.roleRef.mixinInstance({ kind: 'Role' }) +
roleBinding.withSubjects([{ kind: 'ServiceAccount', name: 'ksm-autoscaler' }]),
role:
local role = k.rbac.v1.role;
local rulesType = role.rulesType;
local extensionsRule = rulesType.new() +
rulesType.withApiGroups(['extensions']) +
rulesType.withResources([
'deployments',
]) +
rulesType.withVerbs(['patch']) +
rulesType.withResourceNames(['kube-state-metrics']);
local appsRule = rulesType.new() +
rulesType.withApiGroups(['apps']) +
rulesType.withResources([
'deployments',
]) +
rulesType.withVerbs(['patch']) +
rulesType.withResourceNames(['kube-state-metrics']);
local rules = [extensionsRule, appsRule];
role.new() +
role.mixin.metadata.withName('ksm-autoscaler') +
role.mixin.metadata.withNamespace($._config.namespace) +
role.withRules(rules),
serviceAccount:
local serviceAccount = k.core.v1.serviceAccount;
serviceAccount.new('ksm-autoscaler') +
serviceAccount.mixin.metadata.withNamespace($._config.namespace),
deployment:
local deployment = k.apps.v1.deployment;
local container = deployment.mixin.spec.template.spec.containersType;
local podSelector = deployment.mixin.spec.template.spec.selectorType;
local podLabels = { app: 'ksm-autoscaler' };
local kubeStateMetricsAutoscaler =
container.new('ksm-autoscaler', $._config.imageRepos.clusterVerticalAutoscaler + ':' + $._config.versions.clusterVerticalAutoscaler) +
container.withArgs([
'/cpvpa',
'--target=deployment/kube-state-metrics',
'--namespace=' + $._config.namespace,
'--logtostderr=true',
'--poll-period-seconds=10',
'--default-config={"kube-state-metrics":{"requests":{"cpu":{"base":"' + $._config.kubeStateMetrics.baseCPU + '","step":"' + $._config.kubeStateMetrics.stepCPU + '","nodesPerStep":1},"memory":{"base":"' + $._config.kubeStateMetrics.baseMemory + '","step":"' + $._config.kubeStateMetrics.stepMemory + '","nodesPerStep":1}},"limits":{"cpu":{"base":"' + $._config.kubeStateMetrics.baseCPU + '","step":"' + $._config.kubeStateMetrics.stepCPU + '","nodesPerStep":1},"memory":{"base":"' + $._config.kubeStateMetrics.baseMemory + '","step":"' + $._config.kubeStateMetrics.stepMemory + '","nodesPerStep":1}}}}'
]) +
container.mixin.resources.withRequests({cpu: '20m', memory: '10Mi'});
local c = [kubeStateMetricsAutoscaler];
deployment.new('ksm-autoscaler', 1, c, podLabels) +
deployment.mixin.metadata.withNamespace($._config.namespace) +
deployment.mixin.metadata.withLabels(podLabels) +
deployment.mixin.spec.selector.withMatchLabels(podLabels) +
deployment.mixin.spec.template.spec.withNodeSelector({ 'kubernetes.io/os': 'linux' }) +
deployment.mixin.spec.template.spec.securityContext.withRunAsNonRoot(true) +
deployment.mixin.spec.template.spec.securityContext.withRunAsUser(65534) +
deployment.mixin.spec.template.spec.withServiceAccountName('ksm-autoscaler'),
},
}

View File

@ -1,20 +0,0 @@
local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
{
prometheus+:: {
clusterRole+: {
rules+:
local role = k.rbac.v1.role;
local policyRule = role.rulesType;
local rule = policyRule.new() +
policyRule.withApiGroups(['']) +
policyRule.withResources([
'services',
'endpoints',
'pods',
]) +
policyRule.withVerbs(['get', 'list', 'watch']);
[rule]
},
}
}

View File

@ -1,41 +0,0 @@
local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
local statefulSet = k.apps.v1.statefulSet;
local affinity = statefulSet.mixin.spec.template.spec.affinity.podAntiAffinity.preferredDuringSchedulingIgnoredDuringExecutionType;
local matchExpression = affinity.mixin.podAffinityTerm.labelSelector.matchExpressionsType;
{
local antiaffinity(key, values, namespace) = {
affinity: {
podAntiAffinity: {
preferredDuringSchedulingIgnoredDuringExecution: [
affinity.new() +
affinity.withWeight(100) +
affinity.mixin.podAffinityTerm.withNamespaces(namespace) +
affinity.mixin.podAffinityTerm.withTopologyKey('kubernetes.io/hostname') +
affinity.mixin.podAffinityTerm.labelSelector.withMatchExpressions([
matchExpression.new() +
matchExpression.withKey(key) +
matchExpression.withOperator('In') +
matchExpression.withValues(values),
]),
],
},
},
},
alertmanager+:: {
alertmanager+: {
spec+:
antiaffinity('alertmanager', [$._config.alertmanager.name], $._config.namespace),
},
},
prometheus+: {
local p = self,
prometheus+: {
spec+:
antiaffinity('prometheus', [p.name], p.namespace),
},
},
}

View File

@ -1,23 +0,0 @@
local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
local service = k.core.v1.service;
local servicePort = k.core.v1.service.mixin.spec.portsType;
{
prometheus+:: {
kubeControllerManagerPrometheusDiscoveryService:
service.new('kube-controller-manager-prometheus-discovery', { 'k8s-app': 'kube-controller-manager' }, servicePort.newNamed('https-metrics', 10257, 10257)) +
service.mixin.metadata.withNamespace('kube-system') +
service.mixin.metadata.withLabels({ 'k8s-app': 'kube-controller-manager' }) +
service.mixin.spec.withClusterIp('None'),
kubeSchedulerPrometheusDiscoveryService:
service.new('kube-scheduler-prometheus-discovery', { 'k8s-app': 'kube-scheduler' }, servicePort.newNamed('https-metrics', 10259, 10259)) +
service.mixin.metadata.withNamespace('kube-system') +
service.mixin.metadata.withLabels({ 'k8s-app': 'kube-scheduler' }) +
service.mixin.spec.withClusterIp('None'),
kubeDnsPrometheusDiscoveryService:
service.new('kube-dns-prometheus-discovery', { 'k8s-app': 'kube-dns' }, [servicePort.newNamed('http-metrics-skydns', 10055, 10055), servicePort.newNamed('http-metrics-dnsmasq', 10054, 10054)]) +
service.mixin.metadata.withNamespace('kube-system') +
service.mixin.metadata.withLabels({ 'k8s-app': 'kube-dns' }) +
service.mixin.spec.withClusterIp('None'),
},
}

View File

@ -1,20 +0,0 @@
local l = import 'lib/lib.libsonnet';
// withImageRepository is a mixin that replaces all image prefixes with the given repository, e.g.
// quay.io/coreos/addon-resizer -> $repository/addon-resizer
// grafana/grafana -> $repository/grafana
local withImageRepository(repository) = {
local oldRepos = super._config.imageRepos,
local substituteRepository(image, repository) =
if repository == null then image else repository + '/' + l.imageName(image),
_config+:: {
imageRepos:: {
[field]: substituteRepository(oldRepos[field], repository),
for field in std.objectFields(oldRepos)
}
},
};
{
withImageRepository:: withImageRepository,
}
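
A sketch of using the mixin to pull every image from an internal mirror (the import paths and registry host are assumptions):

local configMixins = import 'kube-prometheus/kube-prometheus-config-mixins.libsonnet';  // assumed path
local kp = (import 'kube-prometheus/kube-prometheus.libsonnet')
           + configMixins.withImageRepository('registry.example.com/monitoring');
kp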

View File

@ -1,197 +0,0 @@
local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
// The custom metrics API allows the HPA v2 to scale based on arbitrary metrics.
// For more details on usage visit https://github.com/DirectXMan12/k8s-prometheus-adapter#quick-links
{
_config+:: {
prometheusAdapter+:: {
// Rules for custom-metrics
config+:: {
rules+: [
{
seriesQuery: '{__name__=~"^container_.*",container!="POD",namespace!="",pod!=""}',
seriesFilters: [],
resources: {
overrides: {
namespace: {
resource: 'namespace'
},
pod: {
resource: 'pod'
}
},
},
name: {
matches: '^container_(.*)_seconds_total$',
as: ""
},
metricsQuery: 'sum(rate(<<.Series>>{<<.LabelMatchers>>,container!="POD"}[1m])) by (<<.GroupBy>>)'
},
{
seriesQuery: '{__name__=~"^container_.*",container!="POD",namespace!="",pod!=""}',
seriesFilters: [
{ isNot: '^container_.*_seconds_total$' },
],
resources: {
overrides: {
namespace: {
resource: 'namespace'
},
pod: {
resource: 'pod'
}
},
},
name: {
matches: '^container_(.*)_total$',
as: ''
},
metricsQuery: 'sum(rate(<<.Series>>{<<.LabelMatchers>>,container!="POD"}[1m])) by (<<.GroupBy>>)'
},
{
seriesQuery: '{__name__=~"^container_.*",container!="POD",namespace!="",pod!=""}',
seriesFilters: [
{ isNot: '^container_.*_total$' },
],
resources: {
overrides: {
namespace: {
resource: 'namespace'
},
pod: {
resource: 'pod'
}
},
},
name: {
matches: '^container_(.*)$',
as: ''
},
metricsQuery: 'sum(<<.Series>>{<<.LabelMatchers>>,container!="POD"}) by (<<.GroupBy>>)'
},
{
seriesQuery: '{namespace!="",__name__!~"^container_.*"}',
seriesFilters: [
{ isNot: '.*_total$' },
],
resources: {
template: '<<.Resource>>'
},
name: {
matches: '',
as: ''
},
metricsQuery: 'sum(<<.Series>>{<<.LabelMatchers>>}) by (<<.GroupBy>>)'
},
{
seriesQuery: '{namespace!="",__name__!~"^container_.*"}',
seriesFilters: [
{ isNot: '.*_seconds_total' },
],
resources: {
template: '<<.Resource>>'
},
name: {
matches: '^(.*)_total$',
as: ''
},
metricsQuery: 'sum(rate(<<.Series>>{<<.LabelMatchers>>}[1m])) by (<<.GroupBy>>)'
},
{
seriesQuery: '{namespace!="",__name__!~"^container_.*"}',
seriesFilters: [],
resources: {
template: '<<.Resource>>'
},
name: {
matches: '^(.*)_seconds_total$',
as: ''
},
metricsQuery: 'sum(rate(<<.Series>>{<<.LabelMatchers>>}[1m])) by (<<.GroupBy>>)'
}
],
},
},
},
prometheusAdapter+:: {
customMetricsApiService: {
apiVersion: 'apiregistration.k8s.io/v1',
kind: 'APIService',
metadata: {
name: 'v1beta1.custom.metrics.k8s.io',
},
spec: {
service: {
name: $.prometheusAdapter.service.metadata.name,
namespace: $._config.namespace,
},
group: 'custom.metrics.k8s.io',
version: 'v1beta1',
insecureSkipTLSVerify: true,
groupPriorityMinimum: 100,
versionPriority: 100,
},
},
customMetricsApiServiceV1Beta2: {
apiVersion: 'apiregistration.k8s.io/v1',
kind: 'APIService',
metadata: {
name: 'v1beta2.custom.metrics.k8s.io',
},
spec: {
service: {
name: $.prometheusAdapter.service.metadata.name,
namespace: $._config.namespace,
},
group: 'custom.metrics.k8s.io',
version: 'v1beta2',
insecureSkipTLSVerify: true,
groupPriorityMinimum: 100,
versionPriority: 200,
},
},
customMetricsClusterRoleServerResources:
local clusterRole = k.rbac.v1.clusterRole;
local policyRule = clusterRole.rulesType;
local rules =
policyRule.new() +
policyRule.withApiGroups(['custom.metrics.k8s.io']) +
policyRule.withResources(['*']) +
policyRule.withVerbs(['*']);
clusterRole.new() +
clusterRole.mixin.metadata.withName('custom-metrics-server-resources') +
clusterRole.withRules(rules),
customMetricsClusterRoleBindingServerResources:
local clusterRoleBinding = k.rbac.v1.clusterRoleBinding;
clusterRoleBinding.new() +
clusterRoleBinding.mixin.metadata.withName('custom-metrics-server-resources') +
clusterRoleBinding.mixin.roleRef.withApiGroup('rbac.authorization.k8s.io') +
clusterRoleBinding.mixin.roleRef.withName('custom-metrics-server-resources') +
clusterRoleBinding.mixin.roleRef.mixinInstance({ kind: 'ClusterRole' }) +
clusterRoleBinding.withSubjects([{
kind: 'ServiceAccount',
name: $.prometheusAdapter.serviceAccount.metadata.name,
namespace: $._config.namespace,
}]),
customMetricsClusterRoleBindingHPA:
local clusterRoleBinding = k.rbac.v1.clusterRoleBinding;
clusterRoleBinding.new() +
clusterRoleBinding.mixin.metadata.withName('hpa-controller-custom-metrics') +
clusterRoleBinding.mixin.roleRef.withApiGroup('rbac.authorization.k8s.io') +
clusterRoleBinding.mixin.roleRef.withName('custom-metrics-server-resources') +
clusterRoleBinding.mixin.roleRef.mixinInstance({ kind: 'ClusterRole' }) +
clusterRoleBinding.withSubjects([{
kind: 'ServiceAccount',
name: 'horizontal-pod-autoscaler',
namespace: 'kube-system',
}]),
}
}
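
Once both APIServices are registered, an HPA v2 can scale on any metric the adapter discovers. A hedged sketch in the same style (the deployment name, metric, and target values are illustrative only):

{
  prometheusAdapter+:: {
    hpaExample:: {  // hidden field, shown for illustration; not rendered into manifests
      apiVersion: 'autoscaling/v2beta1',
      kind: 'HorizontalPodAutoscaler',
      metadata: { name: 'example-app', namespace: 'default' },
      spec: {
        scaleTargetRef: { apiVersion: 'apps/v1', kind: 'Deployment', name: 'example-app' },
        minReplicas: 1,
        maxReplicas: 10,
        metrics: [{
          type: 'Pods',
          // 'http_requests' would be derived from http_requests_total by the '^(.*)_total$' rule above.
          pods: { metricName: 'http_requests', targetAverageValue: '500m' },
        }],
      },
    },
  },
}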

View File

@ -1,82 +0,0 @@
local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
local service = k.core.v1.service;
local servicePort = k.core.v1.service.mixin.spec.portsType;
{
_config+:: {
eks: {
minimumAvailableIPs: 10,
minimumAvailableIPsTime: '10m'
}
},
prometheus+: {
serviceMonitorCoreDNS+: {
spec+: {
endpoints: [
{
bearerTokenFile: "/var/run/secrets/kubernetes.io/serviceaccount/token",
interval: "15s",
targetPort: 9153
}
]
},
},
AwsEksCniMetricService:
service.new('aws-node', { 'k8s-app' : 'aws-node' } , servicePort.newNamed('cni-metrics-port', 61678, 61678)) +
service.mixin.metadata.withNamespace('kube-system') +
service.mixin.metadata.withLabels({ 'k8s-app': 'aws-node' }) +
service.mixin.spec.withClusterIp('None'),
serviceMonitorAwsEksCNI:
{
apiVersion: 'monitoring.coreos.com/v1',
kind: 'ServiceMonitor',
metadata: {
name: 'awsekscni',
namespace: $._config.namespace,
labels: {
'k8s-app': 'eks-cni',
},
},
spec: {
jobLabel: 'k8s-app',
selector: {
matchLabels: {
'k8s-app': 'aws-node',
},
},
namespaceSelector: {
matchNames: [
'kube-system',
],
},
endpoints: [
{
port: 'cni-metrics-port',
interval: '30s',
path: '/metrics',
},
],
},
},
},
prometheusRules+: {
groups+: [
{
name: 'kube-prometheus-eks.rules',
rules: [
{
expr: 'sum by(instance) (awscni_total_ip_addresses) - sum by(instance) (awscni_assigned_ip_addresses) < %s' % $._config.eks.minimumAvailableIPs,
labels: {
severity: 'critical',
},
annotations: {
              message: 'Instance {{ $labels.instance }} has fewer than %s IPs available.' % $._config.eks.minimumAvailableIPs
},
'for': $._config.eks.minimumAvailableIPsTime,
alert: 'EksAvailableIPs'
},
],
},
],
},
}

View File

@ -1,46 +0,0 @@
{
prometheus+:: {
serviceMonitorKubelet+:
{
spec+: {
endpoints: [
{
port: 'http-metrics',
scheme: 'http',
interval: '30s',
bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token',
relabelings: [
{
sourceLabels: ['__metrics_path__'],
targetLabel: 'metrics_path'
},
],
},
{
port: 'http-metrics',
scheme: 'http',
path: '/metrics/cadvisor',
interval: '30s',
honorLabels: true,
bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token',
relabelings: [
{
sourceLabels: ['__metrics_path__'],
targetLabel: 'metrics_path'
},
],
metricRelabelings: [
// Drop a bunch of metrics which are disabled but still sent, see
// https://github.com/google/cadvisor/issues/1925.
{
sourceLabels: ['__name__'],
regex: 'container_(network_tcp_usage_total|network_udp_usage_total|tasks_state|cpu_load_average_10s)',
action: 'drop',
},
],
},
],
},
},
},
}

View File

@ -1,13 +0,0 @@
local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
local service = k.core.v1.service;
local servicePort = k.core.v1.service.mixin.spec.portsType;
{
prometheus+:: {
kubeDnsPrometheusDiscoveryService:
service.new('kube-dns-prometheus-discovery', { 'k8s-app': 'kube-dns' }, [servicePort.newNamed('metrics', 9153, 9153)]) +
service.mixin.metadata.withNamespace('kube-system') +
service.mixin.metadata.withLabels({ 'k8s-app': 'kube-dns' }) +
service.mixin.spec.withClusterIp('None'),
},
}

View File

@ -1,23 +0,0 @@
local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
local service = k.core.v1.service;
local servicePort = k.core.v1.service.mixin.spec.portsType;
{
prometheus+:: {
kubeControllerManagerPrometheusDiscoveryService:
service.new('kube-controller-manager-prometheus-discovery', { 'k8s-app': 'kube-controller-manager' }, servicePort.newNamed('https-metrics', 10257, 10257)) +
service.mixin.metadata.withNamespace('kube-system') +
service.mixin.metadata.withLabels({ 'k8s-app': 'kube-controller-manager' }) +
service.mixin.spec.withClusterIp('None'),
kubeSchedulerPrometheusDiscoveryService:
service.new('kube-scheduler-prometheus-discovery', { 'k8s-app': 'kube-scheduler' }, servicePort.newNamed('https-metrics', 10259, 10259)) +
service.mixin.metadata.withNamespace('kube-system') +
service.mixin.metadata.withLabels({ 'k8s-app': 'kube-scheduler' }) +
service.mixin.spec.withClusterIp('None'),
kubeDnsPrometheusDiscoveryService:
service.new('kube-dns-prometheus-discovery', { 'k8s-app': 'kube-dns' }, [servicePort.newNamed('metrics', 10055, 10055), servicePort.newNamed('http-metrics-dnsmasq', 10054, 10054)]) +
service.mixin.metadata.withNamespace('kube-system') +
service.mixin.metadata.withLabels({ 'k8s-app': 'kube-dns' }) +
service.mixin.spec.withClusterIp('None'),
},
}

View File

@ -1,8 +0,0 @@
local kp = (import 'kube-prometheus/kube-prometheus.libsonnet');
{ ['0prometheus-operator-' + name]: kp.prometheusOperator[name] for name in std.objectFields(kp.prometheusOperator) } +
{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } +
{ ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } +
{ ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } +
{ ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } +
{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) }
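
A typical render of this file writes one manifest per top-level field, e.g. jsonnet -J vendor -m manifests example.jsonnet (the vendor/ path assumes the conventional jsonnet-bundler layout; convert the JSON output to YAML if your tooling prefers it).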

View File

@ -1,18 +0,0 @@
local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
local service = k.core.v1.service;
local servicePort = k.core.v1.service.mixin.spec.portsType;
{
prometheus+: {
kubeControllerManagerPrometheusDiscoveryService:
service.new('kube-controller-manager-prometheus-discovery', { 'k8s-app': 'kube-controller-manager' }, servicePort.newNamed('https-metrics', 10257, 10257)) +
service.mixin.metadata.withNamespace('kube-system') +
service.mixin.metadata.withLabels({ 'k8s-app': 'kube-controller-manager' }) +
service.mixin.spec.withClusterIp('None'),
kubeSchedulerPrometheusDiscoveryService:
service.new('kube-scheduler-prometheus-discovery', { 'k8s-app': 'kube-scheduler' }, servicePort.newNamed('https-metrics', 10259, 10259)) +
service.mixin.metadata.withNamespace('kube-system') +
service.mixin.metadata.withLabels({ 'k8s-app': 'kube-scheduler' }) +
service.mixin.spec.withClusterIp('None'),
},
}

View File

@ -1,18 +0,0 @@
local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
local service = k.core.v1.service;
local servicePort = k.core.v1.service.mixin.spec.portsType;
{
prometheus+: {
kubeControllerManagerPrometheusDiscoveryService:
service.new('kube-controller-manager-prometheus-discovery', { component: 'kube-controller-manager' }, servicePort.newNamed('https-metrics', 10257, 10257)) +
service.mixin.metadata.withNamespace('kube-system') +
service.mixin.metadata.withLabels({ 'k8s-app': 'kube-controller-manager' }) +
service.mixin.spec.withClusterIp('None'),
kubeSchedulerPrometheusDiscoveryService:
service.new('kube-scheduler-prometheus-discovery', { component: 'kube-scheduler' }, servicePort.newNamed('https-metrics', 10259, 10259)) +
service.mixin.metadata.withNamespace('kube-system') +
service.mixin.metadata.withLabels({ 'k8s-app': 'kube-scheduler' }) +
service.mixin.spec.withClusterIp('None'),
},
}

View File

@ -1,40 +0,0 @@
local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
local service = k.core.v1.service;
local servicePort = k.core.v1.service.mixin.spec.portsType;
{
prometheus+: {
kubeControllerManagerPrometheusDiscoveryService:
service.new('kube-controller-manager-prometheus-discovery', { 'component': 'kube-controller-manager' }, servicePort.newNamed('https-metrics', 10257, 10257)) +
service.mixin.metadata.withNamespace('kube-system') +
service.mixin.metadata.withLabels({ 'k8s-app': 'kube-controller-manager' }) +
service.mixin.spec.withClusterIp('None'),
kubeSchedulerPrometheusDiscoveryService:
service.new('kube-scheduler-prometheus-discovery', { 'component': 'kube-scheduler' }, servicePort.newNamed('https-metrics', 10259, 10259)) +
service.mixin.metadata.withNamespace('kube-system') +
service.mixin.metadata.withLabels({ 'k8s-app': 'kube-scheduler' }) +
service.mixin.spec.withClusterIp('None'),
serviceMonitorKubeScheduler+: {
spec+: {
selector+: {
matchLabels: {
'k8s-app': 'kube-scheduler',
},
},
},
},
serviceMonitorKubeControllerManager+: {
spec+: {
selector+: {
matchLabels: {
'k8s-app': 'kube-controller-manager',
},
},
},
},
},
}

View File

@ -1,35 +0,0 @@
// On managed Kubernetes clusters some of the control plane components are not exposed to customers.
// Disable scrape jobs, service monitors, and alert groups for these components by overwriting 'kube-prometheus.libsonnet' defaults
{
_config+:: {
// This snippet walks the original object (super.jobs, set as temp var j) and creates a replacement jobs object
// excluding any members of the set specified (eg: controller and scheduler).
local j = super.jobs,
jobs: {
[k]: j[k]
for k in std.objectFields(j)
if !std.setMember(k, ['KubeControllerManager', 'KubeScheduler'])
},
// Skip alerting rules too
prometheus+:: {
rules+:: {
local g = super.groups,
groups: [
h
for h in g
if !std.setMember(h.name, ['kubernetes-system-controller-manager', 'kubernetes-system-scheduler'])
],
},
},
},
// Same as above but for ServiceMonitor's
local p = super.prometheus,
prometheus: {
[q]: p[q]
for q in std.objectFields(p)
if !std.setMember(q, ['serviceMonitorKubeControllerManager', 'serviceMonitorKubeScheduler'])
},
}
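
Like the other addons, this one is applied last so that its filtered jobs, rule groups, and ServiceMonitor set override the defaults. A sketch (the addon file name is an assumption):

local kp = (import 'kube-prometheus/kube-prometheus.libsonnet')
           + (import 'kube-prometheus/kube-prometheus-managed-cluster.libsonnet');  // assumed file name
kp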

View File

@ -1,21 +0,0 @@
local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
local service = k.core.v1.service;
local servicePort = k.core.v1.service.mixin.spec.portsType;
{
prometheus+: {
service+:
service.mixin.spec.withPorts(servicePort.newNamed('web', 9090, 'web') + servicePort.withNodePort(30900)) +
service.mixin.spec.withType('NodePort'),
},
alertmanager+: {
service+:
service.mixin.spec.withPorts(servicePort.newNamed('web', 9093, 'web') + servicePort.withNodePort(30903)) +
service.mixin.spec.withType('NodePort'),
},
grafana+: {
service+:
service.mixin.spec.withPorts(servicePort.newNamed('http', 3000, 'http') + servicePort.withNodePort(30902)) +
service.mixin.spec.withType('NodePort'),
},
}

View File

@ -1,99 +0,0 @@
local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
(import 'etcd-mixin/mixin.libsonnet') + {
_config+:: {
etcd: {
ips: [],
clientCA: null,
clientKey: null,
clientCert: null,
serverName: null,
insecureSkipVerify: null,
},
},
prometheus+:: {
serviceEtcd:
local service = k.core.v1.service;
local servicePort = k.core.v1.service.mixin.spec.portsType;
local etcdServicePort = servicePort.newNamed('metrics', 2379, 2379);
service.new('etcd', null, etcdServicePort) +
service.mixin.metadata.withNamespace('kube-system') +
service.mixin.metadata.withLabels({ 'k8s-app': 'etcd' }) +
service.mixin.spec.withClusterIp('None'),
endpointsEtcd:
local endpoints = k.core.v1.endpoints;
local endpointSubset = endpoints.subsetsType;
local endpointPort = endpointSubset.portsType;
local etcdPort = endpointPort.new() +
endpointPort.withName('metrics') +
endpointPort.withPort(2379) +
endpointPort.withProtocol('TCP');
local subset = endpointSubset.new() +
endpointSubset.withAddresses([
{ ip: etcdIP }
for etcdIP in $._config.etcd.ips
]) +
endpointSubset.withPorts(etcdPort);
endpoints.new() +
endpoints.mixin.metadata.withName('etcd') +
endpoints.mixin.metadata.withNamespace('kube-system') +
endpoints.mixin.metadata.withLabels({ 'k8s-app': 'etcd' }) +
endpoints.withSubsets(subset),
serviceMonitorEtcd:
{
apiVersion: 'monitoring.coreos.com/v1',
kind: 'ServiceMonitor',
metadata: {
name: 'etcd',
namespace: 'kube-system',
labels: {
'k8s-app': 'etcd',
},
},
spec: {
jobLabel: 'k8s-app',
endpoints: [
{
port: 'metrics',
interval: '30s',
scheme: 'https',
            // Prometheus Operator (and Prometheus) allow us to specify a tlsConfig. This is required, as your etcd metrics endpoints are most likely secured.
tlsConfig: {
caFile: '/etc/prometheus/secrets/kube-etcd-client-certs/etcd-client-ca.crt',
keyFile: '/etc/prometheus/secrets/kube-etcd-client-certs/etcd-client.key',
certFile: '/etc/prometheus/secrets/kube-etcd-client-certs/etcd-client.crt',
[if $._config.etcd.serverName != null then 'serverName']: $._config.etcd.serverName,
[if $._config.etcd.insecureSkipVerify != null then 'insecureSkipVerify']: $._config.etcd.insecureSkipVerify,
},
},
],
selector: {
matchLabels: {
'k8s-app': 'etcd',
},
},
},
},
secretEtcdCerts:
      // Prometheus Operator allows us to mount secrets into the Prometheus pod. Loaded as files, the certificates above become available to the tlsConfig.
local secret = k.core.v1.secret;
secret.new('kube-etcd-client-certs', {
'etcd-client-ca.crt': std.base64($._config.etcd.clientCA),
'etcd-client.key': std.base64($._config.etcd.clientKey),
'etcd-client.crt': std.base64($._config.etcd.clientCert),
}) +
secret.mixin.metadata.withNamespace($._config.namespace),
prometheus+:
{
// Reference info: https://coreos.com/operators/prometheus/docs/latest/api.html#prometheusspec
spec+: {
secrets+: [$.prometheus.secretEtcdCerts.metadata.name],
},
},
},
}
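
Wiring this addon up means supplying the endpoints and client certificates through $._config.etcd. A sketch with placeholder values (the addon file name and the local certificate paths are assumptions):

local kp = (import 'kube-prometheus/kube-prometheus.libsonnet')
           + (import 'kube-prometheus/kube-prometheus-static-etcd.libsonnet')  // assumed file name
           + {
             _config+:: {
               etcd+: {
                 ips: ['10.0.0.10', '10.0.0.11', '10.0.0.12'],  // placeholder etcd node IPs
                 // Assumed local PEM files, inlined at build time.
                 clientCA: importstr 'etcd-client-ca.crt',
                 clientKey: importstr 'etcd-client.key',
                 clientCert: importstr 'etcd-client.crt',
                 serverName: 'etcd.kube-system.svc.cluster.local',  // placeholder SNI name
               },
             },
           };
kp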

View File

@ -1,35 +0,0 @@
// Strips spec.containers[].limits for certain containers
// https://github.com/coreos/kube-prometheus/issues/72
{
_config+:: {
resources+:: {
'addon-resizer'+: {
limits: {},
},
'kube-rbac-proxy'+: {
limits: {},
},
'kube-state-metrics'+: {
limits: {},
},
'node-exporter'+: {
limits: {},
},
},
},
prometheusOperator+: {
deployment+: {
spec+: {
template+: {
spec+: {
local addArgs(c) =
if c.name == 'prometheus-operator'
then c + {args+: ['--config-reloader-cpu=0']}
else c,
containers: std.map(addArgs, super.containers),
},
},
},
},
},
}

View File

@ -1,39 +0,0 @@
local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
local service = k.core.v1.service;
local servicePort = k.core.v1.service.mixin.spec.portsType;
{
_config+:: {
versions+:: {
thanos: 'v0.14.0',
},
imageRepos+:: {
thanos: 'quay.io/thanos/thanos',
},
thanos+:: {
objectStorageConfig: {
      key: 'thanos.yaml', // The key (file name) of the config inside the secret
name: 'thanos-objectstorage', // This is the name of your Kubernetes secret with the config
},
},
},
prometheus+:: {
// Add the grpc port to the Prometheus service to be able to query it with the Thanos Querier
service+: {
spec+: {
ports+: [
servicePort.newNamed('grpc', 10901, 10901),
],
},
},
prometheus+: {
spec+: {
thanos+: {
version: $._config.versions.thanos,
image: $._config.imageRepos.thanos + ':' + $._config.versions.thanos,
objectStorageConfig: $._config.thanos.objectStorageConfig,
},
},
},
},
}
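
The sidecar only references the object storage secret; it still has to exist. A sketch of a mixin generating it alongside the other Prometheus objects (the S3 provider, bucket, and endpoint are placeholders; see the Thanos objstore documentation for real fields):

local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
{
  prometheus+:: {
    secretThanosObjectStorage:
      local secret = k.core.v1.secret;
      secret.new($._config.thanos.objectStorageConfig.name, {
        [$._config.thanos.objectStorageConfig.key]: std.base64(std.manifestYamlDoc({
          type: 'S3',  // placeholder provider
          config: { bucket: 'example-bucket', endpoint: 's3.example.com' },
        })),
      }) +
      secret.mixin.metadata.withNamespace($._config.namespace),
  },
}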

View File

@ -1,189 +0,0 @@
local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
local service = k.core.v1.service;
local servicePort = k.core.v1.service.mixin.spec.portsType;
{
prometheus+: {
serviceWeaveNet:
service.new('weave-net', { 'name': 'weave-net' }, servicePort.newNamed('weave-net-metrics', 6782, 6782)) +
service.mixin.metadata.withNamespace('kube-system') +
service.mixin.metadata.withLabels({ 'k8s-app': 'weave-net' }) +
service.mixin.spec.withClusterIp('None'),
serviceMonitorWeaveNet: {
apiVersion: 'monitoring.coreos.com/v1',
kind: 'ServiceMonitor',
metadata: {
name: 'weave-net',
labels: {
'k8s-app': 'weave-net',
},
namespace: 'monitoring',
},
spec: {
jobLabel: 'k8s-app',
endpoints: [
{
port: 'weave-net-metrics',
path: '/metrics',
interval: '15s',
},
],
namespaceSelector: {
matchNames: [
'kube-system',
],
},
selector: {
matchLabels: {
'k8s-app': 'weave-net',
},
},
},
},
},
prometheusRules+: {
groups+: [
{
name: 'weave-net',
rules: [
{
alert: 'WeaveNetIPAMSplitBrain',
expr: 'max(weave_ipam_unreachable_percentage) - min(weave_ipam_unreachable_percentage) > 0',
'for': '3m',
labels: {
severity: 'critical',
},
annotations: {
              summary: 'Percentage of all IP addresses owned by unreachable peers is not the same on every node.',
description: 'actionable: Weave Net network has a split brain problem. Please find the problem and fix it.',
},
},
{
alert: 'WeaveNetIPAMUnreachable',
expr: 'weave_ipam_unreachable_percentage > 25',
'for': '10m',
labels: {
severity: 'critical',
},
annotations: {
summary: 'Percentage of all IP addresses owned by unreachable peers is above threshold.',
description: 'actionable: Please find the problem and fix it.',
},
},
{
alert: 'WeaveNetIPAMPendingAllocates',
expr: 'sum(weave_ipam_pending_allocates) > 0',
'for': '3m',
labels: {
severity: 'critical',
},
annotations: {
summary: 'Number of pending allocates is above the threshold.',
description: 'actionable: Please find the problem and fix it.',
},
},
{
alert: 'WeaveNetIPAMPendingClaims',
expr: 'sum(weave_ipam_pending_claims) > 0',
'for': '3m',
labels: {
severity: 'critical',
},
annotations: {
summary: 'Number of pending claims is above the threshold.',
description: 'actionable: Please find the problem and fix it.',
},
},
{
alert: 'WeaveNetFastDPFlowsLow',
expr: 'sum(weave_flows) < 15000',
'for': '3m',
labels: {
severity: 'critical',
},
annotations: {
summary: 'Number of FastDP flows is below the threshold.',
description: 'actionable: Please find the reason for FastDP flows to go below the threshold and fix it.',
},
},
{
alert: 'WeaveNetFastDPFlowsOff',
expr: 'sum(weave_flows == bool 0) > 0',
'for': '3m',
labels: {
severity: 'critical',
},
annotations: {
summary: 'FastDP flows is zero.',
description: 'actionable: Please find the reason for FastDP flows to be off and fix it.',
},
},
{
alert: 'WeaveNetHighConnectionTerminationRate',
expr: 'rate(weave_connection_terminations_total[5m]) > 0.1',
'for': '5m',
labels: {
severity: 'critical',
},
annotations: {
summary: 'A lot of connections are getting terminated.',
description: 'actionable: Please find the reason for the high connection termination rate and fix it.',
},
},
{
alert: 'WeaveNetConnectionsConnecting',
expr: 'sum(weave_connections{state="connecting"}) > 0',
'for': '3m',
labels: {
severity: 'critical',
},
annotations: {
summary: 'A lot of connections are in connecting state.',
description: 'actionable: Please find the reason for this and fix it.',
},
},
{
            alert: 'WeaveNetConnectionsRetrying',
expr: 'sum(weave_connections{state="retrying"}) > 0',
'for': '3m',
labels: {
severity: 'critical',
},
annotations: {
summary: 'A lot of connections are in retrying state.',
description: 'actionable: Please find the reason for this and fix it.',
},
},
{
alert: 'WeaveNetConnectionsPending',
expr: 'sum(weave_connections{state="pending"}) > 0',
'for': '3m',
labels: {
severity: 'critical',
},
annotations: {
summary: 'A lot of connections are in pending state.',
description: 'actionable: Please find the reason for this and fix it.',
},
},
{
alert: 'WeaveNetConnectionsFailed',
expr: 'sum(weave_connections{state="failed"}) > 0',
'for': '3m',
labels: {
severity: 'critical',
},
annotations: {
summary: 'A lot of connections are in failed state.',
description: 'actionable: Please find the reason and fix it.',
},
},
],
},
],
},
grafanaDashboards+:: {
'weave-net.json': (import 'grafana-weave-net.json'),
'weave-net-cluster.json': (import 'grafana-weave-net-cluster.json'),
},
}

View File

@ -1,196 +0,0 @@
local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
local k3 = import 'ksonnet/ksonnet.beta.3/k.libsonnet';
local configMapList = k3.core.v1.configMapList;
(import 'grafana/grafana.libsonnet') +
(import 'kube-state-metrics/kube-state-metrics.libsonnet') +
(import 'kube-state-metrics-mixin/mixin.libsonnet') +
(import 'node-exporter/node-exporter.libsonnet') +
(import 'node-mixin/mixin.libsonnet') +
(import 'alertmanager/alertmanager.libsonnet') +
(import 'prometheus-operator/prometheus-operator.libsonnet') +
(import 'prometheus/prometheus.libsonnet') +
(import 'prometheus-adapter/prometheus-adapter.libsonnet') +
(import 'kubernetes-mixin/mixin.libsonnet') +
(import 'prometheus/mixin.libsonnet') +
(import 'alerts/alerts.libsonnet') +
(import 'rules/rules.libsonnet') + {
kubePrometheus+:: {
namespace: k.core.v1.namespace.new($._config.namespace),
},
prometheusOperator+:: {
service+: {
spec+: {
ports: [
{
name: 'https',
port: 8443,
targetPort: 'https',
},
],
},
},
serviceMonitor+: {
spec+: {
endpoints: [
{
port: 'https',
scheme: 'https',
honorLabels: true,
bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token',
tlsConfig: {
insecureSkipVerify: true,
},
},
]
},
},
clusterRole+: {
rules+: [
{
apiGroups: ['authentication.k8s.io'],
resources: ['tokenreviews'],
verbs: ['create'],
},
{
apiGroups: ['authorization.k8s.io'],
resources: ['subjectaccessreviews'],
verbs: ['create'],
},
],
},
} +
((import 'kube-prometheus/kube-rbac-proxy/container.libsonnet') {
config+:: {
kubeRbacProxy: {
local cfg = self,
image: $._config.imageRepos.kubeRbacProxy + ':' + $._config.versions.kubeRbacProxy,
name: 'kube-rbac-proxy',
securePortName: 'https',
securePort: 8443,
secureListenAddress: ':%d' % self.securePort,
upstream: 'http://127.0.0.1:8080/',
tlsCipherSuites: $._config.tlsCipherSuites,
},
},
}).deploymentMixin,
grafana+:: {
dashboardDefinitions: configMapList.new(super.dashboardDefinitions),
serviceMonitor: {
apiVersion: 'monitoring.coreos.com/v1',
kind: 'ServiceMonitor',
metadata: {
name: 'grafana',
namespace: $._config.namespace,
},
spec: {
selector: {
matchLabels: {
app: 'grafana',
},
},
endpoints: [
{
port: 'http',
interval: '15s',
},
],
},
},
},
} + {
_config+:: {
namespace: 'default',
versions+:: {
grafana: '7.1.0',
},
tlsCipherSuites: [
'TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256', // required by h2: http://golang.org/cl/30721
'TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256', // required by h2: http://golang.org/cl/30721
// 'TLS_RSA_WITH_RC4_128_SHA', // insecure: https://access.redhat.com/security/cve/cve-2013-2566
// 'TLS_RSA_WITH_3DES_EDE_CBC_SHA', // insecure: https://access.redhat.com/articles/2548661
// 'TLS_RSA_WITH_AES_128_CBC_SHA', // disabled by h2
// 'TLS_RSA_WITH_AES_256_CBC_SHA', // disabled by h2
// 'TLS_RSA_WITH_AES_128_CBC_SHA256', // insecure: https://access.redhat.com/security/cve/cve-2013-0169
// 'TLS_RSA_WITH_AES_128_GCM_SHA256', // disabled by h2
// 'TLS_RSA_WITH_AES_256_GCM_SHA384', // disabled by h2
// 'TLS_ECDHE_ECDSA_WITH_RC4_128_SHA', // insecure: https://access.redhat.com/security/cve/cve-2013-2566
// 'TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA', // disabled by h2
// 'TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA', // disabled by h2
// 'TLS_ECDHE_RSA_WITH_RC4_128_SHA', // insecure: https://access.redhat.com/security/cve/cve-2013-2566
// 'TLS_ECDHE_RSA_WITH_3DES_EDE_CBC_SHA', // insecure: https://access.redhat.com/articles/2548661
// 'TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA', // disabled by h2
// 'TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA', // disabled by h2
// 'TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256', // insecure: https://access.redhat.com/security/cve/cve-2013-0169
// 'TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256', // insecure: https://access.redhat.com/security/cve/cve-2013-0169
// disabled by h2 means: https://github.com/golang/net/blob/e514e69ffb8bc3c76a71ae40de0118d794855992/http2/ciphers.go
'TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384',
'TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384',
'TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305',
'TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305',
],
cadvisorSelector: 'job="kubelet", metrics_path="/metrics/cadvisor"',
kubeletSelector: 'job="kubelet", metrics_path="/metrics"',
kubeStateMetricsSelector: 'job="kube-state-metrics"',
nodeExporterSelector: 'job="node-exporter"',
fsSpaceFillingUpCriticalThreshold: 15,
notKubeDnsSelector: 'job!="kube-dns"',
kubeSchedulerSelector: 'job="kube-scheduler"',
kubeControllerManagerSelector: 'job="kube-controller-manager"',
kubeApiserverSelector: 'job="apiserver"',
coreDNSSelector: 'job="kube-dns"',
podLabel: 'pod',
alertmanagerSelector: 'job="alertmanager-' + $._config.alertmanager.name + '",namespace="' + $._config.namespace + '"',
prometheusSelector: 'job="prometheus-' + $._config.prometheus.name + '",namespace="' + $._config.namespace + '"',
prometheusName: '{{$labels.namespace}}/{{$labels.pod}}',
prometheusOperatorSelector: 'job="prometheus-operator",namespace="' + $._config.namespace + '"',
jobs: {
Kubelet: $._config.kubeletSelector,
KubeScheduler: $._config.kubeSchedulerSelector,
KubeControllerManager: $._config.kubeControllerManagerSelector,
KubeAPI: $._config.kubeApiserverSelector,
KubeStateMetrics: $._config.kubeStateMetricsSelector,
NodeExporter: $._config.nodeExporterSelector,
Alertmanager: $._config.alertmanagerSelector,
Prometheus: $._config.prometheusSelector,
PrometheusOperator: $._config.prometheusOperatorSelector,
CoreDNS: $._config.coreDNSSelector,
},
resources+:: {
'addon-resizer': {
requests: { cpu: '10m', memory: '30Mi' },
limits: { cpu: '50m', memory: '30Mi' },
},
'kube-rbac-proxy': {
requests: { cpu: '10m', memory: '20Mi' },
limits: { cpu: '20m', memory: '40Mi' },
},
'kube-state-metrics': {
requests: { cpu: '100m', memory: '150Mi' },
limits: { cpu: '100m', memory: '150Mi' },
},
'node-exporter': {
requests: { cpu: '102m', memory: '180Mi' },
limits: { cpu: '250m', memory: '180Mi' },
},
},
prometheus+:: {
rules: $.prometheusRules + $.prometheusAlerts,
},
grafana+:: {
dashboards: $.grafanaDashboards,
},
},
}

View File

@ -1,91 +0,0 @@
local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
local deployment = k.apps.v1.deployment;
local container = deployment.mixin.spec.template.spec.containersType;
local containerPort = container.portsType;
{
local krp = self,
config+:: {
kubeRbacProxy: {
image: error 'must provide image',
name: error 'must provide name',
securePortName: error 'must provide securePortName',
securePort: error 'must provide securePort',
secureListenAddress: error 'must provide secureListenAddress',
upstream: error 'must provide upstream',
tlsCipherSuites: error 'must provide tlsCipherSuites',
},
},
specMixin:: {
local sm = self,
config+:: {
kubeRbacProxy: {
image: error 'must provide image',
name: error 'must provide name',
securePortName: error 'must provide securePortName',
securePort: error 'must provide securePort',
secureListenAddress: error 'must provide secureListenAddress',
upstream: error 'must provide upstream',
tlsCipherSuites: error 'must provide tlsCipherSuites',
},
},
spec+: {
template+: {
spec+: {
containers+: [
container.new(krp.config.kubeRbacProxy.name, krp.config.kubeRbacProxy.image) +
container.mixin.securityContext.withRunAsUser(65534) +
container.withArgs([
'--logtostderr',
'--secure-listen-address=' + krp.config.kubeRbacProxy.secureListenAddress,
'--tls-cipher-suites=' + std.join(',', krp.config.kubeRbacProxy.tlsCipherSuites),
'--upstream=' + krp.config.kubeRbacProxy.upstream,
]) +
container.withPorts(containerPort.newNamed(krp.config.kubeRbacProxy.securePort, krp.config.kubeRbacProxy.securePortName)),
],
},
},
},
},
deploymentMixin:: {
local dm = self,
config+:: {
kubeRbacProxy: {
image: error 'must provide image',
name: error 'must provide name',
securePortName: error 'must provide securePortName',
securePort: error 'must provide securePort',
secureListenAddress: error 'must provide secureListenAddress',
upstream: error 'must provide upstream',
tlsCipherSuites: error 'must provide tlsCipherSuites',
},
},
deployment+: krp.specMixin {
config+:: {
kubeRbacProxy+: dm.config.kubeRbacProxy,
},
},
},
statefulSetMixin:: {
local sm = self,
config+:: {
kubeRbacProxy: {
image: error 'must provide image',
name: error 'must provide name',
securePortName: error 'must provide securePortName',
securePort: error 'must provide securePort',
secureListenAddress: error 'must provide secureListenAddress',
upstream: error 'must provide upstream',
tlsCipherSuites: error 'must provide tlsCipherSuites',
},
},
statefulSet+: krp.specMixin {
config+:: {
kubeRbacProxy+: sm.config.kubeRbacProxy,
},
},
},
}

View File

@ -1,129 +0,0 @@
{
_config+:: {
versions+:: {
kubeStateMetrics: '1.9.5',
},
imageRepos+:: {
kubeStateMetrics: 'quay.io/coreos/kube-state-metrics',
},
kubeStateMetrics+:: {
scrapeInterval: '30s',
scrapeTimeout: '30s',
},
},
kubeStateMetrics+:: (import 'kube-state-metrics/kube-state-metrics.libsonnet') +
{
local ksm = self,
name:: 'kube-state-metrics',
namespace:: $._config.namespace,
version:: $._config.versions.kubeStateMetrics,
image:: $._config.imageRepos.kubeStateMetrics + ':v' + $._config.versions.kubeStateMetrics,
service+: {
spec+: {
ports: [
{
name: 'https-main',
port: 8443,
targetPort: 'https-main',
},
{
name: 'https-self',
port: 9443,
targetPort: 'https-self',
},
],
},
},
deployment+: {
spec+: {
template+: {
spec+: {
containers: std.map(function(c) c {
ports:: null,
livenessProbe:: null,
readinessProbe:: null,
args: ['--host=127.0.0.1', '--port=8081', '--telemetry-host=127.0.0.1', '--telemetry-port=8082'],
}, super.containers),
},
},
},
},
serviceMonitor:
{
apiVersion: 'monitoring.coreos.com/v1',
kind: 'ServiceMonitor',
metadata: {
name: 'kube-state-metrics',
namespace: $._config.namespace,
labels: {
'app.kubernetes.io/name': 'kube-state-metrics',
'app.kubernetes.io/version': ksm.version,
},
},
spec: {
jobLabel: 'app.kubernetes.io/name',
selector: {
matchLabels: {
'app.kubernetes.io/name': 'kube-state-metrics',
},
},
endpoints: [
{
port: 'https-main',
scheme: 'https',
interval: $._config.kubeStateMetrics.scrapeInterval,
scrapeTimeout: $._config.kubeStateMetrics.scrapeTimeout,
honorLabels: true,
bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token',
relabelings: [
{
regex: '(pod|service|endpoint|namespace)',
action: 'labeldrop',
},
],
tlsConfig: {
insecureSkipVerify: true,
},
},
{
port: 'https-self',
scheme: 'https',
interval: $._config.kubeStateMetrics.scrapeInterval,
bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token',
tlsConfig: {
insecureSkipVerify: true,
},
},
],
},
},
} +
((import 'kube-prometheus/kube-rbac-proxy/container.libsonnet') {
config+:: {
kubeRbacProxy: {
local cfg = self,
image: $._config.imageRepos.kubeRbacProxy + ':' + $._config.versions.kubeRbacProxy,
name: 'kube-rbac-proxy-main',
securePortName: 'https-main',
securePort: 8443,
secureListenAddress: ':%d' % self.securePort,
upstream: 'http://127.0.0.1:8081/',
tlsCipherSuites: $._config.tlsCipherSuites,
},
},
}).deploymentMixin +
((import 'kube-prometheus/kube-rbac-proxy/container.libsonnet') {
config+:: {
kubeRbacProxy: {
local cfg = self,
image: $._config.imageRepos.kubeRbacProxy + ':' + $._config.versions.kubeRbacProxy,
name: 'kube-rbac-proxy-self',
securePortName: 'https-self',
securePort: 9443,
secureListenAddress: ':%d' % self.securePort,
upstream: 'http://127.0.0.1:8082/',
tlsCipherSuites: $._config.tlsCipherSuites,
},
},
}).deploymentMixin,
}

View File

@ -1,21 +0,0 @@
// imageName extracts the image name from a fully qualified image string, e.g.
// quay.io/coreos/addon-resizer -> addon-resizer
// grafana/grafana -> grafana
local imageName(image) =
local parts = std.split(image, '/');
local len = std.length(parts);
if len == 3 then
// registry.com/org/image
parts[2]
else if len == 2 then
// org/image
parts[1]
else if len == 1 then
// image, e.g. busybox
parts[0]
else
error 'unknown image format: ' + image;
{
imageName:: imageName,
}
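
A quick sketch of what imageName evaluates to, following the comment's own examples; the import path is an assumption about where this helper file sits:

local lib = import 'image.libsonnet';

{
  threePart: lib.imageName('quay.io/coreos/addon-resizer'),  // 'addon-resizer'
  twoPart: lib.imageName('grafana/grafana'),                 // 'grafana'
  onePart: lib.imageName('busybox'),                         // 'busybox'
}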

View File

@ -1 +0,0 @@
(import 'image.libsonnet')

View File

@ -1,205 +0,0 @@
local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
{
_config+:: {
namespace: 'default',
versions+:: {
nodeExporter: 'v0.18.1',
kubeRbacProxy: 'v0.4.1',
},
imageRepos+:: {
nodeExporter: 'quay.io/prometheus/node-exporter',
kubeRbacProxy: 'quay.io/coreos/kube-rbac-proxy',
},
nodeExporter+:: {
listenAddress: '127.0.0.1',
port: 9100,
labels: {
'app.kubernetes.io/name': 'node-exporter',
'app.kubernetes.io/version': $._config.versions.nodeExporter,
},
selectorLabels: {
[labelName]: $._config.nodeExporter.labels[labelName]
for labelName in std.objectFields($._config.nodeExporter.labels)
if !std.setMember(labelName, ['app.kubernetes.io/version'])
},
},
},
nodeExporter+:: {
clusterRoleBinding:
local clusterRoleBinding = k.rbac.v1.clusterRoleBinding;
clusterRoleBinding.new() +
clusterRoleBinding.mixin.metadata.withName('node-exporter') +
clusterRoleBinding.mixin.roleRef.withApiGroup('rbac.authorization.k8s.io') +
clusterRoleBinding.mixin.roleRef.withName('node-exporter') +
clusterRoleBinding.mixin.roleRef.mixinInstance({ kind: 'ClusterRole' }) +
clusterRoleBinding.withSubjects([{ kind: 'ServiceAccount', name: 'node-exporter', namespace: $._config.namespace }]),
clusterRole:
local clusterRole = k.rbac.v1.clusterRole;
local policyRule = clusterRole.rulesType;
local authenticationRole = policyRule.new() +
policyRule.withApiGroups(['authentication.k8s.io']) +
policyRule.withResources([
'tokenreviews',
]) +
policyRule.withVerbs(['create']);
local authorizationRole = policyRule.new() +
policyRule.withApiGroups(['authorization.k8s.io']) +
policyRule.withResources([
'subjectaccessreviews',
]) +
policyRule.withVerbs(['create']);
local rules = [authenticationRole, authorizationRole];
clusterRole.new() +
clusterRole.mixin.metadata.withName('node-exporter') +
clusterRole.withRules(rules),
daemonset:
local daemonset = k.apps.v1.daemonSet;
local container = daemonset.mixin.spec.template.spec.containersType;
local volume = daemonset.mixin.spec.template.spec.volumesType;
local containerPort = container.portsType;
local containerVolumeMount = container.volumeMountsType;
local podSelector = daemonset.mixin.spec.template.spec.selectorType;
local toleration = daemonset.mixin.spec.template.spec.tolerationsType;
local containerEnv = container.envType;
local podLabels = $._config.nodeExporter.labels;
local selectorLabels = $._config.nodeExporter.selectorLabels;
local existsToleration = toleration.new() +
toleration.withOperator('Exists');
local procVolumeName = 'proc';
local procVolume = volume.fromHostPath(procVolumeName, '/proc');
local procVolumeMount = containerVolumeMount.new(procVolumeName, '/host/proc');
local sysVolumeName = 'sys';
local sysVolume = volume.fromHostPath(sysVolumeName, '/sys');
local sysVolumeMount = containerVolumeMount.new(sysVolumeName, '/host/sys');
local rootVolumeName = 'root';
local rootVolume = volume.fromHostPath(rootVolumeName, '/');
local rootVolumeMount = containerVolumeMount.new(rootVolumeName, '/host/root').
withMountPropagation('HostToContainer').
withReadOnly(true);
local nodeExporter =
container.new('node-exporter', $._config.imageRepos.nodeExporter + ':' + $._config.versions.nodeExporter) +
container.withArgs([
'--web.listen-address=' + std.join(':', [$._config.nodeExporter.listenAddress, std.toString($._config.nodeExporter.port)]),
'--path.procfs=/host/proc',
'--path.sysfs=/host/sys',
'--path.rootfs=/host/root',
'--no-collector.wifi',
'--no-collector.hwmon',
'--collector.filesystem.ignored-mount-points=^/(dev|proc|sys|var/lib/docker/.+|var/lib/kubelet/pods/.+)($|/)',
]) +
container.withVolumeMounts([procVolumeMount, sysVolumeMount, rootVolumeMount]) +
container.mixin.resources.withRequests($._config.resources['node-exporter'].requests) +
container.mixin.resources.withLimits($._config.resources['node-exporter'].limits);
local ip = containerEnv.fromFieldPath('IP', 'status.podIP');
local proxy =
container.new('kube-rbac-proxy', $._config.imageRepos.kubeRbacProxy + ':' + $._config.versions.kubeRbacProxy) +
container.withArgs([
'--logtostderr',
'--secure-listen-address=[$(IP)]:' + $._config.nodeExporter.port,
'--tls-cipher-suites=' + std.join(',', $._config.tlsCipherSuites),
'--upstream=http://127.0.0.1:' + $._config.nodeExporter.port + '/',
]) +
// Keep `hostPort` here, rather than in the node-exporter container
// because Kubernetes mandates that if you define a `hostPort` then
// `containerPort` must match. In our case, we are splitting the
// host port and container port between the two containers.
// We'll keep the port specification here so that the named port
// used by the service is tied to the proxy container. We *could*
// forgo declaring the host port, however it is important to declare
// it so that the scheduler can decide if the pod is schedulable.
container.withPorts(containerPort.new($._config.nodeExporter.port) + containerPort.withHostPort($._config.nodeExporter.port) + containerPort.withName('https')) +
container.mixin.resources.withRequests($._config.resources['kube-rbac-proxy'].requests) +
container.mixin.resources.withLimits($._config.resources['kube-rbac-proxy'].limits) +
container.withEnv([ip]);
local c = [nodeExporter, proxy];
daemonset.new() +
daemonset.mixin.metadata.withName('node-exporter') +
daemonset.mixin.metadata.withNamespace($._config.namespace) +
daemonset.mixin.metadata.withLabels(podLabels) +
daemonset.mixin.spec.selector.withMatchLabels(selectorLabels) +
daemonset.mixin.spec.template.metadata.withLabels(podLabels) +
daemonset.mixin.spec.template.spec.withTolerations([existsToleration]) +
daemonset.mixin.spec.template.spec.withNodeSelector({ 'kubernetes.io/os': 'linux' }) +
daemonset.mixin.spec.template.spec.withContainers(c) +
daemonset.mixin.spec.template.spec.withVolumes([procVolume, sysVolume, rootVolume]) +
daemonset.mixin.spec.template.spec.securityContext.withRunAsNonRoot(true) +
daemonset.mixin.spec.template.spec.securityContext.withRunAsUser(65534) +
daemonset.mixin.spec.template.spec.withServiceAccountName('node-exporter') +
daemonset.mixin.spec.template.spec.withHostPid(true) +
daemonset.mixin.spec.template.spec.withHostNetwork(true),
serviceAccount:
local serviceAccount = k.core.v1.serviceAccount;
serviceAccount.new('node-exporter') +
serviceAccount.mixin.metadata.withNamespace($._config.namespace),
serviceMonitor:
{
apiVersion: 'monitoring.coreos.com/v1',
kind: 'ServiceMonitor',
metadata: {
name: 'node-exporter',
namespace: $._config.namespace,
labels: $._config.nodeExporter.labels,
},
spec: {
jobLabel: 'app.kubernetes.io/name',
selector: {
matchLabels: $._config.nodeExporter.selectorLabels,
},
endpoints: [
{
port: 'https',
scheme: 'https',
interval: '15s',
bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token',
relabelings: [
{
action: 'replace',
regex: '(.*)',
replacement: '$1',
sourceLabels: ['__meta_kubernetes_pod_node_name'],
targetLabel: 'instance',
},
],
tlsConfig: {
insecureSkipVerify: true,
},
},
],
},
},
service:
local service = k.core.v1.service;
local servicePort = k.core.v1.service.mixin.spec.portsType;
local nodeExporterPort = servicePort.newNamed('https', $._config.nodeExporter.port, 'https');
service.new('node-exporter', $._config.nodeExporter.selectorLabels, nodeExporterPort) +
service.mixin.metadata.withNamespace($._config.namespace) +
service.mixin.metadata.withLabels($._config.nodeExporter.labels) +
service.mixin.spec.withClusterIp('None'),
},
}
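
Everything in this file hangs off `_config`, so consumers tune it with a plain object merge. A minimal sketch, assuming the file is importable as node-exporter.libsonnet and that $._config.resources and $._config.tlsCipherSuites are supplied elsewhere in the stack; the namespace and port values are illustrative:

(import 'node-exporter.libsonnet') + {
  _config+:: {
    namespace: 'monitoring',
    nodeExporter+:: {
      // Moves the listen address, upstream, hostPort, and containerPort at once,
      // since the daemonset derives all of them from this single value.
      port: 9101,
    },
  },
}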

View File

@ -1,234 +0,0 @@
local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
{
_config+:: {
namespace: 'default',
versions+:: {
prometheusAdapter: 'v0.7.0',
},
imageRepos+:: {
prometheusAdapter: 'directxman12/k8s-prometheus-adapter',
},
prometheusAdapter+:: {
name: 'prometheus-adapter',
labels: { name: $._config.prometheusAdapter.name },
prometheusURL: 'http://prometheus-' + $._config.prometheus.name + '.' + $._config.namespace + '.svc.cluster.local:9090/',
config: {
resourceRules: {
cpu: {
containerQuery: 'sum(irate(container_cpu_usage_seconds_total{<<.LabelMatchers>>,container!="POD",container!="",pod!=""}[5m])) by (<<.GroupBy>>)',
nodeQuery: 'sum(1 - irate(node_cpu_seconds_total{mode="idle"}[5m]) * on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:{<<.LabelMatchers>>}) by (<<.GroupBy>>)',
resources: {
overrides: {
node: {
resource: 'node'
},
namespace: {
resource: 'namespace'
},
pod: {
resource: 'pod'
},
},
},
containerLabel: 'container'
},
memory: {
containerQuery: 'sum(container_memory_working_set_bytes{<<.LabelMatchers>>,container!="POD",container!="",pod!=""}) by (<<.GroupBy>>)',
nodeQuery: 'sum(node_memory_MemTotal_bytes{job="node-exporter",<<.LabelMatchers>>} - node_memory_MemAvailable_bytes{job="node-exporter",<<.LabelMatchers>>}) by (<<.GroupBy>>)',
resources: {
overrides: {
instance: {
resource: 'node'
},
namespace: {
resource: 'namespace'
},
pod: {
resource: 'pod'
},
},
},
containerLabel: 'container'
},
window: '5m',
},
}
},
},
prometheusAdapter+:: {
apiService:
{
apiVersion: 'apiregistration.k8s.io/v1',
kind: 'APIService',
metadata: {
name: 'v1beta1.metrics.k8s.io',
},
spec: {
service: {
name: $.prometheusAdapter.service.metadata.name,
namespace: $._config.namespace,
},
group: 'metrics.k8s.io',
version: 'v1beta1',
insecureSkipTLSVerify: true,
groupPriorityMinimum: 100,
versionPriority: 100,
},
},
configMap:
local configmap = k.core.v1.configMap;
configmap.new('adapter-config', { 'config.yaml': std.manifestYamlDoc($._config.prometheusAdapter.config) }) +
configmap.mixin.metadata.withNamespace($._config.namespace),
service:
local service = k.core.v1.service;
local servicePort = k.core.v1.service.mixin.spec.portsType;
service.new(
$._config.prometheusAdapter.name,
$._config.prometheusAdapter.labels,
servicePort.newNamed('https', 443, 6443),
) +
service.mixin.metadata.withNamespace($._config.namespace) +
service.mixin.metadata.withLabels($._config.prometheusAdapter.labels),
deployment:
local deployment = k.apps.v1.deployment;
local volume = deployment.mixin.spec.template.spec.volumesType;
local container = deployment.mixin.spec.template.spec.containersType;
local containerVolumeMount = container.volumeMountsType;
local c =
container.new($._config.prometheusAdapter.name, $._config.imageRepos.prometheusAdapter + ':' + $._config.versions.prometheusAdapter) +
container.withArgs([
'--cert-dir=/var/run/serving-cert',
'--config=/etc/adapter/config.yaml',
'--logtostderr=true',
'--metrics-relist-interval=1m',
'--prometheus-url=' + $._config.prometheusAdapter.prometheusURL,
'--secure-port=6443',
]) +
container.withPorts([{ containerPort: 6443 }]) +
container.withVolumeMounts([
containerVolumeMount.new('tmpfs', '/tmp'),
containerVolumeMount.new('volume-serving-cert', '/var/run/serving-cert'),
containerVolumeMount.new('config', '/etc/adapter'),
],);
deployment.new($._config.prometheusAdapter.name, 1, c, $._config.prometheusAdapter.labels) +
deployment.mixin.metadata.withNamespace($._config.namespace) +
deployment.mixin.spec.selector.withMatchLabels($._config.prometheusAdapter.labels) +
deployment.mixin.spec.template.spec.withServiceAccountName($.prometheusAdapter.serviceAccount.metadata.name) +
deployment.mixin.spec.template.spec.withNodeSelector({ 'kubernetes.io/os': 'linux' }) +
deployment.mixin.spec.strategy.rollingUpdate.withMaxSurge(1) +
deployment.mixin.spec.strategy.rollingUpdate.withMaxUnavailable(0) +
deployment.mixin.spec.template.spec.withVolumes([
volume.fromEmptyDir(name='tmpfs'),
volume.fromEmptyDir(name='volume-serving-cert'),
{ name: 'config', configMap: { name: 'adapter-config' } },
]),
serviceAccount:
local serviceAccount = k.core.v1.serviceAccount;
serviceAccount.new($._config.prometheusAdapter.name) +
serviceAccount.mixin.metadata.withNamespace($._config.namespace),
clusterRole:
local clusterRole = k.rbac.v1.clusterRole;
local policyRule = clusterRole.rulesType;
local rules =
policyRule.new() +
policyRule.withApiGroups(['']) +
policyRule.withResources(['nodes', 'namespaces', 'pods', 'services']) +
policyRule.withVerbs(['get', 'list', 'watch']);
clusterRole.new() +
clusterRole.mixin.metadata.withName($._config.prometheusAdapter.name) +
clusterRole.withRules(rules),
clusterRoleBinding:
local clusterRoleBinding = k.rbac.v1.clusterRoleBinding;
clusterRoleBinding.new() +
clusterRoleBinding.mixin.metadata.withName($._config.prometheusAdapter.name) +
clusterRoleBinding.mixin.roleRef.withApiGroup('rbac.authorization.k8s.io') +
clusterRoleBinding.mixin.roleRef.withName($.prometheusAdapter.clusterRole.metadata.name) +
clusterRoleBinding.mixin.roleRef.mixinInstance({ kind: 'ClusterRole' }) +
clusterRoleBinding.withSubjects([{
kind: 'ServiceAccount',
name: $.prometheusAdapter.serviceAccount.metadata.name,
namespace: $._config.namespace,
}]),
clusterRoleBindingDelegator:
local clusterRoleBinding = k.rbac.v1.clusterRoleBinding;
clusterRoleBinding.new() +
clusterRoleBinding.mixin.metadata.withName('resource-metrics:system:auth-delegator') +
clusterRoleBinding.mixin.roleRef.withApiGroup('rbac.authorization.k8s.io') +
clusterRoleBinding.mixin.roleRef.withName('system:auth-delegator') +
clusterRoleBinding.mixin.roleRef.mixinInstance({ kind: 'ClusterRole' }) +
clusterRoleBinding.withSubjects([{
kind: 'ServiceAccount',
name: $.prometheusAdapter.serviceAccount.metadata.name,
namespace: $._config.namespace,
}]),
clusterRoleServerResources:
local clusterRole = k.rbac.v1.clusterRole;
local policyRule = clusterRole.rulesType;
local rules =
policyRule.new() +
policyRule.withApiGroups(['metrics.k8s.io']) +
policyRule.withResources(['*']) +
policyRule.withVerbs(['*']);
clusterRole.new() +
clusterRole.mixin.metadata.withName('resource-metrics-server-resources') +
clusterRole.withRules(rules),
clusterRoleAggregatedMetricsReader:
local clusterRole = k.rbac.v1.clusterRole;
local policyRule = clusterRole.rulesType;
local rules =
policyRule.new() +
policyRule.withApiGroups(['metrics.k8s.io']) +
policyRule.withResources(['pods', 'nodes']) +
policyRule.withVerbs(['get','list','watch']);
clusterRole.new() +
clusterRole.mixin.metadata.withName('system:aggregated-metrics-reader') +
clusterRole.mixin.metadata.withLabels({
"rbac.authorization.k8s.io/aggregate-to-admin": "true",
"rbac.authorization.k8s.io/aggregate-to-edit": "true",
"rbac.authorization.k8s.io/aggregate-to-view": "true",
}) +
clusterRole.withRules(rules),
roleBindingAuthReader:
local roleBinding = k.rbac.v1.roleBinding;
roleBinding.new() +
roleBinding.mixin.metadata.withName('resource-metrics-auth-reader') +
roleBinding.mixin.metadata.withNamespace('kube-system') +
roleBinding.mixin.roleRef.withApiGroup('rbac.authorization.k8s.io') +
roleBinding.mixin.roleRef.withName('extension-apiserver-authentication-reader') +
roleBinding.mixin.roleRef.mixinInstance({ kind: 'Role' }) +
roleBinding.withSubjects([{
kind: 'ServiceAccount',
name: $.prometheusAdapter.serviceAccount.metadata.name,
namespace: $._config.namespace,
}]),
},
}
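
The resourceRules object above reaches the adapter as YAML via std.manifestYamlDoc in the configMap field. A standalone sketch of that serialization step, using a truncated stand-in for the full config:

// std.manifestYamlDoc turns a jsonnet object into a YAML document string,
// which is what ends up under the ConfigMap's 'config.yaml' key.
local cfg = {
  resourceRules: { window: '5m' },  // truncated stand-in for the real rules
};

{
  'config.yaml': std.manifestYamlDoc(cfg),
}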

View File

@ -1,476 +0,0 @@
local k3 = import 'ksonnet/ksonnet.beta.3/k.libsonnet';
local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
{
_config+:: {
namespace: 'default',
versions+:: {
prometheus: 'v2.20.0',
},
imageRepos+:: {
prometheus: 'quay.io/prometheus/prometheus',
},
alertmanager+:: {
name: 'main',
},
prometheus+:: {
name: 'k8s',
replicas: 2,
rules: {},
namespaces: ['default', 'kube-system', $._config.namespace],
},
},
prometheus+:: {
local p = self,
name:: $._config.prometheus.name,
namespace:: $._config.namespace,
roleBindingNamespaces:: $._config.prometheus.namespaces,
replicas:: $._config.prometheus.replicas,
prometheusRules:: $._config.prometheus.rules,
alertmanagerName:: $.alertmanager.service.metadata.name,
serviceAccount:
local serviceAccount = k.core.v1.serviceAccount;
serviceAccount.new('prometheus-' + p.name) +
serviceAccount.mixin.metadata.withNamespace(p.namespace),
service:
local service = k.core.v1.service;
local servicePort = k.core.v1.service.mixin.spec.portsType;
local prometheusPort = servicePort.newNamed('web', 9090, 'web');
service.new('prometheus-' + p.name, { app: 'prometheus', prometheus: p.name }, prometheusPort) +
service.mixin.spec.withSessionAffinity('ClientIP') +
service.mixin.metadata.withNamespace(p.namespace) +
service.mixin.metadata.withLabels({ prometheus: p.name }),
rules:
{
apiVersion: 'monitoring.coreos.com/v1',
kind: 'PrometheusRule',
metadata: {
labels: {
prometheus: p.name,
role: 'alert-rules',
},
name: 'prometheus-' + p.name + '-rules',
namespace: p.namespace,
},
spec: {
groups: p.prometheusRules.groups,
},
},
roleBindingSpecificNamespaces:
local roleBinding = k.rbac.v1.roleBinding;
local newSpecificRoleBinding(namespace) =
roleBinding.new() +
roleBinding.mixin.metadata.withName('prometheus-' + p.name) +
roleBinding.mixin.metadata.withNamespace(namespace) +
roleBinding.mixin.roleRef.withApiGroup('rbac.authorization.k8s.io') +
roleBinding.mixin.roleRef.withName('prometheus-' + p.name) +
roleBinding.mixin.roleRef.mixinInstance({ kind: 'Role' }) +
roleBinding.withSubjects([{ kind: 'ServiceAccount', name: 'prometheus-' + p.name, namespace: p.namespace }]);
local roleBindingList = k3.rbac.v1.roleBindingList;
roleBindingList.new([newSpecificRoleBinding(x) for x in p.roleBindingNamespaces]),
clusterRole:
local clusterRole = k.rbac.v1.clusterRole;
local policyRule = clusterRole.rulesType;
local nodeMetricsRule = policyRule.new() +
policyRule.withApiGroups(['']) +
policyRule.withResources(['nodes/metrics']) +
policyRule.withVerbs(['get']);
local metricsRule = policyRule.new() +
policyRule.withNonResourceUrls('/metrics') +
policyRule.withVerbs(['get']);
local rules = [nodeMetricsRule, metricsRule];
clusterRole.new() +
clusterRole.mixin.metadata.withName('prometheus-' + p.name) +
clusterRole.withRules(rules),
roleConfig:
local role = k.rbac.v1.role;
local policyRule = role.rulesType;
local configmapRule = policyRule.new() +
policyRule.withApiGroups(['']) +
policyRule.withResources([
'configmaps',
]) +
policyRule.withVerbs(['get']);
role.new() +
role.mixin.metadata.withName('prometheus-' + p.name + '-config') +
role.mixin.metadata.withNamespace(p.namespace) +
role.withRules(configmapRule),
roleBindingConfig:
local roleBinding = k.rbac.v1.roleBinding;
roleBinding.new() +
roleBinding.mixin.metadata.withName('prometheus-' + p.name + '-config') +
roleBinding.mixin.metadata.withNamespace(p.namespace) +
roleBinding.mixin.roleRef.withApiGroup('rbac.authorization.k8s.io') +
roleBinding.mixin.roleRef.withName('prometheus-' + p.name + '-config') +
roleBinding.mixin.roleRef.mixinInstance({ kind: 'Role' }) +
roleBinding.withSubjects([{ kind: 'ServiceAccount', name: 'prometheus-' + p.name, namespace: p.namespace }]),
clusterRoleBinding:
local clusterRoleBinding = k.rbac.v1.clusterRoleBinding;
clusterRoleBinding.new() +
clusterRoleBinding.mixin.metadata.withName('prometheus-' + p.name) +
clusterRoleBinding.mixin.roleRef.withApiGroup('rbac.authorization.k8s.io') +
clusterRoleBinding.mixin.roleRef.withName('prometheus-' + p.name) +
clusterRoleBinding.mixin.roleRef.mixinInstance({ kind: 'ClusterRole' }) +
clusterRoleBinding.withSubjects([{ kind: 'ServiceAccount', name: 'prometheus-' + p.name, namespace: p.namespace }]),
roleSpecificNamespaces:
local role = k.rbac.v1.role;
local policyRule = role.rulesType;
local coreRule = policyRule.new() +
policyRule.withApiGroups(['']) +
policyRule.withResources([
'services',
'endpoints',
'pods',
]) +
policyRule.withVerbs(['get', 'list', 'watch']);
local newSpecificRole(namespace) =
role.new() +
role.mixin.metadata.withName('prometheus-' + p.name) +
role.mixin.metadata.withNamespace(namespace) +
role.withRules(coreRule);
local roleList = k3.rbac.v1.roleList;
roleList.new([newSpecificRole(x) for x in p.roleBindingNamespaces]),
prometheus:
local statefulSet = k.apps.v1.statefulSet;
local container = statefulSet.mixin.spec.template.spec.containersType;
local resourceRequirements = container.mixin.resourcesType;
local selector = statefulSet.mixin.spec.selectorType;
local resources =
resourceRequirements.new() +
resourceRequirements.withRequests({ memory: '400Mi' });
{
apiVersion: 'monitoring.coreos.com/v1',
kind: 'Prometheus',
metadata: {
name: p.name,
namespace: p.namespace,
labels: {
prometheus: p.name,
},
},
spec: {
replicas: p.replicas,
version: $._config.versions.prometheus,
image: $._config.imageRepos.prometheus + ':' + $._config.versions.prometheus,
serviceAccountName: 'prometheus-' + p.name,
serviceMonitorSelector: {},
podMonitorSelector: {},
serviceMonitorNamespaceSelector: {},
podMonitorNamespaceSelector: {},
nodeSelector: { 'kubernetes.io/os': 'linux' },
ruleSelector: selector.withMatchLabels({
role: 'alert-rules',
prometheus: p.name,
}),
resources: resources,
alerting: {
alertmanagers: [
{
namespace: p.namespace,
name: p.alertmanagerName,
port: 'web',
},
],
},
securityContext: {
runAsUser: 1000,
runAsNonRoot: true,
fsGroup: 2000,
},
},
},
serviceMonitor:
{
apiVersion: 'monitoring.coreos.com/v1',
kind: 'ServiceMonitor',
metadata: {
name: 'prometheus',
namespace: p.namespace,
labels: {
'k8s-app': 'prometheus',
},
},
spec: {
selector: {
matchLabels: {
prometheus: p.name,
},
},
endpoints: [
{
port: 'web',
interval: '30s',
},
],
},
},
serviceMonitorKubeScheduler:
{
apiVersion: 'monitoring.coreos.com/v1',
kind: 'ServiceMonitor',
metadata: {
name: 'kube-scheduler',
namespace: p.namespace,
labels: {
'k8s-app': 'kube-scheduler',
},
},
spec: {
jobLabel: 'k8s-app',
endpoints: [
{
port: 'https-metrics',
interval: '30s',
scheme: "https",
bearerTokenFile: "/var/run/secrets/kubernetes.io/serviceaccount/token",
tlsConfig: {
insecureSkipVerify: true
}
},
],
selector: {
matchLabels: {
'k8s-app': 'kube-scheduler',
},
},
namespaceSelector: {
matchNames: [
'kube-system',
],
},
},
},
serviceMonitorKubelet:
{
apiVersion: 'monitoring.coreos.com/v1',
kind: 'ServiceMonitor',
metadata: {
name: 'kubelet',
namespace: p.namespace,
labels: {
'k8s-app': 'kubelet',
},
},
spec: {
jobLabel: 'k8s-app',
endpoints: [
{
port: 'https-metrics',
scheme: 'https',
interval: '30s',
honorLabels: true,
tlsConfig: {
insecureSkipVerify: true,
},
bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token',
metricRelabelings: (import 'kube-prometheus/dropping-deprecated-metrics-relabelings.libsonnet'),
relabelings: [
{
sourceLabels: ['__metrics_path__'],
targetLabel: 'metrics_path',
},
],
},
{
port: 'https-metrics',
scheme: 'https',
path: '/metrics/cadvisor',
interval: '30s',
honorLabels: true,
tlsConfig: {
insecureSkipVerify: true,
},
bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token',
relabelings: [
{
sourceLabels: ['__metrics_path__'],
targetLabel: 'metrics_path',
},
],
metricRelabelings: [
// Drop a bunch of metrics which are disabled but still sent, see
// https://github.com/google/cadvisor/issues/1925.
{
sourceLabels: ['__name__'],
regex: 'container_(network_tcp_usage_total|network_udp_usage_total|tasks_state|cpu_load_average_10s)',
action: 'drop',
},
],
},
],
selector: {
matchLabels: {
'k8s-app': 'kubelet',
},
},
namespaceSelector: {
matchNames: [
'kube-system',
],
},
},
},
serviceMonitorKubeControllerManager:
{
apiVersion: 'monitoring.coreos.com/v1',
kind: 'ServiceMonitor',
metadata: {
name: 'kube-controller-manager',
namespace: p.namespace,
labels: {
'k8s-app': 'kube-controller-manager',
},
},
spec: {
jobLabel: 'k8s-app',
endpoints: [
{
port: 'https-metrics',
interval: '30s',
scheme: "https",
bearerTokenFile: "/var/run/secrets/kubernetes.io/serviceaccount/token",
tlsConfig: {
insecureSkipVerify: true
},
metricRelabelings: (import 'kube-prometheus/dropping-deprecated-metrics-relabelings.libsonnet') + [
{
sourceLabels: ['__name__'],
regex: 'etcd_(debugging|disk|request|server).*',
action: 'drop',
},
],
},
],
selector: {
matchLabels: {
'k8s-app': 'kube-controller-manager',
},
},
namespaceSelector: {
matchNames: [
'kube-system',
],
},
},
},
serviceMonitorApiserver:
{
apiVersion: 'monitoring.coreos.com/v1',
kind: 'ServiceMonitor',
metadata: {
name: 'kube-apiserver',
namespace: p.namespace,
labels: {
'k8s-app': 'apiserver',
},
},
spec: {
jobLabel: 'component',
selector: {
matchLabels: {
component: 'apiserver',
provider: 'kubernetes',
},
},
namespaceSelector: {
matchNames: [
'default',
],
},
endpoints: [
{
port: 'https',
interval: '30s',
scheme: 'https',
tlsConfig: {
caFile: '/var/run/secrets/kubernetes.io/serviceaccount/ca.crt',
serverName: 'kubernetes',
},
bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token',
metricRelabelings: (import 'kube-prometheus/dropping-deprecated-metrics-relabelings.libsonnet') + [
{
sourceLabels: ['__name__'],
regex: 'etcd_(debugging|disk|server).*',
action: 'drop',
},
{
sourceLabels: ['__name__'],
regex: 'apiserver_admission_controller_admission_latencies_seconds_.*',
action: 'drop',
},
{
sourceLabels: ['__name__'],
regex: 'apiserver_admission_step_admission_latencies_seconds_.*',
action: 'drop',
},
{
sourceLabels: ['__name__', 'le'],
regex: 'apiserver_request_duration_seconds_bucket;(0.15|0.25|0.3|0.35|0.4|0.45|0.6|0.7|0.8|0.9|1.25|1.5|1.75|2.5|3|3.5|4.5|6|7|8|9|15|25|30|50)',
action: 'drop',
},
],
},
],
},
},
serviceMonitorCoreDNS:
{
apiVersion: 'monitoring.coreos.com/v1',
kind: 'ServiceMonitor',
metadata: {
name: 'coredns',
namespace: p.namespace,
labels: {
'k8s-app': 'coredns',
},
},
spec: {
jobLabel: 'k8s-app',
selector: {
matchLabels: {
'k8s-app': 'kube-dns',
},
},
namespaceSelector: {
matchNames: [
'kube-system',
],
},
endpoints: [
{
port: 'metrics',
interval: '15s',
bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token',
},
],
},
},
},
}
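
As with the other components, the Prometheus object is tuned through `_config`. A minimal sketch, assuming the file is importable as prometheus.libsonnet and that the alertmanager component it references via $.alertmanager is mixed in elsewhere; the values are illustrative:

(import 'prometheus.libsonnet') + {
  _config+:: {
    namespace: 'monitoring',
    prometheus+:: {
      replicas: 3,              // illustrative; the default above is 2
      namespaces+: ['my-app'],  // extra namespace to create Roles/RoleBindings in
    },
  },
}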

View File

@ -1,19 +0,0 @@
{
prometheusRules+:: {
groups+: [
{
name: 'kube-prometheus-general.rules',
rules: [
{
expr: 'count without(instance, pod, node) (up == 1)',
record: 'count:up1',
},
{
expr: 'count without(instance, pod, node) (up == 0)',
record: 'count:up0',
},
],
},
],
},
}
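
The two recorded series are natural building blocks for an availability ratio. A hypothetical follow-on rule in the same shape; the group and record names are invented for illustration:

{
  prometheusRules+:: {
    groups+: [
      {
        name: 'kube-prometheus-general-derived.rules',  // hypothetical group
        rules: [
          {
            // Fraction of scrape targets that are up, from the two records above.
            expr: 'count:up1 / (count:up1 + count:up0)',
            record: 'count:up:ratio',  // hypothetical record name
          },
        ],
      },
    ],
  },
}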

View File

@ -1,35 +0,0 @@
{
prometheusRules+:: {
groups+: [
{
name: 'kube-prometheus-node-recording.rules',
rules: [
{
expr: 'sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[3m])) BY (instance)',
record: 'instance:node_cpu:rate:sum',
},
{
expr: 'sum(rate(node_network_receive_bytes_total[3m])) BY (instance)',
record: 'instance:node_network_receive_bytes:rate:sum',
},
{
expr: 'sum(rate(node_network_transmit_bytes_total[3m])) BY (instance)',
record: 'instance:node_network_transmit_bytes:rate:sum',
},
{
expr: 'sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[5m])) WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu_seconds_total) BY (instance, cpu)) BY (instance)',
record: 'instance:node_cpu:ratio',
},
{
expr: 'sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[5m]))',
record: 'cluster:node_cpu:sum_rate5m',
},
{
expr: 'cluster:node_cpu:sum_rate5m / count(sum(node_cpu_seconds_total) BY (instance, cpu))',
record: 'cluster:node_cpu:ratio',
},
],
},
],
},
}

View File

@ -1,2 +0,0 @@
(import 'node-rules.libsonnet') +
(import 'general.libsonnet')

View File

@ -1,2 +0,0 @@
vendor/
jsonnetfile.lock.json

File diff suppressed because one or more lines are too long

View File

@ -1,14 +0,0 @@
{
"dependencies": [
{
"name": "ksonnet",
"source": {
"git": {
"remote": "https://github.com/ksonnet/ksonnet-lib",
"subdir": ""
}
},
"version": "master"
}
]
}

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -1,204 +0,0 @@
local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
{
_config+:: {
namespace: 'default',
prometheusOperator+:: {
deploymentSelectorLabels: {
'app.kubernetes.io/name': 'prometheus-operator',
'app.kubernetes.io/component': 'controller',
},
commonLabels:
$._config.prometheusOperator.deploymentSelectorLabels
{ 'app.kubernetes.io/version': $._config.versions.prometheusOperator },
},
versions+:: {
prometheusOperator: 'v0.40.0',
prometheusConfigReloader: self.prometheusOperator,
configmapReloader: 'v0.3.0',
},
imageRepos+:: {
prometheusOperator: 'quay.io/coreos/prometheus-operator',
configmapReloader: 'jimmidyson/configmap-reload',
prometheusConfigReloader: 'quay.io/coreos/prometheus-config-reloader',
},
},
prometheusOperator+:: {
local po = self,
namespace:: $._config.namespace,
commonLabels:: $._config.prometheusOperator.commonLabels,
deploymentSelectorLabels:: $._config.prometheusOperator.deploymentSelectorLabels,
image:: $._config.imageRepos.prometheusOperator,
version:: $._config.versions.prometheusOperator,
configReloaderImage:: $._config.imageRepos.configmapReloader,
configReloaderVersion:: $._config.versions.configmapReloader,
prometheusConfigReloaderImage:: $._config.imageRepos.prometheusConfigReloader,
prometheusConfigReloaderVersion:: $._config.versions.prometheusConfigReloader,
// Prefixing with 0 to ensure these manifests are listed and therefore created first.
'0alertmanagerCustomResourceDefinition': import 'alertmanager-crd.libsonnet',
'0prometheusCustomResourceDefinition': import 'prometheus-crd.libsonnet',
'0servicemonitorCustomResourceDefinition': import 'servicemonitor-crd.libsonnet',
'0podmonitorCustomResourceDefinition': import 'podmonitor-crd.libsonnet',
'0prometheusruleCustomResourceDefinition': import 'prometheusrule-crd.libsonnet',
'0thanosrulerCustomResourceDefinition': import 'thanosruler-crd.libsonnet',
clusterRoleBinding:
local clusterRoleBinding = k.rbac.v1.clusterRoleBinding;
clusterRoleBinding.new() +
clusterRoleBinding.mixin.metadata.withLabels(po.commonLabels) +
clusterRoleBinding.mixin.metadata.withName('prometheus-operator') +
clusterRoleBinding.mixin.roleRef.withApiGroup('rbac.authorization.k8s.io') +
clusterRoleBinding.mixin.roleRef.withName('prometheus-operator') +
clusterRoleBinding.mixin.roleRef.mixinInstance({ kind: 'ClusterRole' }) +
clusterRoleBinding.withSubjects([{ kind: 'ServiceAccount', name: 'prometheus-operator', namespace: po.namespace }]),
clusterRole:
local clusterRole = k.rbac.v1.clusterRole;
local policyRule = clusterRole.rulesType;
local monitoringRule = policyRule.new() +
policyRule.withApiGroups(['monitoring.coreos.com']) +
policyRule.withResources([
'alertmanagers',
'alertmanagers/finalizers',
'prometheuses',
'prometheuses/finalizers',
'thanosrulers',
'thanosrulers/finalizers',
'servicemonitors',
'podmonitors',
'prometheusrules',
]) +
policyRule.withVerbs(['*']);
local appsRule = policyRule.new() +
policyRule.withApiGroups(['apps']) +
policyRule.withResources([
'statefulsets',
]) +
policyRule.withVerbs(['*']);
local coreRule = policyRule.new() +
policyRule.withApiGroups(['']) +
policyRule.withResources([
'configmaps',
'secrets',
]) +
policyRule.withVerbs(['*']);
local podRule = policyRule.new() +
policyRule.withApiGroups(['']) +
policyRule.withResources([
'pods',
]) +
policyRule.withVerbs(['list', 'delete']);
local routingRule = policyRule.new() +
policyRule.withApiGroups(['']) +
policyRule.withResources([
'services',
'services/finalizers',
'endpoints',
]) +
policyRule.withVerbs(['get', 'create', 'update', 'delete']);
local nodeRule = policyRule.new() +
policyRule.withApiGroups(['']) +
policyRule.withResources([
'nodes',
]) +
policyRule.withVerbs(['list', 'watch']);
local namespaceRule = policyRule.new() +
policyRule.withApiGroups(['']) +
policyRule.withResources([
'namespaces',
]) +
policyRule.withVerbs(['get', 'list', 'watch']);
local rules = [monitoringRule, appsRule, coreRule, podRule, routingRule, nodeRule, namespaceRule];
clusterRole.new() +
clusterRole.mixin.metadata.withLabels(po.commonLabels) +
clusterRole.mixin.metadata.withName('prometheus-operator') +
clusterRole.withRules(rules),
deployment:
local deployment = k.apps.v1.deployment;
local container = k.apps.v1.deployment.mixin.spec.template.spec.containersType;
local containerPort = container.portsType;
local targetPort = 8080;
local operatorContainer =
container.new('prometheus-operator', po.image + ':' + po.version) +
container.withPorts(containerPort.newNamed(targetPort, 'http')) +
container.withArgs([
'--kubelet-service=kube-system/kubelet',
// Prometheus Operator is run with a read-only root file system. By
// default glog saves logfiles to /tmp. Make it log to stderr instead.
'--logtostderr=true',
'--config-reloader-image=' + po.configReloaderImage + ':' + po.configReloaderVersion,
'--prometheus-config-reloader=' + po.prometheusConfigReloaderImage + ':' + po.prometheusConfigReloaderVersion,
]) +
container.mixin.securityContext.withAllowPrivilegeEscalation(false) +
container.mixin.resources.withRequests({ cpu: '100m', memory: '100Mi' }) +
container.mixin.resources.withLimits({ cpu: '200m', memory: '200Mi' });
deployment.new('prometheus-operator', 1, operatorContainer, po.commonLabels) +
deployment.mixin.metadata.withNamespace(po.namespace) +
deployment.mixin.metadata.withLabels(po.commonLabels) +
deployment.mixin.spec.selector.withMatchLabels(po.deploymentSelectorLabels) +
deployment.mixin.spec.template.spec.withNodeSelector({ 'beta.kubernetes.io/os': 'linux' }) +
deployment.mixin.spec.template.spec.securityContext.withRunAsNonRoot(true) +
deployment.mixin.spec.template.spec.securityContext.withRunAsUser(65534) +
deployment.mixin.spec.template.spec.withServiceAccountName('prometheus-operator'),
serviceAccount:
local serviceAccount = k.core.v1.serviceAccount;
serviceAccount.new('prometheus-operator') +
serviceAccount.mixin.metadata.withLabels(po.commonLabels) +
serviceAccount.mixin.metadata.withNamespace(po.namespace),
service:
local service = k.core.v1.service;
local servicePort = k.core.v1.service.mixin.spec.portsType;
local poServicePort = servicePort.newNamed('http', 8080, 'http');
service.new('prometheus-operator', po.deployment.spec.selector.matchLabels, [poServicePort]) +
service.mixin.metadata.withLabels(po.commonLabels) +
service.mixin.metadata.withNamespace(po.namespace) +
service.mixin.spec.withClusterIp('None'),
serviceMonitor:
{
apiVersion: 'monitoring.coreos.com/v1',
kind: 'ServiceMonitor',
metadata: {
name: 'prometheus-operator',
namespace: po.namespace,
labels: po.commonLabels,
},
spec: {
endpoints: [
{
port: 'http',
honorLabels: true,
},
],
selector: {
matchLabels: po.commonLabels,
},
},
},
},
}

View File

@ -1 +0,0 @@
{"apiVersion":"apiextensions.k8s.io/v1","kind":"CustomResourceDefinition","metadata":{"annotations":{"controller-gen.kubebuilder.io/version":"v0.2.4"},"creationTimestamp":null,"name":"prometheusrules.monitoring.coreos.com"},"spec":{"group":"monitoring.coreos.com","names":{"kind":"PrometheusRule","listKind":"PrometheusRuleList","plural":"prometheusrules","singular":"prometheusrule"},"scope":"Namespaced","versions":[{"name":"v1","schema":{"openAPIV3Schema":{"description":"PrometheusRule defines alerting rules for a Prometheus instance","properties":{"apiVersion":{"description":"APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources","type":"string"},"kind":{"description":"Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds","type":"string"},"metadata":{"type":"object"},"spec":{"description":"Specification of desired alerting rule definitions for Prometheus.","properties":{"groups":{"description":"Content of Prometheus rule file","items":{"description":"RuleGroup is a list of sequentially evaluated recording and alerting rules. Note: PartialResponseStrategy is only used by ThanosRuler and will be ignored by Prometheus instances. Valid values for this field are 'warn' or 'abort'. More info: https://github.com/thanos-io/thanos/blob/master/docs/components/rule.md#partial-response","properties":{"interval":{"type":"string"},"name":{"type":"string"},"partial_response_strategy":{"type":"string"},"rules":{"items":{"description":"Rule describes an alerting or recording rule.","properties":{"alert":{"type":"string"},"annotations":{"additionalProperties":{"type":"string"},"type":"object"},"expr":{"anyOf":[{"type":"integer"},{"type":"string"}],"x-kubernetes-int-or-string":true},"for":{"type":"string"},"labels":{"additionalProperties":{"type":"string"},"type":"object"},"record":{"type":"string"}},"required":["expr"],"type":"object"},"type":"array"}},"required":["name","rules"],"type":"object"},"type":"array"}},"type":"object"}},"required":["spec"],"type":"object"}},"served":true,"storage":true}]},"status":{"acceptedNames":{"kind":"","plural":""},"conditions":[],"storedVersions":[]}}

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -133,16 +133,18 @@ local timepickerlib = import 'timepicker.libsonnet';
name,
label,
type,
pluginId,
pluginName,
pluginId=null,
pluginName=null,
description='',
value=null,
):: self {
inputs+: [{
name: name,
label: label,
type: type,
pluginId: pluginId,
pluginName: pluginName,
[if pluginId != null then 'pluginId']: pluginId,
[if pluginName != null then 'pluginName']: pluginName,
[if value != null then 'value']: value,
description: description,
}],
},

View File

@ -25,5 +25,6 @@
pluginlist:: import 'pluginlist.libsonnet',
gauge:: error 'gauge is removed, migrate to gaugePanel',
gaugePanel:: import 'gauge_panel.libsonnet',
barGaugePanel:: import 'bar_gauge_panel.libsonnet',
statPanel:: import 'stat_panel.libsonnet',
}

View File

@ -12,6 +12,13 @@
* @param datasource Datasource
* @param aliasColors Define color mappings
* @param pieType Type of pie chart (one of pie or donut)
* @param showLegend Show legend
* @param showLegendPercentage Show percentage values in the legend
* @param legendType Type of legend (one of 'Right side', 'Under graph' or 'On graph')
* @param valueName Type of tooltip value
* @param repeat Variable used to repeat the pie chart
* @param repeatDirection Which direction to repeat the panel, 'h' for horizontal and 'v' for vertical
* @param maxPerRow Number of panels to display when repeated. Used in combination with repeat.
* @return A json that represents a pie chart panel
*/
new(
@ -27,6 +34,9 @@
showLegend=true,
showLegendPercentage=true,
legendType='Right side',
repeat=null,
repeatDirection=null,
maxPerRow=null,
):: {
type: 'grafana-piechart-panel',
[if description != null then 'description']: description,
@ -36,6 +46,9 @@
[if span != null then 'span']: span,
[if min_span != null then 'minSpan']: min_span,
[if height != null then 'height']: height,
[if repeat != null then 'repeat']: repeat,
[if repeatDirection != null then 'repeatDirection']: repeatDirection,
[if maxPerRow != null then 'maxPerRow']: maxPerRow,
valueName: valueName,
datasource: datasource,
legend: {

View File

@ -16,6 +16,7 @@
* @param sort Sorting instruction for the panel
* @param transform allow table manipulation to present data as desired
* @param transparent Boolean (default: false) If set to true the panel will be transparent
* @param links Set of links for the panel.
* @return A json that represents a table panel
*/
new(
@ -32,6 +33,7 @@
sort=null,
time_from=null,
time_shift=null,
links=[],
):: {
type: 'table',
title: title,
@ -45,6 +47,7 @@
columns: columns,
timeFrom: time_from,
timeShift: time_shift,
links: links,
[if sort != null then 'sort']: sort,
[if description != null then 'description']: description,
[if transform != null then 'transform']: transform,
@ -69,5 +72,8 @@
type: 'hidden',
}],
},
addLink(link):: self {
links+: [link],
},
},
}

View File

@ -1,6 +1,25 @@
{
/**
* Returns a new template that can be added to a dashboard.
* See what's a [template](https://grafana.com/docs/grafana/latest/variables/templates-and-variables/#templates).
*
* @name template.new
*
* @param name Name of variable
* @param datasource Template [datasource](https://grafana.com/docs/grafana/latest/variables/variable-types/add-data-source-variable/)
* @param query [Query expression](https://grafana.com/docs/grafana/latest/variables/variable-types/add-query-variable/) for the datasource.
* @param label (optional) Display name of the variable dropdown. If null, then the dropdown label will be the variable name.
* @param allValues (optional) Formatting for [multi-value variables](https://grafana.com/docs/grafana/latest/variables/formatting-multi-value-variables/#formatting-multi-value-variables)
* @param tagValuesQuery (optional, experimental feature) Group values into [selectable tags](https://grafana.com/docs/grafana/latest/variables/variable-value-tags/)
* @param current
* @param hide '' (default): the variable dropdown displays the variable name or label value. 'label': the dropdown displays only the selected variable value and a down arrow. Any other value: no variable dropdown is displayed on the dashboard.
* @param regex (optional) Regex expression to filter or capture specific parts of the names returned by your data source query. To see examples, refer to [Filter variables with regex](https://grafana.com/docs/grafana/latest/variables/filter-variables-with-regex/).
* @param refresh 'never' (default): Variable queries are cached and values are not updated. This is fine if the values never change, but problematic if they are dynamic and change a lot. 'load': Queries the data source every time the dashboard loads. This slows down dashboard loading, because the variable query needs to be completed before the dashboard can be initialized. 'time': Queries the data source when the dashboard time range changes. Only use this option if your variable options query contains a time range filter or is dependent on the dashboard time range.
* @param includeAll Whether all value option is available or not. False by default.
* @param multi Whether multiple values can be selected or not from variable value list. False by default.
* @param sort 0 (default): Without Sort, 1: Alphabetical (asc), 2: Alphabetical (desc), 3: Numerical (asc), 4: Numerical (desc).
*
* @return A [template](https://grafana.com/docs/grafana/latest/variables/templates-and-variables/#templates)
*/
new(
name,
@ -38,7 +57,20 @@
useTags: false,
},
/**
* Use an [interval variable](https://grafana.com/docs/grafana/latest/variables/variable-types/add-interval-variable/) to represent time spans such as '1m', '1h', '1d'. You can think of them as a dashboard-wide "group by time" command. Interval variables change how the data is grouped in the visualization. You can also use the Auto Option to return a set number of data points per time span.
* You can use an interval variable as a parameter to group by time (for InfluxDB), date histogram interval (for Elasticsearch), or as a summarize function parameter (for Graphite).
*
* @name template.interval
*
* @param name Variable name
* @param query
* @param current
* @param hide '' (default): the variable dropdown displays the variable name or label value. 'label': the dropdown displays only the selected variable value and a down arrow. Any other value: no variable dropdown is displayed on the dashboard.
* @param label (optional) Display name of the variable dropdown. If null, then the dropdown label will be the variable name.
* @param auto_count (default: 300) Valid only if 'auto' is defined in query. Number of times the current time range will be divided to calculate the value, similar to the Max data points query option. For example, if the current visible time range is 30 minutes, then the auto interval groups the data into 30 one-minute increments. The default value is 30 steps.
* @param auto_min (default: '10s') Valid only if 'auto' is defined in query. The minimum threshold below which the step count intervals will not divide the time. To continue the 30 minute example, if the minimum interval is set to 2m, then Grafana would group the data into 15 two-minute increments.
*
* @return A new interval variable for templating.
*/
interval(
name,
@ -122,18 +154,24 @@
hide='',
)::
{
// self has dynamic scope, so self below may not refer to this object.
// '$' can't be used either, as this object is not the top-level object.
local custom = self,
allValue: allValues,
current: {
value: current,
text: if current in valuelabels then valuelabels[current] else current,
// Both 'all' and 'All' are accepted for consistency.
value: if includeAll && (current == 'All' || current == 'all') then
if multi then ['$__all'] else '$__all'
else
current,
text: if std.isArray(current) then
std.join(' + ', std.map(custom.valuelabel, current))
else
custom.valuelabel(current),
[if multi then 'selected']: true,
},
options: std.map(
function(i)
{
text: if i in valuelabels then valuelabels[i] else i,
value: i,
}, std.split(query, ',')
),
options: std.map(self.option, self.query_array(query)),
hide: $.hide(hide),
includeAll: includeAll,
label: label,
@ -142,6 +180,24 @@
name: name,
query: query,
type: 'custom',
valuelabel(value):: if value in valuelabels then
valuelabels[value]
else value,
option(option):: {
text: custom.valuelabel(option),
value: if includeAll && option == 'All' then '$__all' else option,
[if multi then 'selected']: if multi && std.isArray(current) then
std.member(current, option)
else if multi then
current == option
else
null,
},
query_array(query):: std.split(
if includeAll then 'All,' + query else query, ','
),
},
/**
* @name template.text
@ -161,4 +217,20 @@
query: '',
type: 'textbox',
},
/**
* @name template.adhoc
*/
adhoc(
name,
datasource,
label=null,
hide='',
)::
{
datasource: datasource,
hide: $.hide(hide),
label: label,
name: name,
type: 'adhoc',
},
}
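
A usage sketch of the custom-variable handling above: with includeAll and multi set, a current value of 'All' becomes ['$__all'], and the options list is built from the comma-separated query with 'All' prepended. The import path is an assumption, and custom()'s earlier parameters are inferred from the body above:

local template = import 'template.libsonnet';

// Yields a multi-select dropdown with All, dev, staging, prod,
// where selecting All expands to Grafana's '$__all'.
template.custom(
  'environment',
  query='dev,staging,prod',
  current='All',
  includeAll=true,
  multi=true,
)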

View File

@ -1,5 +1,5 @@
{
dashboard(title, uid=''):: {
dashboard(title, uid='', datasource='default'):: {
// Stuff that isn't materialised.
_nextPanel:: 1,
addRow(row):: self {
@ -88,8 +88,8 @@
list: [
{
current: {
text: 'default',
value: 'default',
text: datasource,
value: datasource,
},
hide: 0,
label: null,

View File

@ -13,7 +13,8 @@ A set of Grafana dashboards and Prometheus alerts for Kubernetes.
| release-0.2 | v1.14.1 and before | v2.11.0+ |
| release-0.3 | v1.17 and before | v2.11.0+ |
| release-0.4 | v1.18 | v2.11.0+ |
| master | v1.18+ | v2.11.0+ |
| release-0.5 | v1.19 | v2.11.0+ |
| master | v1.19 | v2.11.0+ |
In Kubernetes 1.14 a major [metrics overhaul](https://github.com/kubernetes/enhancements/issues/1206) was implemented.
Therefore v0.1.x of this repository is the last release to support Kubernetes 1.13 and previous versions on a best-effort basis.
@ -254,4 +255,4 @@ While the community has not yet fully agreed on alert severities and their to be
## Note
You can use the external tool called [prom-metrics-check](https://github.com/ContainerSolutions/prom-metrics-check) to validate the created dashboards. This tool allows you to check if the metrics installed and used in Grafana dashboards exist in the Prometheus instance.
Please have a look at https://github.com/ContainerSolutions/prom-metrics-check.

View File

@ -18,7 +18,8 @@
severity: 'warning',
},
annotations: {
message: 'Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is restarting {{ printf "%.2f" $value }} times / 5 minutes.',
description: 'Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is restarting {{ printf "%.2f" $value }} times / 5 minutes.',
summary: 'Pod is crash looping.',
},
'for': '15m',
alert: 'KubePodCrashLooping',
@ -41,7 +42,8 @@
severity: 'warning',
},
annotations: {
message: 'Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for longer than 15 minutes.',
description: 'Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for longer than 15 minutes.',
summary: 'Pod has been in a non-ready state for more than 15 minutes.',
},
'for': '15m',
alert: 'KubePodNotReady',
@ -56,7 +58,8 @@
severity: 'warning',
},
annotations: {
message: 'Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment }} does not match, this indicates that the Deployment has failed but has not been rolled back.',
description: 'Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment }} does not match, this indicates that the Deployment has failed but has not been rolled back.',
summary: 'Deployment generation mismatch due to possible roll-back.',
},
'for': '15m',
alert: 'KubeDeploymentGenerationMismatch',
@ -77,7 +80,8 @@
severity: 'warning',
},
annotations: {
message: 'Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not matched the expected number of replicas for longer than 15 minutes.',
description: 'Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not matched the expected number of replicas for longer than 15 minutes.',
summary: 'Deployment has not matched the expected number of replicas.',
},
'for': '15m',
alert: 'KubeDeploymentReplicasMismatch',
@ -98,7 +102,8 @@
severity: 'warning',
},
annotations: {
message: 'StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not matched the expected number of replicas for longer than 15 minutes.',
description: 'StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not matched the expected number of replicas for longer than 15 minutes.',
summary: 'StatefulSet has not matched the expected number of replicas.',
},
'for': '15m',
alert: 'KubeStatefulSetReplicasMismatch',
@ -113,7 +118,8 @@
severity: 'warning',
},
annotations: {
message: 'StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset }} does not match, this indicates that the StatefulSet has failed but has not been rolled back.',
description: 'StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset }} does not match, this indicates that the StatefulSet has failed but has not been rolled back.',
summary: 'StatefulSet generation mismatch due to possible roll-back.',
},
'for': '15m',
alert: 'KubeStatefulSetGenerationMismatch',
@ -142,7 +148,8 @@
severity: 'warning',
},
annotations: {
message: 'StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out.',
description: 'StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out.',
summary: 'StatefulSet update has not been rolled out.',
},
'for': '15m',
alert: 'KubeStatefulSetUpdateNotRolledOut',
@ -178,7 +185,8 @@
severity: 'warning',
},
annotations: {
message: 'DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not finished or progressed for at least 15 minutes.',
description: 'DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not finished or progressed for at least 15 minutes.',
summary: 'DaemonSet rollout is stuck.',
},
'for': '15m',
},
@ -190,7 +198,8 @@
severity: 'warning',
},
annotations: {
message: 'Pod {{ $labels.namespace }}/{{ $labels.pod }} container {{ $labels.container}} has been in waiting state for longer than 1 hour.',
description: 'Pod {{ $labels.namespace }}/{{ $labels.pod }} container {{ $labels.container}} has been in waiting state for longer than 1 hour.',
summary: 'Pod container waiting longer than 1 hour',
},
'for': '1h',
alert: 'KubeContainerWaiting',
@ -206,7 +215,8 @@
severity: 'warning',
},
annotations: {
message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled.',
description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled.',
summary: 'DaemonSet pods are not scheduled.',
},
'for': '10m',
},
@ -219,7 +229,8 @@
severity: 'warning',
},
annotations: {
message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run.',
description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run.',
summary: 'DaemonSet pods are misscheduled.',
},
'for': '15m',
},
@ -233,7 +244,8 @@
severity: 'warning',
},
annotations: {
message: 'Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than 12 hours to complete.',
description: 'Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than 12 hours to complete.',
summary: 'Job did not complete in time.',
},
},
{
@ -246,7 +258,8 @@
severity: 'warning',
},
annotations: {
message: 'Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete.',
description: 'Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete.',
summary: 'Job failed to complete.',
},
},
{
@ -261,7 +274,8 @@
severity: 'warning',
},
annotations: {
message: 'HPA {{ $labels.namespace }}/{{ $labels.hpa }} has not matched the desired number of replicas for longer than 15 minutes.',
description: 'HPA {{ $labels.namespace }}/{{ $labels.hpa }} has not matched the desired number of replicas for longer than 15 minutes.',
summary: 'HPA has not matched desired number of replicas.',
},
'for': '15m',
alert: 'KubeHpaReplicasMismatch',
@ -276,7 +290,8 @@
severity: 'warning',
},
annotations: {
message: 'HPA {{ $labels.namespace }}/{{ $labels.hpa }} has been running at max replicas for longer than 15 minutes.',
description: 'HPA {{ $labels.namespace }}/{{ $labels.hpa }} has been running at max replicas for longer than 15 minutes.',
summary: 'HPA is running at max replicas.',
},
'for': '15m',
alert: 'KubeHpaMaxedOut',

View File

@ -35,7 +35,8 @@ local utils = import 'utils.libsonnet';
long: '%(long)s' % w,
},
annotations: {
message: 'The API server is burning too much error budget',
description: 'The API server is burning too much error budget.',
summary: 'The API server is burning too much error budget.',
},
'for': '%(for)s' % w,
}
@ -54,7 +55,8 @@ local utils = import 'utils.libsonnet';
severity: 'warning',
},
annotations: {
message: 'A client certificate used to authenticate to the apiserver is expiring in less than %s.' % (utils.humanizeSeconds($._config.certExpirationWarningSeconds)),
description: 'A client certificate used to authenticate to the apiserver is expiring in less than %s.' % (utils.humanizeSeconds($._config.certExpirationWarningSeconds)),
summary: 'Client certificate is about to expire.',
},
},
{
@ -66,7 +68,8 @@ local utils = import 'utils.libsonnet';
severity: 'critical',
},
annotations: {
message: 'A client certificate used to authenticate to the apiserver is expiring in less than %s.' % (utils.humanizeSeconds($._config.certExpirationCriticalSeconds)),
description: 'A client certificate used to authenticate to the apiserver is expiring in less than %s.' % (utils.humanizeSeconds($._config.certExpirationCriticalSeconds)),
summary: 'Client certificate is about to expire.',
},
},
{
@ -78,7 +81,8 @@ local utils = import 'utils.libsonnet';
severity: 'warning',
},
annotations: {
message: 'An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported errors. The number of errors have increased for it in the past five minutes. High values indicate that the availability of the service changes too often.',
description: 'An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported errors. The number of errors has increased for it in the past five minutes. High values indicate that the availability of the service changes too often.',
summary: 'An aggregated API has reported errors.',
},
},
{
@ -91,7 +95,8 @@ local utils = import 'utils.libsonnet';
severity: 'warning',
},
annotations: {
message: 'An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}% available over the last 5m.',
description: 'An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}% available over the last 5m.',
summary: 'An aggregated API is down.',
},
},
(import '../lib/absent_alert.libsonnet') {

View File

@ -22,7 +22,8 @@
severity: 'warning',
},
annotations: {
message: '{{ $labels.node }} has been unready for more than 15 minutes.',
description: '{{ $labels.node }} has been unready for more than 15 minutes.',
summary: 'Node is not ready.',
},
'for': '15m',
alert: 'KubeNodeNotReady',
@ -37,7 +38,8 @@
severity: 'warning',
},
annotations: {
message: '{{ $labels.node }} is unreachable and some workloads may be rescheduled.',
description: '{{ $labels.node }} is unreachable and some workloads may be rescheduled.',
summary: 'Node is unreachable.',
},
alert: 'KubeNodeUnreachable',
},
@ -59,7 +61,8 @@
severity: 'warning',
},
annotations: {
message: "Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage }} of its Pod capacity.",
description: "Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage }} of its Pod capacity.",
summary: 'Kubelet is running at capacity.',
},
},
{
@ -72,7 +75,8 @@
severity: 'warning',
},
annotations: {
message: 'The readiness status of node {{ $labels.node }} has changed {{ $value }} times in the last 15 minutes.',
description: 'The readiness status of node {{ $labels.node }} has changed {{ $value }} times in the last 15 minutes.',
summary: 'Node readiness status is flapping.',
},
},
{
@ -85,7 +89,8 @@
severity: 'warning',
},
annotations: {
message: 'The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration of {{ $value }} seconds on node {{ $labels.node }}.',
description: 'The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration of {{ $value }} seconds on node {{ $labels.node }}.',
summary: 'Kubelet Pod Lifecycle Event Generator is taking too long to relist.',
},
},
{
@ -98,7 +103,8 @@
severity: 'warning',
},
annotations: {
message: 'Kubelet Pod startup 99th percentile latency is {{ $value }} seconds on node {{ $labels.node }}.',
description: 'Kubelet Pod startup 99th percentile latency is {{ $value }} seconds on node {{ $labels.node }}.',
summary: 'Kubelet Pod startup latency is too high.',
},
},
(import '../lib/absent_alert.libsonnet') {

View File

@ -35,7 +35,8 @@
severity: 'warning',
},
annotations: {
message: 'Cluster has overcommitted CPU resource requests for Pods and cannot tolerate node failure.',
description: 'Cluster has overcommitted CPU resource requests for Pods and cannot tolerate node failure.',
summary: 'Cluster has overcommitted CPU resource requests.',
},
'for': '5m',
},
@ -54,7 +55,8 @@
severity: 'warning',
},
annotations: {
message: 'Cluster has overcommitted memory resource requests for Pods and cannot tolerate node failure.',
description: 'Cluster has overcommitted memory resource requests for Pods and cannot tolerate node failure.',
summary: 'Cluster has overcommitted memory resource requests.',
},
'for': '5m',
},
@ -70,7 +72,8 @@
severity: 'warning',
},
annotations: {
message: 'Cluster has overcommitted CPU resource requests for Namespaces.',
description: 'Cluster has overcommitted CPU resource requests for Namespaces.',
summary: 'Cluster has overcommitted CPU resource requests.',
},
'for': '5m',
},
@ -86,7 +89,8 @@
severity: 'warning',
},
annotations: {
message: 'Cluster has overcommitted memory resource requests for Namespaces.',
description: 'Cluster has overcommitted memory resource requests for Namespaces.',
summary: 'Cluster has overcommitted memory resource requests.',
},
'for': '5m',
},
@ -103,7 +107,8 @@
severity: 'info',
},
annotations: {
message: 'Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota.',
description: 'Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota.',
summary: 'Namespace quota is fully used.',
},
},
{
@ -119,7 +124,8 @@
severity: 'info',
},
annotations: {
message: '{{ $value | humanizePercentage }} throttling of CPU in namespace {{ $labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod }}.',
description: '{{ $value | humanizePercentage }} throttling of CPU in namespace {{ $labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod }}.',
summary: 'Processes experience elevated CPU throttling.',
},
},
],

View File

@ -29,7 +29,8 @@
severity: 'critical',
},
annotations: {
message: 'The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is only {{ $value | humanizePercentage }} free.',
description: 'The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is only {{ $value | humanizePercentage }} free.',
summary: 'PersistentVolume is filling up.',
},
},
{
@ -48,7 +49,8 @@
severity: 'warning',
},
annotations: {
message: 'Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is expected to fill up within four days. Currently {{ $value | humanizePercentage }} is available.',
description: 'Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is expected to fill up within four days. Currently {{ $value | humanizePercentage }} is available.',
summary: 'PersistentVolume is filling up.',
},
},
{
@ -61,7 +63,8 @@
severity: 'critical',
},
annotations: {
message: 'The persistent volume {{ $labels.persistentvolume }} has status {{ $labels.phase }}.',
description: 'The persistent volume {{ $labels.persistentvolume }} has status {{ $labels.phase }}.',
summary: 'PersistentVolume is having issues with provisioning.',
},
},
],

View File

@ -18,7 +18,8 @@
severity: 'warning',
},
annotations: {
message: 'There are {{ $value }} different semantic versions of Kubernetes components running.',
description: 'There are {{ $value }} different semantic versions of Kubernetes components running.',
summary: 'Different semantic versions of Kubernetes components running.',
},
},
{
@ -37,7 +38,8 @@
severity: 'warning',
},
annotations: {
message: "Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ $value | humanizePercentage }} errors.'",
description: "Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ $value | humanizePercentage }} errors.'",
summary: 'Kubernetes API server client is experiencing errors.',
},
},
],

View File

@ -247,10 +247,10 @@ local singlestat = grafana.singlestat;
span=12,
),
gridPos={
"h": 2,
"w": 24,
"x": 0,
"y": 0
h: 2,
w: 24,
x: 0,
y: 0,
},
)
.addRow(
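
This hunk only changes notation: in Jsonnet, object keys that are valid identifiers need no quotes, and both spellings evaluate to identical JSON. A quick sketch (illustrative, not from this file):

// Evaluating either object yields {"h": 2, "w": 24, "x": 0, "y": 0}.
local quoted = { 'h': 2, 'w': 24, 'x': 0, 'y': 0 };
local bare = { h: 2, w: 24, x: 0, y: 0 };
assert quoted == bare;
bare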

View File

@ -12,6 +12,7 @@
severity: 'critical',
},
annotations: {
message: '%s has disappeared from Prometheus target discovery.' % absentAlert.componentName,
description: '%s has disappeared from Prometheus target discovery.' % absentAlert.componentName,
summary: 'Target disappeared from Prometheus target discovery.',
},
}
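
The absent_alert library is the shared template behind the "(import '../lib/absent_alert.libsonnet')" blocks in the rule files above; each caller fills in the component. A hedged sketch of how such a template is typically wired up (everything beyond the annotations shown in the diff is an assumption):

{
  local absentAlert = self,
  componentName:: error 'must set componentName',  // assumed required field
  selector:: error 'must set selector',            // assumed required field

  alert: '%sDown' % absentAlert.componentName,
  expr: 'absent(up{%s} == 1)' % absentAlert.selector,
  'for': '15m',
  labels: { severity: 'critical' },
  annotations: {
    description: '%s has disappeared from Prometheus target discovery.' % absentAlert.componentName,
    summary: 'Target disappeared from Prometheus target discovery.',
  },
}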

View File

@ -17,7 +17,8 @@
severity: 'critical',
},
annotations: {
message: 'kube-state-metrics is experiencing errors at an elevated rate in list operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.',
summary: 'kube-state-metrics is experiencing errors in list operations.',
description: 'kube-state-metrics is experiencing errors at an elevated rate in list operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.',
},
},
{
@ -33,7 +34,8 @@
severity: 'critical',
},
annotations: {
message: 'kube-state-metrics is experiencing errors at an elevated rate in watch operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.',
summary: 'kube-state-metrics is experiencing errors in watch operations.',
description: 'kube-state-metrics is experiencing errors at an elevated rate in watch operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.',
},
},
],

View File

@ -1 +1 @@
github.com/coreos/kube-prometheus/jsonnet/kube-prometheus
github.com/prometheus-operator/kube-prometheus/jsonnet/kube-prometheus

View File

@ -1 +1 @@
github.com/coreos/prometheus-operator/jsonnet/prometheus-operator
github.com/prometheus-operator/prometheus-operator/jsonnet/prometheus-operator
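
These two one-line files only record the upstream module paths affected by the org rename; downstream code keeps importing through the vendored path, which is unchanged. A sketch of a typical consumer (entry point and config fields assumed per kube-prometheus of this era):

local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + {
  _config+:: {
    namespace: 'monitoring',  // assumed target namespace
  },
};
kp.prometheus.prometheus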