major monitoring upgrade
continuous-integration/drone/push Build is passing

Tobias Brunner 2021-05-01 16:24:35 +02:00
parent ab98ad2e70
commit d26b64b384
228 changed files with 8917 additions and 7056 deletions

View File

@ -8,7 +8,7 @@
"subdir": "jsonnet/kube-prometheus"
}
},
"version": "master",
"version": "main",
"name": "kube-prometheus"
},
{
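The hunk above is the kube-prometheus entry in jsonnetfile.json moving from the removed master branch to main. As a minimal sketch (assuming the usual jsonnet-bundler layout and the prometheus-operator/kube-prometheus remote, neither of which is shown in this hunk), a full jsonnetfile.json carrying that pin looks roughly like:

{
  "version": 1,
  "dependencies": [
    {
      "source": {
        "git": {
          "remote": "https://github.com/prometheus-operator/kube-prometheus.git",
          "subdir": "jsonnet/kube-prometheus"
        }
      },
      "version": "main"
    }
  ],
  "legacyImports": true
}

Running jb update against such a file is what rewrites the commit SHAs and content sums in the jsonnetfile.lock.json hunks below.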

View File

@ -8,18 +8,18 @@
"subdir": "grafana"
}
},
"version": "8024f4fdaeb3a3a7d72f77e2ed87deb92c79aeda",
"sum": "WXrJQtWuU5lJVc4jXkJGddPMpPP0+4eMcIB5cauZGgM="
"version": "8ea4e7bc04b1bf5e9bd99918ca28c6271b42be0e",
"sum": "muenICtKXABk6MZZHCZD2wCbmtiE96GwWRMGa1Rg+wA="
},
{
"source": {
"git": {
"remote": "https://github.com/etcd-io/etcd",
"subdir": "Documentation/etcd-mixin"
"subdir": "contrib/mixin"
}
},
"version": "ca866c02422ff3f3d1f0876898a30c33dd7bcccf",
"sum": "bLqTqEr0jky9zz5MV/7ucn6H5mph2NlXas0TVnGNB1Y="
"version": "562d645ac923388ff5b8d270b0536764d34b0e0f",
"sum": "W/Azptf1PoqjyMwJON96UY69MFugDA4IAYiKURscryc="
},
{
"source": {
@ -28,8 +28,8 @@
"subdir": "grafonnet"
}
},
"version": "356bd73e4792ffe107725776ca8946895969c191",
"sum": "CSMZ3dJrpJpwvffie8BqcfrIVVwiKNqdPEN+1XWRBGU="
"version": "55cf4ee53ced2b6d3ce96ecce9fb813b4465be98",
"sum": "4/sUV0Kk+o8I+wlYxL9R6EPhL/NiLfYHk+NXlU64RUk="
},
{
"source": {
@ -38,8 +38,8 @@
"subdir": "grafana-builder"
}
},
"version": "216bc806bb512f218e3cf5ed3d4f5699b07f04d6",
"sum": "9/eJqljTTtJeq9QRjabdKWL6yD8a7VzLmGKBK3ir77k="
"version": "dbf1211d003d20c7adcdee942c477e648507a398",
"sum": "GRf2GvwEU4jhXV+JOonXSZ4wdDv8mnHBPCQ6TUVd+g8="
},
{
"source": {
@ -59,8 +59,8 @@
"subdir": ""
}
},
"version": "ead45674dba3c8712e422d99223453177aac6bf4",
"sum": "3i0NkntlBluDS1NRF+iSc2e727Alkv3ziuVjAP12/kE="
"version": "c67c0f19e869f1da34d79b6507c1fa37c23a6e4e",
"sum": "F+RxcI26zeoeI81uot39Jv6IpQ6BOz+xlSHlElJYsz8="
},
{
"source": {
@ -69,7 +69,7 @@
"subdir": "lib/promgrafonnet"
}
},
"version": "ead45674dba3c8712e422d99223453177aac6bf4",
"version": "39a9cda705b5201c35105bd1f24c83923fa839ef",
"sum": "zv7hXGui6BfHzE9wPatHI/AGZa4A2WKo6pq7ZdqBsps="
},
{
@ -79,8 +79,8 @@
"subdir": "jsonnet/kube-state-metrics"
}
},
"version": "89aaf6c524ee891140c4c8f2a05b1b16f5847309",
"sum": "zD/pbQLnQq+5hegEelaheHS8mn1h09GTktFO74iwlBI="
"version": "b1889aa1561ee269f628e2b9659155e7714dbbf0",
"sum": "S5qI+PJUdNeYOv76jH5nxwYS9N6U7CRxvyuB1wI4cTE="
},
{
"source": {
@ -89,7 +89,7 @@
"subdir": "jsonnet/kube-state-metrics-mixin"
}
},
"version": "7bdd62593c9273b5179cf3c9d2d819e9d997aaa4",
"version": "b1889aa1561ee269f628e2b9659155e7714dbbf0",
"sum": "Yf8mNAHrV1YWzrdV8Ry5dJ8YblepTGw3C0Zp10XIYLo="
},
{
@ -99,8 +99,8 @@
"subdir": "jsonnet/kube-prometheus"
}
},
"version": "7d7d40b4dee70ecd3328dcdee2ed0cc8f806df93",
"sum": "6PhhQPWilq4skfe+z/hXKEg1pRqHnwvMR1Au6W136U0="
"version": "5b2740d517095a6ae9ad51bcb9c53e5ef28c62a0",
"sum": "+6VkkR44AC3Qnwfr9cWYCKs+uRi5JaIOda/3X1JEzAg="
},
{
"source": {
@ -109,8 +109,9 @@
"subdir": "jsonnet/mixin"
}
},
"version": "117c9a2cd905479022a66ddd92a41f599cccf10d",
"sum": "6reUygVmQrLEWQzTKcH8ceDbvM+2ztK3z2VBR2K2l+U="
"version": "b7ca32169844f0b5143f3e5e318fc05fa025df18",
"sum": "6reUygVmQrLEWQzTKcH8ceDbvM+2ztK3z2VBR2K2l+U=",
"name": "prometheus-operator-mixin"
},
{
"source": {
@ -119,8 +120,8 @@
"subdir": "jsonnet/prometheus-operator"
}
},
"version": "d8b7d3766225908d0239fd0d78258892cd0fc384",
"sum": "Nl+N/h76bzD9tZ8tx7tuNIKHwCIJ9zyOsAWplH8HvAE="
"version": "b7ca32169844f0b5143f3e5e318fc05fa025df18",
"sum": "MRwyChXdKG3anL2OWpbUu3qWc97w9J6YsjUWjLFQyB0="
},
{
"source": {
@ -129,8 +130,8 @@
"subdir": "doc/alertmanager-mixin"
}
},
"version": "193ebba04d1e70d971047e983a0b489112610460",
"sum": "QcftU7gjCQyj7B6M4YJeCAeaPd0kwxd4J4rolo7AnLE=",
"version": "99f64e944b1043c790784cf5373c8fb349816fc4",
"sum": "V8jcZQ1Qrlm7AQ6wjbuQQsacPb0NvrcZovKyplmzW5w=",
"name": "alertmanager"
},
{
@ -140,8 +141,8 @@
"subdir": "docs/node-mixin"
}
},
"version": "8b466360a35581e0301bd22918be7011cf4203c3",
"sum": "rvyiD/yCB4BeYAWqYF53bP8c+aCUt2ipLHW2Ea8ELO8="
"version": "b597c1244d7bef49e6f3359c87a56dd7707f6719",
"sum": "cZTNXQMUCLB5FGYpMn845dcqGdkcYt58qCqOFIV/BoQ="
},
{
"source": {
@ -150,8 +151,8 @@
"subdir": "documentation/prometheus-mixin"
}
},
"version": "26d89b4b0776fe4cd5a3656dfa520f119a375273",
"sum": "1VRVMuxAEZ9vdGHFlndmG9iQzDD6AoIXrX80CDpGDaU=",
"version": "3cafc58827d1ebd1a67749f88be4218f0bab3d8d",
"sum": "VK0c3sQ3ksiM6JQsAVfWmL5NbzGv9llMfXFNXfFdJ+A=",
"name": "prometheus"
},
{
@ -161,8 +162,9 @@
"subdir": "mixin"
}
},
"version": "37e6ef61566c7c70793ba6d128f00c4c66cb2402",
"sum": "OptiWUMOHFrRGTZhSfxV1RCeXZ90qsefGNTD4lDYVG0="
"version": "ba6c5c4726ff52807c7383c68f2159b1af7980bb",
"sum": "XP3uq7xcfKHsnWsz1v992csZhhZR3jQma6hFOfSViTs=",
"name": "thanos-mixin"
},
{
"source": {

View File

@ -72,7 +72,7 @@ local masterIP = '185.95.218.11';
{
port: 'http-metrics',
interval: '30s',
metricRelabelings: (import 'kube-prometheus/dropping-deprecated-metrics-relabelings.libsonnet') + [
metricRelabelings: (import 'kube-prometheus/addons/dropping-deprecated-metrics-relabelings.libsonnet') + [
{
sourceLabels: ['__name__'],
regex: 'etcd_(debugging|disk|request|server).*',
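The import path changes because the kube-prometheus main branch groups helper libraries like this one under addons/. A short, hypothetical jsonnet sketch of the same pattern in isolation (the action: 'drop' field is assumed here, since the hunk is cut off above):

local dropDeprecated = import 'kube-prometheus/addons/dropping-deprecated-metrics-relabelings.libsonnet';
{
  // the libsonnet file evaluates to a plain list of relabel configs,
  // so it can be concatenated with endpoint-specific rules
  endpoints: [{
    port: 'http-metrics',
    interval: '30s',
    metricRelabelings: dropDeprecated + [
      { sourceLabels: ['__name__'], regex: 'etcd_(debugging|disk|request|server).*', action: 'drop' },
    ],
  }],
}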

View File

@ -3,17 +3,34 @@ kind: Alertmanager
metadata:
labels:
alertmanager: main
app.kubernetes.io/component: alert-router
app.kubernetes.io/name: alertmanager
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.21.0
name: main
namespace: monitoring
namespace: default
spec:
configSecret: alertmanager-tbrnt-config
image: quay.io/prometheus/alertmanager:v0.21.0
nodeSelector:
kubernetes.io/os: linux
replicas: 1
podMetadata:
labels:
app.kubernetes.io/component: alert-router
app.kubernetes.io/name: alertmanager
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.21.0
replicas: 3
resources:
limits:
cpu: 100m
memory: 100Mi
requests:
cpu: 4m
memory: 100Mi
securityContext:
fsGroup: 2000
runAsNonRoot: true
runAsUser: 1000
serviceAccountName: alertmanager-main
version: v0.21.0
version: 0.21.0

View File

@ -0,0 +1,18 @@
apiVersion: policy/v1beta1
kind: PodDisruptionBudget
metadata:
labels:
app.kubernetes.io/component: alert-router
app.kubernetes.io/name: alertmanager
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.21.0
name: alertmanager-main
namespace: default
spec:
maxUnavailable: 1
selector:
matchLabels:
alertmanager: main
app.kubernetes.io/component: alert-router
app.kubernetes.io/name: alertmanager
app.kubernetes.io/part-of: kube-prometheus

View File

@ -0,0 +1,156 @@
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
labels:
app.kubernetes.io/component: alert-router
app.kubernetes.io/name: alertmanager
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.21.0
prometheus: k8s
role: alert-rules
name: alertmanager-main-rules
namespace: default
spec:
groups:
- name: alertmanager.rules
rules:
- alert: AlertmanagerFailedReload
annotations:
description: Configuration has failed to load for {{ $labels.namespace }}/{{
$labels.pod}}.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/alertmanagerfailedreload
summary: Reloading an Alertmanager configuration has failed.
expr: |
# Without max_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
max_over_time(alertmanager_config_last_reload_successful{job="alertmanager-main",namespace="default"}[5m]) == 0
for: 10m
labels:
severity: critical
- alert: AlertmanagerMembersInconsistent
annotations:
description: Alertmanager {{ $labels.namespace }}/{{ $labels.pod}} has only
found {{ $value }} members of the {{$labels.job}} cluster.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/alertmanagermembersinconsistent
summary: A member of an Alertmanager cluster has not found all other cluster
members.
expr: |
# Without max_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
max_over_time(alertmanager_cluster_members{job="alertmanager-main",namespace="default"}[5m])
< on (namespace,service) group_left
count by (namespace,service) (max_over_time(alertmanager_cluster_members{job="alertmanager-main",namespace="default"}[5m]))
for: 10m
labels:
severity: critical
- alert: AlertmanagerFailedToSendAlerts
annotations:
description: Alertmanager {{ $labels.namespace }}/{{ $labels.pod}} failed
to send {{ $value | humanizePercentage }} of notifications to {{ $labels.integration
}}.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/alertmanagerfailedtosendalerts
summary: An Alertmanager instance failed to send notifications.
expr: |
(
rate(alertmanager_notifications_failed_total{job="alertmanager-main",namespace="default"}[5m])
/
rate(alertmanager_notifications_total{job="alertmanager-main",namespace="default"}[5m])
)
> 0.01
for: 5m
labels:
severity: warning
- alert: AlertmanagerClusterFailedToSendAlerts
annotations:
description: The minimum notification failure rate to {{ $labels.integration
}} sent from any instance in the {{$labels.job}} cluster is {{ $value |
humanizePercentage }}.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/alertmanagerclusterfailedtosendalerts
summary: All Alertmanager instances in a cluster failed to send notifications
to a critical integration.
expr: |
min by (namespace,service, integration) (
rate(alertmanager_notifications_failed_total{job="alertmanager-main",namespace="default", integration=~`.*`}[5m])
/
rate(alertmanager_notifications_total{job="alertmanager-main",namespace="default", integration=~`.*`}[5m])
)
> 0.01
for: 5m
labels:
severity: critical
- alert: AlertmanagerClusterFailedToSendAlerts
annotations:
description: The minimum notification failure rate to {{ $labels.integration
}} sent from any instance in the {{$labels.job}} cluster is {{ $value |
humanizePercentage }}.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/alertmanagerclusterfailedtosendalerts
summary: All Alertmanager instances in a cluster failed to send notifications
to a non-critical integration.
expr: |
min by (namespace,service, integration) (
rate(alertmanager_notifications_failed_total{job="alertmanager-main",namespace="default", integration!~`.*`}[5m])
/
rate(alertmanager_notifications_total{job="alertmanager-main",namespace="default", integration!~`.*`}[5m])
)
> 0.01
for: 5m
labels:
severity: warning
- alert: AlertmanagerConfigInconsistent
annotations:
description: Alertmanager instances within the {{$labels.job}} cluster have
different configurations.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/alertmanagerconfiginconsistent
summary: Alertmanager instances within the same cluster have different configurations.
expr: |
count by (namespace,service) (
count_values by (namespace,service) ("config_hash", alertmanager_config_hash{job="alertmanager-main",namespace="default"})
)
!= 1
for: 20m
labels:
severity: critical
- alert: AlertmanagerClusterDown
annotations:
description: '{{ $value | humanizePercentage }} of Alertmanager instances
within the {{$labels.job}} cluster have been up for less than half of the
last 5m.'
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/alertmanagerclusterdown
summary: Half or more of the Alertmanager instances within the same cluster
are down.
expr: |
(
count by (namespace,service) (
avg_over_time(up{job="alertmanager-main",namespace="default"}[5m]) < 0.5
)
/
count by (namespace,service) (
up{job="alertmanager-main",namespace="default"}
)
)
>= 0.5
for: 5m
labels:
severity: critical
- alert: AlertmanagerClusterCrashlooping
annotations:
description: '{{ $value | humanizePercentage }} of Alertmanager instances
within the {{$labels.job}} cluster have restarted at least 5 times in the
last 10m.'
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/alertmanagerclustercrashlooping
summary: Half or more of the Alertmanager instances within the same cluster
are crashlooping.
expr: |
(
count by (namespace,service) (
changes(process_start_time_seconds{job="alertmanager-main",namespace="default"}[10m]) > 4
)
/
count by (namespace,service) (
up{job="alertmanager-main",namespace="default"}
)
)
>= 0.5
for: 5m
labels:
severity: critical
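Worked through with the replicas: 3 Alertmanager above: the two cluster-wide alerts at the end divide the number of affected pods by the total, so they first fire when 2 of the 3 instances are down or restarting (2/3 >= 0.5), while a single misbehaving pod (1/3 < 0.5) stays below the threshold.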

View File

@ -1,8 +1,14 @@
apiVersion: v1
kind: Secret
metadata:
labels:
alertmanager: main
app.kubernetes.io/component: alert-router
app.kubernetes.io/name: alertmanager
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.21.0
name: alertmanager-main
namespace: monitoring
namespace: default
stringData:
alertmanager.yaml: |-
"global":

View File

@ -3,8 +3,12 @@ kind: Service
metadata:
labels:
alertmanager: main
app.kubernetes.io/component: alert-router
app.kubernetes.io/name: alertmanager
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.21.0
name: alertmanager-main
namespace: monitoring
namespace: default
spec:
ports:
- name: web
@ -13,4 +17,7 @@ spec:
selector:
alertmanager: main
app: alertmanager
app.kubernetes.io/component: alert-router
app.kubernetes.io/name: alertmanager
app.kubernetes.io/part-of: kube-prometheus
sessionAffinity: ClientIP

View File

@ -1,5 +1,11 @@
apiVersion: v1
kind: ServiceAccount
metadata:
labels:
alertmanager: main
app.kubernetes.io/component: alert-router
app.kubernetes.io/name: alertmanager
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.21.0
name: alertmanager-main
namespace: monitoring
namespace: default

View File

@ -2,9 +2,12 @@ apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
labels:
k8s-app: alertmanager
app.kubernetes.io/component: alert-router
app.kubernetes.io/name: alertmanager
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.21.0
name: alertmanager
namespace: monitoring
namespace: default
spec:
endpoints:
- interval: 30s
@ -12,3 +15,6 @@ spec:
selector:
matchLabels:
alertmanager: main
app.kubernetes.io/component: alert-router
app.kubernetes.io/name: alertmanager
app.kubernetes.io/part-of: kube-prometheus

View File

@ -6,11 +6,12 @@ metadata:
namespace: monitoring
spec:
encryptedData:
alertmanager.yaml: AgDHh1Qgrdffl6IFXJuk3ZzNHGARWZLDzbMLKp5Wo/ZYqclCji4T5wx7Fql6FALCvvUBvThxLfiwM2LQHRcWfWnf5AwxipCxpos9oVvlu4aON0WZd/Kjz/ZwDq5pgR/etCrSA2DYNxYq4vnTxUAk29eY5F4dWwRPcCgVZ5/KsTIcNx4x+4einqQbwAhkUtAwQl5fEPmpwNRquJZM29XIEUxZLWne0YmCmEgNGleUU20ByfYMwgtDJGjgr6XNPtTmByAHVrxNuQwAMxgT6GcfDLCNsByaS6CY3wmSTg1GUv/CG8Xx29FwDWyf1Ly2KbmcAAafN5QJGvCCTEt/WB85GtzQisrWFZTykv3Zjuz101p9ShXQZALylaX5h22hHFXuQyiIQZEeM2ixiYQjcPhiPjx1/hkbQ25QRD73/gjalZO8bprDrJxkLlw+hrgJ0LzxWL881U6INLKow+8/GmLleFhMUXRsGqacLreCIAr4uVGEMGMVLhHJKnj597HRnn0UCxVNkDk8QjHyiVgJBrQ3Pz9SFdF7mxvJ9F4rEgGkE4dvfvWxrZFumTLEkVRF9To+rKxsIVkewvoHtN/gMzFMzumP+fz/oB9yAHsxkwVyfqXBg52hNSYIx5Z/67yy3hDRKPBcZgknf9S+F37ET5BABFxazwG8NJjf4td+UsAGuAMzKI/94u7TxuXLPCs/tIGKD7kJnPxAqpalepzABtVCmOrtWwNPb1h4XeuraUS9beJ2zV9oV5nVFJmX94EJ7qpZt0Um7+GGeavQ5SV3XHRolDS5PpZPTAWnc/1rtZ0nsKk8lllEr3aDWveMXma06NKkIXz8+iAonvHsDZuw0W6jUdUUtraIbSua9YkyugqCBGeeXIPLwFxqJTqIX5vedZVMveFiaxtCJjL48SUGxtyugfiYbPa3xpHWWe22BcJyTmAOG9aIq4Tp4nvftLyvWe7c9PotJk/7gdv1IO4RLx//eLtKWw0uADa0ara4hDuI8Yktlti24TlA9XYz00d5WtE+lJsSZN8547BUfFzXSOZSSbfrFLZmEmBTgkbj4szX19bXSctJN3BtOmRfCEPXYQN10HgnhpwqYHbXKUSTZNWLojnFL1/E56wUXGxRg9NGOwSXzTyfoLGxI9NEQpGc0Rj2Wna+JSUhlAUnfYW1eH8yyg5FfkyhQdyZJFvYfF0rk+XG5XNhLumST19uxrAkMWhk+Z9/eWwOaZQMmDcoi2Rs0za+1GGjPW5k56Ip+spwW5cvYmdl1PgkZ4g1mupjiB0FdgZHGR+kGn1lbPtSUd+amh9PXSDWkqfnix62H7374rQ3ZyG7fs9sQNnnRrd/cDCMxAl5Upk8D9dfxRmvuxRd8b89h7EQwUBML7TIriA2Pci5Ftux2R5wyIXjznLC5/kFZg6/Av3uKmKK6dLR2Ooey7/3g14CEjMumdijjySl8Pd2UUxSKVKD7vkq+3xYm0CJZqVvT/iBOccrv0UEiTHBsXrfaugUvqIKTAGYhJy0fUBXKisPdA0HdzrUmx57Du36TGyuEzGtVuDarcWzQYPqKJxOIuofJ+AGTDY53OjdUJ8pwJD6HDz55tu85gaV6ZOvSYqjqeX2FUe7lPhsGUIh/FemfichpypHyFpPYhkwAIO1AinKvsqjUuDXE6n5b7NMbI1gl87fPqT5wUSKXZqwViyFqUA5DFqPTEqvHIGU5Wz0GajEaQ==
alertmanager.yaml: AgBj3KqLiF7EAnGK6c4+Thferv3Sur+fhlwE4wpXD1PBtwlJQsMCqjsLRRFNAESH9/8vhI9E9D8wLJiauNS7CGw5jd5KU1cvxo5EyGeFoVyAB4bHSy/pxptSFq+rn00E99/Tqkbdsgduwusfpi0I9F1+zNucyyamJsEzIcsyHMlBbACz+9KQV9SdbgVEmIeqabrAP9VQaQ+i69yurhPdV7VkZzr0WKcGg3x27+slmtjlJz5fwtv1qmbYt/MQnijF2tc6tJeq19Cm0O4zuQ09meW6DwAZ9SOIFU6LxrqJlKbuleaWmfIE3AQYA6Z+qXyBjT1ILW36RwGyg3YK7nm0MNDQxd6LN3zR0eifPqrPsm7O6LE+NAg4FkurV3lJlrBoU3lSSc+sQZZr00ct9Gp57EEvg9T2TaM1B/KHQNmIhpDGntD4+yTcvK3nU7+sxqG/c4Wk5xiUQyLYnigNy5qYCcsM+t9iCoGxP7uU8GrsvIkojTxzhdc6e5LduThKdGE9jI3R6nCP6kmsU6XyUzgKmxYJVVzhSrm9yxFVDPHriNaEM2hgEd3wStwmRjGjwAPUjQZfSJtmxY6+RQ/77TYGjiskDm6gAZuzkGjdptL2t5F+54y3uePaLHNspMgtZsTARCo3kAhgf61Gk2nvnEY/ws5qFjAnsUEXs86wAk2S+w401QKPKTcDr6e/rnve8IrXW0FPvzR3rzdWOcU8v0Z0sSFijIfXdx+A9WGCJuHNo65FbKSgWhlHBfvWB1qWBnDd/VVHIA2wR8gevAPJHSc0f1WdUDc4w2w8tc/qum1SZo2lWkMopvLaiVHU5dCGtG6+4qsC1DmFzIRGZN4AbdVd5k+OY15Fp+ysjuTpA/HuZ/N0kz/5BXzNbY3u7YKi3EV3Up+eZIi2jlG0XBXWdCouuxRW40qHuShivtbrBgey32kFo85dsrDqN7F2kBVnumOB7kvFOaCkL2AtsakVjUzGoh5eXCSHl0ZcqmW2UjInzZIirBChMW/G4yL/TwpVYbBLqWPfdVMFmq7I4srY2+hUP/5UBt/DKZi5zPlLR8H3q4i02zsNpqdhSa9o6ThhFtVX9/te/DMpyN1fJ1Hn2p3cDhoTsiTLPkvflVOx70flap0v2zzPoDm+yXhFllpWp/5avHy9pKf/RzpAodbNr/EydkC+KDKI88MhVUtxS27WbKFsq+vUkmHQj+KtGyRFjg2/CnmM8YbdRsMe8p39PVGLxj1RTnyYzlMltOTbJo3rhDzjmpzGVUpWokwTMGC1WgTenrS4IcCK61ri9bsBIL9n9sMLF1lT8NVKnQfluDTaHNzsQgJ1HTSwQOcAfugqlUrSeTLt3q6U4pSjjlF8P7wYpqzWc+bhOaHed9NxrGXFBC5Wh6+BULuCaCA6TtkLpUfABYHVUa4OS3huNsOeBhZ3aCCQXrc0jOOq2DQzxvdGu4YAQnvMHwJRVyKVcw0pOS5RjIqJW6IOn0MGHzAo7qNv6LUyJ9a7huT2W4ibrHFkMck1zKxbBekPQ9FxpufSXrEqEqNuB3j7Gi7lVDVbPySr1rr2KXLzOLsnZhpTpMq2RejglIAMF7WfIMfvHQ2mnjNuYNNQnXx8hPLm88GSxFYKHpUnAswgYuo4XX2drYMzzq3GWDMIHZ/kpLySU+eJGo6VGeFUV1DgaGksLXE3oCfrA1OCUyZ/qke3tzj8ixjwuprCmFPWsg==
template:
metadata:
creationTimestamp: null
name: alertmanager-tbrnt-config
namespace: monitoring
type: Opaque
status: {}

View File

@ -1,8 +1,13 @@
apiVersion: v1
data:
datasources.yaml: ewogICAgImFwaVZlcnNpb24iOiAxLAogICAgImRhdGFzb3VyY2VzIjogWwogICAgICAgIHsKICAgICAgICAgICAgImFjY2VzcyI6ICJwcm94eSIsCiAgICAgICAgICAgICJlZGl0YWJsZSI6IGZhbHNlLAogICAgICAgICAgICAibmFtZSI6ICJwcm9tZXRoZXVzIiwKICAgICAgICAgICAgIm9yZ0lkIjogMSwKICAgICAgICAgICAgInR5cGUiOiAicHJvbWV0aGV1cyIsCiAgICAgICAgICAgICJ1cmwiOiAiaHR0cDovL3Byb21ldGhldXMtazhzLm1vbml0b3Jpbmcuc3ZjOjkwOTAiLAogICAgICAgICAgICAidmVyc2lvbiI6IDEKICAgICAgICB9LAogICAgICAgIHsKICAgICAgICAgICAgImFjY2VzcyI6ICJwcm94eSIsCiAgICAgICAgICAgICJlZGl0YWJsZSI6IGZhbHNlLAogICAgICAgICAgICAibmFtZSI6ICJMb2tpIiwKICAgICAgICAgICAgIm9yZ0lkIjogMSwKICAgICAgICAgICAgInR5cGUiOiAibG9raSIsCiAgICAgICAgICAgICJ1cmwiOiAiaHR0cDovL2xva2kubG9raTozMTAwIiwKICAgICAgICAgICAgInZlcnNpb24iOiAxCiAgICAgICAgfQogICAgXQp9
datasources.yaml: ewogICAgImFwaVZlcnNpb24iOiAxLAogICAgImRhdGFzb3VyY2VzIjogWwogICAgICAgIHsKICAgICAgICAgICAgImFjY2VzcyI6ICJwcm94eSIsCiAgICAgICAgICAgICJlZGl0YWJsZSI6IGZhbHNlLAogICAgICAgICAgICAibmFtZSI6ICJwcm9tZXRoZXVzIiwKICAgICAgICAgICAgIm9yZ0lkIjogMSwKICAgICAgICAgICAgInR5cGUiOiAicHJvbWV0aGV1cyIsCiAgICAgICAgICAgICJ1cmwiOiAiaHR0cDovL3Byb21ldGhldXMtazhzLmRlZmF1bHQuc3ZjOjkwOTAiLAogICAgICAgICAgICAidmVyc2lvbiI6IDEKICAgICAgICB9CiAgICBdCn0=
kind: Secret
metadata:
labels:
app.kubernetes.io/component: grafana
app.kubernetes.io/name: grafana
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 7.5.4
name: grafana-datasources
namespace: monitoring
namespace: default
type: Opaque

File diff suppressed because it is too large

View File

@ -17,5 +17,10 @@ data:
}
kind: ConfigMap
metadata:
labels:
app.kubernetes.io/component: grafana
app.kubernetes.io/name: grafana
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 7.5.4
name: grafana-dashboards
namespace: monitoring
namespace: default

View File

@ -2,26 +2,32 @@ apiVersion: apps/v1
kind: Deployment
metadata:
labels:
app: grafana
app.kubernetes.io/component: grafana
app.kubernetes.io/name: grafana
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 7.5.4
name: grafana
namespace: monitoring
namespace: default
spec:
replicas: 1
selector:
matchLabels:
app: grafana
app.kubernetes.io/component: grafana
app.kubernetes.io/name: grafana
app.kubernetes.io/part-of: kube-prometheus
template:
metadata:
annotations:
checksum/grafana-datasources: 7103d054a6e94f976ca59b4ede77cf88
checksum/grafana-datasources: b822d7b1a1070f322d0773c043985b4a
labels:
app: grafana
app.kubernetes.io/component: grafana
app.kubernetes.io/name: grafana
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 7.5.4
spec:
containers:
- env:
- name: GF_INSTALL_PLUGINS
value: grafana-piechart-panel
image: grafana/grafana:7.3.5
- env: []
image: grafana/grafana:7.5.4
name: grafana
ports:
- containerPort: 3000
@ -113,9 +119,6 @@ spec:
- mountPath: /grafana-dashboard-definitions/0/statefulset
name: grafana-dashboard-statefulset
readOnly: false
- mountPath: /grafana-dashboard-definitions/0/traefik
name: grafana-dashboard-traefik
readOnly: false
- mountPath: /grafana-dashboard-definitions/0/workload-total
name: grafana-dashboard-workload-total
readOnly: false
@ -201,9 +204,6 @@ spec:
- configMap:
name: grafana-dashboard-statefulset
name: grafana-dashboard-statefulset
- configMap:
name: grafana-dashboard-traefik
name: grafana-dashboard-traefik
- configMap:
name: grafana-dashboard-workload-total
name: grafana-dashboard-workload-total
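A hedged note on the checksum/grafana-datasources annotation bump earlier in this file: the annotation presumably carries a hash of the grafana-datasources secret, so the datasource change in the secret diff above forces the Deployment to roll its pods whenever that payload changes.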

View File

@ -2,14 +2,18 @@ apiVersion: v1
kind: Service
metadata:
labels:
app: grafana
app.kubernetes.io/component: grafana
app.kubernetes.io/name: grafana
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 7.5.4
name: grafana
namespace: monitoring
namespace: default
spec:
ports:
- name: http
port: 3000
targetPort: http
selector:
app: grafana
type: NodePort
app.kubernetes.io/component: grafana
app.kubernetes.io/name: grafana
app.kubernetes.io/part-of: kube-prometheus

View File

@ -2,4 +2,4 @@ apiVersion: v1
kind: ServiceAccount
metadata:
name: grafana
namespace: monitoring
namespace: default

View File

@ -1,12 +1,17 @@
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
labels:
app.kubernetes.io/component: grafana
app.kubernetes.io/name: grafana
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 7.5.4
name: grafana
namespace: monitoring
namespace: default
spec:
endpoints:
- interval: 15s
port: http
selector:
matchLabels:
app: grafana
app.kubernetes.io/name: grafana

View File

@ -13,4 +13,5 @@ spec:
name: healthchecks-io
namespace: monitoring
type: Opaque
status: {}

View File

@ -2,8 +2,10 @@ apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/version: v1.9.7
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.0.0
name: kube-state-metrics
rules:
- apiGroups:
@ -24,16 +26,6 @@ rules:
verbs:
- list
- watch
- apiGroups:
- extensions
resources:
- daemonsets
- deployments
- replicasets
- ingresses
verbs:
- list
- watch
- apiGroups:
- apps
resources:
@ -105,6 +97,14 @@ rules:
- networking.k8s.io
resources:
- networkpolicies
- ingresses
verbs:
- list
- watch
- apiGroups:
- coordination.k8s.io
resources:
- leases
verbs:
- list
- watch

View File

@ -2,8 +2,10 @@ apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/version: v1.9.7
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.0.0
name: kube-state-metrics
roleRef:
apiGroup: rbac.authorization.k8s.io
@ -12,4 +14,4 @@ roleRef:
subjects:
- kind: ServiceAccount
name: kube-state-metrics
namespace: monitoring
namespace: default

View File

@ -2,20 +2,28 @@ apiVersion: apps/v1
kind: Deployment
metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/version: v1.9.7
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.0.0
name: kube-state-metrics
namespace: monitoring
namespace: default
spec:
replicas: 1
selector:
matchLabels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/part-of: kube-prometheus
template:
metadata:
annotations:
kubectl.kubernetes.io/default-container: kube-state-metrics
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/version: v1.9.7
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.0.0
spec:
containers:
- args:
@ -23,8 +31,17 @@ spec:
- --port=8081
- --telemetry-host=127.0.0.1
- --telemetry-port=8082
image: quay.io/coreos/kube-state-metrics:v1.9.7
image: k8s.gcr.io/kube-state-metrics/kube-state-metrics:v2.0.0
name: kube-state-metrics
resources:
limits:
cpu: 100m
memory: 250Mi
requests:
cpu: 10m
memory: 190Mi
securityContext:
runAsUser: 65534
- args:
- --logtostderr
- --secure-listen-address=:8443
@ -35,6 +52,13 @@ spec:
ports:
- containerPort: 8443
name: https-main
resources:
limits:
cpu: 40m
memory: 40Mi
requests:
cpu: 20m
memory: 20Mi
securityContext:
runAsGroup: 65532
runAsNonRoot: true
@ -49,6 +73,13 @@ spec:
ports:
- containerPort: 9443
name: https-self
resources:
limits:
cpu: 20m
memory: 40Mi
requests:
cpu: 10m
memory: 20Mi
securityContext:
runAsGroup: 65532
runAsNonRoot: true

View File

@ -0,0 +1,46 @@
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.0.0
prometheus: k8s
role: alert-rules
name: kube-state-metrics-rules
namespace: default
spec:
groups:
- name: kube-state-metrics
rules:
- alert: KubeStateMetricsListErrors
annotations:
description: kube-state-metrics is experiencing errors at an elevated rate
in list operations. This is likely causing it to not be able to expose metrics
about Kubernetes objects correctly or at all.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubestatemetricslisterrors
summary: kube-state-metrics is experiencing errors in list operations.
expr: |
(sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m]))
/
sum(rate(kube_state_metrics_list_total{job="kube-state-metrics"}[5m])))
> 0.01
for: 15m
labels:
severity: critical
- alert: KubeStateMetricsWatchErrors
annotations:
description: kube-state-metrics is experiencing errors at an elevated rate
in watch operations. This is likely causing it to not be able to expose
metrics about Kubernetes objects correctly or at all.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubestatemetricswatcherrors
summary: kube-state-metrics is experiencing errors in watch operations.
expr: |
(sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m]))
/
sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics"}[5m])))
> 0.01
for: 15m
labels:
severity: critical

View File

@ -2,10 +2,12 @@ apiVersion: v1
kind: Service
metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/version: v1.9.7
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.0.0
name: kube-state-metrics
namespace: monitoring
namespace: default
spec:
clusterIP: None
ports:
@ -16,4 +18,6 @@ spec:
port: 9443
targetPort: https-self
selector:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/part-of: kube-prometheus

View File

@ -2,7 +2,9 @@ apiVersion: v1
kind: ServiceAccount
metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/version: v1.9.7
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.0.0
name: kube-state-metrics
namespace: monitoring
namespace: default

View File

@ -2,10 +2,12 @@ apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/version: 1.9.7
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.0.0
name: kube-state-metrics
namespace: monitoring
namespace: default
spec:
endpoints:
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
@ -28,4 +30,6 @@ spec:
jobLabel: app.kubernetes.io/name
selector:
matchLabels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/part-of: kube-prometheus

View File

@ -1,6 +1,11 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: node-exporter
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 1.1.2
name: node-exporter
rules:
- apiGroups:

View File

@ -1,6 +1,11 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: node-exporter
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 1.1.2
name: node-exporter
roleRef:
apiGroup: rbac.authorization.k8s.io
@ -9,4 +14,4 @@ roleRef:
subjects:
- kind: ServiceAccount
name: node-exporter
namespace: monitoring
namespace: default

View File

@ -2,30 +2,37 @@ apiVersion: apps/v1
kind: DaemonSet
metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: node-exporter
app.kubernetes.io/version: v1.0.1
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 1.1.2
name: node-exporter
namespace: monitoring
namespace: default
spec:
selector:
matchLabels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: node-exporter
app.kubernetes.io/part-of: kube-prometheus
template:
metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: node-exporter
app.kubernetes.io/version: v1.0.1
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 1.1.2
spec:
containers:
- args:
- --web.listen-address=127.0.0.1:9100
- --path.procfs=/host/proc
- --path.sysfs=/host/sys
- --path.rootfs=/host/root
- --no-collector.wifi
- --no-collector.hwmon
- --collector.filesystem.ignored-mount-points=^/(dev|proc|sys|var/lib/docker/.+|var/lib/kubelet/pods/.+)($|/)
image: quay.io/prometheus/node-exporter:v1.0.1
- --collector.netclass.ignored-devices=^(veth.*)$
- --collector.netdev.device-exclude=^(veth.*)$
image: quay.io/prometheus/node-exporter:v1.1.2
name: node-exporter
resources:
limits:
@ -35,10 +42,6 @@ spec:
cpu: 102m
memory: 180Mi
volumeMounts:
- mountPath: /host/proc
mountPropagation: HostToContainer
name: proc
readOnly: true
- mountPath: /host/sys
mountPropagation: HostToContainer
name: sys
@ -85,9 +88,6 @@ spec:
tolerations:
- operator: Exists
volumes:
- hostPath:
path: /proc
name: proc
- hostPath:
path: /sys
name: sys

View File

@ -0,0 +1,301 @@
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: node-exporter
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 1.1.2
prometheus: k8s
role: alert-rules
name: node-exporter-rules
namespace: default
spec:
groups:
- name: node-exporter
rules:
- alert: NodeFilesystemSpaceFillingUp
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }}
has only {{ printf "%.2f" $value }}% available space left and is filling
up.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodefilesystemspacefillingup
summary: Filesystem is predicted to run out of space within the next 24 hours.
expr: |
(
node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 40
and
predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!=""}[6h], 24*60*60) < 0
and
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
)
for: 1h
labels:
severity: warning
- alert: NodeFilesystemSpaceFillingUp
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }}
has only {{ printf "%.2f" $value }}% available space left and is filling
up fast.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodefilesystemspacefillingup
summary: Filesystem is predicted to run out of space within the next 4 hours.
expr: |
(
node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 15
and
predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!=""}[6h], 4*60*60) < 0
and
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
)
for: 1h
labels:
severity: critical
- alert: NodeFilesystemAlmostOutOfSpace
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }}
has only {{ printf "%.2f" $value }}% available space left.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodefilesystemalmostoutofspace
summary: Filesystem has less than 5% space left.
expr: |
(
node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 5
and
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
)
for: 1h
labels:
severity: warning
- alert: NodeFilesystemAlmostOutOfSpace
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }}
has only {{ printf "%.2f" $value }}% available space left.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodefilesystemalmostoutofspace
summary: Filesystem has less than 3% space left.
expr: |
(
node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 3
and
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
)
for: 1h
labels:
severity: critical
- alert: NodeFilesystemFilesFillingUp
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }}
has only {{ printf "%.2f" $value }}% available inodes left and is filling
up.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodefilesystemfilesfillingup
summary: Filesystem is predicted to run out of inodes within the next 24 hours.
expr: |
(
node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 40
and
predict_linear(node_filesystem_files_free{job="node-exporter",fstype!=""}[6h], 24*60*60) < 0
and
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
)
for: 1h
labels:
severity: warning
- alert: NodeFilesystemFilesFillingUp
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }}
has only {{ printf "%.2f" $value }}% available inodes left and is filling
up fast.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodefilesystemfilesfillingup
summary: Filesystem is predicted to run out of inodes within the next 4 hours.
expr: |
(
node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 20
and
predict_linear(node_filesystem_files_free{job="node-exporter",fstype!=""}[6h], 4*60*60) < 0
and
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
)
for: 1h
labels:
severity: critical
- alert: NodeFilesystemAlmostOutOfFiles
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }}
has only {{ printf "%.2f" $value }}% available inodes left.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodefilesystemalmostoutoffiles
summary: Filesystem has less than 5% inodes left.
expr: |
(
node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 5
and
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
)
for: 1h
labels:
severity: warning
- alert: NodeFilesystemAlmostOutOfFiles
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }}
has only {{ printf "%.2f" $value }}% available inodes left.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodefilesystemalmostoutoffiles
summary: Filesystem has less than 3% inodes left.
expr: |
(
node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 3
and
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
)
for: 1h
labels:
severity: critical
- alert: NodeNetworkReceiveErrs
annotations:
description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered
{{ printf "%.0f" $value }} receive errors in the last two minutes.'
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodenetworkreceiveerrs
summary: Network interface is reporting many receive errors.
expr: |
rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01
for: 1h
labels:
severity: warning
- alert: NodeNetworkTransmitErrs
annotations:
description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered
{{ printf "%.0f" $value }} transmit errors in the last two minutes.'
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodenetworktransmiterrs
summary: Network interface is reporting many transmit errors.
expr: |
rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01
for: 1h
labels:
severity: warning
- alert: NodeHighNumberConntrackEntriesUsed
annotations:
description: '{{ $value | humanizePercentage }} of conntrack entries are used.'
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodehighnumberconntrackentriesused
summary: Number of conntrack are getting close to the limit.
expr: |
(node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75
labels:
severity: warning
- alert: NodeTextFileCollectorScrapeError
annotations:
description: Node Exporter text file collector failed to scrape.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodetextfilecollectorscrapeerror
summary: Node Exporter text file collector failed to scrape.
expr: |
node_textfile_scrape_error{job="node-exporter"} == 1
labels:
severity: warning
- alert: NodeClockSkewDetected
annotations:
description: Clock on {{ $labels.instance }} is out of sync by more than 300s.
Ensure NTP is configured correctly on this host.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodeclockskewdetected
summary: Clock skew detected.
expr: |
(
node_timex_offset_seconds > 0.05
and
deriv(node_timex_offset_seconds[5m]) >= 0
)
or
(
node_timex_offset_seconds < -0.05
and
deriv(node_timex_offset_seconds[5m]) <= 0
)
for: 10m
labels:
severity: warning
- alert: NodeClockNotSynchronising
annotations:
description: Clock on {{ $labels.instance }} is not synchronising. Ensure
NTP is configured on this host.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodeclocknotsynchronising
summary: Clock not synchronising.
expr: |
min_over_time(node_timex_sync_status[5m]) == 0
and
node_timex_maxerror_seconds >= 16
for: 10m
labels:
severity: warning
- alert: NodeRAIDDegraded
annotations:
description: RAID array '{{ $labels.device }}' on {{ $labels.instance }} is
in degraded state due to one or more disks failures. Number of spare drives
is insufficient to fix issue automatically.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/noderaiddegraded
summary: RAID Array is degraded
expr: |
node_md_disks_required - ignoring (state) (node_md_disks{state="active"}) > 0
for: 15m
labels:
severity: critical
- alert: NodeRAIDDiskFailure
annotations:
description: At least one device in RAID array on {{ $labels.instance }} failed.
Array '{{ $labels.device }}' needs attention and possibly a disk swap.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/noderaiddiskfailure
summary: Failed device in RAID array
expr: |
node_md_disks{state="failed"} > 0
labels:
severity: warning
- name: node-exporter.rules
rules:
- expr: |
count without (cpu) (
count without (mode) (
node_cpu_seconds_total{job="node-exporter"}
)
)
record: instance:node_num_cpu:sum
- expr: |
1 - avg without (cpu, mode) (
rate(node_cpu_seconds_total{job="node-exporter", mode="idle"}[1m])
)
record: instance:node_cpu_utilisation:rate1m
- expr: |
(
node_load1{job="node-exporter"}
/
instance:node_num_cpu:sum{job="node-exporter"}
)
record: instance:node_load1_per_cpu:ratio
- expr: |
1 - (
node_memory_MemAvailable_bytes{job="node-exporter"}
/
node_memory_MemTotal_bytes{job="node-exporter"}
)
record: instance:node_memory_utilisation:ratio
- expr: |
rate(node_vmstat_pgmajfault{job="node-exporter"}[1m])
record: instance:node_vmstat_pgmajfault:rate1m
- expr: |
rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[1m])
record: instance_device:node_disk_io_time_seconds:rate1m
- expr: |
rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[1m])
record: instance_device:node_disk_io_time_weighted_seconds:rate1m
- expr: |
sum without (device) (
rate(node_network_receive_bytes_total{job="node-exporter", device!="lo"}[1m])
)
record: instance:node_network_receive_bytes_excluding_lo:rate1m
- expr: |
sum without (device) (
rate(node_network_transmit_bytes_total{job="node-exporter", device!="lo"}[1m])
)
record: instance:node_network_transmit_bytes_excluding_lo:rate1m
- expr: |
sum without (device) (
rate(node_network_receive_drop_total{job="node-exporter", device!="lo"}[1m])
)
record: instance:node_network_receive_drop_excluding_lo:rate1m
- expr: |
sum without (device) (
rate(node_network_transmit_drop_total{job="node-exporter", device!="lo"}[1m])
)
record: instance:node_network_transmit_drop_excluding_lo:rate1m

View File

@ -2,10 +2,12 @@ apiVersion: v1
kind: Service
metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: node-exporter
app.kubernetes.io/version: v1.0.1
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 1.1.2
name: node-exporter
namespace: monitoring
namespace: default
spec:
clusterIP: None
ports:
@ -13,4 +15,6 @@ spec:
port: 9100
targetPort: https
selector:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: node-exporter
app.kubernetes.io/part-of: kube-prometheus

View File

@ -1,5 +1,10 @@
apiVersion: v1
kind: ServiceAccount
metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: node-exporter
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 1.1.2
name: node-exporter
namespace: monitoring
namespace: default

View File

@ -2,10 +2,12 @@ apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: node-exporter
app.kubernetes.io/version: v1.0.1
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 1.1.2
name: node-exporter
namespace: monitoring
namespace: default
spec:
endpoints:
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
@ -24,4 +26,6 @@ spec:
jobLabel: app.kubernetes.io/name
selector:
matchLabels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: node-exporter
app.kubernetes.io/part-of: kube-prometheus

View File

@ -1,6 +1,11 @@
apiVersion: apiregistration.k8s.io/v1
kind: APIService
metadata:
labels:
app.kubernetes.io/component: metrics-adapter
app.kubernetes.io/name: prometheus-adapter
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.8.4
name: v1beta1.metrics.k8s.io
spec:
group: metrics.k8s.io
@ -8,6 +13,6 @@ spec:
insecureSkipTLSVerify: true
service:
name: prometheus-adapter
namespace: monitoring
namespace: default
version: v1beta1
versionPriority: 100

View File

@ -1,6 +1,11 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
labels:
app.kubernetes.io/component: metrics-adapter
app.kubernetes.io/name: prometheus-adapter
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.8.4
name: prometheus-adapter
rules:
- apiGroups:

View File

@ -2,6 +2,10 @@ apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
labels:
app.kubernetes.io/component: metrics-adapter
app.kubernetes.io/name: prometheus-adapter
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.8.4
rbac.authorization.k8s.io/aggregate-to-admin: "true"
rbac.authorization.k8s.io/aggregate-to-edit: "true"
rbac.authorization.k8s.io/aggregate-to-view: "true"

View File

@ -1,6 +1,11 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
labels:
app.kubernetes.io/component: metrics-adapter
app.kubernetes.io/name: prometheus-adapter
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.8.4
name: prometheus-adapter
roleRef:
apiGroup: rbac.authorization.k8s.io
@ -9,4 +14,4 @@ roleRef:
subjects:
- kind: ServiceAccount
name: prometheus-adapter
namespace: monitoring
namespace: default

View File

@ -1,6 +1,11 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
labels:
app.kubernetes.io/component: metrics-adapter
app.kubernetes.io/name: prometheus-adapter
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.8.4
name: resource-metrics:system:auth-delegator
roleRef:
apiGroup: rbac.authorization.k8s.io
@ -9,4 +14,4 @@ roleRef:
subjects:
- kind: ServiceAccount
name: prometheus-adapter
namespace: monitoring
namespace: default

View File

@ -1,6 +1,11 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
labels:
app.kubernetes.io/component: metrics-adapter
app.kubernetes.io/name: prometheus-adapter
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.8.4
name: resource-metrics-server-resources
rules:
- apiGroups:

View File

@ -4,8 +4,8 @@ data:
"resourceRules":
"cpu":
"containerLabel": "container"
"containerQuery": "sum(irate(container_cpu_usage_seconds_total{<<.LabelMatchers>>,container!=\"POD\",container!=\"\",pod!=\"\"}[5m])) by (<<.GroupBy>>)"
"nodeQuery": "sum(1 - irate(node_cpu_seconds_total{mode=\"idle\"}[5m]) * on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:{<<.LabelMatchers>>}) by (<<.GroupBy>>)"
"containerQuery": "sum(irate(container_cpu_usage_seconds_total{<<.LabelMatchers>>,container!=\"\",pod!=\"\"}[5m])) by (<<.GroupBy>>)"
"nodeQuery": "sum(1 - irate(node_cpu_seconds_total{mode=\"idle\"}[5m]) * on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:{<<.LabelMatchers>>}) by (<<.GroupBy>>) or sum (1- irate(windows_cpu_time_total{mode=\"idle\", job=\"windows-exporter\",<<.LabelMatchers>>}[5m])) by (<<.GroupBy>>)"
"resources":
"overrides":
"namespace":
@ -16,8 +16,8 @@ data:
"resource": "pod"
"memory":
"containerLabel": "container"
"containerQuery": "sum(container_memory_working_set_bytes{<<.LabelMatchers>>,container!=\"POD\",container!=\"\",pod!=\"\"}) by (<<.GroupBy>>)"
"nodeQuery": "sum(node_memory_MemTotal_bytes{job=\"node-exporter\",<<.LabelMatchers>>} - node_memory_MemAvailable_bytes{job=\"node-exporter\",<<.LabelMatchers>>}) by (<<.GroupBy>>)"
"containerQuery": "sum(container_memory_working_set_bytes{<<.LabelMatchers>>,container!=\"\",pod!=\"\"}) by (<<.GroupBy>>)"
"nodeQuery": "sum(node_memory_MemTotal_bytes{job=\"node-exporter\",<<.LabelMatchers>>} - node_memory_MemAvailable_bytes{job=\"node-exporter\",<<.LabelMatchers>>}) by (<<.GroupBy>>) or sum(windows_cs_physical_memory_bytes{job=\"windows-exporter\",<<.LabelMatchers>>} - windows_memory_available_bytes{job=\"windows-exporter\",<<.LabelMatchers>>}) by (<<.GroupBy>>)"
"resources":
"overrides":
"instance":
@ -29,5 +29,10 @@ data:
"window": "5m"
kind: ConfigMap
metadata:
labels:
app.kubernetes.io/component: metrics-adapter
app.kubernetes.io/name: prometheus-adapter
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.8.4
name: adapter-config
namespace: monitoring
namespace: default

View File

@ -1,21 +1,31 @@
apiVersion: apps/v1
kind: Deployment
metadata:
labels:
app.kubernetes.io/component: metrics-adapter
app.kubernetes.io/name: prometheus-adapter
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.8.4
name: prometheus-adapter
namespace: monitoring
namespace: default
spec:
replicas: 1
replicas: 2
selector:
matchLabels:
name: prometheus-adapter
app.kubernetes.io/component: metrics-adapter
app.kubernetes.io/name: prometheus-adapter
app.kubernetes.io/part-of: kube-prometheus
strategy:
rollingUpdate:
maxSurge: 1
maxUnavailable: 0
maxUnavailable: 1
template:
metadata:
labels:
name: prometheus-adapter
app.kubernetes.io/component: metrics-adapter
app.kubernetes.io/name: prometheus-adapter
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.8.4
spec:
containers:
- args:
@ -23,9 +33,9 @@ spec:
- --config=/etc/adapter/config.yaml
- --logtostderr=true
- --metrics-relist-interval=1m
- --prometheus-url=http://prometheus-k8s.monitoring.svc.cluster.local:9090/
- --prometheus-url=http://prometheus-k8s.default.svc.cluster.local:9090/
- --secure-port=6443
image: directxman12/k8s-prometheus-adapter:v0.8.2
image: directxman12/k8s-prometheus-adapter:v0.8.4
name: prometheus-adapter
ports:
- containerPort: 6443

View File

@ -1,6 +1,11 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
labels:
app.kubernetes.io/component: metrics-adapter
app.kubernetes.io/name: prometheus-adapter
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.8.4
name: resource-metrics-auth-reader
namespace: kube-system
roleRef:
@ -10,4 +15,4 @@ roleRef:
subjects:
- kind: ServiceAccount
name: prometheus-adapter
namespace: monitoring
namespace: default

View File

@ -2,13 +2,18 @@ apiVersion: v1
kind: Service
metadata:
labels:
name: prometheus-adapter
app.kubernetes.io/component: metrics-adapter
app.kubernetes.io/name: prometheus-adapter
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.8.4
name: prometheus-adapter
namespace: monitoring
namespace: default
spec:
ports:
- name: https
port: 443
targetPort: 6443
selector:
name: prometheus-adapter
app.kubernetes.io/component: metrics-adapter
app.kubernetes.io/name: prometheus-adapter
app.kubernetes.io/part-of: kube-prometheus

View File

@ -1,5 +1,10 @@
apiVersion: v1
kind: ServiceAccount
metadata:
labels:
app.kubernetes.io/component: metrics-adapter
app.kubernetes.io/name: prometheus-adapter
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.8.4
name: prometheus-adapter
namespace: monitoring
namespace: default

View File

@ -2,9 +2,12 @@ apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
labels:
name: prometheus-adapter
app.kubernetes.io/component: metrics-adapter
app.kubernetes.io/name: prometheus-adapter
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.8.4
name: prometheus-adapter
namespace: monitoring
namespace: default
spec:
endpoints:
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
@ -15,4 +18,6 @@ spec:
insecureSkipVerify: true
selector:
matchLabels:
name: prometheus-adapter
app.kubernetes.io/component: metrics-adapter
app.kubernetes.io/name: prometheus-adapter
app.kubernetes.io/part-of: kube-prometheus

View File

@ -1,6 +1,11 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
labels:
app.kubernetes.io/component: prometheus
app.kubernetes.io/name: prometheus
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.26.0
name: prometheus-k8s
rules:
- apiGroups:

View File

@ -1,6 +1,11 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
labels:
app.kubernetes.io/component: prometheus
app.kubernetes.io/name: prometheus
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.26.0
name: prometheus-k8s
roleRef:
apiGroup: rbac.authorization.k8s.io
@ -9,4 +14,4 @@ roleRef:
subjects:
- kind: ServiceAccount
name: prometheus-k8s
namespace: monitoring
namespace: default

View File

@ -4,9 +4,10 @@ metadata:
labels:
app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/version: v0.44.1
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.47.0
name: prometheus-operator
namespace: monitoring
namespace: default
spec:
endpoints:
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
@ -19,4 +20,5 @@ spec:
matchLabels:
app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/version: v0.44.1
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.47.0

View File

@ -0,0 +1,18 @@
apiVersion: policy/v1beta1
kind: PodDisruptionBudget
metadata:
labels:
app.kubernetes.io/component: prometheus
app.kubernetes.io/name: prometheus
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.26.0
name: prometheus-k8s
namespace: default
spec:
minAvailable: 1
selector:
matchLabels:
app.kubernetes.io/component: prometheus
app.kubernetes.io/name: prometheus
app.kubernetes.io/part-of: kube-prometheus
prometheus: k8s

View File

@ -2,19 +2,31 @@ apiVersion: monitoring.coreos.com/v1
kind: Prometheus
metadata:
labels:
app.kubernetes.io/component: prometheus
app.kubernetes.io/name: prometheus
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.26.0
prometheus: k8s
name: k8s
namespace: monitoring
namespace: default
spec:
alerting:
alertmanagers:
- name: alertmanager-main
namespace: monitoring
- apiVersion: v2
name: alertmanager-main
namespace: default
port: web
externalLabels: {}
externalUrl: http://prometheus-k8s.monitoring:9090
image: quay.io/prometheus/prometheus:v2.22.1
image: quay.io/prometheus/prometheus:v2.26.0
nodeSelector:
kubernetes.io/os: linux
podMetadata:
labels:
app.kubernetes.io/component: prometheus
app.kubernetes.io/name: prometheus
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.26.0
podMonitorNamespaceSelector:
matchExpressions:
- key: prometheus
@ -25,7 +37,7 @@ spec:
podMonitorSelector: {}
probeNamespaceSelector: {}
probeSelector: {}
replicas: 1
replicas: 2
resources:
requests:
memory: 400Mi
@ -58,4 +70,4 @@ spec:
requests:
storage: 10Gi
storageClassName: local-path
version: v2.22.1
version: 2.26.0
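Worked through with the numbers in this file and the PodDisruptionBudget above: replicas: 2 combined with minAvailable: 1 means voluntary disruptions (node drains, rolling maintenance) can evict at most one prometheus-k8s pod at a time, so the HA pair introduced here never drops to zero during such events.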

View File

@ -0,0 +1,256 @@
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
labels:
app.kubernetes.io/component: prometheus
app.kubernetes.io/name: prometheus
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.26.0
prometheus: k8s
role: alert-rules
name: prometheus-k8s-prometheus-rules
namespace: default
spec:
groups:
- name: prometheus
rules:
- alert: PrometheusBadConfig
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to
reload its configuration.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusbadconfig
summary: Failed Prometheus configuration reload.
expr: |
# Without max_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
max_over_time(prometheus_config_last_reload_successful{job="prometheus-k8s",namespace="default"}[5m]) == 0
for: 10m
labels:
severity: critical
- alert: PrometheusNotificationQueueRunningFull
annotations:
description: Alert notification queue of Prometheus {{$labels.namespace}}/{{$labels.pod}}
is running full.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusnotificationqueuerunningfull
summary: Prometheus alert notification queue predicted to run full in less
than 30m.
expr: |
# Without min_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
(
predict_linear(prometheus_notifications_queue_length{job="prometheus-k8s",namespace="default"}[5m], 60 * 30)
>
min_over_time(prometheus_notifications_queue_capacity{job="prometheus-k8s",namespace="default"}[5m])
)
for: 15m
labels:
severity: warning
- alert: PrometheusErrorSendingAlertsToSomeAlertmanagers
annotations:
description: '{{ printf "%.1f" $value }}% errors while sending alerts from
Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.alertmanager}}.'
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheuserrorsendingalertstosomealertmanagers
summary: Prometheus has encountered more than 1% errors sending alerts to
a specific Alertmanager.
expr: |
(
rate(prometheus_notifications_errors_total{job="prometheus-k8s",namespace="default"}[5m])
/
rate(prometheus_notifications_sent_total{job="prometheus-k8s",namespace="default"}[5m])
)
* 100
> 1
for: 15m
labels:
severity: warning
- alert: PrometheusNotConnectedToAlertmanagers
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not connected
to any Alertmanagers.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusnotconnectedtoalertmanagers
summary: Prometheus is not connected to any Alertmanagers.
expr: |
# Without max_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
max_over_time(prometheus_notifications_alertmanagers_discovered{job="prometheus-k8s",namespace="default"}[5m]) < 1
for: 10m
labels:
severity: warning
- alert: PrometheusTSDBReloadsFailing
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected
{{$value | humanize}} reload failures over the last 3h.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheustsdbreloadsfailing
summary: Prometheus has issues reloading blocks from disk.
expr: |
increase(prometheus_tsdb_reloads_failures_total{job="prometheus-k8s",namespace="default"}[3h]) > 0
for: 4h
labels:
severity: warning
- alert: PrometheusTSDBCompactionsFailing
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected
{{$value | humanize}} compaction failures over the last 3h.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheustsdbcompactionsfailing
summary: Prometheus has issues compacting blocks.
expr: |
increase(prometheus_tsdb_compactions_failed_total{job="prometheus-k8s",namespace="default"}[3h]) > 0
for: 4h
labels:
severity: warning
- alert: PrometheusNotIngestingSamples
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not ingesting
samples.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusnotingestingsamples
summary: Prometheus is not ingesting samples.
expr: |
(
rate(prometheus_tsdb_head_samples_appended_total{job="prometheus-k8s",namespace="default"}[5m]) <= 0
and
(
sum without(scrape_job) (prometheus_target_metadata_cache_entries{job="prometheus-k8s",namespace="default"}) > 0
or
sum without(rule_group) (prometheus_rule_group_rules{job="prometheus-k8s",namespace="default"}) > 0
)
)
for: 10m
labels:
severity: warning
- alert: PrometheusDuplicateTimestamps
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping
{{ printf "%.4g" $value }} samples/s with different values but duplicated
timestamp.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusduplicatetimestamps
summary: Prometheus is dropping samples with duplicate timestamps.
expr: |
rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus-k8s",namespace="default"}[5m]) > 0
for: 10m
labels:
severity: warning
- alert: PrometheusOutOfOrderTimestamps
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping
{{ printf "%.4g" $value }} samples/s with timestamps arriving out of order.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusoutofordertimestamps
summary: Prometheus drops samples with out-of-order timestamps.
expr: |
rate(prometheus_target_scrapes_sample_out_of_order_total{job="prometheus-k8s",namespace="default"}[5m]) > 0
for: 10m
labels:
severity: warning
- alert: PrometheusRemoteStorageFailures
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} failed to send
{{ printf "%.1f" $value }}% of the samples to {{ $labels.remote_name}}:{{
$labels.url }}
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusremotestoragefailures
summary: Prometheus fails to send samples to remote storage.
expr: |
(
(rate(prometheus_remote_storage_failed_samples_total{job="prometheus-k8s",namespace="default"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job="prometheus-k8s",namespace="default"}[5m]))
/
(
(rate(prometheus_remote_storage_failed_samples_total{job="prometheus-k8s",namespace="default"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job="prometheus-k8s",namespace="default"}[5m]))
+
(rate(prometheus_remote_storage_succeeded_samples_total{job="prometheus-k8s",namespace="default"}[5m]) or rate(prometheus_remote_storage_samples_total{job="prometheus-k8s",namespace="default"}[5m]))
)
)
* 100
> 1
for: 15m
labels:
severity: critical
- alert: PrometheusRemoteWriteBehind
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write
is {{ printf "%.1f" $value }}s behind for {{ $labels.remote_name}}:{{ $labels.url
}}.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusremotewritebehind
summary: Prometheus remote write is behind.
expr: |
# Without max_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
(
max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job="prometheus-k8s",namespace="default"}[5m])
- ignoring(remote_name, url) group_right
max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job="prometheus-k8s",namespace="default"}[5m])
)
> 120
for: 15m
labels:
severity: critical
- alert: PrometheusRemoteWriteDesiredShards
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write
desired shards calculation wants to run {{ $value }} shards for queue {{
$labels.remote_name}}:{{ $labels.url }}, which is more than the max of {{
printf `prometheus_remote_storage_shards_max{instance="%s",job="prometheus-k8s",namespace="default"}`
$labels.instance | query | first | value }}.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusremotewritedesiredshards
summary: Prometheus remote write desired shards calculation wants to run more
than configured max shards.
expr: |
# Without max_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
(
max_over_time(prometheus_remote_storage_shards_desired{job="prometheus-k8s",namespace="default"}[5m])
>
max_over_time(prometheus_remote_storage_shards_max{job="prometheus-k8s",namespace="default"}[5m])
)
for: 15m
labels:
severity: warning
- alert: PrometheusRuleFailures
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to
evaluate {{ printf "%.0f" $value }} rules in the last 5m.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusrulefailures
summary: Prometheus is failing rule evaluations.
expr: |
increase(prometheus_rule_evaluation_failures_total{job="prometheus-k8s",namespace="default"}[5m]) > 0
for: 15m
labels:
severity: critical
- alert: PrometheusMissingRuleEvaluations
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has missed {{
printf "%.0f" $value }} rule group evaluations in the last 5m.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusmissingruleevaluations
summary: Prometheus is missing rule evaluations due to slow rule group evaluation.
expr: |
increase(prometheus_rule_group_iterations_missed_total{job="prometheus-k8s",namespace="default"}[5m]) > 0
for: 15m
labels:
severity: warning
- alert: PrometheusTargetLimitHit
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has dropped
{{ printf "%.0f" $value }} targets because the number of targets exceeded
the configured target_limit.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheustargetlimithit
summary: Prometheus has dropped targets because some scrape configs have exceeded
the targets limit.
expr: |
increase(prometheus_target_scrape_pool_exceeded_target_limit_total{job="prometheus-k8s",namespace="default"}[5m]) > 0
for: 15m
labels:
severity: warning
- alert: PrometheusErrorSendingAlertsToAnyAlertmanager
annotations:
description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts
from Prometheus {{$labels.namespace}}/{{$labels.pod}} to any Alertmanager.'
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheuserrorsendingalertstoanyalertmanager
summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager.
expr: |
min without (alertmanager) (
rate(prometheus_notifications_errors_total{job="prometheus-k8s",namespace="default",alertmanager!~``}[5m])
/
rate(prometheus_notifications_sent_total{job="prometheus-k8s",namespace="default",alertmanager!~``}[5m])
)
* 100
> 3
for: 15m
labels:
severity: critical


@ -1,8 +1,13 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
labels:
app.kubernetes.io/component: prometheus
app.kubernetes.io/name: prometheus
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.26.0
name: prometheus-k8s-config
namespace: monitoring
namespace: default
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
@ -10,4 +15,4 @@ roleRef:
subjects:
- kind: ServiceAccount
name: prometheus-k8s
namespace: monitoring
namespace: default


@ -3,6 +3,11 @@ items:
- apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
labels:
app.kubernetes.io/component: prometheus
app.kubernetes.io/name: prometheus
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.26.0
name: prometheus-k8s
namespace: default
roleRef:
@ -12,10 +17,15 @@ items:
subjects:
- kind: ServiceAccount
name: prometheus-k8s
namespace: monitoring
namespace: default
- apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
labels:
app.kubernetes.io/component: prometheus
app.kubernetes.io/name: prometheus
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.26.0
name: prometheus-k8s
namespace: kube-system
roleRef:
@ -25,12 +35,17 @@ items:
subjects:
- kind: ServiceAccount
name: prometheus-k8s
namespace: monitoring
namespace: default
- apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
labels:
app.kubernetes.io/component: prometheus
app.kubernetes.io/name: prometheus
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.26.0
name: prometheus-k8s
namespace: monitoring
namespace: default
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
@ -38,31 +53,5 @@ items:
subjects:
- kind: ServiceAccount
name: prometheus-k8s
namespace: monitoring
- apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: prometheus-k8s
namespace: k8up
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: prometheus-k8s
subjects:
- kind: ServiceAccount
name: prometheus-k8s
namespace: monitoring
- apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: prometheus-k8s
namespace: owntracks
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: prometheus-k8s
subjects:
- kind: ServiceAccount
name: prometheus-k8s
namespace: monitoring
namespace: default
kind: RoleBindingList


@ -1,8 +1,13 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
labels:
app.kubernetes.io/component: prometheus
app.kubernetes.io/name: prometheus
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.26.0
name: prometheus-k8s-config
namespace: monitoring
namespace: default
rules:
- apiGroups:
- ""


@ -3,6 +3,11 @@ items:
- apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
labels:
app.kubernetes.io/component: prometheus
app.kubernetes.io/name: prometheus
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.26.0
name: prometheus-k8s
namespace: default
rules:
@ -24,9 +29,22 @@ items:
- get
- list
- watch
- apiGroups:
- networking.k8s.io
resources:
- ingresses
verbs:
- get
- list
- watch
- apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
labels:
app.kubernetes.io/component: prometheus
app.kubernetes.io/name: prometheus
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.26.0
name: prometheus-k8s
namespace: kube-system
rules:
@ -48,11 +66,24 @@ items:
- get
- list
- watch
- apiGroups:
- networking.k8s.io
resources:
- ingresses
verbs:
- get
- list
- watch
- apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
labels:
app.kubernetes.io/component: prometheus
app.kubernetes.io/name: prometheus
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.26.0
name: prometheus-k8s
namespace: monitoring
namespace: default
rules:
- apiGroups:
- ""
@ -72,48 +103,8 @@ items:
- get
- list
- watch
- apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
name: prometheus-k8s
namespace: k8up
rules:
- apiGroups:
- ""
resources:
- services
- endpoints
- pods
verbs:
- get
- list
- watch
- apiGroups:
- extensions
resources:
- ingresses
verbs:
- get
- list
- watch
- apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
name: prometheus-k8s
namespace: owntracks
rules:
- apiGroups:
- ""
resources:
- services
- endpoints
- pods
verbs:
- get
- list
- watch
- apiGroups:
- extensions
- networking.k8s.io
resources:
- ingresses
verbs:

File diff suppressed because it is too large


@ -2,9 +2,13 @@ apiVersion: v1
kind: Service
metadata:
labels:
app.kubernetes.io/component: prometheus
app.kubernetes.io/name: prometheus
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.26.0
prometheus: k8s
name: prometheus-k8s
namespace: monitoring
namespace: default
spec:
ports:
- name: web
@ -12,5 +16,8 @@ spec:
targetPort: web
selector:
app: prometheus
app.kubernetes.io/component: prometheus
app.kubernetes.io/name: prometheus
app.kubernetes.io/part-of: kube-prometheus
prometheus: k8s
sessionAffinity: ClientIP


@ -1,5 +1,10 @@
apiVersion: v1
kind: ServiceAccount
metadata:
labels:
app.kubernetes.io/component: prometheus
app.kubernetes.io/name: prometheus
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.26.0
name: prometheus-k8s
namespace: monitoring
namespace: default


@ -2,13 +2,19 @@ apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
labels:
k8s-app: prometheus
name: prometheus
namespace: monitoring
app.kubernetes.io/component: prometheus
app.kubernetes.io/name: prometheus
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.26.0
name: prometheus-k8s
namespace: default
spec:
endpoints:
- interval: 30s
port: web
selector:
matchLabels:
app.kubernetes.io/component: prometheus
app.kubernetes.io/name: prometheus
app.kubernetes.io/part-of: kube-prometheus
prometheus: k8s


@ -1,74 +0,0 @@
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
labels:
k8s-app: apiserver
name: kube-apiserver
namespace: monitoring
spec:
endpoints:
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
interval: 30s
metricRelabelings:
- action: drop
regex: kubelet_(pod_worker_latency_microseconds|pod_start_latency_microseconds|cgroup_manager_latency_microseconds|pod_worker_start_latency_microseconds|pleg_relist_latency_microseconds|pleg_relist_interval_microseconds|runtime_operations|runtime_operations_latency_microseconds|runtime_operations_errors|eviction_stats_age_microseconds|device_plugin_registration_count|device_plugin_alloc_latency_microseconds|network_plugin_operations_latency_microseconds)
sourceLabels:
- __name__
- action: drop
regex: scheduler_(e2e_scheduling_latency_microseconds|scheduling_algorithm_predicate_evaluation|scheduling_algorithm_priority_evaluation|scheduling_algorithm_preemption_evaluation|scheduling_algorithm_latency_microseconds|binding_latency_microseconds|scheduling_latency_seconds)
sourceLabels:
- __name__
- action: drop
regex: apiserver_(request_count|request_latencies|request_latencies_summary|dropped_requests|storage_data_key_generation_latencies_microseconds|storage_transformation_failures_total|storage_transformation_latencies_microseconds|proxy_tunnel_sync_latency_secs)
sourceLabels:
- __name__
- action: drop
regex: kubelet_docker_(operations|operations_latency_microseconds|operations_errors|operations_timeout)
sourceLabels:
- __name__
- action: drop
regex: reflector_(items_per_list|items_per_watch|list_duration_seconds|lists_total|short_watches_total|watch_duration_seconds|watches_total)
sourceLabels:
- __name__
- action: drop
regex: etcd_(helper_cache_hit_count|helper_cache_miss_count|helper_cache_entry_count|request_cache_get_latencies_summary|request_cache_add_latencies_summary|request_latencies_summary)
sourceLabels:
- __name__
- action: drop
regex: transformation_(transformation_latencies_microseconds|failures_total)
sourceLabels:
- __name__
- action: drop
regex: (admission_quota_controller_adds|crd_autoregistration_controller_work_duration|APIServiceOpenAPIAggregationControllerQueue1_adds|AvailableConditionController_retries|crd_openapi_controller_unfinished_work_seconds|APIServiceRegistrationController_retries|admission_quota_controller_longest_running_processor_microseconds|crdEstablishing_longest_running_processor_microseconds|crdEstablishing_unfinished_work_seconds|crd_openapi_controller_adds|crd_autoregistration_controller_retries|crd_finalizer_queue_latency|AvailableConditionController_work_duration|non_structural_schema_condition_controller_depth|crd_autoregistration_controller_unfinished_work_seconds|AvailableConditionController_adds|DiscoveryController_longest_running_processor_microseconds|autoregister_queue_latency|crd_autoregistration_controller_adds|non_structural_schema_condition_controller_work_duration|APIServiceRegistrationController_adds|crd_finalizer_work_duration|crd_naming_condition_controller_unfinished_work_seconds|crd_openapi_controller_longest_running_processor_microseconds|DiscoveryController_adds|crd_autoregistration_controller_longest_running_processor_microseconds|autoregister_unfinished_work_seconds|crd_naming_condition_controller_queue_latency|crd_naming_condition_controller_retries|non_structural_schema_condition_controller_queue_latency|crd_naming_condition_controller_depth|AvailableConditionController_longest_running_processor_microseconds|crdEstablishing_depth|crd_finalizer_longest_running_processor_microseconds|crd_naming_condition_controller_adds|APIServiceOpenAPIAggregationControllerQueue1_longest_running_processor_microseconds|DiscoveryController_queue_latency|DiscoveryController_unfinished_work_seconds|crd_openapi_controller_depth|APIServiceOpenAPIAggregationControllerQueue1_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_unfinished_work_seconds|DiscoveryController_work_duration|autoregister_adds|crd_autoregistration_controller_queue_latency|crd_finalizer_retries|AvailableConditionController_unfinished_work_seconds|autoregister_longest_running_processor_microseconds|non_structural_schema_condition_controller_unfinished_work_seconds|APIServiceOpenAPIAggregationControllerQueue1_depth|AvailableConditionController_depth|DiscoveryController_retries|admission_quota_controller_depth|crdEstablishing_adds|APIServiceOpenAPIAggregationControllerQueue1_retries|crdEstablishing_queue_latency|non_structural_schema_condition_controller_longest_running_processor_microseconds|autoregister_work_duration|crd_openapi_controller_retries|APIServiceRegistrationController_work_duration|crdEstablishing_work_duration|crd_finalizer_adds|crd_finalizer_depth|crd_openapi_controller_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_work_duration|APIServiceRegistrationController_queue_latency|crd_autoregistration_controller_depth|AvailableConditionController_queue_latency|admission_quota_controller_queue_latency|crd_naming_condition_controller_work_duration|crd_openapi_controller_work_duration|DiscoveryController_depth|crd_naming_condition_controller_longest_running_processor_microseconds|APIServiceRegistrationController_depth|APIServiceRegistrationController_longest_running_processor_microseconds|crd_finalizer_unfinished_work_seconds|crdEstablishing_retries|admission_quota_controller_unfinished_work_seconds|non_structural_schema_condition_controller_adds|APIServiceRegistrationController_unfinished_work_seconds|admission_quota_controller_work_duration|autoregister_depth|autoregister_retries|kubeproxy_sync_proxy_rul
es_latency_microseconds|rest_client_request_latency_seconds|non_structural_schema_condition_controller_retries)
sourceLabels:
- __name__
- action: drop
regex: etcd_(debugging|disk|server).*
sourceLabels:
- __name__
- action: drop
regex: apiserver_admission_controller_admission_latencies_seconds_.*
sourceLabels:
- __name__
- action: drop
regex: apiserver_admission_step_admission_latencies_seconds_.*
sourceLabels:
- __name__
- action: drop
regex: apiserver_request_duration_seconds_bucket;(0.15|0.25|0.3|0.35|0.4|0.45|0.6|0.7|0.8|0.9|1.25|1.5|1.75|2.5|3|3.5|4.5|6|7|8|9|15|25|30|50)
sourceLabels:
- __name__
- le
port: https
scheme: https
tlsConfig:
caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
serverName: kubernetes
jobLabel: component
namespaceSelector:
matchNames:
- default
selector:
matchLabels:
component: apiserver
provider: kubernetes


@ -1,19 +0,0 @@
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
labels:
k8s-app: coredns
name: coredns
namespace: monitoring
spec:
endpoints:
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
interval: 15s
port: metrics
jobLabel: k8s-app
namespaceSelector:
matchNames:
- kube-system
selector:
matchLabels:
k8s-app: kube-dns


@ -1,10 +1,3 @@
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
labels:
k8s-app: kube-controller-manager
name: kube-controller-manager
namespace: monitoring
spec:
endpoints:
- interval: 30s
@ -46,10 +39,3 @@ spec:
sourceLabels:
- __name__
port: http-metrics
jobLabel: k8s-app
namespaceSelector:
matchNames:
- kube-system
selector:
matchLabels:
k8s-app: kube-controller-manager


@ -1,18 +1,4 @@
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
labels:
k8s-app: kube-scheduler
name: kube-scheduler
namespace: monitoring
spec:
endpoints:
- interval: 30s
port: http-metrics
jobLabel: k8s-app
namespaceSelector:
matchNames:
- kube-system
selector:
matchLabels:
k8s-app: kube-scheduler


@ -1,90 +0,0 @@
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
labels:
k8s-app: kubelet
name: kubelet
namespace: monitoring
spec:
endpoints:
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
honorLabels: true
interval: 30s
metricRelabelings:
- action: drop
regex: kubelet_(pod_worker_latency_microseconds|pod_start_latency_microseconds|cgroup_manager_latency_microseconds|pod_worker_start_latency_microseconds|pleg_relist_latency_microseconds|pleg_relist_interval_microseconds|runtime_operations|runtime_operations_latency_microseconds|runtime_operations_errors|eviction_stats_age_microseconds|device_plugin_registration_count|device_plugin_alloc_latency_microseconds|network_plugin_operations_latency_microseconds)
sourceLabels:
- __name__
- action: drop
regex: scheduler_(e2e_scheduling_latency_microseconds|scheduling_algorithm_predicate_evaluation|scheduling_algorithm_priority_evaluation|scheduling_algorithm_preemption_evaluation|scheduling_algorithm_latency_microseconds|binding_latency_microseconds|scheduling_latency_seconds)
sourceLabels:
- __name__
- action: drop
regex: apiserver_(request_count|request_latencies|request_latencies_summary|dropped_requests|storage_data_key_generation_latencies_microseconds|storage_transformation_failures_total|storage_transformation_latencies_microseconds|proxy_tunnel_sync_latency_secs)
sourceLabels:
- __name__
- action: drop
regex: kubelet_docker_(operations|operations_latency_microseconds|operations_errors|operations_timeout)
sourceLabels:
- __name__
- action: drop
regex: reflector_(items_per_list|items_per_watch|list_duration_seconds|lists_total|short_watches_total|watch_duration_seconds|watches_total)
sourceLabels:
- __name__
- action: drop
regex: etcd_(helper_cache_hit_count|helper_cache_miss_count|helper_cache_entry_count|request_cache_get_latencies_summary|request_cache_add_latencies_summary|request_latencies_summary)
sourceLabels:
- __name__
- action: drop
regex: transformation_(transformation_latencies_microseconds|failures_total)
sourceLabels:
- __name__
- action: drop
regex: (admission_quota_controller_adds|crd_autoregistration_controller_work_duration|APIServiceOpenAPIAggregationControllerQueue1_adds|AvailableConditionController_retries|crd_openapi_controller_unfinished_work_seconds|APIServiceRegistrationController_retries|admission_quota_controller_longest_running_processor_microseconds|crdEstablishing_longest_running_processor_microseconds|crdEstablishing_unfinished_work_seconds|crd_openapi_controller_adds|crd_autoregistration_controller_retries|crd_finalizer_queue_latency|AvailableConditionController_work_duration|non_structural_schema_condition_controller_depth|crd_autoregistration_controller_unfinished_work_seconds|AvailableConditionController_adds|DiscoveryController_longest_running_processor_microseconds|autoregister_queue_latency|crd_autoregistration_controller_adds|non_structural_schema_condition_controller_work_duration|APIServiceRegistrationController_adds|crd_finalizer_work_duration|crd_naming_condition_controller_unfinished_work_seconds|crd_openapi_controller_longest_running_processor_microseconds|DiscoveryController_adds|crd_autoregistration_controller_longest_running_processor_microseconds|autoregister_unfinished_work_seconds|crd_naming_condition_controller_queue_latency|crd_naming_condition_controller_retries|non_structural_schema_condition_controller_queue_latency|crd_naming_condition_controller_depth|AvailableConditionController_longest_running_processor_microseconds|crdEstablishing_depth|crd_finalizer_longest_running_processor_microseconds|crd_naming_condition_controller_adds|APIServiceOpenAPIAggregationControllerQueue1_longest_running_processor_microseconds|DiscoveryController_queue_latency|DiscoveryController_unfinished_work_seconds|crd_openapi_controller_depth|APIServiceOpenAPIAggregationControllerQueue1_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_unfinished_work_seconds|DiscoveryController_work_duration|autoregister_adds|crd_autoregistration_controller_queue_latency|crd_finalizer_retries|AvailableConditionController_unfinished_work_seconds|autoregister_longest_running_processor_microseconds|non_structural_schema_condition_controller_unfinished_work_seconds|APIServiceOpenAPIAggregationControllerQueue1_depth|AvailableConditionController_depth|DiscoveryController_retries|admission_quota_controller_depth|crdEstablishing_adds|APIServiceOpenAPIAggregationControllerQueue1_retries|crdEstablishing_queue_latency|non_structural_schema_condition_controller_longest_running_processor_microseconds|autoregister_work_duration|crd_openapi_controller_retries|APIServiceRegistrationController_work_duration|crdEstablishing_work_duration|crd_finalizer_adds|crd_finalizer_depth|crd_openapi_controller_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_work_duration|APIServiceRegistrationController_queue_latency|crd_autoregistration_controller_depth|AvailableConditionController_queue_latency|admission_quota_controller_queue_latency|crd_naming_condition_controller_work_duration|crd_openapi_controller_work_duration|DiscoveryController_depth|crd_naming_condition_controller_longest_running_processor_microseconds|APIServiceRegistrationController_depth|APIServiceRegistrationController_longest_running_processor_microseconds|crd_finalizer_unfinished_work_seconds|crdEstablishing_retries|admission_quota_controller_unfinished_work_seconds|non_structural_schema_condition_controller_adds|APIServiceRegistrationController_unfinished_work_seconds|admission_quota_controller_work_duration|autoregister_depth|autoregister_retries|kubeproxy_sync_proxy_rul
es_latency_microseconds|rest_client_request_latency_seconds|non_structural_schema_condition_controller_retries)
sourceLabels:
- __name__
port: https-metrics
relabelings:
- sourceLabels:
- __metrics_path__
targetLabel: metrics_path
scheme: https
tlsConfig:
insecureSkipVerify: true
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
honorLabels: true
honorTimestamps: false
interval: 30s
metricRelabelings:
- action: drop
regex: container_(network_tcp_usage_total|network_udp_usage_total|tasks_state|cpu_load_average_10s)
sourceLabels:
- __name__
path: /metrics/cadvisor
port: https-metrics
relabelings:
- sourceLabels:
- __metrics_path__
targetLabel: metrics_path
scheme: https
tlsConfig:
insecureSkipVerify: true
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
honorLabels: true
interval: 30s
path: /metrics/probes
port: https-metrics
relabelings:
- sourceLabels:
- __metrics_path__
targetLabel: metrics_path
scheme: https
tlsConfig:
insecureSkipVerify: true
jobLabel: k8s-app
namespaceSelector:
matchNames:
- kube-system
selector:
matchLabels:
k8s-app: kubelet


@ -1,4 +1,4 @@
apiVersion: v1
kind: Namespace
metadata:
name: monitoring
name: default


@ -0,0 +1,76 @@
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: kube-prometheus
app.kubernetes.io/part-of: kube-prometheus
prometheus: k8s
role: alert-rules
name: kube-prometheus-rules
namespace: default
spec:
groups:
- name: general.rules
rules:
- alert: TargetDown
annotations:
description: '{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{ $labels.service
}} targets in {{ $labels.namespace }} namespace are down.'
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/targetdown
summary: One or more targets are unreachable.
expr: 100 * (count(up == 0) BY (job, namespace, service) / count(up) BY (job,
namespace, service)) > 10
for: 10m
labels:
severity: warning
- alert: Watchdog
annotations:
description: |
This is an alert meant to ensure that the entire alerting pipeline is functional.
This alert is always firing, therefore it should always be firing in Alertmanager
and always fire against a receiver. There are integrations with various notification
mechanisms that send a notification when this alert is not firing. For example the
"DeadMansSnitch" integration in PagerDuty.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/watchdog
summary: An alert that should always be firing to certify that Alertmanager
is working properly.
expr: vector(1)
labels:
severity: none
- name: node-network
rules:
- alert: NodeNetworkInterfaceFlapping
annotations:
message: Network interface "{{ $labels.device }}" changing its up status
often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodenetworkinterfaceflapping
expr: |
changes(node_network_up{job="node-exporter",device!~"veth.+"}[2m]) > 2
for: 2m
labels:
severity: warning
- name: kube-prometheus-node-recording.rules
rules:
- expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[3m]))
BY (instance)
record: instance:node_cpu:rate:sum
- expr: sum(rate(node_network_receive_bytes_total[3m])) BY (instance)
record: instance:node_network_receive_bytes:rate:sum
- expr: sum(rate(node_network_transmit_bytes_total[3m])) BY (instance)
record: instance:node_network_transmit_bytes:rate:sum
- expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m]))
WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu_seconds_total)
BY (instance, cpu)) BY (instance)
record: instance:node_cpu:ratio
- expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m]))
record: cluster:node_cpu:sum_rate5m
- expr: cluster:node_cpu_seconds_total:rate5m / count(sum(node_cpu_seconds_total)
BY (instance, cpu))
record: cluster:node_cpu:ratio
- name: kube-prometheus-general.rules
rules:
- expr: count without(instance, pod, node) (up == 1)
record: count:up1
- expr: count without(instance, pod, node) (up == 0)
record: count:up0


@ -8,6 +8,8 @@ metadata:
spec:
group: monitoring.coreos.com
names:
categories:
- prometheus-operator
kind: AlertmanagerConfig
listKind: AlertmanagerConfigList
plural: alertmanagerconfigs
@ -60,6 +62,7 @@ spec:
properties:
name:
description: Label to match.
minLength: 1
type: string
regex:
description: Whether to match on equality (false) or regular-expression
@ -70,7 +73,6 @@ spec:
type: string
required:
- name
- value
type: object
type: array
targetMatch:
@ -82,6 +84,7 @@ spec:
properties:
name:
description: Label to match.
minLength: 1
type: string
regex:
description: Whether to match on equality (false) or regular-expression
@ -92,7 +95,6 @@ spec:
type: string
required:
- name
- value
type: object
type: array
type: object
@ -108,9 +110,13 @@ spec:
description: EmailConfig configures notifications via Email.
properties:
authIdentity:
description: The identity to use for authentication.
type: string
authPassword:
description: SecretKeySelector selects a key of a Secret.
description: The secret's key that contains the password
to use for authentication. The secret needs to be in
the same namespace as the AlertmanagerConfig object
and accessible by the Prometheus Operator.
properties:
key:
description: The key of the secret to select from. Must
@ -129,7 +135,10 @@ spec:
- key
type: object
authSecret:
description: SecretKeySelector selects a key of a Secret.
description: The secret's key that contains the CRAM-MD5
secret. The secret needs to be in the same namespace
as the AlertmanagerConfig object and accessible by the
Prometheus Operator.
properties:
key:
description: The key of the secret to select from. Must
@ -148,7 +157,7 @@ spec:
- key
type: object
authUsername:
description: SMTP authentication information.
description: The username to use for authentication.
type: string
from:
description: The sender address.
@ -162,6 +171,7 @@ spec:
properties:
key:
description: Key of the tuple.
minLength: 1
type: string
value:
description: Value of the tuple.
@ -321,6 +331,7 @@ spec:
name:
description: Name of the receiver. Must be unique across all
items from the list.
minLength: 1
type: string
opsgenieConfigs:
description: List of OpsGenie configurations.
@ -364,6 +375,7 @@ spec:
properties:
key:
description: Key of the tuple.
minLength: 1
type: string
value:
description: Value of the tuple.
@ -590,8 +602,8 @@ spec:
description: List of responders responsible for notifications.
items:
description: OpsGenieConfigResponder defines a responder
to an incident. One of id, name or username has to
be defined.
to an incident. One of `id`, `name` or `username`
has to be defined.
properties:
id:
description: ID of the responder.
@ -601,10 +613,13 @@ spec:
type: string
type:
description: Type of responder.
minLength: 1
type: string
username:
description: Username of the responder.
type: string
required:
- type
type: object
type: array
sendResolved:
@ -649,6 +664,7 @@ spec:
properties:
key:
description: Key of the tuple.
minLength: 1
type: string
value:
description: Value of the tuple.
@ -1163,8 +1179,11 @@ spec:
description: Notification title.
type: string
token:
description: Your registered applications API token,
see https://pushover.net/apps
description: The secret's key that contains the registered
applications API token, see https://pushover.net/apps.
The secret needs to be in the same namespace as the
AlertmanagerConfig object and accessible by the Prometheus
Operator.
properties:
key:
description: The key of the secret to select from. Must
@ -1190,7 +1209,10 @@ spec:
just the URL is shown
type: string
userKey:
description: The recipient users user key.
description: The secret's key that contains the recipient
users user key. The secret needs to be in the same
namespace as the AlertmanagerConfig object and accessible
by the Prometheus Operator.
properties:
key:
description: The key of the secret to select from. Must
@ -1237,6 +1259,7 @@ spec:
okText:
type: string
text:
minLength: 1
type: string
title:
type: string
@ -1248,8 +1271,10 @@ spec:
style:
type: string
text:
minLength: 1
type: string
type:
minLength: 1
type: string
url:
type: string
@ -1307,8 +1332,10 @@ spec:
short:
type: boolean
title:
minLength: 1
type: string
value:
minLength: 1
type: string
required:
- title
@ -1558,8 +1585,10 @@ spec:
VictorOps. See https://prometheus.io/docs/alerting/latest/configuration/#victorops_config
properties:
apiKey:
description: The API key to use when talking to the VictorOps
API.
description: The secret's key that contains the API key
to use when talking to the VictorOps API. The secret
needs to be in the same namespace as the AlertmanagerConfig
object and accessible by the Prometheus Operator.
properties:
key:
description: The key of the secret to select from. Must
@ -1587,6 +1616,7 @@ spec:
properties:
key:
description: Key of the tuple.
minLength: 1
type: string
value:
description: Value of the tuple.
@ -1820,8 +1850,6 @@ spec:
description: Contains long explanation of the alerted
problem.
type: string
required:
- routingKey
type: object
type: array
webhookConfigs:
@ -2035,8 +2063,9 @@ spec:
type: object
maxAlerts:
description: Maximum number of alerts to be sent per webhook
message.
message. When 0, all alerts are included.
format: int32
minimum: 0
type: integer
sendResolved:
description: Whether or not to notify about resolved alerts.
@ -2334,8 +2363,8 @@ spec:
type: array
route:
description: The Alertmanager route definition for alerts matching
the resources namespace. It will be added to the generated Alertmanager
configuration as a first-level route.
the resources namespace. If present, it will be added to the generated
Alertmanager configuration as a first-level route.
properties:
continue:
description: Boolean indicating whether an alert should continue
@ -2367,6 +2396,7 @@ spec:
properties:
name:
description: Label to match.
minLength: 1
type: string
regex:
description: Whether to match on equality (false) or regular-expression
@ -2377,13 +2407,11 @@ spec:
type: string
required:
- name
- value
type: object
type: array
receiver:
description: Name of the receiver for this route. If present,
it should be listed in the `receivers` field. The field can
be omitted only for nested routes otherwise it is mandatory.
description: Name of the receiver for this route. If not empty,
it should be listed in the `receivers` field.
type: string
repeatInterval:
description: How long to wait before repeating the last notification.


@ -8,6 +8,8 @@ metadata:
spec:
group: monitoring.coreos.com
names:
categories:
- prometheus-operator
kind: Alertmanager
listKind: AlertmanagerList
plural: alertmanagers


@ -8,6 +8,8 @@ metadata:
spec:
group: monitoring.coreos.com
names:
categories:
- prometheus-operator
kind: PodMonitor
listKind: PodMonitorList
plural: podmonitors
@ -197,8 +199,10 @@ spec:
to proxy through this endpoint.
type: string
relabelings:
description: 'RelabelConfigs to apply to samples before ingestion.
More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config'
description: 'RelabelConfigs to apply to samples before scraping.
Prometheus Operator automatically adds relabelings for a few
standard Kubernetes fields and replaces original scrape job
name with __tmp_prometheus_job_name. More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config'
items:
description: 'RelabelConfig allows dynamic rewriting of the
label set, being applied to samples before ingestion. It


@ -8,6 +8,8 @@ metadata:
spec:
group: monitoring.coreos.com
names:
categories:
- prometheus-operator
kind: Probe
listKind: ProbeList
plural: probes
@ -35,6 +37,68 @@ spec:
description: Specification of desired Ingress selection for target discovery
by Prometheus.
properties:
basicAuth:
description: 'BasicAuth allow an endpoint to authenticate over basic
authentication. More info: https://prometheus.io/docs/operating/configuration/#endpoint'
properties:
password:
description: The secret in the service monitor namespace that
contains the password for authentication.
properties:
key:
description: The key of the secret to select from. Must be
a valid secret key.
type: string
name:
description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
TODO: Add other useful fields. apiVersion, kind, uid?'
type: string
optional:
description: Specify whether the Secret or its key must be
defined
type: boolean
required:
- key
type: object
username:
description: The secret in the service monitor namespace that
contains the username for authentication.
properties:
key:
description: The key of the secret to select from. Must be
a valid secret key.
type: string
name:
description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
TODO: Add other useful fields. apiVersion, kind, uid?'
type: string
optional:
description: Specify whether the Secret or its key must be
defined
type: boolean
required:
- key
type: object
type: object
bearerTokenSecret:
description: Secret to mount to read bearer token for scraping targets.
The secret needs to be in the same namespace as the probe and accessible
by the Prometheus Operator.
properties:
key:
description: The key of the secret to select from. Must be a
valid secret key.
type: string
name:
description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
TODO: Add other useful fields. apiVersion, kind, uid?'
type: string
optional:
description: Specify whether the Secret or its key must be defined
type: boolean
required:
- key
type: object
interval:
description: Interval at which targets are probed using the configured
prober. If not specified Prometheus' global scrape interval is used.
@ -190,6 +254,52 @@ spec:
description: Labels assigned to all metrics scraped from the
targets.
type: object
relabelingConfigs:
description: 'RelabelConfigs to apply to samples before ingestion.
More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config'
items:
description: 'RelabelConfig allows dynamic rewriting of
the label set, being applied to samples before ingestion.
It defines `<metric_relabel_configs>`-section of Prometheus
configuration. More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#metric_relabel_configs'
properties:
action:
description: Action to perform based on regex matching.
Default is 'replace'
type: string
modulus:
description: Modulus to take of the hash of the source
label values.
format: int64
type: integer
regex:
description: Regular expression against which the extracted
value is matched. Default is '(.*)'
type: string
replacement:
description: Replacement value against which a regex
replace is performed if the regular expression matches.
Regex capture groups are available. Default is '$1'
type: string
separator:
description: Separator placed between concatenated source
label values. default is ';'.
type: string
sourceLabels:
description: The source labels select values from existing
labels. Their content is concatenated using the configured
separator and matched against the configured regular
expression for the replace, keep, and drop actions.
items:
type: string
type: array
targetLabel:
description: Label to which the resulting value is written
in a replace action. It is mandatory for replace actions.
Regex capture groups are available.
type: string
type: object
type: array
static:
description: Targets is a list of URLs to probe using the
configured prober.
@ -198,6 +308,112 @@ spec:
type: array
type: object
type: object
tlsConfig:
description: TLS configuration to use when scraping the endpoint.
properties:
ca:
description: Struct containing the CA cert to use for the targets.
properties:
configMap:
description: ConfigMap containing data to use for the targets.
properties:
key:
description: The key to select.
type: string
name:
description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
TODO: Add other useful fields. apiVersion, kind, uid?'
type: string
optional:
description: Specify whether the ConfigMap or its key
must be defined
type: boolean
required:
- key
type: object
secret:
description: Secret containing data to use for the targets.
properties:
key:
description: The key of the secret to select from. Must
be a valid secret key.
type: string
name:
description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
TODO: Add other useful fields. apiVersion, kind, uid?'
type: string
optional:
description: Specify whether the Secret or its key must
be defined
type: boolean
required:
- key
type: object
type: object
cert:
description: Struct containing the client cert file for the targets.
properties:
configMap:
description: ConfigMap containing data to use for the targets.
properties:
key:
description: The key to select.
type: string
name:
description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
TODO: Add other useful fields. apiVersion, kind, uid?'
type: string
optional:
description: Specify whether the ConfigMap or its key
must be defined
type: boolean
required:
- key
type: object
secret:
description: Secret containing data to use for the targets.
properties:
key:
description: The key of the secret to select from. Must
be a valid secret key.
type: string
name:
description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
TODO: Add other useful fields. apiVersion, kind, uid?'
type: string
optional:
description: Specify whether the Secret or its key must
be defined
type: boolean
required:
- key
type: object
type: object
insecureSkipVerify:
description: Disable target certificate validation.
type: boolean
keySecret:
description: Secret containing the client key file for the targets.
properties:
key:
description: The key of the secret to select from. Must be
a valid secret key.
type: string
name:
description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
TODO: Add other useful fields. apiVersion, kind, uid?'
type: string
optional:
description: Specify whether the Secret or its key must be
defined
type: boolean
required:
- key
type: object
serverName:
description: Used to verify the hostname for the targets.
type: string
type: object
type: object
required:
- spec


@ -8,6 +8,8 @@ metadata:
spec:
group: monitoring.coreos.com
names:
categories:
- prometheus-operator
kind: Prometheus
listKind: PrometheusList
plural: prometheuses
@ -2199,6 +2201,15 @@ spec:
only clients authorized to perform these actions can do so. For
more information see https://prometheus.io/docs/prometheus/latest/querying/api/#tsdb-admin-apis'
type: boolean
enableFeatures:
description: Enable access to Prometheus disabled features. By default,
no features are enabled. Enabling disabled features is entirely
outside the scope of what the maintainers will support and by doing
so, you accept that this behaviour may break at any time without
notice. For more information see https://prometheus.io/docs/prometheus/latest/disabled_features/
items:
type: string
type: array
enforcedNamespaceLabel:
description: EnforcedNamespaceLabel enforces adding a namespace label
of origin for each alert and metric that is user created. The label
@ -3388,8 +3399,8 @@ spec:
type: string
type: object
podMonitorNamespaceSelector:
description: Namespaces to be selected for PodMonitor discovery. If
nil, only check own namespace.
description: Namespace's labels to match for PodMonitor discovery.
If nil, only check own namespace.
properties:
matchExpressions:
description: matchExpressions is a list of label selector requirements.
@ -3682,7 +3693,7 @@ spec:
type: object
type: object
bearerToken:
description: bearer token for remote read.
description: Bearer token for remote read.
type: string
bearerTokenFile:
description: File to read bearer token for remote read.
@ -3893,11 +3904,32 @@ spec:
type: object
type: object
bearerToken:
description: File to read bearer token for remote write.
description: Bearer token for remote write.
type: string
bearerTokenFile:
description: File to read bearer token for remote write.
type: string
headers:
additionalProperties:
type: string
description: Custom HTTP headers to be sent along with each
remote write request. Be aware that headers that are set by
Prometheus itself can't be overwritten. Only valid in Prometheus
versions 2.25.0 and newer.
type: object
metadataConfig:
description: MetadataConfig configures the sending of series
metadata to remote storage.
properties:
send:
description: Whether metric metadata is sent to remote storage
or not.
type: boolean
sendInterval:
description: How frequently metric metadata is sent to remote
storage.
type: string
type: object
name:
description: The name of the remote write queue, must be unique
if specified. The name is used in metrics and logging in order
@ -4168,7 +4200,8 @@ spec:
(milliseconds seconds minutes hours days weeks years).
type: string
retentionSize:
description: Maximum amount of disk space used by blocks.
description: 'Maximum amount of disk space used by blocks. Supported
units: B, KB, MB, GB, TB, PB, EB. Ex: `512MB`.'
type: string
routePrefix:
description: The route prefix Prometheus registers HTTP handlers for.
@ -4435,7 +4468,7 @@ spec:
to use to run the Prometheus Pods.
type: string
serviceMonitorNamespaceSelector:
description: Namespaces to be selected for ServiceMonitor discovery.
description: Namespace's labels to match for ServiceMonitor discovery.
If nil, only check own namespace.
properties:
matchExpressions:
@ -5072,6 +5105,11 @@ spec:
required:
- key
type: object
tracingConfigFile:
description: TracingConfig specifies the path of the tracing configuration
file. When used alongside with TracingConfig, TracingConfigFile
takes precedence.
type: string
version:
description: Version describes the version of Thanos to use.
type: string


@ -8,6 +8,8 @@ metadata:
spec:
group: monitoring.coreos.com
names:
categories:
- prometheus-operator
kind: ServiceMonitor
listKind: ServiceMonitorList
plural: servicemonitors
@ -184,7 +186,9 @@ spec:
type: string
relabelings:
description: 'RelabelConfigs to apply to samples before scraping.
More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config'
Prometheus Operator automatically adds relabelings for a few
standard Kubernetes fields and replaces original scrape job
name with __tmp_prometheus_job_name. More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config'
items:
description: 'RelabelConfig allows dynamic rewriting of the
label set, being applied to samples before ingestion. It


@ -8,6 +8,8 @@ metadata:
spec:
group: monitoring.coreos.com
names:
categories:
- prometheus-operator
kind: ThanosRuler
listKind: ThanosRulerList
plural: thanosrulers


@ -4,7 +4,8 @@ metadata:
labels:
app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/version: v0.44.1
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.47.0
name: prometheus-operator
rules:
- apiGroups:


@ -4,7 +4,8 @@ metadata:
labels:
app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/version: v0.44.1
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.47.0
name: prometheus-operator
roleRef:
apiGroup: rbac.authorization.k8s.io
@ -13,4 +14,4 @@ roleRef:
subjects:
- kind: ServiceAccount
name: prometheus-operator
namespace: monitoring
namespace: default


@ -4,27 +4,30 @@ metadata:
labels:
app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/version: v0.44.1
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.47.0
name: prometheus-operator
namespace: monitoring
namespace: default
spec:
replicas: 1
selector:
matchLabels:
app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/part-of: kube-prometheus
template:
metadata:
labels:
app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/version: v0.44.1
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.47.0
spec:
containers:
- args:
- --kubelet-service=kube-system/kubelet
- --prometheus-config-reloader=quay.io/prometheus-operator/prometheus-config-reloader:v0.44.1
image: quay.io/prometheus-operator/prometheus-operator:v0.44.1
- --prometheus-config-reloader=quay.io/prometheus-operator/prometheus-config-reloader:v0.47.0
image: quay.io/prometheus-operator/prometheus-operator:v0.47.0
name: prometheus-operator
ports:
- containerPort: 8080
@ -48,12 +51,19 @@ spec:
ports:
- containerPort: 8443
name: https
resources:
limits:
cpu: 20m
memory: 40Mi
requests:
cpu: 10m
memory: 20Mi
securityContext:
runAsGroup: 65532
runAsNonRoot: true
runAsUser: 65532
nodeSelector:
beta.kubernetes.io/os: linux
kubernetes.io/os: linux
securityContext:
runAsNonRoot: true
runAsUser: 65534


@ -0,0 +1,95 @@
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
labels:
app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.47.0
prometheus: k8s
role: alert-rules
name: prometheus-operator-rules
namespace: default
spec:
groups:
- name: prometheus-operator
rules:
- alert: PrometheusOperatorListErrors
annotations:
description: Errors while performing List operations in controller {{$labels.controller}}
in {{$labels.namespace}} namespace.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusoperatorlisterrors
summary: Errors while performing list operations in controller.
expr: |
(sum by (controller,namespace) (rate(prometheus_operator_list_operations_failed_total{job="prometheus-operator",namespace="default"}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_list_operations_total{job="prometheus-operator",namespace="default"}[10m]))) > 0.4
for: 15m
labels:
severity: warning
- alert: PrometheusOperatorWatchErrors
annotations:
description: Errors while performing watch operations in controller {{$labels.controller}}
in {{$labels.namespace}} namespace.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusoperatorwatcherrors
summary: Errors while performing watch operations in controller.
expr: |
(sum by (controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job="prometheus-operator",namespace="default"}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_watch_operations_total{job="prometheus-operator",namespace="default"}[10m]))) > 0.4
for: 15m
labels:
severity: warning
- alert: PrometheusOperatorSyncFailed
annotations:
description: Controller {{ $labels.controller }} in {{ $labels.namespace }}
namespace fails to reconcile {{ $value }} objects.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusoperatorsyncfailed
summary: Last controller reconciliation failed
expr: |
min_over_time(prometheus_operator_syncs{status="failed",job="prometheus-operator",namespace="default"}[5m]) > 0
for: 10m
labels:
severity: warning
- alert: PrometheusOperatorReconcileErrors
annotations:
description: '{{ $value | humanizePercentage }} of reconciling operations
failed for {{ $labels.controller }} controller in {{ $labels.namespace }}
namespace.'
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusoperatorreconcileerrors
summary: Errors while reconciling controller.
expr: |
(sum by (controller,namespace) (rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator",namespace="default"}[5m]))) / (sum by (controller,namespace) (rate(prometheus_operator_reconcile_operations_total{job="prometheus-operator",namespace="default"}[5m]))) > 0.1
for: 10m
labels:
severity: warning
- alert: PrometheusOperatorNodeLookupErrors
annotations:
description: Errors while reconciling Prometheus in {{ $labels.namespace }}
Namespace.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusoperatornodelookuperrors
summary: Errors while reconciling Prometheus.
expr: |
rate(prometheus_operator_node_address_lookup_errors_total{job="prometheus-operator",namespace="default"}[5m]) > 0.1
for: 10m
labels:
severity: warning
- alert: PrometheusOperatorNotReady
annotations:
description: Prometheus operator in {{ $labels.namespace }} namespace isn't
ready to reconcile {{ $labels.controller }} resources.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusoperatornotready
summary: Prometheus operator not ready
expr: |
min by(namespace, controller) (max_over_time(prometheus_operator_ready{job="prometheus-operator",namespace="default"}[5m]) == 0)
for: 5m
labels:
severity: warning
- alert: PrometheusOperatorRejectedResources
annotations:
description: Prometheus operator in {{ $labels.namespace }} namespace rejected
{{ printf "%0.0f" $value }} {{ $labels.controller }}/{{ $labels.resource
}} resources.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusoperatorrejectedresources
summary: Resources rejected by Prometheus operator
expr: |
min_over_time(prometheus_operator_managed_resources{state="rejected",job="prometheus-operator",namespace="default"}[5m]) > 0
for: 5m
labels:
severity: warning
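These rules are rendered from the jsonnet sources, so the selectors above (job="prometheus-operator",namespace="default") track whatever namespace the stack was generated for. A minimal sketch of regenerating against a dedicated namespace using the new main.libsonnet entry point referenced later in this commit; the values.common.namespace field name is an assumption, not taken from this diff:

local kp =
  (import 'kube-prometheus/main.libsonnet') +
  {
    values+:: {
      common+: {
        namespace: 'monitoring',  // assumed knob; regenerating would then render namespace="monitoring" into the selectors above
      },
    },
  };

{ ['prometheus-operator-' + name]: kp.prometheusOperator[name] for name in std.objectFields(kp.prometheusOperator) }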

View File

@ -4,9 +4,10 @@ metadata:
labels:
app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/version: v0.44.1
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.47.0
name: prometheus-operator
namespace: monitoring
namespace: default
spec:
clusterIP: None
ports:
@ -16,3 +17,4 @@ spec:
selector:
app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/part-of: kube-prometheus

View File

@ -4,6 +4,7 @@ metadata:
labels:
app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/version: v0.44.1
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.47.0
name: prometheus-operator
namespace: monitoring
namespace: default

View File

@ -8,7 +8,7 @@ local statefulSet = k.apps.v1.statefulSet;
local selector = statefulSet.mixin.spec.selectorType;
local kp =
(import 'kube-prometheus/kube-prometheus.libsonnet') +
(import 'kube-prometheus/main.libsonnet') +
(import 'prometheus-pushgateway/pushgateway.libsonnet') +
(import 'k3s.libsonnet')

View File

@ -1 +0,0 @@
github.com/etcd-io/etcd/Documentation/etcd-mixin

View File

@ -7,7 +7,7 @@
},
imageRepos+:: {
grafana: 'grafana/grafana',
grafana: 'docker.io/grafana/grafana',
},
prometheus+:: {
@ -16,6 +16,11 @@
},
grafana+:: {
labels: {
'app.kubernetes.io/name': 'grafana',
'app.kubernetes.io/version': $._config.versions.grafana,
'app.kubernetes.io/component': 'grafana',
},
dashboards: {},
rawDashboards: {},
folderDashboards: {},
@ -51,6 +56,7 @@
metadata: {
name: 'grafana-config',
namespace: $._config.namespace,
labels: $._config.grafana.labels,
},
type: 'Opaque',
data: {
@ -67,6 +73,7 @@
metadata: {
name: dashboardName,
namespace: $._config.namespace,
labels: $._config.grafana.labels,
},
data: { [name]: std.manifestJsonEx($._config.grafana.dashboards[name], ' ') },
}
@ -79,6 +86,7 @@
metadata: {
name: dashboardName,
namespace: $._config.namespace,
labels: $._config.grafana.labels,
},
data: { [name]: std.manifestJsonEx($._config.grafana.folderDashboards[folder][name], ' ') },
}
@ -95,6 +103,7 @@
metadata: {
name: dashboardName,
namespace: $._config.namespace,
labels: $._config.grafana.labels,
},
data: { [name]: $._config.grafana.rawDashboards[name] },
}
@ -141,6 +150,7 @@
metadata: {
name: 'grafana-dashboards',
namespace: $._config.namespace,
labels: $._config.grafana.labels,
},
data: { 'dashboards.yaml': std.manifestJsonEx(dashboardSources, ' ') },
},
@ -151,6 +161,7 @@
metadata: {
name: 'grafana-datasources',
namespace: $._config.namespace,
labels: $._config.grafana.labels,
},
type: 'Opaque',
data: { 'datasources.yaml': std.base64(std.encodeUTF8(std.manifestJsonEx({
@ -165,13 +176,10 @@
metadata: {
name: 'grafana',
namespace: $._config.namespace,
labels: {
app: 'grafana',
},
labels: $._config.grafana.labels,
},
spec: {
selector: $.grafana.deployment.spec.selector.matchLabels,
type: 'NodePort',
ports: [
{ name: 'http', targetPort: 'http', port: 3000 },
],
@ -189,7 +197,12 @@
deployment:
local targetPort = $._config.grafana.port;
local portName = 'http';
local podLabels = { app: 'grafana' };
local podLabels = $._config.grafana.labels;
local podSelectorLabels = {
[labelName]: podLabels[labelName]
for labelName in std.objectFields(podLabels)
if !std.setMember(labelName, ['app.kubernetes.io/version'])
};
local configVolumeName = 'grafana-config';
local configSecretName = 'grafana-config';
@ -311,7 +324,7 @@
spec: {
replicas: 1,
selector: {
matchLabels: podLabels,
matchLabels: podSelectorLabels,
},
template: {
metadata: {

View File

@ -10,6 +10,11 @@
// scrape_interval_seconds is the global scrape interval which can be
// used to dynamically adjust rate windows as a function of the interval.
scrape_interval_seconds: 30,
    // Dashboard variable refresh option in Grafana (https://grafana.com/docs/grafana/latest/datasources/prometheus/).
    // 0 : Never (never refreshes the dashboard variable values)
    // 1 : On Dashboard Load (refreshes the dashboard variables when the dashboard is loaded)
    // 2 : On Time Range Change (refreshes the dashboard variables when the time range changes)
dashboard_var_refresh: 2,
},
prometheusAlerts+:: {
@ -202,51 +207,6 @@
summary: 'etcd cluster 99th percentile commit durations are too high.',
},
},
{
alert: 'etcdHighNumberOfFailedHTTPRequests',
expr: |||
sum(rate(etcd_http_failed_total{%(etcd_selector)s, code!="404"}[5m])) without (code) / sum(rate(etcd_http_received_total{%(etcd_selector)s}[5m]))
without (code) > 0.01
||| % $._config,
'for': '10m',
labels: {
severity: 'warning',
},
annotations: {
description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}',
summary: 'etcd has high number of failed HTTP requests.',
},
},
{
alert: 'etcdHighNumberOfFailedHTTPRequests',
expr: |||
sum(rate(etcd_http_failed_total{%(etcd_selector)s, code!="404"}[5m])) without (code) / sum(rate(etcd_http_received_total{%(etcd_selector)s}[5m]))
without (code) > 0.05
||| % $._config,
'for': '10m',
labels: {
severity: 'critical',
},
annotations: {
description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}.',
summary: 'etcd has high number of failed HTTP requests.',
},
},
{
alert: 'etcdHTTPRequestsSlow',
expr: |||
histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m]))
> 0.15
||| % $._config,
'for': '10m',
labels: {
severity: 'warning',
},
annotations: {
description: 'etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow.',
summary: 'etcd instance HTTP requests are slow.',
},
},
{
alert: 'etcdBackendQuotaLowSpace',
expr: |||
@ -283,7 +243,7 @@
uid: std.md5('etcd.json'),
title: 'etcd',
description: 'etcd sample Grafana dashboard with Prometheus',
tags: [],
tags: [ 'etcd-mixin' ],
style: 'dark',
timezone: 'browser',
editable: true,
@ -1332,7 +1292,7 @@
name: 'cluster',
options: [],
query: 'label_values(etcd_server_has_leader, job)',
refresh: 1,
refresh: $._config.dashboard_var_refresh,
regex: '',
sort: 2,
tagValuesQuery: '',
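The new dashboard_var_refresh knob controls how Grafana refreshes the dashboard's template variables; it is wired into the refresh field of the cluster variable above. A minimal sketch of overriding it when consuming the vendored etcd mixin; the mixin.libsonnet import path under contrib/mixin is assumed:

(import 'github.com/etcd-io/etcd/contrib/mixin/mixin.libsonnet') +
{
  _config+:: {
    etcd_selector: 'job="etcd"',
    dashboard_var_refresh: 1,  // refresh template variables on dashboard load instead of on time-range change
  },
}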

View File

@ -0,0 +1,57 @@
{
/**
* Creates a [Google Cloud Monitoring target](https://grafana.com/docs/grafana/latest/datasources/cloudmonitoring/)
*
* @name cloudmonitoring.target
*
* @param metric
* @param project
* @param filters (optional)
* @param groupBys (optional)
* @param period (default: `'cloud-monitoring-auto'`)
 * @param crossSeriesReducer (default: `'REDUCE_MAX'`)
 * @param valueType (default: `'INT64'`)
 * @param perSeriesAligner (default: `'ALIGN_DELTA'`)
 * @param metricKind (default: `'CUMULATIVE'`)
* @param unit (optional)
* @param alias (optional)
* @return Panel target
*/
target(
metric,
project,
filters=[],
groupBys=[],
period='cloud-monitoring-auto',
crossSeriesReducer='REDUCE_MAX',
valueType='INT64',
perSeriesAligner='ALIGN_DELTA',
metricKind='CUMULATIVE',
unit=1,
alias=null,
):: {
metricQuery: {
[if alias != null then 'aliasBy']: alias,
alignmentPeriod: period,
crossSeriesReducer: crossSeriesReducer,
[if filters != null then 'filters']: filters,
[if groupBys != null then 'groupBys']: groupBys,
metricKind: metricKind,
metricType: metric,
perSeriesAligner: perSeriesAligner,
projectName: project,
unit: unit,
valueType: valueType,
},
sloQuery: {
[if alias != null then 'aliasBy']: alias,
alignmentPeriod: period,
projectName: project,
selectorName: 'select_slo_health',
serviceId: '',
sloId: '',
},
},
}
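This new cloudmonitoring target builder is exposed from grafana.libsonnet later in this commit. A hedged usage sketch; the datasource name, metric type, and project id are placeholders:

local grafana = import 'grafonnet/grafana.libsonnet';
local graphPanel = grafana.graphPanel;
local cloudmonitoring = grafana.cloudmonitoring;

graphPanel.new('GCE CPU usage', datasource='Google Cloud Monitoring')  // placeholder datasource name
.addTarget(
  cloudmonitoring.target(
    'compute.googleapis.com/instance/cpu/usage_time',  // placeholder metric type
    'my-gcp-project',  // placeholder project id
    groupBys=['resource.label.zone'],
    perSeriesAligner='ALIGN_RATE',
    metricKind='DELTA',
    unit='s',
  )
)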

View File

@ -13,6 +13,9 @@
* @param highResolution (default: `false`)
* @param period (default: `'1m'`)
* @param dimensions (optional)
* @param id (optional)
* @param expression (optional)
* @param hide (optional)
* @return Panel target
*/
@ -26,7 +29,10 @@
alias=null,
highResolution=false,
period='1m',
dimensions={}
dimensions={},
id=null,
expression=null,
hide=null
):: {
region: region,
namespace: namespace,
@ -37,5 +43,9 @@
highResolution: highResolution,
period: period,
dimensions: dimensions,
[if id != null then 'id']: id,
[if expression != null then 'expression']: expression,
[if hide != null then 'hide']: hide,
},
}
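The added id, expression, and hide parameters make CloudWatch metric-math targets possible: one target exposes an id, a second target references it in an expression. A rough sketch, assuming the pre-existing positional region/namespace/metric arguments of cloudwatch.target; the region, metric, and instance id are placeholders:

local grafana = import 'grafonnet/grafana.libsonnet';
local cloudwatch = grafana.cloudwatch;

[
  // raw metric, hidden from the panel and only referenced by the expression below
  cloudwatch.target('eu-west-1', 'AWS/EC2', 'CPUUtilization',
                    dimensions={ InstanceId: 'i-0123456789abcdef0' },  // placeholder instance
                    id='cpu', hide=true),
  // metric-math expression built on the id above
  cloudwatch.target('eu-west-1', 'AWS/EC2', 'CPUUtilization',
                    id='cpu_scaled', expression='cpu / 100'),
]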

View File

@ -36,6 +36,7 @@
* @method addMappings(mappings) Adds an array of value mappings.
* @method addDataLink(link) Adds a data link.
* @method addDataLinks(links) Adds an array of data links.
* @param timeFrom (optional)
*/
new(
title,
@ -58,6 +59,7 @@
repeat=null,
repeatDirection='h',
repeatMaxPerRow=null,
timeFrom=null,
pluginVersion='7',
):: {
@ -71,6 +73,7 @@
[if repeat != null then 'repeat']: repeat,
[if repeat != null then 'repeatDirection']: repeatDirection,
[if repeat != null then 'repeatMaxPerRow']: repeatMaxPerRow,
[if timeFrom != null then 'timeFrom']: timeFrom,
// targets
_nextTarget:: 0,
@ -138,6 +141,21 @@
fieldConfig+: { defaults+: { links+: [link] } },
},
// Overrides
addOverride(
matcher=null,
properties=null,
):: self {
fieldConfig+: {
overrides+: [
{
[if matcher != null then 'matcher']: matcher,
[if properties != null then 'properties']: properties,
},
],
},
},
addOverrides(overrides):: std.foldl(function(p, o) p.addOverride(o.matcher, o.properties), overrides, self),
} else {
options: {

View File

@ -18,6 +18,7 @@
sql:: import 'sql.libsonnet',
graphite:: import 'graphite.libsonnet',
alertCondition:: import 'alert_condition.libsonnet',
cloudmonitoring:: import 'cloudmonitoring.libsonnet',
cloudwatch:: import 'cloudwatch.libsonnet',
elasticsearch:: import 'elasticsearch.libsonnet',
heatmapPanel:: import 'heatmap_panel.libsonnet',
@ -27,4 +28,5 @@
gaugePanel:: import 'gauge_panel.libsonnet',
barGaugePanel:: import 'bar_gauge_panel.libsonnet',
statPanel:: import 'stat_panel.libsonnet',
transformation:: import 'transformation.libsonnet',
}

View File

@ -21,6 +21,7 @@
* @param formatY2 (optional) Unit of the second Y axis
* @param min (optional) Min of the Y axes
* @param max (optional) Max of the Y axes
* @param maxDataPoints (optional) If the data source supports it, sets the maximum number of data points for each series returned.
* @param labelY1 (optional) Label of the first Y axis
* @param labelY2 (optional) Label of the second Y axis
* @param x_axis_mode (default `'time'`) X axis mode, one of [time, series, histogram]
@ -57,6 +58,8 @@
* @param value_type (default `'individual'`) Type of tooltip value
 * @param shared_tooltip (default `true`) Whether to group or split tooltips on mouseover within a chart
 * @param percentage (default `false`) Show as percentages
 * @param interval (default `null`) A lower limit for the interval.
*
* @method addTarget(target) Adds a target object.
* @method addTargets(targets) Adds an array of targets.
@ -126,8 +129,10 @@
value_type='individual',
shared_tooltip=true,
percentage=false,
maxDataPoints=null,
time_from=null,
time_shift=null,
interval=null
):: {
title: title,
[if span != null then 'span']: span,
@ -179,6 +184,7 @@
bars: bars,
stack: stack,
percentage: percentage,
[if maxDataPoints != null then 'maxDataPoints']: maxDataPoints,
legend: {
show: legend_show,
values: legend_values,
@ -204,6 +210,7 @@
},
timeFrom: time_from,
timeShift: time_shift,
[if interval != null then 'interval']: interval,
[if transparent == true then 'transparent']: transparent,
aliasColors: aliasColors,
repeat: repeat,
@ -288,5 +295,19 @@
links+: [link],
},
addLinks(links):: std.foldl(function(p, t) p.addLink(t), links, self),
addOverride(
matcher=null,
properties=null,
):: self {
fieldConfig+: {
overrides+: [
{
[if matcher != null then 'matcher']: matcher,
[if properties != null then 'properties']: properties,
},
],
},
},
addOverrides(overrides):: std.foldl(function(p, o) p.addOverride(o.matcher, o.properties), overrides, self),
},
}
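The new addOverride/addOverrides helpers append entries to fieldConfig.overrides. A hedged sketch using Grafana's byName matcher; the panel title, series name, and property ids are illustrative only:

local grafana = import 'grafonnet/grafana.libsonnet';
local graphPanel = grafana.graphPanel;

graphPanel.new('Request latency', format='s')
.addOverride(
  matcher={ id: 'byName', options: '99th Percentile' },
  properties=[
    { id: 'custom.lineWidth', value: 2 },
    { id: 'color', value: { mode: 'fixed', fixedColor: 'red' } },
  ],
)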

View File

@ -42,6 +42,7 @@
* @param yBucketBound (default `'auto'`) Which bound ('lower' or 'upper') of the bucket to use
* @param yBucketNumber (optional) Number of buckets for the Y axis
* @param yBucketSize (optional) Size of Y axis buckets. Has priority over yBucketNumber
* @param maxDataPoints (optional) The maximum data points per series. Used directly by some data sources and used in calculation of auto interval. With streaming data this value is used for the rolling buffer.
*
* @method addTarget(target) Adds a target object.
* @method addTargets(targets) Adds an array of targets.
@ -83,7 +84,7 @@
yBucketBound='auto',
yBucketNumber=null,
yBucketSize=null,
maxDataPoints=null,
):: {
title: title,
type: 'heatmap',
@ -135,6 +136,7 @@
yBucketBound: yBucketBound,
[if dataFormat == 'timeseries' then 'yBucketNumber']: yBucketNumber,
[if dataFormat == 'timeseries' then 'yBucketSize']: yBucketSize,
[if maxDataPoints != null then 'maxDataPoints']: maxDataPoints,
_nextTarget:: 0,
addTarget(target):: self {

View File

@ -7,14 +7,17 @@
* @param rawSql The SQL query
* @param datasource (optional)
* @param format (default `'time_series'`)
* @param alias (optional)
*/
target(
rawSql,
datasource=null,
format='time_series',
alias=null,
):: {
[if datasource != null then 'datasource']: datasource,
format: format,
[if alias != null then 'alias']: alias,
rawSql: rawSql,
},
}
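A small sketch of the new alias parameter on the SQL target; the query and alias are placeholders:

local grafana = import 'grafonnet/grafana.libsonnet';
local sql = grafana.sql;

sql.target(
  "SELECT $__timeGroup(ts, '1m') AS time, avg(duration) AS value FROM requests GROUP BY 1",  // placeholder query
  format='time_series',
  alias='avg request duration',
)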

View File

@ -23,9 +23,10 @@
* @param displayName (optional) Change the field or series name.
* @param noValue (optional) What to show when there is no value.
* @param thresholdsMode (default `'absolute'`) 'absolute' or 'percentage'.
* @param timeFrom (optional) Override the relative time range.
* @param repeat (optional) Name of variable that should be used to repeat this panel.
* @param repeatDirection (default `'h'`) 'h' for horizontal or 'v' for vertical.
* @param repeatMaxPerRow (optional) Maximum panels per row in repeat mode.
* @param maxPerRow (optional) Maximum panels per row in repeat mode.
* @param pluginVersion (default `'7'`) Plugin version the panel should be modeled for. This has been tested with the default, '7', and '6.7'.
*
* @method addTarget(target) Adds a target object.
@ -59,9 +60,10 @@
displayName=null,
noValue=null,
thresholdsMode='absolute',
timeFrom=null,
repeat=null,
repeatDirection='h',
repeatMaxPerRow=null,
maxPerRow=null,
pluginVersion='7',
):: {
@ -74,7 +76,8 @@
links: [],
[if repeat != null then 'repeat']: repeat,
[if repeat != null then 'repeatDirection']: repeatDirection,
[if repeat != null then 'repeatMaxPerRow']: repeatMaxPerRow,
[if timeFrom != null then 'timeFrom']: timeFrom,
[if repeat != null then 'maxPerRow']: maxPerRow,
// targets
_nextTarget:: 0,
@ -143,6 +146,22 @@
addDataLink(link):: self {
fieldConfig+: { defaults+: { links+: [link] } },
},
// Overrides
addOverride(
matcher=null,
properties=null,
):: self {
fieldConfig+: {
overrides+: [
{
[if matcher != null then 'matcher']: matcher,
[if properties != null then 'properties']: properties,
},
],
},
},
addOverrides(overrides):: std.foldl(function(p, o) p.addOverride(o.matcher, o.properties), overrides, self),
} else {
options: {
fieldOptions: {

View File

@ -24,6 +24,8 @@
* @method addColumn(field, style) Adds a column
* @method hideColumn(field) Hides a column
* @method addLink(link) Adds a link
* @method addTransformation(transformation) Adds a transformation object
* @method addTransformations(transformations) Adds an array of transformations
*/
new(
title,
@ -81,5 +83,9 @@
addLink(link):: self {
links+: [link],
},
addTransformation(transformation):: self {
transformations+: [transformation],
},
addTransformations(transformations):: std.foldl(function(p, t) p.addTransformation(t), transformations, self),
},
}

View File

@ -1,6 +1,6 @@
{
/**
* Creates a [template](https://grafana.com/docs/grafana/latest/variables/templates-and-variables/#templates) that can be added to a dashboard.
* Creates a [template](https://grafana.com/docs/grafana/latest/variables/#templates) that can be added to a dashboard.
*
* @name template.new
*
@ -18,7 +18,7 @@
* @param multi (default `false`) Whether multiple values can be selected or not from variable value list.
* @param sort (default `0`) `0`: Without Sort, `1`: Alphabetical (asc), `2`: Alphabetical (desc), `3`: Numerical (asc), `4`: Numerical (desc).
*
* @return A [template](https://grafana.com/docs/grafana/latest/variables/templates-and-variables/#templates)
* @return A [template](https://grafana.com/docs/grafana/latest/variables/#templates)
*/
new(
name,

View File

@ -31,8 +31,10 @@
'7d',
'30d',
],
nowDelay=null,
):: {
refresh_intervals: refresh_intervals,
time_options: time_options,
[if nowDelay != null then 'nowDelay']: nowDelay,
},
}
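nowDelay shifts the effective "now" back so the newest, possibly half-scraped datapoint is not rendered. A hedged sketch, assuming the timepicker module is exposed from grafana.libsonnet like the other helpers:

local timepicker = (import 'grafonnet/grafana.libsonnet').timepicker;

timepicker.new(
  refresh_intervals=['30s', '1m', '5m'],
  nowDelay='1m',  // treat "now" as one minute ago so incomplete points are not drawn
)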

View File

@ -0,0 +1,12 @@
{
/**
* @name transformation.new
*/
new(
id='',
options={}
):: {
id: id,
options: options,
},
}
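transformation.new pairs with the addTransformation/addTransformations helpers added to the table panel earlier in this commit. A hedged sketch using Grafana's 'organize' transformation; the panel title and field names are placeholders:

local grafana = import 'grafonnet/grafana.libsonnet';
local tablePanel = grafana.tablePanel;
local transformation = grafana.transformation;

tablePanel.new('Pods per namespace')
.addTransformation(transformation.new('organize', {
  excludeByName: { Time: true },
  renameByName: { namespace: 'Namespace', pod: 'Pod' },
}))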

View File

@ -44,7 +44,7 @@
addMultiTemplate(name, metric_name, label_name, hide=0):: self {
templating+: {
list+: [{
allValue: null,
allValue: '.+',
current: {
selected: true,
text: 'All',
@ -196,7 +196,7 @@
timeShift: null,
title: title,
tooltip: {
shared: true,
shared: false,
sort: 0,
value_type: 'individual',
},
@ -382,7 +382,7 @@
expr:
|||
sum by (status) (
label_replace(label_replace(rate(%s[$__interval]),
label_replace(label_replace(rate(%s[$__rate_interval]),
"status", "${1}xx", "%s", "([0-9]).."),
"status", "${1}", "%s", "([a-z]+)"))
||| % [selector, statusLabelName, statusLabelName],
@ -399,7 +399,7 @@
nullPointMode: 'null as zero',
targets: [
{
expr: 'histogram_quantile(0.99, sum(rate(%s_bucket%s[$__interval])) by (le)) * %s' % [metricName, selector, multiplier],
expr: 'histogram_quantile(0.99, sum(rate(%s_bucket%s[$__rate_interval])) by (le)) * %s' % [metricName, selector, multiplier],
format: 'time_series',
intervalFactor: 2,
legendFormat: '99th Percentile',
@ -407,7 +407,7 @@
step: 10,
},
{
expr: 'histogram_quantile(0.50, sum(rate(%s_bucket%s[$__interval])) by (le)) * %s' % [metricName, selector, multiplier],
expr: 'histogram_quantile(0.50, sum(rate(%s_bucket%s[$__rate_interval])) by (le)) * %s' % [metricName, selector, multiplier],
format: 'time_series',
intervalFactor: 2,
legendFormat: '50th Percentile',
@ -415,7 +415,7 @@
step: 10,
},
{
expr: 'sum(rate(%s_sum%s[$__interval])) * %s / sum(rate(%s_count%s[$__interval]))' % [metricName, selector, multiplier, metricName, selector],
expr: 'sum(rate(%s_sum%s[$__rate_interval])) * %s / sum(rate(%s_count%s[$__rate_interval]))' % [metricName, selector, multiplier, metricName, selector],
format: 'time_series',
intervalFactor: 2,
legendFormat: 'Average',

View File

@ -6,6 +6,8 @@ approvers:
- metalmatze
- tomwilkie
- s-urbaniak
- povilasv
- paulfantom
reviewers:
- brancz
@ -13,3 +15,5 @@ reviewers:
- metalmatze
- tomwilkie
- s-urbaniak
- povilasv
- paulfantom

View File

@ -7,15 +7,17 @@ A set of Grafana dashboards and Prometheus alerts for Kubernetes.
## Releases
| Release branch | Kubernetes Compatibility | Prometheus Compatibility |
| ------- | -------------------------- | ------------------------ |
| release-0.1 | v1.13 and before | |
| release-0.2 | v1.14.1 and before | v2.11.0+ |
| release-0.3 | v1.17 and before | v2.11.0+ |
| release-0.4 | v1.18 | v2.11.0+ |
| release-0.5 | v1.19 | v2.11.0+ |
| release-0.6 | v1.19+ | v2.11.0+ |
| master | v1.19+ | v2.11.0+ |
| Release branch | Kubernetes Compatibility | Prometheus Compatibility | Kube-state-metrics Compatibility |
| -------------- | -------------------------- | ------------------------ | -------------------------------- |
| release-0.1 | v1.13 and before | | |
| release-0.2 | v1.14.1 and before | v2.11.0+ | |
| release-0.3 | v1.17 and before | v2.11.0+ | |
| release-0.4 | v1.18 | v2.11.0+ | |
| release-0.5 | v1.19 | v2.11.0+ | |
| release-0.6 | v1.19+ | v2.11.0+ | |
| release-0.7 | v1.19+ | v2.11.0+ | v1.x |
| release-0.8 | v1.20+ | v2.11.0+ | v2.0+ |
| master | v1.20+ | v2.11.0+ | v2.0+ |
In Kubernetes 1.14 a major [metrics overhaul](https://github.com/kubernetes/enhancements/issues/1206) was implemented.
Therefore v0.1.x of this repository is the last release to support Kubernetes 1.13 and previous versions on a best-effort basis.

Some files were not shown because too many files have changed in this diff.