major monitoring upgrade
continuous-integration/drone/push Build is passing Details

This commit is contained in:
Tobias Brunner 2021-05-01 16:24:35 +02:00
parent ab98ad2e70
commit d26b64b384
228 changed files with 8917 additions and 7056 deletions

View File

@ -8,7 +8,7 @@
"subdir": "jsonnet/kube-prometheus" "subdir": "jsonnet/kube-prometheus"
} }
}, },
"version": "master", "version": "main",
"name": "kube-prometheus" "name": "kube-prometheus"
}, },
{ {

View File

@ -8,18 +8,18 @@
"subdir": "grafana" "subdir": "grafana"
} }
}, },
"version": "8024f4fdaeb3a3a7d72f77e2ed87deb92c79aeda", "version": "8ea4e7bc04b1bf5e9bd99918ca28c6271b42be0e",
"sum": "WXrJQtWuU5lJVc4jXkJGddPMpPP0+4eMcIB5cauZGgM=" "sum": "muenICtKXABk6MZZHCZD2wCbmtiE96GwWRMGa1Rg+wA="
}, },
{ {
"source": { "source": {
"git": { "git": {
"remote": "https://github.com/etcd-io/etcd", "remote": "https://github.com/etcd-io/etcd",
"subdir": "Documentation/etcd-mixin" "subdir": "contrib/mixin"
} }
}, },
"version": "ca866c02422ff3f3d1f0876898a30c33dd7bcccf", "version": "562d645ac923388ff5b8d270b0536764d34b0e0f",
"sum": "bLqTqEr0jky9zz5MV/7ucn6H5mph2NlXas0TVnGNB1Y=" "sum": "W/Azptf1PoqjyMwJON96UY69MFugDA4IAYiKURscryc="
}, },
{ {
"source": { "source": {
@ -28,8 +28,8 @@
"subdir": "grafonnet" "subdir": "grafonnet"
} }
}, },
"version": "356bd73e4792ffe107725776ca8946895969c191", "version": "55cf4ee53ced2b6d3ce96ecce9fb813b4465be98",
"sum": "CSMZ3dJrpJpwvffie8BqcfrIVVwiKNqdPEN+1XWRBGU=" "sum": "4/sUV0Kk+o8I+wlYxL9R6EPhL/NiLfYHk+NXlU64RUk="
}, },
{ {
"source": { "source": {
@ -38,8 +38,8 @@
"subdir": "grafana-builder" "subdir": "grafana-builder"
} }
}, },
"version": "216bc806bb512f218e3cf5ed3d4f5699b07f04d6", "version": "dbf1211d003d20c7adcdee942c477e648507a398",
"sum": "9/eJqljTTtJeq9QRjabdKWL6yD8a7VzLmGKBK3ir77k=" "sum": "GRf2GvwEU4jhXV+JOonXSZ4wdDv8mnHBPCQ6TUVd+g8="
}, },
{ {
"source": { "source": {
@ -59,8 +59,8 @@
"subdir": "" "subdir": ""
} }
}, },
"version": "ead45674dba3c8712e422d99223453177aac6bf4", "version": "c67c0f19e869f1da34d79b6507c1fa37c23a6e4e",
"sum": "3i0NkntlBluDS1NRF+iSc2e727Alkv3ziuVjAP12/kE=" "sum": "F+RxcI26zeoeI81uot39Jv6IpQ6BOz+xlSHlElJYsz8="
}, },
{ {
"source": { "source": {
@ -69,7 +69,7 @@
"subdir": "lib/promgrafonnet" "subdir": "lib/promgrafonnet"
} }
}, },
"version": "ead45674dba3c8712e422d99223453177aac6bf4", "version": "39a9cda705b5201c35105bd1f24c83923fa839ef",
"sum": "zv7hXGui6BfHzE9wPatHI/AGZa4A2WKo6pq7ZdqBsps=" "sum": "zv7hXGui6BfHzE9wPatHI/AGZa4A2WKo6pq7ZdqBsps="
}, },
{ {
@ -79,8 +79,8 @@
"subdir": "jsonnet/kube-state-metrics" "subdir": "jsonnet/kube-state-metrics"
} }
}, },
"version": "89aaf6c524ee891140c4c8f2a05b1b16f5847309", "version": "b1889aa1561ee269f628e2b9659155e7714dbbf0",
"sum": "zD/pbQLnQq+5hegEelaheHS8mn1h09GTktFO74iwlBI=" "sum": "S5qI+PJUdNeYOv76jH5nxwYS9N6U7CRxvyuB1wI4cTE="
}, },
{ {
"source": { "source": {
@ -89,7 +89,7 @@
"subdir": "jsonnet/kube-state-metrics-mixin" "subdir": "jsonnet/kube-state-metrics-mixin"
} }
}, },
"version": "7bdd62593c9273b5179cf3c9d2d819e9d997aaa4", "version": "b1889aa1561ee269f628e2b9659155e7714dbbf0",
"sum": "Yf8mNAHrV1YWzrdV8Ry5dJ8YblepTGw3C0Zp10XIYLo=" "sum": "Yf8mNAHrV1YWzrdV8Ry5dJ8YblepTGw3C0Zp10XIYLo="
}, },
{ {
@ -99,8 +99,8 @@
"subdir": "jsonnet/kube-prometheus" "subdir": "jsonnet/kube-prometheus"
} }
}, },
"version": "7d7d40b4dee70ecd3328dcdee2ed0cc8f806df93", "version": "5b2740d517095a6ae9ad51bcb9c53e5ef28c62a0",
"sum": "6PhhQPWilq4skfe+z/hXKEg1pRqHnwvMR1Au6W136U0=" "sum": "+6VkkR44AC3Qnwfr9cWYCKs+uRi5JaIOda/3X1JEzAg="
}, },
{ {
"source": { "source": {
@ -109,8 +109,9 @@
"subdir": "jsonnet/mixin" "subdir": "jsonnet/mixin"
} }
}, },
"version": "117c9a2cd905479022a66ddd92a41f599cccf10d", "version": "b7ca32169844f0b5143f3e5e318fc05fa025df18",
"sum": "6reUygVmQrLEWQzTKcH8ceDbvM+2ztK3z2VBR2K2l+U=" "sum": "6reUygVmQrLEWQzTKcH8ceDbvM+2ztK3z2VBR2K2l+U=",
"name": "prometheus-operator-mixin"
}, },
{ {
"source": { "source": {
@ -119,8 +120,8 @@
"subdir": "jsonnet/prometheus-operator" "subdir": "jsonnet/prometheus-operator"
} }
}, },
"version": "d8b7d3766225908d0239fd0d78258892cd0fc384", "version": "b7ca32169844f0b5143f3e5e318fc05fa025df18",
"sum": "Nl+N/h76bzD9tZ8tx7tuNIKHwCIJ9zyOsAWplH8HvAE=" "sum": "MRwyChXdKG3anL2OWpbUu3qWc97w9J6YsjUWjLFQyB0="
}, },
{ {
"source": { "source": {
@ -129,8 +130,8 @@
"subdir": "doc/alertmanager-mixin" "subdir": "doc/alertmanager-mixin"
} }
}, },
"version": "193ebba04d1e70d971047e983a0b489112610460", "version": "99f64e944b1043c790784cf5373c8fb349816fc4",
"sum": "QcftU7gjCQyj7B6M4YJeCAeaPd0kwxd4J4rolo7AnLE=", "sum": "V8jcZQ1Qrlm7AQ6wjbuQQsacPb0NvrcZovKyplmzW5w=",
"name": "alertmanager" "name": "alertmanager"
}, },
{ {
@ -140,8 +141,8 @@
"subdir": "docs/node-mixin" "subdir": "docs/node-mixin"
} }
}, },
"version": "8b466360a35581e0301bd22918be7011cf4203c3", "version": "b597c1244d7bef49e6f3359c87a56dd7707f6719",
"sum": "rvyiD/yCB4BeYAWqYF53bP8c+aCUt2ipLHW2Ea8ELO8=" "sum": "cZTNXQMUCLB5FGYpMn845dcqGdkcYt58qCqOFIV/BoQ="
}, },
{ {
"source": { "source": {
@ -150,8 +151,8 @@
"subdir": "documentation/prometheus-mixin" "subdir": "documentation/prometheus-mixin"
} }
}, },
"version": "26d89b4b0776fe4cd5a3656dfa520f119a375273", "version": "3cafc58827d1ebd1a67749f88be4218f0bab3d8d",
"sum": "1VRVMuxAEZ9vdGHFlndmG9iQzDD6AoIXrX80CDpGDaU=", "sum": "VK0c3sQ3ksiM6JQsAVfWmL5NbzGv9llMfXFNXfFdJ+A=",
"name": "prometheus" "name": "prometheus"
}, },
{ {
@ -161,8 +162,9 @@
"subdir": "mixin" "subdir": "mixin"
} }
}, },
"version": "37e6ef61566c7c70793ba6d128f00c4c66cb2402", "version": "ba6c5c4726ff52807c7383c68f2159b1af7980bb",
"sum": "OptiWUMOHFrRGTZhSfxV1RCeXZ90qsefGNTD4lDYVG0=" "sum": "XP3uq7xcfKHsnWsz1v992csZhhZR3jQma6hFOfSViTs=",
"name": "thanos-mixin"
}, },
{ {
"source": { "source": {

View File

@ -72,7 +72,7 @@ local masterIP = '185.95.218.11';
{ {
port: 'http-metrics', port: 'http-metrics',
interval: '30s', interval: '30s',
metricRelabelings: (import 'kube-prometheus/dropping-deprecated-metrics-relabelings.libsonnet') + [ metricRelabelings: (import 'kube-prometheus/addons/dropping-deprecated-metrics-relabelings.libsonnet') + [
{ {
sourceLabels: ['__name__'], sourceLabels: ['__name__'],
regex: 'etcd_(debugging|disk|request|server).*', regex: 'etcd_(debugging|disk|request|server).*',

View File

@ -3,17 +3,34 @@ kind: Alertmanager
metadata: metadata:
labels: labels:
alertmanager: main alertmanager: main
app.kubernetes.io/component: alert-router
app.kubernetes.io/name: alertmanager
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.21.0
name: main name: main
namespace: monitoring namespace: default
spec: spec:
configSecret: alertmanager-tbrnt-config configSecret: alertmanager-tbrnt-config
image: quay.io/prometheus/alertmanager:v0.21.0 image: quay.io/prometheus/alertmanager:v0.21.0
nodeSelector: nodeSelector:
kubernetes.io/os: linux kubernetes.io/os: linux
replicas: 1 podMetadata:
labels:
app.kubernetes.io/component: alert-router
app.kubernetes.io/name: alertmanager
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.21.0
replicas: 3
resources:
limits:
cpu: 100m
memory: 100Mi
requests:
cpu: 4m
memory: 100Mi
securityContext: securityContext:
fsGroup: 2000 fsGroup: 2000
runAsNonRoot: true runAsNonRoot: true
runAsUser: 1000 runAsUser: 1000
serviceAccountName: alertmanager-main serviceAccountName: alertmanager-main
version: v0.21.0 version: 0.21.0

View File

@ -0,0 +1,18 @@
apiVersion: policy/v1beta1
kind: PodDisruptionBudget
metadata:
labels:
app.kubernetes.io/component: alert-router
app.kubernetes.io/name: alertmanager
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.21.0
name: alertmanager-main
namespace: default
spec:
maxUnavailable: 1
selector:
matchLabels:
alertmanager: main
app.kubernetes.io/component: alert-router
app.kubernetes.io/name: alertmanager
app.kubernetes.io/part-of: kube-prometheus

View File

@ -0,0 +1,156 @@
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
labels:
app.kubernetes.io/component: alert-router
app.kubernetes.io/name: alertmanager
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.21.0
prometheus: k8s
role: alert-rules
name: alertmanager-main-rules
namespace: default
spec:
groups:
- name: alertmanager.rules
rules:
- alert: AlertmanagerFailedReload
annotations:
description: Configuration has failed to load for {{ $labels.namespace }}/{{
$labels.pod}}.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/alertmanagerfailedreload
summary: Reloading an Alertmanager configuration has failed.
expr: |
# Without max_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
max_over_time(alertmanager_config_last_reload_successful{job="alertmanager-main",namespace="default"}[5m]) == 0
for: 10m
labels:
severity: critical
- alert: AlertmanagerMembersInconsistent
annotations:
description: Alertmanager {{ $labels.namespace }}/{{ $labels.pod}} has only
found {{ $value }} members of the {{$labels.job}} cluster.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/alertmanagermembersinconsistent
summary: A member of an Alertmanager cluster has not found all other cluster
members.
expr: |
# Without max_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
max_over_time(alertmanager_cluster_members{job="alertmanager-main",namespace="default"}[5m])
< on (namespace,service) group_left
count by (namespace,service) (max_over_time(alertmanager_cluster_members{job="alertmanager-main",namespace="default"}[5m]))
for: 10m
labels:
severity: critical
- alert: AlertmanagerFailedToSendAlerts
annotations:
description: Alertmanager {{ $labels.namespace }}/{{ $labels.pod}} failed
to send {{ $value | humanizePercentage }} of notifications to {{ $labels.integration
}}.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/alertmanagerfailedtosendalerts
summary: An Alertmanager instance failed to send notifications.
expr: |
(
rate(alertmanager_notifications_failed_total{job="alertmanager-main",namespace="default"}[5m])
/
rate(alertmanager_notifications_total{job="alertmanager-main",namespace="default"}[5m])
)
> 0.01
for: 5m
labels:
severity: warning
- alert: AlertmanagerClusterFailedToSendAlerts
annotations:
description: The minimum notification failure rate to {{ $labels.integration
}} sent from any instance in the {{$labels.job}} cluster is {{ $value |
humanizePercentage }}.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/alertmanagerclusterfailedtosendalerts
summary: All Alertmanager instances in a cluster failed to send notifications
to a critical integration.
expr: |
min by (namespace,service, integration) (
rate(alertmanager_notifications_failed_total{job="alertmanager-main",namespace="default", integration=~`.*`}[5m])
/
rate(alertmanager_notifications_total{job="alertmanager-main",namespace="default", integration=~`.*`}[5m])
)
> 0.01
for: 5m
labels:
severity: critical
- alert: AlertmanagerClusterFailedToSendAlerts
annotations:
description: The minimum notification failure rate to {{ $labels.integration
}} sent from any instance in the {{$labels.job}} cluster is {{ $value |
humanizePercentage }}.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/alertmanagerclusterfailedtosendalerts
summary: All Alertmanager instances in a cluster failed to send notifications
to a non-critical integration.
expr: |
min by (namespace,service, integration) (
rate(alertmanager_notifications_failed_total{job="alertmanager-main",namespace="default", integration!~`.*`}[5m])
/
rate(alertmanager_notifications_total{job="alertmanager-main",namespace="default", integration!~`.*`}[5m])
)
> 0.01
for: 5m
labels:
severity: warning
- alert: AlertmanagerConfigInconsistent
annotations:
description: Alertmanager instances within the {{$labels.job}} cluster have
different configurations.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/alertmanagerconfiginconsistent
summary: Alertmanager instances within the same cluster have different configurations.
expr: |
count by (namespace,service) (
count_values by (namespace,service) ("config_hash", alertmanager_config_hash{job="alertmanager-main",namespace="default"})
)
!= 1
for: 20m
labels:
severity: critical
- alert: AlertmanagerClusterDown
annotations:
description: '{{ $value | humanizePercentage }} of Alertmanager instances
within the {{$labels.job}} cluster have been up for less than half of the
last 5m.'
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/alertmanagerclusterdown
summary: Half or more of the Alertmanager instances within the same cluster
are down.
expr: |
(
count by (namespace,service) (
avg_over_time(up{job="alertmanager-main",namespace="default"}[5m]) < 0.5
)
/
count by (namespace,service) (
up{job="alertmanager-main",namespace="default"}
)
)
>= 0.5
for: 5m
labels:
severity: critical
- alert: AlertmanagerClusterCrashlooping
annotations:
description: '{{ $value | humanizePercentage }} of Alertmanager instances
within the {{$labels.job}} cluster have restarted at least 5 times in the
last 10m.'
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/alertmanagerclustercrashlooping
summary: Half or more of the Alertmanager instances within the same cluster
are crashlooping.
expr: |
(
count by (namespace,service) (
changes(process_start_time_seconds{job="alertmanager-main",namespace="default"}[10m]) > 4
)
/
count by (namespace,service) (
up{job="alertmanager-main",namespace="default"}
)
)
>= 0.5
for: 5m
labels:
severity: critical

View File

@ -1,8 +1,14 @@
apiVersion: v1 apiVersion: v1
kind: Secret kind: Secret
metadata: metadata:
labels:
alertmanager: main
app.kubernetes.io/component: alert-router
app.kubernetes.io/name: alertmanager
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.21.0
name: alertmanager-main name: alertmanager-main
namespace: monitoring namespace: default
stringData: stringData:
alertmanager.yaml: |- alertmanager.yaml: |-
"global": "global":

View File

@ -3,8 +3,12 @@ kind: Service
metadata: metadata:
labels: labels:
alertmanager: main alertmanager: main
app.kubernetes.io/component: alert-router
app.kubernetes.io/name: alertmanager
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.21.0
name: alertmanager-main name: alertmanager-main
namespace: monitoring namespace: default
spec: spec:
ports: ports:
- name: web - name: web
@ -13,4 +17,7 @@ spec:
selector: selector:
alertmanager: main alertmanager: main
app: alertmanager app: alertmanager
app.kubernetes.io/component: alert-router
app.kubernetes.io/name: alertmanager
app.kubernetes.io/part-of: kube-prometheus
sessionAffinity: ClientIP sessionAffinity: ClientIP

View File

@ -1,5 +1,11 @@
apiVersion: v1 apiVersion: v1
kind: ServiceAccount kind: ServiceAccount
metadata: metadata:
labels:
alertmanager: main
app.kubernetes.io/component: alert-router
app.kubernetes.io/name: alertmanager
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.21.0
name: alertmanager-main name: alertmanager-main
namespace: monitoring namespace: default

View File

@ -2,9 +2,12 @@ apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor kind: ServiceMonitor
metadata: metadata:
labels: labels:
k8s-app: alertmanager app.kubernetes.io/component: alert-router
app.kubernetes.io/name: alertmanager
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.21.0
name: alertmanager name: alertmanager
namespace: monitoring namespace: default
spec: spec:
endpoints: endpoints:
- interval: 30s - interval: 30s
@ -12,3 +15,6 @@ spec:
selector: selector:
matchLabels: matchLabels:
alertmanager: main alertmanager: main
app.kubernetes.io/component: alert-router
app.kubernetes.io/name: alertmanager
app.kubernetes.io/part-of: kube-prometheus

View File

@ -6,11 +6,12 @@ metadata:
namespace: monitoring namespace: monitoring
spec: spec:
encryptedData: encryptedData:
alertmanager.yaml: AgDHh1Qgrdffl6IFXJuk3ZzNHGARWZLDzbMLKp5Wo/ZYqclCji4T5wx7Fql6FALCvvUBvThxLfiwM2LQHRcWfWnf5AwxipCxpos9oVvlu4aON0WZd/Kjz/ZwDq5pgR/etCrSA2DYNxYq4vnTxUAk29eY5F4dWwRPcCgVZ5/KsTIcNx4x+4einqQbwAhkUtAwQl5fEPmpwNRquJZM29XIEUxZLWne0YmCmEgNGleUU20ByfYMwgtDJGjgr6XNPtTmByAHVrxNuQwAMxgT6GcfDLCNsByaS6CY3wmSTg1GUv/CG8Xx29FwDWyf1Ly2KbmcAAafN5QJGvCCTEt/WB85GtzQisrWFZTykv3Zjuz101p9ShXQZALylaX5h22hHFXuQyiIQZEeM2ixiYQjcPhiPjx1/hkbQ25QRD73/gjalZO8bprDrJxkLlw+hrgJ0LzxWL881U6INLKow+8/GmLleFhMUXRsGqacLreCIAr4uVGEMGMVLhHJKnj597HRnn0UCxVNkDk8QjHyiVgJBrQ3Pz9SFdF7mxvJ9F4rEgGkE4dvfvWxrZFumTLEkVRF9To+rKxsIVkewvoHtN/gMzFMzumP+fz/oB9yAHsxkwVyfqXBg52hNSYIx5Z/67yy3hDRKPBcZgknf9S+F37ET5BABFxazwG8NJjf4td+UsAGuAMzKI/94u7TxuXLPCs/tIGKD7kJnPxAqpalepzABtVCmOrtWwNPb1h4XeuraUS9beJ2zV9oV5nVFJmX94EJ7qpZt0Um7+GGeavQ5SV3XHRolDS5PpZPTAWnc/1rtZ0nsKk8lllEr3aDWveMXma06NKkIXz8+iAonvHsDZuw0W6jUdUUtraIbSua9YkyugqCBGeeXIPLwFxqJTqIX5vedZVMveFiaxtCJjL48SUGxtyugfiYbPa3xpHWWe22BcJyTmAOG9aIq4Tp4nvftLyvWe7c9PotJk/7gdv1IO4RLx//eLtKWw0uADa0ara4hDuI8Yktlti24TlA9XYz00d5WtE+lJsSZN8547BUfFzXSOZSSbfrFLZmEmBTgkbj4szX19bXSctJN3BtOmRfCEPXYQN10HgnhpwqYHbXKUSTZNWLojnFL1/E56wUXGxRg9NGOwSXzTyfoLGxI9NEQpGc0Rj2Wna+JSUhlAUnfYW1eH8yyg5FfkyhQdyZJFvYfF0rk+XG5XNhLumST19uxrAkMWhk+Z9/eWwOaZQMmDcoi2Rs0za+1GGjPW5k56Ip+spwW5cvYmdl1PgkZ4g1mupjiB0FdgZHGR+kGn1lbPtSUd+amh9PXSDWkqfnix62H7374rQ3ZyG7fs9sQNnnRrd/cDCMxAl5Upk8D9dfxRmvuxRd8b89h7EQwUBML7TIriA2Pci5Ftux2R5wyIXjznLC5/kFZg6/Av3uKmKK6dLR2Ooey7/3g14CEjMumdijjySl8Pd2UUxSKVKD7vkq+3xYm0CJZqVvT/iBOccrv0UEiTHBsXrfaugUvqIKTAGYhJy0fUBXKisPdA0HdzrUmx57Du36TGyuEzGtVuDarcWzQYPqKJxOIuofJ+AGTDY53OjdUJ8pwJD6HDz55tu85gaV6ZOvSYqjqeX2FUe7lPhsGUIh/FemfichpypHyFpPYhkwAIO1AinKvsqjUuDXE6n5b7NMbI1gl87fPqT5wUSKXZqwViyFqUA5DFqPTEqvHIGU5Wz0GajEaQ== alertmanager.yaml: 
AgBj3KqLiF7EAnGK6c4+Thferv3Sur+fhlwE4wpXD1PBtwlJQsMCqjsLRRFNAESH9/8vhI9E9D8wLJiauNS7CGw5jd5KU1cvxo5EyGeFoVyAB4bHSy/pxptSFq+rn00E99/Tqkbdsgduwusfpi0I9F1+zNucyyamJsEzIcsyHMlBbACz+9KQV9SdbgVEmIeqabrAP9VQaQ+i69yurhPdV7VkZzr0WKcGg3x27+slmtjlJz5fwtv1qmbYt/MQnijF2tc6tJeq19Cm0O4zuQ09meW6DwAZ9SOIFU6LxrqJlKbuleaWmfIE3AQYA6Z+qXyBjT1ILW36RwGyg3YK7nm0MNDQxd6LN3zR0eifPqrPsm7O6LE+NAg4FkurV3lJlrBoU3lSSc+sQZZr00ct9Gp57EEvg9T2TaM1B/KHQNmIhpDGntD4+yTcvK3nU7+sxqG/c4Wk5xiUQyLYnigNy5qYCcsM+t9iCoGxP7uU8GrsvIkojTxzhdc6e5LduThKdGE9jI3R6nCP6kmsU6XyUzgKmxYJVVzhSrm9yxFVDPHriNaEM2hgEd3wStwmRjGjwAPUjQZfSJtmxY6+RQ/77TYGjiskDm6gAZuzkGjdptL2t5F+54y3uePaLHNspMgtZsTARCo3kAhgf61Gk2nvnEY/ws5qFjAnsUEXs86wAk2S+w401QKPKTcDr6e/rnve8IrXW0FPvzR3rzdWOcU8v0Z0sSFijIfXdx+A9WGCJuHNo65FbKSgWhlHBfvWB1qWBnDd/VVHIA2wR8gevAPJHSc0f1WdUDc4w2w8tc/qum1SZo2lWkMopvLaiVHU5dCGtG6+4qsC1DmFzIRGZN4AbdVd5k+OY15Fp+ysjuTpA/HuZ/N0kz/5BXzNbY3u7YKi3EV3Up+eZIi2jlG0XBXWdCouuxRW40qHuShivtbrBgey32kFo85dsrDqN7F2kBVnumOB7kvFOaCkL2AtsakVjUzGoh5eXCSHl0ZcqmW2UjInzZIirBChMW/G4yL/TwpVYbBLqWPfdVMFmq7I4srY2+hUP/5UBt/DKZi5zPlLR8H3q4i02zsNpqdhSa9o6ThhFtVX9/te/DMpyN1fJ1Hn2p3cDhoTsiTLPkvflVOx70flap0v2zzPoDm+yXhFllpWp/5avHy9pKf/RzpAodbNr/EydkC+KDKI88MhVUtxS27WbKFsq+vUkmHQj+KtGyRFjg2/CnmM8YbdRsMe8p39PVGLxj1RTnyYzlMltOTbJo3rhDzjmpzGVUpWokwTMGC1WgTenrS4IcCK61ri9bsBIL9n9sMLF1lT8NVKnQfluDTaHNzsQgJ1HTSwQOcAfugqlUrSeTLt3q6U4pSjjlF8P7wYpqzWc+bhOaHed9NxrGXFBC5Wh6+BULuCaCA6TtkLpUfABYHVUa4OS3huNsOeBhZ3aCCQXrc0jOOq2DQzxvdGu4YAQnvMHwJRVyKVcw0pOS5RjIqJW6IOn0MGHzAo7qNv6LUyJ9a7huT2W4ibrHFkMck1zKxbBekPQ9FxpufSXrEqEqNuB3j7Gi7lVDVbPySr1rr2KXLzOLsnZhpTpMq2RejglIAMF7WfIMfvHQ2mnjNuYNNQnXx8hPLm88GSxFYKHpUnAswgYuo4XX2drYMzzq3GWDMIHZ/kpLySU+eJGo6VGeFUV1DgaGksLXE3oCfrA1OCUyZ/qke3tzj8ixjwuprCmFPWsg==
template: template:
metadata: metadata:
creationTimestamp: null creationTimestamp: null
name: alertmanager-tbrnt-config name: alertmanager-tbrnt-config
namespace: monitoring namespace: monitoring
type: Opaque type: Opaque
status: {}

View File

@ -1,8 +1,13 @@
apiVersion: v1 apiVersion: v1
data: data:
datasources.yaml: ewogICAgImFwaVZlcnNpb24iOiAxLAogICAgImRhdGFzb3VyY2VzIjogWwogICAgICAgIHsKICAgICAgICAgICAgImFjY2VzcyI6ICJwcm94eSIsCiAgICAgICAgICAgICJlZGl0YWJsZSI6IGZhbHNlLAogICAgICAgICAgICAibmFtZSI6ICJwcm9tZXRoZXVzIiwKICAgICAgICAgICAgIm9yZ0lkIjogMSwKICAgICAgICAgICAgInR5cGUiOiAicHJvbWV0aGV1cyIsCiAgICAgICAgICAgICJ1cmwiOiAiaHR0cDovL3Byb21ldGhldXMtazhzLm1vbml0b3Jpbmcuc3ZjOjkwOTAiLAogICAgICAgICAgICAidmVyc2lvbiI6IDEKICAgICAgICB9LAogICAgICAgIHsKICAgICAgICAgICAgImFjY2VzcyI6ICJwcm94eSIsCiAgICAgICAgICAgICJlZGl0YWJsZSI6IGZhbHNlLAogICAgICAgICAgICAibmFtZSI6ICJMb2tpIiwKICAgICAgICAgICAgIm9yZ0lkIjogMSwKICAgICAgICAgICAgInR5cGUiOiAibG9raSIsCiAgICAgICAgICAgICJ1cmwiOiAiaHR0cDovL2xva2kubG9raTozMTAwIiwKICAgICAgICAgICAgInZlcnNpb24iOiAxCiAgICAgICAgfQogICAgXQp9 datasources.yaml: ewogICAgImFwaVZlcnNpb24iOiAxLAogICAgImRhdGFzb3VyY2VzIjogWwogICAgICAgIHsKICAgICAgICAgICAgImFjY2VzcyI6ICJwcm94eSIsCiAgICAgICAgICAgICJlZGl0YWJsZSI6IGZhbHNlLAogICAgICAgICAgICAibmFtZSI6ICJwcm9tZXRoZXVzIiwKICAgICAgICAgICAgIm9yZ0lkIjogMSwKICAgICAgICAgICAgInR5cGUiOiAicHJvbWV0aGV1cyIsCiAgICAgICAgICAgICJ1cmwiOiAiaHR0cDovL3Byb21ldGhldXMtazhzLmRlZmF1bHQuc3ZjOjkwOTAiLAogICAgICAgICAgICAidmVyc2lvbiI6IDEKICAgICAgICB9CiAgICBdCn0=
kind: Secret kind: Secret
metadata: metadata:
labels:
app.kubernetes.io/component: grafana
app.kubernetes.io/name: grafana
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 7.5.4
name: grafana-datasources name: grafana-datasources
namespace: monitoring namespace: default
type: Opaque type: Opaque

File diff suppressed because it is too large Load Diff

View File

@ -17,5 +17,10 @@ data:
} }
kind: ConfigMap kind: ConfigMap
metadata: metadata:
labels:
app.kubernetes.io/component: grafana
app.kubernetes.io/name: grafana
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 7.5.4
name: grafana-dashboards name: grafana-dashboards
namespace: monitoring namespace: default

View File

@ -2,26 +2,32 @@ apiVersion: apps/v1
kind: Deployment kind: Deployment
metadata: metadata:
labels: labels:
app: grafana app.kubernetes.io/component: grafana
app.kubernetes.io/name: grafana
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 7.5.4
name: grafana name: grafana
namespace: monitoring namespace: default
spec: spec:
replicas: 1 replicas: 1
selector: selector:
matchLabels: matchLabels:
app: grafana app.kubernetes.io/component: grafana
app.kubernetes.io/name: grafana
app.kubernetes.io/part-of: kube-prometheus
template: template:
metadata: metadata:
annotations: annotations:
checksum/grafana-datasources: 7103d054a6e94f976ca59b4ede77cf88 checksum/grafana-datasources: b822d7b1a1070f322d0773c043985b4a
labels: labels:
app: grafana app.kubernetes.io/component: grafana
app.kubernetes.io/name: grafana
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 7.5.4
spec: spec:
containers: containers:
- env: - env: []
- name: GF_INSTALL_PLUGINS image: grafana/grafana:7.5.4
value: grafana-piechart-panel
image: grafana/grafana:7.3.5
name: grafana name: grafana
ports: ports:
- containerPort: 3000 - containerPort: 3000
@ -113,9 +119,6 @@ spec:
- mountPath: /grafana-dashboard-definitions/0/statefulset - mountPath: /grafana-dashboard-definitions/0/statefulset
name: grafana-dashboard-statefulset name: grafana-dashboard-statefulset
readOnly: false readOnly: false
- mountPath: /grafana-dashboard-definitions/0/traefik
name: grafana-dashboard-traefik
readOnly: false
- mountPath: /grafana-dashboard-definitions/0/workload-total - mountPath: /grafana-dashboard-definitions/0/workload-total
name: grafana-dashboard-workload-total name: grafana-dashboard-workload-total
readOnly: false readOnly: false
@ -201,9 +204,6 @@ spec:
- configMap: - configMap:
name: grafana-dashboard-statefulset name: grafana-dashboard-statefulset
name: grafana-dashboard-statefulset name: grafana-dashboard-statefulset
- configMap:
name: grafana-dashboard-traefik
name: grafana-dashboard-traefik
- configMap: - configMap:
name: grafana-dashboard-workload-total name: grafana-dashboard-workload-total
name: grafana-dashboard-workload-total name: grafana-dashboard-workload-total

View File

@ -2,14 +2,18 @@ apiVersion: v1
kind: Service kind: Service
metadata: metadata:
labels: labels:
app: grafana app.kubernetes.io/component: grafana
app.kubernetes.io/name: grafana
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 7.5.4
name: grafana name: grafana
namespace: monitoring namespace: default
spec: spec:
ports: ports:
- name: http - name: http
port: 3000 port: 3000
targetPort: http targetPort: http
selector: selector:
app: grafana app.kubernetes.io/component: grafana
type: NodePort app.kubernetes.io/name: grafana
app.kubernetes.io/part-of: kube-prometheus

View File

@ -2,4 +2,4 @@ apiVersion: v1
kind: ServiceAccount kind: ServiceAccount
metadata: metadata:
name: grafana name: grafana
namespace: monitoring namespace: default

View File

@ -1,12 +1,17 @@
apiVersion: monitoring.coreos.com/v1 apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor kind: ServiceMonitor
metadata: metadata:
labels:
app.kubernetes.io/component: grafana
app.kubernetes.io/name: grafana
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 7.5.4
name: grafana name: grafana
namespace: monitoring namespace: default
spec: spec:
endpoints: endpoints:
- interval: 15s - interval: 15s
port: http port: http
selector: selector:
matchLabels: matchLabels:
app: grafana app.kubernetes.io/name: grafana

View File

@ -13,4 +13,5 @@ spec:
name: healthchecks-io name: healthchecks-io
namespace: monitoring namespace: monitoring
type: Opaque type: Opaque
status: {}

View File

@ -2,8 +2,10 @@ apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole kind: ClusterRole
metadata: metadata:
labels: labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: kube-state-metrics app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/version: v1.9.7 app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.0.0
name: kube-state-metrics name: kube-state-metrics
rules: rules:
- apiGroups: - apiGroups:
@ -24,16 +26,6 @@ rules:
verbs: verbs:
- list - list
- watch - watch
- apiGroups:
- extensions
resources:
- daemonsets
- deployments
- replicasets
- ingresses
verbs:
- list
- watch
- apiGroups: - apiGroups:
- apps - apps
resources: resources:
@ -105,6 +97,14 @@ rules:
- networking.k8s.io - networking.k8s.io
resources: resources:
- networkpolicies - networkpolicies
- ingresses
verbs:
- list
- watch
- apiGroups:
- coordination.k8s.io
resources:
- leases
verbs: verbs:
- list - list
- watch - watch

View File

@ -2,8 +2,10 @@ apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding kind: ClusterRoleBinding
metadata: metadata:
labels: labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: kube-state-metrics app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/version: v1.9.7 app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.0.0
name: kube-state-metrics name: kube-state-metrics
roleRef: roleRef:
apiGroup: rbac.authorization.k8s.io apiGroup: rbac.authorization.k8s.io
@ -12,4 +14,4 @@ roleRef:
subjects: subjects:
- kind: ServiceAccount - kind: ServiceAccount
name: kube-state-metrics name: kube-state-metrics
namespace: monitoring namespace: default

View File

@ -2,20 +2,28 @@ apiVersion: apps/v1
kind: Deployment kind: Deployment
metadata: metadata:
labels: labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: kube-state-metrics app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/version: v1.9.7 app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.0.0
name: kube-state-metrics name: kube-state-metrics
namespace: monitoring namespace: default
spec: spec:
replicas: 1 replicas: 1
selector: selector:
matchLabels: matchLabels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: kube-state-metrics app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/part-of: kube-prometheus
template: template:
metadata: metadata:
annotations:
kubectl.kubernetes.io/default-container: kube-state-metrics
labels: labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: kube-state-metrics app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/version: v1.9.7 app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.0.0
spec: spec:
containers: containers:
- args: - args:
@ -23,8 +31,17 @@ spec:
- --port=8081 - --port=8081
- --telemetry-host=127.0.0.1 - --telemetry-host=127.0.0.1
- --telemetry-port=8082 - --telemetry-port=8082
image: quay.io/coreos/kube-state-metrics:v1.9.7 image: k8s.gcr.io/kube-state-metrics/kube-state-metrics:v2.0.0
name: kube-state-metrics name: kube-state-metrics
resources:
limits:
cpu: 100m
memory: 250Mi
requests:
cpu: 10m
memory: 190Mi
securityContext:
runAsUser: 65534
- args: - args:
- --logtostderr - --logtostderr
- --secure-listen-address=:8443 - --secure-listen-address=:8443
@ -35,6 +52,13 @@ spec:
ports: ports:
- containerPort: 8443 - containerPort: 8443
name: https-main name: https-main
resources:
limits:
cpu: 40m
memory: 40Mi
requests:
cpu: 20m
memory: 20Mi
securityContext: securityContext:
runAsGroup: 65532 runAsGroup: 65532
runAsNonRoot: true runAsNonRoot: true
@ -49,6 +73,13 @@ spec:
ports: ports:
- containerPort: 9443 - containerPort: 9443
name: https-self name: https-self
resources:
limits:
cpu: 20m
memory: 40Mi
requests:
cpu: 10m
memory: 20Mi
securityContext: securityContext:
runAsGroup: 65532 runAsGroup: 65532
runAsNonRoot: true runAsNonRoot: true

View File

@ -0,0 +1,46 @@
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.0.0
prometheus: k8s
role: alert-rules
name: kube-state-metrics-rules
namespace: default
spec:
groups:
- name: kube-state-metrics
rules:
- alert: KubeStateMetricsListErrors
annotations:
description: kube-state-metrics is experiencing errors at an elevated rate
in list operations. This is likely causing it to not be able to expose metrics
about Kubernetes objects correctly or at all.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubestatemetricslisterrors
summary: kube-state-metrics is experiencing errors in list operations.
expr: |
(sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m]))
/
sum(rate(kube_state_metrics_list_total{job="kube-state-metrics"}[5m])))
> 0.01
for: 15m
labels:
severity: critical
- alert: KubeStateMetricsWatchErrors
annotations:
description: kube-state-metrics is experiencing errors at an elevated rate
in watch operations. This is likely causing it to not be able to expose
metrics about Kubernetes objects correctly or at all.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubestatemetricswatcherrors
summary: kube-state-metrics is experiencing errors in watch operations.
expr: |
(sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m]))
/
sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics"}[5m])))
> 0.01
for: 15m
labels:
severity: critical

View File

@ -2,10 +2,12 @@ apiVersion: v1
kind: Service kind: Service
metadata: metadata:
labels: labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: kube-state-metrics app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/version: v1.9.7 app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.0.0
name: kube-state-metrics name: kube-state-metrics
namespace: monitoring namespace: default
spec: spec:
clusterIP: None clusterIP: None
ports: ports:
@ -16,4 +18,6 @@ spec:
port: 9443 port: 9443
targetPort: https-self targetPort: https-self
selector: selector:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: kube-state-metrics app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/part-of: kube-prometheus

View File

@ -2,7 +2,9 @@ apiVersion: v1
kind: ServiceAccount kind: ServiceAccount
metadata: metadata:
labels: labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: kube-state-metrics app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/version: v1.9.7 app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.0.0
name: kube-state-metrics name: kube-state-metrics
namespace: monitoring namespace: default

View File

@ -2,10 +2,12 @@ apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor kind: ServiceMonitor
metadata: metadata:
labels: labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: kube-state-metrics app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/version: 1.9.7 app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.0.0
name: kube-state-metrics name: kube-state-metrics
namespace: monitoring namespace: default
spec: spec:
endpoints: endpoints:
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
@ -28,4 +30,6 @@ spec:
jobLabel: app.kubernetes.io/name jobLabel: app.kubernetes.io/name
selector: selector:
matchLabels: matchLabels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: kube-state-metrics app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/part-of: kube-prometheus

View File

@ -1,6 +1,11 @@
apiVersion: rbac.authorization.k8s.io/v1 apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole kind: ClusterRole
metadata: metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: node-exporter
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 1.1.2
name: node-exporter name: node-exporter
rules: rules:
- apiGroups: - apiGroups:

View File

@ -1,6 +1,11 @@
apiVersion: rbac.authorization.k8s.io/v1 apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding kind: ClusterRoleBinding
metadata: metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: node-exporter
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 1.1.2
name: node-exporter name: node-exporter
roleRef: roleRef:
apiGroup: rbac.authorization.k8s.io apiGroup: rbac.authorization.k8s.io
@ -9,4 +14,4 @@ roleRef:
subjects: subjects:
- kind: ServiceAccount - kind: ServiceAccount
name: node-exporter name: node-exporter
namespace: monitoring namespace: default

View File

@ -2,30 +2,37 @@ apiVersion: apps/v1
kind: DaemonSet kind: DaemonSet
metadata: metadata:
labels: labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: node-exporter app.kubernetes.io/name: node-exporter
app.kubernetes.io/version: v1.0.1 app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 1.1.2
name: node-exporter name: node-exporter
namespace: monitoring namespace: default
spec: spec:
selector: selector:
matchLabels: matchLabels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: node-exporter app.kubernetes.io/name: node-exporter
app.kubernetes.io/part-of: kube-prometheus
template: template:
metadata: metadata:
labels: labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: node-exporter app.kubernetes.io/name: node-exporter
app.kubernetes.io/version: v1.0.1 app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 1.1.2
spec: spec:
containers: containers:
- args: - args:
- --web.listen-address=127.0.0.1:9100 - --web.listen-address=127.0.0.1:9100
- --path.procfs=/host/proc
- --path.sysfs=/host/sys - --path.sysfs=/host/sys
- --path.rootfs=/host/root - --path.rootfs=/host/root
- --no-collector.wifi - --no-collector.wifi
- --no-collector.hwmon - --no-collector.hwmon
- --collector.filesystem.ignored-mount-points=^/(dev|proc|sys|var/lib/docker/.+|var/lib/kubelet/pods/.+)($|/) - --collector.filesystem.ignored-mount-points=^/(dev|proc|sys|var/lib/docker/.+|var/lib/kubelet/pods/.+)($|/)
image: quay.io/prometheus/node-exporter:v1.0.1 - --collector.netclass.ignored-devices=^(veth.*)$
- --collector.netdev.device-exclude=^(veth.*)$
image: quay.io/prometheus/node-exporter:v1.1.2
name: node-exporter name: node-exporter
resources: resources:
limits: limits:
@ -35,10 +42,6 @@ spec:
cpu: 102m cpu: 102m
memory: 180Mi memory: 180Mi
volumeMounts: volumeMounts:
- mountPath: /host/proc
mountPropagation: HostToContainer
name: proc
readOnly: true
- mountPath: /host/sys - mountPath: /host/sys
mountPropagation: HostToContainer mountPropagation: HostToContainer
name: sys name: sys
@ -85,9 +88,6 @@ spec:
tolerations: tolerations:
- operator: Exists - operator: Exists
volumes: volumes:
- hostPath:
path: /proc
name: proc
- hostPath: - hostPath:
path: /sys path: /sys
name: sys name: sys

View File

@ -0,0 +1,301 @@
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: node-exporter
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 1.1.2
prometheus: k8s
role: alert-rules
name: node-exporter-rules
namespace: default
spec:
groups:
- name: node-exporter
rules:
- alert: NodeFilesystemSpaceFillingUp
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }}
has only {{ printf "%.2f" $value }}% available space left and is filling
up.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodefilesystemspacefillingup
summary: Filesystem is predicted to run out of space within the next 24 hours.
expr: |
(
node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 40
and
predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!=""}[6h], 24*60*60) < 0
and
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
)
for: 1h
labels:
severity: warning
- alert: NodeFilesystemSpaceFillingUp
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }}
has only {{ printf "%.2f" $value }}% available space left and is filling
up fast.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodefilesystemspacefillingup
summary: Filesystem is predicted to run out of space within the next 4 hours.
expr: |
(
node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 15
and
predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!=""}[6h], 4*60*60) < 0
and
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
)
for: 1h
labels:
severity: critical
- alert: NodeFilesystemAlmostOutOfSpace
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }}
has only {{ printf "%.2f" $value }}% available space left.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodefilesystemalmostoutofspace
summary: Filesystem has less than 5% space left.
expr: |
(
node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 5
and
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
)
for: 1h
labels:
severity: warning
- alert: NodeFilesystemAlmostOutOfSpace
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }}
has only {{ printf "%.2f" $value }}% available space left.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodefilesystemalmostoutofspace
summary: Filesystem has less than 3% space left.
expr: |
(
node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 3
and
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
)
for: 1h
labels:
severity: critical
- alert: NodeFilesystemFilesFillingUp
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }}
has only {{ printf "%.2f" $value }}% available inodes left and is filling
up.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodefilesystemfilesfillingup
summary: Filesystem is predicted to run out of inodes within the next 24 hours.
expr: |
(
node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 40
and
predict_linear(node_filesystem_files_free{job="node-exporter",fstype!=""}[6h], 24*60*60) < 0
and
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
)
for: 1h
labels:
severity: warning
- alert: NodeFilesystemFilesFillingUp
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }}
has only {{ printf "%.2f" $value }}% available inodes left and is filling
up fast.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodefilesystemfilesfillingup
summary: Filesystem is predicted to run out of inodes within the next 4 hours.
expr: |
(
node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 20
and
predict_linear(node_filesystem_files_free{job="node-exporter",fstype!=""}[6h], 4*60*60) < 0
and
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
)
for: 1h
labels:
severity: critical
- alert: NodeFilesystemAlmostOutOfFiles
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }}
has only {{ printf "%.2f" $value }}% available inodes left.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodefilesystemalmostoutoffiles
summary: Filesystem has less than 5% inodes left.
expr: |
(
node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 5
and
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
)
for: 1h
labels:
severity: warning
- alert: NodeFilesystemAlmostOutOfFiles
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }}
has only {{ printf "%.2f" $value }}% available inodes left.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodefilesystemalmostoutoffiles
summary: Filesystem has less than 3% inodes left.
expr: |
(
node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 3
and
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
)
for: 1h
labels:
severity: critical
- alert: NodeNetworkReceiveErrs
annotations:
description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered
{{ printf "%.0f" $value }} receive errors in the last two minutes.'
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodenetworkreceiveerrs
summary: Network interface is reporting many receive errors.
expr: |
rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01
for: 1h
labels:
severity: warning
- alert: NodeNetworkTransmitErrs
annotations:
description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered
{{ printf "%.0f" $value }} transmit errors in the last two minutes.'
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodenetworktransmiterrs
summary: Network interface is reporting many transmit errors.
expr: |
rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01
for: 1h
labels:
severity: warning
- alert: NodeHighNumberConntrackEntriesUsed
annotations:
description: '{{ $value | humanizePercentage }} of conntrack entries are used.'
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodehighnumberconntrackentriesused
summary: Number of conntrack entries are getting close to the limit.
expr: |
(node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75
labels:
severity: warning
- alert: NodeTextFileCollectorScrapeError
annotations:
description: Node Exporter text file collector failed to scrape.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodetextfilecollectorscrapeerror
summary: Node Exporter text file collector failed to scrape.
expr: |
node_textfile_scrape_error{job="node-exporter"} == 1
labels:
severity: warning
- alert: NodeClockSkewDetected
annotations:
description: Clock on {{ $labels.instance }} is out of sync by more than 300s.
Ensure NTP is configured correctly on this host.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodeclockskewdetected
summary: Clock skew detected.
expr: |
(
node_timex_offset_seconds > 0.05
and
deriv(node_timex_offset_seconds[5m]) >= 0
)
or
(
node_timex_offset_seconds < -0.05
and
deriv(node_timex_offset_seconds[5m]) <= 0
)
for: 10m
labels:
severity: warning
- alert: NodeClockNotSynchronising
annotations:
description: Clock on {{ $labels.instance }} is not synchronising. Ensure
NTP is configured on this host.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodeclocknotsynchronising
summary: Clock not synchronising.
expr: |
min_over_time(node_timex_sync_status[5m]) == 0
and
node_timex_maxerror_seconds >= 16
for: 10m
labels:
severity: warning
- alert: NodeRAIDDegraded
annotations:
description: RAID array '{{ $labels.device }}' on {{ $labels.instance }} is
in degraded state due to one or more disk failures. The number of spare drives
is insufficient to fix the issue automatically.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/noderaiddegraded
summary: RAID Array is degraded
expr: |
node_md_disks_required - ignoring (state) (node_md_disks{state="active"}) > 0
for: 15m
labels:
severity: critical
- alert: NodeRAIDDiskFailure
annotations:
description: At least one device in RAID array on {{ $labels.instance }} failed.
Array '{{ $labels.device }}' needs attention and possibly a disk swap.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/noderaiddiskfailure
summary: Failed device in RAID array
expr: |
node_md_disks{state="failed"} > 0
labels:
severity: warning
- name: node-exporter.rules
rules:
- expr: |
count without (cpu) (
count without (mode) (
node_cpu_seconds_total{job="node-exporter"}
)
)
record: instance:node_num_cpu:sum
- expr: |
1 - avg without (cpu, mode) (
rate(node_cpu_seconds_total{job="node-exporter", mode="idle"}[1m])
)
record: instance:node_cpu_utilisation:rate1m
- expr: |
(
node_load1{job="node-exporter"}
/
instance:node_num_cpu:sum{job="node-exporter"}
)
record: instance:node_load1_per_cpu:ratio
- expr: |
1 - (
node_memory_MemAvailable_bytes{job="node-exporter"}
/
node_memory_MemTotal_bytes{job="node-exporter"}
)
record: instance:node_memory_utilisation:ratio
- expr: |
rate(node_vmstat_pgmajfault{job="node-exporter"}[1m])
record: instance:node_vmstat_pgmajfault:rate1m
- expr: |
rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[1m])
record: instance_device:node_disk_io_time_seconds:rate1m
- expr: |
rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[1m])
record: instance_device:node_disk_io_time_weighted_seconds:rate1m
- expr: |
sum without (device) (
rate(node_network_receive_bytes_total{job="node-exporter", device!="lo"}[1m])
)
record: instance:node_network_receive_bytes_excluding_lo:rate1m
- expr: |
sum without (device) (
rate(node_network_transmit_bytes_total{job="node-exporter", device!="lo"}[1m])
)
record: instance:node_network_transmit_bytes_excluding_lo:rate1m
- expr: |
sum without (device) (
rate(node_network_receive_drop_total{job="node-exporter", device!="lo"}[1m])
)
record: instance:node_network_receive_drop_excluding_lo:rate1m
- expr: |
sum without (device) (
rate(node_network_transmit_drop_total{job="node-exporter", device!="lo"}[1m])
)
record: instance:node_network_transmit_drop_excluding_lo:rate1m

View File

@ -2,10 +2,12 @@ apiVersion: v1
kind: Service kind: Service
metadata: metadata:
labels: labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: node-exporter app.kubernetes.io/name: node-exporter
app.kubernetes.io/version: v1.0.1 app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 1.1.2
name: node-exporter name: node-exporter
namespace: monitoring namespace: default
spec: spec:
clusterIP: None clusterIP: None
ports: ports:
@ -13,4 +15,6 @@ spec:
port: 9100 port: 9100
targetPort: https targetPort: https
selector: selector:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: node-exporter app.kubernetes.io/name: node-exporter
app.kubernetes.io/part-of: kube-prometheus

View File

@ -1,5 +1,10 @@
apiVersion: v1 apiVersion: v1
kind: ServiceAccount kind: ServiceAccount
metadata: metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: node-exporter
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 1.1.2
name: node-exporter name: node-exporter
namespace: monitoring namespace: default

View File

@ -2,10 +2,12 @@ apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor kind: ServiceMonitor
metadata: metadata:
labels: labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: node-exporter app.kubernetes.io/name: node-exporter
app.kubernetes.io/version: v1.0.1 app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 1.1.2
name: node-exporter name: node-exporter
namespace: monitoring namespace: default
spec: spec:
endpoints: endpoints:
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
@ -24,4 +26,6 @@ spec:
jobLabel: app.kubernetes.io/name jobLabel: app.kubernetes.io/name
selector: selector:
matchLabels: matchLabels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: node-exporter app.kubernetes.io/name: node-exporter
app.kubernetes.io/part-of: kube-prometheus

View File

@ -1,6 +1,11 @@
apiVersion: apiregistration.k8s.io/v1 apiVersion: apiregistration.k8s.io/v1
kind: APIService kind: APIService
metadata: metadata:
labels:
app.kubernetes.io/component: metrics-adapter
app.kubernetes.io/name: prometheus-adapter
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.8.4
name: v1beta1.metrics.k8s.io name: v1beta1.metrics.k8s.io
spec: spec:
group: metrics.k8s.io group: metrics.k8s.io
@ -8,6 +13,6 @@ spec:
insecureSkipTLSVerify: true insecureSkipTLSVerify: true
service: service:
name: prometheus-adapter name: prometheus-adapter
namespace: monitoring namespace: default
version: v1beta1 version: v1beta1
versionPriority: 100 versionPriority: 100

View File

@ -1,6 +1,11 @@
apiVersion: rbac.authorization.k8s.io/v1 apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole kind: ClusterRole
metadata: metadata:
labels:
app.kubernetes.io/component: metrics-adapter
app.kubernetes.io/name: prometheus-adapter
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.8.4
name: prometheus-adapter name: prometheus-adapter
rules: rules:
- apiGroups: - apiGroups:

View File

@ -2,6 +2,10 @@ apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole kind: ClusterRole
metadata: metadata:
labels: labels:
app.kubernetes.io/component: metrics-adapter
app.kubernetes.io/name: prometheus-adapter
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.8.4
rbac.authorization.k8s.io/aggregate-to-admin: "true" rbac.authorization.k8s.io/aggregate-to-admin: "true"
rbac.authorization.k8s.io/aggregate-to-edit: "true" rbac.authorization.k8s.io/aggregate-to-edit: "true"
rbac.authorization.k8s.io/aggregate-to-view: "true" rbac.authorization.k8s.io/aggregate-to-view: "true"

View File

@ -1,6 +1,11 @@
apiVersion: rbac.authorization.k8s.io/v1 apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding kind: ClusterRoleBinding
metadata: metadata:
labels:
app.kubernetes.io/component: metrics-adapter
app.kubernetes.io/name: prometheus-adapter
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.8.4
name: prometheus-adapter name: prometheus-adapter
roleRef: roleRef:
apiGroup: rbac.authorization.k8s.io apiGroup: rbac.authorization.k8s.io
@ -9,4 +14,4 @@ roleRef:
subjects: subjects:
- kind: ServiceAccount - kind: ServiceAccount
name: prometheus-adapter name: prometheus-adapter
namespace: monitoring namespace: default

View File

@ -1,6 +1,11 @@
apiVersion: rbac.authorization.k8s.io/v1 apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding kind: ClusterRoleBinding
metadata: metadata:
labels:
app.kubernetes.io/component: metrics-adapter
app.kubernetes.io/name: prometheus-adapter
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.8.4
name: resource-metrics:system:auth-delegator name: resource-metrics:system:auth-delegator
roleRef: roleRef:
apiGroup: rbac.authorization.k8s.io apiGroup: rbac.authorization.k8s.io
@ -9,4 +14,4 @@ roleRef:
subjects: subjects:
- kind: ServiceAccount - kind: ServiceAccount
name: prometheus-adapter name: prometheus-adapter
namespace: monitoring namespace: default

View File

@ -1,6 +1,11 @@
apiVersion: rbac.authorization.k8s.io/v1 apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole kind: ClusterRole
metadata: metadata:
labels:
app.kubernetes.io/component: metrics-adapter
app.kubernetes.io/name: prometheus-adapter
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.8.4
name: resource-metrics-server-resources name: resource-metrics-server-resources
rules: rules:
- apiGroups: - apiGroups:

View File

@ -4,8 +4,8 @@ data:
"resourceRules": "resourceRules":
"cpu": "cpu":
"containerLabel": "container" "containerLabel": "container"
"containerQuery": "sum(irate(container_cpu_usage_seconds_total{<<.LabelMatchers>>,container!=\"POD\",container!=\"\",pod!=\"\"}[5m])) by (<<.GroupBy>>)" "containerQuery": "sum(irate(container_cpu_usage_seconds_total{<<.LabelMatchers>>,container!=\"\",pod!=\"\"}[5m])) by (<<.GroupBy>>)"
"nodeQuery": "sum(1 - irate(node_cpu_seconds_total{mode=\"idle\"}[5m]) * on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:{<<.LabelMatchers>>}) by (<<.GroupBy>>)" "nodeQuery": "sum(1 - irate(node_cpu_seconds_total{mode=\"idle\"}[5m]) * on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:{<<.LabelMatchers>>}) by (<<.GroupBy>>) or sum (1- irate(windows_cpu_time_total{mode=\"idle\", job=\"windows-exporter\",<<.LabelMatchers>>}[5m])) by (<<.GroupBy>>)"
"resources": "resources":
"overrides": "overrides":
"namespace": "namespace":
@ -16,8 +16,8 @@ data:
"resource": "pod" "resource": "pod"
"memory": "memory":
"containerLabel": "container" "containerLabel": "container"
"containerQuery": "sum(container_memory_working_set_bytes{<<.LabelMatchers>>,container!=\"POD\",container!=\"\",pod!=\"\"}) by (<<.GroupBy>>)" "containerQuery": "sum(container_memory_working_set_bytes{<<.LabelMatchers>>,container!=\"\",pod!=\"\"}) by (<<.GroupBy>>)"
"nodeQuery": "sum(node_memory_MemTotal_bytes{job=\"node-exporter\",<<.LabelMatchers>>} - node_memory_MemAvailable_bytes{job=\"node-exporter\",<<.LabelMatchers>>}) by (<<.GroupBy>>)" "nodeQuery": "sum(node_memory_MemTotal_bytes{job=\"node-exporter\",<<.LabelMatchers>>} - node_memory_MemAvailable_bytes{job=\"node-exporter\",<<.LabelMatchers>>}) by (<<.GroupBy>>) or sum(windows_cs_physical_memory_bytes{job=\"windows-exporter\",<<.LabelMatchers>>} - windows_memory_available_bytes{job=\"windows-exporter\",<<.LabelMatchers>>}) by (<<.GroupBy>>)"
"resources": "resources":
"overrides": "overrides":
"instance": "instance":
@ -29,5 +29,10 @@ data:
"window": "5m" "window": "5m"
kind: ConfigMap kind: ConfigMap
metadata: metadata:
labels:
app.kubernetes.io/component: metrics-adapter
app.kubernetes.io/name: prometheus-adapter
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.8.4
name: adapter-config name: adapter-config
namespace: monitoring namespace: default

View File

@ -1,21 +1,31 @@
apiVersion: apps/v1 apiVersion: apps/v1
kind: Deployment kind: Deployment
metadata: metadata:
labels:
app.kubernetes.io/component: metrics-adapter
app.kubernetes.io/name: prometheus-adapter
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.8.4
name: prometheus-adapter name: prometheus-adapter
namespace: monitoring namespace: default
spec: spec:
replicas: 1 replicas: 2
selector: selector:
matchLabels: matchLabels:
name: prometheus-adapter app.kubernetes.io/component: metrics-adapter
app.kubernetes.io/name: prometheus-adapter
app.kubernetes.io/part-of: kube-prometheus
strategy: strategy:
rollingUpdate: rollingUpdate:
maxSurge: 1 maxSurge: 1
maxUnavailable: 0 maxUnavailable: 1
template: template:
metadata: metadata:
labels: labels:
name: prometheus-adapter app.kubernetes.io/component: metrics-adapter
app.kubernetes.io/name: prometheus-adapter
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.8.4
spec: spec:
containers: containers:
- args: - args:
@ -23,9 +33,9 @@ spec:
- --config=/etc/adapter/config.yaml - --config=/etc/adapter/config.yaml
- --logtostderr=true - --logtostderr=true
- --metrics-relist-interval=1m - --metrics-relist-interval=1m
- --prometheus-url=http://prometheus-k8s.monitoring.svc.cluster.local:9090/ - --prometheus-url=http://prometheus-k8s.default.svc.cluster.local:9090/
- --secure-port=6443 - --secure-port=6443
image: directxman12/k8s-prometheus-adapter:v0.8.2 image: directxman12/k8s-prometheus-adapter:v0.8.4
name: prometheus-adapter name: prometheus-adapter
ports: ports:
- containerPort: 6443 - containerPort: 6443

View File

@ -1,6 +1,11 @@
apiVersion: rbac.authorization.k8s.io/v1 apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding kind: RoleBinding
metadata: metadata:
labels:
app.kubernetes.io/component: metrics-adapter
app.kubernetes.io/name: prometheus-adapter
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.8.4
name: resource-metrics-auth-reader name: resource-metrics-auth-reader
namespace: kube-system namespace: kube-system
roleRef: roleRef:
@ -10,4 +15,4 @@ roleRef:
subjects: subjects:
- kind: ServiceAccount - kind: ServiceAccount
name: prometheus-adapter name: prometheus-adapter
namespace: monitoring namespace: default

View File

@ -2,13 +2,18 @@ apiVersion: v1
kind: Service kind: Service
metadata: metadata:
labels: labels:
name: prometheus-adapter app.kubernetes.io/component: metrics-adapter
app.kubernetes.io/name: prometheus-adapter
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.8.4
name: prometheus-adapter name: prometheus-adapter
namespace: monitoring namespace: default
spec: spec:
ports: ports:
- name: https - name: https
port: 443 port: 443
targetPort: 6443 targetPort: 6443
selector: selector:
name: prometheus-adapter app.kubernetes.io/component: metrics-adapter
app.kubernetes.io/name: prometheus-adapter
app.kubernetes.io/part-of: kube-prometheus

View File

@ -1,5 +1,10 @@
apiVersion: v1 apiVersion: v1
kind: ServiceAccount kind: ServiceAccount
metadata: metadata:
labels:
app.kubernetes.io/component: metrics-adapter
app.kubernetes.io/name: prometheus-adapter
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.8.4
name: prometheus-adapter name: prometheus-adapter
namespace: monitoring namespace: default

View File

@ -2,9 +2,12 @@ apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor kind: ServiceMonitor
metadata: metadata:
labels: labels:
name: prometheus-adapter app.kubernetes.io/component: metrics-adapter
app.kubernetes.io/name: prometheus-adapter
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.8.4
name: prometheus-adapter name: prometheus-adapter
namespace: monitoring namespace: default
spec: spec:
endpoints: endpoints:
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
@ -15,4 +18,6 @@ spec:
insecureSkipVerify: true insecureSkipVerify: true
selector: selector:
matchLabels: matchLabels:
name: prometheus-adapter app.kubernetes.io/component: metrics-adapter
app.kubernetes.io/name: prometheus-adapter
app.kubernetes.io/part-of: kube-prometheus

View File

@ -1,6 +1,11 @@
apiVersion: rbac.authorization.k8s.io/v1 apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole kind: ClusterRole
metadata: metadata:
labels:
app.kubernetes.io/component: prometheus
app.kubernetes.io/name: prometheus
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.26.0
name: prometheus-k8s name: prometheus-k8s
rules: rules:
- apiGroups: - apiGroups:

View File

@ -1,6 +1,11 @@
apiVersion: rbac.authorization.k8s.io/v1 apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding kind: ClusterRoleBinding
metadata: metadata:
labels:
app.kubernetes.io/component: prometheus
app.kubernetes.io/name: prometheus
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.26.0
name: prometheus-k8s name: prometheus-k8s
roleRef: roleRef:
apiGroup: rbac.authorization.k8s.io apiGroup: rbac.authorization.k8s.io
@ -9,4 +14,4 @@ roleRef:
subjects: subjects:
- kind: ServiceAccount - kind: ServiceAccount
name: prometheus-k8s name: prometheus-k8s
namespace: monitoring namespace: default

View File

@ -4,9 +4,10 @@ metadata:
labels: labels:
app.kubernetes.io/component: controller app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/version: v0.44.1 app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.47.0
name: prometheus-operator name: prometheus-operator
namespace: monitoring namespace: default
spec: spec:
endpoints: endpoints:
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
@ -19,4 +20,5 @@ spec:
matchLabels: matchLabels:
app.kubernetes.io/component: controller app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/version: v0.44.1 app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.47.0

View File

@ -0,0 +1,18 @@
apiVersion: policy/v1beta1
kind: PodDisruptionBudget
metadata:
labels:
app.kubernetes.io/component: prometheus
app.kubernetes.io/name: prometheus
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.26.0
name: prometheus-k8s
namespace: default
spec:
minAvailable: 1
selector:
matchLabels:
app.kubernetes.io/component: prometheus
app.kubernetes.io/name: prometheus
app.kubernetes.io/part-of: kube-prometheus
prometheus: k8s

View File

@ -2,19 +2,31 @@ apiVersion: monitoring.coreos.com/v1
kind: Prometheus kind: Prometheus
metadata: metadata:
labels: labels:
app.kubernetes.io/component: prometheus
app.kubernetes.io/name: prometheus
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.26.0
prometheus: k8s prometheus: k8s
name: k8s name: k8s
namespace: monitoring namespace: default
spec: spec:
alerting: alerting:
alertmanagers: alertmanagers:
- name: alertmanager-main - apiVersion: v2
namespace: monitoring name: alertmanager-main
namespace: default
port: web port: web
externalLabels: {}
externalUrl: http://prometheus-k8s.monitoring:9090 externalUrl: http://prometheus-k8s.monitoring:9090
image: quay.io/prometheus/prometheus:v2.22.1 image: quay.io/prometheus/prometheus:v2.26.0
nodeSelector: nodeSelector:
kubernetes.io/os: linux kubernetes.io/os: linux
podMetadata:
labels:
app.kubernetes.io/component: prometheus
app.kubernetes.io/name: prometheus
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.26.0
podMonitorNamespaceSelector: podMonitorNamespaceSelector:
matchExpressions: matchExpressions:
- key: prometheus - key: prometheus
@ -25,7 +37,7 @@ spec:
podMonitorSelector: {} podMonitorSelector: {}
probeNamespaceSelector: {} probeNamespaceSelector: {}
probeSelector: {} probeSelector: {}
replicas: 1 replicas: 2
resources: resources:
requests: requests:
memory: 400Mi memory: 400Mi
@ -58,4 +70,4 @@ spec:
requests: requests:
storage: 10Gi storage: 10Gi
storageClassName: local-path storageClassName: local-path
version: v2.22.1 version: 2.26.0

View File

@ -0,0 +1,256 @@
# PrometheusRule: self-monitoring alerts for the prometheus-k8s instance
# (alerts about Prometheus itself — config reloads, TSDB health, remote write,
# alert delivery — not about monitored workloads).
# NOTE(review): every expression pins job="prometheus-k8s",namespace="default",
# while other manifests in this commit reference a "monitoring" namespace — if
# the Prometheus pods actually run elsewhere these selectors never match;
# confirm against the deployed namespace.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  labels:
    app.kubernetes.io/component: prometheus
    app.kubernetes.io/name: prometheus
    app.kubernetes.io/part-of: kube-prometheus
    app.kubernetes.io/version: 2.26.0
    prometheus: k8s
    # role=alert-rules — presumably matched by the Prometheus CR's ruleSelector
    # so the operator loads this rule file; verify against that selector.
    role: alert-rules
  name: prometheus-k8s-prometheus-rules
  namespace: default
spec:
  groups:
  - name: prometheus
    rules:
    # Configuration reload has been failing for 10m -> critical.
    - alert: PrometheusBadConfig
      annotations:
        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to
          reload its configuration.
        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusbadconfig
        summary: Failed Prometheus configuration reload.
      expr: |
        # Without max_over_time, failed scrapes could create false negatives, see
        # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
        max_over_time(prometheus_config_last_reload_successful{job="prometheus-k8s",namespace="default"}[5m]) == 0
      for: 10m
      labels:
        severity: critical
    # Linear projection says the notification queue will exceed its capacity
    # within 30 minutes.
    - alert: PrometheusNotificationQueueRunningFull
      annotations:
        description: Alert notification queue of Prometheus {{$labels.namespace}}/{{$labels.pod}}
          is running full.
        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusnotificationqueuerunningfull
        summary: Prometheus alert notification queue predicted to run full in less
          than 30m.
      expr: |
        # Without min_over_time, failed scrapes could create false negatives, see
        # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
        (
          predict_linear(prometheus_notifications_queue_length{job="prometheus-k8s",namespace="default"}[5m], 60 * 30)
        >
          min_over_time(prometheus_notifications_queue_capacity{job="prometheus-k8s",namespace="default"}[5m])
        )
      for: 15m
      labels:
        severity: warning
    # >1% send errors to at least one (but not necessarily every) Alertmanager.
    - alert: PrometheusErrorSendingAlertsToSomeAlertmanagers
      annotations:
        description: '{{ printf "%.1f" $value }}% errors while sending alerts from
          Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.alertmanager}}.'
        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheuserrorsendingalertstosomealertmanagers
        summary: Prometheus has encountered more than 1% errors sending alerts to
          a specific Alertmanager.
      expr: |
        (
          rate(prometheus_notifications_errors_total{job="prometheus-k8s",namespace="default"}[5m])
        /
          rate(prometheus_notifications_sent_total{job="prometheus-k8s",namespace="default"}[5m])
        )
        * 100
        > 1
      for: 15m
      labels:
        severity: warning
    # No Alertmanager discovered at all — alerts would be silently dropped.
    - alert: PrometheusNotConnectedToAlertmanagers
      annotations:
        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not connected
          to any Alertmanagers.
        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusnotconnectedtoalertmanagers
        summary: Prometheus is not connected to any Alertmanagers.
      expr: |
        # Without max_over_time, failed scrapes could create false negatives, see
        # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
        max_over_time(prometheus_notifications_alertmanagers_discovered{job="prometheus-k8s",namespace="default"}[5m]) < 1
      for: 10m
      labels:
        severity: warning
    # Any TSDB block-reload failure in the last 3h, sustained for 4h.
    - alert: PrometheusTSDBReloadsFailing
      annotations:
        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected
          {{$value | humanize}} reload failures over the last 3h.
        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheustsdbreloadsfailing
        summary: Prometheus has issues reloading blocks from disk.
      expr: |
        increase(prometheus_tsdb_reloads_failures_total{job="prometheus-k8s",namespace="default"}[3h]) > 0
      for: 4h
      labels:
        severity: warning
    # Any TSDB compaction failure in the last 3h, sustained for 4h.
    - alert: PrometheusTSDBCompactionsFailing
      annotations:
        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected
          {{$value | humanize}} compaction failures over the last 3h.
        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheustsdbcompactionsfailing
        summary: Prometheus has issues compacting blocks.
      expr: |
        increase(prometheus_tsdb_compactions_failed_total{job="prometheus-k8s",namespace="default"}[3h]) > 0
      for: 4h
      labels:
        severity: warning
    # Head appends are zero while targets or rules exist — ingestion is stuck.
    - alert: PrometheusNotIngestingSamples
      annotations:
        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not ingesting
          samples.
        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusnotingestingsamples
        summary: Prometheus is not ingesting samples.
      expr: |
        (
          rate(prometheus_tsdb_head_samples_appended_total{job="prometheus-k8s",namespace="default"}[5m]) <= 0
        and
          (
            sum without(scrape_job) (prometheus_target_metadata_cache_entries{job="prometheus-k8s",namespace="default"}) > 0
          or
            sum without(rule_group) (prometheus_rule_group_rules{job="prometheus-k8s",namespace="default"}) > 0
          )
        )
      for: 10m
      labels:
        severity: warning
    # Samples dropped because they share a timestamp with different values.
    - alert: PrometheusDuplicateTimestamps
      annotations:
        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping
          {{ printf "%.4g" $value }} samples/s with different values but duplicated
          timestamp.
        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusduplicatetimestamps
        summary: Prometheus is dropping samples with duplicate timestamps.
      expr: |
        rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus-k8s",namespace="default"}[5m]) > 0
      for: 10m
      labels:
        severity: warning
    # Samples dropped because their timestamps arrived out of order.
    - alert: PrometheusOutOfOrderTimestamps
      annotations:
        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping
          {{ printf "%.4g" $value }} samples/s with timestamps arriving out of order.
        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusoutofordertimestamps
        summary: Prometheus drops samples with out-of-order timestamps.
      expr: |
        rate(prometheus_target_scrapes_sample_out_of_order_total{job="prometheus-k8s",namespace="default"}[5m]) > 0
      for: 10m
      labels:
        severity: warning
    # >1% remote-write sample failures. The `or` pairs cover both the pre-2.x
    # and current metric names for failed/succeeded samples.
    - alert: PrometheusRemoteStorageFailures
      annotations:
        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} failed to send
          {{ printf "%.1f" $value }}% of the samples to {{ $labels.remote_name}}:{{
          $labels.url }}
        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusremotestoragefailures
        summary: Prometheus fails to send samples to remote storage.
      expr: |
        (
          (rate(prometheus_remote_storage_failed_samples_total{job="prometheus-k8s",namespace="default"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job="prometheus-k8s",namespace="default"}[5m]))
        /
          (
            (rate(prometheus_remote_storage_failed_samples_total{job="prometheus-k8s",namespace="default"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job="prometheus-k8s",namespace="default"}[5m]))
          +
            (rate(prometheus_remote_storage_succeeded_samples_total{job="prometheus-k8s",namespace="default"}[5m]) or rate(prometheus_remote_storage_samples_total{job="prometheus-k8s",namespace="default"}[5m]))
          )
        )
        * 100
        > 1
      for: 15m
      labels:
        severity: critical
    # Remote write lags the newest ingested sample by more than 120s.
    - alert: PrometheusRemoteWriteBehind
      annotations:
        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write
          is {{ printf "%.1f" $value }}s behind for {{ $labels.remote_name}}:{{ $labels.url
          }}.
        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusremotewritebehind
        summary: Prometheus remote write is behind.
      expr: |
        # Without max_over_time, failed scrapes could create false negatives, see
        # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
        (
          max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job="prometheus-k8s",namespace="default"}[5m])
        - ignoring(remote_name, url) group_right
          max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job="prometheus-k8s",namespace="default"}[5m])
        )
        > 120
      for: 15m
      labels:
        severity: critical
    # Desired shard count exceeds the configured max — remote write cannot
    # scale up enough to keep pace.
    - alert: PrometheusRemoteWriteDesiredShards
      annotations:
        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write
          desired shards calculation wants to run {{ $value }} shards for queue {{
          $labels.remote_name}}:{{ $labels.url }}, which is more than the max of {{
          printf `prometheus_remote_storage_shards_max{instance="%s",job="prometheus-k8s",namespace="default"}`
          $labels.instance | query | first | value }}.
        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusremotewritedesiredshards
        summary: Prometheus remote write desired shards calculation wants to run more
          than configured max shards.
      expr: |
        # Without max_over_time, failed scrapes could create false negatives, see
        # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
        (
          max_over_time(prometheus_remote_storage_shards_desired{job="prometheus-k8s",namespace="default"}[5m])
        >
          max_over_time(prometheus_remote_storage_shards_max{job="prometheus-k8s",namespace="default"}[5m])
        )
      for: 15m
      labels:
        severity: warning
    # Rule evaluation failures -> recorded/alerting rules are producing no data.
    - alert: PrometheusRuleFailures
      annotations:
        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to
          evaluate {{ printf "%.0f" $value }} rules in the last 5m.
        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusrulefailures
        summary: Prometheus is failing rule evaluations.
      expr: |
        increase(prometheus_rule_evaluation_failures_total{job="prometheus-k8s",namespace="default"}[5m]) > 0
      for: 15m
      labels:
        severity: critical
    # Rule groups skipped iterations — evaluation is slower than the interval.
    - alert: PrometheusMissingRuleEvaluations
      annotations:
        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has missed {{
          printf "%.0f" $value }} rule group evaluations in the last 5m.
        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusmissingruleevaluations
        summary: Prometheus is missing rule evaluations due to slow rule group evaluation.
      expr: |
        increase(prometheus_rule_group_iterations_missed_total{job="prometheus-k8s",namespace="default"}[5m]) > 0
      for: 15m
      labels:
        severity: warning
    # A scrape pool dropped targets because target_limit was exceeded.
    - alert: PrometheusTargetLimitHit
      annotations:
        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has dropped
          {{ printf "%.0f" $value }} targets because the number of targets exceeded
          the configured target_limit.
        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheustargetlimithit
        summary: Prometheus has dropped targets because some scrape configs have exceeded
          the targets limit.
      expr: |
        increase(prometheus_target_scrape_pool_exceeded_target_limit_total{job="prometheus-k8s",namespace="default"}[5m]) > 0
      for: 15m
      labels:
        severity: warning
    # min across Alertmanagers >3%: delivery is failing to EVERY Alertmanager,
    # so alerts are effectively not being delivered at all -> critical.
    - alert: PrometheusErrorSendingAlertsToAnyAlertmanager
      annotations:
        description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts
          from Prometheus {{$labels.namespace}}/{{$labels.pod}} to any Alertmanager.'
        runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheuserrorsendingalertstoanyalertmanager
        summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager.
      expr: |
        min without (alertmanager) (
          rate(prometheus_notifications_errors_total{job="prometheus-k8s",namespace="default",alertmanager!~``}[5m])
        /
          rate(prometheus_notifications_sent_total{job="prometheus-k8s",namespace="default",alertmanager!~``}[5m])
        )
        * 100
        > 3
      for: 15m
      labels:
        severity: critical

View File

@ -1,8 +1,13 @@
apiVersion: rbac.authorization.k8s.io/v1 apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding kind: RoleBinding
metadata: metadata:
labels:
app.kubernetes.io/component: prometheus
app.kubernetes.io/name: prometheus
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.26.0
name: prometheus-k8s-config name: prometheus-k8s-config
namespace: monitoring namespace: default
roleRef: roleRef:
apiGroup: rbac.authorization.k8s.io apiGroup: rbac.authorization.k8s.io
kind: Role kind: Role
@ -10,4 +15,4 @@ roleRef:
subjects: subjects:
- kind: ServiceAccount - kind: ServiceAccount
name: prometheus-k8s name: prometheus-k8s
namespace: monitoring namespace: default

View File

@ -3,6 +3,11 @@ items:
- apiVersion: rbac.authorization.k8s.io/v1 - apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding kind: RoleBinding
metadata: metadata:
labels:
app.kubernetes.io/component: prometheus
app.kubernetes.io/name: prometheus
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.26.0
name: prometheus-k8s name: prometheus-k8s
namespace: default namespace: default
roleRef: roleRef:
@ -12,10 +17,15 @@ items:
subjects: subjects:
- kind: ServiceAccount - kind: ServiceAccount
name: prometheus-k8s name: prometheus-k8s
namespace: monitoring namespace: default
- apiVersion: rbac.authorization.k8s.io/v1 - apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding kind: RoleBinding
metadata: metadata:
labels:
app.kubernetes.io/component: prometheus
app.kubernetes.io/name: prometheus
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.26.0
name: prometheus-k8s name: prometheus-k8s
namespace: kube-system namespace: kube-system
roleRef: roleRef:
@ -25,12 +35,17 @@ items:
subjects: subjects:
- kind: ServiceAccount - kind: ServiceAccount
name: prometheus-k8s name: prometheus-k8s
namespace: monitoring namespace: default
- apiVersion: rbac.authorization.k8s.io/v1 - apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding kind: RoleBinding
metadata: metadata:
labels:
app.kubernetes.io/component: prometheus
app.kubernetes.io/name: prometheus
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.26.0
name: prometheus-k8s name: prometheus-k8s
namespace: monitoring namespace: default
roleRef: roleRef:
apiGroup: rbac.authorization.k8s.io apiGroup: rbac.authorization.k8s.io
kind: Role kind: Role
@ -38,31 +53,5 @@ items:
subjects: subjects:
- kind: ServiceAccount - kind: ServiceAccount
name: prometheus-k8s name: prometheus-k8s
namespace: monitoring namespace: default
- apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: prometheus-k8s
namespace: k8up
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: prometheus-k8s
subjects:
- kind: ServiceAccount
name: prometheus-k8s
namespace: monitoring
- apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: prometheus-k8s
namespace: owntracks
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: prometheus-k8s
subjects:
- kind: ServiceAccount
name: prometheus-k8s
namespace: monitoring
kind: RoleBindingList kind: RoleBindingList

View File

@ -1,8 +1,13 @@
apiVersion: rbac.authorization.k8s.io/v1 apiVersion: rbac.authorization.k8s.io/v1
kind: Role kind: Role
metadata: metadata:
labels:
app.kubernetes.io/component: prometheus
app.kubernetes.io/name: prometheus
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.26.0
name: prometheus-k8s-config name: prometheus-k8s-config
namespace: monitoring namespace: default
rules: rules:
- apiGroups: - apiGroups:
- "" - ""

View File

@ -3,6 +3,11 @@ items:
- apiVersion: rbac.authorization.k8s.io/v1 - apiVersion: rbac.authorization.k8s.io/v1
kind: Role kind: Role
metadata: metadata:
labels:
app.kubernetes.io/component: prometheus
app.kubernetes.io/name: prometheus
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.26.0
name: prometheus-k8s name: prometheus-k8s
namespace: default namespace: default
rules: rules:
@ -24,9 +29,22 @@ items:
- get - get
- list - list
- watch - watch
- apiGroups:
- networking.k8s.io
resources:
- ingresses
verbs:
- get
- list
- watch
- apiVersion: rbac.authorization.k8s.io/v1 - apiVersion: rbac.authorization.k8s.io/v1
kind: Role kind: Role
metadata: metadata:
labels:
app.kubernetes.io/component: prometheus
app.kubernetes.io/name: prometheus
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.26.0
name: prometheus-k8s name: prometheus-k8s
namespace: kube-system namespace: kube-system
rules: rules:
@ -48,11 +66,24 @@ items:
- get - get
- list - list
- watch - watch
- apiGroups:
- networking.k8s.io
resources:
- ingresses
verbs:
- get
- list
- watch
- apiVersion: rbac.authorization.k8s.io/v1 - apiVersion: rbac.authorization.k8s.io/v1
kind: Role kind: Role
metadata: metadata:
labels:
app.kubernetes.io/component: prometheus
app.kubernetes.io/name: prometheus
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.26.0
name: prometheus-k8s name: prometheus-k8s
namespace: monitoring namespace: default
rules: rules:
- apiGroups: - apiGroups:
- "" - ""
@ -72,48 +103,8 @@ items:
- get - get
- list - list
- watch - watch
- apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
name: prometheus-k8s
namespace: k8up
rules:
- apiGroups: - apiGroups:
- "" - networking.k8s.io
resources:
- services
- endpoints
- pods
verbs:
- get
- list
- watch
- apiGroups:
- extensions
resources:
- ingresses
verbs:
- get
- list
- watch
- apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
name: prometheus-k8s
namespace: owntracks
rules:
- apiGroups:
- ""
resources:
- services
- endpoints
- pods
verbs:
- get
- list
- watch
- apiGroups:
- extensions
resources: resources:
- ingresses - ingresses
verbs: verbs:

File diff suppressed because it is too large Load Diff

View File

@ -2,9 +2,13 @@ apiVersion: v1
kind: Service kind: Service
metadata: metadata:
labels: labels:
app.kubernetes.io/component: prometheus
app.kubernetes.io/name: prometheus
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.26.0
prometheus: k8s prometheus: k8s
name: prometheus-k8s name: prometheus-k8s
namespace: monitoring namespace: default
spec: spec:
ports: ports:
- name: web - name: web
@ -12,5 +16,8 @@ spec:
targetPort: web targetPort: web
selector: selector:
app: prometheus app: prometheus
app.kubernetes.io/component: prometheus
app.kubernetes.io/name: prometheus
app.kubernetes.io/part-of: kube-prometheus
prometheus: k8s prometheus: k8s
sessionAffinity: ClientIP sessionAffinity: ClientIP

View File

@ -1,5 +1,10 @@
apiVersion: v1 apiVersion: v1
kind: ServiceAccount kind: ServiceAccount
metadata: metadata:
labels:
app.kubernetes.io/component: prometheus
app.kubernetes.io/name: prometheus
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.26.0
name: prometheus-k8s name: prometheus-k8s
namespace: monitoring namespace: default

View File

@ -2,13 +2,19 @@ apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor kind: ServiceMonitor
metadata: metadata:
labels: labels:
k8s-app: prometheus app.kubernetes.io/component: prometheus
name: prometheus app.kubernetes.io/name: prometheus
namespace: monitoring app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.26.0
name: prometheus-k8s
namespace: default
spec: spec:
endpoints: endpoints:
- interval: 30s - interval: 30s
port: web port: web
selector: selector:
matchLabels: matchLabels:
app.kubernetes.io/component: prometheus
app.kubernetes.io/name: prometheus
app.kubernetes.io/part-of: kube-prometheus
prometheus: k8s prometheus: k8s

View File

@ -1,74 +0,0 @@
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
labels:
k8s-app: apiserver
name: kube-apiserver
namespace: monitoring
spec:
endpoints:
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
interval: 30s
metricRelabelings:
- action: drop
regex: kubelet_(pod_worker_latency_microseconds|pod_start_latency_microseconds|cgroup_manager_latency_microseconds|pod_worker_start_latency_microseconds|pleg_relist_latency_microseconds|pleg_relist_interval_microseconds|runtime_operations|runtime_operations_latency_microseconds|runtime_operations_errors|eviction_stats_age_microseconds|device_plugin_registration_count|device_plugin_alloc_latency_microseconds|network_plugin_operations_latency_microseconds)
sourceLabels:
- __name__
- action: drop
regex: scheduler_(e2e_scheduling_latency_microseconds|scheduling_algorithm_predicate_evaluation|scheduling_algorithm_priority_evaluation|scheduling_algorithm_preemption_evaluation|scheduling_algorithm_latency_microseconds|binding_latency_microseconds|scheduling_latency_seconds)
sourceLabels:
- __name__
- action: drop
regex: apiserver_(request_count|request_latencies|request_latencies_summary|dropped_requests|storage_data_key_generation_latencies_microseconds|storage_transformation_failures_total|storage_transformation_latencies_microseconds|proxy_tunnel_sync_latency_secs)
sourceLabels:
- __name__
- action: drop
regex: kubelet_docker_(operations|operations_latency_microseconds|operations_errors|operations_timeout)
sourceLabels:
- __name__
- action: drop
regex: reflector_(items_per_list|items_per_watch|list_duration_seconds|lists_total|short_watches_total|watch_duration_seconds|watches_total)
sourceLabels:
- __name__
- action: drop
regex: etcd_(helper_cache_hit_count|helper_cache_miss_count|helper_cache_entry_count|request_cache_get_latencies_summary|request_cache_add_latencies_summary|request_latencies_summary)
sourceLabels:
- __name__
- action: drop
regex: transformation_(transformation_latencies_microseconds|failures_total)
sourceLabels:
- __name__
- action: drop
regex: (admission_quota_controller_adds|crd_autoregistration_controller_work_duration|APIServiceOpenAPIAggregationControllerQueue1_adds|AvailableConditionController_retries|crd_openapi_controller_unfinished_work_seconds|APIServiceRegistrationController_retries|admission_quota_controller_longest_running_processor_microseconds|crdEstablishing_longest_running_processor_microseconds|crdEstablishing_unfinished_work_seconds|crd_openapi_controller_adds|crd_autoregistration_controller_retries|crd_finalizer_queue_latency|AvailableConditionController_work_duration|non_structural_schema_condition_controller_depth|crd_autoregistration_controller_unfinished_work_seconds|AvailableConditionController_adds|DiscoveryController_longest_running_processor_microseconds|autoregister_queue_latency|crd_autoregistration_controller_adds|non_structural_schema_condition_controller_work_duration|APIServiceRegistrationController_adds|crd_finalizer_work_duration|crd_naming_condition_controller_unfinished_work_seconds|crd_openapi_controller_longest_running_processor_microseconds|DiscoveryController_adds|crd_autoregistration_controller_longest_running_processor_microseconds|autoregister_unfinished_work_seconds|crd_naming_condition_controller_queue_latency|crd_naming_condition_controller_retries|non_structural_schema_condition_controller_queue_latency|crd_naming_condition_controller_depth|AvailableConditionController_longest_running_processor_microseconds|crdEstablishing_depth|crd_finalizer_longest_running_processor_microseconds|crd_naming_condition_controller_adds|APIServiceOpenAPIAggregationControllerQueue1_longest_running_processor_microseconds|DiscoveryController_queue_latency|DiscoveryController_unfinished_work_seconds|crd_openapi_controller_depth|APIServiceOpenAPIAggregationControllerQueue1_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_unfinished_work_seconds|DiscoveryController_work_duration|autoregister_adds|crd_autoregistration_controller_queue_latency|crd_finalizer_retries|Ava
ilableConditionController_unfinished_work_seconds|autoregister_longest_running_processor_microseconds|non_structural_schema_condition_controller_unfinished_work_seconds|APIServiceOpenAPIAggregationControllerQueue1_depth|AvailableConditionController_depth|DiscoveryController_retries|admission_quota_controller_depth|crdEstablishing_adds|APIServiceOpenAPIAggregationControllerQueue1_retries|crdEstablishing_queue_latency|non_structural_schema_condition_controller_longest_running_processor_microseconds|autoregister_work_duration|crd_openapi_controller_retries|APIServiceRegistrationController_work_duration|crdEstablishing_work_duration|crd_finalizer_adds|crd_finalizer_depth|crd_openapi_controller_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_work_duration|APIServiceRegistrationController_queue_latency|crd_autoregistration_controller_depth|AvailableConditionController_queue_latency|admission_quota_controller_queue_latency|crd_naming_condition_controller_work_duration|crd_openapi_controller_work_duration|DiscoveryController_depth|crd_naming_condition_controller_longest_running_processor_microseconds|APIServiceRegistrationController_depth|APIServiceRegistrationController_longest_running_processor_microseconds|crd_finalizer_unfinished_work_seconds|crdEstablishing_retries|admission_quota_controller_unfinished_work_seconds|non_structural_schema_condition_controller_adds|APIServiceRegistrationController_unfinished_work_seconds|admission_quota_controller_work_duration|autoregister_depth|autoregister_retries|kubeproxy_sync_proxy_rules_latency_microseconds|rest_client_request_latency_seconds|non_structural_schema_condition_controller_retries)
sourceLabels:
- __name__
- action: drop
regex: etcd_(debugging|disk|server).*
sourceLabels:
- __name__
- action: drop
regex: apiserver_admission_controller_admission_latencies_seconds_.*
sourceLabels:
- __name__
- action: drop
regex: apiserver_admission_step_admission_latencies_seconds_.*
sourceLabels:
- __name__
- action: drop
regex: apiserver_request_duration_seconds_bucket;(0.15|0.25|0.3|0.35|0.4|0.45|0.6|0.7|0.8|0.9|1.25|1.5|1.75|2.5|3|3.5|4.5|6|7|8|9|15|25|30|50)
sourceLabels:
- __name__
- le
port: https
scheme: https
tlsConfig:
caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
serverName: kubernetes
jobLabel: component
namespaceSelector:
matchNames:
- default
selector:
matchLabels:
component: apiserver
provider: kubernetes

View File

@ -1,19 +0,0 @@
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
labels:
k8s-app: coredns
name: coredns
namespace: monitoring
spec:
endpoints:
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
interval: 15s
port: metrics
jobLabel: k8s-app
namespaceSelector:
matchNames:
- kube-system
selector:
matchLabels:
k8s-app: kube-dns

View File

@ -1,10 +1,3 @@
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
labels:
k8s-app: kube-controller-manager
name: kube-controller-manager
namespace: monitoring
spec: spec:
endpoints: endpoints:
- interval: 30s - interval: 30s
@ -46,10 +39,3 @@ spec:
sourceLabels: sourceLabels:
- __name__ - __name__
port: http-metrics port: http-metrics
jobLabel: k8s-app
namespaceSelector:
matchNames:
- kube-system
selector:
matchLabels:
k8s-app: kube-controller-manager

View File

@ -1,18 +1,4 @@
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
labels:
k8s-app: kube-scheduler
name: kube-scheduler
namespace: monitoring
spec: spec:
endpoints: endpoints:
- interval: 30s - interval: 30s
port: http-metrics port: http-metrics
jobLabel: k8s-app
namespaceSelector:
matchNames:
- kube-system
selector:
matchLabels:
k8s-app: kube-scheduler

View File

@ -1,90 +0,0 @@
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
labels:
k8s-app: kubelet
name: kubelet
namespace: monitoring
spec:
endpoints:
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
honorLabels: true
interval: 30s
metricRelabelings:
- action: drop
regex: kubelet_(pod_worker_latency_microseconds|pod_start_latency_microseconds|cgroup_manager_latency_microseconds|pod_worker_start_latency_microseconds|pleg_relist_latency_microseconds|pleg_relist_interval_microseconds|runtime_operations|runtime_operations_latency_microseconds|runtime_operations_errors|eviction_stats_age_microseconds|device_plugin_registration_count|device_plugin_alloc_latency_microseconds|network_plugin_operations_latency_microseconds)
sourceLabels:
- __name__
- action: drop
regex: scheduler_(e2e_scheduling_latency_microseconds|scheduling_algorithm_predicate_evaluation|scheduling_algorithm_priority_evaluation|scheduling_algorithm_preemption_evaluation|scheduling_algorithm_latency_microseconds|binding_latency_microseconds|scheduling_latency_seconds)
sourceLabels:
- __name__
- action: drop
regex: apiserver_(request_count|request_latencies|request_latencies_summary|dropped_requests|storage_data_key_generation_latencies_microseconds|storage_transformation_failures_total|storage_transformation_latencies_microseconds|proxy_tunnel_sync_latency_secs)
sourceLabels:
- __name__
- action: drop
regex: kubelet_docker_(operations|operations_latency_microseconds|operations_errors|operations_timeout)
sourceLabels:
- __name__
- action: drop
regex: reflector_(items_per_list|items_per_watch|list_duration_seconds|lists_total|short_watches_total|watch_duration_seconds|watches_total)
sourceLabels:
- __name__
- action: drop
regex: etcd_(helper_cache_hit_count|helper_cache_miss_count|helper_cache_entry_count|request_cache_get_latencies_summary|request_cache_add_latencies_summary|request_latencies_summary)
sourceLabels:
- __name__
- action: drop
regex: transformation_(transformation_latencies_microseconds|failures_total)
sourceLabels:
- __name__
- action: drop
regex: (admission_quota_controller_adds|crd_autoregistration_controller_work_duration|APIServiceOpenAPIAggregationControllerQueue1_adds|AvailableConditionController_retries|crd_openapi_controller_unfinished_work_seconds|APIServiceRegistrationController_retries|admission_quota_controller_longest_running_processor_microseconds|crdEstablishing_longest_running_processor_microseconds|crdEstablishing_unfinished_work_seconds|crd_openapi_controller_adds|crd_autoregistration_controller_retries|crd_finalizer_queue_latency|AvailableConditionController_work_duration|non_structural_schema_condition_controller_depth|crd_autoregistration_controller_unfinished_work_seconds|AvailableConditionController_adds|DiscoveryController_longest_running_processor_microseconds|autoregister_queue_latency|crd_autoregistration_controller_adds|non_structural_schema_condition_controller_work_duration|APIServiceRegistrationController_adds|crd_finalizer_work_duration|crd_naming_condition_controller_unfinished_work_seconds|crd_openapi_controller_longest_running_processor_microseconds|DiscoveryController_adds|crd_autoregistration_controller_longest_running_processor_microseconds|autoregister_unfinished_work_seconds|crd_naming_condition_controller_queue_latency|crd_naming_condition_controller_retries|non_structural_schema_condition_controller_queue_latency|crd_naming_condition_controller_depth|AvailableConditionController_longest_running_processor_microseconds|crdEstablishing_depth|crd_finalizer_longest_running_processor_microseconds|crd_naming_condition_controller_adds|APIServiceOpenAPIAggregationControllerQueue1_longest_running_processor_microseconds|DiscoveryController_queue_latency|DiscoveryController_unfinished_work_seconds|crd_openapi_controller_depth|APIServiceOpenAPIAggregationControllerQueue1_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_unfinished_work_seconds|DiscoveryController_work_duration|autoregister_adds|crd_autoregistration_controller_queue_latency|crd_finalizer_retries|Ava
ilableConditionController_unfinished_work_seconds|autoregister_longest_running_processor_microseconds|non_structural_schema_condition_controller_unfinished_work_seconds|APIServiceOpenAPIAggregationControllerQueue1_depth|AvailableConditionController_depth|DiscoveryController_retries|admission_quota_controller_depth|crdEstablishing_adds|APIServiceOpenAPIAggregationControllerQueue1_retries|crdEstablishing_queue_latency|non_structural_schema_condition_controller_longest_running_processor_microseconds|autoregister_work_duration|crd_openapi_controller_retries|APIServiceRegistrationController_work_duration|crdEstablishing_work_duration|crd_finalizer_adds|crd_finalizer_depth|crd_openapi_controller_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_work_duration|APIServiceRegistrationController_queue_latency|crd_autoregistration_controller_depth|AvailableConditionController_queue_latency|admission_quota_controller_queue_latency|crd_naming_condition_controller_work_duration|crd_openapi_controller_work_duration|DiscoveryController_depth|crd_naming_condition_controller_longest_running_processor_microseconds|APIServiceRegistrationController_depth|APIServiceRegistrationController_longest_running_processor_microseconds|crd_finalizer_unfinished_work_seconds|crdEstablishing_retries|admission_quota_controller_unfinished_work_seconds|non_structural_schema_condition_controller_adds|APIServiceRegistrationController_unfinished_work_seconds|admission_quota_controller_work_duration|autoregister_depth|autoregister_retries|kubeproxy_sync_proxy_rules_latency_microseconds|rest_client_request_latency_seconds|non_structural_schema_condition_controller_retries)
sourceLabels:
- __name__
port: https-metrics
relabelings:
- sourceLabels:
- __metrics_path__
targetLabel: metrics_path
scheme: https
tlsConfig:
insecureSkipVerify: true
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
honorLabels: true
honorTimestamps: false
interval: 30s
metricRelabelings:
- action: drop
regex: container_(network_tcp_usage_total|network_udp_usage_total|tasks_state|cpu_load_average_10s)
sourceLabels:
- __name__
path: /metrics/cadvisor
port: https-metrics
relabelings:
- sourceLabels:
- __metrics_path__
targetLabel: metrics_path
scheme: https
tlsConfig:
insecureSkipVerify: true
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
honorLabels: true
interval: 30s
path: /metrics/probes
port: https-metrics
relabelings:
- sourceLabels:
- __metrics_path__
targetLabel: metrics_path
scheme: https
tlsConfig:
insecureSkipVerify: true
jobLabel: k8s-app
namespaceSelector:
matchNames:
- kube-system
selector:
matchLabels:
k8s-app: kubelet

View File

@ -1,4 +1,4 @@
apiVersion: v1 apiVersion: v1
kind: Namespace kind: Namespace
metadata: metadata:
name: monitoring name: default

View File

@ -0,0 +1,76 @@
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: kube-prometheus
app.kubernetes.io/part-of: kube-prometheus
prometheus: k8s
role: alert-rules
name: kube-prometheus-rules
namespace: default
spec:
groups:
- name: general.rules
rules:
- alert: TargetDown
annotations:
description: '{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{ $labels.service
}} targets in {{ $labels.namespace }} namespace are down.'
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/targetdown
summary: One or more targets are unreachable.
expr: 100 * (count(up == 0) BY (job, namespace, service) / count(up) BY (job,
namespace, service)) > 10
for: 10m
labels:
severity: warning
- alert: Watchdog
annotations:
description: |
This is an alert meant to ensure that the entire alerting pipeline is functional.
This alert is always firing, therefore it should always be firing in Alertmanager
and always fire against a receiver. There are integrations with various notification
mechanisms that send a notification when this alert is not firing. For example the
"DeadMansSnitch" integration in PagerDuty.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/watchdog
summary: An alert that should always be firing to certify that Alertmanager
is working properly.
expr: vector(1)
labels:
severity: none
- name: node-network
rules:
- alert: NodeNetworkInterfaceFlapping
annotations:
message: Network interface "{{ $labels.device }}" changing it's up status
often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodenetworkinterfaceflapping
expr: |
changes(node_network_up{job="node-exporter",device!~"veth.+"}[2m]) > 2
for: 2m
labels:
severity: warning
- name: kube-prometheus-node-recording.rules
rules:
- expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[3m]))
BY (instance)
record: instance:node_cpu:rate:sum
- expr: sum(rate(node_network_receive_bytes_total[3m])) BY (instance)
record: instance:node_network_receive_bytes:rate:sum
- expr: sum(rate(node_network_transmit_bytes_total[3m])) BY (instance)
record: instance:node_network_transmit_bytes:rate:sum
- expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m]))
WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu_seconds_total)
BY (instance, cpu)) BY (instance)
record: instance:node_cpu:ratio
- expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m]))
record: cluster:node_cpu:sum_rate5m
- expr: cluster:node_cpu_seconds_total:rate5m / count(sum(node_cpu_seconds_total)
BY (instance, cpu))
record: cluster:node_cpu:ratio
- name: kube-prometheus-general.rules
rules:
- expr: count without(instance, pod, node) (up == 1)
record: count:up1
- expr: count without(instance, pod, node) (up == 0)
record: count:up0

View File

@ -8,6 +8,8 @@ metadata:
spec: spec:
group: monitoring.coreos.com group: monitoring.coreos.com
names: names:
categories:
- prometheus-operator
kind: AlertmanagerConfig kind: AlertmanagerConfig
listKind: AlertmanagerConfigList listKind: AlertmanagerConfigList
plural: alertmanagerconfigs plural: alertmanagerconfigs
@ -60,6 +62,7 @@ spec:
properties: properties:
name: name:
description: Label to match. description: Label to match.
minLength: 1
type: string type: string
regex: regex:
description: Whether to match on equality (false) or regular-expression description: Whether to match on equality (false) or regular-expression
@ -70,7 +73,6 @@ spec:
type: string type: string
required: required:
- name - name
- value
type: object type: object
type: array type: array
targetMatch: targetMatch:
@ -82,6 +84,7 @@ spec:
properties: properties:
name: name:
description: Label to match. description: Label to match.
minLength: 1
type: string type: string
regex: regex:
description: Whether to match on equality (false) or regular-expression description: Whether to match on equality (false) or regular-expression
@ -92,7 +95,6 @@ spec:
type: string type: string
required: required:
- name - name
- value
type: object type: object
type: array type: array
type: object type: object
@ -108,9 +110,13 @@ spec:
description: EmailConfig configures notifications via Email. description: EmailConfig configures notifications via Email.
properties: properties:
authIdentity: authIdentity:
description: The identity to use for authentication.
type: string type: string
authPassword: authPassword:
description: SecretKeySelector selects a key of a Secret. description: The secret's key that contains the password
to use for authentication. The secret needs to be in
the same namespace as the AlertmanagerConfig object
and accessible by the Prometheus Operator.
properties: properties:
key: key:
description: The key of the secret to select from. Must description: The key of the secret to select from. Must
@ -129,7 +135,10 @@ spec:
- key - key
type: object type: object
authSecret: authSecret:
description: SecretKeySelector selects a key of a Secret. description: The secret's key that contains the CRAM-MD5
secret. The secret needs to be in the same namespace
as the AlertmanagerConfig object and accessible by the
Prometheus Operator.
properties: properties:
key: key:
description: The key of the secret to select from. Must description: The key of the secret to select from. Must
@ -148,7 +157,7 @@ spec:
- key - key
type: object type: object
authUsername: authUsername:
description: SMTP authentication information. description: The username to use for authentication.
type: string type: string
from: from:
description: The sender address. description: The sender address.
@ -162,6 +171,7 @@ spec:
properties: properties:
key: key:
description: Key of the tuple. description: Key of the tuple.
minLength: 1
type: string type: string
value: value:
description: Value of the tuple. description: Value of the tuple.
@ -321,6 +331,7 @@ spec:
name: name:
description: Name of the receiver. Must be unique across all description: Name of the receiver. Must be unique across all
items from the list. items from the list.
minLength: 1
type: string type: string
opsgenieConfigs: opsgenieConfigs:
description: List of OpsGenie configurations. description: List of OpsGenie configurations.
@ -364,6 +375,7 @@ spec:
properties: properties:
key: key:
description: Key of the tuple. description: Key of the tuple.
minLength: 1
type: string type: string
value: value:
description: Value of the tuple. description: Value of the tuple.
@ -590,8 +602,8 @@ spec:
description: List of responders responsible for notifications. description: List of responders responsible for notifications.
items: items:
description: OpsGenieConfigResponder defines a responder description: OpsGenieConfigResponder defines a responder
to an incident. One of id, name or username has to to an incident. One of `id`, `name` or `username`
be defined. has to be defined.
properties: properties:
id: id:
description: ID of the responder. description: ID of the responder.
@ -601,10 +613,13 @@ spec:
type: string type: string
type: type:
description: Type of responder. description: Type of responder.
minLength: 1
type: string type: string
username: username:
description: Username of the responder. description: Username of the responder.
type: string type: string
required:
- type
type: object type: object
type: array type: array
sendResolved: sendResolved:
@ -649,6 +664,7 @@ spec:
properties: properties:
key: key:
description: Key of the tuple. description: Key of the tuple.
minLength: 1
type: string type: string
value: value:
description: Value of the tuple. description: Value of the tuple.
@ -1163,8 +1179,11 @@ spec:
description: Notification title. description: Notification title.
type: string type: string
token: token:
description: Your registered applications API token, description: The secret's key that contains the registered
see https://pushover.net/apps applications API token, see https://pushover.net/apps.
The secret needs to be in the same namespace as the
AlertmanagerConfig object and accessible by the Prometheus
Operator.
properties: properties:
key: key:
description: The key of the secret to select from. Must description: The key of the secret to select from. Must
@ -1190,7 +1209,10 @@ spec:
just the URL is shown just the URL is shown
type: string type: string
userKey: userKey:
description: The recipient users user key. description: The secret's key that contains the recipient
users user key. The secret needs to be in the same
namespace as the AlertmanagerConfig object and accessible
by the Prometheus Operator.
properties: properties:
key: key:
description: The key of the secret to select from. Must description: The key of the secret to select from. Must
@ -1237,6 +1259,7 @@ spec:
okText: okText:
type: string type: string
text: text:
minLength: 1
type: string type: string
title: title:
type: string type: string
@ -1248,8 +1271,10 @@ spec:
style: style:
type: string type: string
text: text:
minLength: 1
type: string type: string
type: type:
minLength: 1
type: string type: string
url: url:
type: string type: string
@ -1307,8 +1332,10 @@ spec:
short: short:
type: boolean type: boolean
title: title:
minLength: 1
type: string type: string
value: value:
minLength: 1
type: string type: string
required: required:
- title - title
@ -1558,8 +1585,10 @@ spec:
VictorOps. See https://prometheus.io/docs/alerting/latest/configuration/#victorops_config VictorOps. See https://prometheus.io/docs/alerting/latest/configuration/#victorops_config
properties: properties:
apiKey: apiKey:
description: The API key to use when talking to the VictorOps description: The secret's key that contains the API key
API. to use when talking to the VictorOps API. The secret
needs to be in the same namespace as the AlertmanagerConfig
object and accessible by the Prometheus Operator.
properties: properties:
key: key:
description: The key of the secret to select from. Must description: The key of the secret to select from. Must
@ -1587,6 +1616,7 @@ spec:
properties: properties:
key: key:
description: Key of the tuple. description: Key of the tuple.
minLength: 1
type: string type: string
value: value:
description: Value of the tuple. description: Value of the tuple.
@ -1820,8 +1850,6 @@ spec:
description: Contains long explanation of the alerted description: Contains long explanation of the alerted
problem. problem.
type: string type: string
required:
- routingKey
type: object type: object
type: array type: array
webhookConfigs: webhookConfigs:
@ -2035,8 +2063,9 @@ spec:
type: object type: object
maxAlerts: maxAlerts:
description: Maximum number of alerts to be sent per webhook description: Maximum number of alerts to be sent per webhook
message. message. When 0, all alerts are included.
format: int32 format: int32
minimum: 0
type: integer type: integer
sendResolved: sendResolved:
description: Whether or not to notify about resolved alerts. description: Whether or not to notify about resolved alerts.
@ -2334,8 +2363,8 @@ spec:
type: array type: array
route: route:
description: The Alertmanager route definition for alerts matching description: The Alertmanager route definition for alerts matching
the resources namespace. It will be added to the generated Alertmanager the resources namespace. If present, it will be added to the generated
configuration as a first-level route. Alertmanager configuration as a first-level route.
properties: properties:
continue: continue:
description: Boolean indicating whether an alert should continue description: Boolean indicating whether an alert should continue
@ -2367,6 +2396,7 @@ spec:
properties: properties:
name: name:
description: Label to match. description: Label to match.
minLength: 1
type: string type: string
regex: regex:
description: Whether to match on equality (false) or regular-expression description: Whether to match on equality (false) or regular-expression
@ -2377,13 +2407,11 @@ spec:
type: string type: string
required: required:
- name - name
- value
type: object type: object
type: array type: array
receiver: receiver:
description: Name of the receiver for this route. If present, description: Name of the receiver for this route. If not empty,
it should be listed in the `receivers` field. The field can it should be listed in the `receivers` field.
be omitted only for nested routes otherwise it is mandatory.
type: string type: string
repeatInterval: repeatInterval:
description: How long to wait before repeating the last notification. description: How long to wait before repeating the last notification.

View File

@ -8,6 +8,8 @@ metadata:
spec: spec:
group: monitoring.coreos.com group: monitoring.coreos.com
names: names:
categories:
- prometheus-operator
kind: Alertmanager kind: Alertmanager
listKind: AlertmanagerList listKind: AlertmanagerList
plural: alertmanagers plural: alertmanagers

View File

@ -8,6 +8,8 @@ metadata:
spec: spec:
group: monitoring.coreos.com group: monitoring.coreos.com
names: names:
categories:
- prometheus-operator
kind: PodMonitor kind: PodMonitor
listKind: PodMonitorList listKind: PodMonitorList
plural: podmonitors plural: podmonitors
@ -197,8 +199,10 @@ spec:
to proxy through this endpoint. to proxy through this endpoint.
type: string type: string
relabelings: relabelings:
description: 'RelabelConfigs to apply to samples before ingestion. description: 'RelabelConfigs to apply to samples before scraping.
More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config' Prometheus Operator automatically adds relabelings for a few
standard Kubernetes fields and replaces original scrape job
name with __tmp_prometheus_job_name. More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config'
items: items:
description: 'RelabelConfig allows dynamic rewriting of the description: 'RelabelConfig allows dynamic rewriting of the
label set, being applied to samples before ingestion. It label set, being applied to samples before ingestion. It

View File

@ -8,6 +8,8 @@ metadata:
spec: spec:
group: monitoring.coreos.com group: monitoring.coreos.com
names: names:
categories:
- prometheus-operator
kind: Probe kind: Probe
listKind: ProbeList listKind: ProbeList
plural: probes plural: probes
@ -35,6 +37,68 @@ spec:
description: Specification of desired Ingress selection for target discovery description: Specification of desired Ingress selection for target discovery
by Prometheus. by Prometheus.
properties: properties:
basicAuth:
description: 'BasicAuth allow an endpoint to authenticate over basic
authentication. More info: https://prometheus.io/docs/operating/configuration/#endpoint'
properties:
password:
description: The secret in the service monitor namespace that
contains the password for authentication.
properties:
key:
description: The key of the secret to select from. Must be
a valid secret key.
type: string
name:
description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
TODO: Add other useful fields. apiVersion, kind, uid?'
type: string
optional:
description: Specify whether the Secret or its key must be
defined
type: boolean
required:
- key
type: object
username:
description: The secret in the service monitor namespace that
contains the username for authentication.
properties:
key:
description: The key of the secret to select from. Must be
a valid secret key.
type: string
name:
description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
TODO: Add other useful fields. apiVersion, kind, uid?'
type: string
optional:
description: Specify whether the Secret or its key must be
defined
type: boolean
required:
- key
type: object
type: object
bearerTokenSecret:
description: Secret to mount to read bearer token for scraping targets.
The secret needs to be in the same namespace as the probe and accessible
by the Prometheus Operator.
properties:
key:
description: The key of the secret to select from. Must be a
valid secret key.
type: string
name:
description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
TODO: Add other useful fields. apiVersion, kind, uid?'
type: string
optional:
description: Specify whether the Secret or its key must be defined
type: boolean
required:
- key
type: object
interval: interval:
description: Interval at which targets are probed using the configured description: Interval at which targets are probed using the configured
prober. If not specified Prometheus' global scrape interval is used. prober. If not specified Prometheus' global scrape interval is used.
@ -190,6 +254,52 @@ spec:
description: Labels assigned to all metrics scraped from the description: Labels assigned to all metrics scraped from the
targets. targets.
type: object type: object
relabelingConfigs:
description: 'RelabelConfigs to apply to samples before ingestion.
More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config'
items:
description: 'RelabelConfig allows dynamic rewriting of
the label set, being applied to samples before ingestion.
It defines `<metric_relabel_configs>`-section of Prometheus
configuration. More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#metric_relabel_configs'
properties:
action:
description: Action to perform based on regex matching.
Default is 'replace'
type: string
modulus:
description: Modulus to take of the hash of the source
label values.
format: int64
type: integer
regex:
description: Regular expression against which the extracted
value is matched. Default is '(.*)'
type: string
replacement:
description: Replacement value against which a regex
replace is performed if the regular expression matches.
Regex capture groups are available. Default is '$1'
type: string
separator:
description: Separator placed between concatenated source
label values. default is ';'.
type: string
sourceLabels:
description: The source labels select values from existing
labels. Their content is concatenated using the configured
separator and matched against the configured regular
expression for the replace, keep, and drop actions.
items:
type: string
type: array
targetLabel:
description: Label to which the resulting value is written
in a replace action. It is mandatory for replace actions.
Regex capture groups are available.
type: string
type: object
type: array
static: static:
description: Targets is a list of URLs to probe using the description: Targets is a list of URLs to probe using the
configured prober. configured prober.
@ -198,6 +308,112 @@ spec:
type: array type: array
type: object type: object
type: object type: object
tlsConfig:
description: TLS configuration to use when scraping the endpoint.
properties:
ca:
description: Struct containing the CA cert to use for the targets.
properties:
configMap:
description: ConfigMap containing data to use for the targets.
properties:
key:
description: The key to select.
type: string
name:
description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
TODO: Add other useful fields. apiVersion, kind, uid?'
type: string
optional:
description: Specify whether the ConfigMap or its key
must be defined
type: boolean
required:
- key
type: object
secret:
description: Secret containing data to use for the targets.
properties:
key:
description: The key of the secret to select from. Must
be a valid secret key.
type: string
name:
description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
TODO: Add other useful fields. apiVersion, kind, uid?'
type: string
optional:
description: Specify whether the Secret or its key must
be defined
type: boolean
required:
- key
type: object
type: object
cert:
description: Struct containing the client cert file for the targets.
properties:
configMap:
description: ConfigMap containing data to use for the targets.
properties:
key:
description: The key to select.
type: string
name:
description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
TODO: Add other useful fields. apiVersion, kind, uid?'
type: string
optional:
description: Specify whether the ConfigMap or its key
must be defined
type: boolean
required:
- key
type: object
secret:
description: Secret containing data to use for the targets.
properties:
key:
description: The key of the secret to select from. Must
be a valid secret key.
type: string
name:
description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
TODO: Add other useful fields. apiVersion, kind, uid?'
type: string
optional:
description: Specify whether the Secret or its key must
be defined
type: boolean
required:
- key
type: object
type: object
insecureSkipVerify:
description: Disable target certificate validation.
type: boolean
keySecret:
description: Secret containing the client key file for the targets.
properties:
key:
description: The key of the secret to select from. Must be
a valid secret key.
type: string
name:
description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
TODO: Add other useful fields. apiVersion, kind, uid?'
type: string
optional:
description: Specify whether the Secret or its key must be
defined
type: boolean
required:
- key
type: object
serverName:
description: Used to verify the hostname for the targets.
type: string
type: object
type: object type: object
required: required:
- spec - spec

View File

@ -8,6 +8,8 @@ metadata:
spec: spec:
group: monitoring.coreos.com group: monitoring.coreos.com
names: names:
categories:
- prometheus-operator
kind: Prometheus kind: Prometheus
listKind: PrometheusList listKind: PrometheusList
plural: prometheuses plural: prometheuses
@ -2199,6 +2201,15 @@ spec:
only clients authorized to perform these actions can do so. For only clients authorized to perform these actions can do so. For
more information see https://prometheus.io/docs/prometheus/latest/querying/api/#tsdb-admin-apis' more information see https://prometheus.io/docs/prometheus/latest/querying/api/#tsdb-admin-apis'
type: boolean type: boolean
enableFeatures:
description: Enable access to Prometheus disabled features. By default,
no features are enabled. Enabling disabled features is entirely
outside the scope of what the maintainers will support and by doing
so, you accept that this behaviour may break at any time without
notice. For more information see https://prometheus.io/docs/prometheus/latest/disabled_features/
items:
type: string
type: array
enforcedNamespaceLabel: enforcedNamespaceLabel:
description: EnforcedNamespaceLabel enforces adding a namespace label description: EnforcedNamespaceLabel enforces adding a namespace label
of origin for each alert and metric that is user created. The label of origin for each alert and metric that is user created. The label
@ -3388,8 +3399,8 @@ spec:
type: string type: string
type: object type: object
podMonitorNamespaceSelector: podMonitorNamespaceSelector:
description: Namespaces to be selected for PodMonitor discovery. If description: Namespace's labels to match for PodMonitor discovery.
nil, only check own namespace. If nil, only check own namespace.
properties: properties:
matchExpressions: matchExpressions:
description: matchExpressions is a list of label selector requirements. description: matchExpressions is a list of label selector requirements.
@ -3682,7 +3693,7 @@ spec:
type: object type: object
type: object type: object
bearerToken: bearerToken:
description: bearer token for remote read. description: Bearer token for remote read.
type: string type: string
bearerTokenFile: bearerTokenFile:
description: File to read bearer token for remote read. description: File to read bearer token for remote read.
@ -3893,11 +3904,32 @@ spec:
type: object type: object
type: object type: object
bearerToken: bearerToken:
description: File to read bearer token for remote write. description: Bearer token for remote write.
type: string type: string
bearerTokenFile: bearerTokenFile:
description: File to read bearer token for remote write. description: File to read bearer token for remote write.
type: string type: string
headers:
additionalProperties:
type: string
description: Custom HTTP headers to be sent along with each
remote write request. Be aware that headers that are set by
Prometheus itself can't be overwritten. Only valid in Prometheus
versions 2.25.0 and newer.
type: object
metadataConfig:
description: MetadataConfig configures the sending of series
metadata to remote storage.
properties:
send:
description: Whether metric metadata is sent to remote storage
or not.
type: boolean
sendInterval:
description: How frequently metric metadata is sent to remote
storage.
type: string
type: object
name: name:
description: The name of the remote write queue, must be unique description: The name of the remote write queue, must be unique
if specified. The name is used in metrics and logging in order if specified. The name is used in metrics and logging in order
@ -4168,7 +4200,8 @@ spec:
(milliseconds seconds minutes hours days weeks years). (milliseconds seconds minutes hours days weeks years).
type: string type: string
retentionSize: retentionSize:
description: Maximum amount of disk space used by blocks. description: 'Maximum amount of disk space used by blocks. Supported
units: B, KB, MB, GB, TB, PB, EB. Ex: `512MB`.'
type: string type: string
routePrefix: routePrefix:
description: The route prefix Prometheus registers HTTP handlers for. description: The route prefix Prometheus registers HTTP handlers for.
@ -4435,7 +4468,7 @@ spec:
to use to run the Prometheus Pods. to use to run the Prometheus Pods.
type: string type: string
serviceMonitorNamespaceSelector: serviceMonitorNamespaceSelector:
description: Namespaces to be selected for ServiceMonitor discovery. description: Namespace's labels to match for ServiceMonitor discovery.
If nil, only check own namespace. If nil, only check own namespace.
properties: properties:
matchExpressions: matchExpressions:
@ -5072,6 +5105,11 @@ spec:
required: required:
- key - key
type: object type: object
tracingConfigFile:
description: TracingConfig specifies the path of the tracing configuration
file. When used alongside with TracingConfig, TracingConfigFile
takes precedence.
type: string
version: version:
description: Version describes the version of Thanos to use. description: Version describes the version of Thanos to use.
type: string type: string

View File

@ -8,6 +8,8 @@ metadata:
spec: spec:
group: monitoring.coreos.com group: monitoring.coreos.com
names: names:
categories:
- prometheus-operator
kind: ServiceMonitor kind: ServiceMonitor
listKind: ServiceMonitorList listKind: ServiceMonitorList
plural: servicemonitors plural: servicemonitors
@ -184,7 +186,9 @@ spec:
type: string type: string
relabelings: relabelings:
description: 'RelabelConfigs to apply to samples before scraping. description: 'RelabelConfigs to apply to samples before scraping.
More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config' Prometheus Operator automatically adds relabelings for a few
standard Kubernetes fields and replaces original scrape job
name with __tmp_prometheus_job_name. More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config'
items: items:
description: 'RelabelConfig allows dynamic rewriting of the description: 'RelabelConfig allows dynamic rewriting of the
label set, being applied to samples before ingestion. It label set, being applied to samples before ingestion. It

View File

@ -8,6 +8,8 @@ metadata:
spec: spec:
group: monitoring.coreos.com group: monitoring.coreos.com
names: names:
categories:
- prometheus-operator
kind: ThanosRuler kind: ThanosRuler
listKind: ThanosRulerList listKind: ThanosRulerList
plural: thanosrulers plural: thanosrulers

View File

@ -4,7 +4,8 @@ metadata:
labels: labels:
app.kubernetes.io/component: controller app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/version: v0.44.1 app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.47.0
name: prometheus-operator name: prometheus-operator
rules: rules:
- apiGroups: - apiGroups:

View File

@ -4,7 +4,8 @@ metadata:
labels: labels:
app.kubernetes.io/component: controller app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/version: v0.44.1 app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.47.0
name: prometheus-operator name: prometheus-operator
roleRef: roleRef:
apiGroup: rbac.authorization.k8s.io apiGroup: rbac.authorization.k8s.io
@ -13,4 +14,4 @@ roleRef:
subjects: subjects:
- kind: ServiceAccount - kind: ServiceAccount
name: prometheus-operator name: prometheus-operator
namespace: monitoring namespace: default

View File

@ -4,27 +4,30 @@ metadata:
labels: labels:
app.kubernetes.io/component: controller app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/version: v0.44.1 app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.47.0
name: prometheus-operator name: prometheus-operator
namespace: monitoring namespace: default
spec: spec:
replicas: 1 replicas: 1
selector: selector:
matchLabels: matchLabels:
app.kubernetes.io/component: controller app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/part-of: kube-prometheus
template: template:
metadata: metadata:
labels: labels:
app.kubernetes.io/component: controller app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/version: v0.44.1 app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.47.0
spec: spec:
containers: containers:
- args: - args:
- --kubelet-service=kube-system/kubelet - --kubelet-service=kube-system/kubelet
- --prometheus-config-reloader=quay.io/prometheus-operator/prometheus-config-reloader:v0.44.1 - --prometheus-config-reloader=quay.io/prometheus-operator/prometheus-config-reloader:v0.47.0
image: quay.io/prometheus-operator/prometheus-operator:v0.44.1 image: quay.io/prometheus-operator/prometheus-operator:v0.47.0
name: prometheus-operator name: prometheus-operator
ports: ports:
- containerPort: 8080 - containerPort: 8080
@ -48,12 +51,19 @@ spec:
ports: ports:
- containerPort: 8443 - containerPort: 8443
name: https name: https
resources:
limits:
cpu: 20m
memory: 40Mi
requests:
cpu: 10m
memory: 20Mi
securityContext: securityContext:
runAsGroup: 65532 runAsGroup: 65532
runAsNonRoot: true runAsNonRoot: true
runAsUser: 65532 runAsUser: 65532
nodeSelector: nodeSelector:
beta.kubernetes.io/os: linux kubernetes.io/os: linux
securityContext: securityContext:
runAsNonRoot: true runAsNonRoot: true
runAsUser: 65534 runAsUser: 65534

View File

@ -0,0 +1,95 @@
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
labels:
app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.47.0
prometheus: k8s
role: alert-rules
name: prometheus-operator-rules
namespace: default
spec:
groups:
- name: prometheus-operator
rules:
- alert: PrometheusOperatorListErrors
annotations:
description: Errors while performing List operations in controller {{$labels.controller}}
in {{$labels.namespace}} namespace.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusoperatorlisterrors
summary: Errors while performing list operations in controller.
expr: |
(sum by (controller,namespace) (rate(prometheus_operator_list_operations_failed_total{job="prometheus-operator",namespace="default"}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_list_operations_total{job="prometheus-operator",namespace="default"}[10m]))) > 0.4
for: 15m
labels:
severity: warning
- alert: PrometheusOperatorWatchErrors
annotations:
description: Errors while performing watch operations in controller {{$labels.controller}}
in {{$labels.namespace}} namespace.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusoperatorwatcherrors
summary: Errors while performing watch operations in controller.
expr: |
(sum by (controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job="prometheus-operator",namespace="default"}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_watch_operations_total{job="prometheus-operator",namespace="default"}[10m]))) > 0.4
for: 15m
labels:
severity: warning
- alert: PrometheusOperatorSyncFailed
annotations:
description: Controller {{ $labels.controller }} in {{ $labels.namespace }}
namespace fails to reconcile {{ $value }} objects.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusoperatorsyncfailed
summary: Last controller reconciliation failed
expr: |
min_over_time(prometheus_operator_syncs{status="failed",job="prometheus-operator",namespace="default"}[5m]) > 0
for: 10m
labels:
severity: warning
- alert: PrometheusOperatorReconcileErrors
annotations:
description: '{{ $value | humanizePercentage }} of reconciling operations
failed for {{ $labels.controller }} controller in {{ $labels.namespace }}
namespace.'
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusoperatorreconcileerrors
summary: Errors while reconciling controller.
expr: |
(sum by (controller,namespace) (rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator",namespace="default"}[5m]))) / (sum by (controller,namespace) (rate(prometheus_operator_reconcile_operations_total{job="prometheus-operator",namespace="default"}[5m]))) > 0.1
for: 10m
labels:
severity: warning
- alert: PrometheusOperatorNodeLookupErrors
annotations:
description: Errors while reconciling Prometheus in {{ $labels.namespace }}
Namespace.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusoperatornodelookuperrors
summary: Errors while reconciling Prometheus.
expr: |
rate(prometheus_operator_node_address_lookup_errors_total{job="prometheus-operator",namespace="default"}[5m]) > 0.1
for: 10m
labels:
severity: warning
- alert: PrometheusOperatorNotReady
annotations:
description: Prometheus operator in {{ $labels.namespace }} namespace isn't
ready to reconcile {{ $labels.controller }} resources.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusoperatornotready
summary: Prometheus operator not ready
expr: |
min by(namespace, controller) (max_over_time(prometheus_operator_ready{job="prometheus-operator",namespace="default"}[5m]) == 0)
for: 5m
labels:
severity: warning
- alert: PrometheusOperatorRejectedResources
annotations:
description: Prometheus operator in {{ $labels.namespace }} namespace rejected
{{ printf "%0.0f" $value }} {{ $labels.controller }}/{{ $labels.resource
}} resources.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/prometheusoperatorrejectedresources
summary: Resources rejected by Prometheus operator
expr: |
min_over_time(prometheus_operator_managed_resources{state="rejected",job="prometheus-operator",namespace="default"}[5m]) > 0
for: 5m
labels:
severity: warning

View File

@ -4,9 +4,10 @@ metadata:
labels: labels:
app.kubernetes.io/component: controller app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/version: v0.44.1 app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.47.0
name: prometheus-operator name: prometheus-operator
namespace: monitoring namespace: default
spec: spec:
clusterIP: None clusterIP: None
ports: ports:
@ -16,3 +17,4 @@ spec:
selector: selector:
app.kubernetes.io/component: controller app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/part-of: kube-prometheus

View File

@ -4,6 +4,7 @@ metadata:
labels: labels:
app.kubernetes.io/component: controller app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/version: v0.44.1 app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.47.0
name: prometheus-operator name: prometheus-operator
namespace: monitoring namespace: default

View File

@ -8,7 +8,7 @@ local statefulSet = k.apps.v1.statefulSet;
local selector = statefulSet.mixin.spec.selectorType; local selector = statefulSet.mixin.spec.selectorType;
local kp = local kp =
(import 'kube-prometheus/kube-prometheus.libsonnet') + (import 'kube-prometheus/main.libsonnet') +
(import 'prometheus-pushgateway/pushgateway.libsonnet') + (import 'prometheus-pushgateway/pushgateway.libsonnet') +
(import 'k3s.libsonnet') (import 'k3s.libsonnet')

View File

@ -1 +0,0 @@
github.com/etcd-io/etcd/Documentation/etcd-mixin

View File

@ -7,7 +7,7 @@
}, },
imageRepos+:: { imageRepos+:: {
grafana: 'grafana/grafana', grafana: 'docker.io/grafana/grafana',
}, },
prometheus+:: { prometheus+:: {
@ -16,6 +16,11 @@
}, },
grafana+:: { grafana+:: {
labels: {
'app.kubernetes.io/name': 'grafana',
'app.kubernetes.io/version': $._config.versions.grafana,
'app.kubernetes.io/component': 'grafana',
},
dashboards: {}, dashboards: {},
rawDashboards: {}, rawDashboards: {},
folderDashboards: {}, folderDashboards: {},
@ -51,6 +56,7 @@
metadata: { metadata: {
name: 'grafana-config', name: 'grafana-config',
namespace: $._config.namespace, namespace: $._config.namespace,
labels: $._config.grafana.labels,
}, },
type: 'Opaque', type: 'Opaque',
data: { data: {
@ -67,6 +73,7 @@
metadata: { metadata: {
name: dashboardName, name: dashboardName,
namespace: $._config.namespace, namespace: $._config.namespace,
labels: $._config.grafana.labels,
}, },
data: { [name]: std.manifestJsonEx($._config.grafana.dashboards[name], ' ') }, data: { [name]: std.manifestJsonEx($._config.grafana.dashboards[name], ' ') },
} }
@ -79,6 +86,7 @@
metadata: { metadata: {
name: dashboardName, name: dashboardName,
namespace: $._config.namespace, namespace: $._config.namespace,
labels: $._config.grafana.labels,
}, },
data: { [name]: std.manifestJsonEx($._config.grafana.folderDashboards[folder][name], ' ') }, data: { [name]: std.manifestJsonEx($._config.grafana.folderDashboards[folder][name], ' ') },
} }
@ -95,6 +103,7 @@
metadata: { metadata: {
name: dashboardName, name: dashboardName,
namespace: $._config.namespace, namespace: $._config.namespace,
labels: $._config.grafana.labels,
}, },
data: { [name]: $._config.grafana.rawDashboards[name] }, data: { [name]: $._config.grafana.rawDashboards[name] },
} }
@ -141,6 +150,7 @@
metadata: { metadata: {
name: 'grafana-dashboards', name: 'grafana-dashboards',
namespace: $._config.namespace, namespace: $._config.namespace,
labels: $._config.grafana.labels,
}, },
data: { 'dashboards.yaml': std.manifestJsonEx(dashboardSources, ' ') }, data: { 'dashboards.yaml': std.manifestJsonEx(dashboardSources, ' ') },
}, },
@ -151,6 +161,7 @@
metadata: { metadata: {
name: 'grafana-datasources', name: 'grafana-datasources',
namespace: $._config.namespace, namespace: $._config.namespace,
labels: $._config.grafana.labels,
}, },
type: 'Opaque', type: 'Opaque',
data: { 'datasources.yaml': std.base64(std.encodeUTF8(std.manifestJsonEx({ data: { 'datasources.yaml': std.base64(std.encodeUTF8(std.manifestJsonEx({
@ -165,13 +176,10 @@
metadata: { metadata: {
name: 'grafana', name: 'grafana',
namespace: $._config.namespace, namespace: $._config.namespace,
labels: { labels: $._config.grafana.labels,
app: 'grafana',
},
}, },
spec: { spec: {
selector: $.grafana.deployment.spec.selector.matchLabels, selector: $.grafana.deployment.spec.selector.matchLabels,
type: 'NodePort',
ports: [ ports: [
{ name: 'http', targetPort: 'http', port: 3000 }, { name: 'http', targetPort: 'http', port: 3000 },
], ],
@ -189,7 +197,12 @@
deployment: deployment:
local targetPort = $._config.grafana.port; local targetPort = $._config.grafana.port;
local portName = 'http'; local portName = 'http';
local podLabels = { app: 'grafana' }; local podLabels = $._config.grafana.labels;
local podSelectorLabels = {
[labelName]: podLabels[labelName]
for labelName in std.objectFields(podLabels)
if !std.setMember(labelName, ['app.kubernetes.io/version'])
};
local configVolumeName = 'grafana-config'; local configVolumeName = 'grafana-config';
local configSecretName = 'grafana-config'; local configSecretName = 'grafana-config';
@ -311,7 +324,7 @@
spec: { spec: {
replicas: 1, replicas: 1,
selector: { selector: {
matchLabels: podLabels, matchLabels: podSelectorLabels,
}, },
template: { template: {
metadata: { metadata: {

View File

@ -10,6 +10,11 @@
// scrape_interval_seconds is the global scrape interval which can be // scrape_interval_seconds is the global scrape interval which can be
// used to dynamically adjust rate windows as a function of the interval. // used to dynamically adjust rate windows as a function of the interval.
scrape_interval_seconds: 30, scrape_interval_seconds: 30,
// Dashboard variable refresh option on Grafana (https://grafana.com/docs/grafana/latest/datasources/prometheus/).
// 0 : Never (Will never refresh the Dashboard variables values)
// 1 : On Dashboard Load (Will refresh Dashboards variables when dashboard are loaded)
// 2 : On Time Range Change (Will refresh Dashboards variables when time range will be changed)
dashboard_var_refresh: 2,
}, },
prometheusAlerts+:: { prometheusAlerts+:: {
@ -202,51 +207,6 @@
summary: 'etcd cluster 99th percentile commit durations are too high.', summary: 'etcd cluster 99th percentile commit durations are too high.',
}, },
}, },
{
alert: 'etcdHighNumberOfFailedHTTPRequests',
expr: |||
sum(rate(etcd_http_failed_total{%(etcd_selector)s, code!="404"}[5m])) without (code) / sum(rate(etcd_http_received_total{%(etcd_selector)s}[5m]))
without (code) > 0.01
||| % $._config,
'for': '10m',
labels: {
severity: 'warning',
},
annotations: {
description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}',
summary: 'etcd has high number of failed HTTP requests.',
},
},
{
alert: 'etcdHighNumberOfFailedHTTPRequests',
expr: |||
sum(rate(etcd_http_failed_total{%(etcd_selector)s, code!="404"}[5m])) without (code) / sum(rate(etcd_http_received_total{%(etcd_selector)s}[5m]))
without (code) > 0.05
||| % $._config,
'for': '10m',
labels: {
severity: 'critical',
},
annotations: {
description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}.',
summary: 'etcd has high number of failed HTTP requests.',
},
},
{
alert: 'etcdHTTPRequestsSlow',
expr: |||
histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m]))
> 0.15
||| % $._config,
'for': '10m',
labels: {
severity: 'warning',
},
annotations: {
description: 'etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow.',
summary: 'etcd instance HTTP requests are slow.',
},
},
{ {
alert: 'etcdBackendQuotaLowSpace', alert: 'etcdBackendQuotaLowSpace',
expr: ||| expr: |||
@ -283,7 +243,7 @@
uid: std.md5('etcd.json'), uid: std.md5('etcd.json'),
title: 'etcd', title: 'etcd',
description: 'etcd sample Grafana dashboard with Prometheus', description: 'etcd sample Grafana dashboard with Prometheus',
tags: [], tags: [ 'etcd-mixin' ],
style: 'dark', style: 'dark',
timezone: 'browser', timezone: 'browser',
editable: true, editable: true,
@ -1332,7 +1292,7 @@
name: 'cluster', name: 'cluster',
options: [], options: [],
query: 'label_values(etcd_server_has_leader, job)', query: 'label_values(etcd_server_has_leader, job)',
refresh: 1, refresh: $._config.dashboard_var_refresh,
regex: '', regex: '',
sort: 2, sort: 2,
tagValuesQuery: '', tagValuesQuery: '',

View File

@ -0,0 +1,57 @@
{
/**
* Creates a [Google Cloud Monitoring target](https://grafana.com/docs/grafana/latest/datasources/cloudmonitoring/)
*
* @name cloudmonitoring.target
*
* @param metric
* @param project
* @param filters (optional)
* @param groupBys (optional)
* @param period (default: `'cloud-monitoring-auto'`)
* @param crossSeriesReducer (default 'REDUCE_MAX')
* @param valueType (default 'INT64')
* @param perSeriesAligner (default 'ALIGN_DELTA')
* @param metricKind (default 'CUMULATIVE')
* @param unit (optional)
* @param alias (optional)
* @return Panel target
*/
target(
metric,
project,
filters=[],
groupBys=[],
period='cloud-monitoring-auto',
crossSeriesReducer='REDUCE_MAX',
valueType='INT64',
perSeriesAligner='ALIGN_DELTA',
metricKind='CUMULATIVE',
unit=1,
alias=null,
):: {
metricQuery: {
[if alias != null then 'aliasBy']: alias,
alignmentPeriod: period,
crossSeriesReducer: crossSeriesReducer,
[if filters != null then 'filters']: filters,
[if groupBys != null then 'groupBys']: groupBys,
metricKind: metricKind,
metricType: metric,
perSeriesAligner: perSeriesAligner,
projectName: project,
unit: unit,
valueType: valueType,
},
sloQuery: {
[if alias != null then 'aliasBy']: alias,
alignmentPeriod: period,
projectName: project,
selectorName: 'select_slo_health',
serviceId: '',
sloId: '',
},
},
}

View File

@ -13,6 +13,9 @@
* @param highResolution (default: `false`) * @param highResolution (default: `false`)
* @param period (default: `'1m'`) * @param period (default: `'1m'`)
* @param dimensions (optional) * @param dimensions (optional)
* @param id (optional)
* @param expression (optional)
* @param hide (optional)
* @return Panel target * @return Panel target
*/ */
@ -26,7 +29,10 @@
alias=null, alias=null,
highResolution=false, highResolution=false,
period='1m', period='1m',
dimensions={} dimensions={},
id=null,
expression=null,
hide=null
):: { ):: {
region: region, region: region,
namespace: namespace, namespace: namespace,
@ -37,5 +43,9 @@
highResolution: highResolution, highResolution: highResolution,
period: period, period: period,
dimensions: dimensions, dimensions: dimensions,
[if id != null then 'id']: id,
[if expression != null then 'expression']: expression,
[if hide != null then 'hide']: hide,
}, },
} }

View File

@ -36,6 +36,7 @@
* @method addMappings(mappings) Adds an array of value mappings. * @method addMappings(mappings) Adds an array of value mappings.
* @method addDataLink(link) Adds a data link. * @method addDataLink(link) Adds a data link.
* @method addDataLinks(links) Adds an array of data links. * @method addDataLinks(links) Adds an array of data links.
* @param timeFrom (optional)
*/ */
new( new(
title, title,
@ -58,6 +59,7 @@
repeat=null, repeat=null,
repeatDirection='h', repeatDirection='h',
repeatMaxPerRow=null, repeatMaxPerRow=null,
timeFrom=null,
pluginVersion='7', pluginVersion='7',
):: { ):: {
@ -71,6 +73,7 @@
[if repeat != null then 'repeat']: repeat, [if repeat != null then 'repeat']: repeat,
[if repeat != null then 'repeatDirection']: repeatDirection, [if repeat != null then 'repeatDirection']: repeatDirection,
[if repeat != null then 'repeatMaxPerRow']: repeatMaxPerRow, [if repeat != null then 'repeatMaxPerRow']: repeatMaxPerRow,
[if timeFrom != null then 'timeFrom']: timeFrom,
// targets // targets
_nextTarget:: 0, _nextTarget:: 0,
@ -138,6 +141,21 @@
fieldConfig+: { defaults+: { links+: [link] } }, fieldConfig+: { defaults+: { links+: [link] } },
}, },
// Overrides
addOverride(
matcher=null,
properties=null,
):: self {
fieldConfig+: {
overrides+: [
{
[if matcher != null then 'matcher']: matcher,
[if properties != null then 'properties']: properties,
},
],
},
},
addOverrides(overrides):: std.foldl(function(p, o) p.addOverride(o.matcher, o.properties), overrides, self),
} else { } else {
options: { options: {

View File

@ -18,6 +18,7 @@
sql:: import 'sql.libsonnet', sql:: import 'sql.libsonnet',
graphite:: import 'graphite.libsonnet', graphite:: import 'graphite.libsonnet',
alertCondition:: import 'alert_condition.libsonnet', alertCondition:: import 'alert_condition.libsonnet',
cloudmonitoring:: import 'cloudmonitoring.libsonnet',
cloudwatch:: import 'cloudwatch.libsonnet', cloudwatch:: import 'cloudwatch.libsonnet',
elasticsearch:: import 'elasticsearch.libsonnet', elasticsearch:: import 'elasticsearch.libsonnet',
heatmapPanel:: import 'heatmap_panel.libsonnet', heatmapPanel:: import 'heatmap_panel.libsonnet',
@ -27,4 +28,5 @@
gaugePanel:: import 'gauge_panel.libsonnet', gaugePanel:: import 'gauge_panel.libsonnet',
barGaugePanel:: import 'bar_gauge_panel.libsonnet', barGaugePanel:: import 'bar_gauge_panel.libsonnet',
statPanel:: import 'stat_panel.libsonnet', statPanel:: import 'stat_panel.libsonnet',
transformation:: import 'transformation.libsonnet',
} }

View File

@ -21,6 +21,7 @@
* @param formatY2 (optional) Unit of the second Y axis * @param formatY2 (optional) Unit of the second Y axis
* @param min (optional) Min of the Y axes * @param min (optional) Min of the Y axes
* @param max (optional) Max of the Y axes * @param max (optional) Max of the Y axes
* @param maxDataPoints (optional) If the data source supports it, sets the maximum number of data points for each series returned.
* @param labelY1 (optional) Label of the first Y axis * @param labelY1 (optional) Label of the first Y axis
* @param labelY2 (optional) Label of the second Y axis * @param labelY2 (optional) Label of the second Y axis
* @param x_axis_mode (default `'time'`) X axis mode, one of [time, series, histogram] * @param x_axis_mode (default `'time'`) X axis mode, one of [time, series, histogram]
@ -57,6 +58,8 @@
* @param value_type (default `'individual'`) Type of tooltip value * @param value_type (default `'individual'`) Type of tooltip value
* @param shared_tooltip (default `true`) Allow to group or spit tooltips on mouseover within a chart * @param shared_tooltip (default `true`) Allow to group or spit tooltips on mouseover within a chart
* @param percentage (defaut: false) show as percentages * @param percentage (defaut: false) show as percentages
* @param interval (defaut: null) A lower limit for the interval.
* *
* @method addTarget(target) Adds a target object. * @method addTarget(target) Adds a target object.
* @method addTargets(targets) Adds an array of targets. * @method addTargets(targets) Adds an array of targets.
@ -126,8 +129,10 @@
value_type='individual', value_type='individual',
shared_tooltip=true, shared_tooltip=true,
percentage=false, percentage=false,
maxDataPoints=null,
time_from=null, time_from=null,
time_shift=null, time_shift=null,
interval=null
):: { ):: {
title: title, title: title,
[if span != null then 'span']: span, [if span != null then 'span']: span,
@ -179,6 +184,7 @@
bars: bars, bars: bars,
stack: stack, stack: stack,
percentage: percentage, percentage: percentage,
[if maxDataPoints != null then 'maxDataPoints']: maxDataPoints,
legend: { legend: {
show: legend_show, show: legend_show,
values: legend_values, values: legend_values,
@ -204,6 +210,7 @@
}, },
timeFrom: time_from, timeFrom: time_from,
timeShift: time_shift, timeShift: time_shift,
[if interval != null then 'interval']: interval,
[if transparent == true then 'transparent']: transparent, [if transparent == true then 'transparent']: transparent,
aliasColors: aliasColors, aliasColors: aliasColors,
repeat: repeat, repeat: repeat,
@ -288,5 +295,19 @@
links+: [link], links+: [link],
}, },
addLinks(links):: std.foldl(function(p, t) p.addLink(t), links, self), addLinks(links):: std.foldl(function(p, t) p.addLink(t), links, self),
addOverride(
matcher=null,
properties=null,
):: self {
fieldConfig+: {
overrides+: [
{
[if matcher != null then 'matcher']: matcher,
[if properties != null then 'properties']: properties,
},
],
},
},
addOverrides(overrides):: std.foldl(function(p, o) p.addOverride(o.matcher, o.properties), overrides, self),
}, },
} }

View File

@ -42,6 +42,7 @@
* @param yBucketBound (default `'auto'`) Which bound ('lower' or 'upper') of the bucket to use * @param yBucketBound (default `'auto'`) Which bound ('lower' or 'upper') of the bucket to use
* @param yBucketNumber (optional) Number of buckets for the Y axis * @param yBucketNumber (optional) Number of buckets for the Y axis
* @param yBucketSize (optional) Size of Y axis buckets. Has priority over yBucketNumber * @param yBucketSize (optional) Size of Y axis buckets. Has priority over yBucketNumber
* @param maxDataPoints (optional) The maximum data points per series. Used directly by some data sources and used in calculation of auto interval. With streaming data this value is used for the rolling buffer.
* *
* @method addTarget(target) Adds a target object. * @method addTarget(target) Adds a target object.
* @method addTargets(targets) Adds an array of targets. * @method addTargets(targets) Adds an array of targets.
@ -83,7 +84,7 @@
yBucketBound='auto', yBucketBound='auto',
yBucketNumber=null, yBucketNumber=null,
yBucketSize=null, yBucketSize=null,
maxDataPoints=null,
):: { ):: {
title: title, title: title,
type: 'heatmap', type: 'heatmap',
@ -135,6 +136,7 @@
yBucketBound: yBucketBound, yBucketBound: yBucketBound,
[if dataFormat == 'timeseries' then 'yBucketNumber']: yBucketNumber, [if dataFormat == 'timeseries' then 'yBucketNumber']: yBucketNumber,
[if dataFormat == 'timeseries' then 'yBucketSize']: yBucketSize, [if dataFormat == 'timeseries' then 'yBucketSize']: yBucketSize,
[if maxDataPoints != null then 'maxDataPoints']: maxDataPoints,
_nextTarget:: 0, _nextTarget:: 0,
addTarget(target):: self { addTarget(target):: self {

View File

@ -7,14 +7,17 @@
* @param rawSql The SQL query * @param rawSql The SQL query
* @param datasource (optional) * @param datasource (optional)
* @param format (default `'time_series'`) * @param format (default `'time_series'`)
* @param alias (optional)
*/ */
target( target(
rawSql, rawSql,
datasource=null, datasource=null,
format='time_series', format='time_series',
alias=null,
):: { ):: {
[if datasource != null then 'datasource']: datasource, [if datasource != null then 'datasource']: datasource,
format: format, format: format,
[if alias != null then 'alias']: alias,
rawSql: rawSql, rawSql: rawSql,
}, },
} }

View File

@ -23,9 +23,10 @@
* @param displayName (optional) Change the field or series name. * @param displayName (optional) Change the field or series name.
* @param noValue (optional) What to show when there is no value. * @param noValue (optional) What to show when there is no value.
* @param thresholdsMode (default `'absolute'`) 'absolute' or 'percentage'. * @param thresholdsMode (default `'absolute'`) 'absolute' or 'percentage'.
* @param timeFrom (optional) Override the relative time range.
* @param repeat (optional) Name of variable that should be used to repeat this panel. * @param repeat (optional) Name of variable that should be used to repeat this panel.
* @param repeatDirection (default `'h'`) 'h' for horizontal or 'v' for vertical. * @param repeatDirection (default `'h'`) 'h' for horizontal or 'v' for vertical.
* @param repeatMaxPerRow (optional) Maximum panels per row in repeat mode. * @param maxPerRow (optional) Maximum panels per row in repeat mode.
* @param pluginVersion (default `'7'`) Plugin version the panel should be modeled for. This has been tested with the default, '7', and '6.7'. * @param pluginVersion (default `'7'`) Plugin version the panel should be modeled for. This has been tested with the default, '7', and '6.7'.
* *
* @method addTarget(target) Adds a target object. * @method addTarget(target) Adds a target object.
@ -59,9 +60,10 @@
displayName=null, displayName=null,
noValue=null, noValue=null,
thresholdsMode='absolute', thresholdsMode='absolute',
timeFrom=null,
repeat=null, repeat=null,
repeatDirection='h', repeatDirection='h',
repeatMaxPerRow=null, maxPerRow=null,
pluginVersion='7', pluginVersion='7',
):: { ):: {
@ -74,7 +76,8 @@
links: [], links: [],
[if repeat != null then 'repeat']: repeat, [if repeat != null then 'repeat']: repeat,
[if repeat != null then 'repeatDirection']: repeatDirection, [if repeat != null then 'repeatDirection']: repeatDirection,
[if repeat != null then 'repeatMaxPerRow']: repeatMaxPerRow, [if timeFrom != null then 'timeFrom']: timeFrom,
[if repeat != null then 'maxPerRow']: maxPerRow,
// targets // targets
_nextTarget:: 0, _nextTarget:: 0,
@ -143,6 +146,22 @@
addDataLink(link):: self { addDataLink(link):: self {
fieldConfig+: { defaults+: { links+: [link] } }, fieldConfig+: { defaults+: { links+: [link] } },
}, },
// Overrides
addOverride(
matcher=null,
properties=null,
):: self {
fieldConfig+: {
overrides+: [
{
[if matcher != null then 'matcher']: matcher,
[if properties != null then 'properties']: properties,
},
],
},
},
addOverrides(overrides):: std.foldl(function(p, o) p.addOverride(o.matcher, o.properties), overrides, self),
} else { } else {
options: { options: {
fieldOptions: { fieldOptions: {

View File

@ -24,6 +24,8 @@
* @method addColumn(field, style) Adds a column * @method addColumn(field, style) Adds a column
* @method hideColumn(field) Hides a column * @method hideColumn(field) Hides a column
* @method addLink(link) Adds a link * @method addLink(link) Adds a link
* @method addTransformation(transformation) Adds a transformation object
* @method addTransformations(transformations) Adds an array of transformations
*/ */
new( new(
title, title,
@ -81,5 +83,9 @@
addLink(link):: self { addLink(link):: self {
links+: [link], links+: [link],
}, },
addTransformation(transformation):: self {
transformations+: [transformation],
},
addTransformations(transformations):: std.foldl(function(p, t) p.addTransformation(t), transformations, self),
}, },
} }

View File

@ -1,6 +1,6 @@
{ {
/** /**
* Creates a [template](https://grafana.com/docs/grafana/latest/variables/templates-and-variables/#templates) that can be added to a dashboard. * Creates a [template](https://grafana.com/docs/grafana/latest/variables/#templates) that can be added to a dashboard.
* *
* @name template.new * @name template.new
* *
@ -18,7 +18,7 @@
* @param multi (default `false`) Whether multiple values can be selected or not from variable value list. * @param multi (default `false`) Whether multiple values can be selected or not from variable value list.
* @param sort (default `0`) `0`: Without Sort, `1`: Alphabetical (asc), `2`: Alphabetical (desc), `3`: Numerical (asc), `4`: Numerical (desc). * @param sort (default `0`) `0`: Without Sort, `1`: Alphabetical (asc), `2`: Alphabetical (desc), `3`: Numerical (asc), `4`: Numerical (desc).
* *
* @return A [template](https://grafana.com/docs/grafana/latest/variables/templates-and-variables/#templates) * @return A [template](https://grafana.com/docs/grafana/latest/variables/#templates)
*/ */
new( new(
name, name,

View File

@ -31,8 +31,10 @@
'7d', '7d',
'30d', '30d',
], ],
nowDelay=null,
):: { ):: {
refresh_intervals: refresh_intervals, refresh_intervals: refresh_intervals,
time_options: time_options, time_options: time_options,
[if nowDelay != null then 'nowDelay']: nowDelay,
}, },
} }

View File

@ -0,0 +1,12 @@
{
/**
* @name transformation.new
*/
new(
id='',
options={}
):: {
id: id,
options: options,
},
}

View File

@ -44,7 +44,7 @@
addMultiTemplate(name, metric_name, label_name, hide=0):: self { addMultiTemplate(name, metric_name, label_name, hide=0):: self {
templating+: { templating+: {
list+: [{ list+: [{
allValue: null, allValue: '.+',
current: { current: {
selected: true, selected: true,
text: 'All', text: 'All',
@ -196,7 +196,7 @@
timeShift: null, timeShift: null,
title: title, title: title,
tooltip: { tooltip: {
shared: true, shared: false,
sort: 0, sort: 0,
value_type: 'individual', value_type: 'individual',
}, },
@ -382,7 +382,7 @@
expr: expr:
||| |||
sum by (status) ( sum by (status) (
label_replace(label_replace(rate(%s[$__interval]), label_replace(label_replace(rate(%s[$__rate_interval]),
"status", "${1}xx", "%s", "([0-9]).."), "status", "${1}xx", "%s", "([0-9]).."),
"status", "${1}", "%s", "([a-z]+)")) "status", "${1}", "%s", "([a-z]+)"))
||| % [selector, statusLabelName, statusLabelName], ||| % [selector, statusLabelName, statusLabelName],
@ -399,7 +399,7 @@
nullPointMode: 'null as zero', nullPointMode: 'null as zero',
targets: [ targets: [
{ {
expr: 'histogram_quantile(0.99, sum(rate(%s_bucket%s[$__interval])) by (le)) * %s' % [metricName, selector, multiplier], expr: 'histogram_quantile(0.99, sum(rate(%s_bucket%s[$__rate_interval])) by (le)) * %s' % [metricName, selector, multiplier],
format: 'time_series', format: 'time_series',
intervalFactor: 2, intervalFactor: 2,
legendFormat: '99th Percentile', legendFormat: '99th Percentile',
@ -407,7 +407,7 @@
step: 10, step: 10,
}, },
{ {
expr: 'histogram_quantile(0.50, sum(rate(%s_bucket%s[$__interval])) by (le)) * %s' % [metricName, selector, multiplier], expr: 'histogram_quantile(0.50, sum(rate(%s_bucket%s[$__rate_interval])) by (le)) * %s' % [metricName, selector, multiplier],
format: 'time_series', format: 'time_series',
intervalFactor: 2, intervalFactor: 2,
legendFormat: '50th Percentile', legendFormat: '50th Percentile',
@ -415,7 +415,7 @@
step: 10, step: 10,
}, },
{ {
expr: 'sum(rate(%s_sum%s[$__interval])) * %s / sum(rate(%s_count%s[$__interval]))' % [metricName, selector, multiplier, metricName, selector], expr: 'sum(rate(%s_sum%s[$__rate_interval])) * %s / sum(rate(%s_count%s[$__rate_interval]))' % [metricName, selector, multiplier, metricName, selector],
format: 'time_series', format: 'time_series',
intervalFactor: 2, intervalFactor: 2,
legendFormat: 'Average', legendFormat: 'Average',

View File

@ -6,6 +6,8 @@ approvers:
- metalmatze - metalmatze
- tomwilkie - tomwilkie
- s-urbaniak - s-urbaniak
- povilasv
- paulfantom
reviewers: reviewers:
- brancz - brancz
@ -13,3 +15,5 @@ reviewers:
- metalmatze - metalmatze
- tomwilkie - tomwilkie
- s-urbaniak - s-urbaniak
- povilasv
- paulfantom

View File

@ -7,15 +7,17 @@ A set of Grafana dashboards and Prometheus alerts for Kubernetes.
## Releases ## Releases
| Release branch | Kubernetes Compatibility | Prometheus Compatibility | | Release branch | Kubernetes Compatibility | Prometheus Compatibility | Kube-state-metrics Compatibility |
| ------- | -------------------------- | ------------------------ | | -------------- | -------------------------- | ------------------------ | -------------------------------- |
| release-0.1 | v1.13 and before | | | release-0.1 | v1.13 and before | | |
| release-0.2 | v1.14.1 and before | v2.11.0+ | | release-0.2 | v1.14.1 and before | v2.11.0+ | |
| release-0.3 | v1.17 and before | v2.11.0+ | | release-0.3 | v1.17 and before | v2.11.0+ | |
| release-0.4 | v1.18 | v2.11.0+ | | release-0.4 | v1.18 | v2.11.0+ | |
| release-0.5 | v1.19 | v2.11.0+ | | release-0.5 | v1.19 | v2.11.0+ | |
| release-0.6 | v1.19+ | v2.11.0+ | | release-0.6 | v1.19+ | v2.11.0+ | |
| master | v1.19+ | v2.11.0+ | | release-0.7 | v1.19+ | v2.11.0+ | v1.x |
| release-0.8 | v1.20+ | v2.11.0+ | v2.0+ |
| master | v1.20+ | v2.11.0+ | v2.0+ |
In Kubernetes 1.14 there was a major [metrics overhaul](https://github.com/kubernetes/enhancements/issues/1206) implemented. In Kubernetes 1.14 there was a major [metrics overhaul](https://github.com/kubernetes/enhancements/issues/1206) implemented.
Therefore v0.1.x of this repository is the last release to support Kubernetes 1.13 and previous version on a best effort basis. Therefore v0.1.x of this repository is the last release to support Kubernetes 1.13 and previous version on a best effort basis.

Some files were not shown because too many files have changed in this diff Show More