
major monitoring upgrade

Ref: pull/452/head
Author: Tobias Brunner, 2 weeks ago
Commit: d26b64b384
1. monitoring/jsonnetfile.json (2 changed lines)
2. monitoring/jsonnetfile.lock.json (60 changed lines)
3. monitoring/k3s.libsonnet (2 changed lines)
4. monitoring/manifests/alertmanager-alertmanager.yaml (23 changed lines)
5. monitoring/manifests/alertmanager-podDisruptionBudget.yaml (18 changed lines)
6. monitoring/manifests/alertmanager-prometheusRule.yaml (156 changed lines)
7. monitoring/manifests/alertmanager-secret.yaml (8 changed lines)
8. monitoring/manifests/alertmanager-service.yaml (9 changed lines)
9. monitoring/manifests/alertmanager-serviceAccount.yaml (8 changed lines)
10. monitoring/manifests/alertmanager-serviceMonitor.yaml (10 changed lines)
11. monitoring/manifests/alertmanager-tbrnt-config-secret.yaml (3 changed lines)
12. monitoring/manifests/grafana-dashboardDatasources.yaml (9 changed lines)
13. monitoring/manifests/grafana-dashboardDefinitions.yaml (6093 changed lines)
14. monitoring/manifests/grafana-dashboardSources.yaml (7 changed lines)
15. monitoring/manifests/grafana-deployment.yaml (30 changed lines)
16. monitoring/manifests/grafana-service.yaml (12 changed lines)
17. monitoring/manifests/grafana-serviceAccount.yaml (2 changed lines)
18. monitoring/manifests/grafana-serviceMonitor.yaml (9 changed lines)
19. monitoring/manifests/healthchecks-io-secret.yaml (1 changed line)
20. monitoring/manifests/kube-state-metrics-clusterRole.yaml (22 changed lines)
21. monitoring/manifests/kube-state-metrics-clusterRoleBinding.yaml (6 changed lines)
22. monitoring/manifests/kube-state-metrics-deployment.yaml (39 changed lines)
23. monitoring/manifests/kube-state-metrics-prometheusRule.yaml (46 changed lines)
24. monitoring/manifests/kube-state-metrics-service.yaml (8 changed lines)
25. monitoring/manifests/kube-state-metrics-serviceAccount.yaml (6 changed lines)
26. monitoring/manifests/kube-state-metrics-serviceMonitor.yaml (8 changed lines)
27. monitoring/manifests/node-exporter-clusterRole.yaml (5 changed lines)
28. monitoring/manifests/node-exporter-clusterRoleBinding.yaml (7 changed lines)
29. monitoring/manifests/node-exporter-daemonset.yaml (24 changed lines)
30. monitoring/manifests/node-exporter-prometheusRule.yaml (301 changed lines)
31. monitoring/manifests/node-exporter-service.yaml (8 changed lines)
32. monitoring/manifests/node-exporter-serviceAccount.yaml (7 changed lines)
33. monitoring/manifests/node-exporter-serviceMonitor.yaml (8 changed lines)
34. monitoring/manifests/prometheus-adapter-apiService.yaml (7 changed lines)
35. monitoring/manifests/prometheus-adapter-clusterRole.yaml (5 changed lines)
36. monitoring/manifests/prometheus-adapter-clusterRoleAggregatedMetricsReader.yaml (4 changed lines)
37. monitoring/manifests/prometheus-adapter-clusterRoleBinding.yaml (7 changed lines)
38. monitoring/manifests/prometheus-adapter-clusterRoleBindingDelegator.yaml (7 changed lines)
39. monitoring/manifests/prometheus-adapter-clusterRoleServerResources.yaml (5 changed lines)
40. monitoring/manifests/prometheus-adapter-configMap.yaml (15 changed lines)
41. monitoring/manifests/prometheus-adapter-deployment.yaml (24 changed lines)
42. monitoring/manifests/prometheus-adapter-roleBindingAuthReader.yaml (7 changed lines)
43. monitoring/manifests/prometheus-adapter-service.yaml (11 changed lines)
44. monitoring/manifests/prometheus-adapter-serviceAccount.yaml (7 changed lines)
45. monitoring/manifests/prometheus-adapter-serviceMonitor.yaml (11 changed lines)
46. monitoring/manifests/prometheus-clusterRole.yaml (5 changed lines)
47. monitoring/manifests/prometheus-clusterRoleBinding.yaml (7 changed lines)
48. monitoring/manifests/prometheus-operator-serviceMonitor.yaml (8 changed lines)
49. monitoring/manifests/prometheus-podDisruptionBudget.yaml (18 changed lines)
50. monitoring/manifests/prometheus-prometheus.yaml (24 changed lines)
51. monitoring/manifests/prometheus-prometheusRule.yaml (256 changed lines)
52. monitoring/manifests/prometheus-roleBindingConfig.yaml (9 changed lines)
53. monitoring/manifests/prometheus-roleBindingSpecificNamespaces.yaml (49 changed lines)
54. monitoring/manifests/prometheus-roleConfig.yaml (7 changed lines)
55. monitoring/manifests/prometheus-roleSpecificNamespaces.yaml (61 changed lines)
56. monitoring/manifests/prometheus-rules.yaml (2220 changed lines)
57. monitoring/manifests/prometheus-service.yaml (9 changed lines)
58. monitoring/manifests/prometheus-serviceAccount.yaml (7 changed lines)
59. monitoring/manifests/prometheus-serviceMonitor.yaml (12 changed lines)
60. monitoring/manifests/prometheus-serviceMonitorApiserver.yaml (74 changed lines)
61. monitoring/manifests/prometheus-serviceMonitorCoreDNS.yaml (19 changed lines)
62. monitoring/manifests/prometheus-serviceMonitorKubeControllerManager.yaml (14 changed lines)
63. monitoring/manifests/prometheus-serviceMonitorKubeScheduler.yaml (14 changed lines)
64. monitoring/manifests/prometheus-serviceMonitorKubelet.yaml (90 changed lines)
65. monitoring/manifests/setup/0namespace-namespace.yaml (2 changed lines)
66. monitoring/manifests/setup/0namespace-prometheusRule.yaml (76 changed lines)
67. monitoring/manifests/setup/prometheus-operator-0alertmanagerConfigCustomResourceDefinition.yaml (70 changed lines)
68. monitoring/manifests/setup/prometheus-operator-0alertmanagerCustomResourceDefinition.yaml (2 changed lines)
69. monitoring/manifests/setup/prometheus-operator-0podmonitorCustomResourceDefinition.yaml (8 changed lines)
70. monitoring/manifests/setup/prometheus-operator-0probeCustomResourceDefinition.yaml (216 changed lines)
71. monitoring/manifests/setup/prometheus-operator-0prometheusCustomResourceDefinition.yaml (50 changed lines)
72. monitoring/manifests/setup/prometheus-operator-0servicemonitorCustomResourceDefinition.yaml (6 changed lines)
73. monitoring/manifests/setup/prometheus-operator-0thanosrulerCustomResourceDefinition.yaml (2 changed lines)
74. monitoring/manifests/setup/prometheus-operator-clusterRole.yaml (3 changed lines)
75. monitoring/manifests/setup/prometheus-operator-clusterRoleBinding.yaml (5 changed lines)
76. monitoring/manifests/setup/prometheus-operator-deployment.yaml (22 changed lines)
77. monitoring/manifests/setup/prometheus-operator-prometheusRule.yaml (95 changed lines)
78. monitoring/manifests/setup/prometheus-operator-service.yaml (6 changed lines)
79. monitoring/manifests/setup/prometheus-operator-serviceAccount.yaml (5 changed lines)
80. monitoring/monitoring.jsonnet (2 changed lines)
81. monitoring/vendor/etcd-mixin (1 changed line)
82. monitoring/vendor/github.com/brancz/kubernetes-grafana/grafana/grafana.libsonnet (27 changed lines)
83. monitoring/vendor/github.com/etcd-io/etcd/contrib/mixin/README.md (0 changed lines)
84. monitoring/vendor/github.com/etcd-io/etcd/contrib/mixin/mixin.libsonnet (54 changed lines)
85. monitoring/vendor/github.com/etcd-io/etcd/contrib/mixin/test.yaml (0 changed lines)
86. monitoring/vendor/github.com/grafana/grafonnet-lib/grafonnet/cloudmonitoring.libsonnet (57 changed lines)
87. monitoring/vendor/github.com/grafana/grafonnet-lib/grafonnet/cloudwatch.libsonnet (12 changed lines)
88. monitoring/vendor/github.com/grafana/grafonnet-lib/grafonnet/gauge_panel.libsonnet (18 changed lines)
89. monitoring/vendor/github.com/grafana/grafonnet-lib/grafonnet/grafana.libsonnet (2 changed lines)
90. monitoring/vendor/github.com/grafana/grafonnet-lib/grafonnet/graph_panel.libsonnet (21 changed lines)
91. monitoring/vendor/github.com/grafana/grafonnet-lib/grafonnet/heatmap_panel.libsonnet (4 changed lines)
92. monitoring/vendor/github.com/grafana/grafonnet-lib/grafonnet/sql.libsonnet (3 changed lines)
93. monitoring/vendor/github.com/grafana/grafonnet-lib/grafonnet/stat_panel.libsonnet (25 changed lines)
94. monitoring/vendor/github.com/grafana/grafonnet-lib/grafonnet/table_panel.libsonnet (6 changed lines)
95. monitoring/vendor/github.com/grafana/grafonnet-lib/grafonnet/template.libsonnet (4 changed lines)
96. monitoring/vendor/github.com/grafana/grafonnet-lib/grafonnet/timepicker.libsonnet (2 changed lines)
97. monitoring/vendor/github.com/grafana/grafonnet-lib/grafonnet/transformation.libsonnet (12 changed lines)
98. monitoring/vendor/github.com/grafana/jsonnet-libs/grafana-builder/grafana.libsonnet (12 changed lines)
99. monitoring/vendor/github.com/kubernetes-monitoring/kubernetes-mixin/OWNERS (4 changed lines)
100. monitoring/vendor/github.com/kubernetes-monitoring/kubernetes-mixin/README.md (20 changed lines)

2
monitoring/jsonnetfile.json

@ -8,7 +8,7 @@
"subdir": "jsonnet/kube-prometheus"
}
},
"version": "master",
"version": "main",
"name": "kube-prometheus"
},
{

60
monitoring/jsonnetfile.lock.json

@ -8,18 +8,18 @@
"subdir": "grafana"
}
},
"version": "8024f4fdaeb3a3a7d72f77e2ed87deb92c79aeda",
"sum": "WXrJQtWuU5lJVc4jXkJGddPMpPP0+4eMcIB5cauZGgM="
"version": "8ea4e7bc04b1bf5e9bd99918ca28c6271b42be0e",
"sum": "muenICtKXABk6MZZHCZD2wCbmtiE96GwWRMGa1Rg+wA="
},
{
"source": {
"git": {
"remote": "https://github.com/etcd-io/etcd",
"subdir": "Documentation/etcd-mixin"
"subdir": "contrib/mixin"
}
},
"version": "ca866c02422ff3f3d1f0876898a30c33dd7bcccf",
"sum": "bLqTqEr0jky9zz5MV/7ucn6H5mph2NlXas0TVnGNB1Y="
"version": "562d645ac923388ff5b8d270b0536764d34b0e0f",
"sum": "W/Azptf1PoqjyMwJON96UY69MFugDA4IAYiKURscryc="
},
{
"source": {
@ -28,8 +28,8 @@
"subdir": "grafonnet"
}
},
"version": "356bd73e4792ffe107725776ca8946895969c191",
"sum": "CSMZ3dJrpJpwvffie8BqcfrIVVwiKNqdPEN+1XWRBGU="
"version": "55cf4ee53ced2b6d3ce96ecce9fb813b4465be98",
"sum": "4/sUV0Kk+o8I+wlYxL9R6EPhL/NiLfYHk+NXlU64RUk="
},
{
"source": {
@ -38,8 +38,8 @@
"subdir": "grafana-builder"
}
},
"version": "216bc806bb512f218e3cf5ed3d4f5699b07f04d6",
"sum": "9/eJqljTTtJeq9QRjabdKWL6yD8a7VzLmGKBK3ir77k="
"version": "dbf1211d003d20c7adcdee942c477e648507a398",
"sum": "GRf2GvwEU4jhXV+JOonXSZ4wdDv8mnHBPCQ6TUVd+g8="
},
{
"source": {
@ -59,8 +59,8 @@
"subdir": ""
}
},
"version": "ead45674dba3c8712e422d99223453177aac6bf4",
"sum": "3i0NkntlBluDS1NRF+iSc2e727Alkv3ziuVjAP12/kE="
"version": "c67c0f19e869f1da34d79b6507c1fa37c23a6e4e",
"sum": "F+RxcI26zeoeI81uot39Jv6IpQ6BOz+xlSHlElJYsz8="
},
{
"source": {
@ -69,7 +69,7 @@
"subdir": "lib/promgrafonnet"
}
},
"version": "ead45674dba3c8712e422d99223453177aac6bf4",
"version": "39a9cda705b5201c35105bd1f24c83923fa839ef",
"sum": "zv7hXGui6BfHzE9wPatHI/AGZa4A2WKo6pq7ZdqBsps="
},
{
@ -79,8 +79,8 @@
"subdir": "jsonnet/kube-state-metrics"
}
},
"version": "89aaf6c524ee891140c4c8f2a05b1b16f5847309",
"sum": "zD/pbQLnQq+5hegEelaheHS8mn1h09GTktFO74iwlBI="
"version": "b1889aa1561ee269f628e2b9659155e7714dbbf0",
"sum": "S5qI+PJUdNeYOv76jH5nxwYS9N6U7CRxvyuB1wI4cTE="
},
{
"source": {
@ -89,7 +89,7 @@
"subdir": "jsonnet/kube-state-metrics-mixin"
}
},
"version": "7bdd62593c9273b5179cf3c9d2d819e9d997aaa4",
"version": "b1889aa1561ee269f628e2b9659155e7714dbbf0",
"sum": "Yf8mNAHrV1YWzrdV8Ry5dJ8YblepTGw3C0Zp10XIYLo="
},
{
@ -99,8 +99,8 @@
"subdir": "jsonnet/kube-prometheus"
}
},
"version": "7d7d40b4dee70ecd3328dcdee2ed0cc8f806df93",
"sum": "6PhhQPWilq4skfe+z/hXKEg1pRqHnwvMR1Au6W136U0="
"version": "5b2740d517095a6ae9ad51bcb9c53e5ef28c62a0",
"sum": "+6VkkR44AC3Qnwfr9cWYCKs+uRi5JaIOda/3X1JEzAg="
},
{
"source": {
@ -109,8 +109,9 @@
"subdir": "jsonnet/mixin"
}
},
"version": "117c9a2cd905479022a66ddd92a41f599cccf10d",
"sum": "6reUygVmQrLEWQzTKcH8ceDbvM+2ztK3z2VBR2K2l+U="
"version": "b7ca32169844f0b5143f3e5e318fc05fa025df18",
"sum": "6reUygVmQrLEWQzTKcH8ceDbvM+2ztK3z2VBR2K2l+U=",
"name": "prometheus-operator-mixin"
},
{
"source": {
@ -119,8 +120,8 @@
"subdir": "jsonnet/prometheus-operator"
}
},
"version": "d8b7d3766225908d0239fd0d78258892cd0fc384",
"sum": "Nl+N/h76bzD9tZ8tx7tuNIKHwCIJ9zyOsAWplH8HvAE="
"version": "b7ca32169844f0b5143f3e5e318fc05fa025df18",
"sum": "MRwyChXdKG3anL2OWpbUu3qWc97w9J6YsjUWjLFQyB0="
},
{
"source": {
@ -129,8 +130,8 @@
"subdir": "doc/alertmanager-mixin"
}
},
"version": "193ebba04d1e70d971047e983a0b489112610460",
"sum": "QcftU7gjCQyj7B6M4YJeCAeaPd0kwxd4J4rolo7AnLE=",
"version": "99f64e944b1043c790784cf5373c8fb349816fc4",
"sum": "V8jcZQ1Qrlm7AQ6wjbuQQsacPb0NvrcZovKyplmzW5w=",
"name": "alertmanager"
},
{
@ -140,8 +141,8 @@
"subdir": "docs/node-mixin"
}
},
"version": "8b466360a35581e0301bd22918be7011cf4203c3",
"sum": "rvyiD/yCB4BeYAWqYF53bP8c+aCUt2ipLHW2Ea8ELO8="
"version": "b597c1244d7bef49e6f3359c87a56dd7707f6719",
"sum": "cZTNXQMUCLB5FGYpMn845dcqGdkcYt58qCqOFIV/BoQ="
},
{
"source": {
@ -150,8 +151,8 @@
"subdir": "documentation/prometheus-mixin"
}
},
"version": "26d89b4b0776fe4cd5a3656dfa520f119a375273",
"sum": "1VRVMuxAEZ9vdGHFlndmG9iQzDD6AoIXrX80CDpGDaU=",
"version": "3cafc58827d1ebd1a67749f88be4218f0bab3d8d",
"sum": "VK0c3sQ3ksiM6JQsAVfWmL5NbzGv9llMfXFNXfFdJ+A=",
"name": "prometheus"
},
{
@ -161,8 +162,9 @@
"subdir": "mixin"
}
},
"version": "37e6ef61566c7c70793ba6d128f00c4c66cb2402",
"sum": "OptiWUMOHFrRGTZhSfxV1RCeXZ90qsefGNTD4lDYVG0="
"version": "ba6c5c4726ff52807c7383c68f2159b1af7980bb",
"sum": "XP3uq7xcfKHsnWsz1v992csZhhZR3jQma6hFOfSViTs=",
"name": "thanos-mixin"
},
{
"source": {

2
monitoring/k3s.libsonnet

@ -72,7 +72,7 @@ local masterIP = '185.95.218.11';
{
port: 'http-metrics',
interval: '30s',
metricRelabelings: (import 'kube-prometheus/dropping-deprecated-metrics-relabelings.libsonnet') + [
metricRelabelings: (import 'kube-prometheus/addons/dropping-deprecated-metrics-relabelings.libsonnet') + [
{
sourceLabels: ['__name__'],
regex: 'etcd_(debugging|disk|request|server).*',
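The import move above tracks kube-prometheus relocating its helper snippets into an addons/ directory. Below is a minimal sketch of how that relabeling list is consumed; the layout assumes kube-prometheus release-0.8, the endpoint fields mirror k3s.libsonnet above, and the drop action is an assumption since the diff context is truncated.

```jsonnet
// Sketch only (assumption: release-0.8 layout, helper libraries under addons/).
local droppedMetrics = import 'kube-prometheus/addons/dropping-deprecated-metrics-relabelings.libsonnet';

{
  // ServiceMonitor endpoint fragment, fields as in k3s.libsonnet above.
  endpoint:: {
    port: 'http-metrics',
    interval: '30s',
    // Start from the upstream relabelings and append a project-specific rule.
    metricRelabelings: droppedMetrics + [
      {
        sourceLabels: ['__name__'],
        regex: 'etcd_(debugging|disk|request|server).*',
        action: 'drop',  // assumed; not visible in the truncated hunk above
      },
    ],
  },
}
```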

23
monitoring/manifests/alertmanager-alertmanager.yaml

@ -3,17 +3,34 @@ kind: Alertmanager
metadata:
labels:
alertmanager: main
app.kubernetes.io/component: alert-router
app.kubernetes.io/name: alertmanager
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.21.0
name: main
namespace: monitoring
namespace: default
spec:
configSecret: alertmanager-tbrnt-config
image: quay.io/prometheus/alertmanager:v0.21.0
nodeSelector:
kubernetes.io/os: linux
replicas: 1
podMetadata:
labels:
app.kubernetes.io/component: alert-router
app.kubernetes.io/name: alertmanager
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.21.0
replicas: 3
resources:
limits:
cpu: 100m
memory: 100Mi
requests:
cpu: 4m
memory: 100Mi
securityContext:
fsGroup: 2000
runAsNonRoot: true
runAsUser: 1000
serviceAccountName: alertmanager-main
version: v0.21.0
version: 0.21.0
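Note that the regenerated manifests now carry namespace: default instead of monitoring. In the release-0.8 jsonnet layout the target namespace is normally set through the values object; the snippet below is a hypothetical sketch of that knob, not part of this commit, and the values.common.namespace key is an assumption about the release-0.8 layout.

```jsonnet
// Hypothetical sketch, not part of this commit: with the release-0.8 layout,
// values.common.namespace controls the namespace stamped into every manifest.
// Leaving it unset appears to fall back to "default", matching the diffs above.
local kp = (import 'kube-prometheus/main.libsonnet') + {
  values+:: {
    common+: {
      namespace: 'monitoring',  // set this to keep the stack in the monitoring namespace
    },
  },
};

// e.g. render the Alertmanager object shown above
kp.alertmanager.alertmanager
```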

18
monitoring/manifests/alertmanager-podDisruptionBudget.yaml

@ -0,0 +1,18 @@
apiVersion: policy/v1beta1
kind: PodDisruptionBudget
metadata:
labels:
app.kubernetes.io/component: alert-router
app.kubernetes.io/name: alertmanager
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.21.0
name: alertmanager-main
namespace: default
spec:
maxUnavailable: 1
selector:
matchLabels:
alertmanager: main
app.kubernetes.io/component: alert-router
app.kubernetes.io/name: alertmanager
app.kubernetes.io/part-of: kube-prometheus

156
monitoring/manifests/alertmanager-prometheusRule.yaml

@ -0,0 +1,156 @@
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
labels:
app.kubernetes.io/component: alert-router
app.kubernetes.io/name: alertmanager
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.21.0
prometheus: k8s
role: alert-rules
name: alertmanager-main-rules
namespace: default
spec:
groups:
- name: alertmanager.rules
rules:
- alert: AlertmanagerFailedReload
annotations:
description: Configuration has failed to load for {{ $labels.namespace }}/{{
$labels.pod}}.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/alertmanagerfailedreload
summary: Reloading an Alertmanager configuration has failed.
expr: |
# Without max_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
max_over_time(alertmanager_config_last_reload_successful{job="alertmanager-main",namespace="default"}[5m]) == 0
for: 10m
labels:
severity: critical
- alert: AlertmanagerMembersInconsistent
annotations:
description: Alertmanager {{ $labels.namespace }}/{{ $labels.pod}} has only
found {{ $value }} members of the {{$labels.job}} cluster.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/alertmanagermembersinconsistent
summary: A member of an Alertmanager cluster has not found all other cluster
members.
expr: |
# Without max_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
max_over_time(alertmanager_cluster_members{job="alertmanager-main",namespace="default"}[5m])
< on (namespace,service) group_left
count by (namespace,service) (max_over_time(alertmanager_cluster_members{job="alertmanager-main",namespace="default"}[5m]))
for: 10m
labels:
severity: critical
- alert: AlertmanagerFailedToSendAlerts
annotations:
description: Alertmanager {{ $labels.namespace }}/{{ $labels.pod}} failed
to send {{ $value | humanizePercentage }} of notifications to {{ $labels.integration
}}.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/alertmanagerfailedtosendalerts
summary: An Alertmanager instance failed to send notifications.
expr: |
(
rate(alertmanager_notifications_failed_total{job="alertmanager-main",namespace="default"}[5m])
/
rate(alertmanager_notifications_total{job="alertmanager-main",namespace="default"}[5m])
)
> 0.01
for: 5m
labels:
severity: warning
- alert: AlertmanagerClusterFailedToSendAlerts
annotations:
description: The minimum notification failure rate to {{ $labels.integration
}} sent from any instance in the {{$labels.job}} cluster is {{ $value |
humanizePercentage }}.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/alertmanagerclusterfailedtosendalerts
summary: All Alertmanager instances in a cluster failed to send notifications
to a critical integration.
expr: |
min by (namespace,service, integration) (
rate(alertmanager_notifications_failed_total{job="alertmanager-main",namespace="default", integration=~`.*`}[5m])
/
rate(alertmanager_notifications_total{job="alertmanager-main",namespace="default", integration=~`.*`}[5m])
)
> 0.01
for: 5m
labels:
severity: critical
- alert: AlertmanagerClusterFailedToSendAlerts
annotations:
description: The minimum notification failure rate to {{ $labels.integration
}} sent from any instance in the {{$labels.job}} cluster is {{ $value |
humanizePercentage }}.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/alertmanagerclusterfailedtosendalerts
summary: All Alertmanager instances in a cluster failed to send notifications
to a non-critical integration.
expr: |
min by (namespace,service, integration) (
rate(alertmanager_notifications_failed_total{job="alertmanager-main",namespace="default", integration!~`.*`}[5m])
/
rate(alertmanager_notifications_total{job="alertmanager-main",namespace="default", integration!~`.*`}[5m])
)
> 0.01
for: 5m
labels:
severity: warning
- alert: AlertmanagerConfigInconsistent
annotations:
description: Alertmanager instances within the {{$labels.job}} cluster have
different configurations.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/alertmanagerconfiginconsistent
summary: Alertmanager instances within the same cluster have different configurations.
expr: |
count by (namespace,service) (
count_values by (namespace,service) ("config_hash", alertmanager_config_hash{job="alertmanager-main",namespace="default"})
)
!= 1
for: 20m
labels:
severity: critical
- alert: AlertmanagerClusterDown
annotations:
description: '{{ $value | humanizePercentage }} of Alertmanager instances
within the {{$labels.job}} cluster have been up for less than half of the
last 5m.'
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/alertmanagerclusterdown
summary: Half or more of the Alertmanager instances within the same cluster
are down.
expr: |
(
count by (namespace,service) (
avg_over_time(up{job="alertmanager-main",namespace="default"}[5m]) < 0.5
)
/
count by (namespace,service) (
up{job="alertmanager-main",namespace="default"}
)
)
>= 0.5
for: 5m
labels:
severity: critical
- alert: AlertmanagerClusterCrashlooping
annotations:
description: '{{ $value | humanizePercentage }} of Alertmanager instances
within the {{$labels.job}} cluster have restarted at least 5 times in the
last 10m.'
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/alertmanagerclustercrashlooping
summary: Half or more of the Alertmanager instances within the same cluster
are crashlooping.
expr: |
(
count by (namespace,service) (
changes(process_start_time_seconds{job="alertmanager-main",namespace="default"}[10m]) > 4
)
/
count by (namespace,service) (
up{job="alertmanager-main",namespace="default"}
)
)
>= 0.5
for: 5m
labels:
severity: critical

8
monitoring/manifests/alertmanager-secret.yaml

@ -1,8 +1,14 @@
apiVersion: v1
kind: Secret
metadata:
labels:
alertmanager: main
app.kubernetes.io/component: alert-router
app.kubernetes.io/name: alertmanager
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.21.0
name: alertmanager-main
namespace: monitoring
namespace: default
stringData:
alertmanager.yaml: |-
"global":

9
monitoring/manifests/alertmanager-service.yaml

@ -3,8 +3,12 @@ kind: Service
metadata:
labels:
alertmanager: main
app.kubernetes.io/component: alert-router
app.kubernetes.io/name: alertmanager
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.21.0
name: alertmanager-main
namespace: monitoring
namespace: default
spec:
ports:
- name: web
@ -13,4 +17,7 @@ spec:
selector:
alertmanager: main
app: alertmanager
app.kubernetes.io/component: alert-router
app.kubernetes.io/name: alertmanager
app.kubernetes.io/part-of: kube-prometheus
sessionAffinity: ClientIP

8
monitoring/manifests/alertmanager-serviceAccount.yaml

@ -1,5 +1,11 @@
apiVersion: v1
kind: ServiceAccount
metadata:
labels:
alertmanager: main
app.kubernetes.io/component: alert-router
app.kubernetes.io/name: alertmanager
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.21.0
name: alertmanager-main
namespace: monitoring
namespace: default

10
monitoring/manifests/alertmanager-serviceMonitor.yaml

@ -2,9 +2,12 @@ apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
labels:
k8s-app: alertmanager
app.kubernetes.io/component: alert-router
app.kubernetes.io/name: alertmanager
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.21.0
name: alertmanager
namespace: monitoring
namespace: default
spec:
endpoints:
- interval: 30s
@ -12,3 +15,6 @@ spec:
selector:
matchLabels:
alertmanager: main
app.kubernetes.io/component: alert-router
app.kubernetes.io/name: alertmanager
app.kubernetes.io/part-of: kube-prometheus

3
monitoring/manifests/alertmanager-tbrnt-config-secret.yaml

@ -6,11 +6,12 @@ metadata:
namespace: monitoring
spec:
encryptedData:
alertmanager.yaml: AgDHh1Qgrdffl6IFXJuk3ZzNHGARWZLDzbMLKp5Wo/ZYqclCji4T5wx7Fql6FALCvvUBvThxLfiwM2LQHRcWfWnf5AwxipCxpos9oVvlu4aON0WZd/Kjz/ZwDq5pgR/etCrSA2DYNxYq4vnTxUAk29eY5F4dWwRPcCgVZ5/KsTIcNx4x+4einqQbwAhkUtAwQl5fEPmpwNRquJZM29XIEUxZLWne0YmCmEgNGleUU20ByfYMwgtDJGjgr6XNPtTmByAHVrxNuQwAMxgT6GcfDLCNsByaS6CY3wmSTg1GUv/CG8Xx29FwDWyf1Ly2KbmcAAafN5QJGvCCTEt/WB85GtzQisrWFZTykv3Zjuz101p9ShXQZALylaX5h22hHFXuQyiIQZEeM2ixiYQjcPhiPjx1/hkbQ25QRD73/gjalZO8bprDrJxkLlw+hrgJ0LzxWL881U6INLKow+8/GmLleFhMUXRsGqacLreCIAr4uVGEMGMVLhHJKnj597HRnn0UCxVNkDk8QjHyiVgJBrQ3Pz9SFdF7mxvJ9F4rEgGkE4dvfvWxrZFumTLEkVRF9To+rKxsIVkewvoHtN/gMzFMzumP+fz/oB9yAHsxkwVyfqXBg52hNSYIx5Z/67yy3hDRKPBcZgknf9S+F37ET5BABFxazwG8NJjf4td+UsAGuAMzKI/94u7TxuXLPCs/tIGKD7kJnPxAqpalepzABtVCmOrtWwNPb1h4XeuraUS9beJ2zV9oV5nVFJmX94EJ7qpZt0Um7+GGeavQ5SV3XHRolDS5PpZPTAWnc/1rtZ0nsKk8lllEr3aDWveMXma06NKkIXz8+iAonvHsDZuw0W6jUdUUtraIbSua9YkyugqCBGeeXIPLwFxqJTqIX5vedZVMveFiaxtCJjL48SUGxtyugfiYbPa3xpHWWe22BcJyTmAOG9aIq4Tp4nvftLyvWe7c9PotJk/7gdv1IO4RLx//eLtKWw0uADa0ara4hDuI8Yktlti24TlA9XYz00d5WtE+lJsSZN8547BUfFzXSOZSSbfrFLZmEmBTgkbj4szX19bXSctJN3BtOmRfCEPXYQN10HgnhpwqYHbXKUSTZNWLojnFL1/E56wUXGxRg9NGOwSXzTyfoLGxI9NEQpGc0Rj2Wna+JSUhlAUnfYW1eH8yyg5FfkyhQdyZJFvYfF0rk+XG5XNhLumST19uxrAkMWhk+Z9/eWwOaZQMmDcoi2Rs0za+1GGjPW5k56Ip+spwW5cvYmdl1PgkZ4g1mupjiB0FdgZHGR+kGn1lbPtSUd+amh9PXSDWkqfnix62H7374rQ3ZyG7fs9sQNnnRrd/cDCMxAl5Upk8D9dfxRmvuxRd8b89h7EQwUBML7TIriA2Pci5Ftux2R5wyIXjznLC5/kFZg6/Av3uKmKK6dLR2Ooey7/3g14CEjMumdijjySl8Pd2UUxSKVKD7vkq+3xYm0CJZqVvT/iBOccrv0UEiTHBsXrfaugUvqIKTAGYhJy0fUBXKisPdA0HdzrUmx57Du36TGyuEzGtVuDarcWzQYPqKJxOIuofJ+AGTDY53OjdUJ8pwJD6HDz55tu85gaV6ZOvSYqjqeX2FUe7lPhsGUIh/FemfichpypHyFpPYhkwAIO1AinKvsqjUuDXE6n5b7NMbI1gl87fPqT5wUSKXZqwViyFqUA5DFqPTEqvHIGU5Wz0GajEaQ==
alertmanager.yaml: AgBj3KqLiF7EAnGK6c4+Thferv3Sur+fhlwE4wpXD1PBtwlJQsMCqjsLRRFNAESH9/8vhI9E9D8wLJiauNS7CGw5jd5KU1cvxo5EyGeFoVyAB4bHSy/pxptSFq+rn00E99/Tqkbdsgduwusfpi0I9F1+zNucyyamJsEzIcsyHMlBbACz+9KQV9SdbgVEmIeqabrAP9VQaQ+i69yurhPdV7VkZzr0WKcGg3x27+slmtjlJz5fwtv1qmbYt/MQnijF2tc6tJeq19Cm0O4zuQ09meW6DwAZ9SOIFU6LxrqJlKbuleaWmfIE3AQYA6Z+qXyBjT1ILW36RwGyg3YK7nm0MNDQxd6LN3zR0eifPqrPsm7O6LE+NAg4FkurV3lJlrBoU3lSSc+sQZZr00ct9Gp57EEvg9T2TaM1B/KHQNmIhpDGntD4+yTcvK3nU7+sxqG/c4Wk5xiUQyLYnigNy5qYCcsM+t9iCoGxP7uU8GrsvIkojTxzhdc6e5LduThKdGE9jI3R6nCP6kmsU6XyUzgKmxYJVVzhSrm9yxFVDPHriNaEM2hgEd3wStwmRjGjwAPUjQZfSJtmxY6+RQ/77TYGjiskDm6gAZuzkGjdptL2t5F+54y3uePaLHNspMgtZsTARCo3kAhgf61Gk2nvnEY/ws5qFjAnsUEXs86wAk2S+w401QKPKTcDr6e/rnve8IrXW0FPvzR3rzdWOcU8v0Z0sSFijIfXdx+A9WGCJuHNo65FbKSgWhlHBfvWB1qWBnDd/VVHIA2wR8gevAPJHSc0f1WdUDc4w2w8tc/qum1SZo2lWkMopvLaiVHU5dCGtG6+4qsC1DmFzIRGZN4AbdVd5k+OY15Fp+ysjuTpA/HuZ/N0kz/5BXzNbY3u7YKi3EV3Up+eZIi2jlG0XBXWdCouuxRW40qHuShivtbrBgey32kFo85dsrDqN7F2kBVnumOB7kvFOaCkL2AtsakVjUzGoh5eXCSHl0ZcqmW2UjInzZIirBChMW/G4yL/TwpVYbBLqWPfdVMFmq7I4srY2+hUP/5UBt/DKZi5zPlLR8H3q4i02zsNpqdhSa9o6ThhFtVX9/te/DMpyN1fJ1Hn2p3cDhoTsiTLPkvflVOx70flap0v2zzPoDm+yXhFllpWp/5avHy9pKf/RzpAodbNr/EydkC+KDKI88MhVUtxS27WbKFsq+vUkmHQj+KtGyRFjg2/CnmM8YbdRsMe8p39PVGLxj1RTnyYzlMltOTbJo3rhDzjmpzGVUpWokwTMGC1WgTenrS4IcCK61ri9bsBIL9n9sMLF1lT8NVKnQfluDTaHNzsQgJ1HTSwQOcAfugqlUrSeTLt3q6U4pSjjlF8P7wYpqzWc+bhOaHed9NxrGXFBC5Wh6+BULuCaCA6TtkLpUfABYHVUa4OS3huNsOeBhZ3aCCQXrc0jOOq2DQzxvdGu4YAQnvMHwJRVyKVcw0pOS5RjIqJW6IOn0MGHzAo7qNv6LUyJ9a7huT2W4ibrHFkMck1zKxbBekPQ9FxpufSXrEqEqNuB3j7Gi7lVDVbPySr1rr2KXLzOLsnZhpTpMq2RejglIAMF7WfIMfvHQ2mnjNuYNNQnXx8hPLm88GSxFYKHpUnAswgYuo4XX2drYMzzq3GWDMIHZ/kpLySU+eJGo6VGeFUV1DgaGksLXE3oCfrA1OCUyZ/qke3tzj8ixjwuprCmFPWsg==
template:
metadata:
creationTimestamp: null
name: alertmanager-tbrnt-config
namespace: monitoring
type: Opaque
status: {}

9
monitoring/manifests/grafana-dashboardDatasources.yaml

@ -1,8 +1,13 @@
apiVersion: v1
data:
datasources.yaml: ewogICAgImFwaVZlcnNpb24iOiAxLAogICAgImRhdGFzb3VyY2VzIjogWwogICAgICAgIHsKICAgICAgICAgICAgImFjY2VzcyI6ICJwcm94eSIsCiAgICAgICAgICAgICJlZGl0YWJsZSI6IGZhbHNlLAogICAgICAgICAgICAibmFtZSI6ICJwcm9tZXRoZXVzIiwKICAgICAgICAgICAgIm9yZ0lkIjogMSwKICAgICAgICAgICAgInR5cGUiOiAicHJvbWV0aGV1cyIsCiAgICAgICAgICAgICJ1cmwiOiAiaHR0cDovL3Byb21ldGhldXMtazhzLm1vbml0b3Jpbmcuc3ZjOjkwOTAiLAogICAgICAgICAgICAidmVyc2lvbiI6IDEKICAgICAgICB9LAogICAgICAgIHsKICAgICAgICAgICAgImFjY2VzcyI6ICJwcm94eSIsCiAgICAgICAgICAgICJlZGl0YWJsZSI6IGZhbHNlLAogICAgICAgICAgICAibmFtZSI6ICJMb2tpIiwKICAgICAgICAgICAgIm9yZ0lkIjogMSwKICAgICAgICAgICAgInR5cGUiOiAibG9raSIsCiAgICAgICAgICAgICJ1cmwiOiAiaHR0cDovL2xva2kubG9raTozMTAwIiwKICAgICAgICAgICAgInZlcnNpb24iOiAxCiAgICAgICAgfQogICAgXQp9
datasources.yaml: ewogICAgImFwaVZlcnNpb24iOiAxLAogICAgImRhdGFzb3VyY2VzIjogWwogICAgICAgIHsKICAgICAgICAgICAgImFjY2VzcyI6ICJwcm94eSIsCiAgICAgICAgICAgICJlZGl0YWJsZSI6IGZhbHNlLAogICAgICAgICAgICAibmFtZSI6ICJwcm9tZXRoZXVzIiwKICAgICAgICAgICAgIm9yZ0lkIjogMSwKICAgICAgICAgICAgInR5cGUiOiAicHJvbWV0aGV1cyIsCiAgICAgICAgICAgICJ1cmwiOiAiaHR0cDovL3Byb21ldGhldXMtazhzLmRlZmF1bHQuc3ZjOjkwOTAiLAogICAgICAgICAgICAidmVyc2lvbiI6IDEKICAgICAgICB9CiAgICBdCn0=
kind: Secret
metadata:
labels:
app.kubernetes.io/component: grafana
app.kubernetes.io/name: grafana
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 7.5.4
name: grafana-datasources
namespace: monitoring
namespace: default
type: Opaque

6093
monitoring/manifests/grafana-dashboardDefinitions.yaml
File diff suppressed because it is too large


7
monitoring/manifests/grafana-dashboardSources.yaml

@ -17,5 +17,10 @@ data:
}
kind: ConfigMap
metadata:
labels:
app.kubernetes.io/component: grafana
app.kubernetes.io/name: grafana
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 7.5.4
name: grafana-dashboards
namespace: monitoring
namespace: default

30
monitoring/manifests/grafana-deployment.yaml

@ -2,26 +2,32 @@ apiVersion: apps/v1
kind: Deployment
metadata:
labels:
app: grafana
app.kubernetes.io/component: grafana
app.kubernetes.io/name: grafana
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 7.5.4
name: grafana
namespace: monitoring
namespace: default
spec:
replicas: 1
selector:
matchLabels:
app: grafana
app.kubernetes.io/component: grafana
app.kubernetes.io/name: grafana
app.kubernetes.io/part-of: kube-prometheus
template:
metadata:
annotations:
checksum/grafana-datasources: 7103d054a6e94f976ca59b4ede77cf88
checksum/grafana-datasources: b822d7b1a1070f322d0773c043985b4a
labels:
app: grafana
app.kubernetes.io/component: grafana
app.kubernetes.io/name: grafana
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 7.5.4
spec:
containers:
- env:
- name: GF_INSTALL_PLUGINS
value: grafana-piechart-panel
image: grafana/grafana:7.3.5
- env: []
image: grafana/grafana:7.5.4
name: grafana
ports:
- containerPort: 3000
@ -113,9 +119,6 @@ spec:
- mountPath: /grafana-dashboard-definitions/0/statefulset
name: grafana-dashboard-statefulset
readOnly: false
- mountPath: /grafana-dashboard-definitions/0/traefik
name: grafana-dashboard-traefik
readOnly: false
- mountPath: /grafana-dashboard-definitions/0/workload-total
name: grafana-dashboard-workload-total
readOnly: false
@ -201,9 +204,6 @@ spec:
- configMap:
name: grafana-dashboard-statefulset
name: grafana-dashboard-statefulset
- configMap:
name: grafana-dashboard-traefik
name: grafana-dashboard-traefik
- configMap:
name: grafana-dashboard-workload-total
name: grafana-dashboard-workload-total
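The hand-maintained GF_INSTALL_PLUGINS entry is gone from the generated Deployment. If the piechart plugin is still wanted, one option is to declare it on the jsonnet side; the sketch below is hypothetical, and the values.grafana.plugins key is an assumption that may differ between kube-prometheus releases.

```jsonnet
// Hypothetical sketch, not part of this commit: re-adding the Grafana piechart
// plugin through jsonnet values instead of a hand-set GF_INSTALL_PLUGINS env.
// The exact key (values.grafana.plugins) is an assumption for release-0.8.
local kp = (import 'kube-prometheus/main.libsonnet') + {
  values+:: {
    grafana+: {
      plugins: ['grafana-piechart-panel'],
    },
  },
};

// e.g. render the Grafana Deployment corresponding to the diff above
kp.grafana.deployment
```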

12
monitoring/manifests/grafana-service.yaml

@ -2,14 +2,18 @@ apiVersion: v1
kind: Service
metadata:
labels:
app: grafana
app.kubernetes.io/component: grafana
app.kubernetes.io/name: grafana
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 7.5.4
name: grafana
namespace: monitoring
namespace: default
spec:
ports:
- name: http
port: 3000
targetPort: http
selector:
app: grafana
type: NodePort
app.kubernetes.io/component: grafana
app.kubernetes.io/name: grafana
app.kubernetes.io/part-of: kube-prometheus

2
monitoring/manifests/grafana-serviceAccount.yaml

@ -2,4 +2,4 @@ apiVersion: v1
kind: ServiceAccount
metadata:
name: grafana
namespace: monitoring
namespace: default

9
monitoring/manifests/grafana-serviceMonitor.yaml

@ -1,12 +1,17 @@
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
labels:
app.kubernetes.io/component: grafana
app.kubernetes.io/name: grafana
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 7.5.4
name: grafana
namespace: monitoring
namespace: default
spec:
endpoints:
- interval: 15s
port: http
selector:
matchLabels:
app: grafana
app.kubernetes.io/name: grafana

1
monitoring/manifests/healthchecks-io-secret.yaml

@ -13,4 +13,5 @@ spec:
name: healthchecks-io
namespace: monitoring
type: Opaque
status: {}

22
monitoring/manifests/kube-state-metrics-clusterRole.yaml

@ -2,8 +2,10 @@ apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/version: v1.9.7
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.0.0
name: kube-state-metrics
rules:
- apiGroups:
@ -24,16 +26,6 @@ rules:
verbs:
- list
- watch
- apiGroups:
- extensions
resources:
- daemonsets
- deployments
- replicasets
- ingresses
verbs:
- list
- watch
- apiGroups:
- apps
resources:
@ -105,6 +97,14 @@ rules:
- networking.k8s.io
resources:
- networkpolicies
- ingresses
verbs:
- list
- watch
- apiGroups:
- coordination.k8s.io
resources:
- leases
verbs:
- list
- watch

6
monitoring/manifests/kube-state-metrics-clusterRoleBinding.yaml

@ -2,8 +2,10 @@ apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/version: v1.9.7
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.0.0
name: kube-state-metrics
roleRef:
apiGroup: rbac.authorization.k8s.io
@ -12,4 +14,4 @@ roleRef:
subjects:
- kind: ServiceAccount
name: kube-state-metrics
namespace: monitoring
namespace: default

39
monitoring/manifests/kube-state-metrics-deployment.yaml

@ -2,20 +2,28 @@ apiVersion: apps/v1
kind: Deployment
metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/version: v1.9.7
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.0.0
name: kube-state-metrics
namespace: monitoring
namespace: default
spec:
replicas: 1
selector:
matchLabels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/part-of: kube-prometheus
template:
metadata:
annotations:
kubectl.kubernetes.io/default-container: kube-state-metrics
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/version: v1.9.7
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.0.0
spec:
containers:
- args:
@ -23,8 +31,17 @@ spec:
- --port=8081
- --telemetry-host=127.0.0.1
- --telemetry-port=8082
image: quay.io/coreos/kube-state-metrics:v1.9.7
image: k8s.gcr.io/kube-state-metrics/kube-state-metrics:v2.0.0
name: kube-state-metrics
resources:
limits:
cpu: 100m
memory: 250Mi
requests:
cpu: 10m
memory: 190Mi
securityContext:
runAsUser: 65534
- args:
- --logtostderr
- --secure-listen-address=:8443
@ -35,6 +52,13 @@ spec:
ports:
- containerPort: 8443
name: https-main
resources:
limits:
cpu: 40m
memory: 40Mi
requests:
cpu: 20m
memory: 20Mi
securityContext:
runAsGroup: 65532
runAsNonRoot: true
@ -49,6 +73,13 @@ spec:
ports:
- containerPort: 9443
name: https-self
resources:
limits:
cpu: 20m
memory: 40Mi
requests:
cpu: 10m
memory: 20Mi
securityContext:
runAsGroup: 65532
runAsNonRoot: true

46
monitoring/manifests/kube-state-metrics-prometheusRule.yaml

@ -0,0 +1,46 @@
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.0.0
prometheus: k8s
role: alert-rules
name: kube-state-metrics-rules
namespace: default
spec:
groups:
- name: kube-state-metrics
rules:
- alert: KubeStateMetricsListErrors
annotations:
description: kube-state-metrics is experiencing errors at an elevated rate
in list operations. This is likely causing it to not be able to expose metrics
about Kubernetes objects correctly or at all.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubestatemetricslisterrors
summary: kube-state-metrics is experiencing errors in list operations.
expr: |
(sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m]))
/
sum(rate(kube_state_metrics_list_total{job="kube-state-metrics"}[5m])))
> 0.01
for: 15m
labels:
severity: critical
- alert: KubeStateMetricsWatchErrors
annotations:
description: kube-state-metrics is experiencing errors at an elevated rate
in watch operations. This is likely causing it to not be able to expose
metrics about Kubernetes objects correctly or at all.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubestatemetricswatcherrors
summary: kube-state-metrics is experiencing errors in watch operations.
expr: |
(sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m]))
/
sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics"}[5m])))
> 0.01
for: 15m
labels:
severity: critical

8
monitoring/manifests/kube-state-metrics-service.yaml

@ -2,10 +2,12 @@ apiVersion: v1
kind: Service
metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/version: v1.9.7
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.0.0
name: kube-state-metrics
namespace: monitoring
namespace: default
spec:
clusterIP: None
ports:
@ -16,4 +18,6 @@ spec:
port: 9443
targetPort: https-self
selector:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/part-of: kube-prometheus

6
monitoring/manifests/kube-state-metrics-serviceAccount.yaml

@ -2,7 +2,9 @@ apiVersion: v1
kind: ServiceAccount
metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/version: v1.9.7
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.0.0
name: kube-state-metrics
namespace: monitoring
namespace: default

8
monitoring/manifests/kube-state-metrics-serviceMonitor.yaml

@ -2,10 +2,12 @@ apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/version: 1.9.7
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.0.0
name: kube-state-metrics
namespace: monitoring
namespace: default
spec:
endpoints:
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
@ -28,4 +30,6 @@ spec:
jobLabel: app.kubernetes.io/name
selector:
matchLabels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/part-of: kube-prometheus

5
monitoring/manifests/node-exporter-clusterRole.yaml

@ -1,6 +1,11 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: node-exporter
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 1.1.2
name: node-exporter
rules:
- apiGroups:

7
monitoring/manifests/node-exporter-clusterRoleBinding.yaml

@ -1,6 +1,11 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: node-exporter
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 1.1.2
name: node-exporter
roleRef:
apiGroup: rbac.authorization.k8s.io
@ -9,4 +14,4 @@ roleRef:
subjects:
- kind: ServiceAccount
name: node-exporter
namespace: monitoring
namespace: default

24
monitoring/manifests/node-exporter-daemonset.yaml

@ -2,30 +2,37 @@ apiVersion: apps/v1
kind: DaemonSet
metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: node-exporter
app.kubernetes.io/version: v1.0.1
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 1.1.2
name: node-exporter
namespace: monitoring
namespace: default
spec:
selector:
matchLabels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: node-exporter
app.kubernetes.io/part-of: kube-prometheus
template:
metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: node-exporter
app.kubernetes.io/version: v1.0.1
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 1.1.2
spec:
containers:
- args:
- --web.listen-address=127.0.0.1:9100
- --path.procfs=/host/proc
- --path.sysfs=/host/sys
- --path.rootfs=/host/root
- --no-collector.wifi
- --no-collector.hwmon
- --collector.filesystem.ignored-mount-points=^/(dev|proc|sys|var/lib/docker/.+|var/lib/kubelet/pods/.+)($|/)
image: quay.io/prometheus/node-exporter:v1.0.1
- --collector.netclass.ignored-devices=^(veth.*)$
- --collector.netdev.device-exclude=^(veth.*)$
image: quay.io/prometheus/node-exporter:v1.1.2
name: node-exporter
resources:
limits:
@ -35,10 +42,6 @@ spec:
cpu: 102m
memory: 180Mi
volumeMounts:
- mountPath: /host/proc
mountPropagation: HostToContainer
name: proc
readOnly: true
- mountPath: /host/sys
mountPropagation: HostToContainer
name: sys
@ -85,9 +88,6 @@ spec:
tolerations:
- operator: Exists
volumes:
- hostPath:
path: /proc
name: proc
- hostPath:
path: /sys
name: sys

301
monitoring/manifests/node-exporter-prometheusRule.yaml

@ -0,0 +1,301 @@
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: node-exporter
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 1.1.2
prometheus: k8s
role: alert-rules
name: node-exporter-rules
namespace: default
spec:
groups:
- name: node-exporter
rules:
- alert: NodeFilesystemSpaceFillingUp
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }}
has only {{ printf "%.2f" $value }}% available space left and is filling
up.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodefilesystemspacefillingup
summary: Filesystem is predicted to run out of space within the next 24 hours.
expr: |
(
node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 40
and
predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!=""}[6h], 24*60*60) < 0
and
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
)
for: 1h
labels:
severity: warning
- alert: NodeFilesystemSpaceFillingUp
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }}
has only {{ printf "%.2f" $value }}% available space left and is filling
up fast.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodefilesystemspacefillingup
summary: Filesystem is predicted to run out of space within the next 4 hours.
expr: |
(
node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 15
and
predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!=""}[6h], 4*60*60) < 0
and
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
)
for: 1h
labels:
severity: critical
- alert: NodeFilesystemAlmostOutOfSpace
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }}
has only {{ printf "%.2f" $value }}% available space left.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodefilesystemalmostoutofspace
summary: Filesystem has less than 5% space left.
expr: |
(
node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 5
and
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
)
for: 1h
labels:
severity: warning
- alert: NodeFilesystemAlmostOutOfSpace
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }}
has only {{ printf "%.2f" $value }}% available space left.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodefilesystemalmostoutofspace
summary: Filesystem has less than 3% space left.
expr: |
(
node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 3
and
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
)
for: 1h
labels:
severity: critical
- alert: NodeFilesystemFilesFillingUp
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }}
has only {{ printf "%.2f" $value }}% available inodes left and is filling
up.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodefilesystemfilesfillingup
summary: Filesystem is predicted to run out of inodes within the next 24 hours.
expr: |
(
node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 40
and
predict_linear(node_filesystem_files_free{job="node-exporter",fstype!=""}[6h], 24*60*60) < 0
and
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
)
for: 1h
labels:
severity: warning
- alert: NodeFilesystemFilesFillingUp
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }}
has only {{ printf "%.2f" $value }}% available inodes left and is filling
up fast.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodefilesystemfilesfillingup
summary: Filesystem is predicted to run out of inodes within the next 4 hours.
expr: |
(
node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 20
and
predict_linear(node_filesystem_files_free{job="node-exporter",fstype!=""}[6h], 4*60*60) < 0
and
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
)
for: 1h
labels:
severity: critical
- alert: NodeFilesystemAlmostOutOfFiles
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }}
has only {{ printf "%.2f" $value }}% available inodes left.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodefilesystemalmostoutoffiles
summary: Filesystem has less than 5% inodes left.
expr: |
(
node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 5
and
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
)
for: 1h
labels:
severity: warning
- alert: NodeFilesystemAlmostOutOfFiles
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }}
has only {{ printf "%.2f" $value }}% available inodes left.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodefilesystemalmostoutoffiles
summary: Filesystem has less than 3% inodes left.
expr: |
(
node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 3
and
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
)
for: 1h
labels:
severity: critical
- alert: NodeNetworkReceiveErrs
annotations:
description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered
{{ printf "%.0f" $value }} receive errors in the last two minutes.'
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodenetworkreceiveerrs
summary: Network interface is reporting many receive errors.
expr: |
rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01
for: 1h
labels:
severity: warning
- alert: NodeNetworkTransmitErrs
annotations:
description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered
{{ printf "%.0f" $value }} transmit errors in the last two minutes.'
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodenetworktransmiterrs
summary: Network interface is reporting many transmit errors.
expr: |
rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01
for: 1h
labels:
severity: warning
- alert: NodeHighNumberConntrackEntriesUsed
annotations:
description: '{{ $value | humanizePercentage }} of conntrack entries are used.'
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodehighnumberconntrackentriesused
summary: Number of conntrack are getting close to the limit.
expr: |
(node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75
labels:
severity: warning
- alert: NodeTextFileCollectorScrapeError
annotations:
description: Node Exporter text file collector failed to scrape.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodetextfilecollectorscrapeerror
summary: Node Exporter text file collector failed to scrape.
expr: |
node_textfile_scrape_error{job="node-exporter"} == 1
labels:
severity: warning
- alert: NodeClockSkewDetected
annotations:
description: Clock on {{ $labels.instance }} is out of sync by more than 300s.
Ensure NTP is configured correctly on this host.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodeclockskewdetected
summary: Clock skew detected.
expr: |
(
node_timex_offset_seconds > 0.05
and
deriv(node_timex_offset_seconds[5m]) >= 0
)
or
(
node_timex_offset_seconds < -0.05
and
deriv(node_timex_offset_seconds[5m]) <= 0
)
for: 10m
labels:
severity: warning
- alert: NodeClockNotSynchronising
annotations:
description: Clock on {{ $labels.instance }} is not synchronising. Ensure
NTP is configured on this host.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/nodeclocknotsynchronising
summary: Clock not synchronising.
expr: |
min_over_time(node_timex_sync_status[5m]) == 0
and
node_timex_maxerror_seconds >= 16
for: 10m
labels:
severity: warning
- alert: NodeRAIDDegraded
annotations:
description: RAID array '{{ $labels.device }}' on {{ $labels.instance }} is
in degraded state due to one or more disks failures. Number of spare drives
is insufficient to fix issue automatically.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/noderaiddegraded
summary: RAID Array is degraded
expr: |
node_md_disks_required - ignoring (state) (node_md_disks{state="active"}) > 0
for: 15m
labels:
severity: critical
- alert: NodeRAIDDiskFailure
annotations:
description: At least one device in RAID array on {{ $labels.instance }} failed.
Array '{{ $labels.device }}' needs attention and possibly a disk swap.
runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/noderaiddiskfailure
summary: Failed device in RAID array
expr: |
node_md_disks{state="failed"} > 0
labels:
severity: warning
- name: node-exporter.rules
rules:
- expr: |
count without (cpu) (
count without (mode) (
node_cpu_seconds_total{job="node-exporter"}
)
)
record: instance:node_num_cpu:sum
- expr: |
1 - avg without (cpu, mode) (
rate(node_cpu_seconds_total{job="node-exporter", mode="idle"}[1m])
)
record: instance:node_cpu_utilisation:rate1m
- expr: |
(
node_load1{job="node-exporter"}
/
instance:node_num_cpu:sum{job="node-exporter"}
)
record: instance:node_load1_per_cpu:ratio
- expr: |
1 - (
node_memory_MemAvailable_bytes{job="node-exporter"}
/
node_memory_MemTotal_bytes{job="node-exporter"}
)
record: instance:node_memory_utilisation:ratio
- expr: |
rate(node_vmstat_pgmajfault{job="node-exporter"}[1m])
record: instance:node_vmstat_pgmajfault:rate1m
- expr: |
rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[1m])
record: instance_device:node_disk_io_time_seconds:rate1m
- expr: |
rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[1m])
record: instance_device:node_disk_io_time_weighted_seconds:rate1m
- expr: |
sum without (device) (
rate(node_network_receive_bytes_total{job="node-exporter", device!="lo"}[1m])
)
record: instance:node_network_receive_bytes_excluding_lo:rate1m
- expr: |
sum without (device) (
rate(node_network_transmit_bytes_total{job="node-exporter", device!="lo"}[1m])
)
record: instance:node_network_transmit_bytes_excluding_lo:rate1m
- expr: |
sum without (device) (
rate(node_network_receive_drop_total{job="node-exporter", device!="lo"}[1m])
)
record: instance:node_network_receive_drop_excluding_lo:rate1m
- expr: |
sum without (device) (
rate(node_network_transmit_drop_total{job="node-exporter", device!="lo"}[1m])
)
record: instance:node_network_transmit_drop_excluding_lo:rate1m

8
monitoring/manifests/node-exporter-service.yaml

@ -2,10 +2,12 @@ apiVersion: v1
kind: Service
metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: node-exporter
app.kubernetes.io/version: v1.0.1
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 1.1.2
name: node-exporter
namespace: monitoring
namespace: default
spec:
clusterIP: None
ports:
@ -13,4 +15,6 @@ spec:
port: 9100
targetPort: https
selector:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: node-exporter
app.kubernetes.io/part-of: kube-prometheus

7
monitoring/manifests/node-exporter-serviceAccount.yaml

@ -1,5 +1,10 @@
apiVersion: v1
kind: ServiceAccount
metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: node-exporter
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 1.1.2
name: node-exporter
namespace: monitoring
namespace: default

8
monitoring/manifests/node-exporter-serviceMonitor.yaml

@ -2,10 +2,12 @@ apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: node-exporter
app.kubernetes.io/version: v1.0.1
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 1.1.2
name: node-exporter
namespace: monitoring
namespace: default
spec:
endpoints:
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
@ -24,4 +26,6 @@ spec:
jobLabel: app.kubernetes.io/name
selector:
matchLabels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: node-exporter
app.kubernetes.io/part-of: kube-prometheus

7
monitoring/manifests/prometheus-adapter-apiService.yaml

@ -1,6 +1,11 @@
apiVersion: apiregistration.k8s.io/v1
kind: APIService
metadata:
labels:
app.kubernetes.io/component: metrics-adapter
app.kubernetes.io/name: prometheus-adapter
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.8.4
name: v1beta1.metrics.k8s.io
spec:
group: metrics.k8s.io
@ -8,6 +13,6 @@ spec:
insecureSkipTLSVerify: true
service:
name: prometheus-adapter
namespace: monitoring
namespace: default
version: v1beta1
versionPriority: 100

5
monitoring/manifests/prometheus-adapter-clusterRole.yaml

@ -1,6 +1,11 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
labels:
app.kubernetes.io/component: metrics-adapter
app.kubernetes.io/name: prometheus-adapter
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.8.4
name: prometheus-adapter
rules:
- apiGroups:

4
monitoring/manifests/prometheus-adapter-clusterRoleAggregatedMetricsReader.yaml

@ -2,6 +2,10 @@ apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
labels:
app.kubernetes.io/component: metrics-adapter
app.kubernetes.io/name: prometheus-adapter
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.8.4
rbac.authorization.k8s.io/aggregate-to-admin: "true"
rbac.authorization.k8s.io/aggregate-to-edit: "true"
rbac.authorization.k8s.io/aggregate-to-view: "true"

7
monitoring/manifests/prometheus-adapter-clusterRoleBinding.yaml

@ -1,6 +1,11 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
labels:
app.kubernetes.io/component: metrics-adapter
app.kubernetes.io/name: prometheus-adapter
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.8.4
name: prometheus-adapter
roleRef:
apiGroup: rbac.authorization.k8s.io
@ -9,4 +14,4 @@ roleRef:
subjects:
- kind: ServiceAccount
name: prometheus-adapter
namespace: monitoring
namespace: default

7
monitoring/manifests/prometheus-adapter-clusterRoleBindingDelegator.yaml

@ -1,6 +1,11 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata: