upgrade monitoring stack to latest and greatest
continuous-integration/drone/push: Build is passing

Tobias Brunner 2020-05-04 21:13:34 +02:00
parent a96d01ba65
commit d14fbc6e17
809 changed files with 2242 additions and 1417 deletions


@@ -1,24 +1,26 @@
{
"version": 1,
"dependencies": [
{
"name": "kube-prometheus",
"source": {
"git": {
"remote": "https://github.com/coreos/kube-prometheus",
"subdir": "jsonnet/kube-prometheus"
}
},
"version": "master"
"version": "master",
"name": "kube-prometheus"
},
{
"name": "prometheus-pushgateway",
"source": {
"git": {
"remote": "https://github.com/tobru/kube-prometheus-pushgateway",
"subdir": "prometheus-pushgateway"
}
},
"version": "master"
"version": "master",
"name": "prometheus-pushgateway"
}
]
],
"legacyImports": true
}
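The notable change above is the addition of "legacyImports": true. With jsonnet-bundler 0.2+, this flag makes jb create a short vendor/<name> symlink next to each fully qualified vendor path, so imports written against the old layout keep resolving after the upgrade (the vendored etcd-mixin symbolic link later in this commit is exactly such a shim). A minimal sketch of the two equivalent import styles, assuming a vendor/ tree produced by jb install:

// With legacyImports enabled, both paths resolve to the same vendored files.
local viaSymlink = import 'etcd-mixin/mixin.libsonnet';
local viaFullPath = import 'github.com/coreos/etcd/Documentation/etcd-mixin/mixin.libsonnet';
viaSymlink == viaFullPath  // evaluates to true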


@@ -1,18 +1,7 @@
{
"version": 1,
"dependencies": [
{
"name": "etcd-mixin",
"source": {
"git": {
"remote": "https://github.com/coreos/etcd",
"subdir": "Documentation/etcd-mixin"
}
},
"version": "dd816f0735f8a5ee7a1bbd134002961b2884197c",
"sum": "Ko3qhNfC2vN/houLh6C0Ryacjv70gl0DVPGU/PQ4OD0="
},
{
"name": "grafana",
"source": {
"git": {
"remote": "https://github.com/brancz/kubernetes-grafana",
@@ -23,29 +12,56 @@
"sum": "92DWADwGjnCfpZaL7Q07C0GZayxBziGla/O03qWea34="
},
{
"name": "grafana-builder",
"source": {
"git": {
"remote": "https://github.com/grafana/jsonnet-libs",
"subdir": "grafana-builder"
"remote": "https://github.com/coreos/etcd",
"subdir": "Documentation/etcd-mixin"
}
},
"version": "e347979bbe1e89fa0f48ea61fb486fdbaec42b74",
"sum": "slxrtftVDiTlQK22ertdfrg4Epnq97gdrLI63ftUfaE="
"version": "4a64198a9f424549c64d6c2eeb92eed8ff3a7bd8",
"sum": "pk7mLpdUrHuJKkj2vhD6LGMU7P+oYYooBXAeZyZa398="
},
{
"source": {
"git": {
"remote": "https://github.com/coreos/kube-prometheus",
"subdir": "jsonnet/kube-prometheus"
}
},
"version": "d07466766d46710e54a627f913eea5661382331a",
"sum": "vFv2xVX1SALKIOlQmcIvQ2pDKXqAbdBJ6zdGIrZUcis="
},
{
"source": {
"git": {
"remote": "https://github.com/coreos/prometheus-operator",
"subdir": "jsonnet/prometheus-operator"
}
},
"version": "00cbd4911f931380cf9e19d771d7ebae1ec0a807",
"sum": "PfB8G2nfy3e/BrXS1ayymsRRFJvQLWT+oY5aqLS0tE8="
},
{
"name": "grafonnet",
"source": {
"git": {
"remote": "https://github.com/grafana/grafonnet-lib",
"subdir": "grafonnet"
}
},
"version": "906768d46973e022594d3f03d82c5a51d86de2cc",
"sum": "J3Vp0EVbxTObr6KydLXsi4Rc0ssNVAEuwLc0NQ+4wqU="
"version": "f1ceb29c053445881cf47c8761225d463273f7e2",
"sum": "PQ1u4FqaLkqmYF1tE/BNx6Db8roTei0uqo+SKCwRlbo="
},
{
"source": {
"git": {
"remote": "https://github.com/grafana/jsonnet-libs",
"subdir": "grafana-builder"
}
},
"version": "d851699bb178b62deabb5532cdb9e48c1a83b1b9",
"sum": "slxrtftVDiTlQK22ertdfrg4Epnq97gdrLI63ftUfaE="
},
{
"name": "ksonnet",
"source": {
"git": {
"remote": "https://github.com/ksonnet/ksonnet-lib",
@@ -53,87 +69,71 @@
}
},
"version": "0d2f82676817bbf9e4acf6495b2090205f323b9f",
"sum": "h28BXZ7+vczxYJ2sCt8JuR9+yznRtU/iA6DCpQUrtEg="
"sum": "h28BXZ7+vczxYJ2sCt8JuR9+yznRtU/iA6DCpQUrtEg=",
"name": "ksonnet"
},
{
"name": "kube-prometheus",
"source": {
"git": {
"remote": "https://github.com/coreos/kube-prometheus",
"subdir": "jsonnet/kube-prometheus"
}
},
"version": "cf7bb8706c840ae5ff80ffbc255ed0d49d3fff33",
"sum": "jBsN3YSsYzSmaV2zNU4Um4K+28qUtw9uga7HwF/tqI4="
},
{
"name": "kube-state-metrics",
"source": {
"git": {
"remote": "https://github.com/kubernetes/kube-state-metrics",
"subdir": "jsonnet/kube-state-metrics"
}
},
"version": "e99fbbc6b2b6539f44b4e769e432016eaabee650",
"sum": "cJjGZaLBjcIGrLHZLjRPU9c3KL+ep9rZTb9dbALSKqA="
},
{
"name": "kube-state-metrics-mixin",
"source": {
"git": {
"remote": "https://github.com/kubernetes/kube-state-metrics",
"subdir": "jsonnet/kube-state-metrics-mixin"
}
},
"version": "e99fbbc6b2b6539f44b4e769e432016eaabee650",
"sum": "E1GGavnf9PCWBm4WVrxWnc0FIj72UcbcweqGioWrOdU="
},
{
"name": "kubernetes-mixin",
"source": {
"git": {
"remote": "https://github.com/kubernetes-monitoring/kubernetes-mixin",
"subdir": ""
}
},
"version": "544148ee3ea4da1ffa1e97ebb66599a9548fde8d",
"sum": "6uEUlF5xrvjk8FBLpGaIyfH+3PNAPnzedbTsbqEaB3U="
"version": "90a6d132e3c9558c0690153c9b404b16b4754139",
"sum": "RZn1B0sbrPEKenxKIrXGdDN3NGb9OesQ+IA5whAP4ik="
},
{
"source": {
"git": {
"remote": "https://github.com/kubernetes-monitoring/kubernetes-mixin",
"subdir": "lib/promgrafonnet"
}
},
"version": "90a6d132e3c9558c0690153c9b404b16b4754139",
"sum": "VhgBM39yv0f4bKv8VfGg4FXkg573evGDRalip9ypKbc="
},
{
"source": {
"git": {
"remote": "https://github.com/kubernetes/kube-state-metrics",
"subdir": "jsonnet/kube-state-metrics"
}
},
"version": "765afc98be4b2a53b3b12c48df2cf93379932a29",
"sum": "cJjGZaLBjcIGrLHZLjRPU9c3KL+ep9rZTb9dbALSKqA="
},
{
"source": {
"git": {
"remote": "https://github.com/kubernetes/kube-state-metrics",
"subdir": "jsonnet/kube-state-metrics-mixin"
}
},
"version": "765afc98be4b2a53b3b12c48df2cf93379932a29",
"sum": "E1GGavnf9PCWBm4WVrxWnc0FIj72UcbcweqGioWrOdU="
},
{
"name": "node-mixin",
"source": {
"git": {
"remote": "https://github.com/prometheus/node_exporter",
"subdir": "docs/node-mixin"
}
},
"version": "da5972b5398cd67a8854e6b1ee6b861bfda5ef83",
"sum": "ZwrC0+4o1xD6+oPBu1p+rBXLlf6pMBD9rT8ygyl2aW0="
"version": "0c532984b7a6a00d67de07a9794aacfeec2c11d3",
"sum": "3jFV2qsc/GZe2GADswTYqxxP2zGOiANTj73W/VNFGqc="
},
{
"name": "prometheus",
"source": {
"git": {
"remote": "https://github.com/prometheus/prometheus",
"subdir": "documentation/prometheus-mixin"
}
},
"version": "fe47c9c86ee01cb089b76002cbc047e0f22b5501",
"sum": "5EUgr6Spr1zNR8Y2/NevjvEkGV9WMvKo6nEScNER1Lc="
"version": "0e2004f6fbe9eeddcd0c0084e401b5acb61760a4",
"sum": "kRb3XBTe/AALDcaTFfyuiKqzhxtLvihBkVkvJ5cUd/I=",
"name": "prometheus"
},
{
"name": "prometheus-operator",
"source": {
"git": {
"remote": "https://github.com/coreos/prometheus-operator",
"subdir": "jsonnet/prometheus-operator"
}
},
"version": "378d36df448366414de53a66a64020cd053002b7",
"sum": "vegTm8VSDazwYflBQGLkjs3ystWahwUv0fUyuMbpNRg="
},
{
"name": "prometheus-pushgateway",
"source": {
"git": {
"remote": "https://github.com/tobru/kube-prometheus-pushgateway",
@@ -142,28 +142,7 @@
},
"version": "7bb93ca3ddf3b83f1fdfe95d9bad415b57d0fe4b",
"sum": "6nOJeHJExjYyTSovZvU6xbGjWS88oUfGnF1DAo+Q6tg="
},
{
"name": "promgrafonnet",
"source": {
"git": {
"remote": "https://github.com/kubernetes-monitoring/kubernetes-mixin",
"subdir": "lib/promgrafonnet"
}
},
"version": "544148ee3ea4da1ffa1e97ebb66599a9548fde8d",
"sum": "VhgBM39yv0f4bKv8VfGg4FXkg573evGDRalip9ypKbc="
},
{
"name": "slo-libsonnet",
"source": {
"git": {
"remote": "https://github.com/metalmatze/slo-libsonnet",
"subdir": "slo-libsonnet"
}
},
"version": "2615ffbafc121fc0a507b75509764e53c58409e2",
"sum": "D9MryEvtOBLzIBC1nU4IvVeJPP+l3P/vpZsC0KG8vnk="
}
]
],
"legacyImports": false
}


@@ -6,7 +6,7 @@ metadata:
namespace: monitoring
spec:
encryptedData:
alertmanager.yaml: AgBlhYgX4pIgWPP1u5dNb9lPMoS0BR7Ca3M5kUVRKrkeT6Jn+kLZcju4g7A1WlkQ+jR2ptlaq+MkgYs1GS27wrw/iBsOZIbvfLoyFr4pL5i58WzzxPvk1wFO86oCpLJoK3pDh/eKR4e16BR70Q3DOTMRXEQ8SuOMKFonBPALOllAAXPYQFioF9MN+0MBifvnGe6UrfbXsQCdoL9H0G/OiJv/ZJxJcmx8eJRpJEgeyAQsdi0NNzhg2RiiNDC0qVCIblSxOaGhlNNQxiqLaQW5Uh7DXV2u+Rz1u85dm6lzSazjoXozPdPS4vMEMy8ZmFvzW2S7DhcfBJ0RE99DzGofIT67QIFh4edxCBRtlpYaYDoLKNkNRpp9inTHnQEGLga2qrElOUU5O2OwgC8qKl4k54Zxu4NUlGKr1IDsnbUiBNge+uvrsu/RahCYQ13H5eeJgkIuFSYE9h1qNG+s/lrjNQY7JsL9ZYj1ddn/5ZOYjgkCWs6yjTU+20uboU5vLOK2zawrJ6cvO6FHPjEMrnpcVSUlqIc588Ligg1coU5J0+UkLd1OPKY/5YBuMdNUBsmLfqutHbWSCiz9PgcI8wZsvCkPiyeUP3QlgvnLaQfcNUxv/tlU+Yg2nBYceKDPr3i2CN6IGrN5CxcCKMClEdTZ5xgmREN6sj0gIKPm1TpOdL1BjBcokZSXgYGnmr5XxDv9ybyainqcitSewznxHKbtvszJihHU6XcOr0MKo0L+abfwjlWvcHM+cZAT35xf1xtZ18A2JauwJ94+Flhzro+a1UclnwDQJf8Hp5PhMQRzu5q8WCLOF08N9Xq469K/FB4jtxnrYPJb6hQhZ+/LmLbLL9c3p2g1X+YQWeoUDvAZ6ujjPoBvNxPoLIsXO2Tu77se94e3WGxaHtcEqtFLQujH3jgAHLRp/x9jHoAniBLVo69/Xd6rbLZQ4DLhMlkdkipaM2iQZxoWQ0qHsISVseEu9741IZfveq0MY3acTcwPm7B0JXAn8dLmdv1BNrRo0/Xxgysdn3DybFmJ7AlFCi3iKacsIvnjEdZgCD3ogFyySSu/b52IyrhOl6WKLZaLURLHe8mxx8kXbRVVOQQ162Xgtx4viqg15yF15Qt0NmMXzU+hQc/XmVRuRVYPYhFuRAQVCD40ENwt/X3/ae/Mkfc2jzWvN3O+VfbepKfc4D+jVM1mv+E2dS4ZcWPmvvIXiLKJMcscZBr75bO6pt8LaMgz+n+ZPJRJzKa+yESbOYefNuP8DuwImPD8ridLBWR49EZD/P3I/7ZwJjHXH+VpYxMErcKI5R2znnLrnCBgMAV8M78vzbzuJ7Ljb6RkjCSU4NWnIAZmxgldbSLoHwXK57yoW+HDC9twDcvR40XL2NGSlChtd+x9M/mgHuS/NpAalJtF0zlo/hbaCSwE56wUYpjO9yx/rhj1wrYIbdAZ/aJaPcpi8cbhDltFMZT6OrWoCCiKR69wB3/HFSHd+SnLTBb1mdfqPwngekfe2TRQ7kXMfbPiBYJC6H5rQHHojmSGwl4AHL6qJ3InXGSG1FFnnc/roGGffjkpbIdmkqPfwn4Ob60xJwdLEYFEs1TOrOnKhTJfgc723EZr8AdVNhXbUlEiVyNhbyp4hHvzeybqMryEGuhKZIMD+GnfWC3/TSKFJC8wwFSOdmnjD3FhTF3ZEX4ehuhZLtE7sJjftjDXyrjVOF3eXlp9bQ==
alertmanager.yaml: AgB6MMnL2RsryYbKhQIyJvqXopB4wtUdotRShqk32jFScXUaJKMxPFF8t1Uhwpt1UPcjdrzOElEdHVX9nYj2LK+KGa23L5IV6ZACfjFLbHsn2PHA/kWDNCS7hLaAahSO2ByfoAnoGQp8OejsYDCSsp4NtkM+ooNgw1mGbO/yVNZZRDzccVLZFwJchsR0+26KEqzL3eHtsG40e7lf71uxBgYhvggukEeDfuuPDkKG4YeFJSQ7clURFhckbe+/MmnP+MHw7K9xkdlANl4Tu+e2+njm32PuZ34PKUrLQJBFSHG3DpGjPJBxBZbV7GP+T+VK5GQHrUZOewjgp5e4fDrqZvOVoASPJqY1C961bLjhwsTC8uzdJVHE78LR29hWnjlDBEbQdNsGwDoAHG0yBiC5N0rWfm+XNLh1+e96IF4OKp0/BVqNJaxrs6jlmTDMQEp4Q8JwVTqNSKfFr2jMMW0uI3ggqPAXnWDsLWLd9pD6gpbqGHELmG7dmPi2FQWBlnLXf4VllFbYzRpLZ12hKILBzWuhY6OkCSXfb/bbLyfxyjO8rWvgKxcm9RpjdwtiU4+wT6KKc1/x9nSvqqgaXq81ecHZr8EbJz6nAE/+0Qm1WhQ5wB89RtbX0K4D3i0e7P7JEB3mrSwdSxUd1JLyVKesWKwDiIiwpA7ljD6cTgmLS/00be1eHreqSJ0SLSODBkIadIH4Kc0tyO3/dbdRTkQC3pTQp5q7gVfJzPEGcmZFG0OUHHDXgigS4VUDalpTnaXXDj0Gjuq0+rf1Gu3ZuVC1QKwZ3W26eUrUz4NLtRDtJNYTgsrtJs+T3T3iQHjoO53vTp1TFDX6zGTaJd7yzis5CZWVIn4K8t1rcR/MexzR/Dk46HBLVJXuL0uN17RvFuKsCIFy0rR0BH8qbmWUwiqEfjDG4csBPgH5xAHWGVJHfPStILSZbDoZ/twms391umr+eVeZrSyjgb/W9MSsikqTJaMRH6gHByqljm5/fdmnF3G+Pe6wT8UTM4E1gSOwfmVaow2sqLCT0wdL/ZVOaJBPIG5MIBS7nSQ9nMsDVCmQp7TUD/XNp0M9hYB96AbJOElONQq+0aPzfMgyZElsAMO6Odkr8FBSitMe8lSz2WJvjF7XWPvOQrj1eaXOwNKawArvBjwMqEX5lnMcjHKFgSAOzeWjcEdnTFpifg6kmQn9hO2HKwNggXO8evT9undrnm8qj6vPcc+oXEgs2RlYk5pHUbf4V+O1KxVSBWJCIC2sXyacHI3Emkb5qkTw+Wv4BXla2JJSbLWn9o1hgOI88xpyZpnFG4A1Y/XVQa2002TFccF78UjxcFSA2c1XL0uQxJFkERNxwinrCPjXqA1i7CnCYH53ZpAGj/TqwAzZSZ1FN3L1s+Ro0ASiN/92ppeHhXGPhhTLhj1i4rr2qQ1XhixDSMQLgctAN0ebe2lnjG07yZodRaZbHHl4GfYfcxLCqemwHg5xVzbS4XP+70/hoWes9UVygK9rkzTUUN9IbbtZpgpGplnT3MymbqbYbrlAU59U/vSpxS6K3RUiTMz5IK6p5u7baMYeVuthUy3LU9r07P+vicwAlEQ6jEWKXQD1Q/Vhc5H0wUgZrTBcgo2w9JwT4QUhO1qzIKiUmMEWn1bDh1GaurRZtMRgjP5sfWGMGtUqMCVmAZvUbYsr8k6+1nn93xmn78QNOabtth8Ayfp6vHYeF4h3xA==
template:
metadata:
creationTimestamp: null

File diff suppressed because it is too large.


@@ -10,7 +10,6 @@ spec:
selector:
matchLabels:
app.kubernetes.io/name: node-exporter
app.kubernetes.io/version: v0.18.1
template:
metadata:
labels:


@@ -14,4 +14,3 @@ spec:
targetPort: https
selector:
app.kubernetes.io/name: node-exporter
app.kubernetes.io/version: v0.18.1


@@ -25,4 +25,3 @@ spec:
selector:
matchLabels:
app.kubernetes.io/name: node-exporter
app.kubernetes.io/version: v0.18.1
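These three hunks drop the app.kubernetes.io/version label from the DaemonSet selector, the Service selector, and the ServiceMonitor matchLabels. Selectors have to stay stable across version bumps: a DaemonSet's spec.selector is immutable in apps/v1, and a Service or ServiceMonitor that selects on the version label would stop matching pods the moment the exporter is upgraded. The jsonnet change further down in this commit generates the selector by filtering the version label out of the full label set; a self-contained sketch of that comprehension:

// Derive selector labels by dropping the mutable version label.
local labels = {
  'app.kubernetes.io/name': 'node-exporter',
  'app.kubernetes.io/version': 'v0.18.1',
};
{
  selectorLabels: {
    [l]: labels[l]
    for l in std.objectFields(labels)
    if l != 'app.kubernetes.io/version'
  },
}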


@@ -1,32 +1,32 @@
apiVersion: v1
data:
config.yaml: |
resourceRules:
cpu:
containerQuery: sum(irate(container_cpu_usage_seconds_total{<<.LabelMatchers>>,container!="POD",container!="",pod!=""}[5m])) by (<<.GroupBy>>)
nodeQuery: sum(1 - irate(node_cpu_seconds_total{mode="idle"}[5m]) * on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:{<<.LabelMatchers>>}) by (<<.GroupBy>>)
resources:
overrides:
node:
resource: node
namespace:
resource: namespace
pod:
resource: pod
containerLabel: container
memory:
containerQuery: sum(container_memory_working_set_bytes{<<.LabelMatchers>>,container!="POD",container!="",pod!=""}) by (<<.GroupBy>>)
nodeQuery: sum(node_memory_MemTotal_bytes{job="node-exporter",<<.LabelMatchers>>} - node_memory_MemAvailable_bytes{job="node-exporter",<<.LabelMatchers>>}) by (<<.GroupBy>>)
resources:
overrides:
instance:
resource: node
namespace:
resource: namespace
pod:
resource: pod
containerLabel: container
window: 5m
config.yaml: |-
"resourceRules":
"cpu":
"containerLabel": "container"
"containerQuery": "sum(irate(container_cpu_usage_seconds_total{<<.LabelMatchers>>,container!=\"POD\",container!=\"\",pod!=\"\"}[5m])) by (<<.GroupBy>>)"
"nodeQuery": "sum(1 - irate(node_cpu_seconds_total{mode=\"idle\"}[5m]) * on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:{<<.LabelMatchers>>}) by (<<.GroupBy>>)"
"resources":
"overrides":
"namespace":
"resource": "namespace"
"node":
"resource": "node"
"pod":
"resource": "pod"
"memory":
"containerLabel": "container"
"containerQuery": "sum(container_memory_working_set_bytes{<<.LabelMatchers>>,container!=\"POD\",container!=\"\",pod!=\"\"}) by (<<.GroupBy>>)"
"nodeQuery": "sum(node_memory_MemTotal_bytes{job=\"node-exporter\",<<.LabelMatchers>>} - node_memory_MemAvailable_bytes{job=\"node-exporter\",<<.LabelMatchers>>}) by (<<.GroupBy>>)"
"resources":
"overrides":
"instance":
"resource": "node"
"namespace":
"resource": "namespace"
"pod":
"resource": "pod"
"window": "5m"
kind: ConfigMap
metadata:
name: adapter-config
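The heavy quoting in the new config.yaml is not a semantic change. As the prometheus-adapter.libsonnet hunk later in this commit shows, the adapter configuration now lives as a jsonnet object and is serialized with std.manifestYamlDoc, which quotes every key and string when it renders YAML. A minimal sketch, using only the jsonnet standard library:

// std.manifestYamlDoc renders an object as a YAML document string,
// producing the quoted style seen in the ConfigMap above.
local cfg = { resourceRules: { cpu: { containerLabel: 'container' } }, window: '5m' };
std.manifestYamlDoc(cfg)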


@@ -4,7 +4,7 @@ metadata:
labels:
app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/version: v0.38.0
app.kubernetes.io/version: v0.38.1
name: prometheus-operator
namespace: monitoring
spec:
@@ -19,4 +19,4 @@ spec:
matchLabels:
app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/version: v0.38.0
app.kubernetes.io/version: v0.38.1


@@ -65,122 +65,289 @@ spec:
rate(node_network_transmit_drop_total{job="node-exporter", device!="lo"}[1m])
)
record: instance:node_network_transmit_drop_excluding_lo:rate1m
- name: kube-apiserver-error
rules:
- expr: |
sum by (status_class) (
label_replace(
rate(apiserver_request_total{job="apiserver"}[5m]
), "status_class", "${1}xx", "code", "([0-9])..")
)
labels:
job: apiserver
record: status_class:apiserver_request_total:rate5m
- expr: |
sum by (status_class) (
label_replace(
rate(apiserver_request_total{job="apiserver"}[30m]
), "status_class", "${1}xx", "code", "([0-9])..")
)
labels:
job: apiserver
record: status_class:apiserver_request_total:rate30m
- expr: |
sum by (status_class) (
label_replace(
rate(apiserver_request_total{job="apiserver"}[1h]
), "status_class", "${1}xx", "code", "([0-9])..")
)
labels:
job: apiserver
record: status_class:apiserver_request_total:rate1h
- expr: |
sum by (status_class) (
label_replace(
rate(apiserver_request_total{job="apiserver"}[2h]
), "status_class", "${1}xx", "code", "([0-9])..")
)
labels:
job: apiserver
record: status_class:apiserver_request_total:rate2h
- expr: |
sum by (status_class) (
label_replace(
rate(apiserver_request_total{job="apiserver"}[6h]
), "status_class", "${1}xx", "code", "([0-9])..")
)
labels:
job: apiserver
record: status_class:apiserver_request_total:rate6h
- expr: |
sum by (status_class) (
label_replace(
rate(apiserver_request_total{job="apiserver"}[1d]
), "status_class", "${1}xx", "code", "([0-9])..")
)
labels:
job: apiserver
record: status_class:apiserver_request_total:rate1d
- expr: |
sum by (status_class) (
label_replace(
rate(apiserver_request_total{job="apiserver"}[3d]
), "status_class", "${1}xx", "code", "([0-9])..")
)
labels:
job: apiserver
record: status_class:apiserver_request_total:rate3d
- expr: |
sum(status_class:apiserver_request_total:rate5m{job="apiserver",status_class="5xx"})
/
sum(status_class:apiserver_request_total:rate5m{job="apiserver"})
labels:
job: apiserver
record: status_class_5xx:apiserver_request_total:ratio_rate5m
- expr: |
sum(status_class:apiserver_request_total:rate30m{job="apiserver",status_class="5xx"})
/
sum(status_class:apiserver_request_total:rate30m{job="apiserver"})
labels:
job: apiserver
record: status_class_5xx:apiserver_request_total:ratio_rate30m
- expr: |
sum(status_class:apiserver_request_total:rate1h{job="apiserver",status_class="5xx"})
/
sum(status_class:apiserver_request_total:rate1h{job="apiserver"})
labels:
job: apiserver
record: status_class_5xx:apiserver_request_total:ratio_rate1h
- expr: |
sum(status_class:apiserver_request_total:rate2h{job="apiserver",status_class="5xx"})
/
sum(status_class:apiserver_request_total:rate2h{job="apiserver"})
labels:
job: apiserver
record: status_class_5xx:apiserver_request_total:ratio_rate2h
- expr: |
sum(status_class:apiserver_request_total:rate6h{job="apiserver",status_class="5xx"})
/
sum(status_class:apiserver_request_total:rate6h{job="apiserver"})
labels:
job: apiserver
record: status_class_5xx:apiserver_request_total:ratio_rate6h
- expr: |
sum(status_class:apiserver_request_total:rate1d{job="apiserver",status_class="5xx"})
/
sum(status_class:apiserver_request_total:rate1d{job="apiserver"})
labels:
job: apiserver
record: status_class_5xx:apiserver_request_total:ratio_rate1d
- expr: |
sum(status_class:apiserver_request_total:rate3d{job="apiserver",status_class="5xx"})
/
sum(status_class:apiserver_request_total:rate3d{job="apiserver"})
labels:
job: apiserver
record: status_class_5xx:apiserver_request_total:ratio_rate3d
- name: kube-apiserver.rules
rules:
- expr: |
(
(
# too slow
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[1d]))
-
(
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[1d])) +
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[1d])) +
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[1d]))
)
)
+
# errors
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1d]))
)
/
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1d]))
labels:
verb: read
record: apiserver_request:burnrate1d
- expr: |
(
(
# too slow
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[1h]))
-
(
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[1h])) +
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[1h])) +
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[1h]))
)
)
+
# errors
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1h]))
)
/
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1h]))
labels:
verb: read
record: apiserver_request:burnrate1h
- expr: |
(
(
# too slow
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[2h]))
-
(
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[2h])) +
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[2h])) +
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[2h]))
)
)
+
# errors
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[2h]))
)
/
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[2h]))
labels:
verb: read
record: apiserver_request:burnrate2h
- expr: |
(
(
# too slow
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[30m]))
-
(
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[30m])) +
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[30m])) +
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[30m]))
)
)
+
# errors
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[30m]))
)
/
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[30m]))
labels:
verb: read
record: apiserver_request:burnrate30m
- expr: |
(
(
# too slow
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[3d]))
-
(
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[3d])) +
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[3d])) +
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[3d]))
)
)
+
# errors
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[3d]))
)
/
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[3d]))
labels:
verb: read
record: apiserver_request:burnrate3d
- expr: |
(
(
# too slow
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[5m]))
-
(
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[5m])) +
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[5m])) +
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[5m]))
)
)
+
# errors
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[5m]))
)
/
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m]))
labels:
verb: read
record: apiserver_request:burnrate5m
- expr: |
(
(
# too slow
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[6h]))
-
(
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[6h])) +
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[6h])) +
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[6h]))
)
)
+
# errors
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[6h]))
)
/
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[6h]))
labels:
verb: read
record: apiserver_request:burnrate6h
- expr: |
(
(
# too slow
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d]))
-
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[1d]))
)
+
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1d]))
)
/
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d]))
labels:
verb: write
record: apiserver_request:burnrate1d
- expr: |
(
(
# too slow
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h]))
-
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[1h]))
)
+
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1h]))
)
/
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h]))
labels:
verb: write
record: apiserver_request:burnrate1h
- expr: |
(
(
# too slow
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h]))
-
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[2h]))
)
+
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[2h]))
)
/
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h]))
labels:
verb: write
record: apiserver_request:burnrate2h
- expr: |
(
(
# too slow
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m]))
-
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[30m]))
)
+
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[30m]))
)
/
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m]))
labels:
verb: write
record: apiserver_request:burnrate30m
- expr: |
(
(
# too slow
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d]))
-
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[3d]))
)
+
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[3d]))
)
/
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d]))
labels:
verb: write
record: apiserver_request:burnrate3d
- expr: |
(
(
# too slow
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))
-
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[5m]))
)
+
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[5m]))
)
/
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))
labels:
verb: write
record: apiserver_request:burnrate5m
- expr: |
(
(
# too slow
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h]))
-
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[6h]))
)
+
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[6h]))
)
/
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h]))
labels:
verb: write
record: apiserver_request:burnrate6h
- expr: |
sum by (code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m]))
labels:
verb: read
record: code_resource:apiserver_request_total:rate5m
- expr: |
sum by (code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))
labels:
verb: write
record: code_resource:apiserver_request_total:rate5m
- expr: |
histogram_quantile(0.99, sum by (le, resource) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET"}[5m]))) > 0
labels:
quantile: "0.99"
verb: read
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
- expr: |
histogram_quantile(0.99, sum by (le, resource) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))) > 0
labels:
quantile: "0.99"
verb: write
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
- expr: |
sum(rate(apiserver_request_duration_seconds_sum{subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod)
/
@@ -201,6 +368,153 @@ spec:
labels:
quantile: "0.5"
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
- interval: 3m
name: kube-apiserver-availability.rules
rules:
- expr: |
1 - (
(
# write too slow
sum(increase(apiserver_request_duration_seconds_count{verb=~"POST|PUT|PATCH|DELETE"}[30d]))
-
sum(increase(apiserver_request_duration_seconds_bucket{verb=~"POST|PUT|PATCH|DELETE",le="1"}[30d]))
) +
(
# read too slow
sum(increase(apiserver_request_duration_seconds_count{verb=~"LIST|GET"}[30d]))
-
(
sum(increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope=~"resource|",le="0.1"}[30d])) +
sum(increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope="namespace",le="0.5"}[30d])) +
sum(increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope="cluster",le="5"}[30d]))
)
) +
# errors
sum(code:apiserver_request_total:increase30d{code=~"5.."} or vector(0))
)
/
sum(code:apiserver_request_total:increase30d)
labels:
verb: all
record: apiserver_request:availability30d
- expr: |
1 - (
sum(increase(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[30d]))
-
(
# too slow
sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[30d])) +
sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[30d])) +
sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[30d]))
)
+
# errors
sum(code:apiserver_request_total:increase30d{verb="read",code=~"5.."} or vector(0))
)
/
sum(code:apiserver_request_total:increase30d{verb="read"})
labels:
verb: read
record: apiserver_request:availability30d
- expr: |
1 - (
(
# too slow
sum(increase(apiserver_request_duration_seconds_count{verb=~"POST|PUT|PATCH|DELETE"}[30d]))
-
sum(increase(apiserver_request_duration_seconds_bucket{verb=~"POST|PUT|PATCH|DELETE",le="1"}[30d]))
)
+
# errors
sum(code:apiserver_request_total:increase30d{verb="write",code=~"5.."} or vector(0))
)
/
sum(code:apiserver_request_total:increase30d{verb="write"})
labels:
verb: write
record: apiserver_request:availability30d
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"2.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"2.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"2.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"2.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"2.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"2.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"3.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"3.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"3.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"3.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"3.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"3.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"4.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"4.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"4.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"4.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"4.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"4.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"5.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"5.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"5.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"5.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"5.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
- expr: |
sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"5.."}[30d]))
record: code_verb:apiserver_request_total:increase30d
- expr: |
sum by (code) (code_verb:apiserver_request_total:increase30d{verb=~"LIST|GET"})
labels:
verb: read
record: code:apiserver_request_total:increase30d
- expr: |
sum by (code) (code_verb:apiserver_request_total:increase30d{verb=~"POST|PUT|PATCH|DELETE"})
labels:
verb: write
record: code:apiserver_request_total:increase30d
- name: k8s.rules
rules:
- expr: |
@@ -607,13 +921,22 @@ spec:
severity: warning
- alert: NodeHighNumberConntrackEntriesUsed
annotations:
description: '{{ $value | humanizePercentage }} of conntrack entries are used'
description: '{{ $value | humanizePercentage }} of conntrack entries are used.'
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodehighnumberconntrackentriesused
summary: Number of conntrack are getting close to the limit
summary: Number of conntrack are getting close to the limit.
expr: |
(node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75
labels:
severity: warning
- alert: NodeTextFileCollectorScrapeError
annotations:
description: Node Exporter text file collector failed to scrape.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodetextfilecollectorscrapeerror
summary: Node Exporter text file collector failed to scrape.
expr: |
node_textfile_scrape_error{job="node-exporter"} == 1
labels:
severity: warning
- alert: NodeClockSkewDetected
annotations:
message: Clock on {{ $labels.instance }} is out of sync by more than 300s.
@@ -657,7 +980,7 @@ spec:
rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[15m]) * 60 * 5 > 0
for: 15m
labels:
severity: critical
severity: warning
- alert: KubePodNotReady
annotations:
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready
@@ -667,7 +990,7 @@ spec:
sum by (namespace, pod) (max by(namespace, pod) (kube_pod_status_phase{job="kube-state-metrics", phase=~"Pending|Unknown"}) * on(namespace, pod) group_left(owner_kind) max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!="Job"})) > 0
for: 15m
labels:
severity: critical
severity: warning
- alert: KubeDeploymentGenerationMismatch
annotations:
message: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment
@@ -680,7 +1003,7 @@ spec:
kube_deployment_metadata_generation{job="kube-state-metrics"}
for: 15m
labels:
severity: critical
severity: warning
- alert: KubeDeploymentReplicasMismatch
annotations:
message: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not
@@ -698,7 +1021,7 @@ spec:
)
for: 15m
labels:
severity: critical
severity: warning
- alert: KubeStatefulSetReplicasMismatch
annotations:
message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has
@@ -716,7 +1039,7 @@ spec:
)
for: 15m
labels:
severity: critical
severity: warning
- alert: KubeStatefulSetGenerationMismatch
annotations:
message: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset
@@ -729,7 +1052,7 @@ spec:
kube_statefulset_metadata_generation{job="kube-state-metrics"}
for: 15m
labels:
severity: critical
severity: warning
- alert: KubeStatefulSetUpdateNotRolledOut
annotations:
message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update
@@ -749,7 +1072,7 @@ spec:
)
for: 15m
labels:
severity: critical
severity: warning
- alert: KubeDaemonSetRolloutStuck
annotations:
message: Only {{ $value | humanizePercentage }} of the desired Pods of DaemonSet
@@ -761,7 +1084,7 @@ spec:
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} < 1.00
for: 15m
labels:
severity: critical
severity: warning
- alert: KubeContainerWaiting
annotations:
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} container {{ $labels.container}}
@@ -791,7 +1114,7 @@ spec:
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetmisscheduled
expr: |
kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} > 0
for: 10m
for: 15m
labels:
severity: warning
- alert: KubeCronJobRunning
@@ -934,12 +1257,12 @@ spec:
severity: warning
- name: kubernetes-storage
rules:
- alert: KubePersistentVolumeUsageCritical
- alert: KubePersistentVolumeFillingUp
annotations:
message: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim
}} in Namespace {{ $labels.namespace }} is only {{ $value | humanizePercentage
}} free.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeusagecritical
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefillingup
expr: |
kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"}
/
@@ -948,12 +1271,12 @@ spec:
for: 1m
labels:
severity: critical
- alert: KubePersistentVolumeFullInFourDays
- alert: KubePersistentVolumeFillingUp
annotations:
message: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim
}} in Namespace {{ $labels.namespace }} is expected to fill up within four
days. Currently {{ $value | humanizePercentage }} is available.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefullinfourdays
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefillingup
expr: |
(
kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"}
@@ -964,7 +1287,7 @@ spec:
predict_linear(kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0
for: 1h
labels:
severity: critical
severity: warning
- alert: KubePersistentVolumeErrors
annotations:
message: The persistent volume {{ $labels.persistentvolume }} has status {{
@@ -1000,47 +1323,51 @@ spec:
for: 15m
labels:
severity: warning
- name: kube-apiserver-error-alerts
- name: kube-apiserver-slos
rules:
- alert: ErrorBudgetBurn
- alert: KubeAPIErrorBudgetBurn
annotations:
message: 'High requests error budget burn for job=apiserver (current value:
{{ $value }})'
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-errorbudgetburn
message: The API server is burning too much error budget
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn
expr: |
(
status_class_5xx:apiserver_request_total:ratio_rate1h{job="apiserver"} > (14.4*0.010000)
and
status_class_5xx:apiserver_request_total:ratio_rate5m{job="apiserver"} > (14.4*0.010000)
)
or
(
status_class_5xx:apiserver_request_total:ratio_rate6h{job="apiserver"} > (6*0.010000)
and
status_class_5xx:apiserver_request_total:ratio_rate30m{job="apiserver"} > (6*0.010000)
)
sum(apiserver_request:burnrate1h) > (14.40 * 0.01000)
and
sum(apiserver_request:burnrate5m) > (14.40 * 0.01000)
for: 2m
labels:
job: apiserver
severity: critical
- alert: ErrorBudgetBurn
- alert: KubeAPIErrorBudgetBurn
annotations:
message: 'High requests error budget burn for job=apiserver (current value:
{{ $value }})'
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-errorbudgetburn
message: The API server is burning too much error budget
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn
expr: |
(
status_class_5xx:apiserver_request_total:ratio_rate1d{job="apiserver"} > (3*0.010000)
and
status_class_5xx:apiserver_request_total:ratio_rate2h{job="apiserver"} > (3*0.010000)
)
or
(
status_class_5xx:apiserver_request_total:ratio_rate3d{job="apiserver"} > (0.010000)
and
status_class_5xx:apiserver_request_total:ratio_rate6h{job="apiserver"} > (0.010000)
)
sum(apiserver_request:burnrate6h) > (6.00 * 0.01000)
and
sum(apiserver_request:burnrate30m) > (6.00 * 0.01000)
for: 15m
labels:
severity: critical
- alert: KubeAPIErrorBudgetBurn
annotations:
message: The API server is burning too much error budget
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn
expr: |
sum(apiserver_request:burnrate1d) > (3.00 * 0.01000)
and
sum(apiserver_request:burnrate2h) > (3.00 * 0.01000)
for: 1h
labels:
severity: warning
- alert: KubeAPIErrorBudgetBurn
annotations:
message: The API server is burning too much error budget
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn
expr: |
sum(apiserver_request:burnrate3d) > (1.00 * 0.01000)
and
sum(apiserver_request:burnrate6h) > (1.00 * 0.01000)
for: 3h
labels:
job: apiserver
severity: warning
- name: kubernetes-system-apiserver
rules:
@@ -1068,29 +1395,6 @@ spec:
for: 5m
labels:
severity: warning
- alert: KubeAPILatencyHigh
annotations:
message: The API server has a 99th percentile latency of {{ $value }} seconds
for {{ $labels.verb }} {{ $labels.resource }}.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh
expr: |
cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{job="apiserver",quantile="0.99"} > 4
for: 10m
labels:
severity: critical
- alert: KubeAPIErrorsHigh
annotations:
message: API server is returning errors for {{ $value | humanizePercentage
}} of requests for {{ $labels.verb }} {{ $labels.resource }} {{ $labels.subresource
}}.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
expr: |
sum(rate(apiserver_request_total{job="apiserver",code=~"5.."}[5m])) by (resource,subresource,verb)
/
sum(rate(apiserver_request_total{job="apiserver"}[5m])) by (resource,subresource,verb) > 0.10
for: 10m
labels:
severity: critical
- alert: KubeAPIErrorsHigh
annotations:
message: API server is returning errors for {{ $value | humanizePercentage
@@ -1208,7 +1512,7 @@ spec:
on node {{ $labels.node }}.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletpodstartuplatencyhigh
expr: |
histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (instance, le)) * on(instance) group_left(node) kubelet_node_name > 60
histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (instance, le)) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"} > 60
for: 15m
labels:
severity: warning
@@ -1373,8 +1677,8 @@ spec:
- alert: PrometheusRemoteStorageFailures
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} failed to send
{{ printf "%.1f" $value }}% of the samples to {{ if $labels.queue }}{{ $labels.queue
}}{{ else }}{{ $labels.url }}{{ end }}.
{{ printf "%.1f" $value }}% of the samples to {{ $labels.remote_name}}:{{
$labels.url }}
summary: Prometheus fails to send samples to remote storage.
expr: |
(
@@ -1394,8 +1698,8 @@ spec:
- alert: PrometheusRemoteWriteBehind
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write
is {{ printf "%.1f" $value }}s behind for {{ if $labels.queue }}{{ $labels.queue
}}{{ else }}{{ $labels.url }}{{ end }}.
is {{ printf "%.1f" $value }}s behind for {{ $labels.remote_name}}:{{ $labels.url
}}.
summary: Prometheus remote write is behind.
expr: |
# Without max_over_time, failed scrapes could create false negatives, see
@@ -1412,8 +1716,9 @@ spec:
- alert: PrometheusRemoteWriteDesiredShards
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write
desired shards calculation wants to run {{ $value }} shards, which is more
than the max of {{ printf `prometheus_remote_storage_shards_max{instance="%s",job="prometheus-k8s",namespace="monitoring"}`
desired shards calculation wants to run {{ $value }} shards for queue {{
$labels.remote_name}}:{{ $labels.url }}, which is more than the max of {{
printf `prometheus_remote_storage_shards_max{instance="%s",job="prometheus-k8s",namespace="monitoring"}`
$labels.instance | query | first | value }}.
summary: Prometheus remote write desired shards calculation wants to run more
than configured max shards.
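The removed kube-apiserver-error group and ErrorBudgetBurn alerts are replaced by apiserver_request:burnrate* recording rules and the multi-window KubeAPIErrorBudgetBurn alerts, following the multi-window, multi-burn-rate SLO pattern. As a worked check of the fastest pair: with a 99.0% availability target the error budget is 0.01, so the alert fires when both the 1h and 5m burn rates exceed 14.4 * 0.01, i.e. a 14.4% error ratio, which consumes 14.4 * (1 h / 720 h) = 2% of the 30-day budget per hour; the shorter 5m window just confirms the burn is still ongoing rather than already recovered.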


@@ -612,6 +612,11 @@ spec:
items:
type: string
type: array
alertQueryUrl:
description: The external Query URL the Thanos Ruler will set in the
'Source' field of all alerts. Maps to the '--alert.query-url' CLI
arg.
type: string
alertmanagersConfig:
description: Define configuration for connecting to alertmanager. Only
available with thanos v0.10.0 and higher. Maps to the `alertmanagers.config`


@@ -4,7 +4,7 @@ metadata:
labels:
app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/version: v0.38.0
app.kubernetes.io/version: v0.38.1
name: prometheus-operator
rules:
- apiGroups:


@@ -4,7 +4,7 @@ metadata:
labels:
app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/version: v0.38.0
app.kubernetes.io/version: v0.38.1
name: prometheus-operator
roleRef:
apiGroup: rbac.authorization.k8s.io


@@ -4,7 +4,7 @@ metadata:
labels:
app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/version: v0.38.0
app.kubernetes.io/version: v0.38.1
name: prometheus-operator
namespace: monitoring
spec:
@@ -18,15 +18,15 @@ spec:
labels:
app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/version: v0.38.0
app.kubernetes.io/version: v0.38.1
spec:
containers:
- args:
- --kubelet-service=kube-system/kubelet
- --logtostderr=true
- --config-reloader-image=jimmidyson/configmap-reload:v0.3.0
- --prometheus-config-reloader=quay.io/coreos/prometheus-config-reloader:v0.38.0
image: quay.io/coreos/prometheus-operator:v0.38.0
- --prometheus-config-reloader=quay.io/coreos/prometheus-config-reloader:v0.38.1
image: quay.io/coreos/prometheus-operator:v0.38.1
name: prometheus-operator
ports:
- containerPort: 8080


@@ -4,7 +4,7 @@ metadata:
labels:
app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/version: v0.38.0
app.kubernetes.io/version: v0.38.1
name: prometheus-operator
namespace: monitoring
spec:


@@ -4,6 +4,6 @@ metadata:
labels:
app.kubernetes.io/component: controller
app.kubernetes.io/name: prometheus-operator
app.kubernetes.io/version: v0.38.0
app.kubernetes.io/version: v0.38.1
name: prometheus-operator
namespace: monitoring

monitoring/vendor/etcd-mixin (vendored symbolic link, 1 line)

@@ -0,0 +1 @@
github.com/coreos/etcd/Documentation/etcd-mixin


@@ -217,7 +217,7 @@
grafanaDashboards+:: {
'etcd.json': {
id: 6,
uid: std.md5('etcd.json'),
title: 'etcd',
description: 'etcd sample Grafana dashboard with Prometheus',
tags: [],
@@ -516,7 +516,7 @@
stack: false,
steppedLine: false,
targets: [{
expr: 'etcd_debugging_mvcc_db_total_size_in_bytes{job="$cluster"}',
expr: 'etcd_mvcc_db_total_size_in_bytes{job="$cluster"}',
hide: false,
interval: '',
intervalFactor: 2,
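Two things change in the vendored etcd dashboard: the hard-coded numeric id gives way to a uid derived deterministically from the file name, and the DB-size panel moves from etcd_debugging_mvcc_db_total_size_in_bytes to etcd_mvcc_db_total_size_in_bytes, the non-debugging name newer etcd releases expose. A sketch of the uid derivation, using only the jsonnet standard library:

// std.md5 returns a 32-character hex digest, so every dashboard gets
// a uid that is stable across regenerations of the manifests.
{ uid: std.md5('etcd.json') }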


@@ -0,0 +1,89 @@
{
"version": 1,
"dependencies": [
{
"source": {
"git": {
"remote": "https://github.com/brancz/kubernetes-grafana",
"subdir": "grafana"
}
},
"version": "master"
},
{
"source": {
"git": {
"remote": "https://github.com/coreos/etcd",
"subdir": "Documentation/etcd-mixin"
}
},
"version": "master"
},
{
"source": {
"git": {
"remote": "https://github.com/coreos/prometheus-operator",
"subdir": "jsonnet/prometheus-operator"
}
},
"version": "release-0.38"
},
{
"source": {
"git": {
"remote": "https://github.com/ksonnet/ksonnet-lib",
"subdir": ""
}
},
"version": "master",
"name": "ksonnet"
},
{
"source": {
"git": {
"remote": "https://github.com/kubernetes-monitoring/kubernetes-mixin",
"subdir": ""
}
},
"version": "master"
},
{
"source": {
"git": {
"remote": "https://github.com/kubernetes/kube-state-metrics",
"subdir": "jsonnet/kube-state-metrics"
}
},
"version": "master"
},
{
"source": {
"git": {
"remote": "https://github.com/kubernetes/kube-state-metrics",
"subdir": "jsonnet/kube-state-metrics-mixin"
}
},
"version": "master"
},
{
"source": {
"git": {
"remote": "https://github.com/prometheus/node_exporter",
"subdir": "docs/node-mixin"
}
},
"version": "master"
},
{
"source": {
"git": {
"remote": "https://github.com/prometheus/prometheus",
"subdir": "documentation/prometheus-mixin"
}
},
"version": "master",
"name": "prometheus"
}
],
"legacyImports": true
}


@@ -0,0 +1,179 @@
local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
// Custom metrics API allows the HPA v2 to scale based on arbitrary metrics.
// For more details on usage visit https://github.com/DirectXMan12/k8s-prometheus-adapter#quick-links
{
_config+:: {
prometheusAdapter+:: {
// Rules for custom-metrics
config+:: {
rules+: [
{
seriesQuery: '{__name__=~"^container_.*",container!="POD",namespace!="",pod!=""}',
seriesFilters: [],
resources: {
overrides: {
namespace: {
resource: 'namespace'
},
pod: {
resource: 'pod'
}
},
},
name: {
matches: '^container_(.*)_seconds_total$',
as: ""
},
metricsQuery: 'sum(rate(<<.Series>>{<<.LabelMatchers>>,container!="POD"}[1m])) by (<<.GroupBy>>)'
},
{
seriesQuery: '{__name__=~"^container_.*",container!="POD",namespace!="",pod!=""}',
seriesFilters: [
{ isNot: '^container_.*_seconds_total$' },
],
resources: {
overrides: {
namespace: {
resource: 'namespace'
},
pod: {
resource: 'pod'
}
},
},
name: {
matches: '^container_(.*)_total$',
as: ''
},
metricsQuery: 'sum(rate(<<.Series>>{<<.LabelMatchers>>,container!="POD"}[1m])) by (<<.GroupBy>>)'
},
{
seriesQuery: '{__name__=~"^container_.*",container!="POD",namespace!="",pod!=""}',
seriesFilters: [
{ isNot: '^container_.*_total$' },
],
resources: {
overrides: {
namespace: {
resource: 'namespace'
},
pod: {
resource: 'pod'
}
},
},
name: {
matches: '^container_(.*)$',
as: ''
},
metricsQuery: 'sum(<<.Series>>{<<.LabelMatchers>>,container!="POD"}) by (<<.GroupBy>>)'
},
{
seriesQuery: '{namespace!="",__name__!~"^container_.*"}',
seriesFilters: [
{ isNot: '.*_total$' },
],
resources: {
template: '<<.Resource>>'
},
name: {
matches: '',
as: ''
},
metricsQuery: 'sum(<<.Series>>{<<.LabelMatchers>>}) by (<<.GroupBy>>)'
},
{
seriesQuery: '{namespace!="",__name__!~"^container_.*"}',
seriesFilters: [
{ isNot: '.*_seconds_total' },
],
resources: {
template: '<<.Resource>>'
},
name: {
matches: '^(.*)_total$',
as: ''
},
metricsQuery: 'sum(rate(<<.Series>>{<<.LabelMatchers>>}[1m])) by (<<.GroupBy>>)'
},
{
seriesQuery: '{namespace!="",__name__!~"^container_.*"}',
seriesFilters: [],
resources: {
template: '<<.Resource>>'
},
name: {
matches: '^(.*)_seconds_total$',
as: ''
},
metricsQuery: 'sum(rate(<<.Series>>{<<.LabelMatchers>>}[1m])) by (<<.GroupBy>>)'
}
],
},
},
},
prometheusAdapter+:: {
customMetricsApiService: {
apiVersion: 'apiregistration.k8s.io/v1',
kind: 'APIService',
metadata: {
name: 'v1beta1.custom.metrics.k8s.io',
},
spec: {
service: {
name: $.prometheusAdapter.service.metadata.name,
namespace: $._config.namespace,
},
group: 'custom.metrics.k8s.io',
version: 'v1beta1',
insecureSkipTLSVerify: true,
groupPriorityMinimum: 100,
versionPriority: 100,
},
},
customMetricsClusterRoleServerResources:
local clusterRole = k.rbac.v1.clusterRole;
local policyRule = clusterRole.rulesType;
local rules =
policyRule.new() +
policyRule.withApiGroups(['custom.metrics.k8s.io']) +
policyRule.withResources(['*']) +
policyRule.withVerbs(['*']);
clusterRole.new() +
clusterRole.mixin.metadata.withName('custom-metrics-server-resources') +
clusterRole.withRules(rules),
customMetricsClusterRoleBindingServerResources:
local clusterRoleBinding = k.rbac.v1.clusterRoleBinding;
clusterRoleBinding.new() +
clusterRoleBinding.mixin.metadata.withName('custom-metrics-server-resources') +
clusterRoleBinding.mixin.roleRef.withApiGroup('rbac.authorization.k8s.io') +
clusterRoleBinding.mixin.roleRef.withName('custom-metrics-server-resources') +
clusterRoleBinding.mixin.roleRef.mixinInstance({ kind: 'ClusterRole' }) +
clusterRoleBinding.withSubjects([{
kind: 'ServiceAccount',
name: $.prometheusAdapter.serviceAccount.metadata.name,
namespace: $._config.namespace,
}]),
customMetricsClusterRoleBindingHPA:
local clusterRoleBinding = k.rbac.v1.clusterRoleBinding;
clusterRoleBinding.new() +
clusterRoleBinding.mixin.metadata.withName('hpa-controller-custom-metrics') +
clusterRoleBinding.mixin.roleRef.withApiGroup('rbac.authorization.k8s.io') +
clusterRoleBinding.mixin.roleRef.withName('custom-metrics-server-resources') +
clusterRoleBinding.mixin.roleRef.mixinInstance({ kind: 'ClusterRole' }) +
clusterRoleBinding.withSubjects([{
kind: 'ServiceAccount',
name: 'horizontal-pod-autoscaler',
namespace: 'kube-system',
}]),
}
}
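With these adapter rules and the v1beta1.custom.metrics.k8s.io APIService registered, an HPA v2 can scale on any exposed series. A hypothetical example, written as jsonnet for consistency; the app name and metric are placeholders, and http_requests would be derived from a counter named http_requests_total by the ^(.*)_total$ rule above:

{
  apiVersion: 'autoscaling/v2beta1',
  kind: 'HorizontalPodAutoscaler',
  metadata: { name: 'example-app', namespace: 'default' },
  spec: {
    scaleTargetRef: { apiVersion: 'apps/v1', kind: 'Deployment', name: 'example-app' },
    minReplicas: 1,
    maxReplicas: 10,
    metrics: [{
      type: 'Pods',
      // Average 500m = 0.5 requests/s per pod before scaling out.
      pods: { metricName: 'http_requests', targetAverageValue: '500m' },
    }],
  },
}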


@@ -3,6 +3,12 @@ local service = k.core.v1.service;
local servicePort = k.core.v1.service.mixin.spec.portsType;
{
_config+:: {
eks: {
minimumAvailableIPs: 10,
minimumAvailableIPsTime: '10m'
}
},
prometheus+: {
serviceMonitorCoreDNS+: {
spec+: {
@@ -59,14 +65,14 @@ local servicePort = k.core.v1.service.mixin.spec.portsType;
name: 'kube-prometheus-eks.rules',
rules: [
{
expr: 'sum by(instance) (awscni_total_ip_addresses) - sum by(instance) (awscni_assigned_ip_addresses) < 10',
expr: 'sum by(instance) (awscni_total_ip_addresses) - sum by(instance) (awscni_assigned_ip_addresses) < %s' % $._config.eks.minimumAvailableIPs,
labels: {
severity: 'critical',
},
annotations: {
message: 'Instance {{ $labels.instance }} has less than 10 IPs available.'
},
'for': '10m',
'for': $._config.eks.minimumAvailableIPsTime,
alert: 'EksAvailableIPs'
},
],
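The EKS available-IP alert threshold and window are now read from _config instead of being hard-coded, so downstreams can tune them without patching the vendored file. A minimal override sketch (the addon import path is an assumption based on the usual kube-prometheus layout):

(import 'kube-prometheus/kube-prometheus.libsonnet') +
(import 'kube-prometheus/kube-prometheus-eks.libsonnet') +
{
  _config+:: {
    eks+: {
      minimumAvailableIPs: 25,         // alert earlier than the default 10
      minimumAvailableIPsTime: '15m',  // tolerate longer dips than the default 10m
    },
  },
}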


@@ -20,6 +20,11 @@ local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
'app.kubernetes.io/name': 'node-exporter',
'app.kubernetes.io/version': $._config.versions.nodeExporter,
},
selectorLabels: {
[labelName]: $._config.nodeExporter.labels[labelName]
for labelName in std.objectFields($._config.nodeExporter.labels)
if !std.setMember(labelName, ['app.kubernetes.io/version'])
},
},
},
@@ -69,6 +74,7 @@ local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
local containerEnv = container.envType;
local podLabels = $._config.nodeExporter.labels;
local selectorLabels = $._config.nodeExporter.selectorLabels;
local existsToleration = toleration.new() +
toleration.withOperator('Exists');
@@ -133,7 +139,7 @@ local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
daemonset.mixin.metadata.withName('node-exporter') +
daemonset.mixin.metadata.withNamespace($._config.namespace) +
daemonset.mixin.metadata.withLabels(podLabels) +
daemonset.mixin.spec.selector.withMatchLabels(podLabels) +
daemonset.mixin.spec.selector.withMatchLabels(selectorLabels) +
daemonset.mixin.spec.template.metadata.withLabels(podLabels) +
daemonset.mixin.spec.template.spec.withTolerations([existsToleration]) +
daemonset.mixin.spec.template.spec.withNodeSelector({ 'kubernetes.io/os': 'linux' }) +
@@ -163,7 +169,7 @@ local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
spec: {
jobLabel: 'app.kubernetes.io/name',
selector: {
matchLabels: $._config.nodeExporter.labels,
matchLabels: $._config.nodeExporter.selectorLabels,
},
endpoints: [
{
@@ -194,7 +200,7 @@ local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
local nodeExporterPort = servicePort.newNamed('https', $._config.nodeExporter.port, 'https');
service.new('node-exporter', $.nodeExporter.daemonset.spec.selector.matchLabels, nodeExporterPort) +
service.new('node-exporter', $._config.nodeExporter.selectorLabels, nodeExporterPort) +
service.mixin.metadata.withNamespace($._config.namespace) +
service.mixin.metadata.withLabels($._config.nodeExporter.labels) +
service.mixin.spec.withClusterIp('None'),


@@ -16,34 +16,47 @@ local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
name: 'prometheus-adapter',
labels: { name: $._config.prometheusAdapter.name },
prometheusURL: 'http://prometheus-' + $._config.prometheus.name + '.' + $._config.namespace + '.svc:9090/',
config: |||
resourceRules:
cpu:
containerQuery: sum(irate(container_cpu_usage_seconds_total{<<.LabelMatchers>>,container!="POD",container!="",pod!=""}[5m])) by (<<.GroupBy>>)
nodeQuery: sum(1 - irate(node_cpu_seconds_total{mode="idle"}[5m]) * on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:{<<.LabelMatchers>>}) by (<<.GroupBy>>)
resources:
overrides:
node:
resource: node
namespace:
resource: namespace
pod:
resource: pod
containerLabel: container
memory:
containerQuery: sum(container_memory_working_set_bytes{<<.LabelMatchers>>,container!="POD",container!="",pod!=""}) by (<<.GroupBy>>)
nodeQuery: sum(node_memory_MemTotal_bytes{job="node-exporter",<<.LabelMatchers>>} - node_memory_MemAvailable_bytes{job="node-exporter",<<.LabelMatchers>>}) by (<<.GroupBy>>)
resources:
overrides:
instance:
resource: node
namespace:
resource: namespace
pod:
resource: pod
containerLabel: container
window: 5m
|||,
config: {
resourceRules: {
cpu: {
containerQuery: 'sum(irate(container_cpu_usage_seconds_total{<<.LabelMatchers>>,container!="POD",container!="",pod!=""}[5m])) by (<<.GroupBy>>)',
nodeQuery: 'sum(1 - irate(node_cpu_seconds_total{mode="idle"}[5m]) * on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:{<<.LabelMatchers>>}) by (<<.GroupBy>>)',
resources: {
overrides: {
node: {
resource: 'node'
},
namespace: {
resource: 'namespace'
},
pod: {
resource: 'pod'
},
},
},
containerLabel: 'container'
},
memory: {
containerQuery: 'sum(container_memory_working_set_bytes{<<.LabelMatchers>>,container!="POD",container!="",pod!=""}) by (<<.GroupBy>>)',
nodeQuery: 'sum(node_memory_MemTotal_bytes{job="node-exporter",<<.LabelMatchers>>} - node_memory_MemAvailable_bytes{job="node-exporter",<<.LabelMatchers>>}) by (<<.GroupBy>>)',
resources: {
overrides: {
instance: {
resource: 'node'
},
namespace: {
resource: 'namespace'
},
pod: {
resource: 'pod'
},
},
},
containerLabel: 'container'
},
window: '5m',
},
}
},
},
@@ -70,8 +83,8 @@ local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
configMap:
local configmap = k.core.v1.configMap;
configmap.new('adapter-config', { 'config.yaml': std.manifestYamlDoc($._config.prometheusAdapter.config) }) +
configmap.new('adapter-config', { 'config.yaml': $._config.prometheusAdapter.config }) +
configmap.mixin.metadata.withNamespace($._config.namespace),
service:


@@ -6,7 +6,7 @@ local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
namespace: 'default',
versions+:: {
prometheus: 'v2.15.2',
prometheus: 'v2.17.2',
},
imageRepos+:: {
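Because the component versions live under versions+::, a consumer can pin or roll back a version without touching vendor/. A minimal sketch, assuming the standard kube-prometheus entry point:

(import 'kube-prometheus/kube-prometheus.libsonnet') +
{
  _config+:: {
    versions+:: {
      prometheus: 'v2.17.2',  // override independently of the vendored default
    },
  },
}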


@@ -15,7 +15,7 @@ local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
},
versions+:: {
prometheusOperator: 'v0.38.0',
prometheusOperator: 'v0.38.1',
prometheusConfigReloader: self.prometheusOperator,
configmapReloader: 'v0.3.0',
},

File diff suppressed because one or more lines are too long


@@ -0,0 +1,27 @@
{
new(
title='',
span=null,
show='current',
limit=10,
sortOrder=1,
stateFilter=[],
onlyAlertsOnDashboard=true,
transparent=null,
description=null,
datasource=null,
)::
{
[if transparent != null then 'transparent']: transparent,
title: title,
[if span != null then 'span']: span,
type: 'alertlist',
show: show,
limit: limit,
sortOrder: sortOrder,
[if show != 'changes' then 'stateFilter']: stateFilter,
onlyAlertsOnDashboard: onlyAlertsOnDashboard,
[if description != null then 'description']: description,
datasource: datasource,
},
}


@@ -3,6 +3,8 @@
title,
datasource=null,
calc='mean',
time_from=null,
span=null,
description='',
height=null,
transparent=null,
@@ -11,6 +13,8 @@
[if description != '' then 'description']: description,
[if height != null then 'height']: height,
[if transparent != null then 'transparent']: transparent,
[if time_from != null then 'timeFrom']: time_from,
[if span != null then 'span']: span,
title: title,
type: 'gauge',
datasource: datasource,


@@ -1,4 +1,5 @@
{
alertlist:: import 'alertlist.libsonnet',
dashboard:: import 'dashboard.libsonnet',
template:: import 'template.libsonnet',
text:: import 'text.libsonnet',
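The new alertlist panel is exported from grafana.libsonnet alongside the existing panel types; a minimal usage sketch:

local grafana = import 'grafonnet/grafana.libsonnet';
// Builds an alert-list panel object that can be added to any dashboard.
grafana.alertlist.new(title='Firing alerts', show='current', limit=5)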


@@ -128,7 +128,7 @@
mode: x_axis_mode,
name: null,
values: if x_axis_mode == 'series' then [x_axis_values] else [],
buckets: if x_axis_mode == 'histogram' then [x_axis_buckets] else null,
buckets: if x_axis_mode == 'histogram' then x_axis_buckets else null,
[if x_axis_min != null then 'min']: x_axis_min,
[if x_axis_max != null then 'max']: x_axis_max,
},


@@ -18,6 +18,7 @@
* @param color_mode How to display difference in frequency with color, default 'opacity'
* @param dataFormat How to format the data, default is 'timeseries'
* @param highlightCards TODO: document
* @param hideZeroBuckets Whether or not to hide empty buckets, default is false
* @param legend_show Show legend
* @param minSpan Minimum span of the panel when repeated on a template variable
* @param repeat Variable used to repeat the heatmap panel
@@ -54,6 +55,7 @@
color_mode='spectrum',
dataFormat='timeseries',
highlightCards=true,
hideZeroBuckets=false,
legend_show=false,
minSpan=null,
span=null,
@@ -96,6 +98,7 @@
},
[if dataFormat != null then 'dataFormat']: dataFormat,
heatmap: {},
hideZeroBuckets: hideZeroBuckets,
highlightCards: highlightCards,
legend: {
show: legend_show,
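hideZeroBuckets plumbs Grafana's option for hiding empty heatmap buckets through to the panel JSON; a minimal usage sketch (the panel title is a placeholder):

local grafana = import 'grafonnet/grafana.libsonnet';
grafana.heatmapPanel.new('Request latency', hideZeroBuckets=true)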

Some files were not shown because too many files have changed in this diff.