update monitoring
continuous-integration/drone/push Build is passing
Details
continuous-integration/drone/push Build is passing
Details
This commit is contained in:
parent
db6e205ae7
commit
b684265f0d
|
@ -8,8 +8,8 @@
|
|||
"subdir": "grafana"
|
||||
}
|
||||
},
|
||||
"version": "57b4365eacda291b82e0d55ba7eec573a8198dda",
|
||||
"sum": "92DWADwGjnCfpZaL7Q07C0GZayxBziGla/O03qWea34="
|
||||
"version": "014301fd5f71d8305a395b2fb437089a7b1a3999",
|
||||
"sum": "RHtpk2c0CcliWyt6F4DIgwpi4cEfHADK7nAxIw6RTGs="
|
||||
},
|
||||
{
|
||||
"source": {
|
||||
|
@ -18,8 +18,8 @@
|
|||
"subdir": "Documentation/etcd-mixin"
|
||||
}
|
||||
},
|
||||
"version": "7f726db202a4285597c7076fee156e8b2737928f",
|
||||
"sum": "pk7mLpdUrHuJKkj2vhD6LGMU7P+oYYooBXAeZyZa398="
|
||||
"version": "9006d8d4f9d82f6cce6eb93d6f2dfe7c154fa05d",
|
||||
"sum": "Uv8ysXlEACF7BafoCkHnrBmJ2AHh/VldI5mm3BuMiy0="
|
||||
},
|
||||
{
|
||||
"source": {
|
||||
|
@ -28,8 +28,8 @@
|
|||
"subdir": "jsonnet/kube-prometheus"
|
||||
}
|
||||
},
|
||||
"version": "2c1fc1cc11547ca06a143fce6e430e4f7e0be294",
|
||||
"sum": "Srp/B6oh85sEpjZxWOVyoBciNn6oA1SkjgLX4hUxsIE="
|
||||
"version": "6771c9bcc287e8047510207a4ab60fa5e63e48fe",
|
||||
"sum": "52ukcsyazUhdJWb48PPGQQurdFrGE0xgKYE++yWO7aI="
|
||||
},
|
||||
{
|
||||
"source": {
|
||||
|
@ -38,7 +38,7 @@
|
|||
"subdir": "jsonnet/prometheus-operator"
|
||||
}
|
||||
},
|
||||
"version": "e31c69f9b5c6555e0f4a5c1f39d0f03182dd6b41",
|
||||
"version": "0dca0f21ffff72a063db8855b5d515e15ab0dccb",
|
||||
"sum": "WggWVWZ+CBEUThQCztSaRELbtqdXf9s3OFzf06HbYNA="
|
||||
},
|
||||
{
|
||||
|
@ -48,8 +48,8 @@
|
|||
"subdir": "grafonnet"
|
||||
}
|
||||
},
|
||||
"version": "5c6e8a8113486cdecd0961730aeaada3e6c69fe7",
|
||||
"sum": "tDuuSKE9f4Ew2bjBM33Rs6behLEAzkmKkShSt+jpAak="
|
||||
"version": "ad85aec356b4544a41f62ac8c32f8042c0ffc42e",
|
||||
"sum": "JHhSwlCa9A+AwG4o+YEXXFDbQ91iwwd9G/FoYnGhObw="
|
||||
},
|
||||
{
|
||||
"source": {
|
||||
|
@ -58,7 +58,7 @@
|
|||
"subdir": "grafana-builder"
|
||||
}
|
||||
},
|
||||
"version": "21b638f4e4922c0b6fde12120ed45d8ef803edc7",
|
||||
"version": "f2a35172b97a0c944c4a167bb1f6e688624602e4",
|
||||
"sum": "N65Fv0M2JvFE3GN8ZxP5xh1U5a314ey8geLAioJLzF8="
|
||||
},
|
||||
{
|
||||
|
@ -79,8 +79,8 @@
|
|||
"subdir": ""
|
||||
}
|
||||
},
|
||||
"version": "fba82a1c0bc225127b084e91bd142c99b1792cb6",
|
||||
"sum": "hJ5n6OeumIpKYuZQHwxL/rtpAJaW/qTFE9oOA8RWd7w="
|
||||
"version": "6eab5fe2dfde77494c0452ce7ec3ed3ff21d9631",
|
||||
"sum": "skD7Rm0m5lOQOn8IrnGEdJyhWUI7qsKPXwcci7Hjn0E="
|
||||
},
|
||||
{
|
||||
"source": {
|
||||
|
@ -89,7 +89,7 @@
|
|||
"subdir": "lib/promgrafonnet"
|
||||
}
|
||||
},
|
||||
"version": "fba82a1c0bc225127b084e91bd142c99b1792cb6",
|
||||
"version": "6eab5fe2dfde77494c0452ce7ec3ed3ff21d9631",
|
||||
"sum": "VhgBM39yv0f4bKv8VfGg4FXkg573evGDRalip9ypKbc="
|
||||
},
|
||||
{
|
||||
|
@ -99,7 +99,7 @@
|
|||
"subdir": "jsonnet/kube-state-metrics"
|
||||
}
|
||||
},
|
||||
"version": "d667979ed55ad1c4db44d331b51d646f5b903aa7",
|
||||
"version": "e43aaa6d6e3554d050ead73b4814566b771377d1",
|
||||
"sum": "cJjGZaLBjcIGrLHZLjRPU9c3KL+ep9rZTb9dbALSKqA="
|
||||
},
|
||||
{
|
||||
|
@ -109,7 +109,7 @@
|
|||
"subdir": "jsonnet/kube-state-metrics-mixin"
|
||||
}
|
||||
},
|
||||
"version": "d667979ed55ad1c4db44d331b51d646f5b903aa7",
|
||||
"version": "e43aaa6d6e3554d050ead73b4814566b771377d1",
|
||||
"sum": "o5avaguRsfFwYFNen00ZEsub1x4i8Z/ZZ2QoEjFMff8="
|
||||
},
|
||||
{
|
||||
|
@ -119,7 +119,7 @@
|
|||
"subdir": "docs/node-mixin"
|
||||
}
|
||||
},
|
||||
"version": "7ad86f7994d6f1f290a28fcc2e16e54193e5ab9e",
|
||||
"version": "503e4fc8486c0082d6bd8c53fad646bcfafeedf6",
|
||||
"sum": "3jFV2qsc/GZe2GADswTYqxxP2zGOiANTj73W/VNFGqc="
|
||||
},
|
||||
{
|
||||
|
@ -129,8 +129,8 @@
|
|||
"subdir": "documentation/prometheus-mixin"
|
||||
}
|
||||
},
|
||||
"version": "1861bf38f588b288bc66196aba1c2516f97aa90c",
|
||||
"sum": "lEzhZ8gllSfAO4kmXeTwl4W0anapIeFd5GCaCNuDe18=",
|
||||
"version": "348ff4285ffa59907c9d7fd7eb3cb7d748f42758",
|
||||
"sum": "TBq4SL7YsPInARbJqwz25JaBvvAegcnRCsuz3K9niWc=",
|
||||
"name": "prometheus"
|
||||
},
|
||||
{
|
||||
|
|
|
@ -7,7 +7,7 @@ metadata:
|
|||
namespace: monitoring
|
||||
spec:
|
||||
configSecret: alertmanager-tbrnt-config
|
||||
image: quay.io/prometheus/alertmanager:v0.20.0
|
||||
image: quay.io/prometheus/alertmanager:v0.21.0
|
||||
nodeSelector:
|
||||
kubernetes.io/os: linux
|
||||
replicas: 1
|
||||
|
@ -16,4 +16,4 @@ spec:
|
|||
runAsNonRoot: true
|
||||
runAsUser: 1000
|
||||
serviceAccountName: alertmanager-main
|
||||
version: v0.20.0
|
||||
version: v0.21.0
|
||||
|
|
|
@ -6,7 +6,7 @@ metadata:
|
|||
namespace: monitoring
|
||||
spec:
|
||||
encryptedData:
|
||||
alertmanager.yaml: AgAt4qHTMHgNlti5rXewe3FaQhwDdM9SuW2ZDqKWE+Pnwr4lx+tl/NDzt+UE7xaJWQJDr7pSMo787g5sbzy5v1OxYG6cOcaNK/n8h4SWXim+S91fwvgVaemZ4l6nB1IJdkctLnqOz9Q1/MtAdVzWMJxk3sxoQFounnWZGfzmVybRhdeg93ionzFHK7lpRVZqRBrQRUnbILqEKYBQKfpnj0HadjHggSBAIDIp0NiNidmEHpeDy3dCjjEujJNwvZtUQoepm7bQ5bsFrcC/HUiqgDCRgo/SGZknaCtZaydXV2IgmyNjI5P/IfwUWPOlhA52ebs2dEDswfzj2OOq7JAmlmzorTT7joO+R1OfwFTvHcdjar/fbTm0ZJg1vbx0asxbL5BsXwwTtQVyr7oX0k25xeKe81folZSwUEy+Is5LhoU3VxUlX/cakYeRgPrwEHjX9CB/xs5/zZUoGeF2GIq6JsrFh4P0ZwPclB2HkXzE9pcjekgTi+cZzZ0MJq97phN7etIlq3HvuBgF+xb6zyZ+vD3W/5HPksUrJQX/2+U1RO+AzZ8pwMgTI7bXYiVuGjfpm0s8yoeBb9e6FkDGwwsop4tiAfrxmanF41sdW9og/YPqIRGFYKyI2hASxWbREKgBwKzUgU8DhoF2YJtL47UV+qq+/q0E3okraUIMwdoERbORA6+YgQaACvefq5PbUrLFVsjsmgWWW3wYL78pt/IZ6DEvD9GvJ0YjGIyEqBrfnPqnZeHP4GzMjn2leXy8SfygmfZ4rd6dPDq6nznOwTRQ4wPzG945rKX8A6vaDm8x57gZm1uVQptALNNCd7T6QaDhj9ZUPcJBhi4YHJIEVdlk/Fuy7asUNOhnL2pCv60QlApZMKIemrD2kyAs6GXipfvjr4uWbs+WjIdGdRdrSErAwglwDULasyHQroLz6e17Z4NMM2HoTdw/cOM1C/3VYqWmLx67oo3BykL8mcQGVdTyDkr2M5631UqR22C9AOjpTqCLeGSo9vmXpudgdrOEv3ZNqBraIhz1nF//MFun6fkTCQENsrcO5pr/yl2hbPRAf5TIh7mFguEblOfCSPgLE8ELYSCQzfb8PTexFq9oKnhyr8mct0fgrp/JXFduLkj5ue6FCKe6H13GbW/igQ02BYZKJVLOr8anF1GtwOpzXZAgy6RZbZc6a1M97i21m1IhHbNiy8Xc5IFOEqg822wU1EV21OdBTL+eDAldVi/NTmRfemOta/m9khzKE6xYFSWuq/ulXRrojtashIqjqCPWxUPt+VaAjispmWT38mo+Bw2A7fxa787MfC7nfCOXpr8x/5xJMBROJSBA//HLRQCdgdXZgEeIWfh/tbdEDbofv53fk4oKZPKiUH+jNiEGZot+Qf4nz2NLCdymTysehiHYCYyDheztXHv898zkhOdM9Oqsk4Pfw7BpWd+LheiRmhqyPBnhvkWz5uHSZ8rwx7ixVXQaWntLtsLILDwlXXZXGhX/JjIGzKBgYe/Gxax8ce5B7NL53HZhmZAJ1PEzXeNCAfRlHRRwejy4S/SF5hu9LiZnPjjYsJuga7XDmdq28k7lC5580s8WfCyHVmTQIsEgSk79H/w0LHYPz/KoXz3jy19Uryzue1RKa1EPeXV2Uw6p06IZiIHOPGidpTcgoTGXCj1oELzhuQ3S4HfvPGhdufWARdcpVNs6WV5ms3EO9vTyeSftNulAsQ==
|
||||
alertmanager.yaml: AgBzDdfETjZXZ9hJSITSeLkdcoszO9O9tbf2srOAwr0VdE2+4PDinymJoNwojY0xWegaxz+7p383egqxJGgK1kirs4RINgLJEsGeOG46yQk656s8kF00/bV3u44C6MCiKs7bHF117Jg3eL3C7GjeoE+Dv4YHjehfUscF+OS6Ld96WPBIN5t1mxox7qd+ysw0KA2neYjPyYyfHw8y00160yopQizChGCBu2oG7RJPhctthcnnGSxskcVtxOdYhBKwwdEJ1SaPFNCGWs6wAy/NwtAJCcEQ2xBizbNA3GsXzK3zkUtWu7/a8ikjKDruwbb2DkjGxqRTJt3LPpntXPxcVB2HTysgZ02+InrZ7EmMCXW1HosXKQdztEYeaN2Ijtalx/+AHhv1iKA0pvEQj76rPzgXWfV+MkCUlBW+VIZiU8Kg8kqZQqLkfODGALwckPymR7kPzgUqbI2IFiN+6tAbyOOLlttoBtIcNsZzLcKmO59gke6bob0T2ZlIhRpUbZlz0l3kUNoQtn4jnriaFinyeKDcGyBhI/bKl01fEokSXkkRlKibyflGdYXLfTCURZeGn3KAxLwfILoojww8zfUfJClAXoleg01qeWuAfIidVy5KJBTfenHcgDq/o5/wP5bZjhQbT5lyUJoFdEn39j0kxusEAgvGNRrI287zj0E61z9bHK9Z7Rkve6l12awY90CxxmcW6KvNF88iejxTWtD89ti/U4FGHTbRpjXrS2YMk8Shv6QMHybRkk3gqxardmVycKW9DfUGrRLWnN58yw16EHaM828/Hr0+l87Ax73PfslE2SEjHPzdKGuhW24WSTXCHlPAwJv+NIx9+uWAcf8edaCfyB/uWgCNNNNszgNzyyi7EGrSuOmKguMIfqzs++KSXWSkrKnCxtco1Sscr60xtkb/gjHc0rn2hHJfU6bmxCP5naat+5xp1yVINb8gfG+ez+BeZdNBGBxbB32BNH7UtzBtwJGnBtDP5Y16q8Tq42nnFBsKy5lr/uwA0ESg7FxuuufcQ0f+jhkfxNBlUc7lxeLaJpeZJsfwlc6ns6rBTrPD0kGmdiusZTqzTd6kuibEk5/RdLnl6kme6OAxVgnU1bIs5l/j5ebmmvM6AbDXTqtuL6zY04M1QDlLJVldS3oS/fqsvAHe5u/W0Fgk5XN8naFpM0r7tfVzspNqyMch2ZrfFfQOO20QNrpzaln0/kknXcdu/pONxSyvaoVkdOgn6lJLOdpWtUAffSPWd3X+uOUB1KHEp1kCKJbXYNpy1qwUoHaLuX0S57MF/Syyx2UAPU5o8WEqgI1vlY6mo4OPjW6Yet1pKoI3CDgLhTf32a+nnxXfad/zR0yLp+ayw/R95VpdS/t0+FuWcgbFb2Jc16WSFTHXaBM0dUxOPBO/6uWfwLGd0eOHkFE2qniqAdkdki3oSQKHRCeI6De9ZGCEOsV4+z9nNJCGGY1wE/ICuD4vOCo9D3iDeNoxr1S8lqs4l2ssxuCOuyodfP6lAxWhznIT4qDbj4oIR/gi+3m5uweG6V2YDFqaLBmmn6PlOCJLc5dirJZDhED9vtg7gAMFRhvhp+DADfyRZRUXDGVwn/6J86DhKOKwZNcC0YPt04eEqOCuiVKIz74cQgeBbMsidt5R38d/gt1BV6y2/UGZd1aaBC45OOiSiNmnsojuJDuy+8iUNU1hOUgHuHUf1es+LKyi/sDm1g==
|
||||
template:
|
||||
metadata:
|
||||
creationTimestamp: null
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -12,6 +12,8 @@ spec:
|
|||
app: grafana
|
||||
template:
|
||||
metadata:
|
||||
annotations:
|
||||
checksum/grafana-datasources: 7103d054a6e94f976ca59b4ede77cf88
|
||||
labels:
|
||||
app: grafana
|
||||
spec:
|
||||
|
|
|
@ -30,7 +30,7 @@ spec:
|
|||
- args:
|
||||
- --logtostderr
|
||||
- --secure-listen-address=:8443
|
||||
- --tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_RSA_WITH_AES_128_CBC_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256,TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256
|
||||
- --tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305
|
||||
- --upstream=http://127.0.0.1:8081/
|
||||
image: quay.io/coreos/kube-rbac-proxy:v0.4.1
|
||||
name: kube-rbac-proxy-main
|
||||
|
@ -42,7 +42,7 @@ spec:
|
|||
- args:
|
||||
- --logtostderr
|
||||
- --secure-listen-address=:9443
|
||||
- --tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_RSA_WITH_AES_128_CBC_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256,TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256
|
||||
- --tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305
|
||||
- --upstream=http://127.0.0.1:8082/
|
||||
image: quay.io/coreos/kube-rbac-proxy:v0.4.1
|
||||
name: kube-rbac-proxy-self
|
||||
|
|
|
@ -48,7 +48,7 @@ spec:
|
|||
- args:
|
||||
- --logtostderr
|
||||
- --secure-listen-address=[$(IP)]:9100
|
||||
- --tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_RSA_WITH_AES_128_CBC_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256,TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256
|
||||
- --tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305
|
||||
- --upstream=http://127.0.0.1:9100/
|
||||
env:
|
||||
- name: IP
|
||||
|
|
|
@ -12,7 +12,7 @@ spec:
|
|||
namespace: monitoring
|
||||
port: web
|
||||
externalUrl: http://prometheus-k8s.monitoring:9090
|
||||
image: quay.io/prometheus/prometheus:v2.17.2
|
||||
image: quay.io/prometheus/prometheus:v2.20.0
|
||||
nodeSelector:
|
||||
kubernetes.io/os: linux
|
||||
podMonitorNamespaceSelector:
|
||||
|
@ -56,4 +56,4 @@ spec:
|
|||
requests:
|
||||
storage: 10Gi
|
||||
storageClassName: local-path
|
||||
version: v2.17.2
|
||||
version: v2.20.0
|
||||
|
|
|
@ -74,8 +74,14 @@ spec:
|
|||
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[1d]))
|
||||
-
|
||||
(
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[1d])) +
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[1d])) +
|
||||
(
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[1d]))
|
||||
or
|
||||
vector(0)
|
||||
)
|
||||
+
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[1d]))
|
||||
+
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[1d]))
|
||||
)
|
||||
)
|
||||
|
@ -95,8 +101,14 @@ spec:
|
|||
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[1h]))
|
||||
-
|
||||
(
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[1h])) +
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[1h])) +
|
||||
(
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[1h]))
|
||||
or
|
||||
vector(0)
|
||||
)
|
||||
+
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[1h]))
|
||||
+
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[1h]))
|
||||
)
|
||||
)
|
||||
|
@ -116,8 +128,14 @@ spec:
|
|||
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[2h]))
|
||||
-
|
||||
(
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[2h])) +
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[2h])) +
|
||||
(
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[2h]))
|
||||
or
|
||||
vector(0)
|
||||
)
|
||||
+
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[2h]))
|
||||
+
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[2h]))
|
||||
)
|
||||
)
|
||||
|
@ -137,8 +155,14 @@ spec:
|
|||
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[30m]))
|
||||
-
|
||||
(
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[30m])) +
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[30m])) +
|
||||
(
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[30m]))
|
||||
or
|
||||
vector(0)
|
||||
)
|
||||
+
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[30m]))
|
||||
+
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[30m]))
|
||||
)
|
||||
)
|
||||
|
@ -158,8 +182,14 @@ spec:
|
|||
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[3d]))
|
||||
-
|
||||
(
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[3d])) +
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[3d])) +
|
||||
(
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[3d]))
|
||||
or
|
||||
vector(0)
|
||||
)
|
||||
+
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[3d]))
|
||||
+
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[3d]))
|
||||
)
|
||||
)
|
||||
|
@ -179,8 +209,14 @@ spec:
|
|||
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[5m]))
|
||||
-
|
||||
(
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[5m])) +
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[5m])) +
|
||||
(
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[5m]))
|
||||
or
|
||||
vector(0)
|
||||
)
|
||||
+
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[5m]))
|
||||
+
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[5m]))
|
||||
)
|
||||
)
|
||||
|
@ -200,8 +236,14 @@ spec:
|
|||
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[6h]))
|
||||
-
|
||||
(
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[6h])) +
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[6h])) +
|
||||
(
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[6h]))
|
||||
or
|
||||
vector(0)
|
||||
)
|
||||
+
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[6h]))
|
||||
+
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[6h]))
|
||||
)
|
||||
)
|
||||
|
@ -384,8 +426,14 @@ spec:
|
|||
sum(increase(apiserver_request_duration_seconds_count{verb=~"LIST|GET"}[30d]))
|
||||
-
|
||||
(
|
||||
sum(increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope=~"resource|",le="0.1"}[30d])) +
|
||||
sum(increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope="namespace",le="0.5"}[30d])) +
|
||||
(
|
||||
sum(increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope=~"resource|",le="0.1"}[30d]))
|
||||
or
|
||||
vector(0)
|
||||
)
|
||||
+
|
||||
sum(increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope="namespace",le="0.5"}[30d]))
|
||||
+
|
||||
sum(increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope="cluster",le="5"}[30d]))
|
||||
)
|
||||
) +
|
||||
|
@ -403,8 +451,14 @@ spec:
|
|||
-
|
||||
(
|
||||
# too slow
|
||||
sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[30d])) +
|
||||
sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[30d])) +
|
||||
(
|
||||
sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[30d]))
|
||||
or
|
||||
vector(0)
|
||||
)
|
||||
+
|
||||
sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[30d]))
|
||||
+
|
||||
sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[30d]))
|
||||
)
|
||||
+
|
||||
|
@ -592,7 +646,7 @@ spec:
|
|||
)
|
||||
labels:
|
||||
workload_type: deployment
|
||||
record: mixin_pod_workload
|
||||
record: namespace_workload_pod:kube_pod_owner:relabel
|
||||
- expr: |
|
||||
max by (cluster, namespace, workload, pod) (
|
||||
label_replace(
|
||||
|
@ -602,7 +656,7 @@ spec:
|
|||
)
|
||||
labels:
|
||||
workload_type: daemonset
|
||||
record: mixin_pod_workload
|
||||
record: namespace_workload_pod:kube_pod_owner:relabel
|
||||
- expr: |
|
||||
max by (cluster, namespace, workload, pod) (
|
||||
label_replace(
|
||||
|
@ -612,7 +666,7 @@ spec:
|
|||
)
|
||||
labels:
|
||||
workload_type: statefulset
|
||||
record: mixin_pod_workload
|
||||
record: namespace_workload_pod:kube_pod_owner:relabel
|
||||
- name: kube-scheduler.rules
|
||||
rules:
|
||||
- expr: |
|
||||
|
@ -711,9 +765,6 @@ spec:
|
|||
- expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[3m])) BY
|
||||
(instance)
|
||||
record: instance:node_cpu:rate:sum
|
||||
- expr: sum((node_filesystem_size_bytes{mountpoint="/"} - node_filesystem_free_bytes{mountpoint="/"}))
|
||||
BY (instance)
|
||||
record: instance:node_filesystem_usage:sum
|
||||
- expr: sum(rate(node_network_receive_bytes_total[3m])) BY (instance)
|
||||
record: instance:node_network_receive_bytes:rate:sum
|
||||
- expr: sum(rate(node_network_transmit_bytes_total[3m])) BY (instance)
|
||||
|
@ -1087,13 +1138,33 @@ spec:
|
|||
severity: warning
|
||||
- alert: KubeDaemonSetRolloutStuck
|
||||
annotations:
|
||||
message: Only {{ $value | humanizePercentage }} of the desired Pods of DaemonSet
|
||||
{{ $labels.namespace }}/{{ $labels.daemonset }} are scheduled and ready.
|
||||
message: DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not
|
||||
finished or progressed for at least 15 minutes.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck
|
||||
expr: |
|
||||
kube_daemonset_status_number_ready{job="kube-state-metrics"}
|
||||
/
|
||||
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} < 1.00
|
||||
(
|
||||
(
|
||||
kube_daemonset_status_current_number_scheduled{job="kube-state-metrics"}
|
||||
!=
|
||||
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"}
|
||||
) or (
|
||||
kube_daemonset_status_number_misscheduled{job="kube-state-metrics"}
|
||||
!=
|
||||
0
|
||||
) or (
|
||||
kube_daemonset_updated_number_scheduled{job="kube-state-metrics"}
|
||||
!=
|
||||
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"}
|
||||
) or (
|
||||
kube_daemonset_status_number_available{job="kube-state-metrics"}
|
||||
!=
|
||||
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"}
|
||||
)
|
||||
) and (
|
||||
changes(kube_daemonset_updated_number_scheduled{job="kube-state-metrics"}[5m])
|
||||
==
|
||||
0
|
||||
)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
|
@ -1132,11 +1203,11 @@ spec:
|
|||
- alert: KubeJobCompletion
|
||||
annotations:
|
||||
message: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more
|
||||
than one hour to complete.
|
||||
than 12 hours to complete.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobcompletion
|
||||
expr: |
|
||||
kube_job_spec_completions{job="kube-state-metrics"} - kube_job_status_succeeded{job="kube-state-metrics"} > 0
|
||||
for: 1h
|
||||
for: 12h
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeJobFailed
|
||||
|
@ -1256,7 +1327,7 @@ spec:
|
|||
> ( 25 / 100 )
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
severity: info
|
||||
- name: kubernetes-storage
|
||||
rules:
|
||||
- alert: KubePersistentVolumeFillingUp
|
||||
|
@ -1308,7 +1379,7 @@ spec:
|
|||
components running.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeversionmismatch
|
||||
expr: |
|
||||
count(count by (gitVersion) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"gitVersion","$1","gitVersion","(v[0-9]*.[0-9]*.[0-9]*).*"))) > 1
|
||||
count(count by (gitVersion) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"gitVersion","$1","gitVersion","(v[0-9]*.[0-9]*).*"))) > 1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
|
@ -1412,11 +1483,11 @@ spec:
|
|||
severity: warning
|
||||
- alert: AggregatedAPIDown
|
||||
annotations:
|
||||
message: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} is down.
|
||||
It has not been available at least for the past five minutes.
|
||||
message: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has
|
||||
been only {{ $value | humanize }}% available over the last 5m.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-aggregatedapidown
|
||||
expr: |
|
||||
sum by(name, namespace)(sum_over_time(aggregator_unavailable_apiservice[5m])) > 0
|
||||
(1 - max by(name, namespace)(avg_over_time(aggregator_unavailable_apiservice[5m]))) * 100 < 90
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
|
@ -1445,7 +1516,7 @@ spec:
|
|||
message: '{{ $labels.node }} is unreachable and some workloads may be rescheduled.'
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodeunreachable
|
||||
expr: |
|
||||
(kube_node_spec_taint{job="kube-state-metrics",key="node.kubernetes.io/unreachable",effect="NoSchedule"} unless ignoring(key,value) kube_node_spec_taint{job="kube-state-metrics",key="ToBeDeletedByClusterAutoscaler"}) == 1
|
||||
(kube_node_spec_taint{job="kube-state-metrics",key="node.kubernetes.io/unreachable",effect="NoSchedule"} unless ignoring(key,value) kube_node_spec_taint{job="kube-state-metrics",key=~"ToBeDeletedByClusterAutoscaler|cloud.google.com/impending-node-termination|aws-node-termination-handler/spot-itn"}) == 1
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeletTooManyPods
|
||||
|
@ -1454,7 +1525,13 @@ spec:
|
|||
}} of its Pod capacity.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods
|
||||
expr: |
|
||||
max(max(kubelet_running_pod_count{job="kubelet", metrics_path="/metrics"}) by(instance) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"}) by(node) / max(kube_node_status_capacity_pods{job="kube-state-metrics"} != 1) by(node) > 0.95
|
||||
count by(node) (
|
||||
(kube_pod_status_phase{job="kube-state-metrics",phase="Running"} == 1) * on(instance,pod,namespace,cluster) group_left(node) topk by(instance,pod,namespace,cluster) (1, kube_pod_info{job="kube-state-metrics"})
|
||||
)
|
||||
/
|
||||
max by(node) (
|
||||
kube_node_status_capacity_pods{job="kube-state-metrics"} != 1
|
||||
) > 0.95
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
|
@ -1793,6 +1870,24 @@ spec:
|
|||
severity: warning
|
||||
- name: prometheus-operator
|
||||
rules:
|
||||
- alert: PrometheusOperatorListErrors
|
||||
annotations:
|
||||
message: Errors while performing List operations in controller {{$labels.controller}}
|
||||
in {{$labels.namespace}} namespace.
|
||||
expr: |
|
||||
(sum by (controller,namespace) (rate(prometheus_operator_list_operations_failed_total{job="prometheus-operator",namespace="monitoring"}[1h])) / sum by (controller,namespace) (rate(prometheus_operator_list_operations_total{job="prometheus-operator",namespace="monitoring"}[1h]))) > 0.4
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: PrometheusOperatorWatchErrors
|
||||
annotations:
|
||||
message: Errors while performing Watch operations in controller {{$labels.controller}}
|
||||
in {{$labels.namespace}} namespace.
|
||||
expr: |
|
||||
(sum by (controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job="prometheus-operator",namespace="monitoring"}[1h])) / sum by (controller,namespace) (rate(prometheus_operator_watch_operations_total{job="prometheus-operator",namespace="monitoring"}[1h]))) > 0.4
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: PrometheusOperatorReconcileErrors
|
||||
annotations:
|
||||
message: Errors while reconciling {{ $labels.controller }} in {{ $labels.namespace
|
||||
|
|
|
@ -43,7 +43,7 @@ spec:
|
|||
sourceLabels:
|
||||
- __name__
|
||||
- action: drop
|
||||
regex: etcd_(debugging|disk|request|server).*
|
||||
regex: etcd_(debugging|disk|server).*
|
||||
sourceLabels:
|
||||
- __name__
|
||||
- action: drop
|
||||
|
|
|
@ -7,7 +7,8 @@ metadata:
|
|||
namespace: monitoring
|
||||
spec:
|
||||
endpoints:
|
||||
- interval: 30s
|
||||
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
|
||||
interval: 30s
|
||||
metricRelabelings:
|
||||
- action: drop
|
||||
regex: kubelet_(pod_worker_latency_microseconds|pod_start_latency_microseconds|cgroup_manager_latency_microseconds|pod_worker_start_latency_microseconds|pleg_relist_latency_microseconds|pleg_relist_interval_microseconds|runtime_operations|runtime_operations_latency_microseconds|runtime_operations_errors|eviction_stats_age_microseconds|device_plugin_registration_count|device_plugin_alloc_latency_microseconds|network_plugin_operations_latency_microseconds)
|
||||
|
@ -45,7 +46,10 @@ spec:
|
|||
regex: etcd_(debugging|disk|request|server).*
|
||||
sourceLabels:
|
||||
- __name__
|
||||
port: http-metrics
|
||||
port: https-metrics
|
||||
scheme: https
|
||||
tlsConfig:
|
||||
insecureSkipVerify: true
|
||||
jobLabel: k8s-app
|
||||
namespaceSelector:
|
||||
matchNames:
|
||||
|
|
|
@ -7,8 +7,12 @@ metadata:
|
|||
namespace: monitoring
|
||||
spec:
|
||||
endpoints:
|
||||
- interval: 30s
|
||||
port: http-metrics
|
||||
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
|
||||
interval: 30s
|
||||
port: https-metrics
|
||||
scheme: https
|
||||
tlsConfig:
|
||||
insecureSkipVerify: true
|
||||
jobLabel: k8s-app
|
||||
namespaceSelector:
|
||||
matchNames:
|
||||
|
|
|
@ -43,7 +43,7 @@ spec:
|
|||
- args:
|
||||
- --logtostderr
|
||||
- --secure-listen-address=:8443
|
||||
- --tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_RSA_WITH_AES_128_CBC_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256,TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256
|
||||
- --tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305
|
||||
- --upstream=http://127.0.0.1:8080/
|
||||
image: quay.io/coreos/kube-rbac-proxy:v0.4.1
|
||||
name: kube-rbac-proxy
|
||||
|
|
53
monitoring/vendor/github.com/brancz/kubernetes-grafana/grafana/grafana.libsonnet
generated
vendored
53
monitoring/vendor/github.com/brancz/kubernetes-grafana/grafana/grafana.libsonnet
generated
vendored
|
@ -77,28 +77,33 @@ local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
|
|||
local configMap = k.core.v1.configMap;
|
||||
local dashboardSources = {
|
||||
apiVersion: 1,
|
||||
providers: [
|
||||
{
|
||||
name: '0',
|
||||
orgId: 1,
|
||||
folder: 'Default',
|
||||
type: 'file',
|
||||
options: {
|
||||
path: '/grafana-dashboard-definitions/0',
|
||||
},
|
||||
},
|
||||
] + [
|
||||
{
|
||||
name: folder,
|
||||
orgId: 1,
|
||||
folder: folder,
|
||||
type: 'file',
|
||||
options: {
|
||||
path: '/grafana-dashboard-definitions/' + folder,
|
||||
},
|
||||
}
|
||||
for folder in std.objectFields($._config.grafana.folderDashboards)
|
||||
],
|
||||
providers:
|
||||
(
|
||||
if std.length($._config.grafana.dashboards) +
|
||||
std.length($._config.grafana.rawDashboards) > 0 then [
|
||||
{
|
||||
name: '0',
|
||||
orgId: 1,
|
||||
folder: 'Default',
|
||||
type: 'file',
|
||||
options: {
|
||||
path: '/grafana-dashboard-definitions/0',
|
||||
},
|
||||
},
|
||||
] else []
|
||||
) +
|
||||
[
|
||||
{
|
||||
name: folder,
|
||||
orgId: 1,
|
||||
folder: folder,
|
||||
type: 'file',
|
||||
options: {
|
||||
path: '/grafana-dashboard-definitions/' + folder,
|
||||
},
|
||||
}
|
||||
for folder in std.objectFields($._config.grafana.folderDashboards)
|
||||
],
|
||||
};
|
||||
|
||||
configMap.new('grafana-dashboards', { 'dashboards.yaml': std.manifestJsonEx(dashboardSources, ' ') }) +
|
||||
|
@ -224,6 +229,10 @@ local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
|
|||
deployment.mixin.metadata.withNamespace($._config.namespace) +
|
||||
deployment.mixin.metadata.withLabels(podLabels) +
|
||||
deployment.mixin.spec.selector.withMatchLabels(podLabels) +
|
||||
deployment.mixin.spec.template.metadata.withAnnotations({
|
||||
[if std.length($._config.grafana.config) > 0 then 'checksum/grafana-config']: std.md5(std.toString($.grafana.config)),
|
||||
'checksum/grafana-datasources': std.md5(std.toString($.grafana.dashboardDatasources)),
|
||||
}) +
|
||||
deployment.mixin.spec.template.spec.withNodeSelector({ 'beta.kubernetes.io/os': 'linux' }) +
|
||||
deployment.mixin.spec.template.spec.withVolumes(volumes) +
|
||||
deployment.mixin.spec.template.spec.securityContext.withRunAsNonRoot(true) +
|
||||
|
|
36
monitoring/vendor/github.com/coreos/etcd/Documentation/etcd-mixin/mixin.libsonnet
generated
vendored
36
monitoring/vendor/github.com/coreos/etcd/Documentation/etcd-mixin/mixin.libsonnet
generated
vendored
|
@ -1,6 +1,12 @@
|
|||
{
|
||||
_config+:: {
|
||||
etcd_selector: 'job=~".*etcd.*"',
|
||||
// etcd_instance_labels are the label names that are uniquely
|
||||
// identifying an instance and need to be aggregated away for alerts
|
||||
// that are about an etcd cluster as a whole. For example, if etcd
|
||||
// instances are deployed on K8s, you will likely want to change
|
||||
// this to 'instance, pod'.
|
||||
etcd_instance_labels: 'instance',
|
||||
},
|
||||
|
||||
prometheusAlerts+:: {
|
||||
|
@ -11,11 +17,11 @@
|
|||
{
|
||||
alert: 'etcdMembersDown',
|
||||
expr: |||
|
||||
max by (job) (
|
||||
sum by (job) (up{%(etcd_selector)s} == bool 0)
|
||||
max without (endpoint) (
|
||||
sum without (%(etcd_instance_labels)s) (up{%(etcd_selector)s} == bool 0)
|
||||
or
|
||||
count by (job,endpoint) (
|
||||
sum by (job,endpoint,To) (rate(etcd_network_peer_sent_failures_total{%(etcd_selector)s}[3m])) > 0.01
|
||||
count without (To) (
|
||||
sum without (%(etcd_instance_labels)s) (rate(etcd_network_peer_sent_failures_total{%(etcd_selector)s}[1m])) > 0.01
|
||||
)
|
||||
)
|
||||
> 0
|
||||
|
@ -31,7 +37,7 @@
|
|||
{
|
||||
alert: 'etcdInsufficientMembers',
|
||||
expr: |||
|
||||
sum(up{%(etcd_selector)s} == bool 1) by (job) < ((count(up{%(etcd_selector)s}) by (job) + 1) / 2)
|
||||
sum(up{%(etcd_selector)s} == bool 1) without (%(etcd_instance_labels)s) < ((count(up{%(etcd_selector)s}) without (%(etcd_instance_labels)s) + 1) / 2)
|
||||
||| % $._config,
|
||||
'for': '3m',
|
||||
labels: {
|
||||
|
@ -57,7 +63,7 @@
|
|||
{
|
||||
alert: 'etcdHighNumberOfLeaderChanges',
|
||||
expr: |||
|
||||
increase((max by (job) (etcd_server_leader_changes_seen_total{%(etcd_selector)s}) or 0*absent(etcd_server_leader_changes_seen_total{%(etcd_selector)s}))[15m:1m]) >= 3
|
||||
increase((max without (%(etcd_instance_labels)s) (etcd_server_leader_changes_seen_total{%(etcd_selector)s}) or 0*absent(etcd_server_leader_changes_seen_total{%(etcd_selector)s}))[15m:1m]) >= 4
|
||||
||| % $._config,
|
||||
'for': '5m',
|
||||
labels: {
|
||||
|
@ -70,9 +76,9 @@
|
|||
{
|
||||
alert: 'etcdHighNumberOfFailedGRPCRequests',
|
||||
expr: |||
|
||||
100 * sum(rate(grpc_server_handled_total{%(etcd_selector)s, grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method)
|
||||
100 * sum(rate(grpc_server_handled_total{%(etcd_selector)s, grpc_code!="OK"}[5m])) without (grpc_type, grpc_code)
|
||||
/
|
||||
sum(rate(grpc_server_handled_total{%(etcd_selector)s}[5m])) BY (job, instance, grpc_service, grpc_method)
|
||||
sum(rate(grpc_server_handled_total{%(etcd_selector)s}[5m])) without (grpc_type, grpc_code)
|
||||
> 1
|
||||
||| % $._config,
|
||||
'for': '10m',
|
||||
|
@ -86,9 +92,9 @@
|
|||
{
|
||||
alert: 'etcdHighNumberOfFailedGRPCRequests',
|
||||
expr: |||
|
||||
100 * sum(rate(grpc_server_handled_total{%(etcd_selector)s, grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method)
|
||||
100 * sum(rate(grpc_server_handled_total{%(etcd_selector)s, grpc_code!="OK"}[5m])) without (grpc_type, grpc_code)
|
||||
/
|
||||
sum(rate(grpc_server_handled_total{%(etcd_selector)s}[5m])) BY (job, instance, grpc_service, grpc_method)
|
||||
sum(rate(grpc_server_handled_total{%(etcd_selector)s}[5m])) without (grpc_type, grpc_code)
|
||||
> 5
|
||||
||| % $._config,
|
||||
'for': '5m',
|
||||
|
@ -102,7 +108,7 @@
|
|||
{
|
||||
alert: 'etcdGRPCRequestsSlow',
|
||||
expr: |||
|
||||
histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{%(etcd_selector)s, grpc_type="unary"}[5m])) by (job, instance, grpc_service, grpc_method, le))
|
||||
histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{%(etcd_selector)s, grpc_type="unary"}[5m])) without(grpc_type))
|
||||
> 0.15
|
||||
||| % $._config,
|
||||
'for': '10m',
|
||||
|
@ -171,8 +177,8 @@
|
|||
{
|
||||
alert: 'etcdHighNumberOfFailedHTTPRequests',
|
||||
expr: |||
|
||||
sum(rate(etcd_http_failed_total{%(etcd_selector)s, code!="404"}[5m])) BY (method) / sum(rate(etcd_http_received_total{%(etcd_selector)s}[5m]))
|
||||
BY (method) > 0.01
|
||||
sum(rate(etcd_http_failed_total{%(etcd_selector)s, code!="404"}[5m])) without (code) / sum(rate(etcd_http_received_total{%(etcd_selector)s}[5m]))
|
||||
without (code) > 0.01
|
||||
||| % $._config,
|
||||
'for': '10m',
|
||||
labels: {
|
||||
|
@ -185,8 +191,8 @@
|
|||
{
|
||||
alert: 'etcdHighNumberOfFailedHTTPRequests',
|
||||
expr: |||
|
||||
sum(rate(etcd_http_failed_total{%(etcd_selector)s, code!="404"}[5m])) BY (method) / sum(rate(etcd_http_received_total{%(etcd_selector)s}[5m]))
|
||||
BY (method) > 0.05
|
||||
sum(rate(etcd_http_failed_total{%(etcd_selector)s, code!="404"}[5m])) without (code) / sum(rate(etcd_http_received_total{%(etcd_selector)s}[5m]))
|
||||
without (code) > 0.05
|
||||
||| % $._config,
|
||||
'for': '10m',
|
||||
labels: {
|
||||
|
|
|
@ -99,7 +99,7 @@ tests:
|
|||
job: etcd
|
||||
severity: warning
|
||||
exp_annotations:
|
||||
message: 'etcd cluster "etcd": 3 leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.'
|
||||
message: 'etcd cluster "etcd": 4 leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.'
|
||||
- interval: 1m
|
||||
input_series:
|
||||
- series: 'etcd_server_leader_changes_seen_total{job="etcd",instance="10.10.10.0"}'
|
||||
|
|
|
@ -5,7 +5,7 @@ local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
|
|||
namespace: 'default',
|
||||
|
||||
versions+:: {
|
||||
alertmanager: 'v0.20.0',
|
||||
alertmanager: 'v0.21.0',
|
||||
},
|
||||
|
||||
imageRepos+:: {
|
||||
|
|
|
@ -4,6 +4,32 @@
|
|||
{
|
||||
name: 'prometheus-operator',
|
||||
rules: [
|
||||
{
|
||||
alert: 'PrometheusOperatorListErrors',
|
||||
expr: |||
|
||||
(sum by (controller,namespace) (rate(prometheus_operator_list_operations_failed_total{%(prometheusOperatorSelector)s}[1h])) / sum by (controller,namespace) (rate(prometheus_operator_list_operations_total{%(prometheusOperatorSelector)s}[1h]))) > 0.4
|
||||
||| % $._config,
|
||||
labels: {
|
||||
severity: 'warning',
|
||||
},
|
||||
annotations: {
|
||||
message: 'Errors while performing List operations in controller {{$labels.controller}} in {{$labels.namespace}} namespace.',
|
||||
},
|
||||
'for': '15m',
|
||||
},
|
||||
{
|
||||
alert: 'PrometheusOperatorWatchErrors',
|
||||
expr: |||
|
||||
(sum by (controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{%(prometheusOperatorSelector)s}[1h])) / sum by (controller,namespace) (rate(prometheus_operator_watch_operations_total{%(prometheusOperatorSelector)s}[1h]))) > 0.4
|
||||
||| % $._config,
|
||||
labels: {
|
||||
severity: 'warning',
|
||||
},
|
||||
annotations: {
|
||||
message: 'Errors while performing Watch operations in controller {{$labels.controller}} in {{$labels.namespace}} namespace.',
|
||||
},
|
||||
'for': '15m',
|
||||
},
|
||||
{
|
||||
alert: 'PrometheusOperatorReconcileErrors',
|
||||
expr: |||
|
||||
|
|
|
@ -5,12 +5,12 @@ local servicePort = k.core.v1.service.mixin.spec.portsType;
|
|||
{
|
||||
prometheus+:: {
|
||||
kubeControllerManagerPrometheusDiscoveryService:
|
||||
service.new('kube-controller-manager-prometheus-discovery', { 'k8s-app': 'kube-controller-manager' }, servicePort.newNamed('http-metrics', 10252, 10252)) +
|
||||
service.new('kube-controller-manager-prometheus-discovery', { 'k8s-app': 'kube-controller-manager' }, servicePort.newNamed('https-metrics', 10257, 10257)) +
|
||||
service.mixin.metadata.withNamespace('kube-system') +
|
||||
service.mixin.metadata.withLabels({ 'k8s-app': 'kube-controller-manager' }) +
|
||||
service.mixin.spec.withClusterIp('None'),
|
||||
kubeSchedulerPrometheusDiscoveryService:
|
||||
service.new('kube-scheduler-prometheus-discovery', { 'k8s-app': 'kube-scheduler' }, servicePort.newNamed('http-metrics', 10251, 10251)) +
|
||||
service.new('kube-scheduler-prometheus-discovery', { 'k8s-app': 'kube-scheduler' }, servicePort.newNamed('https-metrics', 10259, 10259)) +
|
||||
service.mixin.metadata.withNamespace('kube-system') +
|
||||
service.mixin.metadata.withLabels({ 'k8s-app': 'kube-scheduler' }) +
|
||||
service.mixin.spec.withClusterIp('None'),
|
||||
|
|
|
@ -5,12 +5,12 @@ local servicePort = k.core.v1.service.mixin.spec.portsType;
|
|||
{
|
||||
prometheus+:: {
|
||||
kubeControllerManagerPrometheusDiscoveryService:
|
||||
service.new('kube-controller-manager-prometheus-discovery', { 'k8s-app': 'kube-controller-manager' }, servicePort.newNamed('http-metrics', 10252, 10252)) +
|
||||
service.new('kube-controller-manager-prometheus-discovery', { 'k8s-app': 'kube-controller-manager' }, servicePort.newNamed('https-metrics', 10257, 10257)) +
|
||||
service.mixin.metadata.withNamespace('kube-system') +
|
||||
service.mixin.metadata.withLabels({ 'k8s-app': 'kube-controller-manager' }) +
|
||||
service.mixin.spec.withClusterIp('None'),
|
||||
kubeSchedulerPrometheusDiscoveryService:
|
||||
service.new('kube-scheduler-prometheus-discovery', { 'k8s-app': 'kube-scheduler' }, servicePort.newNamed('http-metrics', 10251, 10251)) +
|
||||
service.new('kube-scheduler-prometheus-discovery', { 'k8s-app': 'kube-scheduler' }, servicePort.newNamed('https-metrics', 10259, 10259)) +
|
||||
service.mixin.metadata.withNamespace('kube-system') +
|
||||
service.mixin.metadata.withLabels({ 'k8s-app': 'kube-scheduler' }) +
|
||||
service.mixin.spec.withClusterIp('None'),
|
||||
|
|
|
@ -5,12 +5,12 @@ local servicePort = k.core.v1.service.mixin.spec.portsType;
|
|||
{
|
||||
prometheus+: {
|
||||
kubeControllerManagerPrometheusDiscoveryService:
|
||||
service.new('kube-controller-manager-prometheus-discovery', { 'k8s-app': 'kube-controller-manager' }, servicePort.newNamed('http-metrics', 10252, 10252)) +
|
||||
service.new('kube-controller-manager-prometheus-discovery', { 'k8s-app': 'kube-controller-manager' }, servicePort.newNamed('https-metrics', 10257, 10257)) +
|
||||
service.mixin.metadata.withNamespace('kube-system') +
|
||||
service.mixin.metadata.withLabels({ 'k8s-app': 'kube-controller-manager' }) +
|
||||
service.mixin.spec.withClusterIp('None'),
|
||||
kubeSchedulerPrometheusDiscoveryService:
|
||||
service.new('kube-scheduler-prometheus-discovery', { 'k8s-app': 'kube-scheduler' }, servicePort.newNamed('http-metrics', 10251, 10251)) +
|
||||
service.new('kube-scheduler-prometheus-discovery', { 'k8s-app': 'kube-scheduler' }, servicePort.newNamed('https-metrics', 10259, 10259)) +
|
||||
service.mixin.metadata.withNamespace('kube-system') +
|
||||
service.mixin.metadata.withLabels({ 'k8s-app': 'kube-scheduler' }) +
|
||||
service.mixin.spec.withClusterIp('None'),
|
||||
|
|
|
@ -5,12 +5,12 @@ local servicePort = k.core.v1.service.mixin.spec.portsType;
|
|||
{
|
||||
prometheus+: {
|
||||
kubeControllerManagerPrometheusDiscoveryService:
|
||||
service.new('kube-controller-manager-prometheus-discovery', { component: 'kube-controller-manager' }, servicePort.newNamed('http-metrics', 10252, 10252)) +
|
||||
service.new('kube-controller-manager-prometheus-discovery', { component: 'kube-controller-manager' }, servicePort.newNamed('https-metrics', 10257, 10257)) +
|
||||
service.mixin.metadata.withNamespace('kube-system') +
|
||||
service.mixin.metadata.withLabels({ 'k8s-app': 'kube-controller-manager' }) +
|
||||
service.mixin.spec.withClusterIp('None'),
|
||||
kubeSchedulerPrometheusDiscoveryService:
|
||||
service.new('kube-scheduler-prometheus-discovery', { component: 'kube-scheduler' }, servicePort.newNamed('http-metrics', 10251, 10251)) +
|
||||
service.new('kube-scheduler-prometheus-discovery', { component: 'kube-scheduler' }, servicePort.newNamed('https-metrics', 10259, 10259)) +
|
||||
service.mixin.metadata.withNamespace('kube-system') +
|
||||
service.mixin.metadata.withLabels({ 'k8s-app': 'kube-scheduler' }) +
|
||||
service.mixin.spec.withClusterIp('None'),
|
||||
|
|
|
@ -6,12 +6,12 @@ local servicePort = k.core.v1.service.mixin.spec.portsType;
|
|||
|
||||
prometheus+: {
|
||||
kubeControllerManagerPrometheusDiscoveryService:
|
||||
service.new('kube-controller-manager-prometheus-discovery', { 'component': 'kube-controller-manager' }, servicePort.newNamed('http-metrics', 10252, 10252)) +
|
||||
service.new('kube-controller-manager-prometheus-discovery', { 'component': 'kube-controller-manager' }, servicePort.newNamed('https-metrics', 10257, 10257)) +
|
||||
service.mixin.metadata.withNamespace('kube-system') +
|
||||
service.mixin.metadata.withLabels({ 'k8s-app': 'kube-controller-manager' }) +
|
||||
service.mixin.spec.withClusterIp('None'),
|
||||
kubeSchedulerPrometheusDiscoveryService:
|
||||
service.new('kube-scheduler-prometheus-discovery', { 'component': 'kube-scheduler' }, servicePort.newNamed('http-metrics', 10251, 10251)) +
|
||||
service.new('kube-scheduler-prometheus-discovery', { 'component': 'kube-scheduler' }, servicePort.newNamed('https-metrics', 10259, 10259)) +
|
||||
service.mixin.metadata.withNamespace('kube-system') +
|
||||
service.mixin.metadata.withLabels({ 'k8s-app': 'kube-scheduler' }) +
|
||||
service.mixin.spec.withClusterIp('None'),
|
||||
|
|
|
@ -5,7 +5,7 @@ local servicePort = k.core.v1.service.mixin.spec.portsType;
|
|||
{
|
||||
_config+:: {
|
||||
versions+:: {
|
||||
thanos: 'v0.10.0',
|
||||
thanos: 'v0.14.0',
|
||||
},
|
||||
imageRepos+:: {
|
||||
thanos: 'quay.io/thanos/thanos',
|
||||
|
|
|
@ -104,36 +104,36 @@ local configMapList = k3.core.v1.configMapList;
|
|||
namespace: 'default',
|
||||
|
||||
versions+:: {
|
||||
grafana: '6.7.4',
|
||||
grafana: '7.1.0',
|
||||
},
|
||||
|
||||
tlsCipherSuites: [
|
||||
'TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256', // required by h2: http://golang.org/cl/30721
|
||||
'TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256', // required by h2: http://golang.org/cl/30721
|
||||
|
||||
// 'TLS_RSA_WITH_RC4_128_SHA', // insecure: https://access.redhat.com/security/cve/cve-2013-2566
|
||||
// 'TLS_RSA_WITH_3DES_EDE_CBC_SHA', // insecure: https://access.redhat.com/articles/2548661
|
||||
// 'TLS_RSA_WITH_AES_128_CBC_SHA', // disabled by h2
|
||||
// 'TLS_RSA_WITH_AES_256_CBC_SHA', // disabled by h2
|
||||
'TLS_RSA_WITH_AES_128_CBC_SHA256',
|
||||
// 'TLS_RSA_WITH_AES_128_GCM_SHA256', // disabled by h2
|
||||
// 'TLS_RSA_WITH_AES_256_GCM_SHA384', // disabled by h2
|
||||
// 'TLS_ECDHE_ECDSA_WITH_RC4_128_SHA', // insecure: https://access.redhat.com/security/cve/cve-2013-2566
|
||||
// 'TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA',// disabled by h2
|
||||
// 'TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA',// disabled by h2
|
||||
// 'TLS_ECDHE_RSA_WITH_RC4_128_SHA', // insecure: https://access.redhat.com/security/cve/cve-2013-2566
|
||||
// 'TLS_ECDHE_RSA_WITH_3DES_EDE_CBC_SHA', // insecure: https://access.redhat.com/articles/2548661
|
||||
// 'TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA', // disabled by h2
|
||||
// 'TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA', // disabled by h2
|
||||
'TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256',
|
||||
'TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256',
|
||||
// 'TLS_RSA_WITH_RC4_128_SHA', // insecure: https://access.redhat.com/security/cve/cve-2013-2566
|
||||
// 'TLS_RSA_WITH_3DES_EDE_CBC_SHA', // insecure: https://access.redhat.com/articles/2548661
|
||||
// 'TLS_RSA_WITH_AES_128_CBC_SHA', // disabled by h2
|
||||
// 'TLS_RSA_WITH_AES_256_CBC_SHA', // disabled by h2
|
||||
// 'TLS_RSA_WITH_AES_128_CBC_SHA256', // insecure: https://access.redhat.com/security/cve/cve-2013-0169
|
||||
// 'TLS_RSA_WITH_AES_128_GCM_SHA256', // disabled by h2
|
||||
// 'TLS_RSA_WITH_AES_256_GCM_SHA384', // disabled by h2
|
||||
// 'TLS_ECDHE_ECDSA_WITH_RC4_128_SHA', // insecure: https://access.redhat.com/security/cve/cve-2013-2566
|
||||
// 'TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA', // disabled by h2
|
||||
// 'TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA', // disabled by h2
|
||||
// 'TLS_ECDHE_RSA_WITH_RC4_128_SHA', // insecure: https://access.redhat.com/security/cve/cve-2013-2566
|
||||
// 'TLS_ECDHE_RSA_WITH_3DES_EDE_CBC_SHA', // insecure: https://access.redhat.com/articles/2548661
|
||||
// 'TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA', // disabled by h2
|
||||
// 'TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA', // disabled by h2
|
||||
// 'TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256', // insecure: https://access.redhat.com/security/cve/cve-2013-0169
|
||||
// 'TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256', // insecure: https://access.redhat.com/security/cve/cve-2013-0169
|
||||
|
||||
// disabled by h2 means: https://github.com/golang/net/blob/e514e69ffb8bc3c76a71ae40de0118d794855992/http2/ciphers.go
|
||||
|
||||
// 'TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384', // TODO: Might not work with h2
|
||||
// 'TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384', // TODO: Might not work with h2
|
||||
// 'TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305', // TODO: Might not work with h2
|
||||
// 'TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305', // TODO: Might not work with h2
|
||||
'TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384',
|
||||
'TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384',
|
||||
'TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305',
|
||||
'TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305',
|
||||
],
|
||||
|
||||
cadvisorSelector: 'job="kubelet", metrics_path="/metrics/cadvisor"',
|
||||
|
|
|
@ -6,7 +6,7 @@ local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
|
|||
namespace: 'default',
|
||||
|
||||
versions+:: {
|
||||
prometheus: 'v2.17.2',
|
||||
prometheus: 'v2.20.0',
|
||||
},
|
||||
|
||||
imageRepos+:: {
|
||||
|
@ -246,8 +246,13 @@ local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
|
|||
jobLabel: 'k8s-app',
|
||||
endpoints: [
|
||||
{
|
||||
port: 'http-metrics',
|
||||
port: 'https-metrics',
|
||||
interval: '30s',
|
||||
scheme: "https",
|
||||
bearerTokenFile: "/var/run/secrets/kubernetes.io/serviceaccount/token",
|
||||
tlsConfig: {
|
||||
insecureSkipVerify: true
|
||||
}
|
||||
},
|
||||
],
|
||||
selector: {
|
||||
|
@ -347,8 +352,13 @@ local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
|
|||
jobLabel: 'k8s-app',
|
||||
endpoints: [
|
||||
{
|
||||
port: 'http-metrics',
|
||||
port: 'https-metrics',
|
||||
interval: '30s',
|
||||
scheme: "https",
|
||||
bearerTokenFile: "/var/run/secrets/kubernetes.io/serviceaccount/token",
|
||||
tlsConfig: {
|
||||
insecureSkipVerify: true
|
||||
},
|
||||
metricRelabelings: (import 'kube-prometheus/dropping-deprecated-metrics-relabelings.libsonnet') + [
|
||||
{
|
||||
sourceLabels: ['__name__'],
|
||||
|
@ -407,7 +417,7 @@ local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
|
|||
metricRelabelings: (import 'kube-prometheus/dropping-deprecated-metrics-relabelings.libsonnet') + [
|
||||
{
|
||||
sourceLabels: ['__name__'],
|
||||
regex: 'etcd_(debugging|disk|request|server).*',
|
||||
regex: 'etcd_(debugging|disk|server).*',
|
||||
action: 'drop',
|
||||
},
|
||||
{
|
||||
|
|
|
@ -8,10 +8,6 @@
|
|||
expr: 'sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[3m])) BY (instance)',
|
||||
record: 'instance:node_cpu:rate:sum',
|
||||
},
|
||||
{
|
||||
expr: 'sum((node_filesystem_size_bytes{mountpoint="/"} - node_filesystem_free_bytes{mountpoint="/"})) BY (instance)',
|
||||
record: 'instance:node_filesystem_usage:sum',
|
||||
},
|
||||
{
|
||||
expr: 'sum(rate(node_network_receive_bytes_total[3m])) BY (instance)',
|
||||
record: 'instance:node_network_receive_bytes:rate:sum',
|
||||
|
|
|
@ -1,39 +0,0 @@
|
|||
{
|
||||
/**
|
||||
* @name gauge.new
|
||||
*/
|
||||
new(
|
||||
title,
|
||||
datasource=null,
|
||||
calc='mean',
|
||||
time_from=null,
|
||||
span=null,
|
||||
description='',
|
||||
height=null,
|
||||
transparent=null,
|
||||
)::
|
||||
{
|
||||
[if description != '' then 'description']: description,
|
||||
[if height != null then 'height']: height,
|
||||
[if transparent != null then 'transparent']: transparent,
|
||||
[if time_from != null then 'timeFrom']: time_from,
|
||||
[if span != null then 'span']: span,
|
||||
title: title,
|
||||
type: 'gauge',
|
||||
datasource: datasource,
|
||||
options: {
|
||||
fieldOptions: {
|
||||
calcs: [
|
||||
calc,
|
||||
],
|
||||
},
|
||||
},
|
||||
_nextTarget:: 0,
|
||||
addTarget(target):: self {
|
||||
local nextTarget = super._nextTarget,
|
||||
_nextTarget: nextTarget + 1,
|
||||
targets+: [target { refId: std.char(std.codepoint('A') + nextTarget) }],
|
||||
},
|
||||
},
|
||||
|
||||
}
|
193
monitoring/vendor/github.com/grafana/grafonnet-lib/grafonnet/gauge_panel.libsonnet
generated
vendored
Normal file
193
monitoring/vendor/github.com/grafana/grafonnet-lib/grafonnet/gauge_panel.libsonnet
generated
vendored
Normal file
|
@ -0,0 +1,193 @@
|
|||
{
|
||||
/**
|
||||
* Creates a [gauge panel](https://grafana.com/docs/grafana/latest/panels/visualizations/gauge-panel/).
|
||||
*
|
||||
* @name gaugePanel.new
|
||||
*
|
||||
* @param title Panel title.
|
||||
* @param description Panel description.
|
||||
* @param transparent Whether to display the panel without a background.
|
||||
* @param datasource Panel datasource.
|
||||
* @param allValues Show all values instead of reducing to one.
|
||||
* @param valueLimit Limit of values in all values mode.
|
||||
* @param reducerFunction Function to use to reduce values to when using single value.
|
||||
* @param fields Fields that should be included in the panel.
|
||||
* @param showThresholdLabels Render the threshold values around the gauge bar.
|
||||
* @param showThresholdMarkers Render the thresholds as an outer bar.
|
||||
* @param unit Panel unit field option.
|
||||
* @param min Leave empty to calculate based on all values.
|
||||
* @param max Leave empty to calculate based on all values.
|
||||
* @param decimals Number of decimal places to show.
|
||||
* @param displayName Change the field or series name.
|
||||
* @param noValue What to show when there is no value.
|
||||
* @param thresholdsMode 'absolute' or 'percentage'.
|
||||
* @param repeat Name of variable that should be used to repeat this panel.
|
||||
* @param repeatDirection 'h' for horizontal or 'v' for vertical.
|
||||
* @param repeatMaxPerRow Maximum panels per row in repeat mode.
|
||||
* @param pluginVersion Plugin version the panel should be modeled for. This has been tested with the default, '7', and '6.7'.
|
||||
*
|
||||
* @method addTarget(target) Adds a target object.
|
||||
* @method addTargets(targets) Adds an array of targets.
|
||||
* @method addLink(link) Adds a link. Argument format: `{ title: 'Link Title', url: 'https://...', targetBlank: true }`.
|
||||
* @method addLinks(links) Adds an array of links.
|
||||
* @method addThreshold(step) Adds a threshold step. Argument format: `{ color: 'green', value: 0 }`.
|
||||
* @method addThresholds(steps) Adds an array of threshold steps.
|
||||
* @method addMapping(mapping) Adds a value mapping.
|
||||
* @method addMappings(mappings) Adds an array of value mappings.
|
||||
* @method addDataLink(link) Adds a data link.
|
||||
* @method addDataLinks(links) Adds an array of data links.
|
||||
*/
|
||||
new(
|
||||
title,
|
||||
description=null,
|
||||
transparent=false,
|
||||
datasource=null,
|
||||
allValues=false,
|
||||
valueLimit=null,
|
||||
reducerFunction='mean',
|
||||
fields='',
|
||||
showThresholdLabels=false,
|
||||
showThresholdMarkers=true,
|
||||
unit='percent',
|
||||
min=0,
|
||||
max=100,
|
||||
decimals=null,
|
||||
displayName=null,
|
||||
noValue=null,
|
||||
thresholdsMode='absolute',
|
||||
repeat=null,
|
||||
repeatDirection='h',
|
||||
repeatMaxPerRow=null,
|
||||
pluginVersion='7',
|
||||
):: {
|
||||
|
||||
type: 'gauge',
|
||||
title: title,
|
||||
[if description != null then 'description']: description,
|
||||
transparent: transparent,
|
||||
datasource: datasource,
|
||||
targets: [],
|
||||
links: [],
|
||||
[if repeat != null then 'repeat']: repeat,
|
||||
[if repeat != null then 'repeatDirection']: repeatDirection,
|
||||
[if repeat != null then 'repeatMaxPerRow']: repeatMaxPerRow,
|
||||
|
||||
// targets
|
||||
_nextTarget:: 0,
|
||||
addTarget(target):: self {
|
||||
local nextTarget = super._nextTarget,
|
||||
_nextTarget: nextTarget + 1,
|
||||
targets+: [target { refId: std.char(std.codepoint('A') + nextTarget) }],
|
||||
},
|
||||
addTargets(targets):: std.foldl(function(p, t) p.addTarget(t), targets, self),
|
||||
|
||||
// links
|
||||
addLink(link):: self {
|
||||
links+: [link],
|
||||
},
|
||||
addLinks(links):: std.foldl(function(p, l) p.addLink(l), links, self),
|
||||
|
||||
pluginVersion: pluginVersion,
|
||||
} + (
|
||||
|
||||
if pluginVersion >= '7' then {
|
||||
options: {
|
||||
reduceOptions: {
|
||||
values: allValues,
|
||||
[if allValues && valueLimit != null then 'limit']: valueLimit,
|
||||
calcs: [
|
||||
reducerFunction,
|
||||
],
|
||||
fields: fields,
|
||||
},
|
||||
showThresholdLabels: showThresholdLabels,
|
||||
showThresholdMarkers: showThresholdMarkers,
|
||||
},
|
||||
fieldConfig: {
|
||||
defaults: {
|
||||
unit: unit,
|
||||
[if min != null then 'min']: min,
|
||||
[if max != null then 'max']: max,
|
||||
[if decimals != null then 'decimals']: decimals,
|
||||
[if displayName != null then 'displayName']: displayName,
|
||||
[if noValue != null then 'noValue']: noValue,
|
||||
thresholds: {
|
||||
mode: thresholdsMode,
|
||||
steps: [],
|
||||
},
|
||||
mappings: [],
|
||||
links: [],
|
||||
},
|
||||
},
|
||||
|
||||
// thresholds
|
||||
addThreshold(step):: self {
|
||||
fieldConfig+: { defaults+: { thresholds+: { steps+: [step] } } },
|
||||
},
|
||||
|
||||
// mappings
|
||||
_nextMapping:: 0,
|
||||
addMapping(mapping):: self {
|
||||
local nextMapping = super._nextMapping,
|
||||
_nextMapping: nextMapping + 1,
|
||||
fieldConfig+: { defaults+: { mappings+: [mapping { id: nextMapping }] } },
|
||||
},
|
||||
|
||||
// data links
|
||||
addDataLink(link):: self {
|
||||
fieldConfig+: { defaults+: { links+: [link] } },
|
||||
},
|
||||
|
||||
} else {
|
||||
|
||||
options: {
|
||||
fieldOptions: {
|
||||
values: allValues,
|
||||
[if allValues && valueLimit != null then 'limit']: valueLimit,
|
||||
calcs: [
|
||||
reducerFunction,
|
||||
],
|
||||
fields: fields,
|
||||
defaults: {
|
||||
unit: unit,
|
||||
[if min != null then 'min']: min,
|
||||
[if max != null then 'max']: max,
|
||||
[if decimals != null then 'decimals']: decimals,
|
||||
[if displayName != null then 'displayName']: displayName,
|
||||
[if noValue != null then 'noValue']: noValue,
|
||||
thresholds: {
|
||||
mode: thresholdsMode,
|
||||
steps: [],
|
||||
},
|
||||
mappings: [],
|
||||
links: [],
|
||||
},
|
||||
},
|
||||
showThresholdLabels: showThresholdLabels,
|
||||
showThresholdMarkers: showThresholdMarkers,
|
||||
},
|
||||
|
||||
// thresholds
|
||||
addThreshold(step):: self {
|
||||
options+: { fieldOptions+: { defaults+: { thresholds+: { steps+: [step] } } } },
|
||||
},
|
||||
|
||||
// mappings
|
||||
_nextMapping:: 0,
|
||||
addMapping(mapping):: self {
|
||||
local nextMapping = super._nextMapping,
|
||||
_nextMapping: nextMapping + 1,
|
||||
options+: { fieldOptions+: { defaults+: { mappings+: [mapping { id: nextMapping }] } } },
|
||||
},
|
||||
|
||||
// data links
|
||||
addDataLink(link):: self {
|
||||
options+: { fieldOptions+: { defaults+: { links+: [link] } } },
|
||||
},
|
||||
}
|
||||
) + {
|
||||
addThresholds(steps):: std.foldl(function(p, s) p.addThreshold(s), steps, self),
|
||||
addMappings(mappings):: std.foldl(function(p, m) p.addMapping(m), mappings, self),
|
||||
addDataLinks(links):: std.foldl(function(p, l) p.addDataLink(l), links, self),
|
||||
},
|
||||
}
|
|
@ -23,5 +23,7 @@
|
|||
heatmapPanel:: import 'heatmap_panel.libsonnet',
|
||||
dashlist:: import 'dashlist.libsonnet',
|
||||
pluginlist:: import 'pluginlist.libsonnet',
|
||||
gauge:: import 'gauge.libsonnet',
|
||||
gauge:: error 'gauge is removed, migrate to gaugePanel',
|
||||
gaugePanel:: import 'gauge_panel.libsonnet',
|
||||
statPanel:: import 'stat_panel.libsonnet',
|
||||
}
|
||||
|
|
200
monitoring/vendor/github.com/grafana/grafonnet-lib/grafonnet/stat_panel.libsonnet
generated
vendored
Normal file
200
monitoring/vendor/github.com/grafana/grafonnet-lib/grafonnet/stat_panel.libsonnet
generated
vendored
Normal file
|
@ -0,0 +1,200 @@
|
|||
{
|
||||
/**
|
||||
* Creates a [stat panel](https://grafana.com/docs/grafana/latest/panels/visualizations/stat-panel/).
|
||||
*
|
||||
* @name statPanel.new
|
||||
*
|
||||
* @param title Panel title.
|
||||
* @param description Panel description.
|
||||
* @param transparent Whether to display the panel without a background.
|
||||
* @param datasource Panel datasource.
|
||||
* @param allValues Show all values instead of reducing to one.
|
||||
* @param valueLimit Limit of values in all values mode.
|
||||
* @param reducerFunction Function to use to reduce values to when using single value.
|
||||
* @param fields Fields that should be included in the panel.
|
||||
* @param orientation Stacking direction in case of multiple series or fields.
|
||||
* @param colorMode 'value' or 'background'.
|
||||
* @param graphMode 'none' or 'area' to enable sparkline mode.
|
||||
* @param justifyMode 'auto' or 'center'.
|
||||
* @param unit Panel unit field option.
|
||||
* @param min Leave empty to calculate based on all values.
|
||||
* @param max Leave empty to calculate based on all values.
|
||||
* @param decimals Number of decimal places to show.
|
||||
* @param displayName Change the field or series name.
|
||||
* @param noValue What to show when there is no value.
|
||||
* @param thresholdsMode 'absolute' or 'percentage'.
|
||||
* @param repeat Name of variable that should be used to repeat this panel.
|
||||
* @param repeatDirection 'h' for horizontal or 'v' for vertical.
|
||||
* @param repeatMaxPerRow Maximum panels per row in repeat mode.
|
||||
* @param pluginVersion Plugin version the panel should be modeled for. This has been tested with the default, '7', and '6.7'.
|
||||
*
|
||||
* @method addTarget(target) Adds a target object.
|
||||
* @method addTargets(targets) Adds an array of targets.
|
||||
* @method addLink(link) Adds a link. Argument format: `{ title: 'Link Title', url: 'https://...', targetBlank: true }`.
|
||||
* @method addLinks(links) Adds an array of links.
|
||||
* @method addThreshold(step) Adds a threshold step. Argument format: `{ color: 'green', value: 0 }`.
|
||||
* @method addThresholds(steps) Adds an array of threshold steps.
|
||||
* @method addMapping(mapping) Adds a value mapping.
|
||||
* @method addMappings(mappings) Adds an array of value mappings.
|
||||
* @method addDataLink(link) Adds a data link.
|
||||
* @method addDataLinks(links) Adds an array of data links.
|
||||
*/
|
||||
new(
|
||||
title,
|
||||
description=null,
|
||||
transparent=false,
|
||||
datasource=null,
|
||||
allValues=false,
|
||||
valueLimit=null,
|
||||
reducerFunction='mean',
|
||||
fields='',
|
||||
orientation='auto',
|
||||
colorMode='value',
|
||||
graphMode='area',
|
||||
justifyMode='auto',
|
||||
unit='none',
|
||||
min=null,
|
||||
max=null,
|
||||
decimals=null,
|
||||
displayName=null,
|
||||
noValue=null,
|
||||
thresholdsMode='absolute',
|
||||
repeat=null,
|
||||
repeatDirection='h',
|
||||
repeatMaxPerRow=null,
|
||||
pluginVersion='7',
|
||||
):: {
|
||||
|
||||
type: 'stat',
|
||||
title: title,
|
||||
[if description != null then 'description']: description,
|
||||
transparent: transparent,
|
||||
datasource: datasource,
|
||||
targets: [],
|
||||
links: [],
|
||||
[if repeat != null then 'repeat']: repeat,
|
||||
[if repeat != null then 'repeatDirection']: repeatDirection,
|
||||
[if repeat != null then 'repeatMaxPerRow']: repeatMaxPerRow,
|
||||
|
||||
// targets
|
||||
_nextTarget:: 0,
|
||||
addTarget(target):: self {
|
||||
local nextTarget = super._nextTarget,
|
||||
_nextTarget: nextTarget + 1,
|
||||
targets+: [target { refId: std.char(std.codepoint('A') + nextTarget) }],
|
||||
},
|
||||
addTargets(targets):: std.foldl(function(p, t) p.addTarget(t), targets, self),
|
||||
|
||||
// links
|
||||
addLink(link):: self {
|
||||
links+: [link],
|
||||
},
|
||||
addLinks(links):: std.foldl(function(p, l) p.addLink(l), links, self),
|
||||
|
||||
pluginVersion: pluginVersion,
|
||||
} + (
|
||||
|
||||
if pluginVersion >= '7' then {
|
||||
options: {
|
||||
reduceOptions: {
|
||||
values: allValues,
|
||||
[if allValues && valueLimit != null then 'limit']: valueLimit,
|
||||
calcs: [
|
||||
reducerFunction,
|
||||
],
|
||||
fields: fields,
|
||||
},
|
||||
orientation: orientation,
|
||||
colorMode: colorMode,
|
||||
graphMode: graphMode,
|
||||
justifyMode: justifyMode,
|
||||
},
|
||||
fieldConfig: {
|
||||
defaults: {
|
||||
unit: unit,
|
||||
[if min != null then 'min']: min,
|
||||
[if max != null then 'max']: max,
|
||||
[if decimals != null then 'decimals']: decimals,
|
||||
[if displayName != null then 'displayName']: displayName,
|
||||
[if noValue != null then 'noValue']: noValue,
|
||||
thresholds: {
|
||||
mode: thresholdsMode,
|
||||
steps: [],
|
||||
},
|
||||
mappings: [],
|
||||
links: [],
|
||||
},
|
||||
},
|
||||
|
||||
// thresholds
|
||||
addThreshold(step):: self {
|
||||
fieldConfig+: { defaults+: { thresholds+: { steps+: [step] } } },
|
||||
},
|
||||
|
||||
// mappings
|
||||
_nextMapping:: 0,
|
||||
addMapping(mapping):: self {
|
||||
local nextMapping = super._nextMapping,
|
||||
_nextMapping: nextMapping + 1,
|
||||
fieldConfig+: { defaults+: { mappings+: [mapping { id: nextMapping }] } },
|
||||
},
|
||||
|
||||
// data links
|
||||
addDataLink(link):: self {
|
||||
fieldConfig+: { defaults+: { links+: [link] } },
|
||||
},
|
||||
} else {
|
||||
options: {
|
||||
fieldOptions: {
|
||||
values: allValues,
|
||||
[if allValues && valueLimit != null then 'limit']: valueLimit,
|
||||
calcs: [
|
||||
reducerFunction,
|
||||
],
|
||||
fields: fields,
|
||||
defaults: {
|
||||
unit: unit,
|
||||
[if min != null then 'min']: min,
|
||||
[if max != null then 'max']: max,
|
||||
[if decimals != null then 'decimals']: decimals,
|
||||
[if displayName != null then 'displayName']: displayName,
|
||||
[if noValue != null then 'noValue']: noValue,
|
||||
thresholds: {
|
||||
mode: thresholdsMode,
|
||||
steps: [],
|
||||
},
|
||||
mappings: [],
|
||||
links: [],
|
||||
},
|
||||
},
|
||||
orientation: orientation,
|
||||
colorMode: colorMode,
|
||||
graphMode: graphMode,
|
||||
justifyMode: justifyMode,
|
||||
},
|
||||
|
||||
// thresholds
|
||||
addThreshold(step):: self {
|
||||
options+: { fieldOptions+: { defaults+: { thresholds+: { steps+: [step] } } } },
|
||||
},
|
||||
|
||||
// mappings
|
||||
_nextMapping:: 0,
|
||||
addMapping(mapping):: self {
|
||||
local nextMapping = super._nextMapping,
|
||||
_nextMapping: nextMapping + 1,
|
||||
options+: { fieldOptions+: { defaults+: { mappings+: [mapping { id: nextMapping }] } } },
|
||||
},
|
||||
|
||||
// data links
|
||||
addDataLink(link):: self {
|
||||
options+: { fieldOptions+: { defaults+: { links+: [link] } } },
|
||||
},
|
||||
}
|
||||
|
||||
) + {
|
||||
addThresholds(steps):: std.foldl(function(p, s) p.addThreshold(s), steps, self),
|
||||
addMappings(mappings):: std.foldl(function(p, m) p.addMapping(m), mappings, self),
|
||||
addDataLinks(links):: std.foldl(function(p, l) p.addDataLink(l), links, self),
|
||||
},
|
||||
}
|
|
@ -250,3 +250,8 @@ While the community has not yet fully agreed on alert severities and their to be
|
|||
* For more motivation, see
|
||||
"[The RED Method: How to instrument your services](https://kccncna17.sched.com/event/CU8K/the-red-method-how-to-instrument-your-services-b-tom-wilkie-kausal?iframe=no&w=100%&sidebar=yes&bg=no)" talk from CloudNativeCon Austin.
|
||||
* For more information about monitoring mixins, see this [design doc](https://docs.google.com/document/d/1A9xvzwqnFVSOZ5fD3blKODXfsat5fg6ZhnKu9LK3lB4/edit#).
|
||||
|
||||
## Note
|
||||
|
||||
You can use the external tool called [prom-metrics-check](https://github.com/ContainerSolutions/prom-metrics-check) to validate the created dashboards. This tool allows you to check if the metrics installed and used in Grafana dashboards exist in the Prometheus instance.
|
||||
Please have a look at https://github.com/ContainerSolutions/prom-metrics-check.
|
|
@ -150,15 +150,35 @@
|
|||
{
|
||||
alert: 'KubeDaemonSetRolloutStuck',
|
||||
expr: |||
|
||||
kube_daemonset_status_number_ready{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}
|
||||
/
|
||||
kube_daemonset_status_desired_number_scheduled{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s} < 1.00
|
||||
(
|
||||
(
|
||||
kube_daemonset_status_current_number_scheduled{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}
|
||||
!=
|
||||
kube_daemonset_status_desired_number_scheduled{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}
|
||||
) or (
|
||||
kube_daemonset_status_number_misscheduled{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}
|
||||
!=
|
||||
0
|
||||
) or (
|
||||
kube_daemonset_updated_number_scheduled{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}
|
||||
!=
|
||||
kube_daemonset_status_desired_number_scheduled{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}
|
||||
) or (
|
||||
kube_daemonset_status_number_available{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}
|
||||
!=
|
||||
kube_daemonset_status_desired_number_scheduled{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}
|
||||
)
|
||||
) and (
|
||||
changes(kube_daemonset_updated_number_scheduled{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}[5m])
|
||||
==
|
||||
0
|
||||
)
|
||||
||| % $._config,
|
||||
labels: {
|
||||
severity: 'warning',
|
||||
},
|
||||
annotations: {
|
||||
message: 'Only {{ $value | humanizePercentage }} of the desired Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are scheduled and ready.',
|
||||
message: 'DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not finished or progressed for at least 15 minutes.',
|
||||
},
|
||||
'for': '15m',
|
||||
},
|
||||
|
@ -208,12 +228,12 @@
|
|||
expr: |||
|
||||
kube_job_spec_completions{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s} - kube_job_status_succeeded{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s} > 0
|
||||
||| % $._config,
|
||||
'for': '1h',
|
||||
'for': '12h',
|
||||
labels: {
|
||||
severity: 'warning',
|
||||
},
|
||||
annotations: {
|
||||
message: 'Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than one hour to complete.',
|
||||
message: 'Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than 12 hours to complete.',
|
||||
},
|
||||
},
|
||||
{
|
||||
|
|
|
@ -84,14 +84,14 @@ local utils = import 'utils.libsonnet';
|
|||
{
|
||||
alert: 'AggregatedAPIDown',
|
||||
expr: |||
|
||||
sum by(name, namespace)(sum_over_time(aggregator_unavailable_apiservice[5m])) > 0
|
||||
(1 - max by(name, namespace)(avg_over_time(aggregator_unavailable_apiservice[5m]))) * 100 < 90
|
||||
||| % $._config,
|
||||
'for': '5m',
|
||||
labels: {
|
||||
severity: 'warning',
|
||||
},
|
||||
annotations: {
|
||||
message: 'An aggregated API {{ $labels.name }}/{{ $labels.namespace }} is down. It has not been available at least for the past five minutes.',
|
||||
message: 'An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}% available over the last 5m.',
|
||||
},
|
||||
},
|
||||
(import '../lib/absent_alert.libsonnet') {
|
||||
|
|
|
@ -2,6 +2,11 @@
|
|||
_config+:: {
|
||||
kubeStateMetricsSelector: error 'must provide selector for kube-state-metrics',
|
||||
kubeletSelector: error 'must provide selector for kubelet',
|
||||
kubeNodeUnreachableIgnoreKeys: [
|
||||
'ToBeDeletedByClusterAutoscaler',
|
||||
'cloud.google.com/impending-node-termination',
|
||||
'aws-node-termination-handler/spot-itn',
|
||||
],
|
||||
},
|
||||
|
||||
prometheusAlerts+:: {
|
||||
|
@ -24,8 +29,10 @@
|
|||
},
|
||||
{
|
||||
expr: |||
|
||||
(kube_node_spec_taint{%(kubeStateMetricsSelector)s,key="node.kubernetes.io/unreachable",effect="NoSchedule"} unless ignoring(key,value) kube_node_spec_taint{%(kubeStateMetricsSelector)s,key="ToBeDeletedByClusterAutoscaler"}) == 1
|
||||
||| % $._config,
|
||||
(kube_node_spec_taint{%(kubeStateMetricsSelector)s,key="node.kubernetes.io/unreachable",effect="NoSchedule"} unless ignoring(key,value) kube_node_spec_taint{%(kubeStateMetricsSelector)s,key=~"%(kubeNodeUnreachableIgnoreKeys)s"}) == 1
|
||||
||| % $._config {
|
||||
kubeNodeUnreachableIgnoreKeys: std.join('|', super.kubeNodeUnreachableIgnoreKeys),
|
||||
},
|
||||
labels: {
|
||||
severity: 'warning',
|
||||
},
|
||||
|
@ -39,7 +46,13 @@
|
|||
// Some node has a capacity of 1 like AWS's Fargate and only exists while a pod is running on it.
|
||||
// We have to ignore this special node in the KubeletTooManyPods alert.
|
||||
expr: |||
|
||||
max(max(kubelet_running_pod_count{%(kubeletSelector)s}) by(instance) * on(instance) group_left(node) kubelet_node_name{%(kubeletSelector)s}) by(node) / max(kube_node_status_capacity_pods{%(kubeStateMetricsSelector)s} != 1) by(node) > 0.95
|
||||
count by(node) (
|
||||
(kube_pod_status_phase{%(kubeStateMetricsSelector)s,phase="Running"} == 1) * on(instance,pod,namespace,cluster) group_left(node) topk by(instance,pod,namespace,cluster) (1, kube_pod_info{%(kubeStateMetricsSelector)s})
|
||||
)
|
||||
/
|
||||
max by(node) (
|
||||
kube_node_status_capacity_pods{%(kubeStateMetricsSelector)s} != 1
|
||||
) > 0.95
|
||||
||| % $._config,
|
||||
'for': '15m',
|
||||
labels: {
|
||||
|
|
|
@ -116,7 +116,7 @@
|
|||
||| % $._config,
|
||||
'for': '15m',
|
||||
labels: {
|
||||
severity: 'warning',
|
||||
severity: 'info',
|
||||
},
|
||||
annotations: {
|
||||
message: '{{ $value | humanizePercentage }} throttling of CPU in namespace {{ $labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod }}.',
|
||||
|
|
|
@ -11,7 +11,7 @@
|
|||
{
|
||||
alert: 'KubeVersionMismatch',
|
||||
expr: |||
|
||||
count(count by (gitVersion) (label_replace(kubernetes_build_info{%(notKubeDnsCoreDnsSelector)s},"gitVersion","$1","gitVersion","(v[0-9]*.[0-9]*.[0-9]*).*"))) > 1
|
||||
count(count by (gitVersion) (label_replace(kubernetes_build_info{%(notKubeDnsCoreDnsSelector)s},"gitVersion","$1","gitVersion","(v[0-9]*.[0-9]*).*"))) > 1
|
||||
||| % $._config,
|
||||
'for': '15m',
|
||||
labels: {
|
||||
|
|
|
@ -22,7 +22,7 @@ local singlestat = grafana.singlestat;
|
|||
decimals=3,
|
||||
description='How many percent of requests (both read and write) in %d days have been answered successfully and fast enough?' % $._config.SLOs.apiserver.days,
|
||||
)
|
||||
.addTarget(prometheus.target('apiserver_request:availability%dd{verb="all"}' % $._config.SLOs.apiserver.days));
|
||||
.addTarget(prometheus.target('apiserver_request:availability%dd{verb="all", %(clusterLabel)s="$cluster"}' % [$._config.SLOs.apiserver.days, $._config.clusterLabel]));
|
||||
|
||||
local errorBudget =
|
||||
graphPanel.new(
|
||||
|
@ -34,7 +34,7 @@ local singlestat = grafana.singlestat;
|
|||
fill=10,
|
||||
description='How much error budget is left looking at our %.3f%% availability gurantees?' % $._config.SLOs.apiserver.target,
|
||||
)
|
||||
.addTarget(prometheus.target('100 * (apiserver_request:availability%dd{verb="all"} - %f)' % [$._config.SLOs.apiserver.days, $._config.SLOs.apiserver.target], legendFormat='errorbudget'));
|
||||
.addTarget(prometheus.target('100 * (apiserver_request:availability%dd{verb="all", %(clusterLabel)s="$cluster"} - %f)' % [$._config.SLOs.apiserver.days, $._config.clusterLabel, $._config.SLOs.apiserver.target], legendFormat='errorbudget'));
|
||||
|
||||
local readAvailability =
|
||||
singlestat.new(
|
||||
|
@ -45,7 +45,7 @@ local singlestat = grafana.singlestat;
|
|||
decimals=3,
|
||||
description='How many percent of read requests (LIST,GET) in %d days have been answered successfully and fast enough?' % $._config.SLOs.apiserver.days,
|
||||
)
|
||||
.addTarget(prometheus.target('apiserver_request:availability%dd{verb="read"}' % $._config.SLOs.apiserver.days));
|
||||
.addTarget(prometheus.target('apiserver_request:availability%dd{verb="read", %(clusterLabel)s="$cluster"}' % [$._config.SLOs.apiserver.days, $._config.clusterLabel]));
|
||||
|
||||
local readRequests =
|
||||
graphPanel.new(
|
||||
|
@ -61,7 +61,7 @@ local singlestat = grafana.singlestat;
|
|||
.addSeriesOverride({ alias: '/3../i', color: '#F2CC0C' })
|
||||
.addSeriesOverride({ alias: '/4../i', color: '#3274D9' })
|
||||
.addSeriesOverride({ alias: '/5../i', color: '#E02F44' })
|
||||
.addTarget(prometheus.target('sum by (code) (code_resource:apiserver_request_total:rate5m{verb="read"})', legendFormat='{{ code }}'));
|
||||
.addTarget(prometheus.target('sum by (code) (code_resource:apiserver_request_total:rate5m{verb="read", %(clusterLabel)s="$cluster"})' % $._config, legendFormat='{{ code }}'));
|
||||
|
||||
local readErrors =
|
||||
graphPanel.new(
|
||||
|
@ -72,7 +72,7 @@ local singlestat = grafana.singlestat;
|
|||
format='percentunit',
|
||||
description='How many percent of read requests (LIST,GET) per second are returned with errors (5xx)?',
|
||||
)
|
||||
.addTarget(prometheus.target('sum by (resource) (code_resource:apiserver_request_total:rate5m{verb="read",code=~"5.."}) / sum by (resource) (code_resource:apiserver_request_total:rate5m{verb="read"})', legendFormat='{{ resource }}'));
|
||||
.addTarget(prometheus.target('sum by (resource) (code_resource:apiserver_request_total:rate5m{verb="read",code=~"5..", %(clusterLabel)s="$cluster"}) / sum by (resource) (code_resource:apiserver_request_total:rate5m{verb="read", %(clusterLabel)s="$cluster"})' % $._config, legendFormat='{{ resource }}'));
|
||||
|
||||
local readDuration =
|
||||
graphPanel.new(
|
||||
|
@ -82,7 +82,7 @@ local singlestat = grafana.singlestat;
|
|||
format='s',
|
||||
description='How many seconds is the 99th percentile for reading (LIST|GET) a given resource?',
|
||||
)
|
||||
.addTarget(prometheus.target('cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{verb="read"}', legendFormat='{{ resource }}'));
|
||||
.addTarget(prometheus.target('cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{verb="read", %(clusterLabel)s="$cluster"}' % $._config, legendFormat='{{ resource }}'));
|
||||
|
||||
local writeAvailability =
|
||||
singlestat.new(
|
||||
|
@ -93,7 +93,7 @@ local singlestat = grafana.singlestat;
|
|||
decimals=3,
|
||||
description='How many percent of write requests (POST|PUT|PATCH|DELETE) in %d days have been answered successfully and fast enough?' % $._config.SLOs.apiserver.days,
|
||||
)
|
||||
.addTarget(prometheus.target('apiserver_request:availability%dd{verb="write"}' % $._config.SLOs.apiserver.days));
|
||||
.addTarget(prometheus.target('apiserver_request:availability%dd{verb="write", %(clusterLabel)s="$cluster"}' % [$._config.SLOs.apiserver.days, $._config.clusterLabel]));
|
||||
|
||||
local writeRequests =
|
||||
graphPanel.new(
|
||||
|
@ -109,7 +109,7 @@ local singlestat = grafana.singlestat;
|
|||
.addSeriesOverride({ alias: '/3../i', color: '#F2CC0C' })
|
||||
.addSeriesOverride({ alias: '/4../i', color: '#3274D9' })
|
||||
.addSeriesOverride({ alias: '/5../i', color: '#E02F44' })
|
||||
.addTarget(prometheus.target('sum by (code) (code_resource:apiserver_request_total:rate5m{verb="write"})', legendFormat='{{ code }}'));
|
||||
.addTarget(prometheus.target('sum by (code) (code_resource:apiserver_request_total:rate5m{verb="write", %(clusterLabel)s="$cluster"})' % $._config, legendFormat='{{ code }}'));
|
||||
|
||||
local writeErrors =
|
||||
graphPanel.new(
|
||||
|
@ -120,7 +120,7 @@ local singlestat = grafana.singlestat;
|
|||
format='percentunit',
|
||||
description='How many percent of write requests (POST|PUT|PATCH|DELETE) per second are returned with errors (5xx)?',
|
||||
)
|
||||
.addTarget(prometheus.target('sum by (resource) (code_resource:apiserver_request_total:rate5m{verb="write",code=~"5.."}) / sum by (resource) (code_resource:apiserver_request_total:rate5m{verb="write"})', legendFormat='{{ resource }}'));
|
||||
.addTarget(prometheus.target('sum by (resource) (code_resource:apiserver_request_total:rate5m{verb="write",code=~"5..", %(clusterLabel)s="$cluster"}) / sum by (resource) (code_resource:apiserver_request_total:rate5m{verb="write", %(clusterLabel)s="$cluster"})' % $._config, legendFormat='{{ resource }}'));
|
||||
|
||||
local writeDuration =
|
||||
graphPanel.new(
|
||||
|
@ -130,13 +130,13 @@ local singlestat = grafana.singlestat;
|
|||
format='s',
|
||||
description='How many seconds is the 99th percentile for writing (POST|PUT|PATCH|DELETE) a given resource?',
|
||||
)
|
||||
.addTarget(prometheus.target('cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{verb="write"}', legendFormat='{{ resource }}'));
|
||||
.addTarget(prometheus.target('cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{verb="write", %(clusterLabel)s="$cluster"}' % $._config, legendFormat='{{ resource }}'));
|
||||
|
||||
local workQueueAddRate =
|
||||
graphPanel.new(
|
||||
'Work Queue Add Rate',
|
||||
datasource='$datasource',
|
||||
span=4,
|
||||
span=6,
|
||||
format='ops',
|
||||
legend_show=false,
|
||||
min=0,
|
||||
|
@ -147,7 +147,7 @@ local singlestat = grafana.singlestat;
|
|||
graphPanel.new(
|
||||
'Work Queue Depth',
|
||||
datasource='$datasource',
|
||||
span=4,
|
||||
span=6,
|
||||
format='short',
|
||||
legend_show=false,
|
||||
min=0,
|
||||
|
@ -159,7 +159,7 @@ local singlestat = grafana.singlestat;
|
|||
graphPanel.new(
|
||||
'Work Queue Latency',
|
||||
datasource='$datasource',
|
||||
span=4,
|
||||
span=12,
|
||||
format='s',
|
||||
legend_show=true,
|
||||
legend_values=true,
|
||||
|
@ -169,38 +169,6 @@ local singlestat = grafana.singlestat;
|
|||
)
|
||||
.addTarget(prometheus.target('histogram_quantile(0.99, sum(rate(workqueue_queue_duration_seconds_bucket{%(kubeApiserverSelector)s, instance=~"$instance", %(clusterLabel)s="$cluster"}[5m])) by (instance, name, le))' % $._config, legendFormat='{{instance}} {{name}}'));
|
||||
|
||||
local etcdCacheEntryTotal =
|
||||
graphPanel.new(
|
||||
'ETCD Cache Entry Total',
|
||||
datasource='$datasource',
|
||||
span=4,
|
||||
format='short',
|
||||
min=0,
|
||||
)
|
||||
.addTarget(prometheus.target('etcd_helper_cache_entry_total{%(kubeApiserverSelector)s, instance=~"$instance", %(clusterLabel)s="$cluster"}' % $._config, legendFormat='{{instance}}'));
|
||||
|
||||
local etcdCacheEntryRate =
|
||||
graphPanel.new(
|
||||
'ETCD Cache Hit/Miss Rate',
|
||||
datasource='$datasource',
|
||||
span=4,
|
||||
format='ops',
|
||||
min=0,
|
||||
)
|
||||
.addTarget(prometheus.target('sum(rate(etcd_helper_cache_hit_total{%(kubeApiserverSelector)s,instance=~"$instance", %(clusterLabel)s="$cluster"}[5m])) by (instance)' % $._config, legendFormat='{{instance}} hit'))
|
||||
.addTarget(prometheus.target('sum(rate(etcd_helper_cache_miss_total{%(kubeApiserverSelector)s,instance=~"$instance", %(clusterLabel)s="$cluster"}[5m])) by (instance)' % $._config, legendFormat='{{instance}} miss'));
|
||||
|
||||
local etcdCacheLatency =
|
||||
graphPanel.new(
|
||||
'ETCD Cache Duration 99th Quantile',
|
||||
datasource='$datasource',
|
||||
span=4,
|
||||
format='s',
|
||||
min=0,
|
||||
)
|
||||
.addTarget(prometheus.target('histogram_quantile(0.99,sum(rate(etcd_request_cache_get_duration_seconds_bucket{%(kubeApiserverSelector)s,instance=~"$instance", %(clusterLabel)s="$cluster"}[5m])) by (instance, le))' % $._config, legendFormat='{{instance}} get'))
|
||||
.addTarget(prometheus.target('histogram_quantile(0.99,sum(rate(etcd_request_cache_add_duration_seconds_bucket{%(kubeApiserverSelector)s,instance=~"$instance", %(clusterLabel)s="$cluster"}[5m])) by (instance, le))' % $._config, legendFormat='{{instance}} miss'));
|
||||
|
||||
local memory =
|
||||
graphPanel.new(
|
||||
'Memory',
|
||||
|
@ -252,14 +220,13 @@ local singlestat = grafana.singlestat;
|
|||
)
|
||||
.addTemplate(
|
||||
template.new(
|
||||
name='cluster',
|
||||
datasource='$datasource',
|
||||
query='label_values(apiserver_request_total, %(clusterLabel)s)' % $._config,
|
||||
current='prod',
|
||||
'cluster',
|
||||
'$datasource',
|
||||
'label_values(apiserver_request_total, %(clusterLabel)s)' % $._config,
|
||||
label='cluster',
|
||||
refresh='time',
|
||||
hide=if $._config.showMultiCluster then '' else 'variable',
|
||||
refresh=1,
|
||||
includeAll=false,
|
||||
sort=1
|
||||
sort=1,
|
||||
)
|
||||
)
|
||||
.addTemplate(
|
||||
|
@ -309,11 +276,6 @@ local singlestat = grafana.singlestat;
|
|||
.addPanel(workQueueAddRate)
|
||||
.addPanel(workQueueDepth)
|
||||
.addPanel(workQueueLatency)
|
||||
).addRow(
|
||||
row.new()
|
||||
.addPanel(etcdCacheEntryTotal)
|
||||
.addPanel(etcdCacheEntryRate)
|
||||
.addPanel(etcdCacheLatency)
|
||||
).addRow(
|
||||
row.new()
|
||||
.addPanel(memory)
|
||||
|
|
|
@ -81,7 +81,7 @@ local singlestat = grafana.singlestat;
|
|||
format='s',
|
||||
min=0,
|
||||
)
|
||||
.addTarget(prometheus.target('histogram_quantile(0.99, sum(rate(rest_client_request_latency_seconds_bucket{%(kubeControllerManagerSelector)s, instance=~"$instance", verb="POST"}[5m])) by (verb, url, le))' % $._config, legendFormat='{{verb}} {{url}}'));
|
||||
.addTarget(prometheus.target('histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{%(kubeControllerManagerSelector)s, instance=~"$instance", verb="POST"}[5m])) by (verb, url, le))' % $._config, legendFormat='{{verb}} {{url}}'));
|
||||
|
||||
local getRequestLatency =
|
||||
graphPanel.new(
|
||||
|
@ -96,7 +96,7 @@ local singlestat = grafana.singlestat;
|
|||
legend_alignAsTable=true,
|
||||
legend_rightSide=true,
|
||||
)
|
||||
.addTarget(prometheus.target('histogram_quantile(0.99, sum(rate(rest_client_request_latency_seconds_bucket{%(kubeControllerManagerSelector)s, instance=~"$instance", verb="GET"}[5m])) by (verb, url, le))' % $._config, legendFormat='{{verb}} {{url}}'));
|
||||
.addTarget(prometheus.target('histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{%(kubeControllerManagerSelector)s, instance=~"$instance", verb="GET"}[5m])) by (verb, url, le))' % $._config, legendFormat='{{verb}} {{url}}'));
|
||||
|
||||
local memory =
|
||||
graphPanel.new(
|
||||
|
|
|
@ -292,7 +292,7 @@ local singlestat = grafana.singlestat;
|
|||
legend_alignAsTable=true,
|
||||
legend_rightSide=true,
|
||||
)
|
||||
.addTarget(prometheus.target('histogram_quantile(0.99, sum(rate(rest_client_request_latency_seconds_bucket{%(clusterLabel)s="$cluster",%(kubeletSelector)s, instance=~"$instance"}[5m])) by (instance, verb, url, le))' % $._config, legendFormat='{{instance}} {{verb}} {{url}}'));
|
||||
.addTarget(prometheus.target('histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{%(clusterLabel)s="$cluster",%(kubeletSelector)s, instance=~"$instance"}[5m])) by (instance, verb, url, le))' % $._config, legendFormat='{{instance}} {{verb}} {{url}}'));
|
||||
|
||||
local memory =
|
||||
graphPanel.new(
|
||||
|
|
|
@ -253,7 +253,7 @@ local singlestat = grafana.singlestat;
|
|||
template.new(
|
||||
name='type',
|
||||
datasource='$datasource',
|
||||
query='label_values(mixin_pod_workload{namespace=~"$namespace", workload=~".+"}, workload_type)',
|
||||
query='label_values(namespace_workload_pod:kube_pod_owner:relabel{namespace=~"$namespace", workload=~".+"}, workload_type)',
|
||||
current='deployment',
|
||||
hide='',
|
||||
refresh=1,
|
||||
|
@ -263,7 +263,7 @@ local singlestat = grafana.singlestat;
|
|||
auto: false,
|
||||
auto_count: 30,
|
||||
auto_min: '10s',
|
||||
definition: 'label_values(mixin_pod_workload{namespace=~"$namespace", workload=~".+"}, workload_type)',
|
||||
definition: 'label_values(namespace_workload_pod:kube_pod_owner:relabel{namespace=~"$namespace", workload=~".+"}, workload_type)',
|
||||
skipUrlSync: false,
|
||||
};
|
||||
|
||||
|
@ -402,7 +402,7 @@ local singlestat = grafana.singlestat;
|
|||
graphQuery=|||
|
||||
sort_desc(sum(irate(container_network_receive_bytes_total{namespace=~"$namespace"}[$interval:$resolution])
|
||||
* on (namespace,pod)
|
||||
group_left(workload,workload_type) mixin_pod_workload{namespace=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
|
||||
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
|
||||
|||,
|
||||
legendFormat='{{ workload }}',
|
||||
),
|
||||
|
@ -414,7 +414,7 @@ local singlestat = grafana.singlestat;
|
|||
graphQuery=|||
|
||||
sort_desc(sum(irate(container_network_transmit_bytes_total{namespace=~"$namespace"}[$interval:$resolution])
|
||||
* on (namespace,pod)
|
||||
group_left(workload,workload_type) mixin_pod_workload{namespace=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
|
||||
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
|
||||
|||,
|
||||
legendFormat='{{ workload }}',
|
||||
),
|
||||
|
@ -427,42 +427,42 @@ local singlestat = grafana.singlestat;
|
|||
|||
|
||||
sort_desc(sum(irate(container_network_receive_bytes_total{namespace=~"$namespace"}[$interval:$resolution])
|
||||
* on (namespace,pod)
|
||||
group_left(workload,workload_type) mixin_pod_workload{namespace=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
|
||||
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
|
||||
|||,
|
||||
|||
|
||||
sort_desc(sum(irate(container_network_transmit_bytes_total{namespace=~"$namespace"}[$interval:$resolution])
|
||||
* on (namespace,pod)
|
||||
group_left(workload,workload_type) mixin_pod_workload{namespace=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
|
||||
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
|
||||
|||,
|
||||
|||
|
||||
sort_desc(avg(irate(container_network_receive_bytes_total{namespace=~"$namespace"}[$interval:$resolution])
|
||||
* on (namespace,pod)
|
||||
group_left(workload,workload_type) mixin_pod_workload{namespace=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
|
||||
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
|
||||
|||,
|
||||
|||
|
||||
sort_desc(avg(irate(container_network_transmit_bytes_total{namespace=~"$namespace"}[$interval:$resolution])
|
||||
* on (namespace,pod)
|
||||
group_left(workload,workload_type) mixin_pod_workload{namespace=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
|
||||
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
|
||||
|||,
|
||||
|||
|
||||
sort_desc(sum(irate(container_network_receive_packets_total{namespace=~"$namespace"}[$interval:$resolution])
|
||||
* on (namespace,pod)
|
||||
group_left(workload,workload_type) mixin_pod_workload{namespace=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
|
||||
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
|
||||
|||,
|
||||
|||
|
||||
sort_desc(sum(irate(container_network_transmit_packets_total{namespace=~"$namespace"}[$interval:$resolution])
|
||||
* on (namespace,pod)
|
||||
group_left(workload,workload_type) mixin_pod_workload{namespace=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
|
||||
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
|
||||
|||,
|
||||
|||
|
||||
sort_desc(sum(irate(container_network_receive_packets_dropped_total{namespace=~"$namespace"}[$interval:$resolution])
|
||||
* on (namespace,pod)
|
||||
group_left(workload,workload_type) mixin_pod_workload{namespace=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
|
||||
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
|
||||
|||,
|
||||
|||
|
||||
sort_desc(sum(irate(container_network_transmit_packets_dropped_total{namespace=~"$namespace"}[$interval:$resolution])
|
||||
* on (namespace,pod)
|
||||
group_left(workload,workload_type) mixin_pod_workload{namespace=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
|
||||
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
|
||||
|||,
|
||||
]
|
||||
),
|
||||
|
@ -476,7 +476,7 @@ local singlestat = grafana.singlestat;
|
|||
graphQuery=|||
|
||||
sort_desc(avg(irate(container_network_receive_bytes_total{namespace=~"$namespace"}[$interval:$resolution])
|
||||
* on (namespace,pod)
|
||||
group_left(workload,workload_type) mixin_pod_workload{namespace=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
|
||||
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
|
||||
|||,
|
||||
legendFormat='{{ workload }}',
|
||||
),
|
||||
|
@ -488,7 +488,7 @@ local singlestat = grafana.singlestat;
|
|||
graphQuery=|||
|
||||
sort_desc(avg(irate(container_network_transmit_bytes_total{namespace=~"$namespace"}[$interval:$resolution])
|
||||
* on (namespace,pod)
|
||||
group_left(workload,workload_type) mixin_pod_workload{namespace=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
|
||||
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
|
||||
|||,
|
||||
legendFormat='{{ workload }}',
|
||||
),
|
||||
|
@ -505,7 +505,7 @@ local singlestat = grafana.singlestat;
|
|||
graphQuery=|||
|
||||
sort_desc(sum(irate(container_network_receive_bytes_total{namespace=~"$namespace"}[$interval:$resolution])
|
||||
* on (namespace,pod)
|
||||
group_left(workload,workload_type) mixin_pod_workload{namespace=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
|
||||
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
|
||||
|||,
|
||||
),
|
||||
gridPos={ h: 9, w: 12, x: 0, y: 38 }
|
||||
|
@ -516,7 +516,7 @@ local singlestat = grafana.singlestat;
|
|||
graphQuery=|||
|
||||
sort_desc(sum(irate(container_network_transmit_bytes_total{namespace=~"$namespace"}[$interval:$resolution])
|
||||
* on (namespace,pod)
|
||||
group_left(workload,workload_type) mixin_pod_workload{namespace=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
|
||||
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
|
||||
|||,
|
||||
),
|
||||
gridPos={ h: 9, w: 12, x: 12, y: 38 }
|
||||
|
@ -529,7 +529,7 @@ local singlestat = grafana.singlestat;
|
|||
graphQuery=|||
|
||||
sort_desc(sum(irate(container_network_receive_packets_total{namespace=~"$namespace"}[$interval:$resolution])
|
||||
* on (namespace,pod)
|
||||
group_left(workload,workload_type) mixin_pod_workload{namespace=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
|
||||
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
|
||||
|||,
|
||||
graphFormat='pps'
|
||||
),
|
||||
|
@ -541,7 +541,7 @@ local singlestat = grafana.singlestat;
|
|||
graphQuery=|||
|
||||
sort_desc(sum(irate(container_network_transmit_packets_total{namespace=~"$namespace"}[$interval:$resolution])
|
||||
* on (namespace,pod)
|
||||
group_left(workload,workload_type) mixin_pod_workload{namespace=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
|
||||
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
|
||||
|||,
|
||||
graphFormat='pps'
|
||||
),
|
||||
|
@ -557,7 +557,7 @@ local singlestat = grafana.singlestat;
|
|||
graphQuery=|||
|
||||
sort_desc(sum(irate(container_network_receive_packets_dropped_total{namespace=~"$namespace"}[$interval:$resolution])
|
||||
* on (namespace,pod)
|
||||
group_left(workload,workload_type) mixin_pod_workload{namespace=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
|
||||
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
|
||||
|||,
|
||||
graphFormat='pps'
|
||||
),
|
||||
|
@ -569,7 +569,7 @@ local singlestat = grafana.singlestat;
|
|||
graphQuery=|||
|
||||
sort_desc(sum(irate(container_network_transmit_packets_dropped_total{namespace=~"$namespace"}[$interval:$resolution])
|
||||
* on (namespace,pod)
|
||||
group_left(workload,workload_type) mixin_pod_workload{namespace=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
|
||||
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
|
||||
|||,
|
||||
graphFormat='pps'
|
||||
),
|
||||
|
|
|
@ -119,7 +119,7 @@ local singlestat = grafana.singlestat;
|
|||
template.new(
|
||||
name='workload',
|
||||
datasource='$datasource',
|
||||
query='label_values(mixin_pod_workload{namespace=~"$namespace"}, workload)',
|
||||
query='label_values(namespace_workload_pod:kube_pod_owner:relabel{namespace=~"$namespace"}, workload)',
|
||||
current='',
|
||||
hide='',
|
||||
refresh=1,
|
||||
|
@ -129,7 +129,7 @@ local singlestat = grafana.singlestat;
|
|||
auto: false,
|
||||
auto_count: 30,
|
||||
auto_min: '10s',
|
||||
definition: 'label_values(mixin_pod_workload{namespace=~"$namespace"}, workload)',
|
||||
definition: 'label_values(namespace_workload_pod:kube_pod_owner:relabel{namespace=~"$namespace"}, workload)',
|
||||
skipUrlSync: false,
|
||||
};
|
||||
|
||||
|
@ -137,7 +137,7 @@ local singlestat = grafana.singlestat;
|
|||
template.new(
|
||||
name='type',
|
||||
datasource='$datasource',
|
||||
query='label_values(mixin_pod_workload{namespace=~"$namespace", workload=~"$workload"}, workload_type)',
|
||||
query='label_values(namespace_workload_pod:kube_pod_owner:relabel{namespace=~"$namespace", workload=~"$workload"}, workload_type)',
|
||||
current='deployment',
|
||||
hide='',
|
||||
refresh=1,
|
||||
|
@ -147,7 +147,7 @@ local singlestat = grafana.singlestat;
|
|||
auto: false,
|
||||
auto_count: 30,
|
||||
auto_min: '10s',
|
||||
definition: 'label_values(mixin_pod_workload{namespace=~"$namespace", workload=~"$workload"}, workload_type)',
|
||||
definition: 'label_values(namespace_workload_pod:kube_pod_owner:relabel{namespace=~"$namespace", workload=~"$workload"}, workload_type)',
|
||||
skipUrlSync: false,
|
||||
};
|
||||
|
||||
|
@ -287,7 +287,7 @@ local singlestat = grafana.singlestat;
|
|||
graphQuery=|||
|
||||
sort_desc(sum(irate(container_network_receive_bytes_total{namespace=~"$namespace"}[$interval:$resolution])
|
||||
* on (namespace,pod)
|
||||
group_left(workload,workload_type) mixin_pod_workload{namespace=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
|
||||
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
|
||||
|||,
|
||||
legendFormat='{{ pod }}',
|
||||
),
|
||||
|
@ -299,7 +299,7 @@ local singlestat = grafana.singlestat;
|
|||
graphQuery=|||
|
||||
sort_desc(sum(irate(container_network_transmit_bytes_total{namespace=~"$namespace"}[$interval:$resolution])
|
||||
* on (namespace,pod)
|
||||
group_left(workload,workload_type) mixin_pod_workload{namespace=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
|
||||
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
|
||||
|||,
|
||||
legendFormat='{{ pod }}',
|
||||
),
|
||||
|
@ -313,7 +313,7 @@ local singlestat = grafana.singlestat;
|
|||
graphQuery=|||
|
||||
sort_desc(avg(irate(container_network_receive_bytes_total{namespace=~"$namespace"}[$interval:$resolution])
|
||||
* on (namespace,pod)
|
||||
group_left(workload,workload_type) mixin_pod_workload{namespace=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
|
||||
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
|
||||
|||,
|
||||
legendFormat='{{ pod }}',
|
||||
),
|
||||
|
@ -325,7 +325,7 @@ local singlestat = grafana.singlestat;
|
|||
graphQuery=|||
|
||||
sort_desc(avg(irate(container_network_transmit_bytes_total{namespace=~"$namespace"}[$interval:$resolution])
|
||||
* on (namespace,pod)
|
||||
group_left(workload,workload_type) mixin_pod_workload{namespace=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
|
||||
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
|
||||
|||,
|
||||
legendFormat='{{ pod }}',
|
||||
),
|
||||
|
@ -342,7 +342,7 @@ local singlestat = grafana.singlestat;
|
|||
graphQuery=|||
|
||||
sort_desc(sum(irate(container_network_receive_bytes_total{namespace=~"$namespace"}[$interval:$resolution])
|
||||
* on (namespace,pod)
|
||||
group_left(workload,workload_type) mixin_pod_workload{namespace=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
|
||||
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
|
||||
|||,
|
||||
),
|
||||
gridPos={ h: 9, w: 12, x: 0, y: 12 }
|
||||
|
@ -353,7 +353,7 @@ local singlestat = grafana.singlestat;
|
|||
graphQuery=|||
|
||||
sort_desc(sum(irate(container_network_transmit_bytes_total{namespace=~"$namespace"}[$interval:$resolution])
|
||||
* on (namespace,pod)
|
||||
group_left(workload,workload_type) mixin_pod_workload{namespace=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
|
||||
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
|
||||
|||,
|
||||
),
|
||||
gridPos={ h: 9, w: 12, x: 12, y: 12 }
|
||||
|
@ -366,7 +366,7 @@ local singlestat = grafana.singlestat;
|
|||
graphQuery=|||
|
||||
sort_desc(sum(irate(container_network_receive_packets_total{namespace=~"$namespace"}[$interval:$resolution])
|
||||
* on (namespace,pod)
|
||||
group_left(workload,workload_type) mixin_pod_workload{namespace=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
|
||||
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
|
||||
|||,
|
||||
graphFormat='pps'
|
||||
),
|
||||
|
@ -378,7 +378,7 @@ local singlestat = grafana.singlestat;
|
|||
graphQuery=|||
|
||||
sort_desc(sum(irate(container_network_transmit_packets_total{namespace=~"$namespace"}[$interval:$resolution])
|
||||
* on (namespace,pod)
|
||||
group_left(workload,workload_type) mixin_pod_workload{namespace=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
|
||||
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
|
||||
|||,
|
||||
graphFormat='pps'
|
||||
),
|
||||
|
@ -394,7 +394,7 @@ local singlestat = grafana.singlestat;
|
|||
graphQuery=|||
|
||||
sort_desc(sum(irate(container_network_receive_packets_dropped_total{namespace=~"$namespace"}[$interval:$resolution])
|
||||
* on (namespace,pod)
|
||||
group_left(workload,workload_type) mixin_pod_workload{namespace=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
|
||||
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
|
||||
|||,
|
||||
graphFormat='pps'
|
||||
),
|
||||
|
@ -406,7 +406,7 @@ local singlestat = grafana.singlestat;
|
|||
graphQuery=|||
|
||||
sort_desc(sum(irate(container_network_transmit_packets_dropped_total{namespace=~"$namespace"}[$interval:$resolution])
|
||||
* on (namespace,pod)
|
||||
group_left(workload,workload_type) mixin_pod_workload{namespace=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
|
||||
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
|
||||
|||,
|
||||
graphFormat='pps'
|
||||
),
|
||||
|
|
|
@ -88,7 +88,7 @@ local singlestat = grafana.singlestat;
|
|||
format='s',
|
||||
min=0,
|
||||
)
|
||||
.addTarget(prometheus.target('histogram_quantile(0.99, sum(rate(rest_client_request_latency_seconds_bucket{%(kubeProxySelector)s,instance=~"$instance",verb="POST"}[5m])) by (verb, url, le))' % $._config, legendFormat='{{verb}} {{url}}'));
|
||||
.addTarget(prometheus.target('histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{%(kubeProxySelector)s,instance=~"$instance",verb="POST"}[5m])) by (verb, url, le))' % $._config, legendFormat='{{verb}} {{url}}'));
|
||||
|
||||
local getRequestLatency =
|
||||
graphPanel.new(
|
||||
|
@ -103,7 +103,7 @@ local singlestat = grafana.singlestat;
|
|||
legend_alignAsTable=true,
|
||||
legend_rightSide=true,
|
||||
)
|
||||
.addTarget(prometheus.target('histogram_quantile(0.99, sum(rate(rest_client_request_latency_seconds_bucket{%(kubeProxySelector)s, instance=~"$instance", verb="GET"}[5m])) by (verb, url, le))' % $._config, legendFormat='{{verb}} {{url}}'));
|
||||
.addTarget(prometheus.target('histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{%(kubeProxySelector)s, instance=~"$instance", verb="GET"}[5m])) by (verb, url, le))' % $._config, legendFormat='{{verb}} {{url}}'));
|
||||
|
||||
local memory =
|
||||
graphPanel.new(
|
||||
|
|
|
@ -40,7 +40,7 @@ local template = grafana.template;
|
|||
|
||||
local podWorkloadColumns = [
|
||||
'sum(kube_pod_owner{%(clusterLabel)s="$cluster"}) by (namespace)' % $._config,
|
||||
'count(avg(mixin_pod_workload{%(clusterLabel)s="$cluster"}) by (workload, namespace)) by (namespace)' % $._config,
|
||||
'count(avg(namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster"}) by (workload, namespace)) by (namespace)' % $._config,
|
||||
];
|
||||
|
||||
local networkColumns = [
|
||||
|
|
|
@ -78,7 +78,7 @@ local template = grafana.template;
|
|||
|
||||
local cpuUsageQuery = 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{%(clusterLabel)s="$cluster", namespace="$namespace"}) by (pod)' % $._config;
|
||||
|
||||
local memoryUsageQuery = 'sum(container_memory_working_set_bytes{%(clusterLabel)s="$cluster", namespace="$namespace", container!=""}) by (pod)' % $._config;
|
||||
local memoryUsageQuery = 'sum(container_memory_working_set_bytes{%(clusterLabel)s="$cluster", namespace="$namespace", container!="", image!=""}) by (pod)' % $._config;
|
||||
|
||||
local cpuQuotaRequestsQuery = 'scalar(kube_resourcequota{%(clusterLabel)s="$cluster", namespace="$namespace", type="hard",resource="requests.cpu"})' % $._config;
|
||||
local cpuQuotaLimitsQuery = std.strReplace(cpuQuotaRequestsQuery, 'requests.cpu', 'limits.cpu');
|
||||
|
@ -105,11 +105,11 @@ local template = grafana.template;
|
|||
)
|
||||
.addPanel(
|
||||
g.panel('Memory Utilization (from requests)') +
|
||||
g.statPanel('sum(container_memory_working_set_bytes{%(clusterLabel)s="$cluster", namespace="$namespace",container!=""}) / sum(kube_pod_container_resource_requests_memory_bytes{namespace="$namespace"})' % $._config)
|
||||
g.statPanel('sum(container_memory_working_set_bytes{%(clusterLabel)s="$cluster", namespace="$namespace",container!="", image!=""}) / sum(kube_pod_container_resource_requests_memory_bytes{namespace="$namespace"})' % $._config)
|
||||
)
|
||||
.addPanel(
|
||||
g.panel('Memory Utilisation (from limits)') +
|
||||
g.statPanel('sum(container_memory_working_set_bytes{%(clusterLabel)s="$cluster", namespace="$namespace",container!=""}) / sum(kube_pod_container_resource_limits_memory_bytes{namespace="$namespace"})' % $._config)
|
||||
g.statPanel('sum(container_memory_working_set_bytes{%(clusterLabel)s="$cluster", namespace="$namespace",container!="", image!=""}) / sum(kube_pod_container_resource_limits_memory_bytes{namespace="$namespace"})' % $._config)
|
||||
)
|
||||
)
|
||||
.addRow(
|
||||
|
@ -209,11 +209,11 @@ local template = grafana.template;
|
|||
.addPanel(
|
||||
g.panel('Memory Quota') +
|
||||
g.tablePanel([
|
||||
'sum(container_memory_working_set_bytes{%(clusterLabel)s="$cluster", namespace="$namespace",container!=""}) by (pod)' % $._config,
|
||||
'sum(container_memory_working_set_bytes{%(clusterLabel)s="$cluster", namespace="$namespace",container!="", image!=""}) by (pod)' % $._config,
|
||||
'sum(kube_pod_container_resource_requests_memory_bytes{%(clusterLabel)s="$cluster", namespace="$namespace"}) by (pod)' % $._config,
|
||||
'sum(container_memory_working_set_bytes{%(clusterLabel)s="$cluster", namespace="$namespace",container!=""}) by (pod) / sum(kube_pod_container_resource_requests_memory_bytes{namespace="$namespace"}) by (pod)' % $._config,
|
||||
'sum(container_memory_working_set_bytes{%(clusterLabel)s="$cluster", namespace="$namespace",container!="", image!=""}) by (pod) / sum(kube_pod_container_resource_requests_memory_bytes{namespace="$namespace"}) by (pod)' % $._config,
|
||||
'sum(kube_pod_container_resource_limits_memory_bytes{%(clusterLabel)s="$cluster", namespace="$namespace"}) by (pod)' % $._config,
|
||||
'sum(container_memory_working_set_bytes{%(clusterLabel)s="$cluster", namespace="$namespace",container!=""}) by (pod) / sum(kube_pod_container_resource_limits_memory_bytes{namespace="$namespace"}) by (pod)' % $._config,
|
||||
'sum(container_memory_working_set_bytes{%(clusterLabel)s="$cluster", namespace="$namespace",container!="", image!=""}) by (pod) / sum(kube_pod_container_resource_limits_memory_bytes{namespace="$namespace"}) by (pod)' % $._config,
|
||||
'sum(container_memory_rss{%(clusterLabel)s="$cluster", namespace="$namespace",container!=""}) by (pod)' % $._config,
|
||||
'sum(container_memory_cache{%(clusterLabel)s="$cluster", namespace="$namespace",container!=""}) by (pod)' % $._config,
|
||||
'sum(container_memory_swap{%(clusterLabel)s="$cluster", namespace="$namespace",container!=""}) by (pod)' % $._config,
|
||||
|
|
|
@ -103,7 +103,7 @@ local template = grafana.template;
|
|||
g.row('CPU Throttling')
|
||||
.addPanel(
|
||||
g.panel('CPU Throttling') +
|
||||
g.queryPanel('sum(increase(container_cpu_cfs_throttled_periods_total{namespace="$namespace", pod="$pod", container!="POD", %(clusterLabel)s="$cluster"}[5m])) by (container) /sum(increase(container_cpu_cfs_periods_total{namespace="$namespace", pod="$pod", container!="POD", %(clusterLabel)s="$cluster"}[5m])) by (container)' % $._config, '{{container}}') +
|
||||
g.queryPanel('sum(increase(container_cpu_cfs_throttled_periods_total{namespace="$namespace", pod="$pod", container!="POD", container!="", %(clusterLabel)s="$cluster"}[5m])) by (container) /sum(increase(container_cpu_cfs_periods_total{namespace="$namespace", pod="$pod", container!="POD", container!="", %(clusterLabel)s="$cluster"}[5m])) by (container)' % $._config, '{{container}}') +
|
||||
g.stack
|
||||
+ {
|
||||
yaxes: g.yaxes({ format: 'percentunit', max: 1 }),
|
||||
|
@ -148,7 +148,7 @@ local template = grafana.template;
|
|||
.addPanel(
|
||||
g.panel('Memory Usage') +
|
||||
g.queryPanel([
|
||||
'sum(container_memory_working_set_bytes{%(clusterLabel)s="$cluster", namespace="$namespace", pod="$pod", container!="POD", container!=""}) by (container)' % $._config,
|
||||
'sum(container_memory_working_set_bytes{%(clusterLabel)s="$cluster", namespace="$namespace", pod="$pod", container!="POD", container!="", image!=""}) by (container)' % $._config,
|
||||
memRequestsQuery,
|
||||
memLimitsQuery,
|
||||
], [
|
||||
|
@ -189,11 +189,11 @@ local template = grafana.template;
|
|||
.addPanel(
|
||||
g.panel('Memory Quota') +
|
||||
g.tablePanel([
|
||||
'sum(container_memory_working_set_bytes{%(clusterLabel)s="$cluster", namespace="$namespace", pod="$pod", container!="POD", container!=""}) by (container)' % $._config,
|
||||
'sum(container_memory_working_set_bytes{%(clusterLabel)s="$cluster", namespace="$namespace", pod="$pod", container!="POD", container!="", image!=""}) by (container)' % $._config,
|
||||
'sum(kube_pod_container_resource_requests_memory_bytes{%(clusterLabel)s="$cluster", namespace="$namespace", pod="$pod"}) by (container)' % $._config,
|
||||
'sum(container_memory_working_set_bytes{%(clusterLabel)s="$cluster", namespace="$namespace", pod="$pod"}) by (container) / sum(kube_pod_container_resource_requests_memory_bytes{namespace="$namespace", pod="$pod"}) by (container)' % $._config,
|
||||
'sum(container_memory_working_set_bytes{%(clusterLabel)s="$cluster", namespace="$namespace", pod="$pod", image!=""}) by (container) / sum(kube_pod_container_resource_requests_memory_bytes{namespace="$namespace", pod="$pod"}) by (container)' % $._config,
|
||||
'sum(kube_pod_container_resource_limits_memory_bytes{%(clusterLabel)s="$cluster", namespace="$namespace", pod="$pod", container!=""}) by (container)' % $._config,
|
||||
'sum(container_memory_working_set_bytes{%(clusterLabel)s="$cluster", namespace="$namespace", pod="$pod", container!=""}) by (container) / sum(kube_pod_container_resource_limits_memory_bytes{namespace="$namespace", pod="$pod"}) by (container)' % $._config,
|
||||
'sum(container_memory_working_set_bytes{%(clusterLabel)s="$cluster", namespace="$namespace", pod="$pod", container!="", image!=""}) by (container) / sum(kube_pod_container_resource_limits_memory_bytes{namespace="$namespace", pod="$pod"}) by (container)' % $._config,
|
||||
'sum(container_memory_rss{%(clusterLabel)s="$cluster", namespace="$namespace", pod="$pod", container != "", container != "POD"}) by (container)' % $._config,
|
||||
'sum(container_memory_cache{%(clusterLabel)s="$cluster", namespace="$namespace", pod="$pod", container != "", container != "POD"}) by (container)' % $._config,
|
||||
'sum(container_memory_swap{%(clusterLabel)s="$cluster", namespace="$namespace", pod="$pod", container != "", container != "POD"}) by (container)' % $._config,
|
||||
|
|
|
@ -8,7 +8,7 @@ local template = grafana.template;
|
|||
template.new(
|
||||
name='type',
|
||||
datasource='$datasource',
|
||||
query='label_values(mixin_pod_workload{namespace=~"$namespace", workload=~".+"}, workload_type)',
|
||||
query='label_values(namespace_workload_pod:kube_pod_owner:relabel{namespace=~"$namespace", workload=~".+"}, workload_type)',
|
||||
current='deployment',
|
||||
hide='',
|
||||
refresh=1,
|
||||
|
@ -18,7 +18,7 @@ local template = grafana.template;
|
|||
auto: false,
|
||||
auto_count: 30,
|
||||
auto_min: '10s',
|
||||
definition: 'label_values(mixin_pod_workload{namespace=~"$namespace", workload=~".+"}, workload_type)',
|
||||
definition: 'label_values(namespace_workload_pod:kube_pod_owner:relabel{namespace=~"$namespace", workload=~".+"}, workload_type)',
|
||||
skipUrlSync: false,
|
||||
},
|
||||
|
||||
|
@ -61,32 +61,32 @@ local template = grafana.template;
|
|||
|||
|
||||
(sum(irate(container_network_receive_bytes_total{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace"}[$__interval])
|
||||
* on (namespace,pod)
|
||||
group_left(workload,workload_type) mixin_pod_workload{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload_type="$type"}) by (workload))
|
||||
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload_type="$type"}) by (workload))
|
||||
||| % $._config,
|
||||
|||
|
||||
(sum(irate(container_network_transmit_bytes_total{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace"}[$__interval])
|
||||
* on (namespace,pod)
|
||||
group_left(workload,workload_type) mixin_pod_workload{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload_type="$type"}) by (workload))
|
||||
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload_type="$type"}) by (workload))
|
||||
||| % $._config,
|
||||
|||
|
||||
(sum(irate(container_network_receive_packets_total{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace"}[$__interval])
|
||||
* on (namespace,pod)
|
||||
group_left(workload,workload_type) mixin_pod_workload{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload_type="$type"}) by (workload))
|
||||
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload_type="$type"}) by (workload))
|
||||
||| % $._config,
|
||||
|||
|
||||
(sum(irate(container_network_transmit_packets_total{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace"}[$__interval])
|
||||
* on (namespace,pod)
|
||||
group_left(workload,workload_type) mixin_pod_workload{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload_type="$type"}) by (workload))
|
||||
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload_type="$type"}) by (workload))
|
||||
||| % $._config,
|
||||
|||
|
||||
(sum(irate(container_network_receive_packets_dropped_total{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace"}[$__interval])
|
||||
* on (namespace,pod)
|
||||
group_left(workload,workload_type) mixin_pod_workload{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload_type="$type"}) by (workload))
|
||||
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload_type="$type"}) by (workload))
|
||||
||| % $._config,
|
||||
|||
|
||||
(sum(irate(container_network_transmit_packets_dropped_total{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace"}[$__interval])
|
||||
* on (namespace,pod)
|
||||
group_left(workload,workload_type) mixin_pod_workload{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload_type="$type"}) by (workload))
|
||||
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload_type="$type"}) by (workload))
|
||||
||| % $._config,
|
||||
];
|
||||
|
||||
|
@ -129,7 +129,7 @@ local template = grafana.template;
|
|||
sum(
|
||||
node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{%(clusterLabel)s="$cluster", namespace="$namespace"}
|
||||
* on(namespace,pod)
|
||||
group_left(workload, workload_type) mixin_pod_workload{%(clusterLabel)s="$cluster", namespace="$namespace", workload_type="$type"}
|
||||
group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", namespace="$namespace", workload_type="$type"}
|
||||
) by (workload, workload_type)
|
||||
||| % $._config;
|
||||
|
||||
|
@ -137,18 +137,18 @@ local template = grafana.template;
|
|||
sum(
|
||||
kube_pod_container_resource_requests_cpu_cores{%(clusterLabel)s="$cluster", namespace="$namespace"}
|
||||
* on(namespace,pod)
|
||||
group_left(workload, workload_type) mixin_pod_workload{%(clusterLabel)s="$cluster", namespace="$namespace", workload_type="$type"}
|
||||
group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", namespace="$namespace", workload_type="$type"}
|
||||
) by (workload, workload_type)
|
||||
||| % $._config;
|
||||
|
||||
local podCountQuery = 'count(mixin_pod_workload{%(clusterLabel)s="$cluster", namespace="$namespace", workload_type="$type"}) by (workload, workload_type)' % $._config;
|
||||
local podCountQuery = 'count(namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", namespace="$namespace", workload_type="$type"}) by (workload, workload_type)' % $._config;
|
||||
local cpuLimitsQuery = std.strReplace(cpuRequestsQuery, 'requests', 'limits');
|
||||
|
||||
local memUsageQuery = |||
|
||||
sum(
|
||||
container_memory_working_set_bytes{%(clusterLabel)s="$cluster", namespace="$namespace", container!=""}
|
||||
container_memory_working_set_bytes{%(clusterLabel)s="$cluster", namespace="$namespace", container!="", image!=""}
|
||||
* on(namespace,pod)
|
||||
group_left(workload, workload_type) mixin_pod_workload{%(clusterLabel)s="$cluster", namespace="$namespace", workload_type="$type"}
|
||||
group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", namespace="$namespace", workload_type="$type"}
|
||||
) by (workload, workload_type)
|
||||
||| % $._config;
|
||||
local memRequestsQuery = std.strReplace(cpuRequestsQuery, 'cpu_cores', 'memory_bytes');
|
||||
|
@ -287,7 +287,7 @@ local template = grafana.template;
|
|||
g.queryPanel(|||
|
||||
(sum(irate(container_network_receive_bytes_total{%(clusterLabel)s="$cluster", namespace=~"$namespace"}[$__interval])
|
||||
* on (namespace,pod)
|
||||
group_left(workload,workload_type) mixin_pod_workload{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
|
||||
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
|
||||
||| % $._config, '{{workload}}') +
|
||||
g.stack +
|
||||
{ yaxes: g.yaxes('Bps') },
|
||||
|
@ -300,7 +300,7 @@ local template = grafana.template;
|
|||
g.queryPanel(|||
|
||||
(sum(irate(container_network_transmit_bytes_total{%(clusterLabel)s="$cluster", namespace=~"$namespace"}[$__interval])
|
||||
* on (namespace,pod)
|
||||
group_left(workload,workload_type) mixin_pod_workload{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
|
||||
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
|
||||
||| % $._config, '{{workload}}') +
|
||||
g.stack +
|
||||
{ yaxes: g.yaxes('Bps') },
|
||||
|
@ -313,7 +313,7 @@ local template = grafana.template;
|
|||
g.queryPanel(|||
|
||||
(avg(irate(container_network_receive_bytes_total{%(clusterLabel)s="$cluster", namespace=~"$namespace"}[$__interval])
|
||||
* on (namespace,pod)
|
||||
group_left(workload,workload_type) mixin_pod_workload{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
|
||||
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
|
||||
||| % $._config, '{{workload}}') +
|
||||
g.stack +
|
||||
{ yaxes: g.yaxes('Bps') },
|
||||
|
@ -326,7 +326,7 @@ local template = grafana.template;
|
|||
g.queryPanel(|||
|
||||
(avg(irate(container_network_transmit_bytes_total{%(clusterLabel)s="$cluster", namespace=~"$namespace"}[$__interval])
|
||||
* on (namespace,pod)
|
||||
group_left(workload,workload_type) mixin_pod_workload{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
|
||||
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
|
||||
||| % $._config, '{{workload}}') +
|
||||
g.stack +
|
||||
{ yaxes: g.yaxes('Bps') },
|
||||
|
@ -339,7 +339,7 @@ local template = grafana.template;
|
|||
g.queryPanel(|||
|
||||
(sum(irate(container_network_receive_packets_total{%(clusterLabel)s="$cluster", namespace=~"$namespace"}[$__interval])
|
||||
* on (namespace,pod)
|
||||
group_left(workload,workload_type) mixin_pod_workload{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
|
||||
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
|
||||
||| % $._config, '{{workload}}') +
|
||||
g.stack +
|
||||
{ yaxes: g.yaxes('Bps') },
|
||||
|
@ -352,7 +352,7 @@ local template = grafana.template;
|
|||
g.queryPanel(|||
|
||||
(sum(irate(container_network_transmit_packets_total{%(clusterLabel)s="$cluster", namespace=~"$namespace"}[$__interval])
|
||||
* on (namespace,pod)
|
||||
group_left(workload,workload_type) mixin_pod_workload{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
|
||||
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
|
||||
||| % $._config, '{{workload}}') +
|
||||
g.stack +
|
||||
{ yaxes: g.yaxes('Bps') },
|
||||
|
@ -365,7 +365,7 @@ local template = grafana.template;
|
|||
g.queryPanel(|||
|
||||
(sum(irate(container_network_receive_packets_dropped_total{%(clusterLabel)s="$cluster", namespace=~"$namespace"}[$__interval])
|
||||
* on (namespace,pod)
|
||||
group_left(workload,workload_type) mixin_pod_workload{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
|
||||
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
|
||||
||| % $._config, '{{workload}}') +
|
||||
g.stack +
|
||||
{ yaxes: g.yaxes('Bps') },
|
||||
|
@ -378,7 +378,7 @@ local template = grafana.template;
|
|||
g.queryPanel(|||
|
||||
(sum(irate(container_network_transmit_packets_dropped_total{%(clusterLabel)s="$cluster", namespace=~"$namespace"}[$__interval])
|
||||
* on (namespace,pod)
|
||||
group_left(workload,workload_type) mixin_pod_workload{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
|
||||
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
|
||||
||| % $._config, '{{workload}}') +
|
||||
g.stack +
|
||||
{ yaxes: g.yaxes('Bps') },
|
||||
|
|
|
@ -32,7 +32,7 @@ local template = grafana.template;
|
|||
template.new(
|
||||
name='workload',
|
||||
datasource='$datasource',
|
||||
query='label_values(mixin_pod_workload{%(clusterLabel)s="$cluster", namespace="$namespace"}, workload)' % $._config.clusterLabel,
|
||||
query='label_values(namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", namespace="$namespace"}, workload)' % $._config.clusterLabel,
|
||||
current='',
|
||||
hide='',
|
||||
refresh=1,
|
||||
|
@ -44,7 +44,7 @@ local template = grafana.template;
|
|||
template.new(
|
||||
name='type',
|
||||
datasource='$datasource',
|
||||
query='label_values(mixin_pod_workload{%(clusterLabel)s="$cluster", namespace="$namespace", workload="$workload"}, workload_type)' % $._config.clusterLabel,
|
||||
query='label_values(namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", namespace="$namespace", workload="$workload"}, workload_type)' % $._config.clusterLabel,
|
||||
current='',
|
||||
hide='',
|
||||
refresh=1,
|
||||
|
@ -63,32 +63,32 @@ local template = grafana.template;
|
|||
|||
|
||||
(sum(irate(container_network_receive_bytes_total{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace"}[$__interval])
|
||||
* on (namespace,pod)
|
||||
group_left(workload,workload_type) mixin_pod_workload{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
|
||||
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
|
||||
||| % $._config,
|
||||
|||
|
||||
(sum(irate(container_network_transmit_bytes_total{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace"}[$__interval])
|
||||
* on (namespace,pod)
|
||||
group_left(workload,workload_type) mixin_pod_workload{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
|
||||
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
|
||||
||| % $._config,
|
||||
|||
|
||||
(sum(irate(container_network_receive_packets_total{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace"}[$__interval])
|
||||
* on (namespace,pod)
|
||||
group_left(workload,workload_type) mixin_pod_workload{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
|
||||
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
|
||||
||| % $._config,
|
||||
|||
|
||||
(sum(irate(container_network_transmit_packets_total{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace"}[$__interval])
|
||||
* on (namespace,pod)
|
||||
group_left(workload,workload_type) mixin_pod_workload{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
|
||||
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
|
||||
||| % $._config,
|
||||
|||
|
||||
(sum(irate(container_network_receive_packets_dropped_total{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace"}[$__interval])
|
||||
* on (namespace,pod)
|
||||
group_left(workload,workload_type) mixin_pod_workload{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
|
||||
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
|
||||
||| % $._config,
|
||||
|||
|
||||
(sum(irate(container_network_transmit_packets_dropped_total{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace"}[$__interval])
|
||||
* on (namespace,pod)
|
||||
group_left(workload,workload_type) mixin_pod_workload{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
|
||||
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
|
||||
||| % $._config,
|
||||
];
|
||||
|
||||
|
@ -128,7 +128,7 @@ local template = grafana.template;
|
|||
sum(
|
||||
node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{%(clusterLabel)s="$cluster", namespace="$namespace"}
|
||||
* on(namespace,pod)
|
||||
group_left(workload, workload_type) mixin_pod_workload{%(clusterLabel)s="$cluster", namespace="$namespace", workload="$workload", workload_type="$type"}
|
||||
group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", namespace="$namespace", workload="$workload", workload_type="$type"}
|
||||
) by (pod)
|
||||
||| % $._config;
|
||||
|
||||
|
@ -136,7 +136,7 @@ local template = grafana.template;
|
|||
sum(
|
||||
kube_pod_container_resource_requests_cpu_cores{%(clusterLabel)s="$cluster", namespace="$namespace"}
|
||||
* on(namespace,pod)
|
||||
group_left(workload, workload_type) mixin_pod_workload{%(clusterLabel)s="$cluster", namespace="$namespace", workload="$workload", workload_type="$type"}
|
||||
group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", namespace="$namespace", workload="$workload", workload_type="$type"}
|
||||
) by (pod)
|
||||
||| % $._config;
|
||||
|
||||
|
@ -144,9 +144,9 @@ local template = grafana.template;
|
|||
|
||||
local memUsageQuery = |||
|
||||
sum(
|
||||
container_memory_working_set_bytes{%(clusterLabel)s="$cluster", namespace="$namespace", container!=""}
|
||||
container_memory_working_set_bytes{%(clusterLabel)s="$cluster", namespace="$namespace", container!="", image!=""}
|
||||
* on(namespace,pod)
|
||||
group_left(workload, workload_type) mixin_pod_workload{%(clusterLabel)s="$cluster", namespace="$namespace", workload="$workload", workload_type="$type"}
|
||||
group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", namespace="$namespace", workload="$workload", workload_type="$type"}
|
||||
) by (pod)
|
||||
||| % $._config;
|
||||
local memRequestsQuery = std.strReplace(cpuRequestsQuery, 'cpu_cores', 'memory_bytes');
|
||||
|
@ -229,7 +229,7 @@ local template = grafana.template;
|
|||
g.queryPanel(|||
|
||||
(sum(irate(container_network_receive_bytes_total{%(clusterLabel)s="$cluster", namespace=~"$namespace"}[$__interval])
|
||||
* on (namespace,pod)
|
||||
group_left(workload,workload_type) mixin_pod_workload{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
|
||||
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
|
||||
||| % $._config, '{{pod}}') +
|
||||
g.stack +
|
||||
{ yaxes: g.yaxes('Bps') },
|
||||
|
@ -242,7 +242,7 @@ local template = grafana.template;
|
|||
g.queryPanel(|||
|
||||
(sum(irate(container_network_transmit_bytes_total{%(clusterLabel)s="$cluster", namespace=~"$namespace"}[$__interval])
|
||||
* on (namespace,pod)
|
||||
group_left(workload,workload_type) mixin_pod_workload{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
|
||||
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
|
||||
||| % $._config, '{{pod}}') +
|
||||
g.stack +
|
||||
{ yaxes: g.yaxes('Bps') },
|
||||
|
@ -255,7 +255,7 @@ local template = grafana.template;
|
|||
g.queryPanel(|||
|
||||
(avg(irate(container_network_receive_bytes_total{%(clusterLabel)s="$cluster", namespace=~"$namespace"}[$__interval])
|
||||
* on (namespace,pod)
|
||||
group_left(workload,workload_type) mixin_pod_workload{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
|
||||
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
|
||||
||| % $._config, '{{pod}}') +
|
||||
g.stack +
|
||||
{ yaxes: g.yaxes('Bps') },
|
||||
|
@ -268,7 +268,7 @@ local template = grafana.template;
|
|||
g.queryPanel(|||
|
||||
(avg(irate(container_network_transmit_bytes_total{%(clusterLabel)s="$cluster", namespace=~"$namespace"}[$__interval])
|
||||
* on (namespace,pod)
|
||||
group_left(workload,workload_type) mixin_pod_workload{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
|
||||
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
|
||||
||| % $._config, '{{pod}}') +
|
||||
g.stack +
|
||||
{ yaxes: g.yaxes('Bps') },
|
||||
|
@ -281,7 +281,7 @@ local template = grafana.template;
|
|||
g.queryPanel(|||
|
||||
(sum(irate(container_network_receive_packets_total{%(clusterLabel)s="$cluster", namespace=~"$namespace"}[$__interval])
|
||||
* on (namespace,pod)
|
||||
group_left(workload,workload_type) mixin_pod_workload{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
|
||||
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
|
||||
||| % $._config, '{{pod}}') +
|
||||
g.stack +
|
||||
{ yaxes: g.yaxes('Bps') },
|
||||
|
@ -294,7 +294,7 @@ local template = grafana.template;
|
|||
g.queryPanel(|||
|
||||
(sum(irate(container_network_transmit_packets_total{%(clusterLabel)s="$cluster", namespace=~"$namespace"}[$__interval])
|
||||
* on (namespace,pod)
|
||||
group_left(workload,workload_type) mixin_pod_workload{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
|
||||
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
|
||||
||| % $._config, '{{pod}}') +
|
||||
g.stack +
|
||||
{ yaxes: g.yaxes('Bps') },
|
||||
|
@ -307,7 +307,7 @@ local template = grafana.template;
|
|||
g.queryPanel(|||
|
||||
(sum(irate(container_network_receive_packets_dropped_total{%(clusterLabel)s="$cluster", namespace=~"$namespace"}[$__interval])
|
||||
* on (namespace,pod)
|
||||
group_left(workload,workload_type) mixin_pod_workload{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
|
||||
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
|
||||
||| % $._config, '{{pod}}') +
|
||||
g.stack +
|
||||
{ yaxes: g.yaxes('Bps') },
|
||||
|
@ -320,7 +320,7 @@ local template = grafana.template;
|
|||
g.queryPanel(|||
|
||||
(sum(irate(container_network_transmit_packets_dropped_total{%(clusterLabel)s="$cluster", namespace=~"$namespace"}[$__interval])
|
||||
* on (namespace,pod)
|
||||
group_left(workload,workload_type) mixin_pod_workload{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
|
||||
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
|
||||
||| % $._config, '{{pod}}') +
|
||||
g.stack +
|
||||
{ yaxes: g.yaxes('Bps') },
|
||||
|
|
|
@ -76,7 +76,7 @@ local singlestat = grafana.singlestat;
|
|||
format='s',
|
||||
min=0,
|
||||
)
|
||||
.addTarget(prometheus.target('histogram_quantile(0.99, sum(rate(rest_client_request_latency_seconds_bucket{%(kubeSchedulerSelector)s, instance=~"$instance", verb="POST"}[5m])) by (verb, url, le))' % $._config, legendFormat='{{verb}} {{url}}'));
|
||||
.addTarget(prometheus.target('histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{%(kubeSchedulerSelector)s, instance=~"$instance", verb="POST"}[5m])) by (verb, url, le))' % $._config, legendFormat='{{verb}} {{url}}'));
|
||||
|
||||
local getRequestLatency =
|
||||
graphPanel.new(
|
||||
|
@ -91,7 +91,7 @@ local singlestat = grafana.singlestat;
|
|||
legend_alignAsTable=true,
|
||||
legend_rightSide=true,
|
||||
)
|
||||
.addTarget(prometheus.target('histogram_quantile(0.99, sum(rate(rest_client_request_latency_seconds_bucket{%(kubeSchedulerSelector)s, instance=~"$instance", verb="GET"}[5m])) by (verb, url, le))' % $._config, legendFormat='{{verb}} {{url}}'));
|
||||
.addTarget(prometheus.target('histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{%(kubeSchedulerSelector)s, instance=~"$instance", verb="GET"}[5m])) by (verb, url, le))' % $._config, legendFormat='{{verb}} {{url}}'));
|
||||
|
||||
local memory =
|
||||
graphPanel.new(
|
||||
|
|
|
@ -100,7 +100,7 @@
|
|||
},
|
||||
// workload aggregation for deployments
|
||||
{
|
||||
record: 'mixin_pod_workload',
|
||||
record: 'namespace_workload_pod:kube_pod_owner:relabel',
|
||||
expr: |||
|
||||
max by (%(clusterLabel)s, namespace, workload, pod) (
|
||||
label_replace(
|
||||
|
@ -121,7 +121,7 @@
|
|||
},
|
||||
},
|
||||
{
|
||||
record: 'mixin_pod_workload',
|
||||
record: 'namespace_workload_pod:kube_pod_owner:relabel',
|
||||
expr: |||
|
||||
max by (%(clusterLabel)s, namespace, workload, pod) (
|
||||
label_replace(
|
||||
|
@ -135,7 +135,7 @@
|
|||
},
|
||||
},
|
||||
{
|
||||
record: 'mixin_pod_workload',
|
||||
record: 'namespace_workload_pod:kube_pod_owner:relabel',
|
||||
expr: |||
|
||||
max by (%(clusterLabel)s, namespace, workload, pod) (
|
||||
label_replace(
|
||||
|
|
|
@ -28,8 +28,14 @@
|
|||
sum(rate(apiserver_request_duration_seconds_count{%(kubeApiserverSelector)s,%(kubeApiserverReadSelector)s}[%(window)s]))
|
||||
-
|
||||
(
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{%(kubeApiserverSelector)s,%(kubeApiserverReadSelector)s,scope=~"resource|",le="0.1"}[%(window)s])) +
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{%(kubeApiserverSelector)s,%(kubeApiserverReadSelector)s,scope="namespace",le="0.5"}[%(window)s])) +
|
||||
(
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{%(kubeApiserverSelector)s,%(kubeApiserverReadSelector)s,scope=~"resource|",le="0.1"}[%(window)s]))
|
||||
or
|
||||
vector(0)
|
||||
)
|
||||
+
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{%(kubeApiserverSelector)s,%(kubeApiserverReadSelector)s,scope="namespace",le="0.5"}[%(window)s]))
|
||||
+
|
||||
sum(rate(apiserver_request_duration_seconds_bucket{%(kubeApiserverSelector)s,%(kubeApiserverReadSelector)s,scope="cluster",le="5"}[%(window)s]))
|
||||
)
|
||||
)
|
||||
|
@ -151,8 +157,14 @@
|
|||
sum(increase(apiserver_request_duration_seconds_count{%(kubeApiserverReadSelector)s}[%(SLODays)s]))
|
||||
-
|
||||
(
|
||||
sum(increase(apiserver_request_duration_seconds_bucket{%(kubeApiserverReadSelector)s,scope=~"resource|",le="0.1"}[%(SLODays)s])) +
|
||||
sum(increase(apiserver_request_duration_seconds_bucket{%(kubeApiserverReadSelector)s,scope="namespace",le="0.5"}[%(SLODays)s])) +
|
||||
(
|
||||
sum(increase(apiserver_request_duration_seconds_bucket{%(kubeApiserverReadSelector)s,scope=~"resource|",le="0.1"}[%(SLODays)s]))
|
||||
or
|
||||
vector(0)
|
||||
)
|
||||
+
|
||||
sum(increase(apiserver_request_duration_seconds_bucket{%(kubeApiserverReadSelector)s,scope="namespace",le="0.5"}[%(SLODays)s]))
|
||||
+
|
||||
sum(increase(apiserver_request_duration_seconds_bucket{%(kubeApiserverReadSelector)s,scope="cluster",le="5"}[%(SLODays)s]))
|
||||
)
|
||||
) +
|
||||
|
@ -174,8 +186,14 @@
|
|||
-
|
||||
(
|
||||
# too slow
|
||||
sum(increase(apiserver_request_duration_seconds_bucket{%(kubeApiserverSelector)s,%(kubeApiserverReadSelector)s,scope=~"resource|",le="0.1"}[%(SLODays)s])) +
|
||||
sum(increase(apiserver_request_duration_seconds_bucket{%(kubeApiserverSelector)s,%(kubeApiserverReadSelector)s,scope="namespace",le="0.5"}[%(SLODays)s])) +
|
||||
(
|
||||
sum(increase(apiserver_request_duration_seconds_bucket{%(kubeApiserverSelector)s,%(kubeApiserverReadSelector)s,scope=~"resource|",le="0.1"}[%(SLODays)s]))
|
||||
or
|
||||
vector(0)
|
||||
)
|
||||
+
|
||||
sum(increase(apiserver_request_duration_seconds_bucket{%(kubeApiserverSelector)s,%(kubeApiserverReadSelector)s,scope="namespace",le="0.5"}[%(SLODays)s]))
|
||||
+
|
||||
sum(increase(apiserver_request_duration_seconds_bucket{%(kubeApiserverSelector)s,%(kubeApiserverReadSelector)s,scope="cluster",le="5"}[%(SLODays)s]))
|
||||
)
|
||||
+
|
||||
|
|
|
@ -10,7 +10,7 @@ local template = grafana.template;
|
|||
{
|
||||
grafanaDashboards+:: {
|
||||
'prometheus.json':
|
||||
g.dashboard('Prometheus')
|
||||
g.dashboard('Prometheus Overview')
|
||||
.addMultiTemplate('job', 'prometheus_build_info', 'job')
|
||||
.addMultiTemplate('instance', 'prometheus_build_info', 'instance')
|
||||
.addRow(
|
||||
|
|
Reference in New Issue