update monitoring
continuous-integration/drone/push Build is passing Details

This commit is contained in:
Tobias Brunner 2020-07-28 21:39:59 +02:00
parent db6e205ae7
commit b684265f0d
52 changed files with 1020 additions and 798 deletions

View File

@ -8,8 +8,8 @@
"subdir": "grafana"
}
},
"version": "57b4365eacda291b82e0d55ba7eec573a8198dda",
"sum": "92DWADwGjnCfpZaL7Q07C0GZayxBziGla/O03qWea34="
"version": "014301fd5f71d8305a395b2fb437089a7b1a3999",
"sum": "RHtpk2c0CcliWyt6F4DIgwpi4cEfHADK7nAxIw6RTGs="
},
{
"source": {
@ -18,8 +18,8 @@
"subdir": "Documentation/etcd-mixin"
}
},
"version": "7f726db202a4285597c7076fee156e8b2737928f",
"sum": "pk7mLpdUrHuJKkj2vhD6LGMU7P+oYYooBXAeZyZa398="
"version": "9006d8d4f9d82f6cce6eb93d6f2dfe7c154fa05d",
"sum": "Uv8ysXlEACF7BafoCkHnrBmJ2AHh/VldI5mm3BuMiy0="
},
{
"source": {
@ -28,8 +28,8 @@
"subdir": "jsonnet/kube-prometheus"
}
},
"version": "2c1fc1cc11547ca06a143fce6e430e4f7e0be294",
"sum": "Srp/B6oh85sEpjZxWOVyoBciNn6oA1SkjgLX4hUxsIE="
"version": "6771c9bcc287e8047510207a4ab60fa5e63e48fe",
"sum": "52ukcsyazUhdJWb48PPGQQurdFrGE0xgKYE++yWO7aI="
},
{
"source": {
@ -38,7 +38,7 @@
"subdir": "jsonnet/prometheus-operator"
}
},
"version": "e31c69f9b5c6555e0f4a5c1f39d0f03182dd6b41",
"version": "0dca0f21ffff72a063db8855b5d515e15ab0dccb",
"sum": "WggWVWZ+CBEUThQCztSaRELbtqdXf9s3OFzf06HbYNA="
},
{
@ -48,8 +48,8 @@
"subdir": "grafonnet"
}
},
"version": "5c6e8a8113486cdecd0961730aeaada3e6c69fe7",
"sum": "tDuuSKE9f4Ew2bjBM33Rs6behLEAzkmKkShSt+jpAak="
"version": "ad85aec356b4544a41f62ac8c32f8042c0ffc42e",
"sum": "JHhSwlCa9A+AwG4o+YEXXFDbQ91iwwd9G/FoYnGhObw="
},
{
"source": {
@ -58,7 +58,7 @@
"subdir": "grafana-builder"
}
},
"version": "21b638f4e4922c0b6fde12120ed45d8ef803edc7",
"version": "f2a35172b97a0c944c4a167bb1f6e688624602e4",
"sum": "N65Fv0M2JvFE3GN8ZxP5xh1U5a314ey8geLAioJLzF8="
},
{
@ -79,8 +79,8 @@
"subdir": ""
}
},
"version": "fba82a1c0bc225127b084e91bd142c99b1792cb6",
"sum": "hJ5n6OeumIpKYuZQHwxL/rtpAJaW/qTFE9oOA8RWd7w="
"version": "6eab5fe2dfde77494c0452ce7ec3ed3ff21d9631",
"sum": "skD7Rm0m5lOQOn8IrnGEdJyhWUI7qsKPXwcci7Hjn0E="
},
{
"source": {
@ -89,7 +89,7 @@
"subdir": "lib/promgrafonnet"
}
},
"version": "fba82a1c0bc225127b084e91bd142c99b1792cb6",
"version": "6eab5fe2dfde77494c0452ce7ec3ed3ff21d9631",
"sum": "VhgBM39yv0f4bKv8VfGg4FXkg573evGDRalip9ypKbc="
},
{
@ -99,7 +99,7 @@
"subdir": "jsonnet/kube-state-metrics"
}
},
"version": "d667979ed55ad1c4db44d331b51d646f5b903aa7",
"version": "e43aaa6d6e3554d050ead73b4814566b771377d1",
"sum": "cJjGZaLBjcIGrLHZLjRPU9c3KL+ep9rZTb9dbALSKqA="
},
{
@ -109,7 +109,7 @@
"subdir": "jsonnet/kube-state-metrics-mixin"
}
},
"version": "d667979ed55ad1c4db44d331b51d646f5b903aa7",
"version": "e43aaa6d6e3554d050ead73b4814566b771377d1",
"sum": "o5avaguRsfFwYFNen00ZEsub1x4i8Z/ZZ2QoEjFMff8="
},
{
@ -119,7 +119,7 @@
"subdir": "docs/node-mixin"
}
},
"version": "7ad86f7994d6f1f290a28fcc2e16e54193e5ab9e",
"version": "503e4fc8486c0082d6bd8c53fad646bcfafeedf6",
"sum": "3jFV2qsc/GZe2GADswTYqxxP2zGOiANTj73W/VNFGqc="
},
{
@ -129,8 +129,8 @@
"subdir": "documentation/prometheus-mixin"
}
},
"version": "1861bf38f588b288bc66196aba1c2516f97aa90c",
"sum": "lEzhZ8gllSfAO4kmXeTwl4W0anapIeFd5GCaCNuDe18=",
"version": "348ff4285ffa59907c9d7fd7eb3cb7d748f42758",
"sum": "TBq4SL7YsPInARbJqwz25JaBvvAegcnRCsuz3K9niWc=",
"name": "prometheus"
},
{

View File

@ -7,7 +7,7 @@ metadata:
namespace: monitoring
spec:
configSecret: alertmanager-tbrnt-config
image: quay.io/prometheus/alertmanager:v0.20.0
image: quay.io/prometheus/alertmanager:v0.21.0
nodeSelector:
kubernetes.io/os: linux
replicas: 1
@ -16,4 +16,4 @@ spec:
runAsNonRoot: true
runAsUser: 1000
serviceAccountName: alertmanager-main
version: v0.20.0
version: v0.21.0

View File

@ -6,7 +6,7 @@ metadata:
namespace: monitoring
spec:
encryptedData:
alertmanager.yaml: AgAt4qHTMHgNlti5rXewe3FaQhwDdM9SuW2ZDqKWE+Pnwr4lx+tl/NDzt+UE7xaJWQJDr7pSMo787g5sbzy5v1OxYG6cOcaNK/n8h4SWXim+S91fwvgVaemZ4l6nB1IJdkctLnqOz9Q1/MtAdVzWMJxk3sxoQFounnWZGfzmVybRhdeg93ionzFHK7lpRVZqRBrQRUnbILqEKYBQKfpnj0HadjHggSBAIDIp0NiNidmEHpeDy3dCjjEujJNwvZtUQoepm7bQ5bsFrcC/HUiqgDCRgo/SGZknaCtZaydXV2IgmyNjI5P/IfwUWPOlhA52ebs2dEDswfzj2OOq7JAmlmzorTT7joO+R1OfwFTvHcdjar/fbTm0ZJg1vbx0asxbL5BsXwwTtQVyr7oX0k25xeKe81folZSwUEy+Is5LhoU3VxUlX/cakYeRgPrwEHjX9CB/xs5/zZUoGeF2GIq6JsrFh4P0ZwPclB2HkXzE9pcjekgTi+cZzZ0MJq97phN7etIlq3HvuBgF+xb6zyZ+vD3W/5HPksUrJQX/2+U1RO+AzZ8pwMgTI7bXYiVuGjfpm0s8yoeBb9e6FkDGwwsop4tiAfrxmanF41sdW9og/YPqIRGFYKyI2hASxWbREKgBwKzUgU8DhoF2YJtL47UV+qq+/q0E3okraUIMwdoERbORA6+YgQaACvefq5PbUrLFVsjsmgWWW3wYL78pt/IZ6DEvD9GvJ0YjGIyEqBrfnPqnZeHP4GzMjn2leXy8SfygmfZ4rd6dPDq6nznOwTRQ4wPzG945rKX8A6vaDm8x57gZm1uVQptALNNCd7T6QaDhj9ZUPcJBhi4YHJIEVdlk/Fuy7asUNOhnL2pCv60QlApZMKIemrD2kyAs6GXipfvjr4uWbs+WjIdGdRdrSErAwglwDULasyHQroLz6e17Z4NMM2HoTdw/cOM1C/3VYqWmLx67oo3BykL8mcQGVdTyDkr2M5631UqR22C9AOjpTqCLeGSo9vmXpudgdrOEv3ZNqBraIhz1nF//MFun6fkTCQENsrcO5pr/yl2hbPRAf5TIh7mFguEblOfCSPgLE8ELYSCQzfb8PTexFq9oKnhyr8mct0fgrp/JXFduLkj5ue6FCKe6H13GbW/igQ02BYZKJVLOr8anF1GtwOpzXZAgy6RZbZc6a1M97i21m1IhHbNiy8Xc5IFOEqg822wU1EV21OdBTL+eDAldVi/NTmRfemOta/m9khzKE6xYFSWuq/ulXRrojtashIqjqCPWxUPt+VaAjispmWT38mo+Bw2A7fxa787MfC7nfCOXpr8x/5xJMBROJSBA//HLRQCdgdXZgEeIWfh/tbdEDbofv53fk4oKZPKiUH+jNiEGZot+Qf4nz2NLCdymTysehiHYCYyDheztXHv898zkhOdM9Oqsk4Pfw7BpWd+LheiRmhqyPBnhvkWz5uHSZ8rwx7ixVXQaWntLtsLILDwlXXZXGhX/JjIGzKBgYe/Gxax8ce5B7NL53HZhmZAJ1PEzXeNCAfRlHRRwejy4S/SF5hu9LiZnPjjYsJuga7XDmdq28k7lC5580s8WfCyHVmTQIsEgSk79H/w0LHYPz/KoXz3jy19Uryzue1RKa1EPeXV2Uw6p06IZiIHOPGidpTcgoTGXCj1oELzhuQ3S4HfvPGhdufWARdcpVNs6WV5ms3EO9vTyeSftNulAsQ==
alertmanager.yaml: AgBzDdfETjZXZ9hJSITSeLkdcoszO9O9tbf2srOAwr0VdE2+4PDinymJoNwojY0xWegaxz+7p383egqxJGgK1kirs4RINgLJEsGeOG46yQk656s8kF00/bV3u44C6MCiKs7bHF117Jg3eL3C7GjeoE+Dv4YHjehfUscF+OS6Ld96WPBIN5t1mxox7qd+ysw0KA2neYjPyYyfHw8y00160yopQizChGCBu2oG7RJPhctthcnnGSxskcVtxOdYhBKwwdEJ1SaPFNCGWs6wAy/NwtAJCcEQ2xBizbNA3GsXzK3zkUtWu7/a8ikjKDruwbb2DkjGxqRTJt3LPpntXPxcVB2HTysgZ02+InrZ7EmMCXW1HosXKQdztEYeaN2Ijtalx/+AHhv1iKA0pvEQj76rPzgXWfV+MkCUlBW+VIZiU8Kg8kqZQqLkfODGALwckPymR7kPzgUqbI2IFiN+6tAbyOOLlttoBtIcNsZzLcKmO59gke6bob0T2ZlIhRpUbZlz0l3kUNoQtn4jnriaFinyeKDcGyBhI/bKl01fEokSXkkRlKibyflGdYXLfTCURZeGn3KAxLwfILoojww8zfUfJClAXoleg01qeWuAfIidVy5KJBTfenHcgDq/o5/wP5bZjhQbT5lyUJoFdEn39j0kxusEAgvGNRrI287zj0E61z9bHK9Z7Rkve6l12awY90CxxmcW6KvNF88iejxTWtD89ti/U4FGHTbRpjXrS2YMk8Shv6QMHybRkk3gqxardmVycKW9DfUGrRLWnN58yw16EHaM828/Hr0+l87Ax73PfslE2SEjHPzdKGuhW24WSTXCHlPAwJv+NIx9+uWAcf8edaCfyB/uWgCNNNNszgNzyyi7EGrSuOmKguMIfqzs++KSXWSkrKnCxtco1Sscr60xtkb/gjHc0rn2hHJfU6bmxCP5naat+5xp1yVINb8gfG+ez+BeZdNBGBxbB32BNH7UtzBtwJGnBtDP5Y16q8Tq42nnFBsKy5lr/uwA0ESg7FxuuufcQ0f+jhkfxNBlUc7lxeLaJpeZJsfwlc6ns6rBTrPD0kGmdiusZTqzTd6kuibEk5/RdLnl6kme6OAxVgnU1bIs5l/j5ebmmvM6AbDXTqtuL6zY04M1QDlLJVldS3oS/fqsvAHe5u/W0Fgk5XN8naFpM0r7tfVzspNqyMch2ZrfFfQOO20QNrpzaln0/kknXcdu/pONxSyvaoVkdOgn6lJLOdpWtUAffSPWd3X+uOUB1KHEp1kCKJbXYNpy1qwUoHaLuX0S57MF/Syyx2UAPU5o8WEqgI1vlY6mo4OPjW6Yet1pKoI3CDgLhTf32a+nnxXfad/zR0yLp+ayw/R95VpdS/t0+FuWcgbFb2Jc16WSFTHXaBM0dUxOPBO/6uWfwLGd0eOHkFE2qniqAdkdki3oSQKHRCeI6De9ZGCEOsV4+z9nNJCGGY1wE/ICuD4vOCo9D3iDeNoxr1S8lqs4l2ssxuCOuyodfP6lAxWhznIT4qDbj4oIR/gi+3m5uweG6V2YDFqaLBmmn6PlOCJLc5dirJZDhED9vtg7gAMFRhvhp+DADfyRZRUXDGVwn/6J86DhKOKwZNcC0YPt04eEqOCuiVKIz74cQgeBbMsidt5R38d/gt1BV6y2/UGZd1aaBC45OOiSiNmnsojuJDuy+8iUNU1hOUgHuHUf1es+LKyi/sDm1g==
template:
metadata:
creationTimestamp: null

File diff suppressed because it is too large Load Diff

View File

@ -12,6 +12,8 @@ spec:
app: grafana
template:
metadata:
annotations:
checksum/grafana-datasources: 7103d054a6e94f976ca59b4ede77cf88
labels:
app: grafana
spec:

View File

@ -30,7 +30,7 @@ spec:
- args:
- --logtostderr
- --secure-listen-address=:8443
- --tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_RSA_WITH_AES_128_CBC_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256,TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256
- --tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305
- --upstream=http://127.0.0.1:8081/
image: quay.io/coreos/kube-rbac-proxy:v0.4.1
name: kube-rbac-proxy-main
@ -42,7 +42,7 @@ spec:
- args:
- --logtostderr
- --secure-listen-address=:9443
- --tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_RSA_WITH_AES_128_CBC_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256,TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256
- --tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305
- --upstream=http://127.0.0.1:8082/
image: quay.io/coreos/kube-rbac-proxy:v0.4.1
name: kube-rbac-proxy-self

View File

@ -48,7 +48,7 @@ spec:
- args:
- --logtostderr
- --secure-listen-address=[$(IP)]:9100
- --tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_RSA_WITH_AES_128_CBC_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256,TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256
- --tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305
- --upstream=http://127.0.0.1:9100/
env:
- name: IP

View File

@ -12,7 +12,7 @@ spec:
namespace: monitoring
port: web
externalUrl: http://prometheus-k8s.monitoring:9090
image: quay.io/prometheus/prometheus:v2.17.2
image: quay.io/prometheus/prometheus:v2.20.0
nodeSelector:
kubernetes.io/os: linux
podMonitorNamespaceSelector:
@ -56,4 +56,4 @@ spec:
requests:
storage: 10Gi
storageClassName: local-path
version: v2.17.2
version: v2.20.0

View File

@ -74,8 +74,14 @@ spec:
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[1d]))
-
(
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[1d])) +
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[1d])) +
(
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[1d]))
or
vector(0)
)
+
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[1d]))
+
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[1d]))
)
)
@ -95,8 +101,14 @@ spec:
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[1h]))
-
(
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[1h])) +
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[1h])) +
(
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[1h]))
or
vector(0)
)
+
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[1h]))
+
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[1h]))
)
)
@ -116,8 +128,14 @@ spec:
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[2h]))
-
(
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[2h])) +
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[2h])) +
(
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[2h]))
or
vector(0)
)
+
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[2h]))
+
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[2h]))
)
)
@ -137,8 +155,14 @@ spec:
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[30m]))
-
(
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[30m])) +
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[30m])) +
(
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[30m]))
or
vector(0)
)
+
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[30m]))
+
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[30m]))
)
)
@ -158,8 +182,14 @@ spec:
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[3d]))
-
(
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[3d])) +
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[3d])) +
(
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[3d]))
or
vector(0)
)
+
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[3d]))
+
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[3d]))
)
)
@ -179,8 +209,14 @@ spec:
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[5m]))
-
(
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[5m])) +
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[5m])) +
(
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[5m]))
or
vector(0)
)
+
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[5m]))
+
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[5m]))
)
)
@ -200,8 +236,14 @@ spec:
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[6h]))
-
(
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[6h])) +
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[6h])) +
(
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[6h]))
or
vector(0)
)
+
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[6h]))
+
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[6h]))
)
)
@ -384,8 +426,14 @@ spec:
sum(increase(apiserver_request_duration_seconds_count{verb=~"LIST|GET"}[30d]))
-
(
sum(increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope=~"resource|",le="0.1"}[30d])) +
sum(increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope="namespace",le="0.5"}[30d])) +
(
sum(increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope=~"resource|",le="0.1"}[30d]))
or
vector(0)
)
+
sum(increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope="namespace",le="0.5"}[30d]))
+
sum(increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope="cluster",le="5"}[30d]))
)
) +
@ -403,8 +451,14 @@ spec:
-
(
# too slow
sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[30d])) +
sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[30d])) +
(
sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[30d]))
or
vector(0)
)
+
sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[30d]))
+
sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[30d]))
)
+
@ -592,7 +646,7 @@ spec:
)
labels:
workload_type: deployment
record: mixin_pod_workload
record: namespace_workload_pod:kube_pod_owner:relabel
- expr: |
max by (cluster, namespace, workload, pod) (
label_replace(
@ -602,7 +656,7 @@ spec:
)
labels:
workload_type: daemonset
record: mixin_pod_workload
record: namespace_workload_pod:kube_pod_owner:relabel
- expr: |
max by (cluster, namespace, workload, pod) (
label_replace(
@ -612,7 +666,7 @@ spec:
)
labels:
workload_type: statefulset
record: mixin_pod_workload
record: namespace_workload_pod:kube_pod_owner:relabel
- name: kube-scheduler.rules
rules:
- expr: |
@ -711,9 +765,6 @@ spec:
- expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[3m])) BY
(instance)
record: instance:node_cpu:rate:sum
- expr: sum((node_filesystem_size_bytes{mountpoint="/"} - node_filesystem_free_bytes{mountpoint="/"}))
BY (instance)
record: instance:node_filesystem_usage:sum
- expr: sum(rate(node_network_receive_bytes_total[3m])) BY (instance)
record: instance:node_network_receive_bytes:rate:sum
- expr: sum(rate(node_network_transmit_bytes_total[3m])) BY (instance)
@ -1087,13 +1138,33 @@ spec:
severity: warning
- alert: KubeDaemonSetRolloutStuck
annotations:
message: Only {{ $value | humanizePercentage }} of the desired Pods of DaemonSet
{{ $labels.namespace }}/{{ $labels.daemonset }} are scheduled and ready.
message: DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not
finished or progressed for at least 15 minutes.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck
expr: |
kube_daemonset_status_number_ready{job="kube-state-metrics"}
/
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} < 1.00
(
(
kube_daemonset_status_current_number_scheduled{job="kube-state-metrics"}
!=
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"}
) or (
kube_daemonset_status_number_misscheduled{job="kube-state-metrics"}
!=
0
) or (
kube_daemonset_updated_number_scheduled{job="kube-state-metrics"}
!=
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"}
) or (
kube_daemonset_status_number_available{job="kube-state-metrics"}
!=
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"}
)
) and (
changes(kube_daemonset_updated_number_scheduled{job="kube-state-metrics"}[5m])
==
0
)
for: 15m
labels:
severity: warning
@ -1132,11 +1203,11 @@ spec:
- alert: KubeJobCompletion
annotations:
message: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more
than one hour to complete.
than 12 hours to complete.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobcompletion
expr: |
kube_job_spec_completions{job="kube-state-metrics"} - kube_job_status_succeeded{job="kube-state-metrics"} > 0
for: 1h
for: 12h
labels:
severity: warning
- alert: KubeJobFailed
@ -1256,7 +1327,7 @@ spec:
> ( 25 / 100 )
for: 15m
labels:
severity: warning
severity: info
- name: kubernetes-storage
rules:
- alert: KubePersistentVolumeFillingUp
@ -1308,7 +1379,7 @@ spec:
components running.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeversionmismatch
expr: |
count(count by (gitVersion) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"gitVersion","$1","gitVersion","(v[0-9]*.[0-9]*.[0-9]*).*"))) > 1
count(count by (gitVersion) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"gitVersion","$1","gitVersion","(v[0-9]*.[0-9]*).*"))) > 1
for: 15m
labels:
severity: warning
@ -1412,11 +1483,11 @@ spec:
severity: warning
- alert: AggregatedAPIDown
annotations:
message: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} is down.
It has not been available at least for the past five minutes.
message: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has
been only {{ $value | humanize }}% available over the last 5m.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-aggregatedapidown
expr: |
sum by(name, namespace)(sum_over_time(aggregator_unavailable_apiservice[5m])) > 0
(1 - max by(name, namespace)(avg_over_time(aggregator_unavailable_apiservice[5m]))) * 100 < 90
for: 5m
labels:
severity: warning
@ -1445,7 +1516,7 @@ spec:
message: '{{ $labels.node }} is unreachable and some workloads may be rescheduled.'
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodeunreachable
expr: |
(kube_node_spec_taint{job="kube-state-metrics",key="node.kubernetes.io/unreachable",effect="NoSchedule"} unless ignoring(key,value) kube_node_spec_taint{job="kube-state-metrics",key="ToBeDeletedByClusterAutoscaler"}) == 1
(kube_node_spec_taint{job="kube-state-metrics",key="node.kubernetes.io/unreachable",effect="NoSchedule"} unless ignoring(key,value) kube_node_spec_taint{job="kube-state-metrics",key=~"ToBeDeletedByClusterAutoscaler|cloud.google.com/impending-node-termination|aws-node-termination-handler/spot-itn"}) == 1
labels:
severity: warning
- alert: KubeletTooManyPods
@ -1454,7 +1525,13 @@ spec:
}} of its Pod capacity.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods
expr: |
max(max(kubelet_running_pod_count{job="kubelet", metrics_path="/metrics"}) by(instance) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"}) by(node) / max(kube_node_status_capacity_pods{job="kube-state-metrics"} != 1) by(node) > 0.95
count by(node) (
(kube_pod_status_phase{job="kube-state-metrics",phase="Running"} == 1) * on(instance,pod,namespace,cluster) group_left(node) topk by(instance,pod,namespace,cluster) (1, kube_pod_info{job="kube-state-metrics"})
)
/
max by(node) (
kube_node_status_capacity_pods{job="kube-state-metrics"} != 1
) > 0.95
for: 15m
labels:
severity: warning
@ -1793,6 +1870,24 @@ spec:
severity: warning
- name: prometheus-operator
rules:
- alert: PrometheusOperatorListErrors
annotations:
message: Errors while performing List operations in controller {{$labels.controller}}
in {{$labels.namespace}} namespace.
expr: |
(sum by (controller,namespace) (rate(prometheus_operator_list_operations_failed_total{job="prometheus-operator",namespace="monitoring"}[1h])) / sum by (controller,namespace) (rate(prometheus_operator_list_operations_total{job="prometheus-operator",namespace="monitoring"}[1h]))) > 0.4
for: 15m
labels:
severity: warning
- alert: PrometheusOperatorWatchErrors
annotations:
message: Errors while performing Watch operations in controller {{$labels.controller}}
in {{$labels.namespace}} namespace.
expr: |
(sum by (controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job="prometheus-operator",namespace="monitoring"}[1h])) / sum by (controller,namespace) (rate(prometheus_operator_watch_operations_total{job="prometheus-operator",namespace="monitoring"}[1h]))) > 0.4
for: 15m
labels:
severity: warning
- alert: PrometheusOperatorReconcileErrors
annotations:
message: Errors while reconciling {{ $labels.controller }} in {{ $labels.namespace

View File

@ -43,7 +43,7 @@ spec:
sourceLabels:
- __name__
- action: drop
regex: etcd_(debugging|disk|request|server).*
regex: etcd_(debugging|disk|server).*
sourceLabels:
- __name__
- action: drop

View File

@ -7,7 +7,8 @@ metadata:
namespace: monitoring
spec:
endpoints:
- interval: 30s
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
interval: 30s
metricRelabelings:
- action: drop
regex: kubelet_(pod_worker_latency_microseconds|pod_start_latency_microseconds|cgroup_manager_latency_microseconds|pod_worker_start_latency_microseconds|pleg_relist_latency_microseconds|pleg_relist_interval_microseconds|runtime_operations|runtime_operations_latency_microseconds|runtime_operations_errors|eviction_stats_age_microseconds|device_plugin_registration_count|device_plugin_alloc_latency_microseconds|network_plugin_operations_latency_microseconds)
@ -45,7 +46,10 @@ spec:
regex: etcd_(debugging|disk|request|server).*
sourceLabels:
- __name__
port: http-metrics
port: https-metrics
scheme: https
tlsConfig:
insecureSkipVerify: true
jobLabel: k8s-app
namespaceSelector:
matchNames:

View File

@ -7,8 +7,12 @@ metadata:
namespace: monitoring
spec:
endpoints:
- interval: 30s
port: http-metrics
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
interval: 30s
port: https-metrics
scheme: https
tlsConfig:
insecureSkipVerify: true
jobLabel: k8s-app
namespaceSelector:
matchNames:

View File

@ -43,7 +43,7 @@ spec:
- args:
- --logtostderr
- --secure-listen-address=:8443
- --tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_RSA_WITH_AES_128_CBC_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256,TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256
- --tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305
- --upstream=http://127.0.0.1:8080/
image: quay.io/coreos/kube-rbac-proxy:v0.4.1
name: kube-rbac-proxy

View File

@ -77,28 +77,33 @@ local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
local configMap = k.core.v1.configMap;
local dashboardSources = {
apiVersion: 1,
providers: [
{
name: '0',
orgId: 1,
folder: 'Default',
type: 'file',
options: {
path: '/grafana-dashboard-definitions/0',
},
},
] + [
{
name: folder,
orgId: 1,
folder: folder,
type: 'file',
options: {
path: '/grafana-dashboard-definitions/' + folder,
},
}
for folder in std.objectFields($._config.grafana.folderDashboards)
],
providers:
(
if std.length($._config.grafana.dashboards) +
std.length($._config.grafana.rawDashboards) > 0 then [
{
name: '0',
orgId: 1,
folder: 'Default',
type: 'file',
options: {
path: '/grafana-dashboard-definitions/0',
},
},
] else []
) +
[
{
name: folder,
orgId: 1,
folder: folder,
type: 'file',
options: {
path: '/grafana-dashboard-definitions/' + folder,
},
}
for folder in std.objectFields($._config.grafana.folderDashboards)
],
};
configMap.new('grafana-dashboards', { 'dashboards.yaml': std.manifestJsonEx(dashboardSources, ' ') }) +
@ -224,6 +229,10 @@ local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
deployment.mixin.metadata.withNamespace($._config.namespace) +
deployment.mixin.metadata.withLabels(podLabels) +
deployment.mixin.spec.selector.withMatchLabels(podLabels) +
deployment.mixin.spec.template.metadata.withAnnotations({
[if std.length($._config.grafana.config) > 0 then 'checksum/grafana-config']: std.md5(std.toString($.grafana.config)),
'checksum/grafana-datasources': std.md5(std.toString($.grafana.dashboardDatasources)),
}) +
deployment.mixin.spec.template.spec.withNodeSelector({ 'beta.kubernetes.io/os': 'linux' }) +
deployment.mixin.spec.template.spec.withVolumes(volumes) +
deployment.mixin.spec.template.spec.securityContext.withRunAsNonRoot(true) +

View File

@ -1,6 +1,12 @@
{
_config+:: {
etcd_selector: 'job=~".*etcd.*"',
// etcd_instance_labels are the label names that are uniquely
// identifying an instance and need to be aggreated away for alerts
// that are about an etcd cluster as a whole. For example, if etcd
// instances are deployed on K8s, you will likely want to change
// this to 'instance, pod'.
etcd_instance_labels: 'instance',
},
prometheusAlerts+:: {
@ -11,11 +17,11 @@
{
alert: 'etcdMembersDown',
expr: |||
max by (job) (
sum by (job) (up{%(etcd_selector)s} == bool 0)
max without (endpoint) (
sum without (%(etcd_instance_labels)s) (up{%(etcd_selector)s} == bool 0)
or
count by (job,endpoint) (
sum by (job,endpoint,To) (rate(etcd_network_peer_sent_failures_total{%(etcd_selector)s}[3m])) > 0.01
count without (To) (
sum without (%(etcd_instance_labels)s) (rate(etcd_network_peer_sent_failures_total{%(etcd_selector)s}[1m])) > 0.01
)
)
> 0
@ -31,7 +37,7 @@
{
alert: 'etcdInsufficientMembers',
expr: |||
sum(up{%(etcd_selector)s} == bool 1) by (job) < ((count(up{%(etcd_selector)s}) by (job) + 1) / 2)
sum(up{%(etcd_selector)s} == bool 1) without (%(etcd_instance_labels)s) < ((count(up{%(etcd_selector)s}) without (%(etcd_instance_labels)s) + 1) / 2)
||| % $._config,
'for': '3m',
labels: {
@ -57,7 +63,7 @@
{
alert: 'etcdHighNumberOfLeaderChanges',
expr: |||
increase((max by (job) (etcd_server_leader_changes_seen_total{%(etcd_selector)s}) or 0*absent(etcd_server_leader_changes_seen_total{%(etcd_selector)s}))[15m:1m]) >= 3
increase((max without (%(etcd_instance_labels)s) (etcd_server_leader_changes_seen_total{%(etcd_selector)s}) or 0*absent(etcd_server_leader_changes_seen_total{%(etcd_selector)s}))[15m:1m]) >= 4
||| % $._config,
'for': '5m',
labels: {
@ -70,9 +76,9 @@
{
alert: 'etcdHighNumberOfFailedGRPCRequests',
expr: |||
100 * sum(rate(grpc_server_handled_total{%(etcd_selector)s, grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method)
100 * sum(rate(grpc_server_handled_total{%(etcd_selector)s, grpc_code!="OK"}[5m])) without (grpc_type, grpc_code)
/
sum(rate(grpc_server_handled_total{%(etcd_selector)s}[5m])) BY (job, instance, grpc_service, grpc_method)
sum(rate(grpc_server_handled_total{%(etcd_selector)s}[5m])) without (grpc_type, grpc_code)
> 1
||| % $._config,
'for': '10m',
@ -86,9 +92,9 @@
{
alert: 'etcdHighNumberOfFailedGRPCRequests',
expr: |||
100 * sum(rate(grpc_server_handled_total{%(etcd_selector)s, grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method)
100 * sum(rate(grpc_server_handled_total{%(etcd_selector)s, grpc_code!="OK"}[5m])) without (grpc_type, grpc_code)
/
sum(rate(grpc_server_handled_total{%(etcd_selector)s}[5m])) BY (job, instance, grpc_service, grpc_method)
sum(rate(grpc_server_handled_total{%(etcd_selector)s}[5m])) without (grpc_type, grpc_code)
> 5
||| % $._config,
'for': '5m',
@ -102,7 +108,7 @@
{
alert: 'etcdGRPCRequestsSlow',
expr: |||
histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{%(etcd_selector)s, grpc_type="unary"}[5m])) by (job, instance, grpc_service, grpc_method, le))
histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{%(etcd_selector)s, grpc_type="unary"}[5m])) without(grpc_type))
> 0.15
||| % $._config,
'for': '10m',
@ -171,8 +177,8 @@
{
alert: 'etcdHighNumberOfFailedHTTPRequests',
expr: |||
sum(rate(etcd_http_failed_total{%(etcd_selector)s, code!="404"}[5m])) BY (method) / sum(rate(etcd_http_received_total{%(etcd_selector)s}[5m]))
BY (method) > 0.01
sum(rate(etcd_http_failed_total{%(etcd_selector)s, code!="404"}[5m])) without (code) / sum(rate(etcd_http_received_total{%(etcd_selector)s}[5m]))
without (code) > 0.01
||| % $._config,
'for': '10m',
labels: {
@ -185,8 +191,8 @@
{
alert: 'etcdHighNumberOfFailedHTTPRequests',
expr: |||
sum(rate(etcd_http_failed_total{%(etcd_selector)s, code!="404"}[5m])) BY (method) / sum(rate(etcd_http_received_total{%(etcd_selector)s}[5m]))
BY (method) > 0.05
sum(rate(etcd_http_failed_total{%(etcd_selector)s, code!="404"}[5m])) without (code) / sum(rate(etcd_http_received_total{%(etcd_selector)s}[5m]))
without (code) > 0.05
||| % $._config,
'for': '10m',
labels: {

View File

@ -99,7 +99,7 @@ tests:
job: etcd
severity: warning
exp_annotations:
message: 'etcd cluster "etcd": 3 leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.'
message: 'etcd cluster "etcd": 4 leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.'
- interval: 1m
input_series:
- series: 'etcd_server_leader_changes_seen_total{job="etcd",instance="10.10.10.0"}'

View File

@ -5,7 +5,7 @@ local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
namespace: 'default',
versions+:: {
alertmanager: 'v0.20.0',
alertmanager: 'v0.21.0',
},
imageRepos+:: {

View File

@ -4,6 +4,32 @@
{
name: 'prometheus-operator',
rules: [
{
alert: 'PrometheusOperatorListErrors',
expr: |||
(sum by (controller,namespace) (rate(prometheus_operator_list_operations_failed_total{%(prometheusOperatorSelector)s}[1h])) / sum by (controller,namespace) (rate(prometheus_operator_list_operations_total{%(prometheusOperatorSelector)s}[1h]))) > 0.4
||| % $._config,
labels: {
severity: 'warning',
},
annotations: {
message: 'Errors while performing List operations in controller {{$labels.controller}} in {{$labels.namespace}} namespace.',
},
'for': '15m',
},
{
alert: 'PrometheusOperatorWatchErrors',
expr: |||
(sum by (controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{%(prometheusOperatorSelector)s}[1h])) / sum by (controller,namespace) (rate(prometheus_operator_watch_operations_total{%(prometheusOperatorSelector)s}[1h]))) > 0.4
||| % $._config,
labels: {
severity: 'warning',
},
annotations: {
message: 'Errors while performing Watch operations in controller {{$labels.controller}} in {{$labels.namespace}} namespace.',
},
'for': '15m',
},
{
alert: 'PrometheusOperatorReconcileErrors',
expr: |||

View File

@ -5,12 +5,12 @@ local servicePort = k.core.v1.service.mixin.spec.portsType;
{
prometheus+:: {
kubeControllerManagerPrometheusDiscoveryService:
service.new('kube-controller-manager-prometheus-discovery', { 'k8s-app': 'kube-controller-manager' }, servicePort.newNamed('http-metrics', 10252, 10252)) +
service.new('kube-controller-manager-prometheus-discovery', { 'k8s-app': 'kube-controller-manager' }, servicePort.newNamed('https-metrics', 10257, 10257)) +
service.mixin.metadata.withNamespace('kube-system') +
service.mixin.metadata.withLabels({ 'k8s-app': 'kube-controller-manager' }) +
service.mixin.spec.withClusterIp('None'),
kubeSchedulerPrometheusDiscoveryService:
service.new('kube-scheduler-prometheus-discovery', { 'k8s-app': 'kube-scheduler' }, servicePort.newNamed('http-metrics', 10251, 10251)) +
service.new('kube-scheduler-prometheus-discovery', { 'k8s-app': 'kube-scheduler' }, servicePort.newNamed('https-metrics', 10259, 10259)) +
service.mixin.metadata.withNamespace('kube-system') +
service.mixin.metadata.withLabels({ 'k8s-app': 'kube-scheduler' }) +
service.mixin.spec.withClusterIp('None'),

View File

@ -5,12 +5,12 @@ local servicePort = k.core.v1.service.mixin.spec.portsType;
{
prometheus+:: {
kubeControllerManagerPrometheusDiscoveryService:
service.new('kube-controller-manager-prometheus-discovery', { 'k8s-app': 'kube-controller-manager' }, servicePort.newNamed('http-metrics', 10252, 10252)) +
service.new('kube-controller-manager-prometheus-discovery', { 'k8s-app': 'kube-controller-manager' }, servicePort.newNamed('https-metrics', 10257, 10257)) +
service.mixin.metadata.withNamespace('kube-system') +
service.mixin.metadata.withLabels({ 'k8s-app': 'kube-controller-manager' }) +
service.mixin.spec.withClusterIp('None'),
kubeSchedulerPrometheusDiscoveryService:
service.new('kube-scheduler-prometheus-discovery', { 'k8s-app': 'kube-scheduler' }, servicePort.newNamed('http-metrics', 10251, 10251)) +
service.new('kube-scheduler-prometheus-discovery', { 'k8s-app': 'kube-scheduler' }, servicePort.newNamed('https-metrics', 10259, 10259)) +
service.mixin.metadata.withNamespace('kube-system') +
service.mixin.metadata.withLabels({ 'k8s-app': 'kube-scheduler' }) +
service.mixin.spec.withClusterIp('None'),

View File

@ -5,12 +5,12 @@ local servicePort = k.core.v1.service.mixin.spec.portsType;
{
prometheus+: {
kubeControllerManagerPrometheusDiscoveryService:
service.new('kube-controller-manager-prometheus-discovery', { 'k8s-app': 'kube-controller-manager' }, servicePort.newNamed('http-metrics', 10252, 10252)) +
service.new('kube-controller-manager-prometheus-discovery', { 'k8s-app': 'kube-controller-manager' }, servicePort.newNamed('https-metrics', 10257, 10257)) +
service.mixin.metadata.withNamespace('kube-system') +
service.mixin.metadata.withLabels({ 'k8s-app': 'kube-controller-manager' }) +
service.mixin.spec.withClusterIp('None'),
kubeSchedulerPrometheusDiscoveryService:
service.new('kube-scheduler-prometheus-discovery', { 'k8s-app': 'kube-scheduler' }, servicePort.newNamed('http-metrics', 10251, 10251)) +
service.new('kube-scheduler-prometheus-discovery', { 'k8s-app': 'kube-scheduler' }, servicePort.newNamed('https-metrics', 10259, 10259)) +
service.mixin.metadata.withNamespace('kube-system') +
service.mixin.metadata.withLabels({ 'k8s-app': 'kube-scheduler' }) +
service.mixin.spec.withClusterIp('None'),

View File

@ -5,12 +5,12 @@ local servicePort = k.core.v1.service.mixin.spec.portsType;
{
prometheus+: {
kubeControllerManagerPrometheusDiscoveryService:
service.new('kube-controller-manager-prometheus-discovery', { component: 'kube-controller-manager' }, servicePort.newNamed('http-metrics', 10252, 10252)) +
service.new('kube-controller-manager-prometheus-discovery', { component: 'kube-controller-manager' }, servicePort.newNamed('https-metrics', 10257, 10257)) +
service.mixin.metadata.withNamespace('kube-system') +
service.mixin.metadata.withLabels({ 'k8s-app': 'kube-controller-manager' }) +
service.mixin.spec.withClusterIp('None'),
kubeSchedulerPrometheusDiscoveryService:
service.new('kube-scheduler-prometheus-discovery', { component: 'kube-scheduler' }, servicePort.newNamed('http-metrics', 10251, 10251)) +
service.new('kube-scheduler-prometheus-discovery', { component: 'kube-scheduler' }, servicePort.newNamed('https-metrics', 10259, 10259)) +
service.mixin.metadata.withNamespace('kube-system') +
service.mixin.metadata.withLabels({ 'k8s-app': 'kube-scheduler' }) +
service.mixin.spec.withClusterIp('None'),

View File

@ -6,12 +6,12 @@ local servicePort = k.core.v1.service.mixin.spec.portsType;
prometheus+: {
kubeControllerManagerPrometheusDiscoveryService:
service.new('kube-controller-manager-prometheus-discovery', { 'component': 'kube-controller-manager' }, servicePort.newNamed('http-metrics', 10252, 10252)) +
service.new('kube-controller-manager-prometheus-discovery', { 'component': 'kube-controller-manager' }, servicePort.newNamed('https-metrics', 10257, 10257)) +
service.mixin.metadata.withNamespace('kube-system') +
service.mixin.metadata.withLabels({ 'k8s-app': 'kube-controller-manager' }) +
service.mixin.spec.withClusterIp('None'),
kubeSchedulerPrometheusDiscoveryService:
service.new('kube-scheduler-prometheus-discovery', { 'component': 'kube-scheduler' }, servicePort.newNamed('http-metrics', 10251, 10251)) +
service.new('kube-scheduler-prometheus-discovery', { 'component': 'kube-scheduler' }, servicePort.newNamed('https-metrics', 10259, 10259)) +
service.mixin.metadata.withNamespace('kube-system') +
service.mixin.metadata.withLabels({ 'k8s-app': 'kube-scheduler' }) +
service.mixin.spec.withClusterIp('None'),

View File

@ -5,7 +5,7 @@ local servicePort = k.core.v1.service.mixin.spec.portsType;
{
_config+:: {
versions+:: {
thanos: 'v0.10.0',
thanos: 'v0.14.0',
},
imageRepos+:: {
thanos: 'quay.io/thanos/thanos',

View File

@ -104,36 +104,36 @@ local configMapList = k3.core.v1.configMapList;
namespace: 'default',
versions+:: {
grafana: '6.7.4',
grafana: '7.1.0',
},
tlsCipherSuites: [
'TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256', // required by h2: http://golang.org/cl/30721
'TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256', // required by h2: http://golang.org/cl/30721
// 'TLS_RSA_WITH_RC4_128_SHA', // insecure: https://access.redhat.com/security/cve/cve-2013-2566
// 'TLS_RSA_WITH_3DES_EDE_CBC_SHA', // insecure: https://access.redhat.com/articles/2548661
// 'TLS_RSA_WITH_AES_128_CBC_SHA', // disabled by h2
// 'TLS_RSA_WITH_AES_256_CBC_SHA', // disabled by h2
'TLS_RSA_WITH_AES_128_CBC_SHA256',
// 'TLS_RSA_WITH_AES_128_GCM_SHA256', // disabled by h2
// 'TLS_RSA_WITH_AES_256_GCM_SHA384', // disabled by h2
// 'TLS_ECDHE_ECDSA_WITH_RC4_128_SHA', // insecure: https://access.redhat.com/security/cve/cve-2013-2566
// 'TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA',// disabled by h2
// 'TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA',// disabled by h2
// 'TLS_ECDHE_RSA_WITH_RC4_128_SHA', // insecure: https://access.redhat.com/security/cve/cve-2013-2566
// 'TLS_ECDHE_RSA_WITH_3DES_EDE_CBC_SHA', // insecure: https://access.redhat.com/articles/2548661
// 'TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA', // disabled by h2
// 'TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA', // disabled by h2
'TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256',
'TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256',
// 'TLS_RSA_WITH_RC4_128_SHA', // insecure: https://access.redhat.com/security/cve/cve-2013-2566
// 'TLS_RSA_WITH_3DES_EDE_CBC_SHA', // insecure: https://access.redhat.com/articles/2548661
// 'TLS_RSA_WITH_AES_128_CBC_SHA', // disabled by h2
// 'TLS_RSA_WITH_AES_256_CBC_SHA', // disabled by h2
// 'TLS_RSA_WITH_AES_128_CBC_SHA256', // insecure: https://access.redhat.com/security/cve/cve-2013-0169
// 'TLS_RSA_WITH_AES_128_GCM_SHA256', // disabled by h2
// 'TLS_RSA_WITH_AES_256_GCM_SHA384', // disabled by h2
// 'TLS_ECDHE_ECDSA_WITH_RC4_128_SHA', // insecure: https://access.redhat.com/security/cve/cve-2013-2566
// 'TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA', // disabled by h2
// 'TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA', // disabled by h2
// 'TLS_ECDHE_RSA_WITH_RC4_128_SHA', // insecure: https://access.redhat.com/security/cve/cve-2013-2566
// 'TLS_ECDHE_RSA_WITH_3DES_EDE_CBC_SHA', // insecure: https://access.redhat.com/articles/2548661
// 'TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA', // disabled by h2
// 'TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA', // disabled by h2
// 'TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256', // insecure: https://access.redhat.com/security/cve/cve-2013-0169
// 'TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256', // insecure: https://access.redhat.com/security/cve/cve-2013-0169
// disabled by h2 means: https://github.com/golang/net/blob/e514e69ffb8bc3c76a71ae40de0118d794855992/http2/ciphers.go
// 'TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384', // TODO: Might not work with h2
// 'TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384', // TODO: Might not work with h2
// 'TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305', // TODO: Might not work with h2
// 'TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305', // TODO: Might not work with h2
'TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384',
'TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384',
'TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305',
'TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305',
],
cadvisorSelector: 'job="kubelet", metrics_path="/metrics/cadvisor"',

View File

@ -6,7 +6,7 @@ local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
namespace: 'default',
versions+:: {
prometheus: 'v2.17.2',
prometheus: 'v2.20.0',
},
imageRepos+:: {
@ -246,8 +246,13 @@ local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
jobLabel: 'k8s-app',
endpoints: [
{
port: 'http-metrics',
port: 'https-metrics',
interval: '30s',
scheme: "https",
bearerTokenFile: "/var/run/secrets/kubernetes.io/serviceaccount/token",
tlsConfig: {
insecureSkipVerify: true
}
},
],
selector: {
@ -347,8 +352,13 @@ local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
jobLabel: 'k8s-app',
endpoints: [
{
port: 'http-metrics',
port: 'https-metrics',
interval: '30s',
scheme: "https",
bearerTokenFile: "/var/run/secrets/kubernetes.io/serviceaccount/token",
tlsConfig: {
insecureSkipVerify: true
},
metricRelabelings: (import 'kube-prometheus/dropping-deprecated-metrics-relabelings.libsonnet') + [
{
sourceLabels: ['__name__'],
@ -407,7 +417,7 @@ local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
metricRelabelings: (import 'kube-prometheus/dropping-deprecated-metrics-relabelings.libsonnet') + [
{
sourceLabels: ['__name__'],
regex: 'etcd_(debugging|disk|request|server).*',
regex: 'etcd_(debugging|disk|server).*',
action: 'drop',
},
{

View File

@ -8,10 +8,6 @@
expr: 'sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[3m])) BY (instance)',
record: 'instance:node_cpu:rate:sum',
},
{
expr: 'sum((node_filesystem_size_bytes{mountpoint="/"} - node_filesystem_free_bytes{mountpoint="/"})) BY (instance)',
record: 'instance:node_filesystem_usage:sum',
},
{
expr: 'sum(rate(node_network_receive_bytes_total[3m])) BY (instance)',
record: 'instance:node_network_receive_bytes:rate:sum',

View File

@ -1,39 +0,0 @@
{
/**
* @name gauge.new
*/
new(
title,
datasource=null,
calc='mean',
time_from=null,
span=null,
description='',
height=null,
transparent=null,
)::
{
[if description != '' then 'description']: description,
[if height != null then 'height']: height,
[if transparent != null then 'transparent']: transparent,
[if time_from != null then 'timeFrom']: time_from,
[if span != null then 'span']: span,
title: title,
type: 'gauge',
datasource: datasource,
options: {
fieldOptions: {
calcs: [
calc,
],
},
},
_nextTarget:: 0,
addTarget(target):: self {
local nextTarget = super._nextTarget,
_nextTarget: nextTarget + 1,
targets+: [target { refId: std.char(std.codepoint('A') + nextTarget) }],
},
},
}

View File

@ -0,0 +1,193 @@
{
/**
* Creates a [gauge panel](https://grafana.com/docs/grafana/latest/panels/visualizations/gauge-panel/).
*
* @name gaugePanel.new
*
* @param title Panel title.
* @param description Panel description.
* @param transparent Whether to display the panel without a background.
* @param datasource Panel datasource.
* @param allValues Show all values instead of reducing to one.
* @param valueLimit Limit of values in all values mode.
* @param reducerFunction Function to use to reduce values to when using single value.
* @param fields Fields that should be included in the panel.
* @param showThresholdLabels Render the threshold values around the gauge bar.
* @param showThresholdMarkers Render the thresholds as an outer bar.
* @param unit Panel unit field option.
* @param min Leave empty to calculate based on all values.
* @param max Leave empty to calculate based on all values.
* @param decimals Number of decimal places to show.
* @param displayName Change the field or series name.
* @param noValue What to show when there is no value.
* @param thresholdsMode 'absolute' or 'percentage'.
* @param repeat Name of variable that should be used to repeat this panel.
* @param repeatDirection 'h' for horizontal or 'v' for vertical.
* @param repeatMaxPerRow Maximum panels per row in repeat mode.
* @param pluginVersion Plugin version the panel should be modeled for. This has been tested with the default, '7', and '6.7'.
*
* @method addTarget(target) Adds a target object.
* @method addTargets(targets) Adds an array of targets.
* @method addLink(link) Adds a link. Argument format: `{ title: 'Link Title', url: 'https://...', targetBlank: true }`.
* @method addLinks(links) Adds an array of links.
* @method addThreshold(step) Adds a threshold step. Argument format: `{ color: 'green', value: 0 }`.
* @method addThresholds(steps) Adds an array of threshold steps.
* @method addMapping(mapping) Adds a value mapping.
* @method addMappings(mappings) Adds an array of value mappings.
* @method addDataLink(link) Adds a data link.
* @method addDataLinks(links) Adds an array of data links.
*/
new(
title,
description=null,
transparent=false,
datasource=null,
allValues=false,
valueLimit=null,
reducerFunction='mean',
fields='',
showThresholdLabels=false,
showThresholdMarkers=true,
unit='percent',
min=0,
max=100,
decimals=null,
displayName=null,
noValue=null,
thresholdsMode='absolute',
repeat=null,
repeatDirection='h',
repeatMaxPerRow=null,
pluginVersion='7',
):: {
type: 'gauge',
title: title,
[if description != null then 'description']: description,
transparent: transparent,
datasource: datasource,
targets: [],
links: [],
[if repeat != null then 'repeat']: repeat,
[if repeat != null then 'repeatDirection']: repeatDirection,
[if repeat != null then 'repeatMaxPerRow']: repeatMaxPerRow,
// targets
_nextTarget:: 0,
addTarget(target):: self {
local nextTarget = super._nextTarget,
_nextTarget: nextTarget + 1,
targets+: [target { refId: std.char(std.codepoint('A') + nextTarget) }],
},
addTargets(targets):: std.foldl(function(p, t) p.addTarget(t), targets, self),
// links
addLink(link):: self {
links+: [link],
},
addLinks(links):: std.foldl(function(p, l) p.addLink(l), links, self),
pluginVersion: pluginVersion,
} + (
if pluginVersion >= '7' then {
options: {
reduceOptions: {
values: allValues,
[if allValues && valueLimit != null then 'limit']: valueLimit,
calcs: [
reducerFunction,
],
fields: fields,
},
showThresholdLabels: showThresholdLabels,
showThresholdMarkers: showThresholdMarkers,
},
fieldConfig: {
defaults: {
unit: unit,
[if min != null then 'min']: min,
[if max != null then 'max']: max,
[if decimals != null then 'decimals']: decimals,
[if displayName != null then 'displayName']: displayName,
[if noValue != null then 'noValue']: noValue,
thresholds: {
mode: thresholdsMode,
steps: [],
},
mappings: [],
links: [],
},
},
// thresholds
addThreshold(step):: self {
fieldConfig+: { defaults+: { thresholds+: { steps+: [step] } } },
},
// mappings
_nextMapping:: 0,
addMapping(mapping):: self {
local nextMapping = super._nextMapping,
_nextMapping: nextMapping + 1,
fieldConfig+: { defaults+: { mappings+: [mapping { id: nextMapping }] } },
},
// data links
addDataLink(link):: self {
fieldConfig+: { defaults+: { links+: [link] } },
},
} else {
options: {
fieldOptions: {
values: allValues,
[if allValues && valueLimit != null then 'limit']: valueLimit,
calcs: [
reducerFunction,
],
fields: fields,
defaults: {
unit: unit,
[if min != null then 'min']: min,
[if max != null then 'max']: max,
[if decimals != null then 'decimals']: decimals,
[if displayName != null then 'displayName']: displayName,
[if noValue != null then 'noValue']: noValue,
thresholds: {
mode: thresholdsMode,
steps: [],
},
mappings: [],
links: [],
},
},
showThresholdLabels: showThresholdLabels,
showThresholdMarkers: showThresholdMarkers,
},
// thresholds
addThreshold(step):: self {
options+: { fieldOptions+: { defaults+: { thresholds+: { steps+: [step] } } } },
},
// mappings
_nextMapping:: 0,
addMapping(mapping):: self {
local nextMapping = super._nextMapping,
_nextMapping: nextMapping + 1,
options+: { fieldOptions+: { defaults+: { mappings+: [mapping { id: nextMapping }] } } },
},
// data links
addDataLink(link):: self {
options+: { fieldOptions+: { defaults+: { links+: [link] } } },
},
}
) + {
addThresholds(steps):: std.foldl(function(p, s) p.addThreshold(s), steps, self),
addMappings(mappings):: std.foldl(function(p, m) p.addMapping(m), mappings, self),
addDataLinks(links):: std.foldl(function(p, l) p.addDataLink(l), links, self),
},
}

View File

@ -23,5 +23,7 @@
heatmapPanel:: import 'heatmap_panel.libsonnet',
dashlist:: import 'dashlist.libsonnet',
pluginlist:: import 'pluginlist.libsonnet',
gauge:: import 'gauge.libsonnet',
gauge:: error 'gauge is removed, migrate to gaugePanel',
gaugePanel:: import 'gauge_panel.libsonnet',
statPanel:: import 'stat_panel.libsonnet',
}

View File

@ -0,0 +1,200 @@
{
/**
* Creates a [stat panel](https://grafana.com/docs/grafana/latest/panels/visualizations/stat-panel/).
*
* @name statPanel.new
*
* @param title Panel title.
* @param description Panel description.
* @param transparent Whether to display the panel without a background.
* @param datasource Panel datasource.
* @param allValues Show all values instead of reducing to one.
* @param valueLimit Limit of values in all values mode.
* @param reducerFunction Function to use to reduce values to when using single value.
* @param fields Fields that should be included in the panel.
* @param orientation Stacking direction in case of multiple series or fields.
* @param colorMode 'value' or 'background'.
* @param graphMode 'none' or 'area' to enable sparkline mode.
* @param justifyMode 'auto' or 'center'.
* @param unit Panel unit field option.
* @param min Leave empty to calculate based on all values.
* @param max Leave empty to calculate based on all values.
* @param decimals Number of decimal places to show.
* @param displayName Change the field or series name.
* @param noValue What to show when there is no value.
* @param thresholdsMode 'absolute' or 'percentage'.
* @param repeat Name of variable that should be used to repeat this panel.
* @param repeatDirection 'h' for horizontal or 'v' for vertical.
* @param repeatMaxPerRow Maximum panels per row in repeat mode.
* @param pluginVersion Plugin version the panel should be modeled for. This has been tested with the default, '7', and '6.7'.
*
* @method addTarget(target) Adds a target object.
* @method addTargets(targets) Adds an array of targets.
* @method addLink(link) Adds a link. Argument format: `{ title: 'Link Title', url: 'https://...', targetBlank: true }`.
* @method addLinks(links) Adds an array of links.
* @method addThreshold(step) Adds a threshold step. Argument format: `{ color: 'green', value: 0 }`.
* @method addThresholds(steps) Adds an array of threshold steps.
* @method addMapping(mapping) Adds a value mapping.
* @method addMappings(mappings) Adds an array of value mappings.
* @method addDataLink(link) Adds a data link.
* @method addDataLinks(links) Adds an array of data links.
*/
new(
title,
description=null,
transparent=false,
datasource=null,
allValues=false,
valueLimit=null,
reducerFunction='mean',
fields='',
orientation='auto',
colorMode='value',
graphMode='area',
justifyMode='auto',
unit='none',
min=null,
max=null,
decimals=null,
displayName=null,
noValue=null,
thresholdsMode='absolute',
repeat=null,
repeatDirection='h',
repeatMaxPerRow=null,
pluginVersion='7',
):: {
type: 'stat',
title: title,
[if description != null then 'description']: description,
transparent: transparent,
datasource: datasource,
targets: [],
links: [],
[if repeat != null then 'repeat']: repeat,
[if repeat != null then 'repeatDirection']: repeatDirection,
[if repeat != null then 'repeatMaxPerRow']: repeatMaxPerRow,
// targets
_nextTarget:: 0,
addTarget(target):: self {
local nextTarget = super._nextTarget,
_nextTarget: nextTarget + 1,
targets+: [target { refId: std.char(std.codepoint('A') + nextTarget) }],
},
addTargets(targets):: std.foldl(function(p, t) p.addTarget(t), targets, self),
// links
addLink(link):: self {
links+: [link],
},
addLinks(links):: std.foldl(function(p, l) p.addLink(l), links, self),
pluginVersion: pluginVersion,
} + (
if pluginVersion >= '7' then {
options: {
reduceOptions: {
values: allValues,
[if allValues && valueLimit != null then 'limit']: valueLimit,
calcs: [
reducerFunction,
],
fields: fields,
},
orientation: orientation,
colorMode: colorMode,
graphMode: graphMode,
justifyMode: justifyMode,
},
fieldConfig: {
defaults: {
unit: unit,
[if min != null then 'min']: min,
[if max != null then 'max']: max,
[if decimals != null then 'decimals']: decimals,
[if displayName != null then 'displayName']: displayName,
[if noValue != null then 'noValue']: noValue,
thresholds: {
mode: thresholdsMode,
steps: [],
},
mappings: [],
links: [],
},
},
// thresholds
addThreshold(step):: self {
fieldConfig+: { defaults+: { thresholds+: { steps+: [step] } } },
},
// mappings
_nextMapping:: 0,
addMapping(mapping):: self {
local nextMapping = super._nextMapping,
_nextMapping: nextMapping + 1,
fieldConfig+: { defaults+: { mappings+: [mapping { id: nextMapping }] } },
},
// data links
addDataLink(link):: self {
fieldConfig+: { defaults+: { links+: [link] } },
},
} else {
options: {
fieldOptions: {
values: allValues,
[if allValues && valueLimit != null then 'limit']: valueLimit,
calcs: [
reducerFunction,
],
fields: fields,
defaults: {
unit: unit,
[if min != null then 'min']: min,
[if max != null then 'max']: max,
[if decimals != null then 'decimals']: decimals,
[if displayName != null then 'displayName']: displayName,
[if noValue != null then 'noValue']: noValue,
thresholds: {
mode: thresholdsMode,
steps: [],
},
mappings: [],
links: [],
},
},
orientation: orientation,
colorMode: colorMode,
graphMode: graphMode,
justifyMode: justifyMode,
},
// thresholds
addThreshold(step):: self {
options+: { fieldOptions+: { defaults+: { thresholds+: { steps+: [step] } } } },
},
// mappings
_nextMapping:: 0,
addMapping(mapping):: self {
local nextMapping = super._nextMapping,
_nextMapping: nextMapping + 1,
options+: { fieldOptions+: { defaults+: { mappings+: [mapping { id: nextMapping }] } } },
},
// data links
addDataLink(link):: self {
options+: { fieldOptions+: { defaults+: { links+: [link] } } },
},
}
) + {
addThresholds(steps):: std.foldl(function(p, s) p.addThreshold(s), steps, self),
addMappings(mappings):: std.foldl(function(p, m) p.addMapping(m), mappings, self),
addDataLinks(links):: std.foldl(function(p, l) p.addDataLink(l), links, self),
},
}

View File

@ -250,3 +250,8 @@ While the community has not yet fully agreed on alert severities and their to be
* For more motivation, see
"[The RED Method: How to instrument your services](https://kccncna17.sched.com/event/CU8K/the-red-method-how-to-instrument-your-services-b-tom-wilkie-kausal?iframe=no&w=100%&sidebar=yes&bg=no)" talk from CloudNativeCon Austin.
* For more information about monitoring mixins, see this [design doc](https://docs.google.com/document/d/1A9xvzwqnFVSOZ5fD3blKODXfsat5fg6ZhnKu9LK3lB4/edit#).
## Note
You can use the external tool [prom-metrics-check](https://github.com/ContainerSolutions/prom-metrics-check) to validate the created dashboards. This tool allows you to check whether the metrics used in Grafana dashboards exist in the Prometheus instance.
Please have a look at https://github.com/ContainerSolutions/prom-metrics-check.

View File

@ -150,15 +150,35 @@
{
alert: 'KubeDaemonSetRolloutStuck',
expr: |||
kube_daemonset_status_number_ready{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}
/
kube_daemonset_status_desired_number_scheduled{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s} < 1.00
(
(
kube_daemonset_status_current_number_scheduled{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}
!=
kube_daemonset_status_desired_number_scheduled{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}
) or (
kube_daemonset_status_number_misscheduled{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}
!=
0
) or (
kube_daemonset_updated_number_scheduled{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}
!=
kube_daemonset_status_desired_number_scheduled{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}
) or (
kube_daemonset_status_number_available{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}
!=
kube_daemonset_status_desired_number_scheduled{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}
)
) and (
changes(kube_daemonset_updated_number_scheduled{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}[5m])
==
0
)
||| % $._config,
labels: {
severity: 'warning',
},
annotations: {
message: 'Only {{ $value | humanizePercentage }} of the desired Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are scheduled and ready.',
message: 'DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not finished or progressed for at least 15 minutes.',
},
'for': '15m',
},
@ -208,12 +228,12 @@
expr: |||
kube_job_spec_completions{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s} - kube_job_status_succeeded{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s} > 0
||| % $._config,
'for': '1h',
'for': '12h',
labels: {
severity: 'warning',
},
annotations: {
message: 'Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than one hour to complete.',
message: 'Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than 12 hours to complete.',
},
},
{

View File

@ -84,14 +84,14 @@ local utils = import 'utils.libsonnet';
{
alert: 'AggregatedAPIDown',
expr: |||
sum by(name, namespace)(sum_over_time(aggregator_unavailable_apiservice[5m])) > 0
(1 - max by(name, namespace)(avg_over_time(aggregator_unavailable_apiservice[5m]))) * 100 < 90
||| % $._config,
'for': '5m',
labels: {
severity: 'warning',
},
annotations: {
message: 'An aggregated API {{ $labels.name }}/{{ $labels.namespace }} is down. It has not been available at least for the past five minutes.',
message: 'An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}% available over the last 5m.',
},
},
(import '../lib/absent_alert.libsonnet') {

View File

@ -2,6 +2,11 @@
_config+:: {
kubeStateMetricsSelector: error 'must provide selector for kube-state-metrics',
kubeletSelector: error 'must provide selector for kubelet',
kubeNodeUnreachableIgnoreKeys: [
'ToBeDeletedByClusterAutoscaler',
'cloud.google.com/impending-node-termination',
'aws-node-termination-handler/spot-itn',
],
},
prometheusAlerts+:: {
@ -24,8 +29,10 @@
},
{
expr: |||
(kube_node_spec_taint{%(kubeStateMetricsSelector)s,key="node.kubernetes.io/unreachable",effect="NoSchedule"} unless ignoring(key,value) kube_node_spec_taint{%(kubeStateMetricsSelector)s,key="ToBeDeletedByClusterAutoscaler"}) == 1
||| % $._config,
(kube_node_spec_taint{%(kubeStateMetricsSelector)s,key="node.kubernetes.io/unreachable",effect="NoSchedule"} unless ignoring(key,value) kube_node_spec_taint{%(kubeStateMetricsSelector)s,key=~"%(kubeNodeUnreachableIgnoreKeys)s"}) == 1
||| % $._config {
kubeNodeUnreachableIgnoreKeys: std.join('|', super.kubeNodeUnreachableIgnoreKeys),
},
labels: {
severity: 'warning',
},
@ -39,7 +46,13 @@
// Some node has a capacity of 1 like AWS's Fargate and only exists while a pod is running on it.
// We have to ignore this special node in the KubeletTooManyPods alert.
expr: |||
max(max(kubelet_running_pod_count{%(kubeletSelector)s}) by(instance) * on(instance) group_left(node) kubelet_node_name{%(kubeletSelector)s}) by(node) / max(kube_node_status_capacity_pods{%(kubeStateMetricsSelector)s} != 1) by(node) > 0.95
count by(node) (
(kube_pod_status_phase{%(kubeStateMetricsSelector)s,phase="Running"} == 1) * on(instance,pod,namespace,cluster) group_left(node) topk by(instance,pod,namespace,cluster) (1, kube_pod_info{%(kubeStateMetricsSelector)s})
)
/
max by(node) (
kube_node_status_capacity_pods{%(kubeStateMetricsSelector)s} != 1
) > 0.95
||| % $._config,
'for': '15m',
labels: {

View File

@ -116,7 +116,7 @@
||| % $._config,
'for': '15m',
labels: {
severity: 'warning',
severity: 'info',
},
annotations: {
message: '{{ $value | humanizePercentage }} throttling of CPU in namespace {{ $labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod }}.',

View File

@ -11,7 +11,7 @@
{
alert: 'KubeVersionMismatch',
expr: |||
count(count by (gitVersion) (label_replace(kubernetes_build_info{%(notKubeDnsCoreDnsSelector)s},"gitVersion","$1","gitVersion","(v[0-9]*.[0-9]*.[0-9]*).*"))) > 1
count(count by (gitVersion) (label_replace(kubernetes_build_info{%(notKubeDnsCoreDnsSelector)s},"gitVersion","$1","gitVersion","(v[0-9]*.[0-9]*).*"))) > 1
||| % $._config,
'for': '15m',
labels: {

View File

@ -22,7 +22,7 @@ local singlestat = grafana.singlestat;
decimals=3,
description='How many percent of requests (both read and write) in %d days have been answered successfully and fast enough?' % $._config.SLOs.apiserver.days,
)
.addTarget(prometheus.target('apiserver_request:availability%dd{verb="all"}' % $._config.SLOs.apiserver.days));
.addTarget(prometheus.target('apiserver_request:availability%dd{verb="all", %(clusterLabel)s="$cluster"}' % [$._config.SLOs.apiserver.days, $._config.clusterLabel]));
local errorBudget =
graphPanel.new(
@ -34,7 +34,7 @@ local singlestat = grafana.singlestat;
fill=10,
description='How much error budget is left looking at our %.3f%% availability gurantees?' % $._config.SLOs.apiserver.target,
)
.addTarget(prometheus.target('100 * (apiserver_request:availability%dd{verb="all"} - %f)' % [$._config.SLOs.apiserver.days, $._config.SLOs.apiserver.target], legendFormat='errorbudget'));
.addTarget(prometheus.target('100 * (apiserver_request:availability%dd{verb="all", %(clusterLabel)s="$cluster"} - %f)' % [$._config.SLOs.apiserver.days, $._config.clusterLabel, $._config.SLOs.apiserver.target], legendFormat='errorbudget'));
local readAvailability =
singlestat.new(
@ -45,7 +45,7 @@ local singlestat = grafana.singlestat;
decimals=3,
description='How many percent of read requests (LIST,GET) in %d days have been answered successfully and fast enough?' % $._config.SLOs.apiserver.days,
)
.addTarget(prometheus.target('apiserver_request:availability%dd{verb="read"}' % $._config.SLOs.apiserver.days));
.addTarget(prometheus.target('apiserver_request:availability%dd{verb="read", %(clusterLabel)s="$cluster"}' % [$._config.SLOs.apiserver.days, $._config.clusterLabel]));
local readRequests =
graphPanel.new(
@ -61,7 +61,7 @@ local singlestat = grafana.singlestat;
.addSeriesOverride({ alias: '/3../i', color: '#F2CC0C' })
.addSeriesOverride({ alias: '/4../i', color: '#3274D9' })
.addSeriesOverride({ alias: '/5../i', color: '#E02F44' })
.addTarget(prometheus.target('sum by (code) (code_resource:apiserver_request_total:rate5m{verb="read"})', legendFormat='{{ code }}'));
.addTarget(prometheus.target('sum by (code) (code_resource:apiserver_request_total:rate5m{verb="read", %(clusterLabel)s="$cluster"})' % $._config, legendFormat='{{ code }}'));
local readErrors =
graphPanel.new(
@ -72,7 +72,7 @@ local singlestat = grafana.singlestat;
format='percentunit',
description='How many percent of read requests (LIST,GET) per second are returned with errors (5xx)?',
)
.addTarget(prometheus.target('sum by (resource) (code_resource:apiserver_request_total:rate5m{verb="read",code=~"5.."}) / sum by (resource) (code_resource:apiserver_request_total:rate5m{verb="read"})', legendFormat='{{ resource }}'));
.addTarget(prometheus.target('sum by (resource) (code_resource:apiserver_request_total:rate5m{verb="read",code=~"5..", %(clusterLabel)s="$cluster"}) / sum by (resource) (code_resource:apiserver_request_total:rate5m{verb="read", %(clusterLabel)s="$cluster"})' % $._config, legendFormat='{{ resource }}'));
local readDuration =
graphPanel.new(
@ -82,7 +82,7 @@ local singlestat = grafana.singlestat;
format='s',
description='How many seconds is the 99th percentile for reading (LIST|GET) a given resource?',
)
.addTarget(prometheus.target('cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{verb="read"}', legendFormat='{{ resource }}'));
.addTarget(prometheus.target('cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{verb="read", %(clusterLabel)s="$cluster"}' % $._config, legendFormat='{{ resource }}'));
local writeAvailability =
singlestat.new(
@ -93,7 +93,7 @@ local singlestat = grafana.singlestat;
decimals=3,
description='How many percent of write requests (POST|PUT|PATCH|DELETE) in %d days have been answered successfully and fast enough?' % $._config.SLOs.apiserver.days,
)
.addTarget(prometheus.target('apiserver_request:availability%dd{verb="write"}' % $._config.SLOs.apiserver.days));
.addTarget(prometheus.target('apiserver_request:availability%dd{verb="write", %(clusterLabel)s="$cluster"}' % [$._config.SLOs.apiserver.days, $._config.clusterLabel]));
local writeRequests =
graphPanel.new(
@ -109,7 +109,7 @@ local singlestat = grafana.singlestat;
.addSeriesOverride({ alias: '/3../i', color: '#F2CC0C' })
.addSeriesOverride({ alias: '/4../i', color: '#3274D9' })
.addSeriesOverride({ alias: '/5../i', color: '#E02F44' })
.addTarget(prometheus.target('sum by (code) (code_resource:apiserver_request_total:rate5m{verb="write"})', legendFormat='{{ code }}'));
.addTarget(prometheus.target('sum by (code) (code_resource:apiserver_request_total:rate5m{verb="write", %(clusterLabel)s="$cluster"})' % $._config, legendFormat='{{ code }}'));
local writeErrors =
graphPanel.new(
@ -120,7 +120,7 @@ local singlestat = grafana.singlestat;
format='percentunit',
description='How many percent of write requests (POST|PUT|PATCH|DELETE) per second are returned with errors (5xx)?',
)
.addTarget(prometheus.target('sum by (resource) (code_resource:apiserver_request_total:rate5m{verb="write",code=~"5.."}) / sum by (resource) (code_resource:apiserver_request_total:rate5m{verb="write"})', legendFormat='{{ resource }}'));
.addTarget(prometheus.target('sum by (resource) (code_resource:apiserver_request_total:rate5m{verb="write",code=~"5..", %(clusterLabel)s="$cluster"}) / sum by (resource) (code_resource:apiserver_request_total:rate5m{verb="write", %(clusterLabel)s="$cluster"})' % $._config, legendFormat='{{ resource }}'));
local writeDuration =
graphPanel.new(
@ -130,13 +130,13 @@ local singlestat = grafana.singlestat;
format='s',
description='How many seconds is the 99th percentile for writing (POST|PUT|PATCH|DELETE) a given resource?',
)
.addTarget(prometheus.target('cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{verb="write"}', legendFormat='{{ resource }}'));
.addTarget(prometheus.target('cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{verb="write", %(clusterLabel)s="$cluster"}' % $._config, legendFormat='{{ resource }}'));
local workQueueAddRate =
graphPanel.new(
'Work Queue Add Rate',
datasource='$datasource',
span=4,
span=6,
format='ops',
legend_show=false,
min=0,
@ -147,7 +147,7 @@ local singlestat = grafana.singlestat;
graphPanel.new(
'Work Queue Depth',
datasource='$datasource',
span=4,
span=6,
format='short',
legend_show=false,
min=0,
@ -159,7 +159,7 @@ local singlestat = grafana.singlestat;
graphPanel.new(
'Work Queue Latency',
datasource='$datasource',
span=4,
span=12,
format='s',
legend_show=true,
legend_values=true,
@ -169,38 +169,6 @@ local singlestat = grafana.singlestat;
)
.addTarget(prometheus.target('histogram_quantile(0.99, sum(rate(workqueue_queue_duration_seconds_bucket{%(kubeApiserverSelector)s, instance=~"$instance", %(clusterLabel)s="$cluster"}[5m])) by (instance, name, le))' % $._config, legendFormat='{{instance}} {{name}}'));
local etcdCacheEntryTotal =
graphPanel.new(
'ETCD Cache Entry Total',
datasource='$datasource',
span=4,
format='short',
min=0,
)
.addTarget(prometheus.target('etcd_helper_cache_entry_total{%(kubeApiserverSelector)s, instance=~"$instance", %(clusterLabel)s="$cluster"}' % $._config, legendFormat='{{instance}}'));
local etcdCacheEntryRate =
graphPanel.new(
'ETCD Cache Hit/Miss Rate',
datasource='$datasource',
span=4,
format='ops',
min=0,
)
.addTarget(prometheus.target('sum(rate(etcd_helper_cache_hit_total{%(kubeApiserverSelector)s,instance=~"$instance", %(clusterLabel)s="$cluster"}[5m])) by (instance)' % $._config, legendFormat='{{instance}} hit'))
.addTarget(prometheus.target('sum(rate(etcd_helper_cache_miss_total{%(kubeApiserverSelector)s,instance=~"$instance", %(clusterLabel)s="$cluster"}[5m])) by (instance)' % $._config, legendFormat='{{instance}} miss'));
local etcdCacheLatency =
graphPanel.new(
'ETCD Cache Duration 99th Quantile',
datasource='$datasource',
span=4,
format='s',
min=0,
)
.addTarget(prometheus.target('histogram_quantile(0.99,sum(rate(etcd_request_cache_get_duration_seconds_bucket{%(kubeApiserverSelector)s,instance=~"$instance", %(clusterLabel)s="$cluster"}[5m])) by (instance, le))' % $._config, legendFormat='{{instance}} get'))
.addTarget(prometheus.target('histogram_quantile(0.99,sum(rate(etcd_request_cache_add_duration_seconds_bucket{%(kubeApiserverSelector)s,instance=~"$instance", %(clusterLabel)s="$cluster"}[5m])) by (instance, le))' % $._config, legendFormat='{{instance}} miss'));
local memory =
graphPanel.new(
'Memory',
@ -252,14 +220,13 @@ local singlestat = grafana.singlestat;
)
.addTemplate(
template.new(
name='cluster',
datasource='$datasource',
query='label_values(apiserver_request_total, %(clusterLabel)s)' % $._config,
current='prod',
'cluster',
'$datasource',
'label_values(apiserver_request_total, %(clusterLabel)s)' % $._config,
label='cluster',
refresh='time',
hide=if $._config.showMultiCluster then '' else 'variable',
refresh=1,
includeAll=false,
sort=1
sort=1,
)
)
.addTemplate(
@ -309,11 +276,6 @@ local singlestat = grafana.singlestat;
.addPanel(workQueueAddRate)
.addPanel(workQueueDepth)
.addPanel(workQueueLatency)
).addRow(
row.new()
.addPanel(etcdCacheEntryTotal)
.addPanel(etcdCacheEntryRate)
.addPanel(etcdCacheLatency)
).addRow(
row.new()
.addPanel(memory)

View File

@ -81,7 +81,7 @@ local singlestat = grafana.singlestat;
format='s',
min=0,
)
.addTarget(prometheus.target('histogram_quantile(0.99, sum(rate(rest_client_request_latency_seconds_bucket{%(kubeControllerManagerSelector)s, instance=~"$instance", verb="POST"}[5m])) by (verb, url, le))' % $._config, legendFormat='{{verb}} {{url}}'));
.addTarget(prometheus.target('histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{%(kubeControllerManagerSelector)s, instance=~"$instance", verb="POST"}[5m])) by (verb, url, le))' % $._config, legendFormat='{{verb}} {{url}}'));
local getRequestLatency =
graphPanel.new(
@ -96,7 +96,7 @@ local singlestat = grafana.singlestat;
legend_alignAsTable=true,
legend_rightSide=true,
)
.addTarget(prometheus.target('histogram_quantile(0.99, sum(rate(rest_client_request_latency_seconds_bucket{%(kubeControllerManagerSelector)s, instance=~"$instance", verb="GET"}[5m])) by (verb, url, le))' % $._config, legendFormat='{{verb}} {{url}}'));
.addTarget(prometheus.target('histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{%(kubeControllerManagerSelector)s, instance=~"$instance", verb="GET"}[5m])) by (verb, url, le))' % $._config, legendFormat='{{verb}} {{url}}'));
local memory =
graphPanel.new(

View File

@ -292,7 +292,7 @@ local singlestat = grafana.singlestat;
legend_alignAsTable=true,
legend_rightSide=true,
)
.addTarget(prometheus.target('histogram_quantile(0.99, sum(rate(rest_client_request_latency_seconds_bucket{%(clusterLabel)s="$cluster",%(kubeletSelector)s, instance=~"$instance"}[5m])) by (instance, verb, url, le))' % $._config, legendFormat='{{instance}} {{verb}} {{url}}'));
.addTarget(prometheus.target('histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{%(clusterLabel)s="$cluster",%(kubeletSelector)s, instance=~"$instance"}[5m])) by (instance, verb, url, le))' % $._config, legendFormat='{{instance}} {{verb}} {{url}}'));
local memory =
graphPanel.new(

View File

@ -253,7 +253,7 @@ local singlestat = grafana.singlestat;
template.new(
name='type',
datasource='$datasource',
query='label_values(mixin_pod_workload{namespace=~"$namespace", workload=~".+"}, workload_type)',
query='label_values(namespace_workload_pod:kube_pod_owner:relabel{namespace=~"$namespace", workload=~".+"}, workload_type)',
current='deployment',
hide='',
refresh=1,
@ -263,7 +263,7 @@ local singlestat = grafana.singlestat;
auto: false,
auto_count: 30,
auto_min: '10s',
definition: 'label_values(mixin_pod_workload{namespace=~"$namespace", workload=~".+"}, workload_type)',
definition: 'label_values(namespace_workload_pod:kube_pod_owner:relabel{namespace=~"$namespace", workload=~".+"}, workload_type)',
skipUrlSync: false,
};
@ -402,7 +402,7 @@ local singlestat = grafana.singlestat;
graphQuery=|||
sort_desc(sum(irate(container_network_receive_bytes_total{namespace=~"$namespace"}[$interval:$resolution])
* on (namespace,pod)
group_left(workload,workload_type) mixin_pod_workload{namespace=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
|||,
legendFormat='{{ workload }}',
),
@ -414,7 +414,7 @@ local singlestat = grafana.singlestat;
graphQuery=|||
sort_desc(sum(irate(container_network_transmit_bytes_total{namespace=~"$namespace"}[$interval:$resolution])
* on (namespace,pod)
group_left(workload,workload_type) mixin_pod_workload{namespace=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
|||,
legendFormat='{{ workload }}',
),
@ -427,42 +427,42 @@ local singlestat = grafana.singlestat;
|||
sort_desc(sum(irate(container_network_receive_bytes_total{namespace=~"$namespace"}[$interval:$resolution])
* on (namespace,pod)
group_left(workload,workload_type) mixin_pod_workload{namespace=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
|||,
|||
sort_desc(sum(irate(container_network_transmit_bytes_total{namespace=~"$namespace"}[$interval:$resolution])
* on (namespace,pod)
group_left(workload,workload_type) mixin_pod_workload{namespace=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
|||,
|||
sort_desc(avg(irate(container_network_receive_bytes_total{namespace=~"$namespace"}[$interval:$resolution])
* on (namespace,pod)
group_left(workload,workload_type) mixin_pod_workload{namespace=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
|||,
|||
sort_desc(avg(irate(container_network_transmit_bytes_total{namespace=~"$namespace"}[$interval:$resolution])
* on (namespace,pod)
group_left(workload,workload_type) mixin_pod_workload{namespace=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
|||,
|||
sort_desc(sum(irate(container_network_receive_packets_total{namespace=~"$namespace"}[$interval:$resolution])
* on (namespace,pod)
group_left(workload,workload_type) mixin_pod_workload{namespace=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
|||,
|||
sort_desc(sum(irate(container_network_transmit_packets_total{namespace=~"$namespace"}[$interval:$resolution])
* on (namespace,pod)
group_left(workload,workload_type) mixin_pod_workload{namespace=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
|||,
|||
sort_desc(sum(irate(container_network_receive_packets_dropped_total{namespace=~"$namespace"}[$interval:$resolution])
* on (namespace,pod)
group_left(workload,workload_type) mixin_pod_workload{namespace=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
|||,
|||
sort_desc(sum(irate(container_network_transmit_packets_dropped_total{namespace=~"$namespace"}[$interval:$resolution])
* on (namespace,pod)
group_left(workload,workload_type) mixin_pod_workload{namespace=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
|||,
]
),
@ -476,7 +476,7 @@ local singlestat = grafana.singlestat;
graphQuery=|||
sort_desc(avg(irate(container_network_receive_bytes_total{namespace=~"$namespace"}[$interval:$resolution])
* on (namespace,pod)
group_left(workload,workload_type) mixin_pod_workload{namespace=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
|||,
legendFormat='{{ workload }}',
),
@ -488,7 +488,7 @@ local singlestat = grafana.singlestat;
graphQuery=|||
sort_desc(avg(irate(container_network_transmit_bytes_total{namespace=~"$namespace"}[$interval:$resolution])
* on (namespace,pod)
group_left(workload,workload_type) mixin_pod_workload{namespace=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
|||,
legendFormat='{{ workload }}',
),
@ -505,7 +505,7 @@ local singlestat = grafana.singlestat;
graphQuery=|||
sort_desc(sum(irate(container_network_receive_bytes_total{namespace=~"$namespace"}[$interval:$resolution])
* on (namespace,pod)
group_left(workload,workload_type) mixin_pod_workload{namespace=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
|||,
),
gridPos={ h: 9, w: 12, x: 0, y: 38 }
@ -516,7 +516,7 @@ local singlestat = grafana.singlestat;
graphQuery=|||
sort_desc(sum(irate(container_network_transmit_bytes_total{namespace=~"$namespace"}[$interval:$resolution])
* on (namespace,pod)
group_left(workload,workload_type) mixin_pod_workload{namespace=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
|||,
),
gridPos={ h: 9, w: 12, x: 12, y: 38 }
@ -529,7 +529,7 @@ local singlestat = grafana.singlestat;
graphQuery=|||
sort_desc(sum(irate(container_network_receive_packets_total{namespace=~"$namespace"}[$interval:$resolution])
* on (namespace,pod)
group_left(workload,workload_type) mixin_pod_workload{namespace=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
|||,
graphFormat='pps'
),
@ -541,7 +541,7 @@ local singlestat = grafana.singlestat;
graphQuery=|||
sort_desc(sum(irate(container_network_transmit_packets_total{namespace=~"$namespace"}[$interval:$resolution])
* on (namespace,pod)
group_left(workload,workload_type) mixin_pod_workload{namespace=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
|||,
graphFormat='pps'
),
@ -557,7 +557,7 @@ local singlestat = grafana.singlestat;
graphQuery=|||
sort_desc(sum(irate(container_network_receive_packets_dropped_total{namespace=~"$namespace"}[$interval:$resolution])
* on (namespace,pod)
group_left(workload,workload_type) mixin_pod_workload{namespace=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
|||,
graphFormat='pps'
),
@ -569,7 +569,7 @@ local singlestat = grafana.singlestat;
graphQuery=|||
sort_desc(sum(irate(container_network_transmit_packets_dropped_total{namespace=~"$namespace"}[$interval:$resolution])
* on (namespace,pod)
group_left(workload,workload_type) mixin_pod_workload{namespace=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
|||,
graphFormat='pps'
),

View File

@ -119,7 +119,7 @@ local singlestat = grafana.singlestat;
template.new(
name='workload',
datasource='$datasource',
query='label_values(mixin_pod_workload{namespace=~"$namespace"}, workload)',
query='label_values(namespace_workload_pod:kube_pod_owner:relabel{namespace=~"$namespace"}, workload)',
current='',
hide='',
refresh=1,
@ -129,7 +129,7 @@ local singlestat = grafana.singlestat;
auto: false,
auto_count: 30,
auto_min: '10s',
definition: 'label_values(mixin_pod_workload{namespace=~"$namespace"}, workload)',
definition: 'label_values(namespace_workload_pod:kube_pod_owner:relabel{namespace=~"$namespace"}, workload)',
skipUrlSync: false,
};
@ -137,7 +137,7 @@ local singlestat = grafana.singlestat;
template.new(
name='type',
datasource='$datasource',
query='label_values(mixin_pod_workload{namespace=~"$namespace", workload=~"$workload"}, workload_type)',
query='label_values(namespace_workload_pod:kube_pod_owner:relabel{namespace=~"$namespace", workload=~"$workload"}, workload_type)',
current='deployment',
hide='',
refresh=1,
@ -147,7 +147,7 @@ local singlestat = grafana.singlestat;
auto: false,
auto_count: 30,
auto_min: '10s',
definition: 'label_values(mixin_pod_workload{namespace=~"$namespace", workload=~"$workload"}, workload_type)',
definition: 'label_values(namespace_workload_pod:kube_pod_owner:relabel{namespace=~"$namespace", workload=~"$workload"}, workload_type)',
skipUrlSync: false,
};
@ -287,7 +287,7 @@ local singlestat = grafana.singlestat;
graphQuery=|||
sort_desc(sum(irate(container_network_receive_bytes_total{namespace=~"$namespace"}[$interval:$resolution])
* on (namespace,pod)
group_left(workload,workload_type) mixin_pod_workload{namespace=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
|||,
legendFormat='{{ pod }}',
),
@ -299,7 +299,7 @@ local singlestat = grafana.singlestat;
graphQuery=|||
sort_desc(sum(irate(container_network_transmit_bytes_total{namespace=~"$namespace"}[$interval:$resolution])
* on (namespace,pod)
group_left(workload,workload_type) mixin_pod_workload{namespace=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
|||,
legendFormat='{{ pod }}',
),
@ -313,7 +313,7 @@ local singlestat = grafana.singlestat;
graphQuery=|||
sort_desc(avg(irate(container_network_receive_bytes_total{namespace=~"$namespace"}[$interval:$resolution])
* on (namespace,pod)
group_left(workload,workload_type) mixin_pod_workload{namespace=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
|||,
legendFormat='{{ pod }}',
),
@ -325,7 +325,7 @@ local singlestat = grafana.singlestat;
graphQuery=|||
sort_desc(avg(irate(container_network_transmit_bytes_total{namespace=~"$namespace"}[$interval:$resolution])
* on (namespace,pod)
group_left(workload,workload_type) mixin_pod_workload{namespace=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
|||,
legendFormat='{{ pod }}',
),
@ -342,7 +342,7 @@ local singlestat = grafana.singlestat;
graphQuery=|||
sort_desc(sum(irate(container_network_receive_bytes_total{namespace=~"$namespace"}[$interval:$resolution])
* on (namespace,pod)
group_left(workload,workload_type) mixin_pod_workload{namespace=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
|||,
),
gridPos={ h: 9, w: 12, x: 0, y: 12 }
@ -353,7 +353,7 @@ local singlestat = grafana.singlestat;
graphQuery=|||
sort_desc(sum(irate(container_network_transmit_bytes_total{namespace=~"$namespace"}[$interval:$resolution])
* on (namespace,pod)
group_left(workload,workload_type) mixin_pod_workload{namespace=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
|||,
),
gridPos={ h: 9, w: 12, x: 12, y: 12 }
@ -366,7 +366,7 @@ local singlestat = grafana.singlestat;
graphQuery=|||
sort_desc(sum(irate(container_network_receive_packets_total{namespace=~"$namespace"}[$interval:$resolution])
* on (namespace,pod)
group_left(workload,workload_type) mixin_pod_workload{namespace=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
|||,
graphFormat='pps'
),
@ -378,7 +378,7 @@ local singlestat = grafana.singlestat;
graphQuery=|||
sort_desc(sum(irate(container_network_transmit_packets_total{namespace=~"$namespace"}[$interval:$resolution])
* on (namespace,pod)
group_left(workload,workload_type) mixin_pod_workload{namespace=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
|||,
graphFormat='pps'
),
@ -394,7 +394,7 @@ local singlestat = grafana.singlestat;
graphQuery=|||
sort_desc(sum(irate(container_network_receive_packets_dropped_total{namespace=~"$namespace"}[$interval:$resolution])
* on (namespace,pod)
group_left(workload,workload_type) mixin_pod_workload{namespace=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
|||,
graphFormat='pps'
),
@ -406,7 +406,7 @@ local singlestat = grafana.singlestat;
graphQuery=|||
sort_desc(sum(irate(container_network_transmit_packets_dropped_total{namespace=~"$namespace"}[$interval:$resolution])
* on (namespace,pod)
group_left(workload,workload_type) mixin_pod_workload{namespace=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{namespace=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
|||,
graphFormat='pps'
),

View File

@ -88,7 +88,7 @@ local singlestat = grafana.singlestat;
format='s',
min=0,
)
.addTarget(prometheus.target('histogram_quantile(0.99, sum(rate(rest_client_request_latency_seconds_bucket{%(kubeProxySelector)s,instance=~"$instance",verb="POST"}[5m])) by (verb, url, le))' % $._config, legendFormat='{{verb}} {{url}}'));
.addTarget(prometheus.target('histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{%(kubeProxySelector)s,instance=~"$instance",verb="POST"}[5m])) by (verb, url, le))' % $._config, legendFormat='{{verb}} {{url}}'));
local getRequestLatency =
graphPanel.new(
@ -103,7 +103,7 @@ local singlestat = grafana.singlestat;
legend_alignAsTable=true,
legend_rightSide=true,
)
.addTarget(prometheus.target('histogram_quantile(0.99, sum(rate(rest_client_request_latency_seconds_bucket{%(kubeProxySelector)s, instance=~"$instance", verb="GET"}[5m])) by (verb, url, le))' % $._config, legendFormat='{{verb}} {{url}}'));
.addTarget(prometheus.target('histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{%(kubeProxySelector)s, instance=~"$instance", verb="GET"}[5m])) by (verb, url, le))' % $._config, legendFormat='{{verb}} {{url}}'));
local memory =
graphPanel.new(

View File

@ -40,7 +40,7 @@ local template = grafana.template;
local podWorkloadColumns = [
'sum(kube_pod_owner{%(clusterLabel)s="$cluster"}) by (namespace)' % $._config,
'count(avg(mixin_pod_workload{%(clusterLabel)s="$cluster"}) by (workload, namespace)) by (namespace)' % $._config,
'count(avg(namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster"}) by (workload, namespace)) by (namespace)' % $._config,
];
local networkColumns = [

View File

@ -78,7 +78,7 @@ local template = grafana.template;
local cpuUsageQuery = 'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{%(clusterLabel)s="$cluster", namespace="$namespace"}) by (pod)' % $._config;
local memoryUsageQuery = 'sum(container_memory_working_set_bytes{%(clusterLabel)s="$cluster", namespace="$namespace", container!=""}) by (pod)' % $._config;
local memoryUsageQuery = 'sum(container_memory_working_set_bytes{%(clusterLabel)s="$cluster", namespace="$namespace", container!="", image!=""}) by (pod)' % $._config;
local cpuQuotaRequestsQuery = 'scalar(kube_resourcequota{%(clusterLabel)s="$cluster", namespace="$namespace", type="hard",resource="requests.cpu"})' % $._config;
local cpuQuotaLimitsQuery = std.strReplace(cpuQuotaRequestsQuery, 'requests.cpu', 'limits.cpu');
@ -105,11 +105,11 @@ local template = grafana.template;
)
.addPanel(
g.panel('Memory Utilization (from requests)') +
g.statPanel('sum(container_memory_working_set_bytes{%(clusterLabel)s="$cluster", namespace="$namespace",container!=""}) / sum(kube_pod_container_resource_requests_memory_bytes{namespace="$namespace"})' % $._config)
g.statPanel('sum(container_memory_working_set_bytes{%(clusterLabel)s="$cluster", namespace="$namespace",container!="", image!=""}) / sum(kube_pod_container_resource_requests_memory_bytes{namespace="$namespace"})' % $._config)
)
.addPanel(
g.panel('Memory Utilisation (from limits)') +
g.statPanel('sum(container_memory_working_set_bytes{%(clusterLabel)s="$cluster", namespace="$namespace",container!=""}) / sum(kube_pod_container_resource_limits_memory_bytes{namespace="$namespace"})' % $._config)
g.statPanel('sum(container_memory_working_set_bytes{%(clusterLabel)s="$cluster", namespace="$namespace",container!="", image!=""}) / sum(kube_pod_container_resource_limits_memory_bytes{namespace="$namespace"})' % $._config)
)
)
.addRow(
@ -209,11 +209,11 @@ local template = grafana.template;
.addPanel(
g.panel('Memory Quota') +
g.tablePanel([
'sum(container_memory_working_set_bytes{%(clusterLabel)s="$cluster", namespace="$namespace",container!=""}) by (pod)' % $._config,
'sum(container_memory_working_set_bytes{%(clusterLabel)s="$cluster", namespace="$namespace",container!="", image!=""}) by (pod)' % $._config,
'sum(kube_pod_container_resource_requests_memory_bytes{%(clusterLabel)s="$cluster", namespace="$namespace"}) by (pod)' % $._config,
'sum(container_memory_working_set_bytes{%(clusterLabel)s="$cluster", namespace="$namespace",container!=""}) by (pod) / sum(kube_pod_container_resource_requests_memory_bytes{namespace="$namespace"}) by (pod)' % $._config,
'sum(container_memory_working_set_bytes{%(clusterLabel)s="$cluster", namespace="$namespace",container!="", image!=""}) by (pod) / sum(kube_pod_container_resource_requests_memory_bytes{namespace="$namespace"}) by (pod)' % $._config,
'sum(kube_pod_container_resource_limits_memory_bytes{%(clusterLabel)s="$cluster", namespace="$namespace"}) by (pod)' % $._config,
'sum(container_memory_working_set_bytes{%(clusterLabel)s="$cluster", namespace="$namespace",container!=""}) by (pod) / sum(kube_pod_container_resource_limits_memory_bytes{namespace="$namespace"}) by (pod)' % $._config,
'sum(container_memory_working_set_bytes{%(clusterLabel)s="$cluster", namespace="$namespace",container!="", image!=""}) by (pod) / sum(kube_pod_container_resource_limits_memory_bytes{namespace="$namespace"}) by (pod)' % $._config,
'sum(container_memory_rss{%(clusterLabel)s="$cluster", namespace="$namespace",container!=""}) by (pod)' % $._config,
'sum(container_memory_cache{%(clusterLabel)s="$cluster", namespace="$namespace",container!=""}) by (pod)' % $._config,
'sum(container_memory_swap{%(clusterLabel)s="$cluster", namespace="$namespace",container!=""}) by (pod)' % $._config,

View File

@ -103,7 +103,7 @@ local template = grafana.template;
g.row('CPU Throttling')
.addPanel(
g.panel('CPU Throttling') +
g.queryPanel('sum(increase(container_cpu_cfs_throttled_periods_total{namespace="$namespace", pod="$pod", container!="POD", %(clusterLabel)s="$cluster"}[5m])) by (container) /sum(increase(container_cpu_cfs_periods_total{namespace="$namespace", pod="$pod", container!="POD", %(clusterLabel)s="$cluster"}[5m])) by (container)' % $._config, '{{container}}') +
g.queryPanel('sum(increase(container_cpu_cfs_throttled_periods_total{namespace="$namespace", pod="$pod", container!="POD", container!="", %(clusterLabel)s="$cluster"}[5m])) by (container) /sum(increase(container_cpu_cfs_periods_total{namespace="$namespace", pod="$pod", container!="POD", container!="", %(clusterLabel)s="$cluster"}[5m])) by (container)' % $._config, '{{container}}') +
g.stack
+ {
yaxes: g.yaxes({ format: 'percentunit', max: 1 }),
@ -148,7 +148,7 @@ local template = grafana.template;
.addPanel(
g.panel('Memory Usage') +
g.queryPanel([
'sum(container_memory_working_set_bytes{%(clusterLabel)s="$cluster", namespace="$namespace", pod="$pod", container!="POD", container!=""}) by (container)' % $._config,
'sum(container_memory_working_set_bytes{%(clusterLabel)s="$cluster", namespace="$namespace", pod="$pod", container!="POD", container!="", image!=""}) by (container)' % $._config,
memRequestsQuery,
memLimitsQuery,
], [
@ -189,11 +189,11 @@ local template = grafana.template;
.addPanel(
g.panel('Memory Quota') +
g.tablePanel([
'sum(container_memory_working_set_bytes{%(clusterLabel)s="$cluster", namespace="$namespace", pod="$pod", container!="POD", container!=""}) by (container)' % $._config,
'sum(container_memory_working_set_bytes{%(clusterLabel)s="$cluster", namespace="$namespace", pod="$pod", container!="POD", container!="", image!=""}) by (container)' % $._config,
'sum(kube_pod_container_resource_requests_memory_bytes{%(clusterLabel)s="$cluster", namespace="$namespace", pod="$pod"}) by (container)' % $._config,
'sum(container_memory_working_set_bytes{%(clusterLabel)s="$cluster", namespace="$namespace", pod="$pod"}) by (container) / sum(kube_pod_container_resource_requests_memory_bytes{namespace="$namespace", pod="$pod"}) by (container)' % $._config,
'sum(container_memory_working_set_bytes{%(clusterLabel)s="$cluster", namespace="$namespace", pod="$pod", image!=""}) by (container) / sum(kube_pod_container_resource_requests_memory_bytes{namespace="$namespace", pod="$pod"}) by (container)' % $._config,
'sum(kube_pod_container_resource_limits_memory_bytes{%(clusterLabel)s="$cluster", namespace="$namespace", pod="$pod", container!=""}) by (container)' % $._config,
'sum(container_memory_working_set_bytes{%(clusterLabel)s="$cluster", namespace="$namespace", pod="$pod", container!=""}) by (container) / sum(kube_pod_container_resource_limits_memory_bytes{namespace="$namespace", pod="$pod"}) by (container)' % $._config,
'sum(container_memory_working_set_bytes{%(clusterLabel)s="$cluster", namespace="$namespace", pod="$pod", container!="", image!=""}) by (container) / sum(kube_pod_container_resource_limits_memory_bytes{namespace="$namespace", pod="$pod"}) by (container)' % $._config,
'sum(container_memory_rss{%(clusterLabel)s="$cluster", namespace="$namespace", pod="$pod", container != "", container != "POD"}) by (container)' % $._config,
'sum(container_memory_cache{%(clusterLabel)s="$cluster", namespace="$namespace", pod="$pod", container != "", container != "POD"}) by (container)' % $._config,
'sum(container_memory_swap{%(clusterLabel)s="$cluster", namespace="$namespace", pod="$pod", container != "", container != "POD"}) by (container)' % $._config,

View File

@ -8,7 +8,7 @@ local template = grafana.template;
template.new(
name='type',
datasource='$datasource',
query='label_values(mixin_pod_workload{namespace=~"$namespace", workload=~".+"}, workload_type)',
query='label_values(namespace_workload_pod:kube_pod_owner:relabel{namespace=~"$namespace", workload=~".+"}, workload_type)',
current='deployment',
hide='',
refresh=1,
@ -18,7 +18,7 @@ local template = grafana.template;
auto: false,
auto_count: 30,
auto_min: '10s',
definition: 'label_values(mixin_pod_workload{namespace=~"$namespace", workload=~".+"}, workload_type)',
definition: 'label_values(namespace_workload_pod:kube_pod_owner:relabel{namespace=~"$namespace", workload=~".+"}, workload_type)',
skipUrlSync: false,
},
@ -61,32 +61,32 @@ local template = grafana.template;
|||
(sum(irate(container_network_receive_bytes_total{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace"}[$__interval])
* on (namespace,pod)
group_left(workload,workload_type) mixin_pod_workload{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload_type="$type"}) by (workload))
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload_type="$type"}) by (workload))
||| % $._config,
|||
(sum(irate(container_network_transmit_bytes_total{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace"}[$__interval])
* on (namespace,pod)
group_left(workload,workload_type) mixin_pod_workload{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload_type="$type"}) by (workload))
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload_type="$type"}) by (workload))
||| % $._config,
|||
(sum(irate(container_network_receive_packets_total{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace"}[$__interval])
* on (namespace,pod)
group_left(workload,workload_type) mixin_pod_workload{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload_type="$type"}) by (workload))
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload_type="$type"}) by (workload))
||| % $._config,
|||
(sum(irate(container_network_transmit_packets_total{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace"}[$__interval])
* on (namespace,pod)
group_left(workload,workload_type) mixin_pod_workload{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload_type="$type"}) by (workload))
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload_type="$type"}) by (workload))
||| % $._config,
|||
(sum(irate(container_network_receive_packets_dropped_total{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace"}[$__interval])
* on (namespace,pod)
group_left(workload,workload_type) mixin_pod_workload{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload_type="$type"}) by (workload))
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload_type="$type"}) by (workload))
||| % $._config,
|||
(sum(irate(container_network_transmit_packets_dropped_total{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace"}[$__interval])
* on (namespace,pod)
group_left(workload,workload_type) mixin_pod_workload{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload_type="$type"}) by (workload))
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload_type="$type"}) by (workload))
||| % $._config,
];
@ -129,7 +129,7 @@ local template = grafana.template;
sum(
node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{%(clusterLabel)s="$cluster", namespace="$namespace"}
* on(namespace,pod)
group_left(workload, workload_type) mixin_pod_workload{%(clusterLabel)s="$cluster", namespace="$namespace", workload_type="$type"}
group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", namespace="$namespace", workload_type="$type"}
) by (workload, workload_type)
||| % $._config;
@ -137,18 +137,18 @@ local template = grafana.template;
sum(
kube_pod_container_resource_requests_cpu_cores{%(clusterLabel)s="$cluster", namespace="$namespace"}
* on(namespace,pod)
group_left(workload, workload_type) mixin_pod_workload{%(clusterLabel)s="$cluster", namespace="$namespace", workload_type="$type"}
group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", namespace="$namespace", workload_type="$type"}
) by (workload, workload_type)
||| % $._config;
local podCountQuery = 'count(mixin_pod_workload{%(clusterLabel)s="$cluster", namespace="$namespace", workload_type="$type"}) by (workload, workload_type)' % $._config;
local podCountQuery = 'count(namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", namespace="$namespace", workload_type="$type"}) by (workload, workload_type)' % $._config;
local cpuLimitsQuery = std.strReplace(cpuRequestsQuery, 'requests', 'limits');
local memUsageQuery = |||
sum(
container_memory_working_set_bytes{%(clusterLabel)s="$cluster", namespace="$namespace", container!=""}
container_memory_working_set_bytes{%(clusterLabel)s="$cluster", namespace="$namespace", container!="", image!=""}
* on(namespace,pod)
group_left(workload, workload_type) mixin_pod_workload{%(clusterLabel)s="$cluster", namespace="$namespace", workload_type="$type"}
group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", namespace="$namespace", workload_type="$type"}
) by (workload, workload_type)
||| % $._config;
local memRequestsQuery = std.strReplace(cpuRequestsQuery, 'cpu_cores', 'memory_bytes');
@ -287,7 +287,7 @@ local template = grafana.template;
g.queryPanel(|||
(sum(irate(container_network_receive_bytes_total{%(clusterLabel)s="$cluster", namespace=~"$namespace"}[$__interval])
* on (namespace,pod)
group_left(workload,workload_type) mixin_pod_workload{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
||| % $._config, '{{workload}}') +
g.stack +
{ yaxes: g.yaxes('Bps') },
@ -300,7 +300,7 @@ local template = grafana.template;
g.queryPanel(|||
(sum(irate(container_network_transmit_bytes_total{%(clusterLabel)s="$cluster", namespace=~"$namespace"}[$__interval])
* on (namespace,pod)
group_left(workload,workload_type) mixin_pod_workload{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
||| % $._config, '{{workload}}') +
g.stack +
{ yaxes: g.yaxes('Bps') },
@ -313,7 +313,7 @@ local template = grafana.template;
g.queryPanel(|||
(avg(irate(container_network_receive_bytes_total{%(clusterLabel)s="$cluster", namespace=~"$namespace"}[$__interval])
* on (namespace,pod)
group_left(workload,workload_type) mixin_pod_workload{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
||| % $._config, '{{workload}}') +
g.stack +
{ yaxes: g.yaxes('Bps') },
@ -326,7 +326,7 @@ local template = grafana.template;
g.queryPanel(|||
(avg(irate(container_network_transmit_bytes_total{%(clusterLabel)s="$cluster", namespace=~"$namespace"}[$__interval])
* on (namespace,pod)
group_left(workload,workload_type) mixin_pod_workload{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
||| % $._config, '{{workload}}') +
g.stack +
{ yaxes: g.yaxes('Bps') },
@ -339,7 +339,7 @@ local template = grafana.template;
g.queryPanel(|||
(sum(irate(container_network_receive_packets_total{%(clusterLabel)s="$cluster", namespace=~"$namespace"}[$__interval])
* on (namespace,pod)
group_left(workload,workload_type) mixin_pod_workload{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
||| % $._config, '{{workload}}') +
g.stack +
{ yaxes: g.yaxes('Bps') },
@ -352,7 +352,7 @@ local template = grafana.template;
g.queryPanel(|||
(sum(irate(container_network_transmit_packets_total{%(clusterLabel)s="$cluster", namespace=~"$namespace"}[$__interval])
* on (namespace,pod)
group_left(workload,workload_type) mixin_pod_workload{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
||| % $._config, '{{workload}}') +
g.stack +
{ yaxes: g.yaxes('Bps') },
@ -365,7 +365,7 @@ local template = grafana.template;
g.queryPanel(|||
(sum(irate(container_network_receive_packets_dropped_total{%(clusterLabel)s="$cluster", namespace=~"$namespace"}[$__interval])
* on (namespace,pod)
group_left(workload,workload_type) mixin_pod_workload{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
||| % $._config, '{{workload}}') +
g.stack +
{ yaxes: g.yaxes('Bps') },
@ -378,7 +378,7 @@ local template = grafana.template;
g.queryPanel(|||
(sum(irate(container_network_transmit_packets_dropped_total{%(clusterLabel)s="$cluster", namespace=~"$namespace"}[$__interval])
* on (namespace,pod)
group_left(workload,workload_type) mixin_pod_workload{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload=~".+", workload_type="$type"}) by (workload))
||| % $._config, '{{workload}}') +
g.stack +
{ yaxes: g.yaxes('Bps') },

View File

@ -32,7 +32,7 @@ local template = grafana.template;
template.new(
name='workload',
datasource='$datasource',
query='label_values(mixin_pod_workload{%(clusterLabel)s="$cluster", namespace="$namespace"}, workload)' % $._config.clusterLabel,
query='label_values(namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", namespace="$namespace"}, workload)' % $._config.clusterLabel,
current='',
hide='',
refresh=1,
@ -44,7 +44,7 @@ local template = grafana.template;
template.new(
name='type',
datasource='$datasource',
query='label_values(mixin_pod_workload{%(clusterLabel)s="$cluster", namespace="$namespace", workload="$workload"}, workload_type)' % $._config.clusterLabel,
query='label_values(namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", namespace="$namespace", workload="$workload"}, workload_type)' % $._config.clusterLabel,
current='',
hide='',
refresh=1,
@ -63,32 +63,32 @@ local template = grafana.template;
|||
(sum(irate(container_network_receive_bytes_total{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace"}[$__interval])
* on (namespace,pod)
group_left(workload,workload_type) mixin_pod_workload{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
||| % $._config,
|||
(sum(irate(container_network_transmit_bytes_total{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace"}[$__interval])
* on (namespace,pod)
group_left(workload,workload_type) mixin_pod_workload{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
||| % $._config,
|||
(sum(irate(container_network_receive_packets_total{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace"}[$__interval])
* on (namespace,pod)
group_left(workload,workload_type) mixin_pod_workload{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
||| % $._config,
|||
(sum(irate(container_network_transmit_packets_total{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace"}[$__interval])
* on (namespace,pod)
group_left(workload,workload_type) mixin_pod_workload{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
||| % $._config,
|||
(sum(irate(container_network_receive_packets_dropped_total{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace"}[$__interval])
* on (namespace,pod)
group_left(workload,workload_type) mixin_pod_workload{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
||| % $._config,
|||
(sum(irate(container_network_transmit_packets_dropped_total{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace"}[$__interval])
* on (namespace,pod)
group_left(workload,workload_type) mixin_pod_workload{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
||| % $._config,
];
@ -128,7 +128,7 @@ local template = grafana.template;
sum(
node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{%(clusterLabel)s="$cluster", namespace="$namespace"}
* on(namespace,pod)
group_left(workload, workload_type) mixin_pod_workload{%(clusterLabel)s="$cluster", namespace="$namespace", workload="$workload", workload_type="$type"}
group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", namespace="$namespace", workload="$workload", workload_type="$type"}
) by (pod)
||| % $._config;
@ -136,7 +136,7 @@ local template = grafana.template;
sum(
kube_pod_container_resource_requests_cpu_cores{%(clusterLabel)s="$cluster", namespace="$namespace"}
* on(namespace,pod)
group_left(workload, workload_type) mixin_pod_workload{%(clusterLabel)s="$cluster", namespace="$namespace", workload="$workload", workload_type="$type"}
group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", namespace="$namespace", workload="$workload", workload_type="$type"}
) by (pod)
||| % $._config;
@ -144,9 +144,9 @@ local template = grafana.template;
local memUsageQuery = |||
sum(
container_memory_working_set_bytes{%(clusterLabel)s="$cluster", namespace="$namespace", container!=""}
container_memory_working_set_bytes{%(clusterLabel)s="$cluster", namespace="$namespace", container!="", image!=""}
* on(namespace,pod)
group_left(workload, workload_type) mixin_pod_workload{%(clusterLabel)s="$cluster", namespace="$namespace", workload="$workload", workload_type="$type"}
group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", namespace="$namespace", workload="$workload", workload_type="$type"}
) by (pod)
||| % $._config;
local memRequestsQuery = std.strReplace(cpuRequestsQuery, 'cpu_cores', 'memory_bytes');
@ -229,7 +229,7 @@ local template = grafana.template;
g.queryPanel(|||
(sum(irate(container_network_receive_bytes_total{%(clusterLabel)s="$cluster", namespace=~"$namespace"}[$__interval])
* on (namespace,pod)
group_left(workload,workload_type) mixin_pod_workload{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
||| % $._config, '{{pod}}') +
g.stack +
{ yaxes: g.yaxes('Bps') },
@ -242,7 +242,7 @@ local template = grafana.template;
g.queryPanel(|||
(sum(irate(container_network_transmit_bytes_total{%(clusterLabel)s="$cluster", namespace=~"$namespace"}[$__interval])
* on (namespace,pod)
group_left(workload,workload_type) mixin_pod_workload{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
||| % $._config, '{{pod}}') +
g.stack +
{ yaxes: g.yaxes('Bps') },
@ -255,7 +255,7 @@ local template = grafana.template;
g.queryPanel(|||
(avg(irate(container_network_receive_bytes_total{%(clusterLabel)s="$cluster", namespace=~"$namespace"}[$__interval])
* on (namespace,pod)
group_left(workload,workload_type) mixin_pod_workload{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
||| % $._config, '{{pod}}') +
g.stack +
{ yaxes: g.yaxes('Bps') },
@ -268,7 +268,7 @@ local template = grafana.template;
g.queryPanel(|||
(avg(irate(container_network_transmit_bytes_total{%(clusterLabel)s="$cluster", namespace=~"$namespace"}[$__interval])
* on (namespace,pod)
group_left(workload,workload_type) mixin_pod_workload{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
||| % $._config, '{{pod}}') +
g.stack +
{ yaxes: g.yaxes('Bps') },
@ -281,7 +281,7 @@ local template = grafana.template;
g.queryPanel(|||
(sum(irate(container_network_receive_packets_total{%(clusterLabel)s="$cluster", namespace=~"$namespace"}[$__interval])
* on (namespace,pod)
group_left(workload,workload_type) mixin_pod_workload{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
||| % $._config, '{{pod}}') +
g.stack +
{ yaxes: g.yaxes('Bps') },
@ -294,7 +294,7 @@ local template = grafana.template;
g.queryPanel(|||
(sum(irate(container_network_transmit_packets_total{%(clusterLabel)s="$cluster", namespace=~"$namespace"}[$__interval])
* on (namespace,pod)
group_left(workload,workload_type) mixin_pod_workload{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
||| % $._config, '{{pod}}') +
g.stack +
{ yaxes: g.yaxes('Bps') },
@ -307,7 +307,7 @@ local template = grafana.template;
g.queryPanel(|||
(sum(irate(container_network_receive_packets_dropped_total{%(clusterLabel)s="$cluster", namespace=~"$namespace"}[$__interval])
* on (namespace,pod)
group_left(workload,workload_type) mixin_pod_workload{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
||| % $._config, '{{pod}}') +
g.stack +
{ yaxes: g.yaxes('Bps') },
@ -320,7 +320,7 @@ local template = grafana.template;
g.queryPanel(|||
(sum(irate(container_network_transmit_packets_dropped_total{%(clusterLabel)s="$cluster", namespace=~"$namespace"}[$__interval])
* on (namespace,pod)
group_left(workload,workload_type) mixin_pod_workload{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
group_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{%(clusterLabel)s="$cluster", %(namespaceLabel)s=~"$namespace", workload=~"$workload", workload_type="$type"}) by (pod))
||| % $._config, '{{pod}}') +
g.stack +
{ yaxes: g.yaxes('Bps') },

View File

@ -76,7 +76,7 @@ local singlestat = grafana.singlestat;
format='s',
min=0,
)
.addTarget(prometheus.target('histogram_quantile(0.99, sum(rate(rest_client_request_latency_seconds_bucket{%(kubeSchedulerSelector)s, instance=~"$instance", verb="POST"}[5m])) by (verb, url, le))' % $._config, legendFormat='{{verb}} {{url}}'));
.addTarget(prometheus.target('histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{%(kubeSchedulerSelector)s, instance=~"$instance", verb="POST"}[5m])) by (verb, url, le))' % $._config, legendFormat='{{verb}} {{url}}'));
local getRequestLatency =
graphPanel.new(
@ -91,7 +91,7 @@ local singlestat = grafana.singlestat;
legend_alignAsTable=true,
legend_rightSide=true,
)
.addTarget(prometheus.target('histogram_quantile(0.99, sum(rate(rest_client_request_latency_seconds_bucket{%(kubeSchedulerSelector)s, instance=~"$instance", verb="GET"}[5m])) by (verb, url, le))' % $._config, legendFormat='{{verb}} {{url}}'));
.addTarget(prometheus.target('histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{%(kubeSchedulerSelector)s, instance=~"$instance", verb="GET"}[5m])) by (verb, url, le))' % $._config, legendFormat='{{verb}} {{url}}'));
local memory =
graphPanel.new(

View File

@ -100,7 +100,7 @@
},
// workload aggregation for deployments
{
record: 'mixin_pod_workload',
record: 'namespace_workload_pod:kube_pod_owner:relabel',
expr: |||
max by (%(clusterLabel)s, namespace, workload, pod) (
label_replace(
@ -121,7 +121,7 @@
},
},
{
record: 'mixin_pod_workload',
record: 'namespace_workload_pod:kube_pod_owner:relabel',
expr: |||
max by (%(clusterLabel)s, namespace, workload, pod) (
label_replace(
@ -135,7 +135,7 @@
},
},
{
record: 'mixin_pod_workload',
record: 'namespace_workload_pod:kube_pod_owner:relabel',
expr: |||
max by (%(clusterLabel)s, namespace, workload, pod) (
label_replace(

View File

@ -28,8 +28,14 @@
sum(rate(apiserver_request_duration_seconds_count{%(kubeApiserverSelector)s,%(kubeApiserverReadSelector)s}[%(window)s]))
-
(
sum(rate(apiserver_request_duration_seconds_bucket{%(kubeApiserverSelector)s,%(kubeApiserverReadSelector)s,scope=~"resource|",le="0.1"}[%(window)s])) +
sum(rate(apiserver_request_duration_seconds_bucket{%(kubeApiserverSelector)s,%(kubeApiserverReadSelector)s,scope="namespace",le="0.5"}[%(window)s])) +
(
sum(rate(apiserver_request_duration_seconds_bucket{%(kubeApiserverSelector)s,%(kubeApiserverReadSelector)s,scope=~"resource|",le="0.1"}[%(window)s]))
or
vector(0)
)
+
sum(rate(apiserver_request_duration_seconds_bucket{%(kubeApiserverSelector)s,%(kubeApiserverReadSelector)s,scope="namespace",le="0.5"}[%(window)s]))
+
sum(rate(apiserver_request_duration_seconds_bucket{%(kubeApiserverSelector)s,%(kubeApiserverReadSelector)s,scope="cluster",le="5"}[%(window)s]))
)
)
@ -151,8 +157,14 @@
sum(increase(apiserver_request_duration_seconds_count{%(kubeApiserverReadSelector)s}[%(SLODays)s]))
-
(
sum(increase(apiserver_request_duration_seconds_bucket{%(kubeApiserverReadSelector)s,scope=~"resource|",le="0.1"}[%(SLODays)s])) +
sum(increase(apiserver_request_duration_seconds_bucket{%(kubeApiserverReadSelector)s,scope="namespace",le="0.5"}[%(SLODays)s])) +
(
sum(increase(apiserver_request_duration_seconds_bucket{%(kubeApiserverReadSelector)s,scope=~"resource|",le="0.1"}[%(SLODays)s]))
or
vector(0)
)
+
sum(increase(apiserver_request_duration_seconds_bucket{%(kubeApiserverReadSelector)s,scope="namespace",le="0.5"}[%(SLODays)s]))
+
sum(increase(apiserver_request_duration_seconds_bucket{%(kubeApiserverReadSelector)s,scope="cluster",le="5"}[%(SLODays)s]))
)
) +
@ -174,8 +186,14 @@
-
(
# too slow
sum(increase(apiserver_request_duration_seconds_bucket{%(kubeApiserverSelector)s,%(kubeApiserverReadSelector)s,scope=~"resource|",le="0.1"}[%(SLODays)s])) +
sum(increase(apiserver_request_duration_seconds_bucket{%(kubeApiserverSelector)s,%(kubeApiserverReadSelector)s,scope="namespace",le="0.5"}[%(SLODays)s])) +
(
sum(increase(apiserver_request_duration_seconds_bucket{%(kubeApiserverSelector)s,%(kubeApiserverReadSelector)s,scope=~"resource|",le="0.1"}[%(SLODays)s]))
or
vector(0)
)
+
sum(increase(apiserver_request_duration_seconds_bucket{%(kubeApiserverSelector)s,%(kubeApiserverReadSelector)s,scope="namespace",le="0.5"}[%(SLODays)s]))
+
sum(increase(apiserver_request_duration_seconds_bucket{%(kubeApiserverSelector)s,%(kubeApiserverReadSelector)s,scope="cluster",le="5"}[%(SLODays)s]))
)
+

View File

@ -10,7 +10,7 @@ local template = grafana.template;
{
grafanaDashboards+:: {
'prometheus.json':
g.dashboard('Prometheus')
g.dashboard('Prometheus Overview')
.addMultiTemplate('job', 'prometheus_build_info', 'job')
.addMultiTemplate('instance', 'prometheus_build_info', 'instance')
.addRow(