This repository has been archived on 2023-04-02. You can view files and clone it, but cannot push or open issues or pull requests.
gitops-tbrnt/monitoring/vendor/github.com/kubernetes-monitoring/kubernetes-mixin/rules/kube_apiserver.libsonnet

247 lines
9.9 KiB
Plaintext

{
_config+:: {
  // Label dropped (together with `instance`) when the overall quantile rule
  // aggregates latency buckets.
  podLabel: 'pod',

  // Label matcher prepended to the API server metric selectors in the rules below.
  kubeApiserverSelector: 'job="kube-apiserver"',

  // Verb matchers that divide requests into the "write" and "read" classes.
  kubeApiserverWriteSelector: 'verb=~"POST|PUT|PATCH|DELETE"',
  kubeApiserverReadSelector: 'verb=~"LIST|GET"',
},
prometheusRules+:: {
// SLO period used as a PromQL range: the configured `days` value with a 'd' suffix.
local SLODays = $._config.SLOs.apiserver.days + 'd',
// Request classes iterated by the per-class rules: `type` is written to the
// `verb` label of the recorded series, `selector` is the PromQL matcher for it.
// (The former `SLOTarget` local was never referenced in this file and has been removed.)
local verbs = [
  { type: 'read', selector: $._config.kubeApiserverReadSelector },
  { type: 'write', selector: $._config.kubeApiserverWriteSelector },
],
groups+: [
{
name: 'kube-apiserver.rules',
rules: [
{
  // Read burn rate for one window: (requests slower than their per-scope
  // threshold + 5xx responses) / all read requests.
  // Thresholds in the expression: 0.1s for scope "resource" or empty,
  // 0.5s for "namespace", 5s for "cluster".
  local fmt = $._config { window: window },
  labels: { verb: 'read' },
  record: 'apiserver_request:burnrate%(window)s' % fmt,
  expr: |||
    (
    (
    # too slow
    sum(rate(apiserver_request_duration_seconds_count{%(kubeApiserverSelector)s,%(kubeApiserverReadSelector)s}[%(window)s]))
    -
    (
    (
    sum(rate(apiserver_request_duration_seconds_bucket{%(kubeApiserverSelector)s,%(kubeApiserverReadSelector)s,scope=~"resource|",le="0.1"}[%(window)s]))
    or
    vector(0)
    )
    +
    sum(rate(apiserver_request_duration_seconds_bucket{%(kubeApiserverSelector)s,%(kubeApiserverReadSelector)s,scope="namespace",le="0.5"}[%(window)s]))
    +
    sum(rate(apiserver_request_duration_seconds_bucket{%(kubeApiserverSelector)s,%(kubeApiserverReadSelector)s,scope="cluster",le="5"}[%(window)s]))
    )
    )
    +
    # errors
    sum(rate(apiserver_request_total{%(kubeApiserverSelector)s,%(kubeApiserverReadSelector)s,code=~"5.."}[%(window)s]))
    )
    /
    sum(rate(apiserver_request_total{%(kubeApiserverSelector)s,%(kubeApiserverReadSelector)s}[%(window)s]))
  ||| % fmt,
}
// std.set sorts and de-duplicates, so every short/long window yields exactly one rule.
for window in std.set(std.flattenArrays([
  [pair.short, pair.long]
  for pair in $._config.SLOs.apiserver.windows
]))
] + [
{
  // Write burn rate for one window: (requests slower than 1s + 5xx responses)
  // / all write requests.
  local fmt = $._config { window: window },
  labels: { verb: 'write' },
  record: 'apiserver_request:burnrate%(window)s' % fmt,
  expr: |||
    (
    (
    # too slow
    sum(rate(apiserver_request_duration_seconds_count{%(kubeApiserverSelector)s,%(kubeApiserverWriteSelector)s}[%(window)s]))
    -
    sum(rate(apiserver_request_duration_seconds_bucket{%(kubeApiserverSelector)s,%(kubeApiserverWriteSelector)s,le="1"}[%(window)s]))
    )
    +
    sum(rate(apiserver_request_total{%(kubeApiserverSelector)s,%(kubeApiserverWriteSelector)s,code=~"5.."}[%(window)s]))
    )
    /
    sum(rate(apiserver_request_total{%(kubeApiserverSelector)s,%(kubeApiserverWriteSelector)s}[%(window)s]))
  ||| % fmt,
}
// std.set sorts and de-duplicates, so every short/long window yields exactly one rule.
for window in std.set(std.flattenArrays([
  [pair.short, pair.long]
  for pair in $._config.SLOs.apiserver.windows
]))
] + [
{
  // 5m request rate per response code and resource, one rule per request class.
  local matchers = $._config.kubeApiserverSelector + ',' + reqType.selector,
  labels: { verb: reqType.type },
  record: 'code_resource:apiserver_request_total:rate5m',
  expr: |||
    sum by (code,resource) (rate(apiserver_request_total{%s}[5m]))
  ||| % matchers,
}
for reqType in verbs
] + [
{
  // 99th-percentile request latency per resource, one rule per request class;
  // the trailing `> 0` keeps only samples greater than zero.
  local matchers = $._config.kubeApiserverSelector + ',' + reqType.selector,
  labels: {
    quantile: '0.99',
    verb: reqType.type,
  },
  record: 'cluster_quantile:apiserver_request_duration_seconds:histogram_quantile',
  expr: |||
    histogram_quantile(0.99, sum by (le, resource) (rate(apiserver_request_duration_seconds_bucket{%s}[5m]))) > 0
  ||| % matchers,
}
for reqType in verbs
] + [
{
  // Latency quantile aggregated over instance and pod, excluding the "log"
  // subresource and the verbs listed in the matcher.
  // $._config sits on the right-hand side of `+`, so its fields win on any key clash.
  local params = { quantile: q } + $._config,
  labels: { quantile: q },
  record: 'cluster_quantile:apiserver_request_duration_seconds:histogram_quantile',
  expr: |||
    histogram_quantile(%(quantile)s, sum(rate(apiserver_request_duration_seconds_bucket{%(kubeApiserverSelector)s,subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, %(podLabel)s))
  ||| % params,
}
for q in ['0.99', '0.9', '0.5']
],
},
{
name: 'kube-apiserver-availability.rules',
interval: '3m',
rules: [
{
  // Availability over the SLO period for all requests:
  //   1 - (writes slower than 1s
  //        + reads slower than their per-scope threshold (0.1s / 0.5s / 5s)
  //        + 5xx responses) / all requests.
  // Every duration-histogram selector carries kubeApiserverSelector so the
  // numerator covers the same job as the
  // code:apiserver_request_total:increase series used for errors and the
  // denominator; without it, histograms from any other job exposing the same
  // metric name would inflate the "too slow" counts.
  record: 'apiserver_request:availability%s' % SLODays,
  expr: |||
    1 - (
    (
    # write too slow
    sum(increase(apiserver_request_duration_seconds_count{%(kubeApiserverSelector)s,%(kubeApiserverWriteSelector)s}[%(SLODays)s]))
    -
    sum(increase(apiserver_request_duration_seconds_bucket{%(kubeApiserverSelector)s,%(kubeApiserverWriteSelector)s,le="1"}[%(SLODays)s]))
    ) +
    (
    # read too slow
    sum(increase(apiserver_request_duration_seconds_count{%(kubeApiserverSelector)s,%(kubeApiserverReadSelector)s}[%(SLODays)s]))
    -
    (
    (
    sum(increase(apiserver_request_duration_seconds_bucket{%(kubeApiserverSelector)s,%(kubeApiserverReadSelector)s,scope=~"resource|",le="0.1"}[%(SLODays)s]))
    or
    vector(0)
    )
    +
    sum(increase(apiserver_request_duration_seconds_bucket{%(kubeApiserverSelector)s,%(kubeApiserverReadSelector)s,scope="namespace",le="0.5"}[%(SLODays)s]))
    +
    sum(increase(apiserver_request_duration_seconds_bucket{%(kubeApiserverSelector)s,%(kubeApiserverReadSelector)s,scope="cluster",le="5"}[%(SLODays)s]))
    )
    ) +
    # errors
    sum(code:apiserver_request_total:increase%(SLODays)s{code=~"5.."} or vector(0))
    )
    /
    sum(code:apiserver_request_total:increase%(SLODays)s)
  ||| % ($._config { SLODays: SLODays }),
  labels: {
    verb: 'all',
  },
},
{
  // Availability over the SLO period for read requests:
  //   1 - (reads slower than their per-scope threshold + read 5xx responses) / all reads.
  // Thresholds in the expression: 0.1s for scope "resource" or empty,
  // 0.5s for "namespace", 5s for "cluster".
  local fmt = $._config + { SLODays: SLODays },
  labels: { verb: 'read' },
  record: 'apiserver_request:availability%s' % SLODays,
  expr: |||
    1 - (
    sum(increase(apiserver_request_duration_seconds_count{%(kubeApiserverSelector)s,%(kubeApiserverReadSelector)s}[%(SLODays)s]))
    -
    (
    # too slow
    (
    sum(increase(apiserver_request_duration_seconds_bucket{%(kubeApiserverSelector)s,%(kubeApiserverReadSelector)s,scope=~"resource|",le="0.1"}[%(SLODays)s]))
    or
    vector(0)
    )
    +
    sum(increase(apiserver_request_duration_seconds_bucket{%(kubeApiserverSelector)s,%(kubeApiserverReadSelector)s,scope="namespace",le="0.5"}[%(SLODays)s]))
    +
    sum(increase(apiserver_request_duration_seconds_bucket{%(kubeApiserverSelector)s,%(kubeApiserverReadSelector)s,scope="cluster",le="5"}[%(SLODays)s]))
    )
    +
    # errors
    sum(code:apiserver_request_total:increase%(SLODays)s{verb="read",code=~"5.."} or vector(0))
    )
    /
    sum(code:apiserver_request_total:increase%(SLODays)s{verb="read"})
  ||| % fmt,
},
{
  // Availability over the SLO period for write requests:
  //   1 - (writes slower than 1s + write 5xx responses) / all writes.
  // The duration-histogram selectors carry kubeApiserverSelector, matching the
  // read availability rule and the job-filtered
  // code:apiserver_request_total:increase series used for errors and the
  // denominator; without it, histograms from other jobs would be counted as
  // slow kube-apiserver writes.
  record: 'apiserver_request:availability%s' % SLODays,
  expr: |||
    1 - (
    (
    # too slow
    sum(increase(apiserver_request_duration_seconds_count{%(kubeApiserverSelector)s,%(kubeApiserverWriteSelector)s}[%(SLODays)s]))
    -
    sum(increase(apiserver_request_duration_seconds_bucket{%(kubeApiserverSelector)s,%(kubeApiserverWriteSelector)s,le="1"}[%(SLODays)s]))
    )
    +
    # errors
    sum(code:apiserver_request_total:increase%(SLODays)s{verb="write",code=~"5.."} or vector(0))
    )
    /
    sum(code:apiserver_request_total:increase%(SLODays)s{verb="write"})
  ||| % ($._config { SLODays: SLODays }),
  labels: {
    verb: 'write',
  },
},
] + [
{
  // Request increase over the SLO period for one (status-code class, verb) pair.
  // Positional arguments: job matcher, verb, status-code regex, range.
  local args = [$._config.kubeApiserverSelector, method, statusClass, SLODays],
  record: 'code_verb:apiserver_request_total:increase%s' % SLODays,
  expr: |||
    sum by (code, verb) (increase(apiserver_request_total{%s,verb="%s",code=~"%s"}[%s]))
  ||| % args,
}
// Status-code class is the outer loop and verb the inner one, as in the rule order.
for statusClass in ['2..', '3..', '4..', '5..']
for method in ['LIST', 'GET', 'POST', 'PUT', 'PATCH', 'DELETE']
] + [
{
  // Sums the per-verb increase series into one series per response code for
  // each request class; the class name is written to the `verb` label.
  local args = [SLODays, reqType.selector],
  labels: { verb: reqType.type },
  record: 'code:apiserver_request_total:increase%s' % SLODays,
  expr: |||
    sum by (code) (code_verb:apiserver_request_total:increase%s{%s})
  ||| % args,
}
for reqType in verbs
],
},
],
},
}