287 lines
11 KiB
Plaintext
287 lines
11 KiB
Plaintext
local grafana = import 'github.com/grafana/grafonnet-lib/grafonnet/grafana.libsonnet';
|
|
local dashboard = grafana.dashboard;
|
|
local row = grafana.row;
|
|
local prometheus = grafana.prometheus;
|
|
local template = grafana.template;
|
|
local graphPanel = grafana.graphPanel;
|
|
local singlestat = grafana.singlestat;
|
|
|
|
{
|
|
// Defaults merged into the shared (hidden) mixin configuration object.
_config+:: {
  // Label matcher spliced into queries via %(kubeApiserverSelector)s to pick kube-apiserver series.
  kubeApiserverSelector: 'job="kube-apiserver"',
},
|
|
|
|
grafanaDashboards+:: {
|
|
'apiserver.json':
|
|
// Single-stat panel: combined (verb="all") API server availability over the configured SLO window.
local availability1d =
  local slo = $._config.SLOs.apiserver;
  // The title shows the window length in days and the target scaled to a percentage.
  local panelTitle = 'Availability (%dd) > %.3f%%' % [slo.days, 100 * slo.target];
  local query = 'apiserver_request:availability%dd{verb="all", %(clusterLabel)s="$cluster"}' % [slo.days, $._config.clusterLabel];
  singlestat.new(
    panelTitle,
    description='How many percent of requests (both read and write) in %d days have been answered successfully and fast enough?' % slo.days,
    format='percentunit',
    decimals=3,
    span=4,
    datasource='$datasource',
  )
  .addTarget(prometheus.target(query));
|
|
|
|
// Graph panel: difference between measured availability and the SLO target over the SLO window,
// labelled as the remaining error budget.
local errorBudget =
  local slo = $._config.SLOs.apiserver;
  graphPanel.new(
    'ErrorBudget (%dd) > %.3f%%' % [slo.days, 100 * slo.target],
    datasource='$datasource',
    span=8,
    format='percentunit',
    decimals=3,
    fill=10,
    // The title scales the target by 100 before printing it with a trailing '%'; the
    // description must do the same, otherwise e.g. a 0.99 target is rendered as "0.990%".
    description='How much error budget is left looking at our %.3f%% availability guarantees?' % (100 * slo.target),
  )
  // NOTE(review): the fixed factor of 100 matches 1 / (1 - target) only for a 0.99 target;
  // confirm whether the budget should be normalised by (1 - target) instead.
  .addTarget(prometheus.target('100 * (apiserver_request:availability%dd{verb="all", %(clusterLabel)s="$cluster"} - %f)' % [slo.days, $._config.clusterLabel, slo.target], legendFormat='errorbudget'));
|
|
|
|
// Single-stat panel: availability of read requests (verb="read") over the SLO window.
local readAvailability =
  local windowDays = $._config.SLOs.apiserver.days;
  local query = 'apiserver_request:availability%dd{verb="read", %(clusterLabel)s="$cluster"}' % [windowDays, $._config.clusterLabel];
  singlestat.new(
    'Read Availability (%dd)' % windowDays,
    description='How many percent of read requests (LIST,GET) in %d days have been answered successfully and fast enough?' % windowDays,
    format='percentunit',
    decimals=3,
    span=3,
    datasource='$datasource',
  )
  .addTarget(prometheus.target(query));
|
|
|
|
// Stacked graph panel: read request rate, one series per response code.
local readRequests =
  // One colour per status-code class; overrides are applied in list order.
  local codeColors = [
    { alias: '/2../i', color: '#56A64B' },
    { alias: '/3../i', color: '#F2CC0C' },
    { alias: '/4../i', color: '#3274D9' },
    { alias: '/5../i', color: '#E02F44' },
  ];
  local query = 'sum by (code) (code_resource:apiserver_request_total:rate5m{verb="read", %(clusterLabel)s="$cluster"})' % $._config;
  local basePanel = graphPanel.new(
    'Read SLI - Requests',
    description='How many read requests (LIST,GET) per second do the apiservers get by code?',
    format='reqps',
    stack=true,
    fill=10,
    span=3,
    datasource='$datasource',
  );
  std.foldl(function(panel, override) panel.addSeriesOverride(override), codeColors, basePanel)
  .addTarget(prometheus.target(query, legendFormat='{{ code }}'));
|
|
|
|
// Graph panel: share of read requests answered with a 5xx code, one series per resource.
local readErrors =
  // Ratio of the 5xx request rate to the total request rate, both grouped by resource.
  local query = 'sum by (resource) (code_resource:apiserver_request_total:rate5m{verb="read",code=~"5..", %(clusterLabel)s="$cluster"}) / sum by (resource) (code_resource:apiserver_request_total:rate5m{verb="read", %(clusterLabel)s="$cluster"})' % $._config;
  graphPanel.new(
    'Read SLI - Errors',
    description='How many percent of read requests (LIST,GET) per second are returned with errors (5xx)?',
    format='percentunit',
    min=0,
    span=3,
    datasource='$datasource',
  )
  .addTarget(prometheus.target(query, legendFormat='{{ resource }}'));
|
|
|
|
// Graph panel: read request duration in seconds, one series per resource.
local readDuration =
  local query = 'cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{verb="read", %(clusterLabel)s="$cluster"}' % $._config;
  graphPanel.new(
    'Read SLI - Duration',
    description='How many seconds is the 99th percentile for reading (LIST|GET) a given resource?',
    format='s',
    span=3,
    datasource='$datasource',
  )
  .addTarget(prometheus.target(query, legendFormat='{{ resource }}'));
|
|
|
|
// Single-stat panel: availability of write requests (verb="write") over the SLO window.
local writeAvailability =
  local windowDays = $._config.SLOs.apiserver.days;
  local query = 'apiserver_request:availability%dd{verb="write", %(clusterLabel)s="$cluster"}' % [windowDays, $._config.clusterLabel];
  singlestat.new(
    'Write Availability (%dd)' % windowDays,
    description='How many percent of write requests (POST|PUT|PATCH|DELETE) in %d days have been answered successfully and fast enough?' % windowDays,
    format='percentunit',
    decimals=3,
    span=3,
    datasource='$datasource',
  )
  .addTarget(prometheus.target(query));
|
|
|
|
// Stacked graph panel: write request rate, one series per response code.
local writeRequests =
  // One colour per status-code class; overrides are applied in list order.
  local codeColors = [
    { alias: '/2../i', color: '#56A64B' },
    { alias: '/3../i', color: '#F2CC0C' },
    { alias: '/4../i', color: '#3274D9' },
    { alias: '/5../i', color: '#E02F44' },
  ];
  local query = 'sum by (code) (code_resource:apiserver_request_total:rate5m{verb="write", %(clusterLabel)s="$cluster"})' % $._config;
  local basePanel = graphPanel.new(
    'Write SLI - Requests',
    description='How many write requests (POST|PUT|PATCH|DELETE) per second do the apiservers get by code?',
    format='reqps',
    stack=true,
    fill=10,
    span=3,
    datasource='$datasource',
  );
  std.foldl(function(panel, override) panel.addSeriesOverride(override), codeColors, basePanel)
  .addTarget(prometheus.target(query, legendFormat='{{ code }}'));
|
|
|
|
// Graph panel: share of write requests answered with a 5xx code, one series per resource.
local writeErrors =
  // Ratio of the 5xx request rate to the total request rate, both grouped by resource.
  local query = 'sum by (resource) (code_resource:apiserver_request_total:rate5m{verb="write",code=~"5..", %(clusterLabel)s="$cluster"}) / sum by (resource) (code_resource:apiserver_request_total:rate5m{verb="write", %(clusterLabel)s="$cluster"})' % $._config;
  graphPanel.new(
    'Write SLI - Errors',
    description='How many percent of write requests (POST|PUT|PATCH|DELETE) per second are returned with errors (5xx)?',
    format='percentunit',
    min=0,
    span=3,
    datasource='$datasource',
  )
  .addTarget(prometheus.target(query, legendFormat='{{ resource }}'));
|
|
|
|
// Graph panel: write request duration in seconds, one series per resource.
local writeDuration =
  local query = 'cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{verb="write", %(clusterLabel)s="$cluster"}' % $._config;
  graphPanel.new(
    'Write SLI - Duration',
    description='How many seconds is the 99th percentile for writing (POST|PUT|PATCH|DELETE) a given resource?',
    format='s',
    span=3,
    datasource='$datasource',
  )
  .addTarget(prometheus.target(query, legendFormat='{{ resource }}'));
|
|
|
|
// Graph panel: 5-minute rate of work queue additions, summed per instance and queue name.
local workQueueAddRate =
  local query = 'sum(rate(workqueue_adds_total{%(kubeApiserverSelector)s, instance=~"$instance", %(clusterLabel)s="$cluster"}[5m])) by (instance, name)' % $._config;
  graphPanel.new(
    'Work Queue Add Rate',
    format='ops',
    min=0,
    legend_show=false,
    span=6,
    datasource='$datasource',
  )
  .addTarget(prometheus.target(query, legendFormat='{{instance}} {{name}}'));
|
|
|
|
// Graph panel: current work queue depth, summed per instance and queue name.
local workQueueDepth =
  // workqueue_depth is a gauge (the current number of queued items), so it is summed
  // directly. Wrapping it in rate() - which Prometheus defines for counters - would plot
  // the per-second change of the depth instead of the depth named in the panel title.
  local query = 'sum(workqueue_depth{%(kubeApiserverSelector)s, instance=~"$instance", %(clusterLabel)s="$cluster"}) by (instance, name)' % $._config;
  graphPanel.new(
    'Work Queue Depth',
    datasource='$datasource',
    span=6,
    format='short',
    legend_show=false,
    min=0,
  )
  .addTarget(prometheus.target(query, legendFormat='{{instance}} {{name}}'));
|
|
|
|
|
|
// Full-width graph panel: 0.99 quantile of work queue wait duration per instance and queue name,
// with a right-hand table legend showing current values.
local workQueueLatency =
  local query = 'histogram_quantile(0.99, sum(rate(workqueue_queue_duration_seconds_bucket{%(kubeApiserverSelector)s, instance=~"$instance", %(clusterLabel)s="$cluster"}[5m])) by (instance, name, le))' % $._config;
  graphPanel.new(
    'Work Queue Latency',
    format='s',
    span=12,
    datasource='$datasource',
    legend_show=true,
    legend_rightSide=true,
    legend_alignAsTable=true,
    legend_values=true,
    legend_current=true,
  )
  .addTarget(prometheus.target(query, legendFormat='{{instance}} {{name}}'));
|
|
|
|
// Graph panel: resident memory of each selected API server instance, in bytes.
local memory =
  local query = 'process_resident_memory_bytes{%(kubeApiserverSelector)s,instance=~"$instance", %(clusterLabel)s="$cluster"}' % $._config;
  graphPanel.new(
    'Memory',
    format='bytes',
    span=4,
    datasource='$datasource',
  )
  .addTarget(prometheus.target(query, legendFormat='{{instance}}'));
|
|
|
|
// Graph panel: 5-minute rate of CPU seconds consumed by each selected API server instance.
local cpu =
  local query = 'rate(process_cpu_seconds_total{%(kubeApiserverSelector)s,instance=~"$instance", %(clusterLabel)s="$cluster"}[5m])' % $._config;
  graphPanel.new(
    'CPU usage',
    format='short',
    min=0,
    span=4,
    datasource='$datasource',
  )
  .addTarget(prometheus.target(query, legendFormat='{{instance}}'));
|
|
|
|
// Graph panel: goroutine count of each selected API server instance.
local goroutines =
  local query = 'go_goroutines{%(kubeApiserverSelector)s,instance=~"$instance", %(clusterLabel)s="$cluster"}' % $._config;
  graphPanel.new(
    'Goroutines',
    format='short',
    span=4,
    datasource='$datasource',
  )
  .addTarget(prometheus.target(query, legendFormat='{{instance}}'));
|
|
|
|
// Assemble the dashboard: three template variables, a notice text panel, then one row per
// panel group, and finally the refresh interval taken from the Grafana config.

// Data source picker, written as a raw template object.
local datasourceVariable = {
  name: 'datasource',
  type: 'datasource',
  query: 'prometheus',
  label: null,
  current: {
    text: 'default',
    value: 'default',
  },
  options: [],
  regex: '',
  refresh: 1,
  hide: 0,
};

// Cluster picker; hidden unless multi-cluster display is enabled in the config.
local clusterVariable = template.new(
  'cluster',
  '$datasource',
  'label_values(apiserver_request_total, %(clusterLabel)s)' % $._config,
  label='cluster',
  hide=if $._config.showMultiCluster then '' else 'variable',
  refresh='time',
  sort=1,
);

// Instance picker (with an "All" option), restricted to the selected cluster.
local instanceVariable = template.new(
  'instance',
  '$datasource',
  'label_values(apiserver_request_total{%(kubeApiserverSelector)s, %(clusterLabel)s="$cluster"}, instance)' % $._config,
  includeAll=true,
  refresh='time',
  sort=1,
);

// The same sentence is used as both the panel content and its description.
local noticeText = 'The SLO (service level objective) and other metrics displayed on this dashboard are for informational purposes only.';
local noticePanel = grafana.text.new(
  title='Notice',
  content=noticeText,
  description=noticeText,
  span=12,
);

local sloRow =
  row.new()
  .addPanel(availability1d)
  .addPanel(errorBudget);

local readRow =
  row.new()
  .addPanel(readAvailability)
  .addPanel(readRequests)
  .addPanel(readErrors)
  .addPanel(readDuration);

local writeRow =
  row.new()
  .addPanel(writeAvailability)
  .addPanel(writeRequests)
  .addPanel(writeErrors)
  .addPanel(writeDuration);

local workQueueRow =
  row.new()
  .addPanel(workQueueAddRate)
  .addPanel(workQueueDepth)
  .addPanel(workQueueLatency);

local resourceRow =
  row.new()
  .addPanel(memory)
  .addPanel(cpu)
  .addPanel(goroutines);

dashboard.new(
  '%(dashboardNamePrefix)sAPI server' % $._config.grafanaK8s,
  uid=$._config.grafanaDashboardIDs['apiserver.json'],
  tags=$._config.grafanaK8s.dashboardTags,
  time_from='now-1h',
)
.addTemplate(datasourceVariable)
.addTemplate(clusterVariable)
.addTemplate(instanceVariable)
.addPanel(noticePanel, gridPos={ h: 2, w: 24, x: 0, y: 0 })
.addRow(sloRow)
.addRow(readRow)
.addRow(writeRow)
.addRow(workQueueRow)
.addRow(resourceRow)
+ { refresh: $._config.grafanaK8s.refresh },
|
|
},
|
|
}
|