This repository has been archived on 2023-04-02. You can view files and clone it, but cannot push or open issues or pull requests.
gitops-tbrnt/monitoring/vendor/github.com/kubernetes-monitoring/kubernetes-mixin/dashboards/kubelet.libsonnet

421 lines
17 KiB
Plaintext

local grafana = import 'github.com/grafana/grafonnet-lib/grafonnet/grafana.libsonnet';
local dashboard = grafana.dashboard;
local row = grafana.row;
local prometheus = grafana.prometheus;
local template = grafana.template;
local graphPanel = grafana.graphPanel;
local tablePanel = grafana.tablePanel;
local singlestat = grafana.singlestat;
{
grafanaDashboards+:: {
'kubelet.json':
// Single-stat panel 'Up': sum of the `up` series matching the kubelet selector
// in the selected cluster. The query is not filtered by the $instance variable.
local upCount =
  local query = 'sum(up{%(clusterLabel)s="$cluster", %(kubeletSelector)s})' % $._config;
  singlestat.new(
    'Up',
    valueName='min',
    span=2,
    datasource='$datasource',
  ).addTarget(prometheus.target(query));
// Single-stat panel 'Running Pods': sum of `kubelet_running_pods` over the
// kubelet instances selected by $instance.
local runningPodCount =
  // TODO: the `kubelet_running_pod_count` alternative after the OR operator is kept for
  // backward compatibility with kubernetes < 1.19; this can be restored to a single
  // query once 1.23 is out.
  local query = 'sum(kubelet_running_pods{%(clusterLabel)s="$cluster", %(kubeletSelector)s, instance=~"$instance"}) OR sum(kubelet_running_pod_count{%(clusterLabel)s="$cluster", %(kubeletSelector)s, instance=~"$instance"})' % $._config;
  singlestat.new(
    'Running Pods',
    valueName='min',
    span=2,
    datasource='$datasource',
  ).addTarget(prometheus.target(query, legendFormat='{{instance}}'));
// Single-stat panel 'Running Container': number of running containers on the
// kubelet instances selected by $instance.
local runningContainerCount =
  singlestat.new(
    'Running Container',
    datasource='$datasource',
    span=2,
    valueName='min',
  )
  // `kubelet_running_containers` is partitioned by a `container_state` label, so the
  // selector pins container_state="running"; without it the sum would also include
  // containers in other states.
  // TODO: The second query selected by the OR operator is for backward compatibility
  // with kubernetes < 1.19 (the older metric has no `container_state` label), so this
  // can be restored to a single query once 1.23 is out.
  .addTarget(prometheus.target('sum(kubelet_running_containers{%(clusterLabel)s="$cluster", %(kubeletSelector)s, instance=~"$instance", container_state="running"}) OR sum(kubelet_running_container_count{%(clusterLabel)s="$cluster", %(kubeletSelector)s, instance=~"$instance"})' % $._config, legendFormat='{{instance}}'));
// Single-stat panel 'Actual Volume Count': sum of `volume_manager_total_volumes`
// with state="actual_state_of_world" over the selected kubelet instances.
local actualVolumeCount =
  local query = 'sum(volume_manager_total_volumes{%(clusterLabel)s="$cluster", %(kubeletSelector)s, instance=~"$instance", state="actual_state_of_world"})' % $._config;
  singlestat.new(
    'Actual Volume Count',
    valueName='min',
    span=2,
    datasource='$datasource',
  ).addTarget(prometheus.target(query, legendFormat='{{instance}}'));
// Single-stat panel 'Desired Volume Count': sum of `volume_manager_total_volumes`
// with state="desired_state_of_world" over the selected kubelet instances.
local desiredVolumeCount =
  local query = 'sum(volume_manager_total_volumes{%(clusterLabel)s="$cluster", %(kubeletSelector)s, instance=~"$instance",state="desired_state_of_world"})' % $._config;
  singlestat.new(
    'Desired Volume Count',
    valueName='min',
    span=2,
    datasource='$datasource',
  ).addTarget(prometheus.target(query, legendFormat='{{instance}}'));
// Single-stat panel 'Config Error Count': summed 5m rate of
// `kubelet_node_config_error` over the selected kubelet instances.
// NOTE(review): the title says "Count" while the query takes a rate() — confirm
// this is the intended expression for this metric.
local configErrorCount =
  local query = 'sum(rate(kubelet_node_config_error{%(clusterLabel)s="$cluster", %(kubeletSelector)s, instance=~"$instance"}[5m]))' % $._config;
  singlestat.new(
    'Config Error Count',
    valueName='min',
    span=2,
    datasource='$datasource',
  ).addTarget(prometheus.target(query, legendFormat='{{instance}}'));
// Graph panel 'Operation Rate': 5m rate of `kubelet_runtime_operations_total`,
// one series per (operation_type, instance).
local operationRate =
  local query = 'sum(rate(kubelet_runtime_operations_total{%(clusterLabel)s="$cluster",%(kubeletSelector)s,instance=~"$instance"}[5m])) by (operation_type, instance)' % $._config;
  graphPanel.new(
    'Operation Rate',
    format='ops',
    span=6,
    datasource='$datasource',
    // Legend rendered as a table to the right of the graph, showing current values.
    legend_rightSide=true,
    legend_alignAsTable=true,
    legend_current=true,
    legend_values=true,
    legend_show=true,
  ).addTarget(prometheus.target(query, legendFormat='{{instance}} {{operation_type}}'));
// Graph panel 'Operation Error Rate': 5m rate of
// `kubelet_runtime_operations_errors_total`, one series per (instance, operation_type).
local operationErrorRate =
  local query = 'sum(rate(kubelet_runtime_operations_errors_total{%(clusterLabel)s="$cluster",%(kubeletSelector)s,instance=~"$instance"}[5m])) by (instance, operation_type)' % $._config;
  graphPanel.new(
    'Operation Error Rate',
    format='ops',
    min=0,
    span=6,
    datasource='$datasource',
    // Legend rendered as a table to the right of the graph, showing current values.
    legend_rightSide=true,
    legend_alignAsTable=true,
    legend_current=true,
    legend_values=true,
    legend_show=true,
  ).addTarget(prometheus.target(query, legendFormat='{{instance}} {{operation_type}}'));
// Full-width graph panel: 0.99 quantile of
// `kubelet_runtime_operations_duration_seconds`, computed from the 5m bucket
// rates per (instance, operation_type).
local operationLatency =
  local query = 'histogram_quantile(0.99, sum(rate(kubelet_runtime_operations_duration_seconds_bucket{%(clusterLabel)s="$cluster",%(kubeletSelector)s,instance=~"$instance"}[5m])) by (instance, operation_type, le))' % $._config;
  graphPanel.new(
    'Operation duration 99th quantile',
    format='s',
    span=12,
    datasource='$datasource',
    // Legend rendered as a table to the right of the graph, showing current values.
    legend_rightSide=true,
    legend_alignAsTable=true,
    legend_current=true,
    legend_values=true,
    legend_show=true,
  ).addTarget(prometheus.target(query, legendFormat='{{instance}} {{operation_type}}'));
// Graph panel 'Pod Start Rate' with two series per instance: the 5m rate of
// pod-start observations ("pod") and of pod-worker observations ("worker").
local podStartRate =
  local podQuery = 'sum(rate(kubelet_pod_start_duration_seconds_count{%(clusterLabel)s="$cluster",%(kubeletSelector)s,instance=~"$instance"}[5m])) by (instance)' % $._config;
  local workerQuery = 'sum(rate(kubelet_pod_worker_duration_seconds_count{%(clusterLabel)s="$cluster",%(kubeletSelector)s,instance=~"$instance"}[5m])) by (instance)' % $._config;
  graphPanel.new(
    'Pod Start Rate',
    format='ops',
    min=0,
    span=6,
    datasource='$datasource',
    // Legend rendered as a table to the right of the graph, showing current values.
    legend_rightSide=true,
    legend_alignAsTable=true,
    legend_current=true,
    legend_values=true,
    legend_show=true,
  )
  .addTarget(prometheus.target(podQuery, legendFormat='{{instance}} pod'))
  .addTarget(prometheus.target(workerQuery, legendFormat='{{instance}} worker'));
// Graph panel 'Pod Start Duration': 0.99 quantile of the pod-start and
// pod-worker duration histograms, one "pod" and one "worker" series per instance.
local podStartLatency =
  graphPanel.new(
    'Pod Start Duration',
    datasource='$datasource',
    span=6,
    min=0,
    format='s',
    legend_show=true,
    legend_values=true,
    legend_current=true,
    legend_alignAsTable=true,
    legend_rightSide=true,
  )
  // histogram_quantile() needs the per-bucket series (the ones carrying the `le`
  // label), so the pod query reads `_bucket`; the `_count` series has no `le`
  // label and yields no quantile.
  .addTarget(prometheus.target('histogram_quantile(0.99, sum(rate(kubelet_pod_start_duration_seconds_bucket{%(clusterLabel)s="$cluster",%(kubeletSelector)s,instance=~"$instance"}[5m])) by (instance, le))' % $._config, legendFormat='{{instance}} pod'))
  .addTarget(prometheus.target('histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{%(clusterLabel)s="$cluster",%(kubeletSelector)s,instance=~"$instance"}[5m])) by (instance, le))' % $._config, legendFormat='{{instance}} worker'));
// Graph panel 'Storage Operation Rate': 5m rate of
// `storage_operation_duration_seconds_count`, one series per
// (instance, operation_name, volume_plugin).
local storageOperationRate =
  local query = 'sum(rate(storage_operation_duration_seconds_count{%(clusterLabel)s="$cluster",%(kubeletSelector)s,instance=~"$instance"}[5m])) by (instance, operation_name, volume_plugin)' % $._config;
  graphPanel.new(
    'Storage Operation Rate',
    format='ops',
    min=0,
    span=6,
    datasource='$datasource',
    // Empty and all-zero series are hidden from the legend.
    legend_hideZero=true,
    legend_hideEmpty=true,
    legend_rightSide=true,
    legend_alignAsTable=true,
    legend_current=true,
    legend_values=true,
    legend_show=true,
  ).addTarget(prometheus.target(query, legendFormat='{{instance}} {{operation_name}} {{volume_plugin}}'));
// Graph panel 'Storage Operation Error Rate': 5m rate of
// `storage_operation_errors_total`, one series per
// (instance, operation_name, volume_plugin).
local storageOperationErrorRate =
  local query = 'sum(rate(storage_operation_errors_total{%(clusterLabel)s="$cluster",%(kubeletSelector)s,instance=~"$instance"}[5m])) by (instance, operation_name, volume_plugin)' % $._config;
  graphPanel.new(
    'Storage Operation Error Rate',
    format='ops',
    min=0,
    span=6,
    datasource='$datasource',
    // Empty and all-zero series are hidden from the legend.
    legend_hideZero=true,
    legend_hideEmpty=true,
    legend_rightSide=true,
    legend_alignAsTable=true,
    legend_current=true,
    legend_values=true,
    legend_show=true,
  ).addTarget(prometheus.target(query, legendFormat='{{instance}} {{operation_name}} {{volume_plugin}}'));
// Full-width graph panel: 0.99 quantile of `storage_operation_duration_seconds`,
// computed from the 5m bucket rates per (instance, operation_name, volume_plugin).
// `legend_show` is not passed here, so it stays at graphPanel.new's default.
local storageOperationLatency =
  local query = 'histogram_quantile(0.99, sum(rate(storage_operation_duration_seconds_bucket{%(clusterLabel)s="$cluster", %(kubeletSelector)s, instance=~"$instance"}[5m])) by (instance, operation_name, volume_plugin, le))' % $._config;
  graphPanel.new(
    'Storage Operation Duration 99th quantile',
    format='s',
    min=0,
    span=12,
    datasource='$datasource',
    // Empty and all-zero series are hidden from the legend.
    legend_hideZero=true,
    legend_hideEmpty=true,
    legend_rightSide=true,
    legend_alignAsTable=true,
    legend_current=true,
    legend_values=true,
  ).addTarget(prometheus.target(query, legendFormat='{{instance}} {{operation_name}} {{volume_plugin}}'));
// Graph panel 'Cgroup manager operation rate': 5m rate of
// `kubelet_cgroup_manager_duration_seconds_count`, one series per
// (instance, operation_type).
local cgroupManagerRate =
  graphPanel.new(
    'Cgroup manager operation rate',
    datasource='$datasource',
    span=6,
    min=0,
    format='ops',
    legend_show=true,
    legend_values=true,
    legend_current=true,
    legend_alignAsTable=true,
    legend_rightSide=true,
  )
  // The query groups by instance as well as operation_type, so the legend must
  // include {{instance}}; otherwise series from different kubelets share one name.
  .addTarget(prometheus.target('sum(rate(kubelet_cgroup_manager_duration_seconds_count{%(clusterLabel)s="$cluster", %(kubeletSelector)s, instance=~"$instance"}[5m])) by (instance, operation_type)' % $._config, legendFormat='{{instance}} {{operation_type}}'));
// Graph panel 'Cgroup manager 99th quantile': 0.99 quantile of
// `kubelet_cgroup_manager_duration_seconds`, computed from the 5m bucket rates
// per (instance, operation_type).
local cgroupManagerDuration =
  local query = 'histogram_quantile(0.99, sum(rate(kubelet_cgroup_manager_duration_seconds_bucket{%(clusterLabel)s="$cluster", %(kubeletSelector)s, instance=~"$instance"}[5m])) by (instance, operation_type, le))' % $._config;
  graphPanel.new(
    'Cgroup manager 99th quantile',
    format='s',
    min=0,
    span=6,
    datasource='$datasource',
    // Legend rendered as a table to the right of the graph, showing current values.
    legend_rightSide=true,
    legend_alignAsTable=true,
    legend_current=true,
    legend_values=true,
    legend_show=true,
  ).addTarget(prometheus.target(query, legendFormat='{{instance}} {{operation_type}}'));
// Graph panel 'PLEG relist rate' (PLEG = pod lifecycle event generator, per the
// panel description): 5m rate of `kubelet_pleg_relist_duration_seconds_count`
// per instance.
local plegRelistRate =
  local query = 'sum(rate(kubelet_pleg_relist_duration_seconds_count{%(clusterLabel)s="$cluster", %(kubeletSelector)s, instance=~"$instance"}[5m])) by (instance)' % $._config;
  graphPanel.new(
    'PLEG relist rate',
    description='Pod lifecycle event generator',
    format='ops',
    min=0,
    span=6,
    datasource='$datasource',
    // Legend rendered as a table to the right of the graph, showing current values.
    legend_rightSide=true,
    legend_alignAsTable=true,
    legend_current=true,
    legend_values=true,
    legend_show=true,
  ).addTarget(prometheus.target(query, legendFormat='{{instance}}'));
// Full-width graph panel 'PLEG relist duration': 0.99 quantile of
// `kubelet_pleg_relist_duration_seconds`, computed from the 5m bucket rates
// per instance.
local plegRelistDuration =
  local query = 'histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_duration_seconds_bucket{%(clusterLabel)s="$cluster",%(kubeletSelector)s,instance=~"$instance"}[5m])) by (instance, le))' % $._config;
  graphPanel.new(
    'PLEG relist duration',
    format='s',
    min=0,
    span=12,
    datasource='$datasource',
    // Legend rendered as a table to the right of the graph, showing current values.
    legend_rightSide=true,
    legend_alignAsTable=true,
    legend_current=true,
    legend_values=true,
    legend_show=true,
  ).addTarget(prometheus.target(query, legendFormat='{{instance}}'));
// Graph panel 'PLEG relist interval': 0.99 quantile of
// `kubelet_pleg_relist_interval_seconds`, computed from the 5m bucket rates
// per instance.
local plegRelistInterval =
  local query = 'histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_interval_seconds_bucket{%(clusterLabel)s="$cluster",%(kubeletSelector)s,instance=~"$instance"}[5m])) by (instance, le))' % $._config;
  graphPanel.new(
    'PLEG relist interval',
    format='s',
    min=0,
    span=6,
    datasource='$datasource',
    // Legend rendered as a table to the right of the graph, showing current values.
    legend_rightSide=true,
    legend_alignAsTable=true,
    legend_current=true,
    legend_values=true,
    legend_show=true,
  ).addTarget(prometheus.target(query, legendFormat='{{instance}}'));
// Full-width graph panel 'RPC Rate': 5m rate of `rest_client_requests_total`,
// split into four series by response code class (2xx, 3xx, 4xx, 5xx).
local rpcRate =
  // Label matchers shared by all four targets, rendered once from the mixin config.
  local matchers = '%(clusterLabel)s="$cluster",%(kubeletSelector)s, instance=~"$instance"' % $._config;
  // Target for one code class; `digit` is the leading digit of the response code.
  local targetFor(digit) = prometheus.target(
    'sum(rate(rest_client_requests_total{' + matchers + ',code=~"' + digit + '.."}[5m]))',
    legendFormat=digit + 'xx',
  );
  // Targets are attached in ascending code-class order: 2xx, 3xx, 4xx, 5xx.
  std.foldl(
    function(panel, digit) panel.addTarget(targetFor(digit)),
    ['2', '3', '4', '5'],
    graphPanel.new(
      'RPC Rate',
      format='ops',
      min=0,
      span=12,
      datasource='$datasource',
    ),
  );
// Full-width graph panel: 0.99 quantile of
// `rest_client_request_duration_seconds`, computed from the 5m bucket rates
// per (instance, verb, url).
local requestDuration =
  local query = 'histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{%(clusterLabel)s="$cluster",%(kubeletSelector)s, instance=~"$instance"}[5m])) by (instance, verb, url, le))' % $._config;
  graphPanel.new(
    'Request duration 99th quantile',
    format='s',
    min=0,
    span=12,
    datasource='$datasource',
    // Legend rendered as a table to the right of the graph, showing current values.
    legend_rightSide=true,
    legend_alignAsTable=true,
    legend_current=true,
    legend_values=true,
    legend_show=true,
  ).addTarget(prometheus.target(query, legendFormat='{{instance}} {{verb}} {{url}}'));
// Graph panel 'Memory': `process_resident_memory_bytes` for each selected
// kubelet instance, displayed with the 'bytes' format.
local memory =
  local query = 'process_resident_memory_bytes{%(clusterLabel)s="$cluster",%(kubeletSelector)s,instance=~"$instance"}' % $._config;
  graphPanel.new(
    'Memory',
    format='bytes',
    span=4,
    datasource='$datasource',
  ).addTarget(prometheus.target(query, legendFormat='{{instance}}'));
// Graph panel 'CPU usage': 5m rate of `process_cpu_seconds_total` for each
// selected kubelet instance.
local cpu =
  local query = 'rate(process_cpu_seconds_total{%(clusterLabel)s="$cluster",%(kubeletSelector)s,instance=~"$instance"}[5m])' % $._config;
  graphPanel.new(
    'CPU usage',
    min=0,
    format='short',
    span=4,
    datasource='$datasource',
  ).addTarget(prometheus.target(query, legendFormat='{{instance}}'));
// Graph panel 'Goroutines': `go_goroutines` for each selected kubelet instance.
local goroutines =
  local query = 'go_goroutines{%(clusterLabel)s="$cluster",%(kubeletSelector)s,instance=~"$instance"}' % $._config;
  graphPanel.new(
    'Goroutines',
    format='short',
    span=4,
    datasource='$datasource',
  ).addTarget(prometheus.target(query, legendFormat='{{instance}}'));
// Assemble the Kubelet dashboard: three template variables, then one dashboard
// row per group of panels, and finally the configured refresh setting.

// Datasource picker variable, written as a raw template object.
local datasourceVariable = {
  current: {
    text: 'default',
    value: 'default',
  },
  hide: 0,
  label: null,
  name: 'datasource',
  options: [],
  query: 'prometheus',
  refresh: 1,
  regex: '',
  type: 'datasource',
};

// Cluster variable, populated from the cluster label of `kube_pod_info`;
// `hide` is 'variable' unless `showMultiCluster` is set in the config.
local clusterVariable = template.new(
  'cluster',
  '$datasource',
  'label_values(kube_pod_info, %(clusterLabel)s)' % $._config,
  label='cluster',
  refresh='time',
  hide=if $._config.showMultiCluster then '' else 'variable',
  sort=1,
);

// Instance variable, populated from the `instance` label of
// `kubelet_runtime_operations_total` in the selected cluster; offers an "All" option.
local instanceVariable = template.new(
  'instance',
  '$datasource',
  'label_values(kubelet_runtime_operations_total{%(clusterLabel)s="$cluster", %(kubeletSelector)s}, instance)' % $._config,
  refresh='time',
  includeAll=true,
  sort=1,
);

// Panels grouped by dashboard row, in display order.
local panelRows = [
  [upCount, runningPodCount, runningContainerCount, actualVolumeCount, desiredVolumeCount, configErrorCount],
  [operationRate, operationErrorRate],
  [operationLatency],
  [podStartRate, podStartLatency],
  [storageOperationRate, storageOperationErrorRate],
  [storageOperationLatency],
  [cgroupManagerRate, cgroupManagerDuration],
  [plegRelistRate, plegRelistInterval],
  [plegRelistDuration],
  [rpcRate],
  [requestDuration],
  [memory, cpu, goroutines],
];

// Build a fresh row and add the given panels to it in order.
local buildRow(panels) = std.foldl(function(r, p) r.addPanel(p), panels, row.new());

// Dashboard shell with its template variables attached, before any rows.
local base =
  dashboard.new(
    '%(dashboardNamePrefix)sKubelet' % $._config.grafanaK8s,
    time_from='now-1h',
    uid=($._config.grafanaDashboardIDs['kubelet.json']),
    tags=($._config.grafanaK8s.dashboardTags),
  )
  .addTemplate(datasourceVariable)
  .addTemplate(clusterVariable)
  .addTemplate(instanceVariable);

std.foldl(function(dash, panels) dash.addRow(buildRow(panels)), panelRows, base) + { refresh: $._config.grafanaK8s.refresh },
},
}