This repository has been archived on 2023-04-02. You can view files and clone it, but cannot push or open issues or pull requests.
gitops-tbrnt/monitoring/vendor/github.com/kubernetes-monitoring/kubernetes-mixin/alerts/resource_alerts.libsonnet

170 lines
7.0 KiB
Plaintext

{
_config+:: {
  // Selectors for the kube-state-metrics and node-exporter targets. There is
  // no default: evaluating either field without overriding it raises the
  // error given here.
  kubeStateMetricsSelector: error 'must provide selector for kube-state-metrics',
  nodeExporterSelector: error 'must provide selector for node-exporter',

  // Optional namespace selector. The prefixed variant appends a trailing
  // comma so it can be placed directly in front of another selector, and
  // collapses to the empty string when no namespace selector is configured.
  namespaceSelector: null,
  prefixedNamespaceSelector:
    if self.namespaceSelector == null
    then ''
    else self.namespaceSelector + ',',

  // We alert when the aggregate (CPU, Memory) quota for all namespaces is
  // greater than the amount of resources in the cluster. You may still
  // overcommit deliberately: this factor is the quota-to-allocatable ratio
  // above which the quota overcommit alerts fire.
  namespaceOvercommitFactor: 1.5,

  // CPUThrottlingHigh threshold, expressed as a percentage of CFS periods
  // that were throttled, plus an optional extra selector for the throttling
  // metrics.
  cpuThrottlingPercent: 25,
  cpuThrottlingSelector: '',

  // Set this selector to select the namespaces that contain resources used
  // for overprovisioning.
  // See https://github.com/kubernetes/autoscaler/blob/master/cluster-autoscaler/FAQ.md#how-can-i-configure-overprovisioning-with-cluster-autoscaler
  // for more details.
  ignoringOverprovisionedWorkloadSelector: '',
},
prometheusAlerts+:: {
groups+: [
{
name: 'kubernetes-resources',
rules: [
{
  // Fires when the fraction of allocatable CPU requested by Pods exceeds
  // (N-1)/N, where N is the number of series of
  // kube_node_status_allocatable_cpu_cores. Above that fraction the requests
  // no longer fit if one node's share of capacity is lost.
  //
  // The trailing `and (N-1) > 0` clause suppresses the alert when N is 1:
  // the threshold would otherwise be 0 and the alert would fire as soon as
  // any CPU is requested, which is not actionable on a single-node cluster.
  alert: 'KubeCPUOvercommit',
  expr: |||
    sum(namespace:kube_pod_container_resource_requests_cpu_cores:sum{%(ignoringOverprovisionedWorkloadSelector)s})
    /
    sum(kube_node_status_allocatable_cpu_cores)
    >
    (count(kube_node_status_allocatable_cpu_cores)-1) / count(kube_node_status_allocatable_cpu_cores)
    and
    (count(kube_node_status_allocatable_cpu_cores)-1) > 0
  ||| % $._config,
  // 'for' must stay quoted: it is a reserved word in Jsonnet.
  'for': '5m',
  labels: {
    severity: 'warning',
  },
  annotations: {
    description: 'Cluster has overcommitted CPU resource requests for Pods and cannot tolerate node failure.',
    summary: 'Cluster has overcommitted CPU resource requests.',
  },
},
{
  // Fires when the fraction of allocatable memory requested by Pods exceeds
  // (N-1)/N, where N is the number of series of
  // kube_node_status_allocatable_memory_bytes. Above that fraction the
  // requests no longer fit if one node's share of capacity is lost.
  //
  // The trailing `and (N-1) > 0` clause suppresses the alert when N is 1:
  // the threshold would otherwise be 0 and the alert would fire as soon as
  // any memory is requested, which is not actionable on a single-node
  // cluster.
  alert: 'KubeMemoryOvercommit',
  expr: |||
    sum(namespace:kube_pod_container_resource_requests_memory_bytes:sum{%(ignoringOverprovisionedWorkloadSelector)s})
    /
    sum(kube_node_status_allocatable_memory_bytes)
    >
    (count(kube_node_status_allocatable_memory_bytes)-1)
    /
    count(kube_node_status_allocatable_memory_bytes)
    and
    (count(kube_node_status_allocatable_memory_bytes)-1) > 0
  ||| % $._config,
  // 'for' must stay quoted: it is a reserved word in Jsonnet.
  'for': '5m',
  labels: {
    severity: 'warning',
  },
  annotations: {
    description: 'Cluster has overcommitted memory resource requests for Pods and cannot tolerate node failure.',
    summary: 'Cluster has overcommitted memory resource requests.',
  },
},
{
  // Fires when the sum of hard CPU quota across the selected namespaces,
  // divided by the cluster's allocatable CPU, exceeds
  // _config.namespaceOvercommitFactor.
  //
  // The allocatable-CPU series are restricted with kubeStateMetricsSelector,
  // exactly like the quota series in the numerator and like the memory
  // counterpart of this alert, so that both sides of the ratio are read from
  // the same set of targets.
  alert: 'KubeCPUQuotaOvercommit',
  expr: |||
    sum(kube_resourcequota{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s, type="hard", resource="cpu"})
    /
    sum(kube_node_status_allocatable_cpu_cores{%(kubeStateMetricsSelector)s})
    > %(namespaceOvercommitFactor)s
  ||| % $._config,
  // 'for' must stay quoted: it is a reserved word in Jsonnet.
  'for': '5m',
  labels: {
    severity: 'warning',
  },
  annotations: {
    description: 'Cluster has overcommitted CPU resource requests for Namespaces.',
    summary: 'Cluster has overcommitted CPU resource requests.',
  },
},
{
  // Memory counterpart of the CPU quota overcommit rule: compares the summed
  // hard memory quota of the selected namespaces against the cluster's
  // allocatable memory and fires once the ratio is above
  // _config.namespaceOvercommitFactor for five minutes.
  alert: 'KubeMemoryQuotaOvercommit',
  'for': '5m',
  labels: { severity: 'warning' },
  annotations: {
    summary: 'Cluster has overcommitted memory resource requests.',
    description: 'Cluster has overcommitted memory resource requests for Namespaces.',
  },
  expr: |||
    sum(kube_resourcequota{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s, type="hard", resource="memory"})
    /
    sum(kube_node_status_allocatable_memory_bytes{%(kubeStateMetricsSelector)s})
    > %(namespaceOvercommitFactor)s
  ||| % $._config,
},
{
  // Informational alert: a resource quota's usage is strictly between 90%
  // and 100% of its hard limit for 15 minutes.
  //
  // The `> 0` filter on the hard series drops quotas whose limit is zero
  // before dividing; `ignoring(instance, job, type)` lets the "used" and
  // "hard" series pair up even though those labels differ.
  alert: 'KubeQuotaAlmostFull',
  'for': '15m',
  labels: { severity: 'info' },
  annotations: {
    summary: 'Namespace quota is going to be full.',
    description: 'Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota.',
  },
  expr: |||
    kube_resourcequota{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s, type="used"}
    / ignoring(instance, job, type)
    (kube_resourcequota{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s, type="hard"} > 0)
    > 0.9 < 1
  ||| % $._config,
},
{
  // Informational alert: a resource quota's usage equals its hard limit
  // (used / hard == 1) for 15 minutes.
  //
  // Quotas with a zero hard limit are filtered out by `> 0` before the
  // division; the instance, job and type labels are ignored when pairing the
  // "used" series with the "hard" series.
  alert: 'KubeQuotaFullyUsed',
  'for': '15m',
  labels: { severity: 'info' },
  annotations: {
    summary: 'Namespace quota is fully used.',
    description: 'Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota.',
  },
  expr: |||
    kube_resourcequota{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s, type="used"}
    / ignoring(instance, job, type)
    (kube_resourcequota{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s, type="hard"} > 0)
    == 1
  ||| % $._config,
},
{
  // Warning alert: a resource quota's usage is above its hard limit
  // (used / hard > 1) for 15 minutes.
  //
  // Quotas with a zero hard limit are filtered out by `> 0` before the
  // division; the instance, job and type labels are ignored when pairing the
  // "used" series with the "hard" series.
  alert: 'KubeQuotaExceeded',
  'for': '15m',
  labels: { severity: 'warning' },
  annotations: {
    summary: 'Namespace quota has exceeded the limits.',
    description: 'Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota.',
  },
  expr: |||
    kube_resourcequota{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s, type="used"}
    / ignoring(instance, job, type)
    (kube_resourcequota{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s, type="hard"} > 0)
    > 1
  ||| % $._config,
},
{
  // Informational alert: over a 5 minute window, the ratio of throttled CFS
  // periods to total CFS periods for a (container, pod, namespace) is above
  // _config.cpuThrottlingPercent / 100, sustained for 15 minutes.
  //
  // _config.cpuThrottlingSelector is spliced into both metric selectors; the
  // throttled-periods side additionally excludes series whose container
  // label is empty.
  alert: 'CPUThrottlingHigh',
  'for': '15m',
  labels: { severity: 'info' },
  annotations: {
    summary: 'Processes experience elevated CPU throttling.',
    description: '{{ $value | humanizePercentage }} throttling of CPU in namespace {{ $labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod }}.',
  },
  expr: |||
    sum(increase(container_cpu_cfs_throttled_periods_total{container!="", %(cpuThrottlingSelector)s}[5m])) by (container, pod, namespace)
    /
    sum(increase(container_cpu_cfs_periods_total{%(cpuThrottlingSelector)s}[5m])) by (container, pod, namespace)
    > ( %(cpuThrottlingPercent)s / 100 )
  ||| % $._config,
},
],
},
],
},
}