// gitops-tbrnt/monitoring/vendor/github.com/kubernetes-monitoring/kubernetes-mixin/alerts/apps_alerts.libsonnet

{
_config+:: {
kubeStateMetricsSelector: error 'must provide selector for kube-state-metrics',
namespaceSelector: null,
prefixedNamespaceSelector: if self.namespaceSelector != null then self.namespaceSelector + ',' else '',
},
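// kubeStateMetricsSelector and namespaceSelector are meant to be overridden
// by the consumer of this mixin; the error above makes a missing selector
// fail at evaluation time. A minimal sketch of an override (the label values
// here are assumptions, not something this file defines):
//
//   _config+:: {
//     kubeStateMetricsSelector: 'job="kube-state-metrics"',
//     namespaceSelector: 'namespace=~"kube-system|monitoring"',
//   },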
prometheusAlerts+:: {
groups+: [
{
name: 'kubernetes-apps',
rules: [
{
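// rate() over 5m gives restarts per second; multiplying by 60 * 5 rescales
// that to restarts per 5 minutes, so any value above 0 means the container
// restarted at least once in the window, and 'for: 15m' requires the
// condition to persist before the alert fires.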
expr: |||
rate(kube_pod_container_status_restarts_total{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}[5m]) * 60 * 5 > 0
||| % $._config,
labels: {
severity: 'warning',
},
annotations: {
description: 'Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is restarting {{ printf "%.2f" $value }} times / 5 minutes.',
summary: 'Pod is crash looping.',
},
'for': '15m',
alert: 'KubePodCrashLooping',
},
{
// We wrap kube_pod_owner with the topk() aggregator to ensure that
// every (namespace, pod) tuple is unique even if the "owner_kind"
// label exists for 2 values. This avoids "many-to-many matching
// not allowed" errors when joining with kube_pod_status_phase.
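// As an illustration only, assuming kubeStateMetricsSelector were set to
// 'job="kube-state-metrics"' and namespaceSelector were left null, the
// template below would render to:
//
//   sum by (namespace, pod) (
//     max by(namespace, pod) (
//       kube_pod_status_phase{job="kube-state-metrics", phase=~"Pending|Unknown"}
//     ) * on(namespace, pod) group_left(owner_kind) topk by(namespace, pod) (
//       1, max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!="Job"})
//     )
//   ) > 0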
expr: |||
sum by (namespace, pod) (
max by(namespace, pod) (
kube_pod_status_phase{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s, phase=~"Pending|Unknown"}
) * on(namespace, pod) group_left(owner_kind) topk by(namespace, pod) (
1, max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!="Job"})
)
) > 0
||| % $._config,
labels: {
severity: 'warning',
},
annotations: {
description: 'Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for longer than 15 minutes.',
summary: 'Pod has been in a non-ready state for more than 15 minutes.',
},
'for': '15m',
alert: 'KubePodNotReady',
},
{
expr: |||
kube_deployment_status_observed_generation{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}
!=
kube_deployment_metadata_generation{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}
||| % $._config,
labels: {
severity: 'warning',
},
annotations: {
description: 'Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment }} does not match; this indicates that the Deployment has failed but has not been rolled back.',
summary: 'Deployment generation mismatch due to possible roll-back',
},
'for': '15m',
alert: 'KubeDeploymentGenerationMismatch',
},
{
expr: |||
(
kube_deployment_spec_replicas{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}
!=
kube_deployment_status_replicas_available{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}
) and (
changes(kube_deployment_status_replicas_updated{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}[5m])
==
0
)
||| % $._config,
labels: {
severity: 'warning',
},
annotations: {
description: 'Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not matched the expected number of replicas for longer than 15 minutes.',
summary: 'Deployment has not matched the expected number of replicas.',
},
'for': '15m',
alert: 'KubeDeploymentReplicasMismatch',
},
{
expr: |||
(
kube_statefulset_status_replicas_ready{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}
!=
kube_statefulset_status_replicas{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}
) and (
changes(kube_statefulset_status_replicas_updated{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}[5m])
==
0
)
||| % $._config,
labels: {
severity: 'warning',
},
annotations: {
description: 'StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not matched the expected number of replicas for longer than 15 minutes.',
summary: 'StatefulSet has not matched the expected number of replicas.',
},
'for': '15m',
alert: 'KubeStatefulSetReplicasMismatch',
},
{
expr: |||
kube_statefulset_status_observed_generation{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}
!=
kube_statefulset_metadata_generation{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}
||| % $._config,
labels: {
severity: 'warning',
},
annotations: {
description: 'StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset }} does not match; this indicates that the StatefulSet has failed but has not been rolled back.',
summary: 'StatefulSet generation mismatch due to possible roll-back',
},
'for': '15m',
alert: 'KubeStatefulSetGenerationMismatch',
},
{
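// Roughly: the StatefulSet still reports a current revision that is not the
// update revision, the number of updated replicas has not reached the desired
// replica count, and that updated-replica count has been flat for the last
// 5 minutes.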
expr: |||
(
max without (revision) (
kube_statefulset_status_current_revision{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}
unless
kube_statefulset_status_update_revision{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}
)
*
(
kube_statefulset_replicas{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}
!=
kube_statefulset_status_replicas_updated{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}
)
) and (
changes(kube_statefulset_status_replicas_updated{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}[5m])
==
0
)
||| % $._config,
labels: {
severity: 'warning',
},
annotations: {
description: 'StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out.',
summary: 'StatefulSet update has not been rolled out.',
},
'for': '15m',
alert: 'KubeStatefulSetUpdateNotRolledOut',
},
{
alert: 'KubeDaemonSetRolloutStuck',
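// "Stuck" here means any of: not every desired pod is scheduled, some pods
// run on nodes where they should not, not every pod runs the updated
// template, or not every pod is available, while the count of updated pods
// has not changed in the last 5 minutes.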
expr: |||
(
(
kube_daemonset_status_current_number_scheduled{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}
!=
kube_daemonset_status_desired_number_scheduled{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}
) or (
kube_daemonset_status_number_misscheduled{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}
!=
0
) or (
kube_daemonset_updated_number_scheduled{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}
!=
kube_daemonset_status_desired_number_scheduled{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}
) or (
kube_daemonset_status_number_available{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}
!=
kube_daemonset_status_desired_number_scheduled{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}
)
) and (
changes(kube_daemonset_updated_number_scheduled{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}[5m])
==
0
)
||| % $._config,
labels: {
severity: 'warning',
},
annotations: {
description: 'DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not finished or progressed for at least 15 minutes.',
summary: 'DaemonSet rollout is stuck.',
},
'for': '15m',
},
{
expr: |||
sum by (namespace, pod, container) (kube_pod_container_status_waiting_reason{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}) > 0
||| % $._config,
labels: {
severity: 'warning',
},
annotations: {
description: 'Pod {{ $labels.namespace }}/{{ $labels.pod }} container {{ $labels.container }} has been in a waiting state for longer than 1 hour.',
summary: 'Pod container waiting longer than 1 hour',
},
'for': '1h',
alert: 'KubeContainerWaiting',
},
{
alert: 'KubeDaemonSetNotScheduled',
expr: |||
kube_daemonset_status_desired_number_scheduled{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}
-
kube_daemonset_status_current_number_scheduled{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s} > 0
||| % $._config,
labels: {
severity: 'warning',
},
annotations: {
description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled.',
summary: 'DaemonSet pods are not scheduled.',
},
'for': '10m',
},
{
alert: 'KubeDaemonSetMisScheduled',
expr: |||
kube_daemonset_status_number_misscheduled{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s} > 0
||| % $._config,
labels: {
severity: 'warning',
},
annotations: {
description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run.',
summary: 'DaemonSet pods are misscheduled.',
},
'for': '15m',
},
{
alert: 'KubeJobCompletion',
expr: |||
kube_job_spec_completions{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s} - kube_job_status_succeeded{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s} > 0
||| % $._config,
'for': '12h',
labels: {
severity: 'warning',
},
annotations: {
description: 'Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than 12 hours to complete.',
summary: 'Job did not complete in time',
},
},
{
alert: 'KubeJobFailed',
expr: |||
kube_job_failed{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s} > 0
||| % $._config,
'for': '15m',
labels: {
severity: 'warning',
},
annotations: {
description: 'Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete. Removing failed job after investigation should clear this alert.',
summary: 'Job failed to complete.',
},
},
{
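// Fires only when the HPA wants a different replica count than it currently
// has, the current count sits strictly between the configured min and max
// (so it is not simply pinned at a bound), and the current count has not
// changed for 15 minutes.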
expr: |||
(kube_hpa_status_desired_replicas{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}
!=
kube_hpa_status_current_replicas{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s})
and
(kube_hpa_status_current_replicas{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}
>
kube_hpa_spec_min_replicas{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s})
and
(kube_hpa_status_current_replicas{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}
<
kube_hpa_spec_max_replicas{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s})
and
changes(kube_hpa_status_current_replicas[15m]) == 0
||| % $._config,
labels: {
severity: 'warning',
},
annotations: {
description: 'HPA {{ $labels.namespace }}/{{ $labels.hpa }} has not matched the desired number of replicas for longer than 15 minutes.',
summary: 'HPA has not matched the desired number of replicas.',
},
'for': '15m',
alert: 'KubeHpaReplicasMismatch',
},
{
expr: |||
kube_hpa_status_current_replicas{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}
==
kube_hpa_spec_max_replicas{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}
||| % $._config,
labels: {
severity: 'warning',
},
annotations: {
description: 'HPA {{ $labels.namespace }}/{{ $labels.hpa }} has been running at max replicas for longer than 15 minutes.',
summary: 'HPA is running at max replicas',
},
'for': '15m',
alert: 'KubeHpaMaxedOut',
},
],
},
],
},
}
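// A minimal rendering sketch, not part of the mixin itself; the entry point,
// import path and selector value below are assumptions based on the vendored
// layout of this repository:
//
//   // alerts.jsonnet
//   local mixin = (import 'github.com/kubernetes-monitoring/kubernetes-mixin/mixin.libsonnet') + {
//     _config+:: {
//       kubeStateMetricsSelector: 'job="kube-state-metrics"',
//     },
//   };
//   mixin.prometheusAlerts
//
// which could then be evaluated with, for example:
//
//   jsonnet -J vendor alerts.jsonnet > kubernetes-apps-alerts.json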