{
  _config+:: {
    kubeStateMetricsSelector: error 'must provide selector for kube-state-metrics',
    namespaceSelector: null,
    prefixedNamespaceSelector: if self.namespaceSelector != null then self.namespaceSelector + ',' else '',
  },
  prometheusAlerts+:: {
    groups+: [
      {
        name: 'kubernetes-apps',
        rules: [
          {
            expr: |||
              rate(kube_pod_container_status_restarts_total{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}[5m]) * 60 * 5 > 0
            ||| % $._config,
            labels: {
              severity: 'warning',
            },
            annotations: {
              description: 'Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is restarting {{ printf "%.2f" $value }} times / 5 minutes.',
              summary: 'Pod is crash looping.',
            },
            'for': '15m',
            alert: 'KubePodCrashLooping',
          },
          {
            // We wrap kube_pod_owner with the topk() aggregator to ensure that
            // every (namespace, pod) tuple is unique even if the "owner_kind"
            // label exists for 2 values. This avoids "many-to-many matching
            // not allowed" errors when joining with kube_pod_status_phase.
            expr: |||
              sum by (namespace, pod) (
                max by(namespace, pod) (
                  kube_pod_status_phase{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s, phase=~"Pending|Unknown"}
                ) * on(namespace, pod) group_left(owner_kind) topk by(namespace, pod) (
                  1, max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!="Job"})
                )
              ) > 0
            ||| % $._config,
            labels: {
              severity: 'warning',
            },
            annotations: {
              description: 'Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for longer than 15 minutes.',
              summary: 'Pod has been in a non-ready state for more than 15 minutes.',
            },
            'for': '15m',
            alert: 'KubePodNotReady',
          },
          {
            expr: |||
              kube_deployment_status_observed_generation{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}
                !=
              kube_deployment_metadata_generation{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}
            ||| % $._config,
            labels: {
              severity: 'warning',
            },
            annotations: {
              description: 'Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment }} does not match; this indicates that the Deployment has failed but has not been rolled back.',
              summary: 'Deployment generation mismatch due to possible roll-back',
            },
            'for': '15m',
            alert: 'KubeDeploymentGenerationMismatch',
          },
          {
            expr: |||
              (
                kube_deployment_spec_replicas{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}
                  !=
                kube_deployment_status_replicas_available{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}
              ) and (
                changes(kube_deployment_status_replicas_updated{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}[5m])
                  ==
                0
              )
            ||| % $._config,
            labels: {
              severity: 'warning',
            },
            annotations: {
              description: 'Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not matched the expected number of replicas for longer than 15 minutes.',
              summary: 'Deployment has not matched the expected number of replicas.',
            },
            'for': '15m',
            alert: 'KubeDeploymentReplicasMismatch',
          },
          {
            expr: |||
              (
                kube_statefulset_status_replicas_ready{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}
                  !=
                kube_statefulset_status_replicas{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}
              ) and (
                changes(kube_statefulset_status_replicas_updated{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}[5m])
                  ==
                0
              )
            ||| % $._config,
            labels: {
              severity: 'warning',
            },
            annotations: {
              description: 'StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not matched the expected number of replicas for longer than 15 minutes.',
              summary: 'StatefulSet has not matched the expected number of replicas.',
            },
            'for': '15m',
            alert: 'KubeStatefulSetReplicasMismatch',
          },
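          // Note on templating: each %(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s
          // pair in the expressions above and below is substituted from $._config when the
          // rules are rendered. With illustrative (non-default) values such as
          // namespaceSelector: 'namespace=~"kube-.*"' and
          // kubeStateMetricsSelector: 'job="kube-state-metrics"', a selector like
          //   kube_statefulset_status_replicas{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}
          // renders to
          //   kube_statefulset_status_replicas{namespace=~"kube-.*",job="kube-state-metrics"}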
          {
            expr: |||
              kube_statefulset_status_observed_generation{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}
                !=
              kube_statefulset_metadata_generation{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}
            ||| % $._config,
            labels: {
              severity: 'warning',
            },
            annotations: {
              description: 'StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset }} does not match; this indicates that the StatefulSet has failed but has not been rolled back.',
              summary: 'StatefulSet generation mismatch due to possible roll-back',
            },
            'for': '15m',
            alert: 'KubeStatefulSetGenerationMismatch',
          },
          {
            expr: |||
              (
                max without (revision) (
                  kube_statefulset_status_current_revision{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}
                    unless
                  kube_statefulset_status_update_revision{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}
                )
                  *
                (
                  kube_statefulset_replicas{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}
                    !=
                  kube_statefulset_status_replicas_updated{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}
                )
              ) and (
                changes(kube_statefulset_status_replicas_updated{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}[5m])
                  ==
                0
              )
            ||| % $._config,
            labels: {
              severity: 'warning',
            },
            annotations: {
              description: 'StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out.',
              summary: 'StatefulSet update has not been rolled out.',
            },
            'for': '15m',
            alert: 'KubeStatefulSetUpdateNotRolledOut',
          },
          {
            alert: 'KubeDaemonSetRolloutStuck',
            expr: |||
              (
                (
                  kube_daemonset_status_current_number_scheduled{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}
                    !=
                  kube_daemonset_status_desired_number_scheduled{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}
                ) or (
                  kube_daemonset_status_number_misscheduled{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}
                    !=
                  0
                ) or (
                  kube_daemonset_updated_number_scheduled{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}
                    !=
                  kube_daemonset_status_desired_number_scheduled{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}
                ) or (
                  kube_daemonset_status_number_available{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}
                    !=
                  kube_daemonset_status_desired_number_scheduled{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}
                )
              ) and (
                changes(kube_daemonset_updated_number_scheduled{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}[5m])
                  ==
                0
              )
            ||| % $._config,
            labels: {
              severity: 'warning',
            },
            annotations: {
              description: 'DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not finished or progressed for at least 15 minutes.',
              summary: 'DaemonSet rollout is stuck.',
            },
            'for': '15m',
          },
          {
            expr: |||
              sum by (namespace, pod, container) (kube_pod_container_status_waiting_reason{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}) > 0
            ||| % $._config,
            labels: {
              severity: 'warning',
            },
            annotations: {
              description: 'Pod {{ $labels.namespace }}/{{ $labels.pod }} container {{ $labels.container }} has been in waiting state for longer than 1 hour.',
              summary: 'Pod container waiting longer than 1 hour',
            },
            'for': '1h',
            alert: 'KubeContainerWaiting',
          },
          {
            alert: 'KubeDaemonSetNotScheduled',
            expr: |||
              kube_daemonset_status_desired_number_scheduled{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}
                -
              kube_daemonset_status_current_number_scheduled{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s} > 0
            ||| % $._config,
            labels: {
              severity: 'warning',
            },
            annotations: {
              description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled.',
              summary: 'DaemonSet pods are not scheduled.',
            },
            'for': '10m',
          },
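          // The two DaemonSet scheduling alerts are complementary: KubeDaemonSetNotScheduled
          // (above) fires when desired pods are missing from nodes that should run them
          // (desired - current > 0), while KubeDaemonSetMisScheduled (below) fires when pods
          // are running on nodes they are not supposed to run on (misscheduled > 0).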
          {
            alert: 'KubeDaemonSetMisScheduled',
            expr: |||
              kube_daemonset_status_number_misscheduled{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s} > 0
            ||| % $._config,
            labels: {
              severity: 'warning',
            },
            annotations: {
              description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run.',
              summary: 'DaemonSet pods are misscheduled.',
            },
            'for': '15m',
          },
          {
            alert: 'KubeJobCompletion',
            expr: |||
              kube_job_spec_completions{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s} - kube_job_status_succeeded{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s} > 0
            ||| % $._config,
            'for': '12h',
            labels: {
              severity: 'warning',
            },
            annotations: {
              description: 'Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than 12 hours to complete.',
              summary: 'Job did not complete in time',
            },
          },
          {
            alert: 'KubeJobFailed',
            expr: |||
              kube_job_failed{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s} > 0
            ||| % $._config,
            'for': '15m',
            labels: {
              severity: 'warning',
            },
            annotations: {
              description: 'Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete. Removing failed job after investigation should clear this alert.',
              summary: 'Job failed to complete.',
            },
          },
          {
            expr: |||
              (kube_hpa_status_desired_replicas{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}
                !=
              kube_hpa_status_current_replicas{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s})
                and
              changes(kube_hpa_status_current_replicas[15m]) == 0
            ||| % $._config,
            labels: {
              severity: 'warning',
            },
            annotations: {
              description: 'HPA {{ $labels.namespace }}/{{ $labels.hpa }} has not matched the desired number of replicas for longer than 15 minutes.',
              summary: 'HPA has not matched the desired number of replicas.',
            },
            'for': '15m',
            alert: 'KubeHpaReplicasMismatch',
          },
          {
            expr: |||
              kube_hpa_status_current_replicas{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}
                ==
              kube_hpa_spec_max_replicas{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}
            ||| % $._config,
            labels: {
              severity: 'warning',
            },
            annotations: {
              description: 'HPA {{ $labels.namespace }}/{{ $labels.hpa }} has been running at max replicas for longer than 15 minutes.',
              summary: 'HPA is running at max replicas',
            },
            'for': '15m',
            alert: 'KubeHpaMaxedOut',
          },
        ],
      },
    ],
  },
}
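
// Usage sketch (not part of the mixin itself): a consumer must at least set
// kubeStateMetricsSelector, since it defaults to an error. The import path and label
// values below are assumptions for illustration only.
//
//   (import 'apps.libsonnet') + {
//     _config+:: {
//       kubeStateMetricsSelector: 'job="kube-state-metrics"',
//       namespaceSelector: 'namespace=~"prod-.*"',  // optional; null disables namespace filtering
//     },
//   }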