This repository has been archived on 2023-04-02. You can view files and clone it, but cannot push or open issues or pull requests.
gitops-tbrnt/monitoring/manifests/k8rules.yaml

41 lines
1.3 KiB
YAML

---
# PrometheusRule defining alerting rules for K8up / restic backups.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: k8up
  # NOTE(review): these labels are presumably what the Prometheus instance's
  # ruleSelector matches on -- confirm against the Prometheus resource.
  labels:
    prometheus: k8s
    role: alert-rules
spec:
  groups:
    - name: k8up.rules
      rules:
        # Fires when the last restic backup reported at least one error.
        - alert: baas_last_errors
          expr: baas_backup_restic_last_errors > 0
          for: 1m
          labels:
            severity: critical
          annotations:
            summary: "Amount of errors of last restic backup"
            description: "This alert is fired when error number is > 0"

        # Fires when the failed-jobs counter increased within the last day.
        - alert: K8upBackupFailed
          expr: rate(k8up_jobs_failed_counter[1d]) > 0
          for: 1m
          labels:
            severity: critical
          annotations:
            summary: "Job in {{ $labels.namespace }} of type {{ $labels.jobType }} failed"

        # Fires for namespaces whose schedules gauge is > 0 but whose job
        # counter did not increase over the last 25h.
        # The aggregation must keep the `namespace` label: a bare `sum(...)`
        # drops all labels, so `and on(namespace)` could never match a series
        # carrying a namespace, and `$labels.namespace` would be empty.
        - alert: K8upBackupNotRunning
          expr: >-
            sum by (namespace) (rate(k8up_jobs_total[25h])) == 0
            and on(namespace) k8up_schedules_gauge > 0
          for: 1m
          labels:
            severity: critical
          annotations:
            summary: "No K8up jobs were run in {{ $labels.namespace }} within the last 24 hours. Check the operator, there might be a deadlock"

        # Fires when backup jobs have stayed queued for 24h in a namespace
        # whose schedules gauge is > 0.
        - alert: K8upJobStuck
          expr: >-
            k8up_jobs_queued_gauge{jobType="backup"} > 0
            and on(namespace) k8up_schedules_gauge > 0
          for: 24h
          labels:
            severity: critical
          annotations:
            summary: "K8up jobs are stuck in {{ $labels.namespace }} for the last 24 hours."