41 lines
1.3 KiB
YAML
41 lines
1.3 KiB
YAML
# PrometheusRule (Prometheus Operator custom resource) that defines the
# "k8up.rules" alert group: four critical alerts on K8up/restic backup metrics.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: k8up
  labels:
    # NOTE(review): presumably these labels are what the Prometheus
    # resource's ruleSelector matches on - confirm against the cluster setup.
    prometheus: k8s
    role: alert-rules
spec:
  groups:
    - name: k8up.rules
      rules:
        # Fires when the last-errors metric has been above zero for 1 minute.
        - alert: baas_last_errors
          expr: baas_backup_restic_last_errors > 0
          for: 1m
          labels:
            severity: critical
          annotations:
            summary: Amount of errors of last restic backup
            description: This alert is fired when error number is > 0
        # Fires when the failed-jobs counter increased at any point within
        # the trailing 1-day window (per-second rate over 1d is non-zero).
        - alert: K8upBackupFailed
          expr: rate(k8up_jobs_failed_counter[1d]) > 0
          for: 1m
          labels:
            severity: critical
          annotations:
            summary: "Job in {{ $labels.namespace }} of type {{ $labels.jobType }} failed"
        # Fires when a namespace has at least one schedule but its job counter
        # did not move during the last 25h.
        # The aggregation must keep the `namespace` label ("by (namespace)"):
        # a bare sum() drops all labels, so the `and on(namespace)` match
        # against k8up_schedules_gauge could never succeed and
        # {{ $labels.namespace }} would render empty.
        # NOTE(review): assumes k8up_jobs_total carries a `namespace` label,
        # as the on(namespace) clause and the summary template imply - confirm.
        - alert: K8upBackupNotRunning
          expr: sum by (namespace) (rate(k8up_jobs_total[25h])) == 0 and on(namespace) k8up_schedules_gauge > 0
          for: 1m
          labels:
            severity: critical
          annotations:
            summary: "No K8up jobs were run in {{ $labels.namespace }} within the last 24 hours. Check the operator, there might be a deadlock"
        # Fires when backup jobs have stayed queued for 24h in a namespace
        # that has at least one schedule.
        - alert: K8upJobStuck
          expr: 'k8up_jobs_queued_gauge{jobType="backup"} > 0 and on(namespace) k8up_schedules_gauge > 0'
          for: 24h
          labels:
            severity: critical
          annotations:
            summary: "K8up jobs are stuck in {{ $labels.namespace }} for the last 24 hours."