From 4f9035453f4cce87e2725eff68cd5a4648989b15 Mon Sep 17 00:00:00 2001 From: Tobias Brunner Date: Sat, 7 Mar 2020 20:52:58 +0100 Subject: [PATCH] move k8up rules to monitoring --- monitoring/Makefile | 1 + .../k8rules.yaml | 0 .../alertmanager-tbrnt-config-secret.yaml | 2 +- monitoring/manifests/k8rules.yaml | 40 +++++++++++++++++++ 4 files changed, 42 insertions(+), 1 deletion(-) rename k8up/prometheusrule.yaml => monitoring/k8rules.yaml (100%) create mode 100644 monitoring/manifests/k8rules.yaml diff --git a/monitoring/Makefile b/monitoring/Makefile index 042d228..c9e3553 100644 --- a/monitoring/Makefile +++ b/monitoring/Makefile @@ -2,6 +2,7 @@ build: docker run --rm -v $(shell pwd):$(shell pwd) --workdir $(shell pwd) quay.io/coreos/jsonnet-ci ./build.sh monitoring.jsonnet sudo chown -R tobru. manifests/ kubeseal --controller-namespace sealed-secrets -o yaml -n monitoring < ../../gitops-tbrnt-private/monitoring/alertmanager.yaml > manifests/alertmanager-tbrnt-config-secret.yaml + cp *.yaml manifests/ .PHONY: build update: diff --git a/k8up/prometheusrule.yaml b/monitoring/k8rules.yaml similarity index 100% rename from k8up/prometheusrule.yaml rename to monitoring/k8rules.yaml diff --git a/monitoring/manifests/alertmanager-tbrnt-config-secret.yaml b/monitoring/manifests/alertmanager-tbrnt-config-secret.yaml index 469d999..f514485 100644 --- a/monitoring/manifests/alertmanager-tbrnt-config-secret.yaml +++ b/monitoring/manifests/alertmanager-tbrnt-config-secret.yaml @@ -6,7 +6,7 @@ metadata: namespace: monitoring spec: encryptedData: - alertmanager.yaml: 
AgCB9ENTl3ptItT5IFDHKxONg+hJ20nAClt94b8ShwMWlaNNpUlxa92HIS8nKJ8sfciyvxPgMTDx/0euv8+RYZiuU5tYSjo6H6KZJUavsFykgxLDiNGN4qORkZyHGD/ZvXjJ4Ns+pDkOcxQ2xZD59Djbnu6CLk010NFENmwAn/b2jTy55fUwW9qPuOxGEmz2RpSaIiPDz5l6LsI2lvZV54hbuefBgpKDgGUS4EnbaU0db8w6APM/rWFrGIgBXDIfj/tM4BthrOCNbbK/clDuKsUkRoEjbApqvbXf36D41uUZIHhLlF2CTT2mi6nT6mTAxsmro1YdKO828wCmBZV879E+jJldsh5RUl7EWW7X7bb+XrVlmCxagRkKQjR/AwUEgi6Zd4XEOTcdC53f9R1e0xm3/0MQpqu24rZR4kIXkJbAmgJnOshKsscW6IlfeRGIiIHWdVXGp7mePcF+hvA8/0nQkxRC8JtNNR8buDYWmXRBihfcrr2zxn/zdhBxYcE9vcc9GEmaTnwqI3f/W0nBgOy6gkMnQ2z2RqoyzkgxNX1l1CoOCIzbyGsxnAhWna47xNvACqb/PvidNI+Ivc14/cUF/uGOaktHnLKzi0r71ebMFKtXSbI/a93qs7d5cghNbnnRrrxHBmtni+lVgAnbfrR7e4FgcjLeUvKhYjTUiGLz8gfSIF4dgDed9GIG+PxklE2na9SJTTWSv99C0JhAe7x+hCxnFnQgxOiB9ThBmwHI/3MBvn6qFvWAFSystOCcjbZ3wad9J/ndSrbwy8LojHIbHMzXf0feuTkmYNsdx3B+77sa/oiZV7ewHWlURMsBEslyNX0OuFNyaSVwL/sUjOxMmUqJe9q0uxfgdSr1W8S+/7v2s/ky7k3FZ9AqsAH5NGP8apZQe1SJzvxt0hFiw2FOFqwKQMNUvacHzXUtQyDeivvu3mPFCQZbwwCFrxnZviOdmrfGQr5dhCAYmxxnUVIetSbvj77sq0c/QXKXjJOiqyqTd7WjAFwKnZDZ6Yn4isSbsaDrnDrZpQ+O4MVwKSXY5q1S0NMqdn4cfjLZYRdsufwWsULeq7Gt+SaSCaI0BHn5grsKpNh89HgNMvgYUwkjJXXRd2/I2lcQDEaTYXEs7RcRnJk1a+8DljdVd51b2VN5FkT69aSL1olaDXrfEztShbJlz+nQUkYZeSYE9dhURfvtAILg1J7tTD0eDwNggJGc7hPI62Ir+Kk0t045WobbTESGElU3hbJ5/WMA7HC6b7Mbiw8ikjglIBK1ur9BBV/8lNrdqSBh6MqWGLpRf/qaaqlfG65jIM9O8Qe70E6hWFAbWFRPcuCNA878NMMiTGJ1PQZ/owm5lrEPn833FtJCKHkAVLIrezhalCIuDvc8OOFMD08vFmvsFwHp3YtPefgZXTNqAalQYuPKsPSDMtUJCCCYRzWI0vyj27dcHwtrOsP9ovRo1U6a20tsDnhBRaAlmUQEolC/fmOsmJrBcK7QBfj9awhtBovXacZKdud8cu0Mbxo1+1fJaF5cosR+s3qSvsWWCM+sCCr0bxqUOS1kqkHe+9AQNOUkwlLKPPNfZ+z8dvjJrtJeZ6dLnJuu3+G4aoB91x4sgAcr6LugEe6MW4fYMO+Td5pMqn51N/dwMUkFQy2pCvvF41xeJuxpu9m8eAZ/EyS2pNfPkq6EeiNwvVp33BtIKA5oAZ7JzJZ18ZrOWg== + alertmanager.yaml: 
AgBZGe4hWNMQ1DaNFdd2SPT8feODqYuIqqUT1TivOShoVy0XjpPAGQW2v5PNc6Q3qnn6/5CGCBAcLn0K4XxuB0Oi/39rbArAeuP0BsBAoI2KveEL3GIJ6BZ87xvJc9Exup5fJ3zeGb1Az0chgd1f0xzWtZQdgGO6Ba+KZGMtqj9BZLjxSTGLueWOBU/ZUN+q4Qj7fd730u187loSeSSdbGD1okDfEWFQPT8bXz2jeRWkEc8lg/nuyNibmCwWot9fMwUXgaOYN8bTpGHIc1nx+E7MZrvgSSsLbaGH9+UYcWjbKLAju1hOTxAtyHxfgJjNxkLDynYMaNDgfOdd5rY0oQN8HVQGRBSrb2ZFJMqP2m2LfjCy+CHWg/2EStmNeILLyJkdBzq4TqcpncOHfHUADOjuTPkAA2++K1DRwAT+KAA6bzkBuAS4bKhDD2QDjZ2jzfvrT/orAdXelTsljLULnVwQkDID3GxBPJ/7EebQNe/YloHN9KXtgqCyo6yvYUj2V9tFGEfZjzb1DEASINUWJ5GTu2yFVTPRK1s4kcpAs7jYs8d4eYRsYQbCLJdgH5O2fM6zPjSkifE15iGsZ77xDO73qa9XvX9/h/nhpyZRrWj0jwbKLe9ZdopjEV8VnT+vAec1DSHxBFYSHal60Pjhd6V8P6aMKPzWlqa2CI5i9oFY8MRuR4sil7sgGhjjts9FqtaMNFLUuYd5KiK5q02OC5zBRxg1xWisDTjXNGRjg1JVw8c2wxpuFjz0ROcAwGUmo0/xDcMWGl1dQ8hkqpZ2deUTPONLD8Sbz8luv0/UhaBiNY7jK878vMgMhBc/U2QpDbjfXjVx3V6+o4L7pp3ks6PSsXqobzPeMzQMpfitB5iVJ3YuhqtdLYH/VDPalKfsomTjcA+bPy/7XFfVsHoTbZ500TzejP/KBL46Nyw21svGe8+KeM6yA2nd3Jj6YetCsagOA9fJl405Lw6kBK4hiTdkcElVWG65Ro6dSL0cdSliU85lO85EmRwBi98jebTSRxcljndvl/fENxCtHjWbbpCKEz2yOaI0dMjZdg5bLRUkXcF5wOklsolKAb9j0yU6vyQcE8iYxyi7T9pYDWniIb4jchMQHOP8njgeIMPRMexcmOCHutBwMEItYcHMFzq8zHSgRPzpXyKQ1QHy6aDJ8hVxrOCVeOKQncPlZrGTNp+XXk4PfQhBaO7iS/eLB48N4vRevaJr4i0uffL7GvPMUjTeMZYI+7vDK3nMrpvls8RQe6OWVkei8AnBFG3211YLz320Z5FYrL6zj+aSFsko6ZDODRBD9lo949x1RArcxGdKPaNhMBTNEvOX+umMkWzEvtu+xhiLfwQwyKQbm98woFRa4KDcmk7xp49kQ0gIRRMZ0/g5n7wJk0YJyRXWy494YEJNJ4IumIg8lu/iT/wml1Y/+TiI4tt371PqczOhv7FljRST0LYurtHhM1g8SPQVavMddEGNFQCl31zh+S74dHSiABgTDMRp6g7HrEPGmXDoqgMCTlWttx89stFYos8DPtPspWZN2bbg56GKR0ih/tvbzadzzQa4BiIXliPibf1HIyWc3DgfHXjFr7hivliVrIVvgcEklj3KUftMNW6FydBaSXLhdAAaq+eBH81V8Lz+fD03b+fGv8nsKGQnlbVEoaWlBYSbnWnNvYNtlfwe9FaQVJIt/kd+2XNgNzgbKGciISpeZw== template: metadata: creationTimestamp: null diff --git a/monitoring/manifests/k8rules.yaml b/monitoring/manifests/k8rules.yaml new file mode 100644 index 0000000..bef9f77 --- /dev/null +++ b/monitoring/manifests/k8rules.yaml @@ -0,0 +1,40 @@ +apiVersion: monitoring.coreos.com/v1 
+kind: PrometheusRule  # alert rules for K8up backup jobs
+metadata:
+  name: k8up
+  labels:  # presumably what the Prometheus ruleSelector matches on; verify
+    prometheus: k8s
+    role: alert-rules
+spec:
+  groups:
+    - name: k8up.rules
+      rules:
+        - alert: baas_last_errors
+          expr: baas_backup_restic_last_errors > 0
+          for: 1m
+          labels:
+            severity: critical
+          annotations:
+            summary: Amount of errors of last restic backup
+            description: This alert is fired when error number is > 0
+        - alert: K8upBackupFailed
+          expr: rate(k8up_jobs_failed_counter[1d]) > 0
+          for: 1m
+          labels:
+            severity: critical
+          annotations:
+            summary: "Job in {{ $labels.namespace }} of type {{ $labels.jobType }} failed"
+        - alert: K8upBackupNotRunning  # sum keeps namespace so the on(namespace) join can match
+          expr: sum by (namespace) (rate(k8up_jobs_total[25h])) == 0 and on(namespace) k8up_schedules_gauge > 0
+          for: 1m
+          labels:
+            severity: critical
+          annotations:
+            summary: "No K8up jobs were run in {{ $labels.namespace }} within the last 24 hours. Check the operator, there might be a deadlock"
+        - alert: K8upJobStuck  # fires only after the condition has held for a full day
+          expr: k8up_jobs_queued_gauge{jobType="backup"} > 0 and on(namespace) k8up_schedules_gauge > 0
+          for: 24h
+          labels:
+            severity: critical
+          annotations:
+            summary: "K8up jobs are stuck in {{ $labels.namespace }} for the last 24 hours."