From eafb22dcec9034fdc76fab46e4f86023d4ed0117 Mon Sep 17 00:00:00 2001 From: Tobias Brunner Date: Sat, 3 Aug 2019 23:58:43 +0200 Subject: [PATCH] add some monitoring sauce --- README.md | 16 ++++++- contrib/alertmanager.yaml | 12 +++++ contrib/pos-blackbox-exporter-scrape.yaml | 48 +++++++++++++++++++ contrib/posmon/00-posmon-ns.yaml | 4 ++ contrib/posmon/alertmanager.yaml | 23 +++++++++ contrib/posmon/alertrules.yaml | 14 ++++++ contrib/{ => posmon}/blackbox-exporter.yaml | 6 ++- .../{ => posmon}/healthchecks-cronjob.yaml | 2 +- contrib/posmon/prometheus-rbac.yaml | 37 ++++++++++++++ contrib/posmon/prometheus.yaml | 42 ++++++++++++++++ contrib/posmon/servicemonitor.yaml | 13 +++++ 11 files changed, 213 insertions(+), 4 deletions(-) create mode 100644 contrib/alertmanager.yaml create mode 100644 contrib/pos-blackbox-exporter-scrape.yaml create mode 100644 contrib/posmon/00-posmon-ns.yaml create mode 100644 contrib/posmon/alertmanager.yaml create mode 100644 contrib/posmon/alertrules.yaml rename contrib/{ => posmon}/blackbox-exporter.yaml (88%) rename contrib/{ => posmon}/healthchecks-cronjob.yaml (97%) create mode 100644 contrib/posmon/prometheus-rbac.yaml create mode 100644 contrib/posmon/prometheus.yaml create mode 100644 contrib/posmon/servicemonitor.yaml diff --git a/README.md b/README.md index cb3bac3..fbe7c58 100644 --- a/README.md +++ b/README.md @@ -57,9 +57,23 @@ provided which regularly pings [Healthchecks.io](https://healthchecks.io/). A secret with the ping URL needs to be added before the CronJobs does it's work: ``` -kubectl -n hc create secret generic healthchecks-io --from-literal=HCURL=https://hc-ping.com/MYUUID +kubectl -n posmon create secret generic healthchecks-io --from-literal=HCURL=https://hc-ping.com/MYUUID ``` +### Application and network monitoring + +Application monitoring is done using Prometheus, Alertmanager and +Blackbox exporter. 
No application-specific exporters are used, so +it's just basic monitoring to answer the question: "Is it up?". + +1. Install [prometheus-operator](https://github.com/coreos/prometheus-operator) + E.g.: `kubectl apply -f https://raw.githubusercontent.com/coreos/prometheus-operator/master/bundle.yaml` +2. Apply manifests: `kubectl apply -f contrib/posmon/` +3. Create secret for extra scrape config: + `kubectl -n posmon create secret generic additional-scrape-configs --from-file=contrib/pos-blackbox-exporter-scrape.yaml` +4. Create secret for Alertmanager config: + `kubectl -n posmon create secret generic alertmanager-posmon --from-file=contrib/alertmanager.yaml` + ## Backup configuration Example contents of `backup.env`: diff --git a/contrib/alertmanager.yaml b/contrib/alertmanager.yaml new file mode 100644 index 0000000..f08a210 --- /dev/null +++ b/contrib/alertmanager.yaml @@ -0,0 +1,12 @@ +global: + resolve_timeout: 5m +route: + group_by: ['job'] + group_wait: 30s + group_interval: 5m + repeat_interval: 12h + receiver: 'webhook' +receivers: +- name: 'webhook' + webhook_configs: + - url: 'http://alertmanagerwh:30500/' diff --git a/contrib/pos-blackbox-exporter-scrape.yaml b/contrib/pos-blackbox-exporter-scrape.yaml new file mode 100644 index 0000000..574719e --- /dev/null +++ b/contrib/pos-blackbox-exporter-scrape.yaml @@ -0,0 +1,48 @@ +- job_name: 'blackbox_http' + metrics_path: /probe + scrape_interval: 1m + params: + module: [http_2xx] + static_configs: + - targets: + - http://odoo.pos.svc.cluster.local + - http://iotbox.pos.svc.cluster.local + - http://192.168.233.1 + relabel_configs: + - source_labels: [__address__] + target_label: __param_target + - source_labels: [__param_target] + target_label: instance + - target_label: __address__ + replacement: blackbox-exporter:9115 +- job_name: 'blackbox_tcp' + metrics_path: /probe + scrape_interval: 1m + params: + module: [tcp_connect] + static_configs: + - targets: + - db.pos.svc.cluster.local:5432 + 
relabel_configs: + - source_labels: [__address__] + target_label: __param_target + - source_labels: [__param_target] + target_label: instance + - target_label: __address__ + replacement: blackbox-exporter:9115 +- job_name: 'blackbox_icmp' + metrics_path: /probe + scrape_interval: 1m + params: + module: [icmp] + static_configs: + - targets: + - 192.168.233.3 + - 192.168.233.5 + relabel_configs: + - source_labels: [__address__] + target_label: __param_target + - source_labels: [__param_target] + target_label: instance + - target_label: __address__ + replacement: blackbox-exporter:9115 diff --git a/contrib/posmon/00-posmon-ns.yaml b/contrib/posmon/00-posmon-ns.yaml new file mode 100644 index 0000000..c1fa0af --- /dev/null +++ b/contrib/posmon/00-posmon-ns.yaml @@ -0,0 +1,4 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: posmon diff --git a/contrib/posmon/alertmanager.yaml b/contrib/posmon/alertmanager.yaml new file mode 100644 index 0000000..3f4a733 --- /dev/null +++ b/contrib/posmon/alertmanager.yaml @@ -0,0 +1,23 @@ +apiVersion: monitoring.coreos.com/v1 +kind: Alertmanager +metadata: + name: posmon + namespace: posmon +spec: + replicas: 1 +--- +apiVersion: v1 +kind: Service +metadata: + name: alertmanager-posmon + namespace: posmon +spec: + type: NodePort + ports: + - name: web + nodePort: 30903 + port: 9093 + protocol: TCP + targetPort: web + selector: + alertmanager: posmon diff --git a/contrib/posmon/alertrules.yaml b/contrib/posmon/alertrules.yaml new file mode 100644 index 0000000..3b5021b --- /dev/null +++ b/contrib/posmon/alertrules.yaml @@ -0,0 +1,14 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + labels: + prometheus: posmon + role: alert-rules + name: posmon-rules + namespace: posmon +spec: + groups: + - name: ./posmon.rules + rules: + - alert: TargetDown + expr: probe_success < 1 diff --git a/contrib/blackbox-exporter.yaml b/contrib/posmon/blackbox-exporter.yaml similarity index 88% rename from 
contrib/blackbox-exporter.yaml rename to contrib/posmon/blackbox-exporter.yaml index 5e58012..75cc3ed 100644 --- a/contrib/blackbox-exporter.yaml +++ b/contrib/posmon/blackbox-exporter.yaml @@ -2,7 +2,7 @@ apiVersion: apps/v1 kind: Deployment metadata: name: blackbox-exporter - namespace: mon + namespace: posmon spec: replicas: 1 selector: @@ -23,7 +23,9 @@ apiVersion: v1 kind: Service metadata: name: blackbox-exporter - namespace: mon + namespace: posmon + labels: + app: blackbox-exporter spec: ports: - name: http diff --git a/contrib/healthchecks-cronjob.yaml b/contrib/posmon/healthchecks-cronjob.yaml similarity index 97% rename from contrib/healthchecks-cronjob.yaml rename to contrib/posmon/healthchecks-cronjob.yaml index 77a6df5..700eb06 100644 --- a/contrib/healthchecks-cronjob.yaml +++ b/contrib/posmon/healthchecks-cronjob.yaml @@ -2,7 +2,7 @@ apiVersion: batch/v1beta1 kind: CronJob metadata: name: healthchecks-io - namespace: hc + namespace: posmon spec: schedule: "*/1 * * * *" concurrencyPolicy: Forbid diff --git a/contrib/posmon/prometheus-rbac.yaml b/contrib/posmon/prometheus-rbac.yaml new file mode 100644 index 0000000..a121193 --- /dev/null +++ b/contrib/posmon/prometheus-rbac.yaml @@ -0,0 +1,37 @@ +apiVersion: rbac.authorization.k8s.io/v1beta1 +kind: ClusterRole +metadata: + name: prometheus +rules: +- apiGroups: [""] + resources: + - nodes + - services + - endpoints + - pods + verbs: ["get", "list", "watch"] +- apiGroups: [""] + resources: + - configmaps + verbs: ["get"] +- nonResourceURLs: ["/metrics"] + verbs: ["get"] +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: prometheus + namespace: posmon +--- +apiVersion: rbac.authorization.k8s.io/v1beta1 +kind: ClusterRoleBinding +metadata: + name: prometheus +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: prometheus +subjects: +- kind: ServiceAccount + name: prometheus + namespace: posmon diff --git a/contrib/posmon/prometheus.yaml b/contrib/posmon/prometheus.yaml 
new file mode 100644 index 0000000..59d063e --- /dev/null +++ b/contrib/posmon/prometheus.yaml @@ -0,0 +1,42 @@ +apiVersion: monitoring.coreos.com/v1 +kind: Prometheus +metadata: + name: posmon + namespace: posmon +spec: + serviceAccountName: prometheus + serviceMonitorSelector: + matchLabels: + app: pos + resources: + requests: + memory: 400Mi + enableAdminAPI: false + additionalScrapeConfigs: + name: additional-scrape-configs + key: pos-blackbox-exporter-scrape.yaml + alerting: + alertmanagers: + - namespace: posmon + name: alertmanager-posmon + port: web + ruleSelector: + matchLabels: + role: alert-rules + prometheus: posmon +--- +apiVersion: v1 +kind: Service +metadata: + name: prometheus-posmon + namespace: posmon +spec: + type: NodePort + ports: + - name: web + nodePort: 30909 + port: 9090 + protocol: TCP + targetPort: web + selector: + prometheus: posmon diff --git a/contrib/posmon/servicemonitor.yaml b/contrib/posmon/servicemonitor.yaml new file mode 100644 index 0000000..6d3c3aa --- /dev/null +++ b/contrib/posmon/servicemonitor.yaml @@ -0,0 +1,13 @@ +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: blackboxmon + namespace: posmon + labels: + app: pos +spec: + selector: + matchLabels: + app: blackbox-exporter + endpoints: + - port: http