1
0
Fork 0

add some monitoring sauce

This commit is contained in:
Tobias Brunner 2019-08-03 23:58:43 +02:00
parent 1341ebf585
commit eafb22dcec
11 changed files with 213 additions and 4 deletions

View File

@ -57,9 +57,23 @@ provided which regularly pings [Healthchecks.io](https://healthchecks.io/).
A secret with the ping URL needs to be added before the CronJob can do its work:
```
kubectl -n hc create secret generic healthchecks-io --from-literal=HCURL=https://hc-ping.com/MYUUID
kubectl -n posmon create secret generic healthchecks-io --from-literal=HCURL=https://hc-ping.com/MYUUID
```
### Application and network monitoring
Application monitoring is done using Prometheus, Alertmanager and the
Blackbox exporter. No application-specific exporters are used, so
it only provides basic monitoring that answers the question: "Is it up?".
1. Install [prometheus-operator](https://github.com/coreos/prometheus-operator)
For example: `kubectl apply -f https://raw.githubusercontent.com/coreos/prometheus-operator/master/bundle.yaml`
2. Apply manifests: `kubectl apply -f contrib/posmon/`
3. Create secret for extra scrape config:
`kubectl -n posmon create secret generic additional-scrape-configs --from-file=contrib/pos-blackbox-exporter-scrape.yaml`
4. Create secret for Alertmanager config:
`kubectl -n posmon create secret generic alertmanager-posmon --from-file=contrib/alertmanager.yaml`
## Backup configuration
Example contents of `backup.env`:

12
contrib/alertmanager.yaml Normal file
View File

@ -0,0 +1,12 @@
# Alertmanager configuration: a single route that hands every alert to the
# "webhook" receiver.
global:
  resolve_timeout: 5m

route:
  receiver: webhook
  # Alerts sharing the same job label are batched into one notification.
  group_by:
    - job
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 12h

receivers:
  - name: webhook
    webhook_configs:
      - url: "http://alertmanagerwh:30500/"

View File

@ -0,0 +1,48 @@
# HTTP probes executed by the Blackbox exporter (module http_2xx).
- job_name: blackbox_http
  scrape_interval: 1m
  metrics_path: /probe
  params:
    module:
      - http_2xx
  static_configs:
    - targets:
        - "http://odoo.pos.svc.cluster.local"
        - "http://iotbox.pos.svc.cluster.local"
        - "http://192.168.233.1"
  relabel_configs:
    # Hand the listed target to the exporter as the "target" URL parameter.
    - target_label: __param_target
      source_labels:
        - __address__
    # Keep the probed target visible as the instance label.
    - target_label: instance
      source_labels:
        - __param_target
    # Send the scrape request to the exporter rather than to the target.
    - target_label: __address__
      replacement: "blackbox-exporter:9115"
# TCP connect probes executed by the Blackbox exporter (module tcp_connect).
- job_name: blackbox_tcp
  scrape_interval: 1m
  metrics_path: /probe
  params:
    module:
      - tcp_connect
  static_configs:
    - targets:
        - "db.pos.svc.cluster.local:5432"
  relabel_configs:
    # Hand the listed target to the exporter as the "target" URL parameter.
    - target_label: __param_target
      source_labels:
        - __address__
    # Keep the probed target visible as the instance label.
    - target_label: instance
      source_labels:
        - __param_target
    # Send the scrape request to the exporter rather than to the target.
    - target_label: __address__
      replacement: "blackbox-exporter:9115"
# ICMP (ping) probes executed by the Blackbox exporter (module icmp).
- job_name: blackbox_icmp
  scrape_interval: 1m
  metrics_path: /probe
  params:
    module:
      - icmp
  static_configs:
    - targets:
        - "192.168.233.3"
        - "192.168.233.5"
  relabel_configs:
    # Hand the listed target to the exporter as the "target" URL parameter.
    - target_label: __param_target
      source_labels:
        - __address__
    # Keep the probed target visible as the instance label.
    - target_label: instance
      source_labels:
        - __param_target
    # Send the scrape request to the exporter rather than to the target.
    - target_label: __address__
      replacement: "blackbox-exporter:9115"

View File

@ -0,0 +1,4 @@
# Namespace used by the monitoring manifests.
apiVersion: v1
kind: Namespace
metadata:
  name: posmon

View File

@ -0,0 +1,23 @@
# Alertmanager custom resource (monitoring.coreos.com/v1) with one replica.
apiVersion: monitoring.coreos.com/v1
kind: Alertmanager
metadata:
  namespace: posmon
  name: posmon
spec:
  replicas: 1
---
# NodePort Service publishing the Alertmanager "web" port (9093) on node
# port 30903.
apiVersion: v1
kind: Service
metadata:
  namespace: posmon
  name: alertmanager-posmon
spec:
  type: NodePort
  selector:
    alertmanager: posmon
  ports:
    - name: web
      protocol: TCP
      port: 9093
      targetPort: web
      nodePort: 30903

View File

@ -0,0 +1,14 @@
# Alerting rules for the blackbox probes. The metadata labels match the
# ruleSelector of the posmon Prometheus resource.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  labels:
    prometheus: posmon
    role: alert-rules
  name: posmon-rules
  namespace: posmon
spec:
  groups:
    - name: ./posmon.rules
      rules:
        - alert: TargetDown
          expr: probe_success < 1
          # Require the failure to persist so that a single lost probe
          # (probes run once per minute) does not fire the alert.
          for: 2m
          annotations:
            summary: 'Probe of {{ $labels.instance }} failed'
            description: >-
              The blackbox probe for {{ $labels.instance }}
              (job {{ $labels.job }}) has been failing for more than
              2 minutes.

View File

@ -2,7 +2,7 @@ apiVersion: apps/v1
kind: Deployment
metadata:
name: blackbox-exporter
namespace: mon
namespace: posmon
spec:
replicas: 1
selector:
@ -23,7 +23,9 @@ apiVersion: v1
kind: Service
metadata:
name: blackbox-exporter
namespace: mon
namespace: posmon
labels:
app: blackbox-exporter
spec:
ports:
- name: http

View File

@ -2,7 +2,7 @@ apiVersion: batch/v1beta1
kind: CronJob
metadata:
name: healthchecks-io
namespace: hc
namespace: posmon
spec:
schedule: "*/1 * * * *"
concurrencyPolicy: Forbid

View File

@ -0,0 +1,37 @@
# Read-only cluster permissions granted to Prometheus.
# Uses the stable rbac.authorization.k8s.io/v1 API; the v1beta1 version is
# deprecated and no longer served from Kubernetes 1.22 on.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: prometheus
rules:
  # Core objects that may be read and watched.
  - apiGroups: [""]
    resources:
      - nodes
      - services
      - endpoints
      - pods
    verbs: ["get", "list", "watch"]
  - apiGroups: [""]
    resources:
      - configmaps
    verbs: ["get"]
  # Allows GET on the non-resource /metrics URL.
  - nonResourceURLs: ["/metrics"]
    verbs: ["get"]
---
# Service account named "prometheus" in the posmon namespace.
apiVersion: v1
kind: ServiceAccount
metadata:
  namespace: posmon
  name: prometheus
---
# Binds the "prometheus" ClusterRole to the "prometheus" ServiceAccount in
# the posmon namespace.
# Uses the stable rbac.authorization.k8s.io/v1 API; the v1beta1 version is
# deprecated and no longer served from Kubernetes 1.22 on.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: prometheus
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: prometheus
subjects:
  - kind: ServiceAccount
    name: prometheus
    namespace: posmon

View File

@ -0,0 +1,42 @@
# Prometheus custom resource (monitoring.coreos.com/v1) for the posmon setup.
apiVersion: monitoring.coreos.com/v1
kind: Prometheus
metadata:
  namespace: posmon
  name: posmon
spec:
  serviceAccountName: prometheus
  enableAdminAPI: false
  resources:
    requests:
      memory: 400Mi
  # Scrape targets: ServiceMonitors labelled app=pos, plus the extra scrape
  # configuration stored under the given key of the named Secret.
  serviceMonitorSelector:
    matchLabels:
      app: pos
  additionalScrapeConfigs:
    name: additional-scrape-configs
    key: pos-blackbox-exporter-scrape.yaml
  # Load PrometheusRule objects carrying both of these labels.
  ruleSelector:
    matchLabels:
      prometheus: posmon
      role: alert-rules
  # Alerts are sent to the "web" port of the alertmanager-posmon Service.
  alerting:
    alertmanagers:
      - name: alertmanager-posmon
        namespace: posmon
        port: web
---
# NodePort Service publishing the Prometheus "web" port (9090) on node
# port 30909.
apiVersion: v1
kind: Service
metadata:
  namespace: posmon
  name: prometheus-posmon
spec:
  type: NodePort
  selector:
    prometheus: posmon
  ports:
    - name: web
      protocol: TCP
      port: 9090
      targetPort: web
      nodePort: 30909

View File

@ -0,0 +1,13 @@
# ServiceMonitor selecting Services labelled app=blackbox-exporter and
# scraping their "http" port. The app=pos label on this object matches the
# serviceMonitorSelector of the posmon Prometheus resource.
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  namespace: posmon
  name: blackboxmon
  labels:
    app: pos
spec:
  endpoints:
    - port: http
  selector:
    matchLabels:
      app: blackbox-exporter