// Listing metadata (from the source viewer): 190 lines, 6.2 KiB, plaintext.
// Weave Net monitoring add-on.
//
// Contributes three things to the enclosing configuration object:
//   * prometheus+:          a Service and a ServiceMonitor for Weave Net metrics,
//   * prometheusRules+:     a 'weave-net' alert rule group,
//   * grafanaDashboards+::  two imported Grafana dashboard JSON files.
local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
local service = k.core.v1.service;
local servicePort = k.core.v1.service.mixin.spec.portsType;

// Label set shared by the Service, the ServiceMonitor metadata and the
// ServiceMonitor selector. Kept in one place so the selector cannot drift
// away from the labels actually put on the Service.
local weaveNetLabels = { 'k8s-app': 'weave-net' };

// Name of the metrics port; the ServiceMonitor endpoint refers to the Service
// port by this name, so both must use the same value.
local metricsPortName = 'weave-net-metrics';

// Builds one alerting rule whose only label is `severity: critical`.
//   name        - value of the rule's `alert` field
//   expr        - PromQL expression for the rule
//   duration    - value of the rule's `for` field (e.g. '3m')
//   summary     - text of the `summary` annotation
//   description - text of the `description` annotation
local criticalAlert(name, expr, duration, summary, description) = {
  alert: name,
  expr: expr,
  // `for` is a Jsonnet keyword, so the field name has to be quoted.
  'for': duration,
  labels: {
    severity: 'critical',
  },
  annotations: {
    summary: summary,
    description: description,
  },
};

{
  prometheus+: {
    // Service 'weave-net' in 'kube-system' exposing the metrics port (6782 is
    // passed for both port arguments), selecting pods labelled
    // `name: weave-net`, with clusterIP set to 'None'.
    serviceWeaveNet:
      service.new(
        'weave-net',
        { name: 'weave-net' },
        servicePort.newNamed(metricsPortName, 6782, 6782)
      ) +
      service.mixin.metadata.withNamespace('kube-system') +
      service.mixin.metadata.withLabels(weaveNetLabels) +
      service.mixin.spec.withClusterIp('None'),

    // ServiceMonitor created in 'monitoring' that targets Services in
    // 'kube-system' carrying the shared label set, reading '/metrics' on the
    // named metrics port with a 15s interval.
    serviceMonitorWeaveNet: {
      apiVersion: 'monitoring.coreos.com/v1',
      kind: 'ServiceMonitor',
      metadata: {
        name: 'weave-net',
        labels: weaveNetLabels,
        namespace: 'monitoring',
      },
      spec: {
        jobLabel: 'k8s-app',
        endpoints: [
          {
            port: metricsPortName,
            path: '/metrics',
            interval: '15s',
          },
        ],
        namespaceSelector: {
          matchNames: [
            'kube-system',
          ],
        },
        selector: {
          matchLabels: weaveNetLabels,
        },
      },
    },
  },

  prometheusRules+: {
    groups+: [
      {
        name: 'weave-net',
        rules: [
          // --- IPAM ---
          criticalAlert(
            name='WeaveNetIPAMSplitBrain',
            expr='max(weave_ipam_unreachable_percentage) - min(weave_ipam_unreachable_percentage) > 0',
            duration='3m',
            summary='Percentage of all IP addresses owned by unreachable peers is not same for every node.',
            description='actionable: Weave Net network has a split brain problem. Please find the problem and fix it.',
          ),
          criticalAlert(
            name='WeaveNetIPAMUnreachable',
            expr='weave_ipam_unreachable_percentage > 25',
            duration='10m',
            summary='Percentage of all IP addresses owned by unreachable peers is above threshold.',
            description='actionable: Please find the problem and fix it.',
          ),
          criticalAlert(
            name='WeaveNetIPAMPendingAllocates',
            expr='sum(weave_ipam_pending_allocates) > 0',
            duration='3m',
            summary='Number of pending allocates is above the threshold.',
            description='actionable: Please find the problem and fix it.',
          ),
          criticalAlert(
            name='WeaveNetIPAMPendingClaims',
            expr='sum(weave_ipam_pending_claims) > 0',
            duration='3m',
            summary='Number of pending claims is above the threshold.',
            description='actionable: Please find the problem and fix it.',
          ),

          // --- FastDP flows ---
          criticalAlert(
            name='WeaveNetFastDPFlowsLow',
            expr='sum(weave_flows) < 15000',
            duration='3m',
            summary='Number of FastDP flows is below the threshold.',
            description='actionable: Please find the reason for FastDP flows to go below the threshold and fix it.',
          ),
          criticalAlert(
            name='WeaveNetFastDPFlowsOff',
            expr='sum(weave_flows == bool 0) > 0',
            duration='3m',
            summary='FastDP flows is zero.',
            description='actionable: Please find the reason for FastDP flows to be off and fix it.',
          ),

          // --- Connections ---
          criticalAlert(
            name='WeaveNetHighConnectionTerminationRate',
            expr='rate(weave_connection_terminations_total[5m]) > 0.1',
            duration='5m',
            summary='A lot of connections are getting terminated.',
            description='actionable: Please find the reason for the high connection termination rate and fix it.',
          ),
          criticalAlert(
            name='WeaveNetConnectionsConnecting',
            expr='sum(weave_connections{state="connecting"}) > 0',
            duration='3m',
            summary='A lot of connections are in connecting state.',
            description='actionable: Please find the reason for this and fix it.',
          ),
          // NOTE(review): 'Retying' looks like a typo for 'Retrying'. The name is
          // left untouched because existing Alertmanager routes, silences or
          // dashboards may match on it — confirm before renaming.
          criticalAlert(
            name='WeaveNetConnectionsRetying',
            expr='sum(weave_connections{state="retrying"}) > 0',
            duration='3m',
            summary='A lot of connections are in retrying state.',
            description='actionable: Please find the reason for this and fix it.',
          ),
          criticalAlert(
            name='WeaveNetConnectionsPending',
            expr='sum(weave_connections{state="pending"}) > 0',
            duration='3m',
            summary='A lot of connections are in pending state.',
            description='actionable: Please find the reason for this and fix it.',
          ),
          criticalAlert(
            name='WeaveNetConnectionsFailed',
            expr='sum(weave_connections{state="failed"}) > 0',
            duration='3m',
            summary='A lot of connections are in failed state.',
            description='actionable: Please find the reason and fix it.',
          ),
        ],
      },
    ],
  },

  // Hidden field (`::`): merged into any existing grafanaDashboards object but
  // not emitted when this object itself is manifested.
  grafanaDashboards+:: {
    'weave-net.json': (import 'grafana-weave-net.json'),
    'weave-net-cluster.json': (import 'grafana-weave-net-cluster.json'),
  },
}