From 9baf2ead1500d916e136a11dd81d77a87fa04d12 Mon Sep 17 00:00:00 2001
From: Marcus Noble
Date: Mon, 14 Jun 2021 10:09:27 +0100
Subject: [PATCH] Added multi-cluster monitoring

---
 manifests/_apps/monitoring.yaml              |  24 +
 manifests/inlets/inlets.yaml                 |  14 +
 manifests/monitoring/blackbox-exporter.yaml  | 182 +++++++
 manifests/monitoring/grafana.yaml            | 285 +++++++++++
 manifests/monitoring/ingress.yaml            | 222 ++++++++
 manifests/monitoring/kube-state-metrics.yaml | 255 ++++++++++
 manifests/monitoring/loki.yaml               | 232 +++++++++
 manifests/monitoring/node-exporter.yaml      |  87 ++++
 manifests/monitoring/prometheus-server.yaml  | 501 +++++++++++++++++++
 manifests/monitoring/promtail.yaml           | 449 +++++++++++++++++
 10 files changed, 2251 insertions(+)
 create mode 100644 manifests/_apps/monitoring.yaml
 create mode 100644 manifests/monitoring/blackbox-exporter.yaml
 create mode 100644 manifests/monitoring/grafana.yaml
 create mode 100644 manifests/monitoring/ingress.yaml
 create mode 100644 manifests/monitoring/kube-state-metrics.yaml
 create mode 100644 manifests/monitoring/loki.yaml
 create mode 100644 manifests/monitoring/node-exporter.yaml
 create mode 100644 manifests/monitoring/prometheus-server.yaml
 create mode 100644 manifests/monitoring/promtail.yaml

diff --git a/manifests/_apps/monitoring.yaml b/manifests/_apps/monitoring.yaml
new file mode 100644
index 0000000..bcdef4d
--- /dev/null
+++ b/manifests/_apps/monitoring.yaml
@@ -0,0 +1,24 @@
+apiVersion: argoproj.io/v1alpha1
+kind: Application  # Argo CD Application that deploys everything under manifests/monitoring
+metadata:
+  name: monitoring
+  namespace: argocd
+  finalizers:
+    - resources-finalizer.argocd.argoproj.io
+spec:
+  project: cluster.fun
+  destination:
+    namespace: monitoring
+    name: cluster-fun (scaleway)
+  source:
+    path: manifests/monitoring
+    repoURL: "https://git.cluster.fun/AverageMarcus/cluster.fun.git"
+    targetRevision: HEAD
+  syncPolicy:
+    automated: {}  # automated sync enabled; no prune/selfHeal flags are set here
+    syncOptions:
+      - CreateNamespace=true
+  ignoreDifferences:
+    - kind: Secret  # presumably because kube-1password fills Secret data out-of-band - confirm
+      jsonPointers:
+        - /data
diff --git a/manifests/inlets/inlets.yaml b/manifests/inlets/inlets.yaml
index be144c0..7e4d358 100644
--- a/manifests/inlets/inlets.yaml
+++ b/manifests/inlets/inlets.yaml
@@ -212,3 +212,17 @@ spec:
       targetPort: 8000
   selector:
     app: inlets
+---
+kind: Service
+apiVersion: v1
+metadata:
+  name: loki-local  # target of Grafana's "Loki-local" datasource (http://loki-local.inlets.svc:80)
+  namespace: inlets
+spec:
+  type: ClusterIP
+  ports:
+    - port: 80
+      protocol: TCP
+      targetPort: 8000
+  selector:
+    app: inlets
diff --git a/manifests/monitoring/blackbox-exporter.yaml b/manifests/monitoring/blackbox-exporter.yaml
new file mode 100644
index 0000000..1e34656
--- /dev/null
+++ b/manifests/monitoring/blackbox-exporter.yaml
@@ -0,0 +1,182 @@
+apiVersion: policy/v1beta1
+kind: PodSecurityPolicy  # restricts blackbox-exporter pods; bound through the Role/RoleBinding below
+metadata:
+  name: blackbox-exporter-psp
+  namespace: monitoring
+  labels:
+    app.kubernetes.io/name: prometheus
+    app.kubernetes.io/component: blackbox-exporter
+spec:
+  privileged: false
+  allowPrivilegeEscalation: false
+  volumes:
+    - configMap
+    - secret
+  hostNetwork: false
+  hostIPC: false
+  hostPID: false
+  runAsUser:
+    rule: RunAsAny
+  seLinux:
+    rule: RunAsAny
+  supplementalGroups:
+    rule: 'MustRunAs'
+    ranges:
+      - min: 1
+        max: 65535
+  fsGroup:
+    rule: 'MustRunAs'
+    ranges:
+      - min: 1
+        max: 65535
+  readOnlyRootFilesystem: true
+  allowedCapabilities:
+    - NET_RAW  # added by the Deployment's container securityContext
+---
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: blackbox-exporter
+  namespace: monitoring
+  labels:
+    app.kubernetes.io/name: prometheus
+    app.kubernetes.io/component: blackbox-exporter
+---
+apiVersion: v1
+kind: ConfigMap  # mounted at /config and passed via --config.file in the Deployment
+metadata:
+  name: blackbox-exporter
+  namespace: monitoring
+  labels:
+    app.kubernetes.io/name: prometheus
+    app.kubernetes.io/component: blackbox-exporter
+data:
+  blackbox.yaml: |
+    modules:
+      http_2xx:
+        http:
+          follow_redirects: true
+          preferred_ip_protocol: ip4
+          tls_config:
+            insecure_skip_verify: true
+          valid_http_versions:
+            - HTTP/1.1
+            - HTTP/2.0
+        prober: http
+        timeout: 5s
+      icmp_ping:
+        icmp:
+          preferred_ip_protocol: ip4
+          source_ip_address: 127.0.0.1
+        prober: icmp
+        timeout: 5s
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: Role  # grants "use" of blackbox-exporter-psp only
+metadata:
+  labels:
+    app.kubernetes.io/name: prometheus
+    app.kubernetes.io/component: blackbox-exporter
+  name: blackbox-exporter
+  namespace: monitoring
+rules:
+  - apiGroups:
+      - policy
+    resources:
+      - podsecuritypolicies
+    resourceNames:
+      - blackbox-exporter-psp
+    verbs:
+      - use
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: RoleBinding
+metadata:
+  labels:
+    app.kubernetes.io/name: prometheus
+    app.kubernetes.io/component: blackbox-exporter
+  name: blackbox-exporter
+  namespace: monitoring
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: Role
+  name: blackbox-exporter
+subjects:
+  - kind: ServiceAccount
+    name: blackbox-exporter
+---
+kind: Service
+apiVersion: v1
+metadata:
+  name: blackbox-exporter
+  namespace: monitoring
+  labels:
+    app.kubernetes.io/name: prometheus
+    app.kubernetes.io/component: blackbox-exporter
+spec:
+  type: ClusterIP
+  ports:
+    - name: http
+      port: 9115
+      targetPort: http  # resolves to the container port named "http" (9115)
+      protocol: TCP
+  selector:
+    app.kubernetes.io/name: prometheus
+    app.kubernetes.io/component: blackbox-exporter
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: blackbox-exporter
+  namespace: monitoring
+  labels:
+    app.kubernetes.io/name: prometheus
+    app.kubernetes.io/component: blackbox-exporter
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: prometheus
+      app.kubernetes.io/component: blackbox-exporter
+  strategy:
+    rollingUpdate:
+      maxSurge: 1
+      maxUnavailable: 0
+    type: RollingUpdate
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: prometheus
+        app.kubernetes.io/component: blackbox-exporter
+    spec:
+      serviceAccountName: blackbox-exporter
+      restartPolicy: Always
+      containers:
+        - name: blackbox-exporter
+          image: "prom/blackbox-exporter:v0.19.0"
+          imagePullPolicy: IfNotPresent
+          securityContext:
+            readOnlyRootFilesystem: true
+            capabilities:
+              add: ["NET_RAW"]  # permitted by allowedCapabilities in blackbox-exporter-psp
+          args:
+            - "--config.file=/config/blackbox.yaml"
+          ports:
+            - containerPort: 9115
+              name: http
+          livenessProbe:
+            httpGet:
+              path: /health
+              port: http
+          readinessProbe:
+            httpGet:
+              path: /health
+              port: http
+          volumeMounts:
+            - mountPath: /config
+              name: config
+      volumes:
+        - name: config
+          configMap:
+            name: blackbox-exporter
+
diff --git a/manifests/monitoring/grafana.yaml b/manifests/monitoring/grafana.yaml
new file mode 100644
index 0000000..a8c4c77
--- /dev/null
+++ b/manifests/monitoring/grafana.yaml
@@ -0,0 +1,285 @@
+apiVersion: v1
+kind: Secret  # no data here; the Deployment reads keys "username" and "password" from it
+metadata:
+  name: grafana-credentials
+  namespace: monitoring
+  labels:
+    app.kubernetes.io/name: grafana
+  annotations:
+    kube-1password: wpynfxkdipeeacyfxkvtdsuj54
+    kube-1password/vault: Kubernetes
+type: Opaque
+---
+apiVersion: policy/v1beta1
+kind: PodSecurityPolicy
+metadata:
+  name: grafana
+  namespace: monitoring
+  labels:
+    app.kubernetes.io/name: grafana
+  annotations:
+    seccomp.security.alpha.kubernetes.io/allowedProfileNames: 'docker/default,runtime/default'
+    seccomp.security.alpha.kubernetes.io/defaultProfileName: 'docker/default'
+    apparmor.security.beta.kubernetes.io/allowedProfileNames: 'runtime/default'
+    apparmor.security.beta.kubernetes.io/defaultProfileName: 'runtime/default'
+spec:
+  privileged: false
+  allowPrivilegeEscalation: false
+  requiredDropCapabilities:
+    # Default set from Docker, without DAC_OVERRIDE or CHOWN
+    - FOWNER
+    - FSETID
+    - KILL
+    - SETGID
+    - SETUID
+    - SETPCAP
+    - NET_BIND_SERVICE
+    - NET_RAW
+    - SYS_CHROOT
+    - MKNOD
+    - AUDIT_WRITE
+    - SETFCAP
+  volumes:
+    - 'configMap'
+    - 'emptyDir'
+    - 'projected'
+    - 'csi'
+    - 'secret'
+    - 'downwardAPI'
+    - 'persistentVolumeClaim'
+  hostNetwork: false
+  hostIPC: false
+  hostPID: false
+  runAsUser:
+    rule: 'RunAsAny'
+  seLinux:
+    rule: 'RunAsAny'
+  supplementalGroups:
+    rule: 'RunAsAny'
+  fsGroup:
+    rule: 'RunAsAny'
+  readOnlyRootFilesystem: false
+---
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: grafana
+  namespace: monitoring
+  labels:
+    app.kubernetes.io/name: grafana
+---
+apiVersion: v1
+kind: ConfigMap  # Grafana config; individual keys are subPath-mounted by the grafana Deployment
+metadata:
+  name: grafana
+  namespace: monitoring
+  labels:
+    app.kubernetes.io/name: grafana
+data:
+  grafana.ini: |  # NOTE(review): [auth.anonymous] grants org_role = Admin; anything reaching the grafana Service directly bypasses grafana-auth
+    [analytics]
+    check_for_updates = false
+    reporting_enabled = false
+    [auth.anonymous]
+    enabled = true
+    org_role = Admin
+    [dataproxy]
+    timeout = 300
+    [grafana_net]
+    url = https://grafana.net
+    [log]
+    mode = console
+    [paths]
+    data = /var/lib/grafana/data
+    logs = /var/log/grafana
+    plugins = /var/lib/grafana/plugins
+    provisioning = /etc/grafana/provisioning
+  datasources.yaml: |  # provisions Loki, Loki-local (via the inlets Service) and Prometheus
+    apiVersion: 1
+    datasources:
+    - access: proxy
+      jsonData:
+        maxLines: 1000
+      name: Loki
+      type: loki
+      url: http://loki.monitoring:3100
+    - access: proxy
+      jsonData:
+        maxLines: 1000
+      name: Loki-local
+      type: loki
+      url: http://loki-local.inlets.svc:80
+    - access: proxy
+      name: Prometheus
+      type: prometheus
+      url: http://prometheus-server.monitoring:80
+  dashboardproviders.yaml: |
+    apiVersion: 1
+    providers:
+    - allowUiUpdates: true
+      disableDeletion: false
+      editable: true
+      folder: ""
+      name: default
+      options:
+        path: /var/lib/grafana/dashboards/default
+      orgId: 1
+      type: file
+  download_dashboards.sh: |  # run by the download-dashboards init container; only creates the directory
+    #!/usr/bin/env sh
+    set -euf
+    mkdir -p /var/lib/grafana/dashboards/default
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: grafana-dashboards-default
+  namespace: monitoring
+  labels:
+    app.kubernetes.io/name: grafana
+    dashboard-provider: default
+data:
+  analytics.json:  # Grafana dashboard "Analytics" (uid 3hJJhOLMz), kept as a single JSON line
+    |-
+      {"annotations":{"list":[{"builtIn":1,"datasource":"-- Grafana --","enable":true,"hide":true,"iconColor":"rgba(0, 211, 255, 1)","name":"Annotations & Alerts","type":"dashboard"}]},"editable":true,"gnetId":null,"graphTooltip":0,"id":1,"iteration":1617614944424,"links":[],"panels":[{"datasource":"Loki","description":"","fieldConfig":{"defaults":{"custom":{"align":null,"filterable":false},"mappings":[],"thresholds":{"mode":"absolute","steps":[{"color":"green","value":null}]}},"overrides":[{"matcher":{"id":"byRegexp","options":"(4|5).+"},"properties":[{"id":"color","value":{"fixedColor":"red","mode":"fixed"}}]},{"matcher":{"id":"byRegexp","options":"3.+"},"properties":[{"id":"color","value":{"fixedColor":"light-blue","mode":"fixed"}}]},{"matcher":{"id":"byName","options":"Total Requests"},"properties":[{"id":"color","value":{"mode":"fixed"}},{"id":"links","value":[{"targetBlank":true,"title":"Show query","url":"https://grafana.cluster.fun/explore?orgId=1&left=%5B%22${__from}%22,%22${__to}%22,%22Loki%22,%7B%22expr%22:%22%7Bk8s_app%3D%5C%22traefik-ingress-lb%5C%22%7D%20%7C%20json%20%7C%20RequestHost%3D~%5C%22${host:text}%5C%22%20error%3D%5C%22%5C%22%22%7D%5D"}]}]},{"matcher":{"id":"byRegexp","options":"[0-9][0-9][0-9]"},"properties":[{"id":"links","value":[{"targetBlank":true,"title":"Show query","url":"https://grafana.cluster.fun/explore?orgId=1&left=%5B%22${__from}%22,%22${__to}%22,%22Loki%22,%7B%22expr%22:%22%7Bk8s_app%3D%5C%22traefik-ingress-lb%5C%22%7D%20%7C%20json%20%7C%20RequestHost%3D~%5C%22${host:text}%5C%22%20error%3D%5C%22%5C%22 DownstreamStatus%3D\\\"${__field.labels.DownstreamStatus}\\\"\"}]"}]}]}]},"gridPos":{"h":10,"w":9,"x":0,"y":0},"id":2,"options":{"colorMode":"value","graphMode":"none","justifyMode":"center","orientation":"auto","reduceOptions":{"calcs":["sum"],"fields":"","values":false},"textMode":"value_and_name"},"pluginVersion":"7.3.5","targets":[{"expr":"sum (count_over_time({k8s_app=\"traefik-ingress-lb\"} | json | RequestHost=~\"${host:text}\" error=\"\" [$__interval]))","legendFormat":"Total Requests","refId":"B"},{"expr":"sum by (DownstreamStatus) (count_over_time({k8s_app=\"traefik-ingress-lb\"} | json | RequestHost=~\"${host:text}\" error=\"\" [$__interval]))","legendFormat":"{{DownstreamStatus}}","queryType":"randomWalk","refId":"A"}],"timeFrom":null,"timeShift":null,"title":"Requests Per Status Code","transformations":[],"type":"stat"},{"datasource":"Loki","fieldConfig":{"defaults":{"custom":{"align":null,"filterable":false},"mappings":[],"thresholds":{"mode":"absolute","steps":[{"color":"green","value":null}]}},"overrides":[{"matcher":{"id":"byName","options":"Total"},"properties":[{"id":"custom.width","value":80}]},{"matcher":{"id":"byName","options":"Page"},"properties":[{"id":"links","value":[{"targetBlank":true,"title":"","url":"https://${host:text}${__data.fields.Page}"}]}]}]},"gridPos":{"h":10,"w":7,"x":9,"y":0},"id":6,"options":{"showHeader":true,"sortBy":[{"desc":true,"displayName":"Total"}]},"pluginVersion":"7.3.5","targets":[{"expr":"topk(25, sum by (RequestPath) (count_over_time({k8s_app=\"traefik-ingress-lb\"} | json | RequestHost=~\"${host:text}\" error=\"\" RequestPath=~\"^/([a-zA-Z0-9\\\\-_]+(/|(ht|x)ml?)?)?$\" DownstreamStatus=~\"2.+\" [$__interval])))","legendFormat":"{{RequestPath}}","queryType":"randomWalk","refId":"A"}],"timeFrom":null,"timeShift":null,"title":"Top Viewed Pages","transformations":[{"id":"reduce","options":{"reducers":["sum"]}},{"id":"organize","options":{"excludeByName":{},"indexByName":{},"renameByName":{"Field":"Page"}}}],"type":"table"},{"datasource":"Loki","fieldConfig":{"defaults":{"custom":{"align":null,"filterable":false},"mappings":[],"thresholds":{"mode":"absolute","steps":[{"color":"green","value":null}]}},"overrides":[{"matcher":{"id":"byName","options":"Total"},"properties":[{"id":"custom.width","value":80}]}]},"gridPos":{"h":10,"w":8,"x":16,"y":0},"id":5,"options":{"showHeader":true,"sortBy":[{"desc":true,"displayName":"Total"}]},"pluginVersion":"7.3.5","targets":[{"expr":"topk(25, sum by (request_Referer) (count_over_time({k8s_app=\"traefik-ingress-lb\"} | json | RequestHost=~\"${host:text}\" error=\"\" request_Referer!=\"\" request_Referer!~\"https?://.*${host:text}/?.*\" \n | label_format request_Referer=`{{ regexReplaceAll \"https?://(.*)google.co.uk\" .request_Referer \"Google UK\" }}`\n | label_format request_Referer=`{{ regexReplaceAll \"https?://(.*)google..+\" .request_Referer \"Google\" }}`\n | label_format request_Referer=`{{ regexReplaceAll \"https?://t.co/.*\" .request_Referer \"Twitter\" }}` \n | label_format request_Referer=`{{ regexReplaceAll \"https?://.*bing.com/.*\" .request_Referer \"Bing\" }}` \n | label_format request_Referer=`{{ regexReplaceAll \"https?://.*baidu.com/.*\" .request_Referer \"Baidu\" }}` \n | label_format request_Referer=`{{ regexReplaceAll \"https?://.*duckduckgo.com/?\" .request_Referer \"DuckDuckGo\" }}` \n[$__interval])))","legendFormat":"{{request_Referer}}","queryType":"randomWalk","refId":"A"}],"timeFrom":null,"timeShift":null,"title":"Top Referers","transformations":[{"id":"reduce","options":{"reducers":["sum"]}},{"id":"organize","options":{"excludeByName":{},"indexByName":{},"renameByName":{"Field":"Referer"}}}],"type":"table"},{"aliasColors":{},"bars":false,"dashLength":10,"dashes":false,"datasource":"Loki","decimals":0,"description":"","fieldConfig":{"defaults":{"custom":{"align":null,"filterable":false},"mappings":[],"thresholds":{"mode":"absolute","steps":[{"color":"green","value":null}]}},"overrides":[]},"fill":1,"fillGradient":10,"gridPos":{"h":10,"w":9,"x":0,"y":10},"hiddenSeries":false,"id":3,"interval":"30m","legend":{"avg":false,"current":false,"max":false,"min":false,"show":true,"total":true,"values":true},"lines":true,"linewidth":1,"nullPointMode":"null","options":{"alertThreshold":false},"percentage":false,"pluginVersion":"7.3.5","pointradius":1,"points":true,"renderer":"flot","seriesOverrides":[{}],"spaceLength":10,"stack":false,"steppedLine":false,"targets":[{"expr":"sum by (DownstreamStatus) (count_over_time({k8s_app=\"traefik-ingress-lb\"} | json | RequestHost=~\"${host:text}\" error=\"\" [$__interval]))","legendFormat":"{{DownstreamStatus}}","queryType":"randomWalk","refId":"A"}],"thresholds":[],"timeFrom":null,"timeRegions":[],"timeShift":null,"title":"Requests Per Status Code","tooltip":{"shared":true,"sort":0,"value_type":"individual"},"transformations":[],"type":"graph","xaxis":{"buckets":null,"mode":"time","name":null,"show":true,"values":[]},"yaxes":[{"decimals":0,"format":"short","label":"No. of Requests","logBase":1,"max":null,"min":null,"show":true},{"format":"short","label":null,"logBase":1,"max":null,"min":null,"show":false}],"yaxis":{"align":false,"alignLevel":null}},{"datasource":"Loki","fieldConfig":{"defaults":{"custom":{"align":null,"filterable":false},"mappings":[],"thresholds":{"mode":"absolute","steps":[{"color":"green","value":null}]}},"overrides":[{"matcher":{"id":"byName","options":"Total"},"properties":[{"id":"custom.width","value":80}]}]},"gridPos":{"h":10,"w":7,"x":9,"y":10},"id":8,"options":{"showHeader":true,"sortBy":[{"desc":true,"displayName":"Total"}]},"pluginVersion":"7.3.5","targets":[{"expr":"topk(25, sum by (RequestPath) (count_over_time({k8s_app=\"traefik-ingress-lb\"} | json | RequestHost=~\"${host:text}\" error=\"\" DownstreamStatus=\"404\" [$__interval])))","legendFormat":"{{RequestPath}}","queryType":"randomWalk","refId":"A"}],"timeFrom":null,"timeShift":null,"title":"Top 404","transformations":[{"id":"reduce","options":{"reducers":["sum"]}},{"id":"organize","options":{"excludeByName":{},"indexByName":{},"renameByName":{"Field":"Requests"}}}],"type":"table"},{"datasource":"Loki","fieldConfig":{"defaults":{"custom":{"align":null,"filterable":false},"mappings":[],"thresholds":{"mode":"absolute","steps":[{"color":"green","value":null}]}},"overrides":[{"matcher":{"id":"byName","options":"Total"},"properties":[{"id":"custom.width","value":80}]}]},"gridPos":{"h":10,"w":8,"x":16,"y":10},"id":7,"options":{"showHeader":true,"sortBy":[{"desc":true,"displayName":"Total"}]},"pluginVersion":"7.3.5","targets":[{"expr":"topk(25, sum by (request_User_Agent) (count_over_time({k8s_app=\"traefik-ingress-lb\"} | json | RequestHost=~\"${host:text}\" error=\"\" request_User_Agent!=\"\" [$__interval])))","legendFormat":"{{request_User_Agent}}","queryType":"randomWalk","refId":"A"}],"timeFrom":null,"timeShift":null,"title":"Top User Agents","transformations":[{"id":"reduce","options":{"reducers":["sum"]}},{"id":"organize","options":{"excludeByName":{},"indexByName":{},"renameByName":{"Field":"User Agent"}}}],"type":"table"},{"datasource":"Loki","description":"","fieldConfig":{"defaults":{"custom":{"align":null,"filterable":false},"mappings":[],"thresholds":{"mode":"absolute","steps":[{"color":"green","value":null}]}},"overrides":[{"matcher":{"id":"byName","options":"Total"},"properties":[{"id":"custom.width","value":116},{"id":"links","value":[]}]},{"matcher":{"id":"byName","options":"Requests"},"properties":[{"id":"custom.width","value":1132}]}]},"gridPos":{"h":12,"w":9,"x":0,"y":20},"id":9,"options":{"showHeader":true,"sortBy":[{"desc":true,"displayName":"Total"}]},"pluginVersion":"7.3.5","targets":[{"expr":"sum by (RequestMethod, RequestPath, DownstreamStatus) (count_over_time({k8s_app=\"traefik-ingress-lb\"} | json | RequestHost=~\"${host:text}\" error=\"\" [$__interval]))","legendFormat":"{{RequestMethod}} - {{RequestPath}} - {{DownstreamStatus}}","queryType":"randomWalk","refId":"A"}],"timeFrom":null,"timeShift":null,"title":"All Requests","transformations":[{"id":"reduce","options":{"reducers":["sum"]}},{"id":"organize","options":{"excludeByName":{},"indexByName":{},"renameByName":{"Field":"Requests"}}}],"type":"table"},{"datasource":"Loki","fieldConfig":{"defaults":{"custom":{"align":null,"filterable":false},"mappings":[],"thresholds":{"mode":"absolute","steps":[{"color":"green","value":null}]}},"overrides":[{"matcher":{"id":"byName","options":"Total"},"properties":[{"id":"custom.width","value":80}]}]},"gridPos":{"h":12,"w":7,"x":9,"y":20},"id":10,"options":{"showHeader":true,"sortBy":[{"desc":true,"displayName":"Total"}]},"pluginVersion":"7.3.5","targets":[{"expr":"topk(25, sum by (DownstreamStatus, RequestPath, downstream_Location) (count_over_time({k8s_app=\"traefik-ingress-lb\"} | json | RequestHost=~\"${host:text}\" error=\"\" DownstreamStatus=~\"3.+\" [$__interval])))","legendFormat":"{{DownstreamStatus}} - {{RequestPath}} => {{downstream_Location}}","queryType":"randomWalk","refId":"A"}],"timeFrom":null,"timeShift":null,"title":"Top Redirects","transformations":[{"id":"reduce","options":{"reducers":["sum"]}},{"id":"organize","options":{"excludeByName":{},"indexByName":{},"renameByName":{"Field":"Requests"}}}],"type":"table"},{"datasource":"Loki","fieldConfig":{"defaults":{"custom":{"align":null,"filterable":false},"mappings":[],"thresholds":{"mode":"absolute","steps":[{"color":"green","value":null}]}},"overrides":[{"matcher":{"id":"byName","options":"Total"},"properties":[{"id":"custom.width","value":80}]}]},"gridPos":{"h":12,"w":8,"x":16,"y":20},"id":11,"options":{"showHeader":true,"sortBy":[{"desc":true,"displayName":"Total"}]},"pluginVersion":"7.3.5","targets":[{"expr":"topk(25, sum by (RequestMethod, DownstreamStatus, RequestPath) (count_over_time({k8s_app=\"traefik-ingress-lb\"} | json | RequestHost=~\"${host:text}\" error=\"\" DownstreamStatus=~\"[54].+\" DownstreamStatus!=\"404\" [$__interval])))","legendFormat":"{{DownstreamStatus}} - {{RequestMethod}} - {{RequestPath}}","queryType":"randomWalk","refId":"A"}],"timeFrom":null,"timeShift":null,"title":"Top Errors","transformations":[{"id":"reduce","options":{"reducers":["sum"]}},{"id":"organize","options":{"excludeByName":{},"indexByName":{},"renameByName":{"Field":"Requests"}}}],"type":"table"}],"refresh":false,"schemaVersion":26,"style":"dark","tags":[],"templating":{"list":[{"allValue":null,"current":{"selected":true,"text":"marcusnoble.co.uk","value":"marcusnoble.co.uk"},"error":null,"hide":0,"includeAll":false,"label":"Host","multi":false,"name":"host","options":[{"selected":false,"text":"marcusnoble.co.uk","value":"marcusnoble.co.uk"},{"selected":false,"text":"til.marcusnoble.co.uk","value":"til.marcusnoble.co.uk"},{"selected":false,"text":"dash.cluster.fun","value":"dash.cluster.fun"},{"selected":true,"text":"tweet.cluster.fun","value":"tweet.cluster.fun"},{"selected":false,"text":"base64.cluster.fun","value":"base64.cluster.fun"},{"selected":false,"text":"feed-fetcher.cluster.fun","value":"feed-fetcher.cluster.fun"},{"selected":false,"text":"cors-proxy.cluster.fun","value":"cors-proxy.cluster.fun"}],"query":"marcusnoble.co.uk,til.marcusnoble.co.uk,dash.cluster.fun,tweet.cluster.fun,base64.cluster.fun,feed-fetcher.cluster.fun,cors-proxy.cluster.fun","queryValue":"","skipUrlSync":false,"type":"custom"}]},"time":{"from":"now-24h","to":"now"},"timepicker":{},"timezone":"","title":"Analytics","uid":"3hJJhOLMz","version":9}
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: Role  # grants "use" of the grafana PodSecurityPolicy to the grafana ServiceAccount
+metadata:
+  name: grafana
+  namespace: monitoring
+  labels:
+    app.kubernetes.io/name: grafana
+rules:
+- apiGroups: ['policy']  # was 'extensions'; the PSP is policy/v1beta1 and the blackbox-exporter Role already uses 'policy'
+  resources: ['podsecuritypolicies']
+  verbs: ['use']
+  resourceNames: [grafana]
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: RoleBinding
+metadata:
+  name: grafana
+  namespace: monitoring
+  labels:
+    app.kubernetes.io/name: grafana
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: Role
+  name: grafana
+subjects:
+- kind: ServiceAccount
+  name: grafana
+  namespace: monitoring
+---
+apiVersion: v1
+kind: Service  # upstream of the grafana-auth oauth2-proxy (http://grafana.monitoring.svc.cluster.local)
+metadata:
+  name: grafana
+  namespace: monitoring
+  labels:
+    app.kubernetes.io/name: grafana
+spec:
+  type: ClusterIP
+  ports:
+    - name: service
+      port: 80
+      protocol: TCP
+      targetPort: 3000
+  selector:
+    app.kubernetes.io/name: grafana
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: grafana
+  namespace: monitoring
+  labels:
+    app.kubernetes.io/name: grafana
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: grafana
+  strategy:
+    type: RollingUpdate
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: grafana
+    spec:
+      serviceAccountName: grafana
+      securityContext:
+        fsGroup: 472
+        runAsGroup: 472
+        runAsUser: 472
+      initContainers:
+        - name: download-dashboards
+          image: "curlimages/curl:7.73.0"
+          imagePullPolicy: IfNotPresent
+          command: ["/bin/sh"]
+          args: [ "-c", "mkdir -p /var/lib/grafana/dashboards/default && /bin/sh /etc/grafana/download_dashboards.sh" ]
+          volumeMounts:
+            - name: config
+              mountPath: "/etc/grafana/download_dashboards.sh"
+              subPath: download_dashboards.sh
+            - name: storage
+              mountPath: "/var/lib/grafana"
+      containers:
+        - name: grafana
+          image: "grafana/grafana:8.0.1"
+          imagePullPolicy: IfNotPresent
+          volumeMounts:
+            - name: config
+              mountPath: "/etc/grafana/grafana.ini"
+              subPath: grafana.ini
+            - name: storage
+              mountPath: "/var/lib/grafana"
+            - name: dashboards-default
+              mountPath: "/var/lib/grafana/dashboards/default/analytics.json"
+              subPath: "analytics.json"
+            - name: config
+              mountPath: "/etc/grafana/provisioning/datasources/datasources.yaml"
+              subPath: datasources.yaml
+            - name: config
+              mountPath: "/etc/grafana/provisioning/dashboards/dashboardproviders.yaml"
+              subPath: dashboardproviders.yaml
+          ports:
+            - name: service
+              containerPort: 80  # NOTE(review): Service and probes target 3000 and grafana.ini sets no http_port; looks unused - confirm
+              protocol: TCP
+            - name: grafana
+              containerPort: 3000
+              protocol: TCP
+          env:
+            - name: GF_SECURITY_ADMIN_USER
+              valueFrom:
+                secretKeyRef:
+                  name: grafana-credentials
+                  key: username
+            - name: GF_SECURITY_ADMIN_PASSWORD
+              valueFrom:
+                secretKeyRef:
+                  name: grafana-credentials
+                  key: password
+          livenessProbe:
+            failureThreshold: 10
+            httpGet:
+              path: /api/health
+              port: 3000
+            initialDelaySeconds: 60
+            timeoutSeconds: 30
+          readinessProbe:
+            httpGet:
+              path: /api/health
+              port: 3000
+      volumes:
+        - name: config
+          configMap:
+            name: grafana
+        - name: dashboards-default
+          configMap:
+            name: grafana-dashboards-default
+        - name: storage
+          emptyDir: {}  # not persistent: contents of /var/lib/grafana do not survive pod replacement
+
diff --git a/manifests/monitoring/ingress.yaml b/manifests/monitoring/ingress.yaml
new file mode 100644
index 0000000..4a27fb1
--- /dev/null
+++ b/manifests/monitoring/ingress.yaml
@@ -0,0 +1,222 @@
+apiVersion: v1
+kind: Secret  # no data here; both oauth2-proxy Deployments read keys "username" and "password" from it
+metadata:
+  name: proxy-auth
+  namespace: monitoring
+  annotations:
+    kube-1password: mr6spkkx7n3memkbute6ojaarm
+    kube-1password/vault: Kubernetes
+type: Opaque
+---
+
+apiVersion: apps/v1
+kind: Deployment  # oauth2-proxy in front of prometheus-server
+metadata:
+  name: prometheus-auth
+  namespace: monitoring
+  labels:
+    app.kubernetes.io/name: prometheus
+    app.kubernetes.io/component: server-auth
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: prometheus
+      app.kubernetes.io/component: server-auth
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: prometheus
+        app.kubernetes.io/component: server-auth
+    spec:
+      containers:
+      - args:
+        - --cookie-secure=true  # was false; the site is only served over HTTPS (TLS Ingress with permanent redirect)
+        - --provider=oidc
+        - --provider-display-name=Auth0
+        - --upstream=http://prometheus-server.monitoring.svc.cluster.local
+        - --http-address=$(HOST_IP):8080
+        - --redirect-url=https://prometheus.cluster.fun/oauth2/callback
+        - --email-domain=marcusnoble.co.uk
+        - --pass-basic-auth=false
+        - --pass-access-token=false
+        - --oidc-issuer-url=https://marcusnoble.eu.auth0.com/
+        - --cookie-secret=KDGD6rrK6cBmryyZ4wcJ9xAUNW9AQN  # NOTE(review): secret committed in plain text; move into a Secret (OAUTH2_PROXY_COOKIE_SECRET) and rotate
+        env:
+        - name: HOST_IP
+          valueFrom:
+            fieldRef:
+              apiVersion: v1
+              fieldPath: status.podIP
+        - name: OAUTH2_PROXY_CLIENT_ID
+          valueFrom:
+            secretKeyRef:
+              key: username
+              name: proxy-auth
+        - name: OAUTH2_PROXY_CLIENT_SECRET
+          valueFrom:
+            secretKeyRef:
+              key: password
+              name: proxy-auth
+        image: quay.io/oauth2-proxy/oauth2-proxy:v5.1.1
+        name: oauth-proxy
+        ports:
+        - containerPort: 8080
+          protocol: TCP
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: prometheus-auth
+  namespace: monitoring
+  labels:
+    app.kubernetes.io/name: prometheus
+    app.kubernetes.io/component: server-auth
+spec:
+  ports:
+  - name: http
+    port: 80
+    protocol: TCP
+    targetPort: 8080
+  selector:
+    app.kubernetes.io/name: prometheus
+    app.kubernetes.io/component: server-auth
+  type: ClusterIP
+---
+apiVersion: networking.k8s.io/v1
+kind: Ingress  # exposes the prometheus-auth proxy, not prometheus-server itself
+metadata:
+  name: prometheus-auth
+  namespace: monitoring
+  labels:
+    app.kubernetes.io/name: prometheus
+    app.kubernetes.io/component: server-auth
+  annotations:
+    cert-manager.io/cluster-issuer: letsencrypt
+    traefik.ingress.kubernetes.io/frontend-entry-points: http,https
+    traefik.ingress.kubernetes.io/redirect-entry-point: https
+    traefik.ingress.kubernetes.io/redirect-permanent: "true"
+spec:
+  tls:
+  - hosts:
+    - prometheus.cluster.fun
+    secretName: prometheus-ingress
+  rules:
+  - host: prometheus.cluster.fun
+    http:
+      paths:
+      - backend:
+          service:
+            name: prometheus-auth
+            port:
+              number: 80
+        path: /
+        pathType: ImplementationSpecific
+
+
+---
+
+
+apiVersion: apps/v1
+kind: Deployment  # oauth2-proxy in front of grafana
+metadata:
+  name: grafana-auth
+  namespace: monitoring
+  labels:
+    app.kubernetes.io/name: grafana
+    app.kubernetes.io/component: server-auth
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: grafana
+      app.kubernetes.io/component: server-auth
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: grafana
+        app.kubernetes.io/component: server-auth
+    spec:
+      containers:
+      - args:
+        - --cookie-secure=true  # was false; the site is only served over HTTPS (TLS Ingress with permanent redirect)
+        - --provider=oidc
+        - --provider-display-name=Auth0
+        - --upstream=http://grafana.monitoring.svc.cluster.local
+        - --http-address=$(HOST_IP):8080
+        - --redirect-url=https://grafana.cluster.fun/oauth2/callback
+        - --email-domain=marcusnoble.co.uk
+        - --pass-basic-auth=false
+        - --pass-access-token=false
+        - --oidc-issuer-url=https://marcusnoble.eu.auth0.com/
+        - --cookie-secret=KDGD6rrK6cBmryyZ4wcJ9xAUNW9AQN  # NOTE(review): secret committed in plain text; move into a Secret (OAUTH2_PROXY_COOKIE_SECRET) and rotate
+        env:
+        - name: HOST_IP
+          valueFrom:
+            fieldRef:
+              apiVersion: v1
+              fieldPath: status.podIP
+        - name: OAUTH2_PROXY_CLIENT_ID
+          valueFrom:
+            secretKeyRef:
+              key: username
+              name: proxy-auth
+        - name: OAUTH2_PROXY_CLIENT_SECRET
+          valueFrom:
+            secretKeyRef:
+              key: password
+              name: proxy-auth
+        image: quay.io/oauth2-proxy/oauth2-proxy:v5.1.1
+        name: oauth-proxy
+        ports:
+        - containerPort: 8080
+          protocol: TCP
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: grafana-auth
+  namespace: monitoring
+  labels:
+    app.kubernetes.io/name: grafana
+    app.kubernetes.io/component: server-auth
+spec:
+  ports:
+  - name: http
+    port: 80
+    protocol: TCP
+    targetPort: 8080
+  selector:
+    app.kubernetes.io/name: grafana
+    app.kubernetes.io/component: server-auth
+  type: ClusterIP
+---
+apiVersion: networking.k8s.io/v1
+kind: Ingress  # exposes the grafana-auth proxy, not grafana itself
+metadata:
+  name: grafana-auth
+  namespace: monitoring
+  labels:
+    app.kubernetes.io/name: grafana
+    app.kubernetes.io/component: server-auth
+  annotations:
+    cert-manager.io/cluster-issuer: letsencrypt
+    traefik.ingress.kubernetes.io/frontend-entry-points: http,https
+    traefik.ingress.kubernetes.io/redirect-entry-point: https
+    traefik.ingress.kubernetes.io/redirect-permanent: "true"
+spec:
+  tls:
+  - hosts:
+    - grafana.cluster.fun
+    secretName: grafana-ingress
+  rules:
+  - host: grafana.cluster.fun
+    http:
+      paths:
+      - backend:
+          service:
+            name: grafana-auth
+            port:
+              number: 80
+        path: /
+        pathType: ImplementationSpecific
diff --git a/manifests/monitoring/kube-state-metrics.yaml b/manifests/monitoring/kube-state-metrics.yaml
new file mode 100644
index 0000000..9b310be
--- /dev/null
+++ b/manifests/monitoring/kube-state-metrics.yaml
@@ -0,0 +1,255 @@
+
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: kube-state-metrics
+  namespace: monitoring
+  labels:
+    app.kubernetes.io/name: kube-state-metrics
+---
+apiVersion: rbac.authorization.k8s.io/v1  # v1beta1 is no longer served from Kubernetes 1.22
+kind: ClusterRole
+metadata:
+  labels:
+    app.kubernetes.io/name: kube-state-metrics
+  name: kube-state-metrics
+rules:  # read-only list/watch, one rule per --resources flag of the Deployment below
+  - apiGroups: ["certificates.k8s.io"]
+    resources:
+      - certificatesigningrequests
+    verbs: ["list", "watch"]
+
+  - apiGroups: [""]
+    resources:
+      - configmaps
+    verbs: ["list", "watch"]
+
+  - apiGroups: ["batch"]
+    resources:
+      - cronjobs
+    verbs: ["list", "watch"]
+
+  - apiGroups: ["extensions", "apps"]
+    resources:
+      - daemonsets
+    verbs: ["list", "watch"]
+
+  - apiGroups: ["extensions", "apps"]
+    resources:
+      - deployments
+    verbs: ["list", "watch"]
+
+  - apiGroups: [""]
+    resources:
+      - endpoints
+    verbs: ["list", "watch"]
+
+  - apiGroups: ["autoscaling"]
+    resources:
+      - horizontalpodautoscalers
+    verbs: ["list", "watch"]
+
+  - apiGroups: ["extensions", "networking.k8s.io"]
+    resources:
+      - ingresses
+    verbs: ["list", "watch"]
+
+  - apiGroups: ["batch"]
+    resources:
+      - jobs
+    verbs: ["list", "watch"]
+
+  - apiGroups: [""]
+    resources:
+      - limitranges
+    verbs: ["list", "watch"]
+
+  - apiGroups: ["admissionregistration.k8s.io"]
+    resources:
+      - mutatingwebhookconfigurations
+    verbs: ["list", "watch"]
+
+  - apiGroups: [""]
+    resources:
+      - namespaces
+    verbs: ["list", "watch"]
+
+  - apiGroups: ["networking.k8s.io"]
+    resources:
+      - networkpolicies
+    verbs: ["list", "watch"]
+
+  - apiGroups: [""]
+    resources:
+      - nodes
+    verbs: ["list", "watch"]
+
+  - apiGroups: [""]
+    resources:
+      - persistentvolumeclaims
+    verbs: ["list", "watch"]
+
+  - apiGroups: [""]
+    resources:
+      - persistentvolumes
+    verbs: ["list", "watch"]
+
+  - apiGroups: ["policy"]
+    resources:
+      - poddisruptionbudgets
+    verbs: ["list", "watch"]
+
+  - apiGroups: [""]
+    resources:
+      - pods
+    verbs: ["list", "watch"]
+
+  - apiGroups: ["extensions", "apps"]
+    resources:
+      - replicasets
+    verbs: ["list", "watch"]
+
+  - apiGroups: [""]
+    resources:
+      - replicationcontrollers
+    verbs: ["list", "watch"]
+
+  - apiGroups: [""]
+    resources:
+      - resourcequotas
+    verbs: ["list", "watch"]
+
+  - apiGroups: [""]
+    resources:
+      - secrets
+    verbs: ["list", "watch"]
+
+  - apiGroups: [""]
+    resources:
+      - services
+    verbs: ["list", "watch"]
+
+  - apiGroups: ["apps"]
+    resources:
+      - statefulsets
+    verbs: ["list", "watch"]
+
+  - apiGroups: ["storage.k8s.io"]
+    resources:
+      - storageclasses
+    verbs: ["list", "watch"]
+
+  - apiGroups: ["admissionregistration.k8s.io"]
+    resources:
+      - validatingwebhookconfigurations
+    verbs: ["list", "watch"]
+
+  - apiGroups: ["storage.k8s.io"]
+    resources:
+      - volumeattachments
+    verbs: ["list", "watch"]
+---
+apiVersion: rbac.authorization.k8s.io/v1  # v1beta1 is no longer served from Kubernetes 1.22
+kind: ClusterRoleBinding
+metadata:
+  labels:
+    app.kubernetes.io/name: kube-state-metrics
+  name: kube-state-metrics
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: kube-state-metrics
+subjects:
+  - kind: ServiceAccount
+    name: kube-state-metrics
+    namespace: monitoring
+---
+
+apiVersion: v1
+kind: Service
+metadata:
+  name: kube-state-metrics
+  namespace: monitoring
+  labels:
+    app.kubernetes.io/name: kube-state-metrics
+  annotations:
+    prometheus.io/scrape: 'true'  # matched by the kubernetes-service-endpoints scrape job
+spec:
+  type: "ClusterIP"
+  ports:
+    - name: "http"
+      protocol: TCP
+      port: 8080
+      targetPort: 8080
+  selector:
+    app.kubernetes.io/name: kube-state-metrics
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: kube-state-metrics
+  namespace: monitoring
+  labels:
+    app.kubernetes.io/name: kube-state-metrics
+spec:
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: kube-state-metrics
+  replicas: 1
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: kube-state-metrics
+    spec:
+      serviceAccountName: kube-state-metrics
+      securityContext:
+        fsGroup: 65534
+        runAsGroup: 65534
+        runAsUser: 65534
+      containers:
+        - name: kube-state-metrics
+          args:  # every resource listed here needs a matching ClusterRole rule above
+            - --resources=certificatesigningrequests
+            - --resources=configmaps
+            - --resources=cronjobs
+            - --resources=daemonsets
+            - --resources=deployments
+            - --resources=endpoints
+            - --resources=horizontalpodautoscalers
+            - --resources=ingresses
+            - --resources=jobs
+            - --resources=limitranges
+            - --resources=mutatingwebhookconfigurations
+            - --resources=namespaces
+            - --resources=networkpolicies
+            - --resources=nodes
+            - --resources=persistentvolumeclaims
+            - --resources=persistentvolumes
+            - --resources=poddisruptionbudgets
+            - --resources=pods
+            - --resources=replicasets
+            - --resources=replicationcontrollers
+            - --resources=resourcequotas
+            - --resources=secrets
+            - --resources=services
+            - --resources=statefulsets
+            - --resources=storageclasses
+            - --resources=validatingwebhookconfigurations
+            - --resources=volumeattachments
+          imagePullPolicy: IfNotPresent
+          image: "k8s.gcr.io/kube-state-metrics/kube-state-metrics:v2.1.0"
+          ports:
+            - containerPort: 8080
+          livenessProbe:
+            httpGet:
+              path: /healthz
+              port: 8080
+            initialDelaySeconds: 5
+            timeoutSeconds: 5
+          readinessProbe:
+            httpGet:
+              path: /
+              port: 8080
+            initialDelaySeconds: 5
+            timeoutSeconds: 5
+---
diff --git a/manifests/monitoring/loki.yaml b/manifests/monitoring/loki.yaml
new file mode 100644
index 0000000..6c1db40
--- /dev/null
+++ b/manifests/monitoring/loki.yaml
@@ -0,0 +1,232 @@
+apiVersion: policy/v1beta1  # NOTE(review): PodSecurityPolicy is deprecated since Kubernetes 1.21
+kind: PodSecurityPolicy
+metadata:
+  name: loki
+  labels:
+    app.kubernetes.io/name: loki
+spec:
+  privileged: false
+  allowPrivilegeEscalation: false
+  volumes:
+    - 'configMap'
+    - 'emptyDir'
+    - 'persistentVolumeClaim'
+    - 'secret'
+    - 'projected'
+    - 'downwardAPI'
+  hostNetwork: false
+  hostIPC: false
+  hostPID: false
+  runAsUser:
+    rule: 'MustRunAsNonRoot'
+  seLinux:
+    rule: 'RunAsAny'
+  supplementalGroups:
+    rule: 'MustRunAs'
+    ranges:
+      - min: 1
+        max: 65535
+  fsGroup:
+    rule: 'MustRunAs'
+    ranges:
+      - min: 1
+        max: 65535
+  readOnlyRootFilesystem: true
+  requiredDropCapabilities:
+    - ALL
+---
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: loki
+  namespace: monitoring
+  labels:
+    app.kubernetes.io/name: loki
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: loki
+  namespace: monitoring
+  labels:
+    app.kubernetes.io/name: loki
+data:
+  loki.yaml: |
+    auth_enabled: false
+    chunk_store_config:
+      max_look_back_period: 0s
+    compactor:
+      shared_store: filesystem
+      working_directory: /data/loki/boltdb-shipper-compactor
+    ingester:
+      chunk_block_size: 262144
+      chunk_idle_period: 3m
+      chunk_retain_period: 1m
+      lifecycler:
+        ring:
+          kvstore:
+            store: inmemory
+          replication_factor: 1
+      max_transfer_retries: 0
+    limits_config:
+      enforce_metric_name: false
+      reject_old_samples: true
+      reject_old_samples_max_age: 168h
+    schema_config:
+      configs:
+      - from: "2020-10-24"
+        index:
+          period: 24h
+          prefix: index_
+        object_store: filesystem
+        schema: v11
+        store: boltdb-shipper
+    server:
+      http_listen_port: 3100
+    storage_config:
+      boltdb_shipper:
+        active_index_directory: /data/loki/boltdb-shipper-active
+        cache_location: /data/loki/boltdb-shipper-cache
+        cache_ttl: 24h
+        shared_store: filesystem
+      filesystem:
+        directory: /data/loki/chunks
+    table_manager:
+      retention_deletes_enabled: true
+      retention_period: 720h
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: Role
+metadata:
+  name: loki
+  namespace: monitoring
+  labels:
+    app.kubernetes.io/name: loki
+rules:
+  - apiGroups: ['policy']  # the loki PodSecurityPolicy is a policy/v1beta1 object
+    resources: ['podsecuritypolicies']
+    verbs: ['use']
+    resourceNames: [loki]
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: RoleBinding
+metadata:
+  name: loki
+  namespace: monitoring
+  labels:
+    app.kubernetes.io/name: loki
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: Role
+  name: loki
+subjects:
+  - kind: ServiceAccount
+    name: loki  # no namespace given: resolved in the RoleBinding's own namespace
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: loki-headless
+  namespace: monitoring
+  labels:
+    app.kubernetes.io/name: loki
+    variant: headless
+spec:
+  clusterIP: None  # headless: governing Service of the StatefulSet (serviceName)
+  ports:
+    - port: 3100
+      protocol: TCP
+      name: http-metrics
+      targetPort: http-metrics
+  selector:
+    app.kubernetes.io/name: loki
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: loki
+  namespace: monitoring
+  labels:
+    app.kubernetes.io/name: loki
+spec:
+  type: ClusterIP
+  ports:
+    - port: 3100
+      protocol: TCP
+      name: http-metrics
+      targetPort: http-metrics
+  selector:
+    app.kubernetes.io/name: loki
+---
+apiVersion: apps/v1
+kind: StatefulSet
+metadata:
+  name: loki
+  namespace: monitoring
+  labels:
+    app.kubernetes.io/name: loki
+spec:
+  podManagementPolicy: OrderedReady
+  replicas: 1
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: loki
+  serviceName: loki-headless
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: loki
+      annotations:
+        prometheus.io/port: http-metrics
+        prometheus.io/scrape: "true"
+    spec:
+      serviceAccountName: loki
+      securityContext:
+        fsGroup: 10001
+        runAsGroup: 10001
+        runAsNonRoot: true
+        runAsUser: 10001
+      containers:
+        - name: loki
+          image: "grafana/loki:2.2.1"
+          imagePullPolicy: IfNotPresent
+          args:
+            - "-config.file=/etc/loki/loki.yaml"
+          volumeMounts:
+            - name: config
+              mountPath: /etc/loki
+            - name: storage
+              mountPath: "/data"  # every path in loki.yaml lives under /data/loki
+              subPath: ""  # explicit empty string (was a bare null): mount the volume root
+          ports:
+            - name: http-metrics
+              containerPort: 3100
+              protocol: TCP
+          livenessProbe:
+            httpGet:
+              path: /ready
+              port: http-metrics
+            initialDelaySeconds: 45
+          readinessProbe:
+            httpGet:
+              path: /ready
+              port: http-metrics
+            initialDelaySeconds: 45
+          securityContext:
+            readOnlyRootFilesystem: true
+      terminationGracePeriodSeconds: 4800
+      volumes:
+        - name: config
+          configMap:
+            name: loki
+  volumeClaimTemplates:
+    - metadata:
+        name: storage
+      spec:
+        accessModes:
+          - ReadWriteOnce
+        resources:
+          requests:
+            storage: "10Gi"
+        storageClassName: scw-bssd
+
diff --git a/manifests/monitoring/node-exporter.yaml b/manifests/monitoring/node-exporter.yaml
new file mode 100644
index 0000000..3d5d1bc
--- /dev/null
+++ b/manifests/monitoring/node-exporter.yaml
@@ -0,0 +1,87 @@
+
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: prometheus-node-exporter
+  namespace: monitoring
+  labels:
+    app.kubernetes.io/name: prometheus
+    app.kubernetes.io/component: node-exporter
+---
+apiVersion: v1
+kind: Service
+metadata:
+  annotations:
+    prometheus.io/scrape: "true"
+  labels:
+    app.kubernetes.io/name: prometheus
+    app.kubernetes.io/component: node-exporter
+  name: prometheus-node-exporter
+  namespace: monitoring
+spec:
+  clusterIP: None
+  ports:
+    - name: metrics
+      port: 9100
+      protocol: TCP
+      targetPort: 9100
+  selector:
+    app.kubernetes.io/name: prometheus
+    app.kubernetes.io/component: node-exporter
+  type: "ClusterIP"
+---
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  labels:
+    app.kubernetes.io/name: prometheus
+    app.kubernetes.io/component: node-exporter
+  name: prometheus-node-exporter
+  namespace: monitoring
+spec:
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: prometheus
+      app.kubernetes.io/component: node-exporter
+  updateStrategy:
+    type: RollingUpdate
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: prometheus
+        app.kubernetes.io/component: node-exporter
+    spec:
+      serviceAccountName: prometheus-node-exporter
+      containers:
+        - name: prometheus-node-exporter
+          image: "prom/node-exporter:v1.1.2"
+          imagePullPolicy: "IfNotPresent"
+          args:
+            - --path.procfs=/host/proc
+            - --path.sysfs=/host/sys
+            - --no-collector.wifi
+            - --no-collector.hwmon
+            - --no-collector.netclass
+            - --collector.filesystem.ignored-mount-points=^/(dev|proc|sys|var/lib/docker/.+|var/lib/kubelet/pods/.+)($|/)
+            - --web.listen-address=:9100
+          ports:
+            - name: metrics
+              containerPort: 9100
+              hostPort: 9100
+          volumeMounts:
+            - name: proc
+              mountPath: /host/proc
+              readOnly: true
+            - name: sys
+              mountPath: /host/sys
+              readOnly: true
+      hostNetwork: true
+      hostPID: true
+      volumes:
+        - name: proc
+          hostPath:
+            path: /proc
+        - name: sys
+          hostPath:
+            path: /sys
+---
diff --git a/manifests/monitoring/prometheus-server.yaml b/manifests/monitoring/prometheus-server.yaml
new file mode 100644
index 0000000..db32df0
--- /dev/null
+++ b/manifests/monitoring/prometheus-server.yaml
@@ -0,0 +1,506 @@
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: prometheus-server
+  namespace: monitoring
+  labels:
+    app.kubernetes.io/name: prometheus
+    app.kubernetes.io/component: server
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: prometheus-server
+  namespace: monitoring
+  labels:
+    app.kubernetes.io/name: prometheus
+    app.kubernetes.io/component: server
+data:
+  alerting_rules.yml: |
+    {}
+  alerts: |
+    {}
+  prometheus.yml: |
+    global:
+      evaluation_interval: 1m
+      scrape_interval: 1m
+      scrape_timeout: 10s
+    rule_files:
+    - /etc/config/recording_rules.yml
+    - /etc/config/alerting_rules.yml
+    - /etc/config/rules
+    - /etc/config/alerts
+    scrape_configs:
+    - job_name: prometheus
+      static_configs:
+      - targets:
+        - localhost:9090
+    - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
+      job_name: kubernetes-apiservers
+      kubernetes_sd_configs:
+      - role: endpoints
+      relabel_configs:
+      - action: keep
+        regex: default;kubernetes;https
+        source_labels:
+        - __meta_kubernetes_namespace
+        - __meta_kubernetes_service_name
+        - __meta_kubernetes_endpoint_port_name
+      scheme: https
+      tls_config:
+        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+        insecure_skip_verify: true
+    - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
+      job_name: kubernetes-nodes
+      kubernetes_sd_configs:
+      - role: node
+      relabel_configs:
+      - action: labelmap
+        regex: __meta_kubernetes_node_label_(.+)
+      - replacement: kubernetes.default.svc:443
+        target_label: __address__
+      - regex: (.+)
+        replacement: /api/v1/nodes/$1/proxy/metrics
+        source_labels:
+        - __meta_kubernetes_node_name
+        target_label: __metrics_path__
+      scheme: https
+      tls_config:
+        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+        insecure_skip_verify: true
+    - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
+      job_name: kubernetes-nodes-cadvisor
+      kubernetes_sd_configs:
+      - role: node
+      relabel_configs:
+      - action: labelmap
+        regex: __meta_kubernetes_node_label_(.+)
+      - replacement: kubernetes.default.svc:443
+        target_label: __address__
+      - regex: (.+)
+        replacement: /api/v1/nodes/$1/proxy/metrics/cadvisor
+        source_labels:
+        - __meta_kubernetes_node_name
+        target_label: __metrics_path__
+      scheme: https
+      tls_config:
+        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+        insecure_skip_verify: true
+    - job_name: kubernetes-service-endpoints
+      kubernetes_sd_configs:
+      - role: endpoints
+      relabel_configs:
+      - action: keep
+        regex: true
+        source_labels:
+        - __meta_kubernetes_service_annotation_prometheus_io_scrape
+      - action: replace
+        regex: (https?)
+        source_labels:
+        - __meta_kubernetes_service_annotation_prometheus_io_scheme
+        target_label: __scheme__
+      - action: replace
+        regex: (.+)
+        source_labels:
+        - __meta_kubernetes_service_annotation_prometheus_io_path
+        target_label: __metrics_path__
+      - action: replace
+        regex: ([^:]+)(?::\d+)?;(\d+)
+        replacement: $1:$2
+        source_labels:
+        - __address__
+        - __meta_kubernetes_service_annotation_prometheus_io_port
+        target_label: __address__
+      - action: labelmap
+        regex: __meta_kubernetes_service_label_(.+)
+      - action: replace
+        source_labels:
+        - __meta_kubernetes_namespace
+        target_label: kubernetes_namespace
+      - action: replace
+        source_labels:
+        - __meta_kubernetes_service_name
+        target_label: kubernetes_name
+      - action: replace
+        source_labels:
+        - __meta_kubernetes_pod_node_name
+        target_label: kubernetes_node
+    - job_name: kubernetes-service-endpoints-slow
+      kubernetes_sd_configs:
+      - role: endpoints
+      relabel_configs:
+      - action: keep
+        regex: true
+        source_labels:
+        - __meta_kubernetes_service_annotation_prometheus_io_scrape_slow
+      - action: replace
+        regex: (https?)
+        source_labels:
+        - __meta_kubernetes_service_annotation_prometheus_io_scheme
+        target_label: __scheme__
+      - action: replace
+        regex: (.+)
+        source_labels:
+        - __meta_kubernetes_service_annotation_prometheus_io_path
+        target_label: __metrics_path__
+      - action: replace
+        regex: ([^:]+)(?::\d+)?;(\d+)
+        replacement: $1:$2
+        source_labels:
+        - __address__
+        - __meta_kubernetes_service_annotation_prometheus_io_port
+        target_label: __address__
+      - action: labelmap
+        regex: __meta_kubernetes_service_label_(.+)
+      - action: replace
+        source_labels:
+        - __meta_kubernetes_namespace
+        target_label: kubernetes_namespace
+      - action: replace
+        source_labels:
+        - __meta_kubernetes_service_name
+        target_label: kubernetes_name
+      - action: replace
+        source_labels:
+        - __meta_kubernetes_pod_node_name
+        target_label: kubernetes_node
+      scrape_interval: 5m
+      scrape_timeout: 30s
+    - honor_labels: true
+      job_name: prometheus-pushgateway
+      kubernetes_sd_configs:
+      - role: service
+      relabel_configs:
+      - action: keep
+        regex: pushgateway
+        source_labels:
+        - __meta_kubernetes_service_annotation_prometheus_io_probe
+    - job_name: kubernetes-services
+      kubernetes_sd_configs:
+      - role: service
+      metrics_path: /probe
+      params:
+        module:
+        - http_2xx
+      relabel_configs:
+      - action: keep
+        regex: true
+        source_labels:
+        - __meta_kubernetes_service_annotation_prometheus_io_probe
+      - source_labels:
+        - __address__
+        target_label: __param_target
+      # Probe via the blackbox exporter, the same address the static probe jobs use.
+      - replacement: blackbox-exporter:9115
+        target_label: __address__
+      - source_labels:
+        - __param_target
+        target_label: instance
+      - action: labelmap
+        regex: __meta_kubernetes_service_label_(.+)
+      - source_labels:
+        - __meta_kubernetes_namespace
+        target_label: kubernetes_namespace
+      - source_labels:
+        - __meta_kubernetes_service_name
+        target_label: kubernetes_name
+    - job_name: kubernetes-pods
+      kubernetes_sd_configs:
+      - role: pod
+      relabel_configs:
+      - action: keep
+        regex: true
+        source_labels:
+        - __meta_kubernetes_pod_annotation_prometheus_io_scrape
+      - action: replace
+        regex: (.+)
+        source_labels:
+        - __meta_kubernetes_pod_annotation_prometheus_io_path
+        target_label: __metrics_path__
+      - action: replace
+        regex: ([^:]+)(?::\d+)?;(\d+)
+        replacement: $1:$2
+        source_labels:
+        - __address__
+        - __meta_kubernetes_pod_annotation_prometheus_io_port
+        target_label: __address__
+      - action: labelmap
+        regex: __meta_kubernetes_pod_label_(.+)
+      - action: replace
+        source_labels:
+        - __meta_kubernetes_namespace
+        target_label: kubernetes_namespace
+      - action: replace
+        source_labels:
+        - __meta_kubernetes_pod_name
+        target_label: kubernetes_pod_name
+      - action: drop
+        regex: Pending|Succeeded|Failed
+        source_labels:
+        - __meta_kubernetes_pod_phase
+    - job_name: kubernetes-pods-slow
+      kubernetes_sd_configs:
+      - role: pod
+      relabel_configs:
+      - action: keep
+        regex: true
+        source_labels:
+        - __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow
+      - action: replace
+        regex: (.+)
+        source_labels:
+        - __meta_kubernetes_pod_annotation_prometheus_io_path
+        target_label: __metrics_path__
+      - action: replace
+        regex: ([^:]+)(?::\d+)?;(\d+)
+        replacement: $1:$2
+        source_labels:
+        - __address__
+        - __meta_kubernetes_pod_annotation_prometheus_io_port
+        target_label: __address__
+      - action: labelmap
+        regex: __meta_kubernetes_pod_label_(.+)
+      - action: replace
+        source_labels:
+        - __meta_kubernetes_namespace
+        target_label: kubernetes_namespace
+      - action: replace
+        source_labels:
+        - __meta_kubernetes_pod_name
+        target_label: kubernetes_pod_name
+      - action: drop
+        regex: Pending|Succeeded|Failed
+        source_labels:
+        - __meta_kubernetes_pod_phase
+      scrape_interval: 5m
+      scrape_timeout: 30s
+    - job_name: 'prometheus-blackbox-exporter-ping'
+      metrics_path: /probe
+      params:
+        module: [icmp_ping]
+      static_configs:
+      - targets: []
+      relabel_configs:
+      - source_labels: [__address__]
+        target_label: __param_target
+      - source_labels: [__param_target]
+        target_label: instance
+      - target_label: __address__
+        replacement: blackbox-exporter:9115
+    - job_name: 'prometheus-blackbox-exporter-http'
+      metrics_path: /probe
+      params:
+        module: [http_2xx]
+      static_configs:
+      - targets: []
+      relabel_configs:
+      - source_labels: [__address__]
+        target_label: __param_target
+      - source_labels: [__param_target]
+        target_label: instance
+      - target_label: __address__
+        replacement: blackbox-exporter:9115
+    - job_name: 'node-exporter'
+      kubernetes_sd_configs:
+      - role: endpoints
+      relabel_configs:
+      # Relabel regexes are fully anchored; the Service is named prometheus-node-exporter.
+      - source_labels: [__meta_kubernetes_endpoints_name]
+        regex: 'prometheus-node-exporter'
+        action: keep
+    - job_name: 'federated-clusters'
+      scrape_interval: 15s
+      honor_labels: true
+      params:
+        'match[]':
+        - '{job="prometheus"}'
+        - '{job="node"}'
+        - '{job="node_exporter"}'
+        - '{job="zfs_exporter"}'
+        - '{job=~"kubernetes.*"}'
+      metrics_path: '/federate'
+      static_configs:
+      - targets:
+        - 'prometheus-local.inlets.svc:80'
+  recording_rules.yml: |
+    {}
+  rules: |
+    {}
+---
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: prometheus-server
+  namespace: monitoring
+  labels:
+    app.kubernetes.io/name: prometheus
+    app.kubernetes.io/component: server
+spec:
+  accessModes:
+    - ReadWriteOnce
+  resources:
+    requests:
+      storage: "8Gi"
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  labels:
+    app.kubernetes.io/name: prometheus
+    app.kubernetes.io/component: server
+  name: prometheus-server
+rules:
+  - apiGroups:
+      - ""
+    resources:
+      - nodes
+      - nodes/proxy
+      - nodes/metrics
+      - services
+      - endpoints
+      - pods
+      - ingresses
+      - configmaps
+    verbs:
+      - get
+      - list
+      - watch
+  - apiGroups:
+      - "extensions"
+      - "networking.k8s.io"
+    resources:
+      - ingresses/status
+      - ingresses
+    verbs:
+      - get
+      - list
+      - watch
+  - nonResourceURLs:
+      - "/metrics"
+    verbs:
+      - get
+---
+
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  labels:
+    app.kubernetes.io/name: prometheus
+    app.kubernetes.io/component: server
+  name: prometheus-server
+subjects:
+  - kind: ServiceAccount
+    name: prometheus-server
+    namespace: monitoring
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: prometheus-server
+---
+
+apiVersion: v1
+kind: Service
+metadata:
+  labels:
+    app.kubernetes.io/name: prometheus
+    app.kubernetes.io/component: server
+  name: prometheus-server
+  namespace: monitoring
+spec:
+  ports:
+    - name: http
+      port: 80
+      protocol: TCP
+      targetPort: 9090
+  selector:
+    app.kubernetes.io/name: prometheus
+    app.kubernetes.io/component: server
+  sessionAffinity: None
+  type: "ClusterIP"
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  labels:
+    app.kubernetes.io/name: prometheus
+    app.kubernetes.io/component: server
+  name: prometheus-server
+  namespace: monitoring
+spec:
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: prometheus
+      app.kubernetes.io/component: server
+  replicas: 1
+  # Recreate: the single ReadWriteOnce volume cannot be shared with a surging pod.
+  strategy:
+    type: Recreate
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: prometheus
+        app.kubernetes.io/component: server
+    spec:
+      serviceAccountName: prometheus-server
+      containers:
+        - name: prometheus-server-configmap-reload
+          image: "jimmidyson/configmap-reload:v0.5.0"
+          imagePullPolicy: "IfNotPresent"
+          args:
+            - --volume-dir=/etc/config
+            - --webhook-url=http://127.0.0.1:9090/-/reload
+          volumeMounts:
+            - name: config-volume
+              mountPath: /etc/config
+              readOnly: true
+
+        - name: prometheus-server
+          image: "prom/prometheus:v2.27.1"
+          imagePullPolicy: "IfNotPresent"
+          args:
+            - --storage.tsdb.retention.time=15d
+            - --config.file=/etc/config/prometheus.yml
+            - --storage.tsdb.path=/data
+            - --web.console.libraries=/etc/prometheus/console_libraries
+            - --web.console.templates=/etc/prometheus/consoles
+            - --web.enable-lifecycle
+          ports:
+            - containerPort: 9090
+          readinessProbe:
+            httpGet:
+              path: /-/ready
+              port: 9090
+            initialDelaySeconds: 30
+            periodSeconds: 5
+            timeoutSeconds: 30
+            failureThreshold: 3
+            successThreshold: 1
+          livenessProbe:
+            httpGet:
+              path: /-/healthy
+              port: 9090
+            initialDelaySeconds: 30
+            periodSeconds: 15
+            timeoutSeconds: 30
+            failureThreshold: 3
+            successThreshold: 1
+          volumeMounts:
+            - name: config-volume
+              mountPath: /etc/config
+            - name: storage-volume
+              mountPath: /data
+              subPath: ""
+      securityContext:
+        fsGroup: 65534
+        runAsGroup: 65534
+        runAsNonRoot: true
+        runAsUser: 65534
+      terminationGracePeriodSeconds: 300
+      volumes:
+        - name: config-volume
+          configMap:
+            name: prometheus-server
+        - name: storage-volume
+          persistentVolumeClaim:
+            claimName: prometheus-server
+---
diff --git a/manifests/monitoring/promtail.yaml b/manifests/monitoring/promtail.yaml
new file mode 100644
index 0000000..b7a7bb7
--- /dev/null
+++ b/manifests/monitoring/promtail.yaml
@@ -0,0 +1,449 @@
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: promtail
+  namespace: monitoring
+  labels:
+    app.kubernetes.io/name: promtail
+---
+apiVersion: policy/v1beta1
+kind: PodSecurityPolicy
+metadata:
+  name: promtail
+  namespace: monitoring
+  labels:
+    app.kubernetes.io/name: promtail
+spec:
+  allowPrivilegeEscalation: false
+  fsGroup:
+    rule: RunAsAny
+  hostIPC: false
+  hostNetwork: false
+  hostPID: false
+  privileged: false
+  readOnlyRootFilesystem: true
+  requiredDropCapabilities:
+    - ALL
+  runAsUser:
+    rule: RunAsAny
+  seLinux:
+    rule: RunAsAny
+  supplementalGroups:
+    rule: RunAsAny
+  volumes:
+    - secret
+    - configMap
+    - hostPath
+    - projected
+    - downwardAPI
+    - emptyDir
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: promtail
+  namespace: monitoring
+  labels:
+    app.kubernetes.io/name: promtail
+data:
+  promtail.yaml: |
+    client:
+      backoff_config:
+        max_period: 5m
+        max_retries: 10
+        min_period: 500ms
+      batchsize: 1048576
+      batchwait: 1s
+      external_labels: {}
+      timeout: 10s
+    positions:
+      filename: /run/promtail/positions.yaml
+    server:
+      http_listen_port: 3101
+    target_config:
+      sync_period: 10s
+    scrape_configs:
+    - job_name: kubernetes-pods-name
+      pipeline_stages:
+        - docker: {}
+      kubernetes_sd_configs:
+      - role: pod
+      relabel_configs:
+      - source_labels:
+        - __meta_kubernetes_pod_label_name
+        target_label: __service__
+      - source_labels:
+        - __meta_kubernetes_pod_node_name
+        target_label: __host__
+      - action: drop
+        regex: ''
+        source_labels:
+        - __service__
+      - action: labelmap
+        regex: __meta_kubernetes_pod_label_(.+)
+      - action: replace
+        replacement: $1
+        separator: /
+        source_labels:
+        - __meta_kubernetes_namespace
+        - __service__
+        target_label: job
+      - action: replace
+        source_labels:
+        - __meta_kubernetes_namespace
+        target_label: namespace
+      - action: replace
+        source_labels:
+        - __meta_kubernetes_pod_name
+        target_label: pod
+      - action: replace
+        source_labels:
+        - __meta_kubernetes_pod_container_name
+        target_label: container
+      - replacement: /var/log/pods/*$1/*.log
+        separator: /
+        source_labels:
+        - __meta_kubernetes_pod_uid
+        - __meta_kubernetes_pod_container_name
+        target_label: __path__
+    - job_name: kubernetes-pods-app
+      pipeline_stages:
+        - docker: {}
+      kubernetes_sd_configs:
+      - role: pod
+      relabel_configs:
+      - action: drop
+        regex: .+
+        source_labels:
+        - __meta_kubernetes_pod_label_name
+      - source_labels:
+        - __meta_kubernetes_pod_label_app
+        target_label: __service__
+      - source_labels:
+        - __meta_kubernetes_pod_node_name
+        target_label: __host__
+      - action: drop
+        regex: ''
+        source_labels:
+        - __service__
+      - action: labelmap
+        regex: __meta_kubernetes_pod_label_(.+)
+      - action: replace
+        replacement: $1
+        separator: /
+        source_labels:
+        - __meta_kubernetes_namespace
+        - __service__
+        target_label: job
+      - action: replace
+        source_labels:
+        - __meta_kubernetes_namespace
+        target_label: namespace
+      - action: replace
+        source_labels:
+        - __meta_kubernetes_pod_name
+        target_label: pod
+      - action: replace
+        source_labels:
+        - __meta_kubernetes_pod_container_name
+        target_label: container
+      - replacement: /var/log/pods/*$1/*.log
+        separator: /
+        source_labels:
+        - __meta_kubernetes_pod_uid
+        - __meta_kubernetes_pod_container_name
+        target_label: __path__
+    - job_name: kubernetes-pods-direct-controllers
+      pipeline_stages:
+        - docker: {}
+      kubernetes_sd_configs:
+      - role: pod
+      relabel_configs:
+      - action: drop
+        regex: .+
+        separator: ''
+        source_labels:
+        - __meta_kubernetes_pod_label_name
+        - __meta_kubernetes_pod_label_app
+      - action: drop
+        regex: '[0-9a-z-.]+-[0-9a-f]{8,10}'
+        source_labels:
+        - __meta_kubernetes_pod_controller_name
+      - source_labels:
+        - __meta_kubernetes_pod_controller_name
+        target_label: __service__
+      - source_labels:
+        - __meta_kubernetes_pod_node_name
+        target_label: __host__
+      - action: drop
+        regex: ''
+        source_labels:
+        - __service__
+      - action: labelmap
+        regex: __meta_kubernetes_pod_label_(.+)
+      - action: replace
+        replacement: $1
+        separator: /
+        source_labels:
+        - __meta_kubernetes_namespace
+        - __service__
+        target_label: job
+      - action: replace
+        source_labels:
+        - __meta_kubernetes_namespace
+        target_label: namespace
+      - action: replace
+        source_labels:
+        - __meta_kubernetes_pod_name
+        target_label: pod
+      - action: replace
+        source_labels:
+        - __meta_kubernetes_pod_container_name
+        target_label: container
+      - replacement: /var/log/pods/*$1/*.log
+        separator: /
+        source_labels:
+        - __meta_kubernetes_pod_uid
+        - __meta_kubernetes_pod_container_name
+        target_label: __path__
+    - job_name: kubernetes-pods-indirect-controller
+      pipeline_stages:
+        - docker: {}
+      kubernetes_sd_configs:
+      - role: pod
+      relabel_configs:
+      - action: drop
+        regex: .+
+        separator: ''
+        source_labels:
+        - __meta_kubernetes_pod_label_name
+        - __meta_kubernetes_pod_label_app
+      - action: keep
+        regex: '[0-9a-z-.]+-[0-9a-f]{8,10}'
+        source_labels:
+        - __meta_kubernetes_pod_controller_name
+      - action: replace
+        regex: '([0-9a-z-.]+)-[0-9a-f]{8,10}'
+        source_labels:
+        - __meta_kubernetes_pod_controller_name
+        target_label: __service__
+      - source_labels:
+        - __meta_kubernetes_pod_node_name
+        target_label: __host__
+      - action: drop
+        regex: ''
+        source_labels:
+        - __service__
+      - action: labelmap
+        regex: __meta_kubernetes_pod_label_(.+)
+      - action: replace
+        replacement: $1
+        separator: /
+        source_labels:
+        - __meta_kubernetes_namespace
+        - __service__
+        target_label: job
+      - action: replace
+        source_labels:
+        - __meta_kubernetes_namespace
+        target_label: namespace
+      - action: replace
+        source_labels:
+        - __meta_kubernetes_pod_name
+        target_label: pod
+      - action: replace
+        source_labels:
+        - __meta_kubernetes_pod_container_name
+        target_label: container
+      - replacement: /var/log/pods/*$1/*.log
+        separator: /
+        source_labels:
+        - __meta_kubernetes_pod_uid
+        - __meta_kubernetes_pod_container_name
+        target_label: __path__
+    - job_name: kubernetes-pods-static
+      pipeline_stages:
+        - docker: {}
+      kubernetes_sd_configs:
+      - role: pod
+      relabel_configs:
+      - action: drop
+        regex: ''
+        source_labels:
+        - __meta_kubernetes_pod_annotation_kubernetes_io_config_mirror
+      - action: replace
+        source_labels:
+        - __meta_kubernetes_pod_label_component
+        target_label: __service__
+      - source_labels:
+        - __meta_kubernetes_pod_node_name
+        target_label: __host__
+      - action: drop
+        regex: ''
+        source_labels:
+        - __service__
+      - action: labelmap
+        regex: __meta_kubernetes_pod_label_(.+)
+      - action: replace
+        replacement: $1
+        separator: /
+        source_labels:
+        - __meta_kubernetes_namespace
+        - __service__
+        target_label: job
+      - action: replace
+        source_labels:
+        - __meta_kubernetes_namespace
+        target_label: namespace
+      - action: replace
+        source_labels:
+        - __meta_kubernetes_pod_name
+        target_label: pod
+      - action: replace
+        source_labels:
+        - __meta_kubernetes_pod_container_name
+        target_label: container
+      - replacement: /var/log/pods/*$1/*.log
+        separator: /
+        source_labels:
+        - __meta_kubernetes_pod_annotation_kubernetes_io_config_mirror
+        - __meta_kubernetes_pod_container_name
+        target_label: __path__
+---
+kind: ClusterRole
+apiVersion: rbac.authorization.k8s.io/v1
+metadata:
+  name: promtail-clusterrole
+  labels:
+    app.kubernetes.io/name: promtail
+rules:
+  - apiGroups: [""] # "" indicates the core API group
+    resources:
+      - nodes
+      - nodes/proxy
+      - services
+      - endpoints
+      - pods
+    verbs: ["get", "watch", "list"]
+---
+kind: ClusterRoleBinding
+apiVersion: rbac.authorization.k8s.io/v1
+metadata:
+  name: promtail-clusterrolebinding
+  labels:
+    app.kubernetes.io/name: promtail
+subjects:
+  - kind: ServiceAccount
+    name: promtail
+    namespace: monitoring
+roleRef:
+  kind: ClusterRole
+  name: promtail-clusterrole
+  apiGroup: rbac.authorization.k8s.io
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: Role
+metadata:
+  name: promtail
+  namespace: monitoring
+  labels:
+    app.kubernetes.io/name: promtail
+rules:
+  - apiGroups: ['policy']  # the promtail PodSecurityPolicy is a policy/v1beta1 object
+    resources: ['podsecuritypolicies']
+    verbs: ['use']
+    resourceNames: [promtail]
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: RoleBinding
+metadata:
+  name: promtail
+  namespace: monitoring
+  labels:
+    app.kubernetes.io/name: promtail
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: Role
+  name: promtail
+subjects:
+  - kind: ServiceAccount
+    name: promtail
+---
+
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: promtail
+  namespace: monitoring
+  labels:
+    app.kubernetes.io/name: promtail
+spec:
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: promtail
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: promtail
+      annotations:
+        prometheus.io/port: http-metrics
+        prometheus.io/scrape: "true"
+    spec:
+      serviceAccountName: promtail
+      containers:
+        - name: promtail
+          image: "grafana/promtail:2.2.1"
+          imagePullPolicy: IfNotPresent
+          args:
+            - "-config.file=/etc/promtail/promtail.yaml"
+            - "-client.url=http://loki:3100/loki/api/v1/push"
+          volumeMounts:
+            - name: config
+              mountPath: /etc/promtail
+            - name: run
+              mountPath: /run/promtail
+            - mountPath: /var/lib/docker/containers
+              name: docker
+              readOnly: true
+            - mountPath: /var/log/pods
+              name: pods
+              readOnly: true
+          env:
+            - name: HOSTNAME
+              valueFrom:
+                fieldRef:
+                  fieldPath: spec.nodeName
+          ports:
+            - containerPort: 3101
+              name: http-metrics
+          securityContext:
+            readOnlyRootFilesystem: true
+            runAsGroup: 0
+            runAsUser: 0
+          readinessProbe:
+            failureThreshold: 5
+            httpGet:
+              path: /ready
+              port: http-metrics
+            initialDelaySeconds: 10
+            periodSeconds: 10
+            successThreshold: 1
+            timeoutSeconds: 1
+      tolerations:
+        - effect: NoSchedule
+          key: node-role.kubernetes.io/master
+          operator: Exists
+      volumes:
+        - name: config
+          configMap:
+            name: promtail
+        - name: run
+          hostPath:
+            path: /run/promtail
+        - hostPath:
+            path: /var/lib/docker/containers
+          name: docker
+        - hostPath:
+            path: /var/log/pods
+          name: pods
+---