Server monitoring here is organized into two parts: servers that have joined the Kubernetes cluster and servers that have not. Both are monitored with Prometheus.
I. Kubernetes monitoring
1. Create a namespace
kubectl create namespace monitor
2. Deploy Prometheus
prometheus-rbac.yaml
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
labels:
app.kubernetes.io/name: prometheus
name: prometheus
namespace: monitor
rules:
- apiGroups:
- ""
resources:
- nodes
- nodes/proxy
- pods
- services
- endpoints
verbs:
- get
- list
- watch
- apiGroups:
- extensions
resources:
- ingresses
verbs:
- get
- list
- watch
- nonResourceURLs:
- "/metrics"
verbs:
- get
---
apiVersion: v1
kind: ServiceAccount
metadata:
name: prometheus
namespace: monitor
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: prometheus
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: prometheus
subjects:
- kind: ServiceAccount
name: prometheus
namespace: monitor
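Once these RBAC objects have been applied (see the apply step below), you can verify that the service account really has the access Prometheus needs; a quick check with kubectl:
kubectl auth can-i list nodes --as=system:serviceaccount:monitor:prometheus
kubectl auth can-i get nodes/proxy --as=system:serviceaccount:monitor:prometheus
Both commands should answer yes.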
prometheus-config-kubernetes.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: prometheus-config
namespace: monitor
data:
prometheus.yml: |
global:
scrape_configs:
- job_name: 'kubernetes-kubelet'
scheme: https
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
kubernetes_sd_configs:
- role: node
relabel_configs:
- action: labelmap
regex: __meta_kubernetes_node_label_(.+)
- target_label: __address__
replacement: 192.168.11.210:8443
- source_labels: [__meta_kubernetes_node_name]
regex: (.+)
target_label: __metrics_path__
replacement: /api/v1/nodes/${1}/proxy/metrics
- job_name: 'kubernetes-cadvisor'
scheme: https
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
kubernetes_sd_configs:
- role: node
relabel_configs:
- action: labelmap
regex: __meta_kubernetes_node_label_(.+)
- target_label: __address__
replacement: 192.168.11.210:8443
- source_labels: [__meta_kubernetes_node_name]
regex: (.+)
target_label: __metrics_path__
replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
- job_name: kube-state-metrics
kubernetes_sd_configs:
- role: endpoints
relabel_configs:
- source_labels: [__meta_kubernetes_service_label_app_kubernetes_io_name]
regex: kube-state-metrics
replacement: $1
action: keep
- action: labelmap
regex: __meta_kubernetes_service_label_(.+)
- source_labels: [__meta_kubernetes_namespace]
action: replace
target_label: k8s_namespace
- source_labels: [__meta_kubernetes_service_name]
action: replace
target_label: k8s_sname
prometheus.yaml
---
apiVersion: apps/v1
kind: Deployment
metadata:
labels:
name: prometheus
name: prometheus
namespace: monitor
spec:
replicas: 1
selector:
matchLabels:
app: prometheus-server
template:
metadata:
labels:
app: prometheus-server
spec:
serviceAccountName: prometheus
containers:
- name: prometheus
image: prom/prometheus:v2.22.0
ports:
- containerPort: 9090
protocol: TCP
volumeMounts:
- mountPath: "/etc/prometheus"
name: config-prometheus
imagePullSecrets:
- name: authllzg
volumes:
- name: config-prometheus
configMap:
name: prometheus-config
---
kind: Service
apiVersion: v1
metadata:
name: prometheus
namespace: monitor
spec:
selector:
app: prometheus-server
ports:
- protocol: TCP
port: 9090
targetPort: 9090
name: prom
type: ClusterIP
---
apiVersion: networking.k8s.io/v1beta1
#apiVersion: extensions/v1beta1
kind: Ingress
metadata:
name: prometheus
namespace: monitor
spec:
rules:
- host: test-prometheus.bsg.com
http:
paths:
- path: /
backend:
serviceName: prometheus
servicePort: prom
Apply each of the above configuration files to Kubernetes (applying them one at a time makes it easier to check whether each piece deployed successfully):
kubectl apply -f *.yaml
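If everything was applied cleanly, the Deployment, Service, and Ingress should show up:
kubectl -n monitor get pods,svc,ingress
Note that Prometheus reads prometheus.yml only at startup in this setup, so after changing the ConfigMap you can pick up the new configuration with:
kubectl -n monitor rollout restart deployment prometheus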
3. Deploy kube-state-metrics
Prometheus needs to scrape metrics from both cAdvisor and kube-state-metrics. Because cAdvisor is built into the kubelet and can be used directly, only kube-state-metrics has to be deployed (a quick sanity check of the cAdvisor endpoint follows the file list below); see the lower part of the following link for the manifests.
https://grafana.com/grafana/d…
cluster-role-binding.yaml
cluster-role.yaml
service-account.yaml
For these three files, use the templates from the reference link above.
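Since cAdvisor is exposed through the kubelet, the proxy path used in the scrape config can be sanity-checked before deploying anything; a sketch that simply picks the first node in the cluster:
kubectl get --raw "/api/v1/nodes/$(kubectl get nodes -o jsonpath='{.items[0].metadata.name}')/proxy/metrics/cadvisor" | head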
kube-state-metrics.yaml
---
apiVersion: apps/v1
kind: Deployment
metadata:
labels:
app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/version: v1.9.7
name: kube-state-metrics
namespace: monitor
spec:
replicas: 1
selector:
matchLabels:
app.kubernetes.io/name: kube-state-metrics
template:
metadata:
labels:
app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/version: v1.9.7
spec:
containers:
- image: quay.mirrors.ustc.edu.cn/coreos/kube-state-metrics:v1.9.7
livenessProbe:
httpGet:
path: /healthz
port: 8080
initialDelaySeconds: 5
timeoutSeconds: 5
name: kube-state-metrics
ports:
- containerPort: 8080
name: http-metrics
- containerPort: 8081
name: telemetry
readinessProbe:
httpGet:
path: /
port: 8081
initialDelaySeconds: 5
timeoutSeconds: 5
serviceAccountName: kube-state-metrics
---
apiVersion: v1
kind: Service
metadata:
labels:
app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/version: v1.9.7
name: kube-state-metrics
namespace: monitor
annotations:
prometheus.io/scrape: "true" ## add this annotation so Prometheus can auto-discover the service
spec:
clusterIP: None
ports:
- name: http-metrics
port: 8080
targetPort: http-metrics
- name: telemetry
port: 8081
targetPort: telemetry
selector:
app.kubernetes.io/name: kube-state-metrics
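To confirm kube-state-metrics is serving metrics before relying on it in dashboards, port-forward the Deployment and curl it; for example:
kubectl -n monitor port-forward deploy/kube-state-metrics 8080:8080
curl -s http://localhost:8080/metrics | head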
4. Deploy node-exporter
node-exporter.yaml
{
"kind": "DaemonSet",
"apiVersion": "apps/v1",
"metadata": {
"name": "node-exporter",
"namespace": "monitor"
},
"spec": {
"selector": {
"matchLabels": {
"daemon": "node-exporter",
"grafanak8sapp": "true"
}
},
"template": {
"metadata": {
"name": "node-exporter",
"labels": {
"daemon": "node-exporter",
"grafanak8sapp": "true"
}
},
"spec": {
"volumes": [
{
"name": "proc",
"hostPath": {"path": "/proc"}
},
{
"name": "sys",
"hostPath": {"path": "/sys"}
}
],
"containers": [
{
"name": "node-exporter",
"image": "prom/node-exporter:v1.0.1",
"args": [
"--path.procfs=/proc_host",
"--path.sysfs=/host_sys"
],
"ports": [
{
"name": "node-exporter",
"hostPort": 9100,
"containerPort": 9100
}
],
"volumeMounts": [
{
"name": "sys",
"readOnly": true,
"mountPath": "/host_sys"
},
{
"name": "proc",
"readOnly": true,
"mountPath": "/proc_host"
}
],
"imagePullPolicy": "IfNotPresent"
}
],
"restartPolicy": "Always",
"hostNetwork": true,
"hostPID": true
}
}
}
}
kubectl apply -f node-exporter.yaml -o json
At this point, you can check whether Prometheus is already scraping data.
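If the Ingress host test-prometheus.bsg.com does not resolve yet, one way is to port-forward the Service and inspect the targets page; a sketch:
kubectl -n monitor port-forward svc/prometheus 9090:9090
Then open http://localhost:9090/targets; every target should be UP, and a test query such as up{job="kubernetes-cadvisor"} should return data.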
5. Deploy Grafana
Note: since Grafana's data needs to be stored on NFS, configure the NFS server first, then create a StorageClass that references the NFS parameters.
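If no dynamic NFS provisioner is installed, a statically provisioned PersistentVolume with the same storageClassName also satisfies the PVC below; a minimal sketch, where the NFS server address and export path are placeholders you must adjust:
apiVersion: v1
kind: PersistentVolume
metadata:
  name: grafana-pv
spec:
  capacity:
    storage: 40Gi
  accessModes:
  - ReadWriteMany
  persistentVolumeReclaimPolicy: Retain
  storageClassName: monitor-store
  nfs:
    server: 192.168.11.236   # placeholder NFS server address
    path: /data/nfs/grafana  # placeholder NFS export path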
grafana.yaml
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: grafana-pvc
namespace: monitor
spec:
accessModes:
- ReadWriteMany
resources:
requests:
storage: "40Gi"
volumeName:
storageClassName: monitor-store
---
apiVersion: apps/v1
kind: Deployment
metadata:
labels:
name: grafana-server
name: grafana-server
namespace: monitor
spec:
replicas: 1
selector:
matchLabels:
app: grafana-server
template:
metadata:
labels:
app: grafana-server
spec:
serviceAccountName: prometheus
containers:
- name: grafana
image: grafana/grafana:7.3.4
ports:
- containerPort: 3000
protocol: TCP
volumeMounts:
- mountPath: "/var/lib/grafana"
readOnly: false
name: grafana-pvc
#env:
#- name: GF_INSTALL_PLUGINS
# value: "grafana-kubernetes-app"
imagePullSecrets:
- name: authllzg
volumes:
- name: grafana-pvc
persistentVolumeClaim:
claimName: grafana-pvc
---
apiVersion: v1
kind: Service
metadata:
name: grafana-server
namespace: monitor
spec:
selector:
app: grafana-server
ports:
- protocol: TCP
port: 3000
name: grafana
---
apiVersion: extensions/v1beta1
kind: Ingress
metadata:
name: grafana-server
namespace: monitor
#annotations:
# kubernetes.io/ingress.class: traefik
spec:
rules:
- host: test-grafana.bsg.com
http:
paths:
- path: /
backend:
serviceName: grafana-server
servicePort: grafana
6. Finally, configure Grafana
Add the Prometheus data source
Import dashboards
Two recommended dashboard IDs:
13105 8919
Reference link:
https://sre.ink/kube-state-me…
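The data source can also be created through the Grafana HTTP API instead of the UI; a sketch that assumes the default admin credentials and the in-cluster Prometheus Service address:
curl -s -u admin:admin -H 'Content-Type: application/json' -X POST http://test-grafana.bsg.com/api/datasources -d '{"name":"Prometheus","type":"prometheus","url":"http://prometheus.monitor.svc:9090","access":"proxy"}'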
II. Server monitoring and alerting
1. docker-compose
docker-compose.yml
version: '3.1'
services:
grafana:
image: grafana/grafana:6.7.4
restart: on-failure
container_name: grafana
environment:
- GF_SERVER_ROOT_URL=http://192.168.11.229:3000
volumes:
- ./grafana/data:/var/lib/grafana:rw
ports:
- 3000:3000
user: "root"
prometheus:
image: prom/prometheus:v2.22.0
restart: on-failure
container_name: prometheus
volumes:
- ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
- ./prometheus/alert-rules.yml:/etc/prometheus/alert-rules.yml
- ./prometheus/data:/prometheus:rw
ports:
- 9090:9090
user: "root"
depends_on:
- alertmanager
alertmanager:
image: prom/alertmanager:latest
restart: on-failure
container_name: alertmanager
volumes:
- ./alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml
ports:
- 9093:9093
- 9094:9094
depends_on:
- dingtalk
dingtalk:
image: timonwong/prometheus-webhook-dingtalk:latest
restart: on-failure
container_name: dingtalk
volumes:
- ./alertmanager/config.yml:/etc/prometheus-webhook-dingtalk/config.yml
ports:
- 8060:8060
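With the compose file and the configuration files below in place, the stack can be started and checked as follows (the data directories match the volume mounts above):
mkdir -p grafana/data prometheus/data
docker-compose up -d
docker-compose ps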
2. Alertmanager
alertmanager.yml
global:
resolve_timeout: 5m
smtp_smarthost: 'smtp.exmail.qq.com:465' # SMTP relay; with SSL the port is usually 465
smtp_from: 'yc@bsg.com' # sender address
smtp_auth_username: 'yc@bsg.com' # SMTP username
smtp_auth_password: 'password' # mailbox password or authorization code
smtp_require_tls: false
route:
receiver: 'default'
group_wait: 10s
group_interval: 1m
repeat_interval: 1h
group_by: ['alertname']
inhibit_rules:
- source_match:
severity: 'critical'
target_match:
severity: 'warning'
equal: ['alertname', 'instance']
receivers:
- name: 'default'
email_configs:
- to: 'yc@bsg.com'
send_resolved: true
webhook_configs:
- url: 'http://192.168.11.229:8060/dingtalk/webhook/send'
send_resolved: true
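The prom/alertmanager image ships with amtool, so the routing and receiver configuration can be validated before restarting the container; for example, using the container name from the compose file:
docker exec alertmanager amtool check-config /etc/alertmanager/alertmanager.yml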
config.yml
targets:
webhook:
url: https://oapi.dingtalk.com/robot/send?access_token=xxxx # replace with your DingTalk robot webhook
mention:
all: true
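To confirm the Alertmanager to DingTalk chain works end to end, you can push a synthetic alert directly into Alertmanager; a sketch using the v2 API (the label values are arbitrary test data):
curl -XPOST -H 'Content-Type: application/json' http://192.168.11.229:9093/api/v2/alerts -d '[{"labels":{"alertname":"TestAlert","severity":"warning","instance":"manual-test"}}]'
Within group_wait plus a few seconds, both the email receiver and the DingTalk robot should receive the notification.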
3. Grafana
The data directory is mounted from ./grafana/data, as declared in docker-compose.yml above.
4. Prometheus
alert-rules.yml
groups:
- name: node-alert
rules:
- alert: NodeDown
expr: up{job="node"} == 0
for: 5m
labels:
severity: critical
instance: "{{$labels.instance}}"
annotations:
summary: "Instance 节点曾经宕机 5 分钟"
description: "instance: {{$labels.instance}} down"
value: "{{$value}}"
- alert: NodeCpuHigh
expr: (1 - avg by (instance) (irate(node_cpu_seconds_total{job="node",mode="idle"}[5m]))) * 100 > 80
for: 5m
labels:
severity: warning
instance: "{{$labels.instance}}"
annotations:
summary: "CPU 使用率超过 80%"
description: "instance: {{$labels.instance}} cpu 使用率过高,(current value is {{ $value}})"
value: "{{$value}}"
- alert: NodeCpuIowaitHigh
expr: avg by (instance) (irate(node_cpu_seconds_total{job="node",mode="iowait"}[5m])) * 100 > 50
for: 5m
labels:
severity: warning
instance: "{{$labels.instance}}"
annotations:
summary: "CPU iowait 使用率超过 50%"
description: "instance: {{$labels.instance}} cpu iowait 使用率过高,(current value is {{ $value}})"
value: "{{$value}}"
- alert: NodeLoad5High
expr: node_load5 > (count by (instance) (node_cpu_seconds_total{job="node",mode='system'})) * 1.2
for: 5m
labels:
severity: warning
instance: "{{$labels.instance}}"
annotations:
summary: "Load(5m) 过高,超出 cpu 核数 1.2 倍"
description: "instance: {{$labels.instance}} load(5m) 过高,current value is {{$value}})"
value: "{{$value}}"
- alert: NodeMemoryHigh
expr: (1 - node_memory_MemAvailable_bytes{job="node"} / node_memory_MemTotal_bytes{job="node"}) * 100 > 90
for: 5m
labels:
severity: warning
instance: "{{$labels.instance}}"
annotations:
summary: "Memory 使用率超过 90%"
description: "instance: {{$labels.instance}} memory 使用率过高,current value is {{$value}})"
value: "{{$value}}"
- alert: NodeDiskRootHigh
expr: max((1 - node_filesystem_avail_bytes{job="node",fstype=~"ext.?|xfs"} / node_filesystem_size_bytes{job="node",fstype=~"ext.?|xfs"}) * 100)by(instance) > 85
for: 5m
labels:
severity: warning
instance: "{{$labels.instance}}"
annotations:
summary: "Disk(/ 分区) 使用率超过 85%"
description: "instance: {{$labels.instance}} disk(/ 分区) 使用率过高,(current value is {{ $value}})"
value: "{{$value}}"
- alert: NodeDiskBootHigh
expr: (1 - node_filesystem_avail_bytes{job="node",fstype=~"ext.*|xfs",mountpoint ="/boot"} / node_filesystem_size_bytes{job="node",fstype=~"ext.*|xfs",mountpoint ="/boot"}) * 100 > 80
for: 5m
labels:
severity: warning
instance: "{{$labels.instance}}"
annotations:
summary: "Disk(/boot 分区) 使用率超过 80%"
description: "instance: {{$labels.instance}} disk(/boot 分区) 使用率过高,(current value is {{ $value}})"
value: "{{$value}}"
- alert: NodeDiskReadHigh
expr: irate(node_disk_read_bytes_total{job="node"}[5m]) > 20 * (1024 ^ 2)
for: 5m
labels:
severity: warning
instance: "{{$labels.instance}}"
annotations:
summary: "Disk 读取字节数 速率超过 20 MB/s"
description: "instance: {{$labels.instance}} disk 读取字节数 速率过高,(current value is {{ $value}})"
value: "{{$value}}"
- alert: NodeDiskWriteHigh
expr: irate(node_disk_written_bytes_total{job="node"}[5m]) > 20 * (1024 ^ 2)
for: 5m
labels:
severity: warning
instance: "{{$labels.instance}}"
annotations:
summary: "Disk 写入字节数 速率超过 20 MB/s"
description: "instance: {{$labels.instance}} disk 写入字节数 速率过高,(current value is {{ $value}})"
value: "{{$value}}"
- alert: NodeDiskReadRateCountHigh
expr: irate(node_disk_reads_completed_total{job="node"}[2m]) > 3000
for: 2m
labels:
severity: warning
instance: "{{$labels.instance}}"
annotations:
summary: "Disk iops 每秒读取速率超过 3000 iops"
description: "instance: {{$labels.instance}} disk iops 每秒读取速率过高,(current value is {{ $value}})"
value: "{{$value}}"
- alert: NodeDiskWriteRateCountHigh
expr: irate(node_disk_writes_completed_total{job="node"}[5m]) > 3000
for: 5m
labels:
severity: warning
instance: "{{$labels.instance}}"
annotations:
summary: "Disk iops 每秒写入速率超过 3000 iops"
description: "instance: {{$labels.instance}} disk iops 每秒写入速率过高,(current value is {{ $value}})"
value: "{{$value}}"
- alert: NodeInodeRootUsedPercentHigh
expr: (1 - node_filesystem_files_free{job="node",fstype=~"ext4|xfs",mountpoint="/"} / node_filesystem_files{job="node",fstype=~"ext4|xfs",mountpoint="/"}) * 100 > 80
for: 10m
labels:
severity: warning
instance: "{{$labels.instance}}"
annotations:
summary: "Disk (/ 分区) inode 使用率超过 80%"
description: "instance: {{$labels.instance}} disk(/ 分区) inode 使用率过高,(current value is {{ $value}})"
value: "{{$value}}"
- alert: NodeFilefdAllocatedPercentHigh
expr: node_filefd_allocated{job="node"} / node_filefd_maximum{job="node"} * 100 > 80
for: 10m
labels:
severity: warning
instance: "{{$labels.instance}}"
annotations:
summary: "Filefd 关上百分比 超过 80%"
description: "instance: {{$labels.instance}} filefd 关上百分比过高,(current value is {{ $value}})"
value: "{{$value}}"
- alert: NodeNetworkNetinBitRateHigh
expr: avg by (instance) (irate(node_network_receive_bytes_total{device=~"eth0|eth1|ens160|ens192|enp3s0"}[1m]) * 8) > 10 * (1024 ^ 2) * 8
for: 3m
labels:
severity: warning
instance: "{{$labels.instance}}"
annotations:
summary: "Network 接管比特数 速率超过 10MB/s"
description: "instance: {{$labels.instance}} network 接管比特数 速率过高,(current value is {{ $value}})"
value: "{{$value}}"
- alert: NodeNetworkNetoutBitRateHigh
expr: avg by (instance) (irate(node_network_transmit_bytes_total{device=~"eth0|eth1|ens160|ens192|enp3s0"}[1m]) * 8) > 10 * (1024 ^ 2) * 8
for: 3m
labels:
severity: warning
instance: "{{$labels.instance}}"
annotations:
summary: "Network 发送比特数 速率超过 10MB/s"
description: "instance: {{$labels.instance}} network 发送比特数 速率过高,(current value is {{ $value}})"
value: "{{$value}}"
- alert: NodeNetworkNetinPacketErrorRateHigh
expr: avg by (instance) (irate(node_network_receive_errs_total{device=~"eth0|eth1|ens160|ens192|enp3s0"}[1m])) > 15
for: 3m
labels:
severity: warning
instance: "{{$labels.instance}}"
annotations:
summary: "Network 接管谬误包 速率超过 15 个 / 秒"
description: "instance: {{$labels.instance}} 接管谬误包 速率过高,(current value is {{ $value}})"
value: "{{$value}}"
- alert: NodeNetworkNetoutPacketErrorRateHigh
expr: avg by (instance) (irate(node_network_transmit_errs_total{device=~"eth0|eth1|ens160|ens192|enp3s0"}[1m])) > 15
for: 3m
labels:
severity: warning
instance: "{{$labels.instance}}"
annotations:
summary: "Network 发送谬误包 速率超过 15 个 / 秒"
description: "instance: {{$labels.instance}} 发送谬误包 速率过高,(current value is {{ $value}})"
value: "{{$value}}"
- alert: NodeProcessBlockedHigh
expr: node_procs_blocked{job="node"} > 10
for: 10m
labels:
severity: warning
instance: "{{$labels.instance}}"
annotations:
summary: "Process 以后被阻塞的工作的数量超过 10 个"
description: "instance: {{$labels.instance}} 以后被阻塞的工作的数量过多,(current value is {{ $value}})"
value: "{{$value}}"
- alert: NodeTimeOffsetHigh
expr: abs(node_timex_offset_seconds{job="node"}) > 3 * 60
for: 2m
labels:
severity: info
instance: "{{$labels.instance}}"
annotations:
summary: "Time 节点的工夫偏差超过 3m"
description: "instance: {{$labels.instance}} 工夫偏差过大,(current value is {{ $value}})"
value: "{{$value}}"
prometheus.yml
# my global config
global:
scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
# scrape_timeout is set to the global default (10s).
# Alertmanager configuration
alerting:
alertmanagers:
- static_configs:
- targets:
- alertmanager:9093
# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
# - "first_rules.yml"
# - "second_rules.yml"
- "*rules.yml"
# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
# The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
- job_name: 'prometheus'
# metrics_path defaults to '/metrics'
# scheme defaults to 'http'.
static_configs:
- targets: ['localhost:9090']
# add-2020/11/12
- job_name: 'node'
static_configs:
- targets: ['192.168.11.235:9100'] #devops01
- targets: ['192.168.11.237:9100'] #devops02
- targets: ['192.168.11.236:9100'] #samba
- targets: ['192.168.11.219:9100'] #middle01
- targets: ['192.168.11.242:9100'] #k8s-master1
- targets: ['192.168.11.212:9100'] #k8s-worker1
- targets: ['192.168.11.213:9100'] #k8s-worker2
- targets: ['192.168.11.214:9100'] #k8s-worker3
- targets: ['192.168.11.223:9100'] #k8s-worker4
- job_name: 'alertmanager'
static_configs:
- targets: ['192.168.11.229:9093']
- job_name: 'kube-state-metrics'
static_configs:
- targets: ['192.168.11.242:30808']
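Before restarting Prometheus, both the main configuration and the rule file can be validated with promtool, which is bundled in the prom/prometheus image; for example, using the container name from the compose file:
docker exec prometheus promtool check config /etc/prometheus/prometheus.yml
docker exec prometheus promtool check rules /etc/prometheus/alert-rules.yml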
This completes the server monitoring and alerting setup.