Note: Prometheus, Alertmanager, Grafana, the DingTalk webhook and node_exporter are all deployed on Kubernetes.
Kubernetes version: v1.20.15
The application services being monitored are started with Docker and are all running normally.
1. YAML preparation
alertmanager-all-in-one.yaml
apiVersion: v1
data:
config.yml: |
global:
resolve_timeout: 30s
route:
group_by: ['docker', 'node', 'prometheus', 'instance']
group_interval: 30s
group_wait: 5m
repeat_interval: 1h
receiver: webhook
routes:
- match:
severity: 'Critical'
receiver: 'webhook'
- match_re:
severity: ^(Warning|Disaster)$
receiver: 'webhook'
receivers:
- name: 'webhook'
webhook_configs:
- url: http://dingtalk.prom-test.svc.cluster.local:8060/dingtalk/webhook/send
send_resolved: true
kind: ConfigMap
metadata:
name: alertmanager
namespace: prom-test
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: alertmanager
namespace: prom-test
labels:
app: alertmanager
spec:
selector:
matchLabels:
app: alertmanager
template:
metadata:
labels:
app: alertmanager
spec:
nodeName: node-5
volumes:
- name: config
configMap:
name: alertmanager
containers:
- name: alertmanager
image: prom/alertmanager
imagePullPolicy: IfNotPresent
args:
- "--config.file=/etc/alertmanager/config.yml"
- "--log.level=debug"
ports:
- containerPort: 9093
name: http
volumeMounts:
- mountPath: "/etc/alertmanager"
name: config
resources:
requests:
cpu: 100m
memory: 256Mi
limits:
cpu: 2
memory: 2Gi
---
apiVersion: v1
kind: Service
metadata:
name: alertmanager
namespace: prom-test
spec:
type: NodePort
ports:
- name: web
port: 9093
targetPort: http
nodePort: 30333
selector:
app: alertmanager
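Once this manifest has been applied (see section 2), a quick sanity check is possible. The following is only a sketch: it assumes the Deployment and Service names above and that amtool is bundled in the prom/alertmanager image (it normally is).
# Validate the mounted Alertmanager config from inside the pod
kubectl exec -n prom-test deploy/alertmanager -- amtool check-config /etc/alertmanager/config.yml
# Confirm the UI answers on the NodePort defined above (replace <node-ip> with any node's IP)
curl -s http://<node-ip>:30333/-/healthy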
prometheus-all-in-one.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: prometheus-config
namespace: prom-test
data:
prometheus.yml: |
global:
scrape_interval: 15s
evaluation_interval: 15s
# alertmanager
alerting:
alertmanagers:
- static_configs:
- targets:
- alertmanager:9093
# Alerting rule files, path relative to the config file
rule_files:
- "*-rule.yml"
scrape_configs:
- job_name: 'prometheus'
static_configs:
- targets: ['localhost:9090']
- job_name: 'coredns'
static_configs:
- targets: ['10.96.0.10:9153']
- job_name: 'kubernetes-apiserver'
static_configs:
- targets: ['10.96.0.1']
scheme: https
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
- job_name: 'kubernetes-sd-node-exporter'
kubernetes_sd_configs:
- role: node
relabel_configs:
- source_labels: [__address__]
regex: '(.*):10250'
replacement: '${1}:9100'
target_label: __address__
action: replace
- source_labels: [__address__]
regex: '(.*):(9.*)'
replacement: '$1'
target_label: ip
action: replace
- job_name: 'kubernetes-sd-cadvisor'
kubernetes_sd_configs:
- role: node
scheme: https
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
relabel_configs:
- target_label: __address__
replacement: 10.96.0.1
- source_labels: [__meta_kubernetes_node_name]
regex: (.+)
target_label: __metrics_path__
replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
- job_name: "huaweicloud-linux"
static_configs:
- targets: ["xxx.xxx.xx.xx:9100","xxx.xxx.xx.xx:9100"]
- job_name: "icmp_ping"
metrics_path: /probe
params:
module: [icmp] # use the icmp module
static_configs:
- targets: ["xxx.xxx.xx.xx"]
relabel_configs:
- source_labels: [__address__]
regex: (.*)(:80)?
target_label: __param_target
replacement: ${1}
- source_labels: [__param_target]
target_label: instance
- source_labels: [__param_target]
regex: (.*)
target_label: ping
replacement: ${1}
- source_labels: []
regex: .*
target_label: __address__
replacement: xxx.xxx.xx.xx:9115
- job_name: "http_get_status"
metrics_path: /probe
params:
module: [http_2xx] # Look for a HTTP 200 response.
static_configs:
- targets:
- "https://xxx.xxx.com.cn:{port}"
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: "xxx.xxx.x.xx:9115" # 指向理论的 Blackbox exporter.
- target_label: region
replacement: "xxx.xxx.x.xx:9115"
- job_name: 'tcp_port_status'
metrics_path: /probe
params:
module: [tcp_connect]
static_configs:
- targets: ["https://xxx.xxx.com.cn:8090","xxx.xxx.xx.xx:{port}","xxx.xxx.xx.xx:{port}"]
labels:
instance: 'port_status'
group: 'tcp'
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: xxx.xxx.xx.xx:9115
- job_name: "nacos-server"
metrics_path: '/nacos/actuator/prometheus'
static_configs:
- targets: ["xxx.xxx.xx.xx:8848"]
- job_name: 'Rabbitmq'
scrape_interval: 5s
static_configs:
- targets:
- xxx.xxx.xx.xx:9419
labels:
instance: RabbitMQ-xxx.xxx.xx.xx
- job_name: 'redis'
static_configs:
- targets: ['xxx.xxx.xx.xx:9121']
- job_name: 'mysql-service'
static_configs:
- targets: ["xxx.xxx.xx.xx:9104"]
node-rule.yml: |
groups:
- name: node
rules:
- alert: CPU_High
expr: floor((1 - (sum(increase(node_cpu_seconds_total{mode="idle"}[1m])) by (instance) / sum(increase(node_cpu_seconds_total[1m])) by (instance))) * 100) > 1
for: 3s
labels:
status: Critical
annotations:
summary: "in prometheus {{$labels.instance}}: High CPU usage"
description: "{{$labels.instance}} of job {{$labels.job}} CPU usage is {{$value}}%"
- alert: Free_High
expr: floor(100 - (node_memory_MemFree_bytes + node_memory_Cached_bytes + node_memory_Buffers_bytes) / node_memory_MemTotal_bytes * 100) > 0
for: 3s
labels:
status: Critical
annotations:
summary: "in prometheus {{$labels.instance}}: High Free usage"
description: "{{$labels.instance}} of job {{$labels.job}} Free usage is {{$value}}%"
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: prometheus
namespace: prom-test
labels:
app: prometheus
spec:
selector:
matchLabels:
app: prometheus
template:
metadata:
labels:
app: prometheus
spec:
serviceAccountName: prometheus # reference the ServiceAccount created below
initContainers:
- name: "change-permission-of-directory"
image: swr.cn-north-1.myhuaweicloud.com/htsd/busybox:latest
command: ["/bin/sh"]
args: ["-c", "chown -R 65534:65534 /prometheus"]
securityContext:
privileged: true
volumeMounts:
- mountPath: "/etc/prometheus"
name: config-volume
- mountPath: "/prometheus" # 挂载到容器的门路, 挂载点
name: data
containers:
- image: swr.cn-north-1.myhuaweicloud.com/htsd/prometheus:v2.33.2
name: prometheus
args:
- "--config.file=/etc/prometheus/prometheus.yml"
- "--storage.tsdb.path=/prometheus" # 指定 tsdb 数据门路
- "--web.enable-lifecycle" # 反对热更新,间接执行 localhost:9090/-/reload 立刻失效
- "--web.console.libraries=/usr/share/prometheus/console_libraries"
- "--web.console.templates=/usr/share/prometheus/consoles"
ports:
- containerPort: 9090
name: http
volumeMounts:
- mountPath: "/etc/prometheus"
name: config-volume
- mountPath: "/prometheus"
name: data
resources:
requests:
cpu: 200m
memory: 512Mi
limits:
cpu: 4
memory: 4Gi
volumes:
- name: data
persistentVolumeClaim:
claimName: prometheus-pvc
- configMap:
name: prometheus-config
name: config-volume
---
apiVersion: v1
kind: ServiceAccount
metadata:
name: prometheus
namespace: prom-test
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: prometheus
rules:
- apiGroups:
- ""
resources:
- nodes
- services
- endpoints
- pods
- nodes/proxy
verbs:
- get
- list
- watch
- apiGroups:
- "extensions"
resources:
- ingresses
verbs:
- get
- list
- watch
- apiGroups:
- ""
resources:
- configmaps
- nodes/metrics
verbs:
- get
- nonResourceURLs: # non-resource URLs; needed to scrape the /metrics endpoints
- /metrics
verbs:
- get
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: prometheus
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: prometheus
subjects:
- kind: ServiceAccount
name: prometheus
namespace: prom-test
---
apiVersion: v1
kind: Service
metadata:
name: prometheus
namespace: prom-test
labels:
app: prometheus
spec:
selector:
app: prometheus
type: NodePort
ports:
- name: web
port: 9090
targetPort: http
nodePort: 30650
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: prometheus-pvc
namespace: prom-test
#annotations:
#volume.beta.kubernetes.io/storage-class: "nfs"
spec:
storageClassName: huaweicloud-nfs # must reference a StorageClass that actually exists in your cluster
accessModes:
- ReadWriteMany
resources:
requests:
storage: 10Gi # adjust to your actual needs
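Once this manifest is applied (section 2), the rendered config and rule files can be validated and hot-reloaded without restarting the pod. This is a sketch that assumes the Deployment name above and that promtool ships in the Prometheus image (it does in the official image).
# Validate prometheus.yml and the *-rule.yml files from inside the pod
kubectl exec -n prom-test deploy/prometheus -- promtool check config /etc/prometheus/prometheus.yml
# Trigger a hot reload after editing the ConfigMap (enabled by --web.enable-lifecycle)
curl -X POST http://<node-ip>:30650/-/reload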
grafana-all-in-one.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: grafana
namespace: prom-test
spec:
selector:
matchLabels:
app: grafana
template:
metadata:
labels:
app: grafana
spec:
volumes:
- name: storage
persistentVolumeClaim:
claimName: grafana-pvc
securityContext:
runAsUser: 0
containers:
- name: grafana
image: swr.cn-north-1.myhuaweicloud.com/htsd/grafana:8.5.0
imagePullPolicy: IfNotPresent
ports:
- containerPort: 3000
name: grafana
env:
- name: GF_SECURITY_ADMIN_USER
value: admin
- name: GF_SECURITY_ADMIN_PASSWORD
value: admin@123
readinessProbe:
failureThreshold: 10
httpGet:
path: /api/health
port: 3000
scheme: HTTP
initialDelaySeconds: 60
periodSeconds: 10
successThreshold: 1
timeoutSeconds: 30
# livenessProbe:
# failureThreshold: 3
# httpGet:
# path: /api/health
# port: 3000
# scheme: HTTP
# periodSeconds: 10
# successThreshold: 1
# timeoutSeconds: 1
resources:
limits:
cpu: 500m
memory: 512Mi
requests:
cpu: 150m
memory: 512Mi
volumeMounts:
- mountPath: /var/lib/grafana
name: storage
---
apiVersion: v1
kind: Service
metadata:
name: grafana
namespace: prom-test
spec:
type: ClusterIP
ports:
- port: 3000
selector:
app: grafana
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: grafana-pvc
namespace: prom-test
#annotations:
#volume.beta.kubernetes.io/storage-class: "nfs"
spec:
storageClassName: huaweicloud-nfs
accessModes:
- ReadWriteMany
resources:
requests:
storage: 10Gi
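After Grafana is up (section 2), the Prometheus data source can be added through the UI or, as a sketch, via the Grafana HTTP API. The credentials come from the env vars above; the in-cluster Prometheus URL is an assumption based on the Service defined earlier, and <node-ip>/<grafana-nodeport> are placeholders.
# Register Prometheus as a Grafana data source
curl -s -u admin:admin@123 -H 'Content-Type: application/json' \
-X POST http://<node-ip>:<grafana-nodeport>/api/datasources \
-d '{"name":"Prometheus","type":"prometheus","url":"http://prometheus.prom-test.svc.cluster.local:9090","access":"proxy","isDefault":true}'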
dingtalk-all-in-one.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: dingtalk-config
namespace: prom-test
data:
config.yml: |-
templates:
- /etc/prometheus-webhook-dingtalk/dingding.tmpl
targets:
webhook:
# token of the robot created in DingTalk; replace with your own
url: https://oapi.dingtalk.com/robot/send?access_token=xxxxxxxxxxxxxxxxxxxxxxxxxxxx
mention:
all: true
dingding.tmpl: |-
{{define "dingtalk.to.message"}}
{{- if gt (len .Alerts.Firing) 0 -}}
{{- range $index, $alert := .Alerts -}}
========= **Monitoring Alert** =========
**Cluster:** k8s
**Alert type:** {{$alert.Labels.alertname}}
**Severity:** {{$alert.Labels.severity}}
**Status:** {{.Status}}
**Host:** {{$alert.Labels.instance}} {{$alert.Labels.device}}
**Summary:** {{.Annotations.summary}}
**Details:** {{$alert.Annotations.message}}{{$alert.Annotations.description}}
**Labels:** {{range .Labels.SortedPairs}} </br> [{{.Name}}: {{.Value | markdown | html}} ]
{{- end}} </br>
**Started at:** {{($alert.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
========= = **end** = =========
{{- end}}
{{- end}}
{{- if gt (len .Alerts.Resolved) 0 -}}
{{- range $index, $alert := .Alerts -}}
========= **Alert Resolved** =========
**Cluster:** k8s
**Summary:** {{$alert.Annotations.summary}}
**Host:** {{.Labels.instance}}
**Alert type:** {{.Labels.alertname}}
**Severity:** {{$alert.Labels.severity}}
**Status:** {{.Status}}
**Details:** {{$alert.Annotations.message}}{{$alert.Annotations.description}}
**Started at:** {{($alert.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
**Resolved at:** {{($alert.EndsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
========= = **end** = =========
{{- end}}
{{- end}}
{{- end}}
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: dingtalk
namespace: prom-test
spec:
replicas: 1
selector:
matchLabels:
app: dingtalk
template:
metadata:
name: dingtalk
labels:
app: dingtalk
spec:
nodeName: node-5
containers:
- name: dingtalk
image: timonwong/prometheus-webhook-dingtalk
imagePullPolicy: IfNotPresent
ports:
- containerPort: 8060
volumeMounts:
- name: config
mountPath: /etc/prometheus-webhook-dingtalk
volumes:
- name: config
configMap:
name: dingtalk-config
---
apiVersion: v1
kind: Service
metadata:
name: dingtalk
namespace: prom-test
labels:
app: dingtalk
annotations:
prometheus.io/scrape: 'false'
spec:
selector:
app: dingtalk
ports:
- name: dingtalk
port: 8060
protocol: TCP
targetPort: 8060
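To check the DingTalk integration end to end once it is deployed, a minimal hand-crafted Alertmanager-style payload can be posted to the webhook. This is only a sketch with made-up label values; run it from a pod inside the cluster (or through a port-forward), and it requires a valid robot token in the ConfigMap above.
# Send a fake firing alert to the dingtalk webhook
curl -s -H 'Content-Type: application/json' -X POST \
http://dingtalk.prom-test.svc.cluster.local:8060/dingtalk/webhook/send \
-d '{"version":"4","status":"firing","receiver":"webhook","groupLabels":{},"commonLabels":{},"commonAnnotations":{},"externalURL":"","alerts":[{"status":"firing","labels":{"alertname":"TestAlert","severity":"Critical","instance":"test-host"},"annotations":{"summary":"test alert","description":"manual webhook test"},"startsAt":"2023-09-22T00:00:00Z","endsAt":"0001-01-01T00:00:00Z"}]}'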
node-exporter.yaml
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: node-exporter
namespace: prom-test
labels:
app: node-exporter
spec:
selector:
matchLabels:
app: node-exporter
template:
metadata:
labels:
app: node-exporter
spec:
hostPID: true
hostIPC: true
hostNetwork: true
nodeSelector:
kubernetes.io/os: linux
containers:
- name: node-exporter
image: swr.cn-north-1.myhuaweicloud.com/htsd/node-exporter:v1.3.0
# Because this cluster's containerd config sets state = "/data/containerd/run", exclude data/containerd/
# Docker's default is /var/lib/docker/
args:
- --web.listen-address=$(HOSTIP):9100
- --path.procfs=/host/proc
- --path.sysfs=/host/sys
- --path.rootfs=/host/root
- --collector.filesystem.mount-points-exclude=^/(dev|proc|sys|data/containerd/|var/lib/docker/.+)($|/)
- --collector.filesystem.fs-types-exclude=^(autofs|binfmt_misc|cgroup|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|mqueue|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|sysfs|tracefs)$
ports:
- containerPort: 9100
env:
- name: HOSTIP
valueFrom:
fieldRef:
fieldPath: status.hostIP
resources:
requests:
cpu: 500m
memory: 200Mi
limits:
cpu: 500m
memory: 200Mi
securityContext:
runAsNonRoot: true
runAsUser: 65534
volumeMounts:
- name: proc
mountPath: /host/proc
- name: sys
mountPath: /host/sys
- name: root
mountPath: /host/root
mountPropagation: HostToContainer
readOnly: true
# Master nodes carry taints, but they also need to be monitored; tolerating all taints lets the exporter be scheduled on masters too
tolerations:
- operator: "Exists"
volumes:
- name: proc
hostPath:
path: /proc
- name: dev
hostPath:
path: /dev
- name: sys
hostPath:
path: /sys
- name: root
hostPath:
path: /
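Because the DaemonSet uses hostNetwork and binds node_exporter to each node's IP on port 9100, the metrics can be checked directly against any node once it is running (a quick sketch; <node-ip> is a placeholder):
# Verify node_exporter answers on a node
curl -s http://<node-ip>:9100/metrics | grep node_exporter_build_info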
2. Deployment
Note: the Prometheus ConfigMap above also contains scrape jobs for targets outside the Kubernetes cluster (MySQL, Redis, and so on), and the webhook URL in the DingTalk ConfigMap must be replaced with your own. For now we simply deploy Prometheus, Alertmanager, Grafana, DingTalk and node_exporter, and tune the configuration to our own needs afterwards.
Put all of the YAML files above into the same directory.
# First create a namespace called prom-test (or change it to your own)
[root@master /home/yaml/prometheus-all]# kubectl create ns prom-test
# Deploy everything
[root@master /home/yaml/prometheus-all]# kubectl apply -f .
......
# Check the pods in the namespace; they should all be Running
[root@master /home/yaml/prometheus-all]# kubectl get po -n prom-test
NAME READY STATUS RESTARTS AGE
alertmanager-64977b58cc-bslk7 1/1 Running 0 2d4h
dingtalk-68698c87b5-rnck8 1/1 Running 0 2d4h
grafana-79647d8956-8j9cq 1/1 Running 0 8d
node-exporter-478z9 1/1 Running 0 9d
node-exporter-6nrhl 1/1 Running 0 9d
node-exporter-94v9c 1/1 Running 0 9d
node-exporter-9z55c 1/1 Running 0 9d
node-exporter-bpm95 1/1 Running 0 9d
node-exporter-rpjnl 1/1 Running 0 9d
prometheus-56744b95c7-p8kjx 1/1 Running 0 2d4h
# Check the services in the namespace
[root@master /home/yaml/prometheus-all]# kubectl get svc -n prom-test
NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
alertmanager NodePort 10.99.144.191 <none> 9093:30333/TCP 2d4h
dingtalk ClusterIP 10.101.207.48 <none> 8060/TCP 2d4h
grafana NodePort 10.102.61.131 <none> 3000:31430/TCP 8d
prometheus NodePort 10.103.228.196 <none> 9090:30650/TCP 2d4h
# Except for dingtalk, any service whose TYPE is not NodePort can be patched like this:
kubectl patch svc grafana -n prom-test -p '{"spec": {"type":"NodePort"}}'
The dashboards are all reachable at {k8s-master ip}:{NodePort}.
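To find the port part, read the NodePort off the Service (a sketch using grafana as the example):
# Print the NodePort assigned to the grafana service
kubectl get svc grafana -n prom-test -o jsonpath='{.spec.ports[0].nodePort}'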
3. Exporter deployment
blackbox_exporter
# Download and install the package
curl -LO https://github.com/prometheus/blackbox_exporter/releases/download/v0.22.0/blackbox_exporter-0.22.0.linux-amd64.tar.gz
tar xf blackbox_exporter-0.22.0.linux-amd64.tar.gz -C /usr/local/
ln -sv /usr/local/blackbox_exporter-0.22.0.linux-amd64 /usr/local/blackbox_exporter
# Start it in the background
cd /usr/local/blackbox_exporter
./blackbox_exporter &
# Verify that metrics are being collected
[root@master /usr/local/blackbox_exporter-0.22.0.linux-amd64]# curl localhost:9115/metrics
# HELP blackbox_exporter_build_info A metric with a constant '1' value labeled by version, revision, branch, and goversion from which blackbox_exporter was built.
# TYPE blackbox_exporter_build_info gauge
blackbox_exporter_build_info{branch="HEAD",goversion="go1.18.5",revision="0bbd65d1264722f7afb87a72ec4128b9214e5840",version="0.22.0"} 1
# HELP blackbox_exporter_config_last_reload_success_timestamp_seconds Timestamp of the last successful configuration reload.
# TYPE blackbox_exporter_config_last_reload_success_timestamp_seconds gauge
blackbox_exporter_config_last_reload_success_timestamp_seconds 1.6945018367674305e+09
# HELP blackbox_exporter_config_last_reload_successful Blackbox exporter config loaded successfully.
# TYPE blackbox_exporter_config_last_reload_successful gauge
blackbox_exporter_config_last_reload_successful 1
# HELP blackbox_module_unknown_total Count of unknown modules requested by probes
......
# Now we can start configuring the individual probes
# ICMP probe: host liveness
- job_name: "icmp_ping"
metrics_path: /probe
params:
module: [icmp] # use the icmp module
static_configs:
- targets: ["xxx.xxx.xx.xx"]
relabel_configs:
- source_labels: [__address__]
regex: (.*)(:80)?
target_label: __param_target
replacement: ${1}
- source_labels: [__param_target]
target_label: instance
- source_labels: [__param_target]
regex: (.*)
target_label: ping
replacement: ${1}
- source_labels: []
regex: .*
target_label: __address__
replacement: xxx.xxx.xx.xx:9115
# HTTP probe
- job_name: "http_get_status"
metrics_path: /probe
params:
module: [http_2xx] # Look for a HTTP 200 response.
static_configs:
- targets:
- "https://xxx.xxx.com.cn:8090"
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: "xxx.xxx.xx.xx:9115" # 指向理论的 Blackbox exporter.
- target_label: region
replacement: "xxx.xxx.xx.xx:9115"
# TCP port probe
- job_name: 'tcp_port_status'
metrics_path: /probe
params:
module: [tcp_connect]
static_configs:
- targets: ["https://xxx.xxx.com.cn:8090","xxx.xxx.xx.xx:20096","xxx.xxx.xx.xx:18836"]
labels:
instance: 'port_status'
group: 'tcp'
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: xxx.xxx.xx.xx:9115
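Before wiring these jobs into prometheus.yml, each blackbox module can be exercised directly against the exporter. The targets below are only examples, so substitute your own:
# probe_success should be 1 when the target is reachable
curl -s 'http://localhost:9115/probe?module=icmp&target=8.8.8.8' | grep probe_success
curl -s 'http://localhost:9115/probe?module=http_2xx&target=https://www.example.com' | grep probe_success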
mysqld_exporter
mysqld_exporter official download page
# Download the package yourself
# Extract it
tar -zvxf mysqld_exporter-0.14.0.linux-amd64.tar.gz
cd mysqld_exporter-0.14.0.linux-amd64
# Configure my.cnf
vim my.cnf
[client]
host=127.0.0.1
port=3306
user=exporter
password=xxxxxxxxxxxxxx
# Grant privileges in MySQL (the password must match the one in my.cnf)
CREATE USER 'exporter'@'%' IDENTIFIED BY 'xxxxxxxxxxxxxx' WITH MAX_USER_CONNECTIONS 3;
GRANT PROCESS, REPLICATION CLIENT, SELECT ON *.* TO 'exporter'@'%';
flush privileges;
# Start the exporter
./mysqld_exporter --config.my-cnf=my.cnf &
# Verify that metrics are being collected
[root@data-server ~]# curl localhost:9104/metrics
# HELP go_gc_duration_seconds A summary of the GC invocation durations.
# TYPE go_gc_duration_seconds summary
go_gc_duration_seconds{quantile="0"} 4.0662e-05
go_gc_duration_seconds{quantile="0.25"} 4.0662e-05
go_gc_duration_seconds{quantile="0.5"} 4.0662e-05
go_gc_duration_seconds{quantile="0.75"} 4.0662e-05
go_gc_duration_seconds{quantile="1"} 4.0662e-05
go_gc_duration_seconds_sum 4.0662e-05
go_gc_duration_seconds_count 1
# HELP go_goroutines Number of goroutines that currently exist.
# TYPE go_goroutines gauge
go_goroutines 8
# HELP go_info Information about the Go environment.
# TYPE go_info gauge
......
# Prometheus scrape config
- job_name: 'mysql-service'
static_configs:
- targets: ["xxx.xxx.xx.xx:9104"]
redis_exporter
File downloaded from the official GitHub releases: redis_exporter-v1.44.0.linux-amd64.tar.gz
# Extract the package
tar -zxvf redis_exporter-v1.44.0.linux-amd64.tar.gz
# Start the exporter
./redis_exporter -redis.addr 127.0.0.1:6379 -redis.password xxxxxxxxxxx -web.listen-address 127.0.0.1:9121
# Verify that metrics are being collected
curl localhost:9121/metrics
# HELP go_gc_duration_seconds A summary of the pause duration of garbage collection cycles.
# TYPE go_gc_duration_seconds summary
go_gc_duration_seconds{quantile="0"} 9.382e-06
go_gc_duration_seconds{quantile="0.25"} 2.6133e-05
go_gc_duration_seconds{quantile="0.5"} 3.2812e-05
go_gc_duration_seconds{quantile="0.75"} 6.6195e-05
go_gc_duration_seconds{quantile="1"} 0.000299789
go_gc_duration_seconds_sum 0.195450594
go_gc_duration_seconds_count 4333
# HELP go_goroutines Number of goroutines that currently exist.
# TYPE go_goroutines gauge
go_goroutines 8
......
# Prometheus scrape config
- job_name: 'redis'
static_configs:
- targets: ['192.168.1.213:9121']
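As with MySQL, the connection itself can be checked before adding the job (redis_up is a standard redis_exporter metric):
# redis_up should be 1 if the exporter can authenticate against Redis
curl -s localhost:9121/metrics | grep '^redis_up'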
rabbitmq_exporter
Download the package yourself: rabbitmq_exporter_1.0.0-RC19_linux_amd64.tar.gz
# Extract it
tar -zxvf rabbitmq_exporter_1.0.0-RC19_linux_amd64.tar.gz
cd rabbitmq_exporter
# Edit the config file (the # annotations below are explanatory only; JSON does not allow comments, so remove them in the real file)
vim config.json
{
"rabbit_url": "http://127.0.0.1:15672", # your RabbitMQ address
"rabbit_user": "admin", # your RabbitMQ user
"rabbit_pass": "xxxxxxxxxxxxx", # your RabbitMQ password
"publish_port": "9419",
"publish_addr": "",
"output_format": "TTY",
"ca_file": "ca.pem",
"cert_file": "client-cert.pem",
"key_file": "client-key.pem",
"insecure_skip_verify": false,
"exlude_metrics": [],
"include_queues": ".*",
"skip_queues": "^$",
"skip_vhost": "^$",
"include_vhost": ".*",
"rabbit_capabilities": "no_sort,bert",
"enabled_exporters": ["exchange", "node", "overview", "queue"],
"timeout": 30,
"max_queues": 0
}
# Start the exporter
./rabbitmq_exporter -config-file=/usr/local/rabbitmq_exporter/config.json
# Verify that metrics are being collected
curl localhost:9419/metrics
# HELP go_gc_duration_seconds A summary of the pause duration of garbage collection cycles.
# TYPE go_gc_duration_seconds summary
go_gc_duration_seconds{quantile="0"} 3.6221e-05
go_gc_duration_seconds{quantile="0.25"} 5.6432e-05
go_gc_duration_seconds{quantile="0.5"} 6.2474e-05
go_gc_duration_seconds{quantile="0.75"} 7.4399e-05
go_gc_duration_seconds{quantile="1"} 0.000156892
go_gc_duration_seconds_sum 3.721871902
go_gc_duration_seconds_count 50404
# HELP go_goroutines Number of goroutines that currently exist.
# TYPE go_goroutines gauge
go_goroutines 12
# HELP go_info Information about the Go environment.
# TYPE go_info gauge
go_info{version="go1.18.4"} 1
......
# Prometheus scrape config
- job_name: 'Rabbitmq'
scrape_interval: 5s
static_configs:
- targets:
- xxx.xxx.xx.xx:9419
labels:
instance: RabbitMQ-xxx.xxx.xx.xx
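To confirm the exporter can reach the RabbitMQ management API (rabbitmq_up is a standard metric of this exporter):
# rabbitmq_up should be 1 when the management API is reachable with the configured credentials
curl -s localhost:9419/metrics | grep '^rabbitmq_up'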
Nacos monitoring
This assumes the Nacos environment is already up and running.
# In application.properties, expose the metrics endpoint: 'prometheus' exposes only the Prometheus endpoint; set it to '*' to expose everything
management.endpoints.web.exposure.include=prometheus
# Verify that metrics are exposed
curl localhost:8848/nacos/actuator/prometheus
# HELP process_start_time_seconds Start time of the process since unix epoch.
# TYPE process_start_time_seconds gauge
process_start_time_seconds 1.693794903445E9
# HELP system_load_average_1m The sum of the number of runnable entities queued to available processors and the number of runnable entities running on the available processors averaged over a period of time
# TYPE system_load_average_1m gauge
system_load_average_1m 0.0
# HELP process_files_open_files The open file descriptor count
# TYPE process_files_open_files gauge
process_files_open_files 281.0
......
# Prometheus scrape config
- job_name: "nacos-server"
metrics_path: '/nacos/actuator/prometheus'
static_configs:
- targets: ["xxx.xxx.xx.xx:8848"]
kafka_exporter
This assumes Kafka is already deployed (started with Docker here).
kafka_exporter download page
wget https://github.com/danielqsj/kafka_exporter/releases/download/v1.2.0/kafka_exporter-1.2.0.linux-amd64.tar.gz
cd kafka_exporter-1.2.0.linux-amd64
# Start the exporter
./kafka_exporter --kafka.server=xxx.xxx.xx.xx:9092 &
# Verify that metrics can be fetched
curl localhost:9308/metrics
# HELP go_gc_duration_seconds A summary of the GC invocation durations.
# TYPE go_gc_duration_seconds summary
go_gc_duration_seconds{quantile="0"} 0
go_gc_duration_seconds{quantile="0.25"} 0
go_gc_duration_seconds{quantile="0.5"} 0
go_gc_duration_seconds{quantile="0.75"} 0
go_gc_duration_seconds{quantile="1"} 0
go_gc_duration_seconds_sum 0
go_gc_duration_seconds_count 0
# HELP go_goroutines Number of goroutines that currently exist.
# TYPE go_goroutines gauge
go_goroutines 11
# HELP go_memstats_alloc_bytes Number of bytes allocated and still in use.
# TYPE go_memstats_alloc_bytes gauge
go_memstats_alloc_bytes 3.083232e+06
# HELP go_memstats_alloc_bytes_total Total number of bytes allocated, even if freed.
# TYPE go_memstats_alloc_bytes_total counter
go_memstats_alloc_bytes_total 3.100496e+06
......
# prometheus.yml scrape config
- job_name: "kafka"
static_configs:
- targets: ['xxx.xxx.xx.xx:9308']
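After adding any of these scrape jobs to the prometheus-config ConfigMap, apply the manifest again and hot-reload Prometheus. Note that the kubelet needs a short while to sync the updated ConfigMap into the pod before the reload picks it up:
# Apply the updated ConfigMap and reload Prometheus (enabled by --web.enable-lifecycle)
kubectl apply -f prometheus-all-in-one.yaml
curl -X POST http://<node-ip>:30650/-/reload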
4. Miscellaneous
(Screenshots: the Prometheus targets page and the Grafana Prometheus data source configuration.)