Note: prometheus, alertmanager, grafana, dingtalk and node_exporter are all deployed on k8s.
k8s version: v1.20.15
All the applications being monitored are started with docker, and all of them are running normally.
1. YAML preparation
alertmanager-all-in-one.yaml
apiVersion: v1
data:
  config.yml: |
    global:
      resolve_timeout: 30s
    route:
      group_by: ['docker', 'node', 'prometheus', 'instance']
      group_interval: 30s
      group_wait: 5m
      repeat_interval: 1h
      receiver: webhook
      routes:
      - match:
          severity: 'Critical'
        receiver: 'webhook'
      - match_re:
          severity: ^(Warning|Disaster)$
        receiver: 'webhook'
    receivers:
    - name: 'webhook'
      webhook_configs:
      - url: http://dingtalk.prom-test.svc.cluster.local:8060/dingtalk/webhook/send
        send_resolved: true
kind: ConfigMap
metadata:
  name: alertmanager
  namespace: prom-test
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: alertmanager
  namespace: prom-test
  labels:
    app: alertmanager
spec:
  selector:
    matchLabels:
      app: alertmanager
  template:
    metadata:
      labels:
        app: alertmanager
    spec:
      nodeName: node-5
      volumes:
      - name: config
        configMap:
          name: alertmanager
      containers:
      - name: alertmanager
        image: prom/alertmanager
        imagePullPolicy: IfNotPresent
        args:
        - "--config.file=/etc/alertmanager/config.yml"
        - "--log.level=debug"
        ports:
        - containerPort: 9093
          name: http
        volumeMounts:
        - mountPath: "/etc/alertmanager"
          name: config
        resources:
          requests:
            cpu: 100m
            memory: 256Mi
          limits:
            cpu: 2
            memory: 2Gi
---
apiVersion: v1
kind: Service
metadata:
  name: alertmanager
  namespace: prom-test
spec:
  type: NodePort
  ports:
  - name: web
    port: 9093
    targetPort: http
    nodePort: 30333
  selector:
    app: alertmanager
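Before applying this manifest you can sanity-check the Alertmanager routing config offline. A minimal sketch, assuming the config.yml block above is saved to a local file named config.yml and that amtool (shipped in the Alertmanager release tarball) is available:

# validate the Alertmanager configuration; prints a summary of routes and receivers on success
amtool check-config config.yml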
prometheus-all-in-one.yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-config
  namespace: prom-test
data:
  prometheus.yml: |
    global:
      scrape_interval: 15s
      evaluation_interval: 15s
    # alertmanager
    alerting:
      alertmanagers:
      - static_configs:
        - targets:
          - alertmanager:9093
    # alert rule files, path relative to prometheus.yml
    rule_files:
    - "*-rule.yml"
    scrape_configs:
    - job_name: 'prometheus'
      static_configs:
      - targets: ['localhost:9090']
    - job_name: 'coredns'
      static_configs:
      - targets: ['10.96.0.10:9153']
    - job_name: 'kubernetes-apiserver'
      static_configs:
      - targets: ['10.96.0.1']
      scheme: https
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
        insecure_skip_verify: true
      bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
    - job_name: 'kubernetes-sd-node-exporter'
      kubernetes_sd_configs:
      - role: node
      relabel_configs:
      - source_labels: [__address__]
        regex: '(.*):10250'
        replacement: '${1}:9100'
        target_label: __address__
        action: replace
      - source_labels: [__address__]
        regex: '(.*):(9.*)'
        replacement: '$1'
        target_label: ip
        action: replace
    - job_name: 'kubernetes-sd-cadvisor'
      kubernetes_sd_configs:
      - role: node
      scheme: https
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
        insecure_skip_verify: true
      bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
      relabel_configs:
      - target_label: __address__
        replacement: 10.96.0.1
      - source_labels: [__meta_kubernetes_node_name]
        regex: (.+)
        target_label: __metrics_path__
        replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
    - job_name: "huaweicloud-linux"
      static_configs:
      - targets: ["xxx.xxx.xx.xx:9100","xxx.xxx.xx.xx:9100"]
    - job_name: "icmp_ping"
      metrics_path: /probe
      params:
        module: [icmp]  # use the icmp module
      static_configs:
      - targets: ["xxx.xxx.xx.xx"]
      relabel_configs:
      - source_labels: [__address__]
        regex: (.*)(:80)?
        target_label: __param_target
        replacement: ${1}
      - source_labels: [__param_target]
        target_label: instance
      - source_labels: [__param_target]
        regex: (.*)
        target_label: ping
        replacement: ${1}
      - source_labels: []
        regex: .*
        target_label: __address__
        replacement: xxx.xxx.xx.xx:9115
    - job_name: "http_get_status"
      metrics_path: /probe
      params:
        module: [http_2xx]  # Look for a HTTP 200 response.
      static_configs:
      - targets:
        - "https://xxx.xxx.com.cn:{port}"
      relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: "xxx.xxx.x.xx:9115"  # points to the actual blackbox exporter
      - target_label: region
        replacement: "xxx.xxx.x.xx:9115"
    - job_name: 'tcp_port_status'
      metrics_path: /probe
      params:
        module: [tcp_connect]
      static_configs:
      - targets: ["https://xxx.xxx.com.cn:8090","xxx.xxx.xx.xx:{port}","xxx.xxx.xx.xx:{port}"]
        labels:
          instance: 'port_status'
          group: 'tcp'
      relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: xxx.xxx.xx.xx:9115
    - job_name: "nacos-server"
      metrics_path: '/nacos/actuator/prometheus'
      static_configs:
      - targets: ["xxx.xxx.xx.xx:8848"]
    - job_name: 'Rabbitmq'
      scrape_interval: 5s
      static_configs:
      - targets:
        - xxx.xxx.xx.xx:9419
        labels:
          instance: RabbitMQ-xxx.xxx.xx.xx
    - job_name: 'redis'
      static_configs:
      - targets: ['xxx.xxx.xx.xx:9121']
    - job_name: 'mysql-service'
      static_configs:
      - targets: ["xxx.xxx.xx.xx:9104"]
  node-rule.yml: |
    groups:
    - name: node
      rules:
      - alert: CPU_High
        expr: floor((1 - (sum(increase(node_cpu_seconds_total{mode="idle"}[1m])) by (instance) / sum(increase(node_cpu_seconds_total[1m])) by (instance))) * 100) > 1
        for: 3s
        labels:
          status: Critical
        annotations:
          summary: "in prometheus {{ $labels.instance }}: High CPU usage"
          description: "{{ $labels.instance }} of job {{ $labels.job }} CPU usage is {{ $value }}%"
      - alert: Free_High
        expr: floor(100 - (node_memory_MemFree_bytes + node_memory_Cached_bytes + node_memory_Buffers_bytes) / node_memory_MemTotal_bytes * 100) > 0
        for: 3s
        labels:
          status: Critical
        annotations:
          summary: "in prometheus {{ $labels.instance }}: High Free usage"
          description: "{{ $labels.instance }} of job {{ $labels.job }} Free usage is {{ $value }}%"
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: prometheus
  namespace: prom-test
  labels:
    app: prometheus
spec:
  selector:
    matchLabels:
      app: prometheus
  template:
    metadata:
      labels:
        app: prometheus
    spec:
      serviceAccountName: prometheus  # reference the ServiceAccount defined below
      initContainers:
      - name: "change-permission-of-directory"
        image: swr.cn-north-1.myhuaweicloud.com/htsd/busybox:latest
        command: ["/bin/sh"]
        args: ["-c", "chown -R 65534:65534 /prometheus"]
        securityContext:
          privileged: true
        volumeMounts:
        - mountPath: "/etc/prometheus"
          name: config-volume
        - mountPath: "/prometheus"  # mount point inside the container
          name: data
      containers:
      - image: swr.cn-north-1.myhuaweicloud.com/htsd/prometheus:v2.33.2
        name: prometheus
        args:
        - "--config.file=/etc/prometheus/prometheus.yml"
        - "--storage.tsdb.path=/prometheus"  # TSDB data path
        - "--web.enable-lifecycle"  # enable hot reload: POST localhost:9090/-/reload takes effect immediately
        - "--web.console.libraries=/usr/share/prometheus/console_libraries"
        - "--web.console.templates=/usr/share/prometheus/consoles"
        ports:
        - containerPort: 9090
          name: http
        volumeMounts:
        - mountPath: "/etc/prometheus"
          name: config-volume
        - mountPath: "/prometheus"
          name: data
        resources:
          requests:
            cpu: 200m
            memory: 512Mi
          limits:
            cpu: 4
            memory: 4Gi
      volumes:
      - name: data
        persistentVolumeClaim:
          claimName: prometheus-pvc
      - configMap:
          name: prometheus-config
        name: config-volume
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: prometheus
  namespace: prom-test
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: prometheus
rules:
- apiGroups:
  - ""
  resources:
  - nodes
  - services
  - endpoints
  - pods
  - nodes/proxy
  verbs:
  - get
  - list
  - watch
- apiGroups:
  - "extensions"
  resources:
  - ingresses
  verbs:
  - get
  - list
  - watch
- apiGroups:
  - ""
  resources:
  - configmaps
  - nodes/metrics
  verbs:
  - get
- nonResourceURLs:  # non-resource URLs; this is the endpoint used to scrape metrics
  - /metrics
  verbs:
  - get
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: prometheus
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: prometheus
subjects:
- kind: ServiceAccount
  name: prometheus
  namespace: prom-test
---
apiVersion: v1
kind: Service
metadata:
  name: prometheus
  namespace: prom-test
  labels:
    app: prometheus
spec:
  selector:
    app: prometheus
  type: NodePort
  ports:
  - name: web
    port: 9090
    targetPort: http
    nodePort: 30650
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: prometheus-pvc
  namespace: prom-test
  #annotations:
    #volume.beta.kubernetes.io/storage-class: "nfs"
spec:
  storageClassName: huaweicloud-nfs  # must name the StorageClass this PVC binds to
  accessModes:
  - ReadWriteMany
  resources:
    requests:
      storage: 10Gi  # adjust to your actual needs
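The scrape config and the node-rule.yml alert rules can also be checked before the ConfigMap is applied. A quick sketch, assuming promtool from a Prometheus release is available locally and the two data keys above are saved as prometheus.yml and node-rule.yml in the current directory:

# validate the main config (also loads the rule files matched by rule_files)
promtool check config prometheus.yml
# validate the alerting rules on their own
promtool check rules node-rule.yml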
grafana-all-in-one.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: grafana
  namespace: prom-test
spec:
  selector:
    matchLabels:
      app: grafana
  template:
    metadata:
      labels:
        app: grafana
    spec:
      volumes:
      - name: storage
        persistentVolumeClaim:
          claimName: grafana-pvc
      securityContext:
        runAsUser: 0
      containers:
      - name: grafana
        image: swr.cn-north-1.myhuaweicloud.com/htsd/grafana:8.5.0
        imagePullPolicy: IfNotPresent
        ports:
        - containerPort: 3000
          name: grafana
        env:
        - name: GF_SECURITY_ADMIN_USER
          value: admin
        - name: GF_SECURITY_ADMIN_PASSWORD
          value: admin@123
        readinessProbe:
          failureThreshold: 10
          httpGet:
            path: /api/health
            port: 3000
            scheme: HTTP
          initialDelaySeconds: 60
          periodSeconds: 10
          successThreshold: 1
          timeoutSeconds: 30
#        livenessProbe:
#          failureThreshold: 3
#          httpGet:
#            path: /api/health
#            port: 3000
#            scheme: HTTP
#          periodSeconds: 10
#          successThreshold: 1
#          timeoutSeconds: 1
        resources:
          limits:
            cpu: 500m
            memory: 512Mi
          requests:
            cpu: 150m
            memory: 512Mi
        volumeMounts:
        - mountPath: /var/lib/grafana
          name: storage
---
apiVersion: v1
kind: Service
metadata:
  name: grafana
  namespace: prom-test
spec:
  type: ClusterIP
  ports:
  - port: 3000
  selector:
    app: grafana
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: grafana-pvc
  namespace: prom-test
  #annotations:
    #volume.beta.kubernetes.io/storage-class: "nfs"
spec:
  storageClassName: huaweicloud-nfs
  accessModes:
  - ReadWriteMany
  resources:
    requests:
      storage: 10Gi
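Once Grafana is up, Prometheus can be added as a data source through the UI, or scripted against the Grafana HTTP API. A minimal sketch, assuming the admin credentials from the Deployment above and the in-cluster DNS name of the prometheus service; run it from somewhere that can reach the grafana service (ClusterIP inside the cluster, or the NodePort after patching):

# register Prometheus as the default data source via the Grafana API
curl -s -u admin:admin@123 -H 'Content-Type: application/json' \
  -X POST http://grafana.prom-test.svc.cluster.local:3000/api/datasources \
  -d '{"name":"Prometheus","type":"prometheus","url":"http://prometheus.prom-test.svc.cluster.local:9090","access":"proxy","isDefault":true}'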
dingtalk-all-in-one.yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: dingtalk-config
  namespace: prom-test
data:
  config.yml: |-
    templates:
    - /etc/prometheus-webhook-dingtalk/dingding.tmpl
    targets:
      webhook:
        # the access_token of the robot created in DingTalk; change it to your own
        url: https://oapi.dingtalk.com/robot/send?access_token=xxxxxxxxxxxxxxxxxxxxxxxxxxxx
        mention:
          all: true
  dingding.tmpl: |-
    {{ define "dingtalk.to.message" }}
    {{- if gt (len .Alerts.Firing) 0 -}}
    {{- range $index, $alert := .Alerts -}}
    ========= **监控告警** =========
    **告警集群:** k8s
    **告警类型:** {{ $alert.Labels.alertname }}
    **告警级别:** {{ $alert.Labels.severity }}
    **告警状态:** {{ .Status }}
    **故障主机:** {{ $alert.Labels.instance }} {{ $alert.Labels.device }}
    **告警主题:** {{ .Annotations.summary }}
    **告警详情:** {{ $alert.Annotations.message }}{{ $alert.Annotations.description}}
    **主机标签:** {{ range .Labels.SortedPairs }} </br> [{{ .Name }}: {{ .Value | markdown | html }} ]
    {{- end }} </br>
    **故障时间:** {{ ($alert.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
    ========= = **end** = =========
    {{- end }}
    {{- end }}
    {{- if gt (len .Alerts.Resolved) 0 -}}
    {{- range $index, $alert := .Alerts -}}
    ========= **故障恢复** =========
    **告警集群:** k8s
    **告警主题:** {{ $alert.Annotations.summary }}
    **告警主机:** {{ .Labels.instance }}
    **告警类型:** {{ .Labels.alertname }}
    **告警级别:** {{ $alert.Labels.severity }}
    **告警状态:** {{ .Status }}
    **告警详情:** {{ $alert.Annotations.message }}{{ $alert.Annotations.description}}
    **故障时间:** {{ ($alert.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
    **恢复时间:** {{ ($alert.EndsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
    ========= = **end** = =========
    {{- end }}
    {{- end }}
    {{- end }}
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: dingtalk
  namespace: prom-test
spec:
  replicas: 1
  selector:
    matchLabels:
      app: dingtalk
  template:
    metadata:
      name: dingtalk
      labels:
        app: dingtalk
    spec:
      nodeName: node-5
      containers:
      - name: dingtalk
        image: timonwong/prometheus-webhook-dingtalk
        imagePullPolicy: IfNotPresent
        ports:
        - containerPort: 8060
        volumeMounts:
        - name: config
          mountPath: /etc/prometheus-webhook-dingtalk
      volumes:
      - name: config
        configMap:
          name: dingtalk-config
---
apiVersion: v1
kind: Service
metadata:
  name: dingtalk
  namespace: prom-test
  labels:
    app: dingtalk
  annotations:
    prometheus.io/scrape: 'false'
spec:
  selector:
    app: dingtalk
  ports:
  - name: dingtalk
    port: 8060
    protocol: TCP
    targetPort: 8060
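To confirm the webhook path end to end before wiring up real alerts, you can post a hand-built Alertmanager-style payload to the dingtalk service and watch for a message in the DingTalk group. A rough sketch (the alert content is made up for testing; run it from inside the cluster so the service DNS name resolves):

# send a fake "firing" alert through prometheus-webhook-dingtalk
curl -s -H 'Content-Type: application/json' \
  -X POST http://dingtalk.prom-test.svc.cluster.local:8060/dingtalk/webhook/send \
  -d '{"version":"4","status":"firing","alerts":[{"status":"firing","labels":{"alertname":"TestAlert","severity":"Critical","instance":"test-host"},"annotations":{"summary":"test message from prometheus-webhook-dingtalk"},"startsAt":"2023-09-01T00:00:00Z"}]}'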
node-exporter.yaml
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: node-exporter
  namespace: prom-test
  labels:
    app: node-exporter
spec:
  selector:
    matchLabels:
      app: node-exporter
  template:
    metadata:
      labels:
        app: node-exporter
    spec:
      hostPID: true
      hostIPC: true
      hostNetwork: true
      nodeSelector:
        kubernetes.io/os: linux
      containers:
      - name: node-exporter
        image: swr.cn-north-1.myhuaweicloud.com/htsd/node-exporter:v1.3.0
        # containerd here is configured with state = "/data/containerd/run", so /data/containerd/ is excluded
        # docker's default is /var/lib/docker/
        args:
        - --web.listen-address=$(HOSTIP):9100
        - --path.procfs=/host/proc
        - --path.sysfs=/host/sys
        - --path.rootfs=/host/root
        - --collector.filesystem.mount-points-exclude=^/(dev|proc|sys|data/containerd/|var/lib/docker/.+)($|/)
        - --collector.filesystem.fs-types-exclude=^(autofs|binfmt_misc|cgroup|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|mqueue|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|sysfs|tracefs)$
        ports:
        - containerPort: 9100
        env:
        - name: HOSTIP
          valueFrom:
            fieldRef:
              fieldPath: status.hostIP
        resources:
          requests:
            cpu: 500m
            memory: 200Mi
          limits:
            cpu: 500m
            memory: 200Mi
        securityContext:
          runAsNonRoot: true
          runAsUser: 65534
        volumeMounts:
        - name: proc
          mountPath: /host/proc
        - name: sys
          mountPath: /host/sys
        - name: root
          mountPath: /host/root
          mountPropagation: HostToContainer
          readOnly: true
      # master nodes are tainted, but they also need to be monitored;
      # tolerating all taints lets the DaemonSet run an exporter on the masters too
      tolerations:
      - operator: "Exists"
      volumes:
      - name: proc
        hostPath:
          path: /proc
      - name: dev
        hostPath:
          path: /dev
      - name: sys
        hostPath:
          path: /sys
      - name: root
        hostPath:
          path: /
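Because the DaemonSet uses hostNetwork, each node_exporter listens on its node's own IP. A quick spot-check, assuming <node-ip> is one of your node addresses:

# the exporter should answer on every node, including the masters (thanks to the toleration)
curl -s http://<node-ip>:9100/metrics | grep -m1 node_cpu_seconds_total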
2. Deployment
Note: the prometheus ConfigMap also contains scrape configs for targets outside the k8s cluster (MySQL, Redis and so on), and the webhook url in the dingtalk ConfigMap must be changed to your own. For now, just bring up prometheus, alertmanager, grafana, dingtalk and node_exporter as they are, and adjust the configuration to your own needs afterwards (a config hot-reload sketch follows the deployment commands below).
Put all of the YAML files above into the same directory.
# first create a namespace named prom-test (or change it to your own)
[root@master /home/yaml/prometheus-all]# kubectl create ns prom-test

# apply all manifests
[root@master /home/yaml/prometheus-all]# kubectl apply -f .
......

# check the pods in the namespace; all of them should be Running
[root@master /home/yaml/prometheus-all]# kubectl get po -n prom-test
NAME                            READY   STATUS    RESTARTS   AGE
alertmanager-64977b58cc-bslk7   1/1     Running   0          2d4h
dingtalk-68698c87b5-rnck8       1/1     Running   0          2d4h
grafana-79647d8956-8j9cq        1/1     Running   0          8d
node-exporter-478z9             1/1     Running   0          9d
node-exporter-6nrhl             1/1     Running   0          9d
node-exporter-94v9c             1/1     Running   0          9d
node-exporter-9z55c             1/1     Running   0          9d
node-exporter-bpm95             1/1     Running   0          9d
node-exporter-rpjnl             1/1     Running   0          9d
prometheus-56744b95c7-p8kjx     1/1     Running   0          2d4h

# check the services in the namespace
[root@master /home/yaml/prometheus-all]# kubectl get svc -n prom-test
NAME           TYPE        CLUSTER-IP       EXTERNAL-IP   PORT(S)          AGE
alertmanager   NodePort    10.99.144.191    <none>        9093:30333/TCP   2d4h
dingtalk       ClusterIP   10.101.207.48    <none>        8060/TCP         2d4h
grafana        NodePort    10.102.61.131    <none>        3000:31430/TCP   8d
prometheus     NodePort    10.103.228.196   <none>        9090:30650/TCP   2d4h

# except for dingtalk, any service whose TYPE is not NodePort can be patched with:
kubectl patch svc grafana -n prom-test -p '{"spec": {"type": "NodePort"}}'
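Since the Prometheus container starts with --web.enable-lifecycle, later scrape-config changes do not require restarting the pod: edit the ConfigMap in prometheus-all-in-one.yaml, re-apply it, wait a moment for the kubelet to sync the mounted file, then hit the reload endpoint. A sketch using the NodePort from above:

# re-apply the edited ConfigMap
kubectl apply -f prometheus-all-in-one.yaml
# trigger a hot reload of prometheus.yml and the rule files
curl -X POST http://{k8s-master ip}:30650/-/reload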
Each dashboard is reached at {k8s-master ip}:{port}, where {port} is the corresponding NodePort shown above.
3. Exporter deployment
blackbox_exporter
# prepare the package
curl -LO https://github.com/prometheus/blackbox_exporter/releases/download/v0.22.0/blackbox_exporter-0.22.0.linux-amd64.tar.gz
tar xf blackbox_exporter-0.22.0.linux-amd64.tar.gz -C /usr/local/
ln -sv /usr/local/blackbox_exporter-0.22.0.linux-amd64 /usr/local/blackbox_exporter

# start it in the background
cd /usr/local/blackbox_exporter
./blackbox_exporter &

# check that metrics can be scraped
[root@master /usr/local/blackbox_exporter-0.22.0.linux-amd64]# curl localhost:9115/metrics
# HELP blackbox_exporter_build_info A metric with a constant '1' value labeled by version, revision, branch, and goversion from which blackbox_exporter was built.
# TYPE blackbox_exporter_build_info gauge
blackbox_exporter_build_info{branch="HEAD",goversion="go1.18.5",revision="0bbd65d1264722f7afb87a72ec4128b9214e5840",version="0.22.0"} 1
# HELP blackbox_exporter_config_last_reload_success_timestamp_seconds Timestamp of the last successful configuration reload.
# TYPE blackbox_exporter_config_last_reload_success_timestamp_seconds gauge
blackbox_exporter_config_last_reload_success_timestamp_seconds 1.6945018367674305e+09
# HELP blackbox_exporter_config_last_reload_successful Blackbox exporter config loaded successfully.
# TYPE blackbox_exporter_config_last_reload_successful gauge
blackbox_exporter_config_last_reload_successful 1
# HELP blackbox_module_unknown_total Count of unknown modules requested by probes
......

# now the various probes can be configured in prometheus.yml
# ICMP probe: monitor host liveness
- job_name: "icmp_ping"
  metrics_path: /probe
  params:
    module: [icmp]  # use the icmp module
  static_configs:
  - targets: ["xxx.xxx.xx.xx"]
  relabel_configs:
  - source_labels: [__address__]
    regex: (.*)(:80)?
    target_label: __param_target
    replacement: ${1}
  - source_labels: [__param_target]
    target_label: instance
  - source_labels: [__param_target]
    regex: (.*)
    target_label: ping
    replacement: ${1}
  - source_labels: []
    regex: .*
    target_label: __address__
    replacement: xxx.xxx.xx.xx:9115

# HTTP probe
- job_name: "http_get_status"
  metrics_path: /probe
  params:
    module: [http_2xx]  # Look for a HTTP 200 response.
  static_configs:
  - targets:
    - "https://xxx.xxx.com.cn:8090"
  relabel_configs:
  - source_labels: [__address__]
    target_label: __param_target
  - source_labels: [__param_target]
    target_label: instance
  - target_label: __address__
    replacement: "xxx.xxx.xx.xx:9115"  # points to the actual blackbox exporter
  - target_label: region
    replacement: "xxx.xxx.xx.xx:9115"

# TCP port probe
- job_name: 'tcp_port_status'
  metrics_path: /probe
  params:
    module: [tcp_connect]
  static_configs:
  - targets: ["https://xxx.xxx.com.cn:8090","xxx.xxx.xx.xx:20096","xxx.xxx.xx.xx:18836"]
    labels:
      instance: 'port_status'
      group: 'tcp'
  relabel_configs:
  - source_labels: [__address__]
    target_label: __param_target
  - source_labels: [__param_target]
    target_label: instance
  - target_label: __address__
    replacement: xxx.xxx.xx.xx:9115
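Starting the binary with & means it dies with the shell and will not survive a reboot; a systemd unit is usually more robust. A minimal sketch (the unit name and paths are my own choice, adjust to your layout):

cat > /etc/systemd/system/blackbox_exporter.service <<'EOF'
[Unit]
Description=Prometheus Blackbox Exporter
After=network.target

[Service]
ExecStart=/usr/local/blackbox_exporter/blackbox_exporter --config.file=/usr/local/blackbox_exporter/blackbox.yml
Restart=on-failure

[Install]
WantedBy=multi-user.target
EOF
systemctl daemon-reload && systemctl enable --now blackbox_exporter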
mysqld_exporter
mysqld_exporter official download page
# download the package yourself, then extract it
tar -zvxf mysqld_exporter-0.14.0.linux-amd64.tar.gz
cd mysqld_exporter-0.14.0.linux-amd64

# configure my.cnf
vim my.cnf
[client]
host=127.0.0.1
port=3306
user=exporter
password=xxxxxxxxxxxxxx

# grant privileges in the database (the password must match my.cnf)
CREATE USER 'exporter'@'%' IDENTIFIED BY 'xxxxxxxxxxxxxx' WITH MAX_USER_CONNECTIONS 3;
GRANT PROCESS, REPLICATION CLIENT, SELECT ON *.* TO 'exporter'@'%';
flush privileges;

# start it
./mysqld_exporter --config.my-cnf=my.cnf &

# test whether metrics are collected
[root@data-server ~]# curl localhost:9104/metrics
# HELP go_gc_duration_seconds A summary of the GC invocation durations.
# TYPE go_gc_duration_seconds summary
go_gc_duration_seconds{quantile="0"} 4.0662e-05
go_gc_duration_seconds{quantile="0.25"} 4.0662e-05
go_gc_duration_seconds{quantile="0.5"} 4.0662e-05
go_gc_duration_seconds{quantile="0.75"} 4.0662e-05
go_gc_duration_seconds{quantile="1"} 4.0662e-05
go_gc_duration_seconds_sum 4.0662e-05
go_gc_duration_seconds_count 1
# HELP go_goroutines Number of goroutines that currently exist.
# TYPE go_goroutines gauge
go_goroutines 8
# HELP go_info Information about the Go environment.
# TYPE go_info gauge
......

# prometheus.yml scrape config
- job_name: 'mysql-service'
  static_configs:
  - targets: ["xxx.xxx.xx.xx:9104"]
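A quick way to confirm the exporter can actually log in to MySQL (and not just serve Go runtime metrics) is the mysql_up gauge, which mysqld_exporter sets to 1 when the connection defined in my.cnf works:

curl -s localhost:9104/metrics | grep '^mysql_up'
# mysql_up 1  -> credentials and connection OK
# mysql_up 0  -> re-check my.cnf and the grants above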
redis_exporter
File downloaded from the official GitHub releases: redis_exporter-v1.44.0.linux-amd64.tar.gz
# extract the package
tar -zxvf redis_exporter-v1.44.0.linux-amd64.tar.gz

# start the exporter
./redis_exporter -redis.addr 127.0.0.1:6379 -redis.password xxxxxxxxxxx -web.listen-address 127.0.0.1:9121

# check whether metrics are collected
curl localhost:9121/metrics
# HELP go_gc_duration_seconds A summary of the pause duration of garbage collection cycles.
# TYPE go_gc_duration_seconds summary
go_gc_duration_seconds{quantile="0"} 9.382e-06
go_gc_duration_seconds{quantile="0.25"} 2.6133e-05
go_gc_duration_seconds{quantile="0.5"} 3.2812e-05
go_gc_duration_seconds{quantile="0.75"} 6.6195e-05
go_gc_duration_seconds{quantile="1"} 0.000299789
go_gc_duration_seconds_sum 0.195450594
go_gc_duration_seconds_count 4333
# HELP go_goroutines Number of goroutines that currently exist.
# TYPE go_goroutines gauge
go_goroutines 8
......

# prometheus.yml scrape config
- job_name: 'redis'
  static_configs:
  - targets: ['192.168.1.213:9121']
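Two things worth checking here: the redis_up gauge tells you whether the address and password are correct, and -web.listen-address 127.0.0.1:9121 only accepts local connections, so if Prometheus scrapes this host from elsewhere (as the 192.168.1.213:9121 target suggests) the exporter should listen on 0.0.0.0:9121 instead. A quick check:

curl -s localhost:9121/metrics | grep '^redis_up'
# redis_up 1 means the exporter can reach and authenticate against Redis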
rabbitmq_exporter
Download the package yourself: rabbitmq_exporter_1.0.0-RC19_linux_amd64.tar.gz
# extract
tar -zxvf rabbitmq_exporter_1.0.0-RC19_linux_amd64.tar.gz
cd rabbitmq_exporter

# edit the config file; set rabbit_url, rabbit_user and rabbit_pass to your own RabbitMQ
# management address, user and password (JSON does not allow inline comments)
vim config.json
{
    "rabbit_url": "http://127.0.0.1:15672",
    "rabbit_user": "admin",
    "rabbit_pass": "xxxxxxxxxxxxx",
    "publish_port": "9419",
    "publish_addr": "",
    "output_format": "TTY",
    "ca_file": "ca.pem",
    "cert_file": "client-cert.pem",
    "key_file": "client-key.pem",
    "insecure_skip_verify": false,
    "exlude_metrics": [],
    "include_queues": ".*",
    "skip_queues": "^$",
    "skip_vhost": "^$",
    "include_vhost": ".*",
    "rabbit_capabilities": "no_sort,bert",
    "enabled_exporters": [
        "exchange",
        "node",
        "overview",
        "queue"
    ],
    "timeout": 30,
    "max_queues": 0
}

# start the exporter
./rabbitmq_exporter -config-file=/usr/local/rabbitmq_exporter/config.json

# check whether metrics are collected
curl localhost:9419/metrics
# HELP go_gc_duration_seconds A summary of the pause duration of garbage collection cycles.
# TYPE go_gc_duration_seconds summary
go_gc_duration_seconds{quantile="0"} 3.6221e-05
go_gc_duration_seconds{quantile="0.25"} 5.6432e-05
go_gc_duration_seconds{quantile="0.5"} 6.2474e-05
go_gc_duration_seconds{quantile="0.75"} 7.4399e-05
go_gc_duration_seconds{quantile="1"} 0.000156892
go_gc_duration_seconds_sum 3.721871902
go_gc_duration_seconds_count 50404
# HELP go_goroutines Number of goroutines that currently exist.
# TYPE go_goroutines gauge
go_goroutines 12
# HELP go_info Information about the Go environment.
# TYPE go_info gauge
go_info{version="go1.18.4"} 1
......

# prometheus.yml scrape config
- job_name: 'Rabbitmq'
  scrape_interval: 5s
  static_configs:
  - targets:
    - xxx.xxx.xx.xx:9419
    labels:
      instance: RabbitMQ-xxx.xxx.xx.xx
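As with the other exporters, the rabbitmq_up gauge is the quickest way to tell whether the credentials in config.json actually work against the management API:

curl -s localhost:9419/metrics | grep '^rabbitmq_up'
# rabbitmq_up 1 -> the last scrape of the RabbitMQ management API succeeded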
Nacos monitoring
This assumes the Nacos environment is already set up.
# configure application.properties to expose metrics data;
# to expose only the prometheus endpoint, list just prometheus; to expose everything, use *
management.endpoints.web.exposure.include=prometheus

# check whether metrics data is exposed
curl localhost:8848/nacos/actuator/prometheus
# HELP process_start_time_seconds Start time of the process since unix epoch.
# TYPE process_start_time_seconds gauge
process_start_time_seconds 1.693794903445E9
# HELP system_load_average_1m The sum of the number of runnable entities queued to available processors and the number of runnable entities running on the available processors averaged over a period of time
# TYPE system_load_average_1m gauge
system_load_average_1m 0.0
# HELP process_files_open_files The open file descriptor count
# TYPE process_files_open_files gauge
process_files_open_files 281.0
......

# prometheus.yml scrape config
- job_name: "nacos-server"
  metrics_path: '/nacos/actuator/prometheus'
  static_configs:
  - targets: ["xxx.xxx.xx.xx:8848"]
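Note that the management.endpoints setting only takes effect after Nacos is restarted. A sketch, assuming a standalone deployment managed by the scripts in nacos/bin (the install path is an assumption, use your own):

cd /usr/local/nacos/bin
sh shutdown.sh
sh startup.sh -m standalone
# then re-check the actuator endpoint
curl -s localhost:8848/nacos/actuator/prometheus | head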
kafka_exporter
This assumes Kafka is already deployed (started with docker here).
kafka_exporter download page
wget https://github.com/danielqsj/kafka_exporter/releases/download/v1.2.0/kafka_exporter-1.2.0.linux-amd64.tar.gz
tar -zxvf kafka_exporter-1.2.0.linux-amd64.tar.gz
cd kafka_exporter-1.2.0.linux-amd64

# start the exporter
./kafka_exporter --kafka.server=xxx.xxx.xx.xx:9092 &

# check whether metrics data can be fetched
curl localhost:9308/metrics
# HELP go_gc_duration_seconds A summary of the GC invocation durations.
# TYPE go_gc_duration_seconds summary
go_gc_duration_seconds{quantile="0"} 0
go_gc_duration_seconds{quantile="0.25"} 0
go_gc_duration_seconds{quantile="0.5"} 0
go_gc_duration_seconds{quantile="0.75"} 0
go_gc_duration_seconds{quantile="1"} 0
go_gc_duration_seconds_sum 0
go_gc_duration_seconds_count 0
# HELP go_goroutines Number of goroutines that currently exist.
# TYPE go_goroutines gauge
go_goroutines 11
# HELP go_memstats_alloc_bytes Number of bytes allocated and still in use.
# TYPE go_memstats_alloc_bytes gauge
go_memstats_alloc_bytes 3.083232e+06
# HELP go_memstats_alloc_bytes_total Total number of bytes allocated, even if freed.
# TYPE go_memstats_alloc_bytes_total counter
go_memstats_alloc_bytes_total 3.100496e+06
......

# prometheus.yml scrape config
- job_name: "kafka"
  static_configs:
  - targets: ['xxx.xxx.xx.xx:9308']
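The kafka_brokers gauge exported by kafka_exporter makes a convenient smoke test: it should equal the number of brokers reachable through the --kafka.server bootstrap address:

curl -s localhost:9308/metrics | grep '^kafka_brokers'
# kafka_brokers 1   (or however many brokers the cluster has)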