共计 13389 个字符,预计需要花费 34 分钟才能阅读完成。
Prometheus
target
[root@vm11 prometheus]# cat prometheus.yml
# Defaults shared by every scrape job and rule group.
global:
  scrape_interval: 60s
  evaluation_interval: 65s
  scrape_timeout: 15s

# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - "localhost:3xxx"

# Alerting rule files to load.
rule_files:
  - "rule.yml"

scrape_configs:
  - job_name: "prometheus"
    static_configs:
      - targets:
          - "192.168.1x4.xx:3xx"
        # Static labels attached to this target; they travel with every alert
        # and are read by the webhook receiver (appservice, city, instancehost).
        labels:
          appservice: "prometheus"
          city: "苏州"
          instancehost: "192.1x.1x4.xx"
rule
[root@vm11 prometheus]# cat rule.yml
# Host-level alerting rules.
#
# NOTE(review): only HostDown defines a `summary` annotation. The webhook
# receiver's "resolved" template renders `summary`, so resolved notifications
# for every other rule will show an empty message unless a summary is added.
groups:
  - name: Hosts.rules
    rules:
      # A scrape of any of the listed jobs failed; fires immediately.
      - alert: HostDown
        expr: up{job=~"node-exporter|prometheus|grafana|alertmanager"} == 0
        for: 0m
        labels:
          severity: critical
        annotations:
          description: "主机:【{{$labels.instance}}】service is down-- Das ist ein Test."
          summary: "主机:【{{$labels.instance}}】service is resolved"

      # 5-minute load average above 10.
      - alert: HostCpuLoadAvage
        expr: sum(node_load5) by (instance) > 10
        for: 1m
        annotations:
          title: "5 分钟内 CPU 负载过高"
          description: "主机:【{{$labels.instance}}】5 分钟内 CPU 负载超过 10 (当前值:{{ $value}})"
        labels:
          severity: 'warning'

      # Non-idle CPU share over the last 5 minutes above 80%.
      - alert: HostCpuUsage
        expr: (1-((sum(increase(node_cpu_seconds_total{mode="idle"}[5m])) by (instance))/ (sum(increase(node_cpu_seconds_total[5m])) by (instance))))*100 > 80
        for: 1m
        annotations:
          title: "CPU 使用率过高"
          description: "主机:【{{$labels.instance}}】5 分钟内 CPU 使用率超过 80% (当前值:{{ $value}})"
        labels:
          severity: 'warning'

      # Memory in use (total minus buffers, cache and free) above 80%.
      - alert: HostMemoryUsage
        expr: (1-((node_memory_Buffers_bytes + node_memory_Cached_bytes + node_memory_MemFree_bytes)/node_memory_MemTotal_bytes))*100 > 80
        for: 1m
        annotations:
          title: "主机内存使用率超过 80%"
          description: "主机:【{{$labels.instance}}】内存使用率超过 80% (当前使用率:{{ $value}}%)"
        labels:
          severity: 'warning'

      # Share of CPU time spent in iowait over the last 5 minutes above 10%.
      - alert: HostIOWait
        expr: ((sum(increase(node_cpu_seconds_total{mode="iowait"}[5m])) by (instance))/(sum(increase(node_cpu_seconds_total[5m])) by (instance)))*100 > 10
        for: 1m
        annotations:
          title: "磁盘负载过高"
          description: "主机:【{{$labels.instance}}】5 分钟内磁盘负载过高 (当前负载值:{{ $value}})"
        labels:
          severity: 'warning'

      # ext4/xfs filesystem (excluding *tmp and *boot mounts) more than 70% full.
      - alert: HostFileSystemUsage
        expr: (1-(node_filesystem_free_bytes{fstype=~"ext4|xfs",mountpoint!~".*tmp|.*boot"}/node_filesystem_size_bytes{fstype=~"ext4|xfs",mountpoint!~".*tmp|.*boot"}))*100 > 70
        for: 1m
        annotations:
          title: "磁盘空间剩余不足"
          description: "主机:【{{$labels.instance}}】{{$labels.mountpoint}} 分区使用率超过 70%, 当前值使用率:{{$value}}%"
        labels:
          severity: 'warning'

      # Swap usage above 80%.
      - alert: HostSwapIsFillingUp
        expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80
        for: 2m
        labels:
          severity: 'warning'
        annotations:
          title: "主机 swap 分区不足"
          description: "主机:【{{$labels.instance}}】swap 分区应用超过 (>80%), 当前值使用率: {{$value}}%"

      # More than 1000 established TCP connections.
      - alert: HostNetworkConnection-ESTABLISHED
        expr: sum(node_netstat_Tcp_CurrEstab) by (instance) > 1000
        for: 5m
        labels:
          severity: 'warning'
        annotations:
          title: "主机 ESTABLISHED 连接数过高"
          description: "主机:【{{$labels.instance}}】ESTABLISHED 连接数超过 1000, 当前 ESTABLISHED 连接数: {{$value}}"

      # More than 1000 sockets in TIME_WAIT.
      - alert: HostNetworkConnection-TIME_WAIT
        expr: sum(node_sockstat_TCP_tw) by (instance) > 1000
        for: 5m
        labels:
          severity: 'warning'
        annotations:
          title: "主机 TIME_WAIT 连接数过高"
          description: "主机:【{{$labels.instance}}】TIME_WAIT 连接数超过 1000, 当前 TIME_WAIT 连接数: {{$value}}"

      # Inbound traffic on an ens* interface above 100 MiB/s.
      - alert: HostUnusualNetworkThroughputIn
        expr: sum by (instance, device) (rate(node_network_receive_bytes_total{device=~"ens.*"}[2m])) / 1024 / 1024 > 100
        for: 5m
        labels:
          severity: 'warning'
        annotations:
          title: "主机网卡入口流量过高"
          description: "主机:【{{$labels.instance}}】, 网卡: {{$labels.device}} 入口流量超过 (> 100 MB/s), 当前值: {{$value}}"

      # Outbound traffic on an ens* interface above 100 MiB/s.
      - alert: HostUnusualNetworkThroughputOut
        expr: sum by (instance, device) (rate(node_network_transmit_bytes_total{device=~"ens.*"}[2m])) / 1024 / 1024 > 100
        for: 5m
        labels:
          severity: 'warning'
        annotations:
          title: "主机网卡出口流量过高"
          description: "主机:【{{$labels.instance}}】, 网卡: {{$labels.device}} 出口流量超过 (> 100 MB/s), 当前值: {{$value}}"

      # Read throughput of an sd* disk above 50 MiB/s.
      - alert: HostUnusualDiskReadRate
        expr: sum by (instance, device) (rate(node_disk_read_bytes_total{device=~"sd.*"}[2m])) / 1024 / 1024 > 50
        for: 5m
        labels:
          severity: 'warning'
        annotations:
          title: "主机磁盘读取速率过高"
          description: "主机:【{{$labels.instance}}】, 磁盘: {{$labels.device}} 读取速度超过 (50 MB/s), 当前值: {{$value}}"

      # Write throughput of an sd* disk above 50 MiB/s.
      - alert: HostUnusualDiskWriteRate
        expr: sum by (instance, device) (rate(node_disk_written_bytes_total{device=~"sd.*"}[2m])) / 1024 / 1024 > 50
        for: 2m
        labels:
          severity: 'warning'
        annotations:
          title: "主机磁盘写入速率过高"
          description: "主机:【{{$labels.instance}}】, 磁盘: {{$labels.device}} 写入速度超过 (50 MB/s), 当前值: {{$value}}"

      # Fewer than 10% of inodes free on an ext4/xfs filesystem.
      - alert: HostOutOfInodes
        expr: node_filesystem_files_free{fstype=~"ext4|xfs",mountpoint!~".*tmp|.*boot"} / node_filesystem_files{fstype=~"ext4|xfs",mountpoint!~".*tmp|.*boot"} * 100 < 10
        for: 2m
        labels:
          severity: 'warning'
        annotations:
          title: "主机分区 Inode 节点不足"
          description: "主机:【{{$labels.instance}}】{{$labels.mountpoint}} 分区 inode 节点不足 (可用值小于 {{ $value}}%)"

      # Average read latency above 0.1 s per completed read. The expression
      # yields SECONDS per operation, so the value is rendered with
      # humanizeDuration instead of a hard-coded "ms" suffix.
      - alert: HostUnusualDiskReadLatency
        expr: rate(node_disk_read_time_seconds_total{device=~"sd.*"}[1m]) / rate(node_disk_reads_completed_total{device=~"sd.*"}[1m]) > 0.1 and rate(node_disk_reads_completed_total{device=~"sd.*"}[1m]) > 0
        for: 2m
        labels:
          severity: 'warning'
        annotations:
          title: "主机磁盘 Read 延迟过高"
          description: "主机:【{{$labels.instance}}】, 磁盘: {{$labels.device}} Read 延迟过高 (read operations > 100ms), 当前延迟值: {{ $value | humanizeDuration }}"

      # Average write latency above 0.1 s per completed write (value in seconds,
      # rendered with humanizeDuration).
      - alert: HostUnusualDiskWriteLatency
        expr: rate(node_disk_write_time_seconds_total{device=~"sd.*"}[1m]) / rate(node_disk_writes_completed_total{device=~"sd.*"}[1m]) > 0.1 and rate(node_disk_writes_completed_total{device=~"sd.*"}[1m]) > 0
        for: 2m
        labels:
          severity: 'warning'
        annotations:
          title: "主机磁盘 Write 延迟过高"
          description: "主机:【{{$labels.instance}}】, 磁盘: {{$labels.device}} Write 延迟过高 (write operations > 100ms), 当前延迟值: {{ $value | humanizeDuration }}"
alertmanager
[root@vm11 alertmanager]# cat alertmanager.yml
route:
  group_by: ['alertname']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 3600s
  # Default receiver: alerts that match no child route (for example
  # severity=critical) are delivered here.
  receiver: 'Warning'
  routes:
    # Label values are matched case-sensitively. The alert rules emit
    # lowercase severities ("warning", "critical"), so the matchers must be
    # lowercase too; the previous "Warning"/"Information" never matched.
    - receiver: 'Information'
      repeat_interval: 3600s
      match:
        severity: information
    - receiver: 'Warning'
      repeat_interval: 3600s
      match:
        severity: warning

receivers:
  - name: 'Warning'
    webhook_configs:
      - url: 'http://192.1x.1xx.xx:3xx/alertmanager/warning'
        send_resolved: true
  # NOTE(review): the Go webhook service shown alongside this config only
  # registers /alertmanager/warning; confirm that /alertmanager/information
  # is served somewhere before relying on this receiver.
  - name: 'Information'
    webhook_configs:
      - url: 'http://192.168.1x.1x:38xx/alertmanager/information'
        send_resolved: true

# While a critical alert is firing, suppress warning alerts that carry the
# same alertname, dev and instance labels.
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'dev', 'instance']
go code
- main.go
package main
import (
"bytes"
"encoding/json"
"fmt"
"github.com/gin-gonic/gin"
"io"
"io/ioutil"
"net/http"
"prometheus_alarm/alarmtool"
"prometheus_alarm/prometheus"
"strings"
)
// 要害
func JSONDecode(r io.Reader, obj interface{}) error {if err := json.NewDecoder(r).Decode(obj); err != nil {return err}
return nil
}
// main starts the Gin HTTP server that receives Alertmanager webhooks and
// forwards every alert to a WeCom group robot.
//
// Routes:
//   - GET  /health                always answers {"status": 200}
//   - POST /alertmanager/warning  Alertmanager webhook receiver
func main() {
	gin.SetMode(gin.ReleaseMode)
	r := gin.Default()

	r.GET("/health", func(c *gin.Context) {
		c.JSON(http.StatusOK, gin.H{
			"status": 200,
		})
	})

	r.POST("/alertmanager/warning", func(c *gin.Context) {
		var reqInfo prometheus.AlertMsgRet

		data, err := c.GetRawData()
		if err != nil {
			// Without a body there is nothing to forward; reject the request
			// instead of reporting success.
			fmt.Println(err.Error())
			c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
			return
		}
		fmt.Println("### data:", string(data))

		// GetRawData consumed the body; put the bytes back so it can be
		// decoded (and read again by anything running later).
		c.Request.Body = ioutil.NopCloser(bytes.NewBuffer(data))
		if err := JSONDecode(c.Request.Body, &reqInfo); err != nil {
			fmt.Println("decode err:", err)
			c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
			return
		}

		// NOTE(review): the mention list and the robot key are hard-coded;
		// the key is a secret and should come from configuration.
		userList := []string{"18xxxxxxxxx"}
		url := "https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=axxxxxxxxx"

		for _, alert := range reqInfo.Alerts {
			var (
				tmpl  string
				extra []string
			)
			switch alert.Status {
			case "firing":
				tmpl = prometheus.FiringTemp
				extra = []string{"description17", alert.Annotations.Description}
			case "resolved":
				tmpl = prometheus.ResolvedTemp
				extra = []string{
					"endsAt17", alarmtool.TimeTransform(alert.EndsAt),
					"summary17", alert.Annotations.Summary,
				}
			default:
				// Unknown status: nothing to send.
				continue
			}

			// Placeholder/value pairs shared by both templates. A Replacer
			// substitutes in a single pass, so a label value that happens to
			// contain another placeholder is not substituted a second time.
			pairs := append([]string{
				"alertname17", alert.Labels.AlertName,
				"appservice17", alert.Labels.AppService,
				"startsAt17", alarmtool.TimeTransform(alert.StartsAt),
				"city17", alert.Labels.City,
				"instance17", alert.Labels.Instance,
				"severity17", alert.Labels.Severity,
			}, extra...)
			content := strings.NewReplacer(pairs...).Replace(tmpl)

			alarmtool.SendMsgChatGroupPerson(url, userList, content)
		}
		c.JSON(200, "success")
	})

	// Listen on every interface, port 38060. Run only returns on failure.
	if err := r.Run("0.0.0.0:38060"); err != nil {
		panic(fmt.Sprintf("http server stopped: %v", err))
	}
}
- prometheus/promethues.go
package prometheus
// AlertMsgRet mirrors the JSON body that Alertmanager posts to a webhook
// receiver. Only the label and annotation keys used by this project are
// declared; any other keys in the payload are dropped when decoding.
type AlertMsgRet struct {
	Receiver string `json:"receiver"`
	Status   string `json:"status"`

	// Alerts holds the individual alerts of the notification group.
	Alerts []struct {
		Status string `json:"status"`
		Labels struct {
			AlertName    string `json:"alertname"`
			AppService   string `json:"appservice"`
			City         string `json:"city"`
			Instance     string `json:"instance"`
			InstanceHost string `json:"instancehost"`
			Job          string `json:"job"`
			Severity     string `json:"severity"`
		} `json:"labels"`
		Annotations struct {
			Description string `json:"description"`
			Summary     string `json:"summary"`
		} `json:"annotations"`
		StartsAt     string `json:"startsAt"`
		EndsAt       string `json:"endsAt"`
		GeneratorURL string `json:"generatorURL"`
		Fingerprint  string `json:"fingerprint"`
	} `json:"alerts"`

	GroupLabels struct {
		AlertName string `json:"alertname"`
	} `json:"groupLabels"`

	CommonLabels struct {
		AlertName    string `json:"alertname"`
		AppService   string `json:"appservice"`
		City         string `json:"city"`
		Instance     string `json:"instance"`
		InstanceHost string `json:"instancehost"`
		Job          string `json:"job"`
		Severity     string `json:"severity"`
	} `json:"commonLabels"`

	CommonAnnotations struct {
		Description string `json:"description"`
		Summary     string `json:"summary"`
	} `json:"commonAnnotations"`

	ExternalURL     string `json:"externalURL"`
	Version         string `json:"version"`
	GroupKey        string `json:"groupKey"`
	TruncatedAlerts int    `json:"truncatedAlerts"`
}
// Markdown templates for the WeCom robot message. Tokens ending in "17"
// (alertname17, severity17, ...) are placeholders that the webhook handler
// replaces with the alert's values.
//
// The field captions previously contained garbled wording ("工夫" where
// "时间"/time was meant, "利用名称" for the application name, "复原" for
// recovery); they now read as intended. Placeholders and layout are unchanged.
const (
	// FiringTemp is rendered for alerts whose status is "firing".
	FiringTemp = "#### [Prometheus 告警信息] \n" +
		"> <font color=\"#FF0000\"> 告警名称 </font>:alertname17\n" +
		"> <font color=\"#FF0000\"> 告警级别 </font>:severity17 \n" +
		"> <font color=\"#FF0000\"> 应用名称 </font>:appservice17\n" +
		"> <font color=\"#FF0000\"> 开始时间 </font>:startsAt17 \n" +
		"> <font color=\"#FF0000\"> 主机地址 </font>:instance17 \n" +
		"> <font color=\"#FF0000\">City</font>:city17 \n" +
		"> <font color=\"#FF0000\"> 故障信息 </font>:\n" +
		"** messages:description17 **"

	// ResolvedTemp is rendered for alerts whose status is "resolved".
	ResolvedTemp = "#### [Prometheus 恢复信息] \n" +
		"> <font color=\"#00FF00\"> 告警名称 </font>:alertname17\n" +
		"> <font color=\"#00FF00\"> 告警级别 </font>:severity17 \n" +
		"> <font color=\"#00FF00\"> 应用名称 </font>:appservice17\n" +
		"> <font color=\"#00FF00\"> 开始时间 </font>:startsAt17 \n" +
		"> <font color=\"#00FF00\"> 恢复时间 </font>:endsAt17 \n" +
		"> <font color=\"#00FF00\"> 主机地址 </font>:instance17 \n" +
		"> <font color=\"#00FF00\">City</font>:city17 \n" +
		"> <font color=\"#00FF00\"> 恢复信息 </font>:\n" +
		"** messages:summary17 **"
)
-
json
{ "receiver": "Warning", "status": "resolved", "alerts": [{ "status": "resolved", "labels": { "alertname": "HostDown", "appservice": "prometheus", "city": "苏州", "instance": "192.168.1xx.x1:3xx0", "instancehost": "192.1xx.1xx.xx", "job": "prometheus", "severity": "critical" }, "annotations": { "description": "主机:【192.168.1xx.x:3xxx】has been down for more than 1 minute", "summary": "主机:【192.168.1xx.xx:3xxx】service is resolved" }, "startsAt": "2023-07-06T01:16:20.991Z", "endsAt": "2023-07-06T01:29:20.991Z", "generatorURL": "http://vm11:39090/graph?g0.expr=up%7Bjob%3D~%22node-exporter%7Cprometheus%7Cgrafana%7Calertmanager%22%7D+%3D%3D+0\u0026g0.tab=1", "fingerprint": "1becfcf3e2ebd5f7" }], "groupLabels": {"alertname": "HostDown"}, "commonLabels": { "alertname": "HostDown", "appservice": "prometheus", "city": "苏州", "instance": "192.168.1x.xx:3xx", "instancehost": "192.1x8.1x4.x", "job": "prometheus", "severity": "critical" }, "commonAnnotations": { "description": "主机:【192.16x.xx.x1:3xxx】has been down for more than 1 minute", "summary": "主机:【192.168.1xx.xx:39xx】service is resolved" }, "externalURL": "http://vm11:39xxx", "version": "4", "groupKey": "{}:{alertname=\"HostDown\"}", "truncatedAlerts": 0 }
- alarmtool/timetransform.go
package alarmtool
import (
"fmt"
"time"
)
// TimeTransform converts an RFC 3339 timestamp (for example
// "2023-07-06T01:16:20.991Z") to China Standard Time and formats it as
// "2006-01-02 15:04:05".
//
// It returns the literal string "error inputTime" when inputTime cannot be
// parsed. The converted value is also printed to stdout.
func TimeTransform(inputTime string) string {
	// Parse the incoming timestamp.
	t, err := time.Parse(time.RFC3339Nano, inputTime)
	if err != nil {
		fmt.Println("Failed to parse input time:", err)
		return "error inputTime"
	}

	// Prefer the named zone, but fall back to a fixed UTC+8 offset when the
	// time zone database is not available (e.g. a minimal container image
	// without tzdata). Previously that case turned every timestamp into an
	// error string. The two agree for present-day timestamps, which is what
	// alert start/end times are.
	loc, err := time.LoadLocation("Asia/Shanghai")
	if err != nil {
		fmt.Println("Failed to load time zone:", err)
		loc = time.FixedZone("CST", 8*60*60)
	}

	// Convert to the target zone and render in the output layout.
	outputTime := t.In(loc).Format("2006-01-02 15:04:05")
	fmt.Println(outputTime)
	return outputTime
}
- alarmtool/wecome
package alarmtool
import (
"bytes"
"encoding/json"
"fmt"
"io/ioutil"
"log"
"net/http"
)
// ChatGroupJsonRes is the JSON request body sent to the group robot webhook:
// a message type plus the markdown content.
type ChatGroupJsonRes struct {
	MsgType  string `json:"msgtype"`
	Markdown struct {
		Content interface{} `json:"content"`
	} `json:"markdown"`
}

// T4 is the JSON reply of the webhook (errcode / errmsg). This package also
// uses it to report local failures, with Errcode set to -1.
type T4 struct {
	Errcode int    `json:"errcode"`
	Errmsg  string `json:"errmsg"`
}

// SendMsgChatGroupPerson posts comment as a markdown message to the robot
// webhook URL webHook and appends one "<@id>" mention per entry of person.
//
// It returns the decoded webhook reply. When the message cannot be built,
// sent, or the reply cannot be read or decoded, it logs the error and returns
// a T4 with Errcode -1 and the error text in Errmsg. It never terminates the
// process, so a failed notification cannot take down the calling server.
//
// NOTE(review): the request uses an http.Client without a timeout, so a
// hanging webhook endpoint blocks the caller; consider configuring one.
func SendMsgChatGroupPerson(webHook string, person []string, comment string) *T4 {
	// Build the mention suffix, e.g. "<@user1><@user2>".
	var perSionComment string
	for _, p := range person {
		perSionComment += fmt.Sprintf("<@%s>", p)
	}

	var j ChatGroupJsonRes
	j.MsgType = "markdown"
	j.Markdown.Content = comment + "\n" + perSionComment

	data, err := json.Marshal(j)
	if err != nil {
		log.Printf("marshal webhook message: %v", err)
		return &T4{Errcode: -1, Errmsg: err.Error()}
	}
	fmt.Println(string(data))

	req, err := http.NewRequest("POST", webHook, bytes.NewReader(data))
	if err != nil {
		// Typically an invalid URL; req is nil here and must not be used.
		log.Printf("build webhook request: %v", err)
		return &T4{Errcode: -1, Errmsg: err.Error()}
	}
	req.Header.Add("Content-Type", "application/json")
	req.Header.Add("Accept", "application/json")

	client := &http.Client{}
	resp, err := client.Do(req)
	if err != nil {
		log.Printf("send webhook request: %v", err)
		return &T4{Errcode: -1, Errmsg: err.Error()}
	}
	defer resp.Body.Close()

	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		log.Printf("read webhook response: %v", err)
		return &T4{Errcode: -1, Errmsg: err.Error()}
	}
	fmt.Println("####send msg##", string(body))

	var assetList T4
	if err := json.Unmarshal(body, &assetList); err != nil {
		log.Printf("decode webhook response: %v", err)
		return &T4{Errcode: -1, Errmsg: err.Error()}
	}
	return &assetList
}
- Dockerfile
# ---- Build stage: compile a static Linux binary ----
FROM golang:1.19 AS builder
ENV HOME=/app
ENV CGO_ENABLED=0
ENV GOOS=linux
ENV GOPROXY=https://goproxy.cn,direct
ENV TIME_ZONE=Asia/Shanghai
WORKDIR /app
COPY . .
RUN go mod download
RUN go build -v -a -installsuffix cgo -o demo main.go

# ---- Runtime stage ----
FROM alpine:latest
# ca-certificates: needed for the HTTPS call to the robot webhook.
# tzdata: needed by time.LoadLocation("Asia/Shanghai"); without it the
# service cannot convert alert timestamps to local time.
RUN apk --no-cache add ca-certificates tzdata
ENV TZ=Asia/Shanghai
# Set the working directory
WORKDIR /bin/
COPY --from=builder /app/demo .
# Port the service listens on (documentation only; publish it with -p).
EXPOSE 38060
ENTRYPOINT ["/bin/demo"]
test
docker build -t prometheusalarm:test1 .
docker run -d -p 38060:38060 prometheusalarm:test1
正文完