
Kubernetes: kube-prometheus alerting rules, manually localized

alertmanager

groups:
- name: alertmanager.rules
  rules:
  - alert: Alertmanager configurations inconsistent
    annotations:
      message: 'Configuration of {{$labels.service}} is out of sync across the Alertmanager cluster nodes in cluster {{$labels.cluster}}!'
    expr: |
      count_values("config_hash", alertmanager_config_hash{job="alertmanager-main",namespace="monitoring"}) BY (cluster, service) / ON(cluster, service) GROUP_LEFT() label_replace(max(prometheus_operator_spec_replicas{job="prometheus-operator",namespace="monitoring",controller="alertmanager"}) by (cluster, name, job, namespace, controller), "service", "alertmanager-$1", "name", "(.*)") != 1
    for: 5m
    labels:
      severity: critical
  - alert: Alertmanager reload failed
    annotations:
      message: 'Reloading the Alertmanager configuration failed for {{$labels.namespace}}/{{$labels.pod}} in cluster {{$labels.cluster}}!'
    expr: |
      alertmanager_config_last_reload_successful{job="alertmanager-main",namespace="monitoring"} == 0
    for: 10m
    labels:
      severity: warning
  - alert: Alertmanager members inconsistent
    annotations:
      message: 'Alertmanager in cluster {{$labels.cluster}} has not found all members of its cluster!'
    expr: |
      alertmanager_cluster_members{job="alertmanager-main",namespace="monitoring"}
        != on (cluster,service) GROUP_LEFT()
      count by (cluster,service) (alertmanager_cluster_members{job="alertmanager-main",namespace="monitoring"})
    for: 5m
    labels:
      severity: critical
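
These groups are standard Prometheus rule files. With kube-prometheus, a file like the one above is usually not loaded directly but wrapped in a PrometheusRule object that the Prometheus Operator discovers. A minimal sketch follows; the metadata name and the prometheus: k8s / role: alert-rules labels are assumptions and must match the ruleSelector of your Prometheus resource:

apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: alertmanager-rules-localized   # hypothetical name
  namespace: monitoring
  labels:
    prometheus: k8s                    # assumed to match the Prometheus CR's ruleSelector
    role: alert-rules
spec:
  groups:
  - name: alertmanager.rules
    rules:
    - alert: Alertmanager reload failed
      annotations:
        message: 'Reloading the Alertmanager configuration failed for {{$labels.namespace}}/{{$labels.pod}} in cluster {{$labels.cluster}}!'
      expr: |
        alertmanager_config_last_reload_successful{job="alertmanager-main",namespace="monitoring"} == 0
      for: 10m
      labels:
        severity: warning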

apiserver

groups:
- name: kubernetes-system-apiserver
  rules:
  - alert: K8S API server latency high
    annotations:
      message: 'API server in cluster {{$labels.cluster}}: {{$labels.verb}} {{$labels.resource}} shows abnormal latency of {{$value}} seconds!'
    expr: |
      (cluster:apiserver_request_duration_seconds:mean5m{job="apiserver"}
        >
        on (verb) group_left()
        (avg by (verb) (cluster:apiserver_request_duration_seconds:mean5m{job="apiserver"} >= 0)
          +
          2*stddev by (verb) (cluster:apiserver_request_duration_seconds:mean5m{job="apiserver"} >= 0)
        )
      ) > on (verb) group_left()
      1.2 * avg by (verb) (cluster:apiserver_request_duration_seconds:mean5m{job="apiserver"} >= 0)
      and on (verb,resource)
      cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{job="apiserver",quantile="0.99"}
      >
      1
    for: 5m
    labels:
      severity: warning
  - alert: K8S API server latency critical
    annotations:
      message: 'API server in cluster {{$labels.cluster}}: 99% of {{$labels.verb}} {{$labels.resource}} requests have latency of up to {{$value}} seconds!'
    expr: |
      cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{job="apiserver",quantile="0.99"} > 4
    for: 10m
    labels:
      severity: critical
  - alert: K8S API server error rate too high
    annotations:
      message: '{{$value | humanizePercentage}} of requests to the API server in cluster {{$labels.cluster}} are returning errors!'
    expr: |
      sum(rate(apiserver_request_total{job="apiserver",code=~"5.."}[5m]))
        /
      sum(rate(apiserver_request_total{job="apiserver"}[5m])) > 0.03
    for: 10m
    labels:
      severity: critical
  - alert: K8S API server error rate elevated
    annotations:
      message: '{{$value | humanizePercentage}} of requests to the API server in cluster {{$labels.cluster}} are returning errors!'
    expr: |
      sum(rate(apiserver_request_total{job="apiserver",code=~"5.."}[5m]))
        /
      sum(rate(apiserver_request_total{job="apiserver"}[5m])) > 0.01
    for: 10m
    labels:
      severity: warning
  - alert: K8S API server resource error rate too high
    annotations:
      message: '{{$value | humanizePercentage}} of {{$labels.verb}} {{$labels.resource}} {{$labels.subresource}} requests to the API server in cluster {{$labels.cluster}} are returning errors!'
    expr: |
      sum(rate(apiserver_request_total{job="apiserver",code=~"5.."}[5m])) by (resource,subresource,verb,cluster)
        /
      sum(rate(apiserver_request_total{job="apiserver"}[5m])) by (resource,subresource,verb,cluster) > 0.10
    for: 10m
    labels:
      severity: critical
  - alert: K8S API server resource error rate elevated
    annotations:
      message: '{{$value | humanizePercentage}} of {{$labels.verb}} {{$labels.resource}} {{$labels.subresource}} requests to the API server in cluster {{$labels.cluster}} are returning errors!'
    expr: |
      sum(rate(apiserver_request_total{job="apiserver",code=~"5.."}[5m])) by (resource,subresource,verb,cluster)
        /
      sum(rate(apiserver_request_total{job="apiserver"}[5m])) by (resource,subresource,verb,cluster) > 0.05
    for: 10m
    labels:
      severity: warning
  - alert: K8S client certificate expiring within 7 days
    annotations:
      message: 'A K8S client certificate in cluster {{$labels.cluster}} will expire within 7 days!'
    expr: |
      apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800
    labels:
      severity: warning
  - alert: K8S client certificate expiring within 24 hours
    annotations:
      message: 'A K8S client certificate in cluster {{$labels.cluster}} will expire within 24 hours!'
    expr: |
      apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400
    labels:
      severity: critical
  - alert: API server down
    annotations:
      message: 'Prometheus targets in cluster {{$labels.cluster}} cannot discover the API server!'
    expr: |
      absent(up{job="apiserver"} == 1)
    for: 15m
    labels:
      severity: critical
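
The severity label carried by every rule in this document (critical / warning / none) is what Alertmanager routes on. A sketch of a matching route tree follows; the receiver names and webhook URLs are placeholders you would define yourself:

route:
  group_by: ['cluster', 'alertname']
  receiver: warning-chat             # hypothetical default receiver
  routes:
  - match:
      severity: critical
    receiver: critical-pager         # hypothetical paging receiver
  - match:
      severity: none
    receiver: blackhole              # hypothetical no-op receiver for informational alerts
receivers:
- name: warning-chat
  webhook_configs:
  - url: http://chat-bridge.example.com/alerts    # placeholder
- name: critical-pager
  webhook_configs:
  - url: http://pager-bridge.example.com/alerts   # placeholder
- name: blackhole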

apps

groups:
- name: kubernetes-apps
  rules:
  - alert: K8S pod restarting frequently
    annotations:
      message: 'Pod {{$labels.namespace}}/{{$labels.pod}} ({{$labels.container}}) in cluster {{$labels.cluster}} restarted {{ printf "%.2f" $value }} times within 10 minutes!'
    expr: |
      rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[15m]) * 60 * 10 > 1
    for: 10m
    labels:
      severity: critical
#  - alert: K8S pod terminated
#    annotations:
#      message: 'Pod {{$labels.namespace}}/{{$labels.pod}} in cluster {{$labels.cluster}} terminated, reason: {{$labels.reason}}!'
#    expr: |
#      kube_pod_container_status_terminated_reason{reason!="Completed"} > 0
#    for: 15m
#    labels:
#      severity: warning
#  - alert: K8S pod completed
#    annotations:
#      message: 'Pod {{$labels.namespace}}/{{$labels.pod}} in cluster {{$labels.cluster}} terminated, reason: {{$labels.reason}}!'
#    expr: |
#      kube_pod_container_status_terminated_reason{reason="Completed"} > 0
#    for: 15m
#    labels:
#      severity: none
  - alert: K8S pod waiting
    annotations:
      message: 'Pod {{$labels.namespace}}/{{$labels.pod}} in cluster {{$labels.cluster}} is waiting, reason: {{$labels.reason}}!'
    expr: |
      kube_pod_container_status_waiting_reason{reason!="ContainerCreating"} > 0
    for: 3m
    labels:
      severity: critical
  - alert: K8S pod unschedulable
    annotations:
      message: 'Pod {{$labels.namespace}}/{{$labels.pod}} in cluster {{$labels.cluster}} has no node matching its requirements and cannot be scheduled!'
    expr: |
      sum by (cluster,pod) (kube_pod_status_unschedulable) > 0
    for: 5m
    labels:
      severity: critical
  - alert: K8S pod NotReady
    annotations:
      message: '{{$labels.namespace}}/{{$labels.pod}} in cluster {{$labels.cluster}} has been in a non-ready state for more than 15 minutes!'
    expr: |
      sum by (namespace, pod, cluster) (max by(namespace, pod, cluster) (kube_pod_status_phase{job="kube-state-metrics", phase=~"Pending|Unknown"}) * on(namespace, pod, cluster) group_left(owner_kind) max by(namespace, pod, owner_kind, cluster) (kube_pod_owner{owner_kind!="Job"})) > 0
    for: 15m
    labels:
      severity: critical
  - alert: K8S deployment status abnormal
    annotations:
      message: 'Deployment {{$labels.namespace}}/{{$labels.deployment}} in cluster {{$labels.cluster}} is abnormal; some replicas have been unavailable for 15 minutes!'
    expr: |
      kube_deployment_status_replicas_unavailable{cluster="prod"} != 0
    for: 15m
    labels:
      severity: warning
  - alert: K8S deployment generation mismatch
    annotations:
      message: 'Deployment {{$labels.namespace}}/{{$labels.deployment}} in cluster {{$labels.cluster}} has a generation mismatch, which indicates that the rollout failed and has not been rolled back for 15 minutes!'
    expr: |
      kube_deployment_status_observed_generation{job="kube-state-metrics"}
        !=
      kube_deployment_metadata_generation{job="kube-state-metrics"}
    for: 15m
    labels:
      severity: critical
  - alert: K8S deployment replicas mismatch
    annotations:
      message: 'Deployment {{$labels.namespace}}/{{$labels.deployment}} in cluster {{$labels.cluster}} has not matched its expected replica count for more than 15 minutes!'
    expr: |
      kube_deployment_spec_replicas{job="kube-state-metrics"}
        !=
      kube_deployment_status_replicas_available{job="kube-state-metrics"}
    for: 15m
    labels:
      severity: critical
  - alert: K8S statefulset replicas mismatch
    annotations:
      message: 'StatefulSet {{$labels.namespace}}/{{$labels.statefulset}} in cluster {{$labels.cluster}} has not matched its expected replica count for more than 15 minutes!'
    expr: |
      kube_statefulset_status_replicas_ready{job="kube-state-metrics"}
        !=
      kube_statefulset_status_replicas{job="kube-state-metrics"}
    for: 15m
    labels:
      severity: critical
  - alert: K8S statefulset generation mismatch
    annotations:
      message: 'StatefulSet {{$labels.namespace}}/{{$labels.statefulset}} in cluster {{$labels.cluster}} has a generation mismatch, which indicates that the rollout failed and has not been rolled back!'
    expr: |
      kube_statefulset_status_observed_generation{job="kube-state-metrics"}
        !=
      kube_statefulset_metadata_generation{job="kube-state-metrics"}
    for: 15m
    labels:
      severity: critical
  - alert: K8S statefulset update not rolled out
    annotations:
      message: 'StatefulSet {{$labels.namespace}}/{{$labels.statefulset}} in cluster {{$labels.cluster}} has an update that has not been rolled out; some replicas are still running a non-updated revision!'
    expr: |
      max without (revision) (kube_statefulset_status_current_revision{job="kube-state-metrics"}
          unless
        kube_statefulset_status_update_revision{job="kube-state-metrics"}
      )
        *
      (kube_statefulset_replicas{job="kube-state-metrics"}
          !=
        kube_statefulset_status_replicas_updated{job="kube-state-metrics"}
      )
    for: 15m
    labels:
      severity: critical
  - alert: K8S daemonset rollout failed
    annotations:
      message: 'Only {{$value | humanizePercentage}} of the desired pods of daemonset {{$labels.namespace}}/{{$labels.daemonset}} in cluster {{$labels.cluster}} are scheduled and ready!'
    expr: |
      kube_daemonset_status_number_ready{job="kube-state-metrics"}
        /
      kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} < 1.00
    for: 15m
    labels:
      severity: critical
#  - alert: K8S container waiting
#    annotations:
#      message: 'Container {{$labels.container}} in pod {{$labels.namespace}}/{{$labels.pod}} of cluster {{$labels.cluster}} has been in a waiting state for more than 1 hour!'
#    expr: |
#      sum by (cluster, namespace, pod, container) (kube_pod_container_status_waiting_reason{job="kube-state-metrics"}) > 0
#    for: 1h
#    labels:
#      severity: warning
  - alert: K8S daemonset pods not scheduled
    annotations:
      message: '{{$value}} pods of daemonset {{$labels.namespace}}/{{$labels.daemonset}} in cluster {{$labels.cluster}} are not scheduled!'
    expr: |
      kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"}
        -
      kube_daemonset_status_current_number_scheduled{job="kube-state-metrics"} > 0
    for: 10m
    labels:
      severity: warning
  - alert: K8S daemonset pods misscheduled
    annotations:
      message: '{{$value}} pods of daemonset {{$labels.namespace}}/{{$labels.daemonset}} in cluster {{$labels.cluster}} are running where they are not expected to run!'
    expr: |
      kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} > 0
    for: 10m
    labels:
      severity: warning
  - alert: K8S cronjob running too long
    annotations:
      message: 'CronJob {{$labels.namespace}}/{{$labels.cronjob}} in cluster {{$labels.cluster}} is taking more than 1 hour to complete!'
    expr: |
      time() - kube_cronjob_next_schedule_time{job="kube-state-metrics"} > 3600
    for: 1h
    labels:
      severity: warning
  - alert: K8S job completion delayed
    annotations:
      message: 'Job {{$labels.namespace}}/{{$labels.job_name}} in cluster {{$labels.cluster}} is taking more than 1 hour to complete!'
    expr: |
      kube_job_spec_completions{job="kube-state-metrics"} - kube_job_status_succeeded{job="kube-state-metrics"}  > 0
    for: 1h
    labels:
      severity: warning
  - alert: K8S job failed
    annotations:
      message: 'Job {{$labels.namespace}}/{{$labels.job_name}} in cluster {{$labels.cluster}} failed to complete!'
    expr: |
      kube_job_status_failed{job="kube-state-metrics"}  > 0
    for: 15m
    labels:
      severity: warning
  - alert: K8S HPA replicas mismatch
    annotations:
      message: 'HPA {{$labels.namespace}}/{{$labels.hpa}} in cluster {{$labels.cluster}} has not matched the desired replica count for more than 15 minutes!'
    expr: |
      (kube_hpa_status_desired_replicas{job="kube-state-metrics"}
        !=
      kube_hpa_status_current_replicas{job="kube-state-metrics"})
        and
      changes(kube_hpa_status_current_replicas[15m]) == 0
    for: 15m
    labels:
      severity: warning
  - alert: K8S HPA scale-down detected
    annotations:
      message: 'HPA {{$labels.namespace}}/{{$labels.hpa}} in cluster {{$labels.cluster}} has scaled down; available replicas match the desired count, currently {{ printf "%.0f" $value }}!'
    expr: |
      (kube_hpa_status_desired_replicas{job="kube-state-metrics"}
        ==
      kube_hpa_status_current_replicas{job="kube-state-metrics"})
        and
      delta(kube_hpa_status_current_replicas[5m]) < 0
    for: 1m
    labels:
      severity: none
  - alert: K8S HPA scale-up detected
    annotations:
      message: 'HPA {{$labels.namespace}}/{{$labels.hpa}} in cluster {{$labels.cluster}} has scaled up; available replicas match the desired count, currently {{ printf "%.0f" $value }}!'
    expr: |
      (kube_hpa_status_desired_replicas{job="kube-state-metrics"}
        ==
      kube_hpa_status_current_replicas{job="kube-state-metrics"})
        and
      delta(kube_hpa_status_current_replicas[5m]) > 0
    for: 1m
    labels:
      severity: none
  - alert: K8S HPA at max replicas
    annotations:
      message: 'HPA {{$labels.namespace}}/{{$labels.hpa}} in cluster {{$labels.cluster}} has been running at its maximum replica limit for more than 15 minutes!'
    expr: |
      kube_hpa_status_current_replicas{job="kube-state-metrics"}
        ==
      kube_hpa_spec_max_replicas{job="kube-state-metrics"}
    for: 15m
    labels:
      severity: none
  - alert: K8S deployment version change notice
    annotations:
      message: 'Detected that deployment {{$labels.namespace}}/{{$labels.deployment}} in cluster {{$labels.cluster}} rolled to a new metadata generation and the replica count has reached the desired value.'
    expr: |
      (kube_deployment_status_observed_generation{job="kube-state-metrics"}
        ==
      kube_deployment_metadata_generation{job="kube-state-metrics"})
        and
      (kube_deployment_spec_replicas{job="kube-state-metrics"}
        ==
      kube_deployment_status_replicas_available{job="kube-state-metrics"})
        and
      changes(kube_deployment_status_observed_generation{job="kube-state-metrics"}[5m]) > 0
    for: 1m
    labels:
      severity: none
  - alert: K8S deployment version change abnormal
    annotations:
      message: 'Detected that deployment {{$labels.namespace}}/{{$labels.deployment}} in cluster {{$labels.cluster}} rolled to a new metadata generation, but the number of replicas online does not match the desired value; the running version is not the new one, an HPA has been triggered, or the service is failing!'
    expr: |
     ((kube_deployment_status_observed_generation{job="kube-state-metrics"}
        !=
      kube_deployment_metadata_generation{job="kube-state-metrics"})
        or
      (kube_deployment_spec_replicas{job="kube-state-metrics"}
        !=
      kube_deployment_status_replicas_available{job="kube-state-metrics"}))
        or
      ((kube_hpa_status_desired_replicas{job="kube-state-metrics"}
        !=
      kube_hpa_status_current_replicas{job="kube-state-metrics"})
        and
      changes(kube_hpa_status_current_replicas[15m]) != 0)
        and
      changes(kube_deployment_status_observed_generation{job="kube-state-metrics"}[5m]) > 0
    for: 1m
    labels:
      severity: critical

controller-manager

groups:
- name: kubernetes-system-controller-manager
  rules:
  - alert: KubeControllerManager down
    annotations:
      message: 'KubeControllerManager has disappeared from Prometheus target discovery!'
    expr: |
      absent(up{job="kube-controller-manager"} == 1)
    for: 15m
    labels:
      severity: critical

general

groups:
- name: general.rules
  rules:
  - alert: Targets down
    annotations:
      message: '{{ printf "%.4g" $value }}% of the {{$labels.job}} targets in namespace {{$labels.namespace}} of cluster {{$labels.cluster}} are down!'
    expr: 100 * (count(up == 0) BY (cluster, job, namespace, service) / count(up) BY (cluster, job,
      namespace, service)) > 10
    for: 10m
    labels:
      severity: warning
  - alert: Watchdog
    annotations:
      message: |
        This alert is meant to confirm that the entire alerting pipeline is functional. It is always firing, so it should always be firing in Alertmanager and always be sent to every configured receiver.
    expr: vector(1)
    labels:
      severity: none
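
Watchdog is only useful if something external notices when it stops arriving. A common pattern is to route it to a dedicated receiver with a short repeat_interval; a sketch follows, where the webhook URL is a placeholder for whatever dead-man's-switch service you use:

route:
  routes:
  - match:
      alertname: Watchdog
    receiver: watchdog
    repeat_interval: 5m            # keep re-sending so the external check keeps being fed
receivers:
- name: watchdog
  webhook_configs:
  - url: http://deadmans-switch.example.com/ping   # placeholder endpoint that alerts when pings stop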

kubelet

groups:
- name: kubernetes-system-kubelet
  rules:
  - alert: K8S node not ready
    annotations:
      message: 'K8S node {{$labels.node}} in cluster {{$labels.cluster}} has been unready for more than 15 minutes!'
    expr: |
      kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0
    for: 15m
    labels:
      severity: warning
  - alert: K8S node unreachable
    annotations:
      message: 'K8S node {{$labels.node}} in cluster {{$labels.cluster}} is unreachable and some workloads have been rescheduled!'
    expr: |
      kube_node_spec_taint{job="kube-state-metrics",key="node.kubernetes.io/unreachable",effect="NoSchedule"} == 1
    labels:
      severity: warning
  - alert: Kubelet running too many pods
    annotations:
      message: 'Kubelet on node {{$labels.node}} in cluster {{$labels.cluster}} is running {{$value | humanizePercentage}} of its pod capacity on this node!'
    expr: |
      max(max(kubelet_running_pod_count{job="kubelet", metrics_path="/metrics"}) by(instance,cluster) * on(instance,cluster) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"}) by(node,cluster) / max(kube_node_status_capacity_pods{job="kube-state-metrics"}) by(node,cluster) > 0.95
    for: 15m
    labels:
      severity: warning
  - alert: Kubelet down
    annotations:
      message: 'Prometheus targets in cluster {{$labels.cluster}} cannot discover kubelet {{$labels.node}}!'
    expr: |
      absent(up{job="kubelet", metrics_path="/metrics"} == 1)
    for: 15m
    labels:
      severity: critical

network

groups:
- name: node-network
  rules:
  - alert: Node network interface flapping
    annotations:
      message: 'node-exporter {{$labels.namespace}}/{{$labels.pod}} in cluster {{$labels.cluster}} detected that interface {{$labels.device}} on its node changes state frequently!'
    expr: |
      changes(node_network_up{job="node-exporter",device!~"veth.+"}[2m]) > 2
    for: 2m
    labels:
      severity: warning
#  - alert: Node TCP allocated sockets high
#    expr: sum(avg_over_time(node_sockstat_TCP_alloc[5m])) by (instance,cluster)  > 5000
#    for: 1m
#    labels:
#      severity: critical
#    annotations:
#      message: 'Node {{$labels.instance}} in cluster {{$labels.cluster}} has {{ printf "%.0f" $value }} allocated TCP sockets!'
#  - alert: Node UDP in-use sockets high
#    expr: sum(avg_over_time(node_sockstat_UDP_inuse[5m])) by (instance,cluster)  > 5000
#    for: 1m
#    labels:
#      severity: critical
#    annotations:
#      message: 'Node {{$labels.instance}} in cluster {{$labels.cluster}} has {{ printf "%.0f" $value }} UDP sockets in use!'
  - alert: Node network receive errors
    annotations:
      message: 'Interface {{$labels.device}} on node {{$labels.instance}} in cluster {{$labels.cluster}} encountered {{ printf "%.0f" $value }} receive errors in the last 2 minutes!'
    expr: |
      increase(node_network_receive_errs_total[2m]) > 10
    for: 5m
    labels:
      severity: warning
  - alert: Node network transmit errors
    annotations:
      message: 'Interface {{$labels.device}} on node {{$labels.instance}} in cluster {{$labels.cluster}} encountered {{ printf "%.0f" $value }} transmit errors in the last 2 minutes!'
    expr: |
      increase(node_network_transmit_errs_total[2m]) > 10
    for: 5m
    labels:
      severity: warning
  - alert: Node inbound bandwidth high
    annotations:
      message: 'Node {{$labels.instance}} in cluster {{$labels.cluster}} has inbound bandwidth above 100MB/s!'
    expr: |
      sum by (cluster,instance) (irate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100
    for: 5m
    labels:
      severity: warning
  - alert: Node outbound bandwidth high
    annotations:
      message: 'Node {{$labels.instance}} in cluster {{$labels.cluster}} has outbound bandwidth above 100MB/s!'
    expr: |
      sum by (cluster,instance) (irate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100
    for: 5m
    labels:
      severity: warning
  - alert: Node inbound packet loss high
    annotations:
      message: 'Node {{$labels.instance}} in cluster {{$labels.cluster}} has an inbound packet-loss rate of {{ printf "%.0f" $value }}% over the last 3 minutes!'
    expr: |
      sum by (instance,cluster) (irate(node_network_receive_drop_total[3m])) / sum by (instance,cluster) (irate(node_network_receive_packets_total[3m])) * 100 > 80
    for: 1m
    labels:
      severity: critical
  - alert: Node outbound packet loss high
    annotations:
      message: 'Node {{$labels.instance}} in cluster {{$labels.cluster}} has an outbound packet-loss rate of {{ printf "%.0f" $value }}% over the last 3 minutes!'
    expr: |
      sum by (instance,cluster) (irate(node_network_transmit_drop_total[3m])) / sum by (instance,cluster) (irate(node_network_transmit_packets_total[3m])) * 100 > 80
    for: 1m
    labels:
      severity: critical

prometheus-operator

groups:
- name: prometheus-operator
  rules:
  - alert: PrometheusOperatorReconcileErrors
    annotations:
      message: 'Errors occurred while reconciling {{$labels.controller}} in namespace {{$labels.namespace}} of cluster {{$labels.cluster}}!'
    expr: |
      rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator",namespace="monitoring"}[5m]) > 0.1
    for: 10m
    labels:
      severity: warning
  - alert: PrometheusOperator node lookup errors
    annotations:
      message: 'Errors occurred in namespace {{$labels.namespace}} while reconciling Prometheus in cluster {{$labels.cluster}}!'
    expr: |
      rate(prometheus_operator_node_address_lookup_errors_total{job="prometheus-operator",namespace="monitoring"}[5m]) > 0.1
    for: 10m
    labels:
      severity: warning

prometheus

groups:
- name: prometheus
  rules:
  - alert: Prometheus bad config
    annotations:
      message: 'Prometheus {{$labels.namespace}}/{{$labels.pod}} in cluster {{$labels.cluster}} failed to reload its configuration!'
    expr: |
      # Without max_over_time, failed scrapes could create false negatives, see
      # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
      max_over_time(prometheus_config_last_reload_successful{job="prometheus-k8s",namespace="monitoring"}[5m]) == 0
    for: 10m
    labels:
      severity: critical
  - alert: Prometheus notification queue running full
    annotations:
      message: 'The alert notification queue of Prometheus {{$labels.namespace}}/{{$labels.pod}} is predicted to run full in less than 30m!'
    expr: |
      # Without min_over_time, failed scrapes could create false negatives, see
      # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
      (predict_linear(prometheus_notifications_queue_length{job="prometheus-k8s",namespace="monitoring"}[5m], 60 * 30)
      >
        min_over_time(prometheus_notifications_queue_capacity{job="prometheus-k8s",namespace="monitoring"}[5m])
      )
    for: 15m
    labels:
      severity: warning
  - alert: Prometheus errors sending alerts
    annotations:
      message: '{{$labels.namespace}}/{{$labels.pod}} in cluster {{$labels.cluster}} encountered {{ printf "%.1f" $value }}% errors sending alerts to Alertmanager {{$labels.alertmanager}}!'
    expr: |
      (rate(prometheus_notifications_errors_total{job="prometheus-k8s",namespace="monitoring"}[5m])
      /
        rate(prometheus_notifications_sent_total{job="prometheus-k8s",namespace="monitoring"}[5m])
      )
      * 100
      > 1
    for: 15m
    labels:
      severity: warning
  - alert: Prometheus errors sending alerts to all Alertmanagers
    annotations:
      message: '{{$labels.namespace}}/{{$labels.pod}} in cluster {{$labels.cluster}} encountered {{ printf "%.1f" $value }}% errors sending alerts to all Alertmanagers {{$labels.alertmanager}}!'
    expr: |
      min without(alertmanager) (rate(prometheus_notifications_errors_total{job="prometheus-k8s",namespace="monitoring"}[5m])
      /
        rate(prometheus_notifications_sent_total{job="prometheus-k8s",namespace="monitoring"}[5m])
      )
      * 100
      > 3
    for: 15m
    labels:
      severity: critical
  - alert: Prometheus not connected to Alertmanagers
    annotations:
      message: 'Prometheus {{$labels.namespace}}/{{$labels.pod}} in cluster {{$labels.cluster}} is not connected to any Alertmanager!'
    expr: |
      max_over_time(prometheus_notifications_alertmanagers_discovered{job="prometheus"}[5m]) < 1
    for: 10m
    labels:
      severity: warning
  - alert: Prometheus TSDB reload failures
    annotations:
      message: 'Prometheus {{$labels.namespace}}/{{$labels.pod}} in cluster {{$labels.cluster}} detected {{$value | humanize}} reload failures in the last 3 hours!'
    expr: |
      increase(prometheus_tsdb_reloads_failures_total{job="prometheus-k8s",namespace="monitoring"}[3h]) > 0
    for: 4h
    labels:
      severity: warning
  - alert: Prometheus TSDB compaction failures
    annotations:
      message: 'Prometheus {{$labels.namespace}}/{{$labels.pod}} in cluster {{$labels.cluster}} detected {{$value | humanize}} compaction failures in the last 3 hours!'
    expr: |
      increase(prometheus_tsdb_compactions_failed_total{job="prometheus-k8s",namespace="monitoring"}[3h]) > 0
    for: 4h
    labels:
      severity: warning
  - alert: Prometheus not ingesting samples
    annotations:
      message: 'Prometheus {{$labels.namespace}}/{{$labels.pod}} in cluster {{$labels.cluster}} is not ingesting any samples!'
    expr: |
      rate(prometheus_tsdb_head_samples_appended_total{job="prometheus-k8s",namespace="monitoring"}[5m]) <= 0
    for: 10m
    labels:
      severity: warning
  - alert: Prometheus duplicate timestamps
    annotations:
      message: 'Prometheus {{$labels.namespace}}/{{$labels.pod}} in cluster {{$labels.cluster}} is dropping {{ printf "%.4g" $value }} samples with the same timestamp but different values!'
    expr: |
      rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0
    for: 10m
    labels:
      severity: warning
  - alert: Prometheus out-of-order timestamps
    annotations:
      message: 'Prometheus {{$labels.namespace}}/{{$labels.pod}} in cluster {{$labels.cluster}} is dropping {{ printf "%.4g" $value }} samples whose timestamps are out of order!'
    expr: |
      rate(prometheus_target_scrapes_sample_out_of_order_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0
    for: 10m
    labels:
      severity: warning
  - alert: Prometheus remote storage failures
    annotations:
      message: 'Prometheus {{$labels.namespace}}/{{$labels.pod}} in cluster {{$labels.cluster}} failed to push {{ printf "%.1f" $value }}% of samples to remote storage queue {{$labels.queue}}!'
    expr: |
      (rate(prometheus_remote_storage_failed_samples_total{job="prometheus-k8s",namespace="monitoring"}[5m])
      /
        (rate(prometheus_remote_storage_failed_samples_total{job="prometheus-k8s",namespace="monitoring"}[5m])
        +
          rate(prometheus_remote_storage_succeeded_samples_total{job="prometheus-k8s",namespace="monitoring"}[5m])
        )
      )
      * 100
      > 1
    for: 15m
    labels:
      severity: critical
  - alert: Prometheus remote write behind
    annotations:
      message: 'Prometheus {{$labels.namespace}}/{{$labels.pod}} in cluster {{$labels.cluster}} remote write is {{ printf "%.1f" $value }} seconds behind for queue {{$labels.queue}}!'
    expr: |
      # Without max_over_time, failed scrapes could create false negatives, see
      # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
      (max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job="prometheus-k8s",namespace="monitoring"}[5m])
      - on(job, instance) group_right
        max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job="prometheus-k8s",namespace="monitoring"}[5m])
      )
      > 120
    for: 15m
    labels:
      severity: critical
  - alert: Prometheus remote write desired shards too high
    annotations:
      message: 'Prometheus {{$labels.namespace}}/{{$labels.pod}} in cluster {{$labels.cluster}} remote write desired-shards calculation wants {{$value}} shards, which is more than the configured maximum of {{ printf `prometheus_remote_storage_shards_max{instance="%s",job="prometheus-k8s",namespace="monitoring"}` $labels.instance | query | first | value }}!'
    expr: |
      # Without max_over_time, failed scrapes could create false negatives, see
      # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
      (max_over_time(prometheus_remote_storage_shards_desired{job="prometheus-k8s",namespace="monitoring"}[5m])
      >
        max_over_time(prometheus_remote_storage_shards_max{job="prometheus-k8s",namespace="monitoring"}[5m])
      )
    for: 15m
    labels:
      severity: warning
  - alert: Prometheus rule evaluation failures
    annotations:
      message: 'Prometheus {{$labels.namespace}}/{{$labels.pod}} in cluster {{$labels.cluster}} failed to evaluate {{ printf "%.0f" $value }} rules in the last 5 minutes!'
    expr: |
      increase(prometheus_rule_evaluation_failures_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0
    for: 15m
    labels:
      severity: critical
  - alert: Prometheus missed rule evaluations
    annotations:
      message: 'Prometheus {{$labels.namespace}}/{{$labels.pod}} in cluster {{$labels.cluster}} missed {{ printf "%.0f" $value }} rule group evaluations in the last 5 minutes!'
    expr: |
      increase(prometheus_rule_group_iterations_missed_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0
    for: 15m
    labels:
      severity: warning

resource

groups:     
- name: kubernetes-resources
  rules:
  - alert: K8S CPU requests overcommitted
    annotations:
      message: 'Cluster {{$labels.cluster}} has overcommitted CPU resource requests for pods and cannot tolerate node failure; the cluster needs to scale out!'
    expr: |
      sum(namespace:kube_pod_container_resource_requests_cpu_cores:sum)
        /
      sum(kube_node_status_allocatable_cpu_cores)
        >
      (count(kube_node_status_allocatable_cpu_cores)-1) / count(kube_node_status_allocatable_cpu_cores)
    for: 5m
    labels:
      severity: warning
  - alert: K8S memory requests overcommitted
    annotations:
      message: 'Cluster {{$labels.cluster}} has overcommitted memory resource requests for pods and cannot tolerate node failure; the cluster needs to scale out!'
    expr: |
      sum(namespace:kube_pod_container_resource_requests_memory_bytes:sum)
        /
      sum(kube_node_status_allocatable_memory_bytes)
        >
      (count(kube_node_status_allocatable_memory_bytes)-1)
        /
      count(kube_node_status_allocatable_memory_bytes)
    for: 5m
    labels:
      severity: warning
  - alert: K8S node CPU requests overloaded
    annotations:
      message: 'Pod CPU requests on node {{$labels.node}} in cluster {{$labels.cluster}} have reached {{ printf "%.0f" $value }}%!'
    expr: |
      sum by (node,cluster) (kube_pod_container_resource_requests_cpu_cores) / sum by (node,cluster) (node:node_num_cpu:sum) * 100 > 95
    for: 5m
    labels:
      severity: warning
  - alert: K8S average node CPU requests overloaded
    annotations:
      message: 'Average pod CPU requests across the nodes of cluster {{$labels.cluster}} have reached {{ printf "%.0f" $value }}%, which may make pods unschedulable; cluster {{$labels.cluster}} may need to scale out!'
    expr: |
      avg by (cluster) (sum by (node,cluster) (kube_pod_container_resource_requests_cpu_cores) / sum by (node,cluster) (node:node_num_cpu:sum)) * 100 > 90
    for: 5m
    labels:
      severity: warning
  - alert: K8S node memory requests overloaded
    annotations:
      message: 'Pod memory requests on node {{$labels.node}} in cluster {{$labels.cluster}} have reached {{ printf "%.0f" $value }}%!'
    expr: |
      sum by (node,cluster) (kube_pod_container_resource_requests_memory_bytes) / sum by (node,cluster) (kube_node_status_allocatable_memory_bytes) * 100 > 95
    labels:
      severity: warning
  - alert: K8S average node memory requests overloaded
    annotations:
      message: 'Average pod memory requests across the nodes of cluster {{$labels.cluster}} have reached {{ printf "%.0f" $value }}%, which may make pods unschedulable; cluster {{$labels.cluster}} may need to scale out!'
    expr: |
      avg by (cluster) (sum by (node,cluster) (kube_pod_container_resource_requests_memory_bytes) / sum by (node,cluster) (kube_node_status_allocatable_memory_bytes)) * 100 > 85
    labels:
      severity: warning
  - alert: 'K8S namespace CPU overcommitted'
    annotations:
      message: 'Namespace CPU quotas in cluster {{$labels.cluster}} are overcommitted!'
    expr: |
      sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="cpu"})
        /
      sum(kube_node_status_allocatable_cpu_cores)
        > 1.5
    for: 5m
    labels:
      severity: warning
  - alert: K8S namespace memory overcommitted
    annotations:
      message: 'Namespace memory quotas in cluster {{$labels.cluster}} are overcommitted!'
    expr: |
      sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="memory"})
        /
      sum(kube_node_status_allocatable_memory_bytes{job="node-exporter"})
        > 1.5
    for: 5m
    labels:
      severity: warning
  - alert: K8S quota nearly exhausted
    annotations:
      message: 'Namespace {{$labels.namespace}} in cluster {{$labels.cluster}} is using {{$value | humanizePercentage}} of its {{$labels.resource}} quota!'
    expr: |
      kube_resourcequota{job="kube-state-metrics", type="used"}
        / ignoring(instance, job, type)
      (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
        > 0.90
    for: 15m
    labels:
      severity: warning
  - alert: CPU throttling detected
    annotations:
      message: 'Container {{$labels.container}} in pod {{$labels.pod}} of namespace {{$labels.namespace}} in cluster {{$labels.cluster}} is being CPU-throttled for {{$value | humanizePercentage}} of its CFS periods!'
    expr: |
      sum(increase(container_cpu_cfs_throttled_periods_total{container!="",}[5m])) by (container, pod, namespace,cluster)
        /
      sum(increase(container_cpu_cfs_periods_total{}[5m])) by (container, pod, namespace,cluster)
        > (100 / 100)
    for: 15m
    labels:
      severity: warning
  - alert: K8S pod CPU limits utilization high
    annotations:
      message: 'Container {{$labels.container}} in pod {{$labels.pod}} of namespace {{$labels.namespace}} in cluster {{$labels.cluster}} has reached {{$value | humanizePercentage}} of its CPU limit!'
    expr: |
      sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate) by (container,pod,namespace,cluster) / sum(kube_pod_container_resource_limits_cpu_cores) by (container,pod,namespace,cluster) > 1
    for: 15m
    labels:
      severity: warning
  - alert: K8S pod memory limits utilization high
    annotations:
      message: 'Container {{$labels.container}} in pod {{$labels.pod}} of namespace {{$labels.namespace}} in cluster {{$labels.cluster}} has reached {{$value | humanizePercentage}} of its memory limit!'
    expr: |
      sum(container_memory_working_set_bytes) by (container,pod,namespace,cluster) / sum(kube_pod_container_resource_limits_memory_bytes) by (container,pod,namespace,cluster) > 1
    for: 15m
    labels:
      severity: warning
#  - alert: K8S HPA at max replicas with high average utilization
#    annotations:
#      message: 'HPA {{$labels.namespace}}/{{$labels.hpa}} in cluster {{$labels.cluster}} has been running at its maximum replica limit for more than 15 minutes with average resource utilization of {{$value}}%; it needs to scale out!'
#    expr: |
#      kube_hpa_status_current_metrics_average_utilization > 95
#        and
#      kube_hpa_status_current_replicas{job="kube-state-metrics"}
#        ==
#      kube_hpa_spec_max_replicas{job="kube-state-metrics"}
#    for: 15m
#    labels:
#      severity: critical
  - alert: K8S workload CPU limits utilization high
    annotations:
      message: '{{$labels.workload_type}} workload {{$labels.workload}} in namespace {{$labels.namespace}} of cluster {{$labels.cluster}} has reached {{$value | humanizePercentage}} of its CPU limit and may trigger HPA!'
    expr: |
      sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate * on(namespace,pod,cluster) group_left(workload, workload_type) mixin_pod_workload) by (workload,workload_type,cluster,namespace) / sum(kube_pod_container_resource_limits_cpu_cores * on(namespace,pod,cluster) group_left(workload, workload_type) mixin_pod_workload) by (workload, workload_type,cluster,namespace) > 3.5
    for: 15m
    labels:
      severity: warning
  - alert: K8S workload CPU requests utilization at HPA scale-up threshold
    annotations:
      message: '{{$labels.workload_type}} workload {{$labels.workload}} in namespace {{$labels.namespace}} of cluster {{$labels.cluster}} has CPU requests utilization of {{$value | humanizePercentage}}, meeting the HPA scale-up condition!'
    expr: |
      sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{namespace=~"prod|super", container!=""}
          * on(namespace,pod,cluster) group_left(workload, workload_type) mixin_pod_workload) by (workload,workload_type,cluster,namespace)
        /
      sum(kube_pod_container_resource_requests_cpu_cores{namespace=~"prod|super"} * on(namespace,pod,cluster) group_left(workload, workload_type) mixin_pod_workload) by (workload, workload_type,cluster,namespace) > 4
        and
      count(
        sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{namespace=~"prod|super", container!=""}
            * on(namespace,pod,cluster) group_left(workload, workload_type) mixin_pod_workload) by (workload,workload_type,pod,cluster,namespace)
          /
        sum(kube_pod_container_resource_requests_cpu_cores{namespace=~"prod|super"}
            * on(namespace,pod,cluster) group_left(workload, workload_type) mixin_pod_workload) by (workload, workload_type,pod,cluster,namespace)
          > 4) by (workload, workload_type,cluster,namespace)
        ==
      count(
        sum(kube_pod_container_resource_requests_cpu_cores{namespace=~"prod|super"}
        * on(namespace,pod)
          group_left(workload, workload_type) mixin_pod_workload{namespace=~"prod|super"}) by (workload, workload_type,pod,cluster,namespace)) by (workload, workload_type,cluster,namespace)
    for: 30s
    labels:
      severity: none
  - alert: K8S workload memory limits utilization high
    annotations:
      message: '{{$labels.workload_type}} workload {{$labels.workload}} in namespace {{$labels.namespace}} of cluster {{$labels.cluster}} has reached {{$value | humanizePercentage}} of its memory limit and may trigger HPA!'
    expr: |
      sum(container_memory_working_set_bytes * on(namespace,pod,container,cluster) group_left(workload, workload_type) mixin_pod_workload) by (workload, workload_type,cluster,namespace) / sum(kube_pod_container_resource_limits_memory_bytes * on(namespace,pod,cluster) group_left(workload, workload_type) mixin_pod_workload) by (workload, workload_type,cluster,namespace) > 1
    for: 15m 
    labels:
      severity: warning
  - alert: K8S workload memory requests utilization at HPA scale-up threshold
    annotations:
      message: '{{$labels.workload_type}} workload {{$labels.workload}} in namespace {{$labels.namespace}} of cluster {{$labels.cluster}} has memory requests utilization of {{$value | humanizePercentage}}, meeting the HPA scale-up condition!'
    expr: |
      (sum(container_memory_working_set_bytes{namespace=~"prod|super", container!=""}
          * on(namespace,pod,cluster) group_left(workload, workload_type) mixin_pod_workload) by (workload,workload_type,cluster,namespace)
        /
      sum(kube_pod_container_resource_requests_memory_bytes{namespace=~"prod|super"} * on(namespace,pod,cluster) group_left(workload, workload_type) mixin_pod_workload) by (workload, workload_type,cluster,namespace) > 1.1)
        and
      ((count(
        sum(container_memory_working_set_bytes{namespace=~"prod|super", container!=""}
            * on(namespace,pod,cluster) group_left(workload, workload_type) mixin_pod_workload) by (workload,workload_type,pod,cluster,namespace) 
          / 
        sum(kube_pod_container_resource_requests_memory_bytes{namespace=~"prod|super"}
            * on(namespace,pod,cluster) group_left(workload, workload_type) mixin_pod_workload) by (workload, workload_type,pod,cluster,namespace) 
          > 1.1) by (workload, workload_type,cluster,namespace))
        == 
      (count(
        sum(kube_pod_container_resource_requests_memory_bytes{namespace=~"prod|super"}
        * on(namespace,pod)
          group_left(workload, workload_type) mixin_pod_workload{namespace=~"prod|super"}) by (workload, workload_type,pod,cluster,namespace)) by (workload, workload_type,cluster,namespace)))
    for: 30s
    labels:
      severity: none

scheduler

groups:
- name: kubernetes-system-scheduler
  rules:
  - alert: KubeScheduler down
    annotations:
      message: 'KubeScheduler has disappeared from Prometheus target discovery!'
    expr: |
      absent(up{job="kube-scheduler"} == 1)
    for: 15m
    labels:
      severity: critical

storage

groups:
- name: kubernetes-storage
  rules:
  - alert: K8S PV usage alert
    annotations:
      message: 'The PV claimed by PVC {{$labels.persistentvolumeclaim}} in namespace {{$labels.namespace}} of cluster {{$labels.cluster}} has only {{$value | humanizePercentage}} free space left!'
    expr: |
      kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"}
        /
      kubelet_volume_stats_capacity_bytes{job="kubelet", metrics_path="/metrics"}
        < 0.03
    for: 1m
    labels:
      severity: critical
  - alert: KubePersistentVolumeFullInFourDays
    annotations:
      message: 'Based on recent sampling in cluster {{$labels.cluster}}, the PV claimed by PVC {{$labels.persistentvolumeclaim}} in namespace {{$labels.namespace}} is expected to fill up within 4 days; {{$value | humanizePercentage}} currently remains!'
    expr: |
      (kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"}
          /
        kubelet_volume_stats_capacity_bytes{job="kubelet", metrics_path="/metrics"}
      ) < 0.15
      and
      predict_linear(kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0
    for: 1h
    labels:
      severity: critical
  - alert: K8S PV errors
    annotations:
      message: 'PV {{$labels.persistentvolume}} in cluster {{$labels.cluster}} is in status {{$labels.phase}}!'
    expr: |
      kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0
    for: 5m
    labels:
      severity: critical
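
The hard free-space alert ("K8S PV usage alert") and the 4-day prediction alert will often fire for the same volume at once; an Alertmanager inhibition rule can suppress the prediction while the stricter alert is active. A sketch, assuming the alert names used above and the standard kubelet volume-stats labels:

inhibit_rules:
- source_match:
    alertname: K8S PV usage alert
  target_match:
    alertname: KubePersistentVolumeFullInFourDays
  equal: ['cluster', 'namespace', 'persistentvolumeclaim']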

system

groups: 
- name: kubernetes-system
  rules:
  - alert: Node filesystem filling up within 24 hours
    annotations:
      message: 'Device {{$labels.device}} on node {{$labels.instance}} in cluster {{$labels.cluster}} has only {{ printf "%.2f" $value }}% usable space left and is predicted to fill up within 24 hours at the current rate!'
    expr: |
      (node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 40
      and
        predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!=""}[6h], 24*60*60) < 0
      and
        node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
      )
    for: 1h
    labels:
      severity: warning
  - alert: Node filesystem filling up within 4 hours
    annotations:
      message: 'Device {{$labels.device}} on node {{$labels.instance}} in cluster {{$labels.cluster}} has only {{ printf "%.2f" $value }}% usable space left and is predicted to fill up within 4 hours at the current rate!'
    expr: |
      (node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 20
      and
        predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!=""}[6h], 4*60*60) < 0
      and
        node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
      )
    for: 1h
    labels:
      severity: critical
  - alert: Node filesystem has less than 5% space left
    annotations:
      message: 'Device {{$labels.device}} on node {{$labels.instance}} in cluster {{$labels.cluster}} has only {{ printf "%.2f" $value }}% usable space left!'
    expr: |
      (node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 5
      and
        node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
      )
    for: 1h
    labels:
      severity: warning
  - alert: Node filesystem has less than 3% space left
    annotations:
      message: 'Device {{$labels.device}} on node {{$labels.instance}} in cluster {{$labels.cluster}} has only {{ printf "%.2f" $value }}% usable space left!'
    expr: |
      (node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 3
      and
        node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
      )
    for: 1h
    labels:
      severity: critical
  - alert: Node filesystem running out of inodes within 24 hours
    annotations:
      message: 'Device {{$labels.device}} on node {{$labels.instance}} in cluster {{$labels.cluster}} has only {{ printf "%.2f" $value }}% free inodes left and is predicted to run out within 24 hours at the current rate!'
    expr: |
      (node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 40
      and
        predict_linear(node_filesystem_files_free{job="node-exporter",fstype!=""}[6h], 24*60*60) < 0
      and
        node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
      )
    for: 1h
    labels:
      severity: warning
  - alert: Node filesystem running out of inodes within 4 hours
    annotations:
      message: 'Device {{$labels.device}} on node {{$labels.instance}} in cluster {{$labels.cluster}} has only {{ printf "%.2f" $value }}% free inodes left and is predicted to run out within 4 hours at the current rate!'
    expr: |
      (node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 20
      and
        predict_linear(node_filesystem_files_free{job="node-exporter",fstype!=""}[6h], 4*60*60) < 0
      and
        node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
      )
    for: 1h
    labels:
      severity: critical
  - alert: Node filesystem has less than 5% inodes left
    annotations:
      message: 'Device {{$labels.device}} on node {{$labels.instance}} in cluster {{$labels.cluster}} has only {{ printf "%.2f" $value }}% free inodes left!'
    expr: |
      (node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 5
      and
        node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
      )
    for: 1h
    labels:
      severity: warning
  - alert: Node filesystem has less than 3% inodes left
    annotations:
      message: 'Device {{$labels.device}} on node {{$labels.instance}} in cluster {{$labels.cluster}} has only {{ printf "%.2f" $value }}% free inodes left!'
    expr: |
      (node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 3
      and
        node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
      )
    for: 1h
    labels:
      severity: critical
  - alert: Node CPU usage above 85%
    expr: 100 - (avg by(cluster,instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 85
    for: 3m
    labels:
      severity: critical
    annotations:
      message: 'CPU usage on node {{$labels.instance}} in cluster {{$labels.cluster}} has stayed at {{ printf "%.0f" $value }}% for 3m!'
  - alert: Node available memory low
    annotations:
      message: 'Memory usage on node {{$labels.instance}} in cluster {{$labels.cluster}} has stayed at {{ printf "%.0f" $value }}% for 3m!'
    expr: |
      100 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100) > 85
    for: 3m
    labels:
      severity: critical
  - alert: OOM-risk behavior detected
    annotations:
      message: 'Node {{$labels.instance}} in cluster {{$labels.cluster}} shows OOM-risk behavior (less than 15% memory available)!'
    expr: |
      node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 15
    for: 3m
    labels:
      severity: critical
#  - alert: Node context-switch rate high
#    expr: (rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 5000
#    for: 1m
#    labels:
#      severity: critical
#    annotations:
#      message: 'Node {{$labels.instance}} in cluster {{$labels.cluster}} detected {{ printf "%.0f" $value }} context switches per second per CPU!'
#  - alert: Node open file descriptors high
#    expr: avg by (instance,cluster) (node_filefd_allocated) > 102400
#    for: 1m
#    labels:
#      severity: critical
#    annotations:
#      message: 'Node {{$labels.instance}} in cluster {{$labels.cluster}} detected {{ printf "%.0f" $value }} open file descriptors!'

time

groups:
- name: node-time
  rules:
  - alert: Clock skew detected
    annotations:
      message: 'node-exporter {{$labels.namespace}}/{{$labels.pod}} in cluster {{$labels.cluster}} detected clock skew!'
    expr: |
      abs(node_timex_offset_seconds{job="node-exporter"}) > 0.05
    for: 2m
    labels:
      severity: warning

version

groups:
- name: kubernetes-system
  rules:
  - alert: K8S version mismatch
    annotations:
      message: 'Cluster {{$labels.cluster}} is running {{$value}} different Kubernetes versions!'
    expr: |
      count(count by (gitVersion) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"gitVersion","$1","gitVersion","(v[0-9]*.[0-9]*.[0-9]*).*"))) > 1
    for: 15m
    labels:
      severity: warning
  - alert: K8S API server client errors
    annotations:
      message: 'K8S API server client {{$labels.job}}/{{$labels.instance}} in cluster {{$labels.cluster}} is experiencing {{$value | humanizePercentage}} request errors!'
    expr: |
      (sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (instance, job, cluster)
        /
      sum(rate(rest_client_requests_total[5m])) by (instance, job, cluster))
      > 0.01
    for: 15m
    labels:
      severity: warning