An Analysis of ZStack's Monitoring Architecture

PromQL Basics

  • Metric name: every sample Prometheus scrapes carries a metric name, e.g. node_cpu_seconds_total
  • Label filter: filter series by labels with {key="value"}, e.g. node_cpu_seconds_total{mode="idle", instance="host1"}
  • Range selector: a time series can be queried over a window, e.g. rate(node_cpu_seconds_total[5m]), where rate() computes the average per-second rate of increase over the 5-minute window.
  • Beyond that, the usual functions and aggregations: max, min, count, stddev, stdvar, irate(), avg by(...), etc.; a small combined example follows below.
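
A small combined example putting these together (a sketch using the standard node_exporter metric named above; nothing here is ZStack-specific):

# Per-instance CPU busy percentage over the last 5 minutes:
# rate() over the idle counter gives each CPU's idle fraction,
# avg by(instance) averages it across that instance's CPUs,
# and 100 * (1 - x) turns it into a busy percentage.
100 * (1 - avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])))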

Analyzing ZStack's Prometheus Configuration

ZStack's monitoring and alerting design diagram:

collectd_exporter install source: https://github.com/prometheus/collectd_exporter

Inspecting the Prometheus configuration

[root@exxk ~]# cat /usr/local/zstack/prometheus/conf.yaml 
global:
  scrape_interval: 20s # scrape all configured targets every 20 seconds, i.e. call each target's /metrics endpoint once per 20s
  scrape_timeout: 5s # per-target scrape timeout: wait at most 5 seconds
  evaluation_interval: 10s # re-evaluate alerting rules and recording rules every 10 seconds
rule_files: # entries may be recording rules or alerting rules
- /usr/local/zstack/prometheus/rules/zwatch.rule.yml # rule file, mainly expression aggregation
- /usr/local/zstack/prometheus/rules/hosts/collectd.rule.yml # rule file, mainly expression aggregation
scrape_configs:
- job_name: management-server-exporter
  scrape_interval: 10s
  scrape_timeout: 5s
  file_sd_configs:
  - files:
    - /usr/local/zstack/prometheus/discovery/management-node/*.json
    refresh_interval: 10s
- job_name: baremetal-pxeserver-exporter
  scrape_interval: 10s
  scrape_timeout: 5s
  file_sd_configs:
  - files:
    - /usr/local/zstack/prometheus/discovery/pxeserver/*.json
    refresh_interval: 10s
- job_name: backup-storage-exporter
  scrape_interval: 10s
  scrape_timeout: 5s
  file_sd_configs:
  - files:
    - /usr/local/zstack/prometheus/discovery/backupStorage/*.json
    refresh_interval: 10s
- job_name: vrouter-exporter
  scrape_interval: 10s
  scrape_timeout: 5s
  file_sd_configs:
  - files:
    - /usr/local/zstack/prometheus/discovery/vrouter/*.json
    refresh_interval: 10s
- job_name: custom-metrics-pushgateway
  scrape_interval: 10s
  scrape_timeout: 5s
  honor_labels: true
  static_configs:
  - targets:
    - 192.168.10.2:9091
- job_name: collectd
  scrape_interval: 10s
  scrape_timeout: 5s
  file_sd_configs:
  - files:
    - /usr/local/zstack/prometheus/discovery/hosts/*.json # dynamically discovered target IPs and ports
    refresh_interval: 10s # re-read the discovery files every 10 seconds

Inspecting the scraped metrics

[root@exxk ~]# cat /usr/local/zstack/prometheus/discovery/hosts/8623bf76e14c4509abd4202ee717e9ad-192-168-10-254.json 
[{"targets":["192.168.10.2:9103","192.168.10.2:9100","192.168.10.2:7069"],"labels":{"hostUuid":"8623bf76e14c4509abd4202ee717e9ad"}},{"targets":["192.168.10.2:9092"],"labels":{}}]
# the raw metrics of each target can be viewed at ip:port/metrics
http://192.168.10.2:9100/metrics # collected by node_exporter
http://192.168.10.2:9103/metrics # collected by collectd
http://192.168.10.2:7069/metrics # presumably a Python collector written by ZStack itself
http://192.168.10.2:9092/metrics # pushgateway
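
Note that the hostUuid label attached in the discovery file is added to every series scraped from those targets; that is exactly why the rule expressions below can filter and group on it, for example:

# all collectd CPU percentages for one particular host
collectd_cpu_percent{hostUuid="8623bf76e14c4509abd4202ee717e9ad"}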

Inspecting the recording rule files

A recording rule stores the result of a (possibly complex) expression as a new time series under a new metric name, so that later queries and alerts can use it directly (see the query example after the listings below).

[root@exxk ~]# cat /usr/local/zstack/prometheus/rules/zwatch.rule.yml
groups:
- name: zwatch.rule ## name of the rule group; some of its rules depend on the base metrics recorded in collectd.rule.yml
  rules:
  - record: ZStack:BaremetalVM::OperatingSystemNetworkOutPackets # name of the new time series (metric) being defined
    expr: irate(bm_node_network_transmit_packets{vmUuid!=""}[10m]) # the expression whose result is recorded
  - record: ZStack:BaremetalVM::DiskFreeCapacityInPercent
    expr: ((bm_node_filesystem_avail{vmUuid!=""} + 1) / (bm_node_filesystem_size{vmUuid!=""} + 1)) * 100
  .....
[root@exxk ~]# cat /usr/local/zstack/prometheus/rules/hosts/collectd.rule.yml
groups:
- name: collectd ## records conversions of a set of base metrics
  rules:
  - record: collectd:collectd_virt_virt_cpu_total
    expr: irate(collectd_virt_virt_cpu_total[10m]) / 1e+07
  - record: collectd:collectd_virt_virt_vcpu
    expr: irate(collectd_virt_virt_vcpu[10m]) / 1e+07
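
Once evaluated, a recorded series can be queried by its new name like any ordinary metric, e.g. to find baremetal VMs with less than 20% free disk:

ZStack:BaremetalVM::DiskFreeCapacityInPercent < 20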

ZStack alarms

UI location: Platform O&M -> Cloud Platform Monitoring -> Alarms. The interface backing the list view:

http://192.168.10.2:5000/graphql?gql=zwatchAlarmList

{
  "data": {
    "zwatchAlarmList": {
      "list": [
        {
          "uuid": "5z6gsgkc5kccpylj9ocgbd647p2700b7",
          "name": "Average CPU Utilization of Hosts",
          "zhName": "物理机平均CPU使用率",
          "description": null,
          "period": 300,
          "namespace": "ZStack/Host",
          "metricName": "CPUAverageUsedUtilization", # the metric name, matching a record in zwatch.rule.yml
          "threshold": 80, # the threshold
          "repeatCount": -1,
          "repeatInterval": 1800,
          "enableRecovery": false,
          "emergencyLevel": "Important",
          "comparisonOperator": "GreaterThanOrEqualTo", # the comparison: < > =
          "eventName": null,
          "thirdpartyPlatformName": "-",
          "state": "Enabled",
          "status": "OK",
          "topicNum": 1,
          "createDate": "Apr 2, 2025 12:49:41 PM",
          "lastOpDate": "Apr 2, 2025 12:49:41 PM",
          "actions": [
            {
              "alarmUuid": "5z6gsgkc5kccpylj9ocgbd647p2700b7",
              "subscriptionUuid": null,
              "actionUuid": "e7d6f5e23bb74e99a2777126078b551c",
              "actionType": "sns",
              "__typename": "AlarmActions"
            }
          ],
          "labels": [],
          "userTag": null,
          "owner": {
            "uuid": "36c27e8ff05c4780bf6d2fa65700f22e",
            "name": "admin",
            "__typename": "BasicOwner"
          },
          "platform": null,
          "__typename": "ZWatchAlarmVO"
        },
        ......
      ],
      "total": 4,
      "__typename": "ZWatchAlarmVoResp"
    }
  }
}

Deriving the concrete expressions behind the host alarms

1. Average host CPU utilization:
# >= 80%, sustained for 5 minutes -> severity: Important
record: ZStack:Host::CPUAverageUsedUtilization
full expr: avg by(hostUuid) ((sum by(hostUuid) (100 - collectd_cpu_percent{hostUuid!="",type="idle"}) / sum by(hostUuid) (collectd_cpu_percent{hostUuid!=""})) * 100)

# collectd_cpu_percent is provided by collectd; if you do not want to install collectd, the expression has to be rewritten against node_exporter metrics
full expr: avg by(hostUuid) ((sum by(hostUuid, cpu) (node_cpu_seconds_total{mode!="idle", hostUuid!=""})/sum by(hostUuid, cpu) (node_cpu_seconds_total{hostUuid!=""})) * 100)

2. Host memory used percentage:
# >= 80%, sustained for 5 minutes -> severity: Important
record: ZStack:Host::MemoryUsedInPercent
full expr: 100 * (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))

3. Host memory used percentage (KVM):
# >= 80%, sustained for 5 minutes -> severity: Important
record: ZStack:Host::MemoryUsedCapacityPerHostInPercent

4. Host root-disk usage alarm:
# >= 80%, sustained for 10 minutes -> severity: Emergent
record: ZStack:Host::DiskRootUsedCapacityInPercent
full expr: (sum by(hostUuid) (node_filesystem_size{fstype!="rootfs",hostUuid!="",mountpoint="/"} - node_filesystem_avail{fstype!="rootfs",hostUuid!="",mountpoint="/"}) / sum by(hostUuid) (node_filesystem_size{fstype!="rootfs",hostUuid!="",mountpoint="/"})) * 100

From here on, PromQL queries can use the record names directly.
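
For example, the first alarm (average CPU >= 80% sustained over its 5-minute period) maps onto the query below, which is also exactly the shape the Java demo later in this post generates:

avg_over_time(ZStack:Host::CPUAverageUsedUtilization[5m]) >= 80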

Reproducing a Prometheus setup based on ZStack's configuration

docker-compose.yml

services:
  prometheus:
    image: prom/prometheus:v2.37.1
    container_name: prometheus
    ports:
      - 9090:9090
    command:
      - --config.file=/etc/prometheus/config/prometheus.yml
    volumes:
      - ./config:/etc/prometheus/config
      - /etc/localtime:/etc/localtime:ro

The config directory is laid out as follows

.
├── config
│   ├── collectd.rule.yml # base rule file
│   ├── hosts
│   │   ├── PM_exxk_192.168.10.2.json
│   │   ├── PM_xxx_xx.xx.xx.xx.json # config files for other nodes; naming convention: PM_<hostname>_<ip>.json
│   │   └── .... # more
│   ├── prometheus.yml # Prometheus config file (sketched below)
│   └── zwatch.rule.yml # Prometheus aggregation rule file
└── docker-compose.yml
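
The contents of prometheus.yml are a minimal sketch, assuming it simply mirrors ZStack's conf.yaml above with the paths adjusted to the mounted /etc/prometheus/config directory and only the collectd job kept:

global:
  scrape_interval: 20s
  scrape_timeout: 5s
  evaluation_interval: 10s
rule_files:
- /etc/prometheus/config/zwatch.rule.yml
- /etc/prometheus/config/collectd.rule.yml
scrape_configs:
- job_name: collectd
  scrape_interval: 10s
  scrape_timeout: 5s
  file_sd_configs:
  - files:
    - /etc/prometheus/config/hosts/*.json
    refresh_interval: 10s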

PM_exxk_192.168.10.2.json

[{"targets":["192.168.10.2:9100"],"labels":{"hostUuid":"PM_exxk_192.168.10.2"}}]

collectd.rule.yml

groups:
- name: collectd
  rules:
  - record: collectd:collectd_virt_virt_cpu_total
    expr: irate(collectd_virt_virt_cpu_total[10m]) / 1e+07
  - record: collectd:collectd_virt_virt_vcpu
    expr: irate(collectd_virt_virt_vcpu[10m]) / 1e+07
  - record: collectd:collectd_virt_memory
    expr: collectd_virt_memory
  - record: collectd:collectd_disk_disk_octets_read
    expr: irate(collectd_disk_disk_octets_0[10m])
  - record: collectd:collectd_disk_disk_octets_write
    expr: irate(collectd_disk_disk_octets_1[10m])
  - record: collectd:collectd_disk_disk_ops_read
    expr: irate(collectd_disk_disk_ops_0[10m])
  - record: collectd:collectd_disk_disk_ops_write
    expr: irate(collectd_disk_disk_ops_1[10m])
  - record: collectd:collectd_disk_disk_time_read
    expr: irate(collectd_disk_disk_time_0[10m])
  - record: collectd:collectd_disk_disk_time_write
    expr: irate(collectd_disk_disk_time_1[10m])
  - record: collectd:collectd_interface_if_errors_rx
    expr: irate(collectd_interface_if_errors_0[10m])
  - record: collectd:collectd_interface_if_errors_tx
    expr: irate(collectd_interface_if_errors_1[10m])
  - record: collectd:collectd_interface_if_octets_rx
    expr: irate(collectd_interface_if_octets_0[10m])
  - record: collectd:collectd_interface_if_octets_tx
    expr: irate(collectd_interface_if_octets_1[10m])
  - record: collectd:collectd_interface_if_packets_rx
    expr: irate(collectd_interface_if_packets_0[10m])
  - record: collectd:collectd_interface_if_packets_tx
    expr: irate(collectd_interface_if_packets_1[10m])
  - record: collectd:collectd_memory
    expr: collectd_memory
  - record: collectd:collectd_cpu_percent
    expr: collectd_cpu_percent
  - record: collectd:wmi_cpu_time_total
    expr: wmi_cpu_time_total
  - record: collectd:collectd_virt_disk_octets_read
    expr: irate(collectd_virt_disk_octets_0[10m])
  - record: collectd:collectd_virt_disk_octets_write
    expr: irate(collectd_virt_disk_octets_1[10m])
  - record: collectd:collectd_virt_disk_ops_read
    expr: irate(collectd_virt_disk_ops_0[10m])
  - record: collectd:collectd_virt_disk_ops_write
    expr: irate(collectd_virt_disk_ops_1[10m])
  - record: collectd:collectd_virt_if_dropped_read
    expr: irate(collectd_virt_if_dropped_0[10m])
  - record: collectd:collectd_virt_if_dropped_write
    expr: irate(collectd_virt_if_dropped_1[10m])
  - record: collectd:collectd_virt_if_errors_rx
    expr: irate(collectd_virt_if_errors_0[10m])
  - record: collectd:collectd_virt_if_errors_tx
    expr: irate(collectd_virt_if_errors_1[10m])
  - record: collectd:collectd_virt_if_octets_rx
    expr: irate(collectd_virt_if_octets_0[10m])
  - record: collectd:collectd_virt_if_octets_tx
    expr: irate(collectd_virt_if_octets_1[10m])
  - record: collectd:collectd_virt_if_packets_rx
    expr: irate(collectd_virt_if_packets_0[10m])
  - record: collectd:collectd_virt_if_packets_tx
    expr: irate(collectd_virt_if_packets_1[10m])
  - expr: node_filesystem_free_bytes
    record: node_filesystem_free
  - expr: node_filesystem_avail_bytes
    record: node_filesystem_avail
  - expr: node_filesystem_size_bytes
    record: node_filesystem_size
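
Note the convention in this file: collectd_exporter exposes paired values with _0/_1 suffixes, which these rules record as read/rx (_0) and write/tx (_1). The last three rules simply alias node_exporter's newer *_bytes filesystem metrics to the older names (node_filesystem_size, etc.) that zwatch.rule.yml expects.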

zwatch.rule.yml

groups:
- name: zwatch.rule
  rules:
  - record: ZStack:Host::ReclaimedMemoryInBytes
    expr: clamp_min(sum(collectd_virt_memory{hostUuid!="", type="max_balloon"}) by (hostUuid) - on(hostUuid) sum(collectd_virt_memory{hostUuid!="", type="actual_balloon"}) by (hostUuid), 0)
  - record: ZStack:Host::DiskCapacityInBytes
    expr: node_filesystem_avail{hostUuid!="", fstype!~"proc|tmpfs|rootfs|ramfs|iso9660|rpc_pipefs", mountpoint!~"/tmp/zs-.*"}
  - record: ZStack:Host::DiskAllReadBytes
    expr: sum(irate(collectd_disk_disk_octets_0{hostUuid!=""}[10m])) by(hostUuid)
  - record: ZStack:Host::CPUAverageWaitUtilization
    expr: avg((sum(collectd_cpu_percent{type="wait", hostUuid!=""}) by(hostUuid) / sum(collectd_cpu_percent{hostUuid!=""}) by(hostUuid)) * 100) by (hostUuid)
  - record: ZStack:Host::DiskAllWriteOps
    expr: sum(irate(collectd_disk_disk_ops_1{hostUuid!=""}[10m])) by(hostUuid)
  - record: ZStack:Host::NetworkAllOutBytesByServiceType
    expr: sum(irate(host_network_all_out_bytes_by_service_type{hostUuid!=""}[10m])) by(hostUuid, service_type)
  - record: ZStack:Host::DiskFreeCapacityInPercent
    expr: ((node_filesystem_avail{hostUuid!="", fstype!~"proc|tmpfs|rootfs|ramfs|iso9660|rpc_pipefs", mountpoint!~"/tmp/zs-.*"} + 1) / (node_filesystem_size{hostUuid!="", fstype!~"proc|tmpfs|rootfs|ramfs|iso9660|rpc_pipefs", mountpoint!~"/tmp/zs-.*"} + 1)) * 100
  - record: ZStack:Host::CPUAverageUsedUtilization
    # collectd must be installed for the collectd_cpu_percent metric to exist:
    # expr: avg((sum(100 - collectd_cpu_percent{type="idle", hostUuid!=""}) by(hostUuid) / sum(collectd_cpu_percent{hostUuid!=""}) by(hostUuid)) * 100) by (hostUuid)
    # the node_exporter metrics are used here instead
    expr: avg by(hostUuid) ((sum by(hostUuid, cpu) (node_cpu_seconds_total{mode!="idle", hostUuid!=""})/sum by(hostUuid, cpu) (node_cpu_seconds_total{hostUuid!=""})) * 100)
  - record: ZStack:Host::DiskRootUsedCapacityInBytes
    expr: sum(node_filesystem_size{hostUuid!="", fstype!="rootfs",mountpoint="/"} - node_filesystem_avail{hostUuid!="", fstype!="rootfs",mountpoint="/"}) by(hostUuid)
  - record: ZStack:Host::NetworkOutPackets
    expr: irate(collectd_interface_if_packets_1{hostUuid!=""}[10m])
  - record: ZStack:Host::NetworkOutDropped
    expr: irate(collectd_interface_if_dropped_1{hostUuid!=""}[10m])
  - record: ZStack:Host::DiskWriteBytesWwid
    expr: irate(collectd_disk_disk_octets_1[10m:]) * on (disk, hostUuid) group_left(wwid) node_disk_wwid
  - record: ZStack:Host::CPUAllIdleUtilization
    expr: (sum(collectd_cpu_percent{type="idle", hostUuid!=""}) by(hostUuid) / sum(collectd_cpu_percent{hostUuid!=""}) by(hostUuid)) * 100
  - record: ZStack:Host::NetworkAllInBytes
    expr: sum(irate(host_network_all_in_bytes{hostUuid!=""}[10m])) by(hostUuid)
  - record: ZStack:Host::NetworkInDropped
    expr: irate(collectd_interface_if_dropped_0{hostUuid!=""}[10m])
  - record: ZStack:Host::DiskUsedCapacityInPercent
    expr: (((node_filesystem_size{hostUuid!="", fstype!~"proc|tmpfs|rootfs|ramfs|iso9660|rpc_pipefs", mountpoint!~"/tmp/zs-.*"} - node_filesystem_avail{hostUuid!="", fstype!~"proc|tmpfs|rootfs|ramfs|iso9660|rpc_pipefs", mountpoint!~"/tmp/zs-.*"}) + 1) / (node_filesystem_size{hostUuid!="", fstype!~"proc|tmpfs|rootfs|ramfs|iso9660|rpc_pipefs", mountpoint!~"/tmp/zs-.*"} + 1)) * 100
  - record: ZStack:Host::DiskWriteBytes
    expr: irate(collectd_disk_disk_octets_1{hostUuid!=""}[10m])
  - record: ZStack:Host::DiskZStackUsedCapacityInPercent
    expr: (sum(zstack_used_capacity_in_bytes) by(hostUuid) / sum(node_filesystem_size{hostUuid!="", fstype!~"proc|tmpfs|rootfs|ramfs|iso9660|rpc_pipefs", mountpoint!~"/tmp/zs-.*"}) by(hostUuid)) * 100
  - record: ZStack:Host::NetworkInErrors
    expr: irate(collectd_interface_if_errors_0{hostUuid!=""}[10m])
  - record: ZStack:Host::DiskRootUsedCapacityInPercent
    expr: (sum(node_filesystem_size{hostUuid!="", fstype!="rootfs",mountpoint="/"} - node_filesystem_avail{hostUuid!="", fstype!="rootfs",mountpoint="/"}) by(hostUuid) / sum(node_filesystem_size{hostUuid!="", fstype!="rootfs",mountpoint="/"}) by(hostUuid)) * 100
  - record: ZStack:Host::NetworkAllInPackets
    expr: sum(irate(host_network_all_in_packages{hostUuid!=""}[10m])) by(hostUuid)
  - record: ZStack:Host::DiskAllUsedCapacityInBytes
    expr: sum(node_filesystem_size{hostUuid!="", fstype!~"proc|tmpfs|rootfs|ramfs|iso9660|rpc_pipefs", mountpoint!~"/tmp/zs-.*"} - node_filesystem_avail{hostUuid!="", fstype!~"proc|tmpfs|rootfs|ramfs|iso9660|rpc_pipefs", mountpoint!~"/tmp/zs-.*"}) by(hostUuid)
  - record: ZStack:Host::DiskWriteOps
    expr: irate(collectd_disk_disk_ops_1{hostUuid!=""}[10m])
  - record: ZStack:Host::NetworkAllOutBytes
    expr: sum(irate(host_network_all_out_bytes{hostUuid!=""}[10m])) by(hostUuid)
  - record: ZStack:Host::VolumeGroupUsedCapacityInPercent
    expr: ((vg_size - vg_avail + 1) / (vg_size + 1)) * 100
  - record: ZStack:Host::NetworkAllInErrors
    expr: sum(irate(host_network_all_in_errors{hostUuid!=""}[10m])) by(hostUuid)
  - record: ZStack:Host::NetworkAllInBytesByServiceType
    expr: sum(irate(host_network_all_in_bytes_by_service_type{hostUuid!=""}[10m])) by(hostUuid, service_type)
  - record: ZStack:Host::NetworkAllOutErrorsByServiceType
    expr: sum(irate(host_network_all_out_errors_by_service_type{hostUuid!=""}[10m])) by(hostUuid, service_type)
  - record: ZStack:Host::NetworkOutErrors
    expr: irate(collectd_interface_if_errors_1{hostUuid!=""}[10m])
  - record: ZStack:Host::NetworkAllInPacketsByServiceType
    expr: sum(irate(host_network_all_in_packages_by_service_type{hostUuid!=""}[10m])) by(hostUuid, service_type)
  - record: ZStack:Host::DiskTransUsedCapacityInBytes
    expr: sum(node_filesystem_size{hostUuid!="", fstype!="rootfs"} - node_filesystem_avail{hostUuid!="", fstype!="rootfs"}) by(hostUuid) - sum(zstack_used_capacity_in_bytes) by(hostUuid)
  - record: ZStack:Host::NetworkAllInErrorsByServiceType
    expr: sum(irate(host_network_all_in_bytes_by_service_type{hostUuid!=""}[10m])) by(hostUuid, service_type)
  - record: ZStack:Host::DiskUsedCapacityInBytes
    expr: node_filesystem_size{hostUuid!="", fstype!~"proc|tmpfs|rootfs|ramfs|iso9660|rpc_pipefs", mountpoint!~"/tmp/zs-.*"} - node_filesystem_avail{hostUuid!="", fstype!~"proc|tmpfs|rootfs|ramfs|iso9660|rpc_pipefs", mountpoint!~"/tmp/zs-.*"}
  - record: ZStack:Host::DiskReadOpsWwid
    expr: irate(collectd_disk_disk_ops_0[10m:]) * on (disk, hostUuid) group_left(wwid) node_disk_wwid
  - record: ZStack:Host::DiskLatencyWwid
    expr: (delta(collectd_disk_disk_io_time_0[1m]) + delta(collectd_disk_disk_io_time_1[1m])+1) / (delta(collectd_disk_disk_ops_0[1m]) + delta(collectd_disk_disk_ops_1[1m])+1) * on (disk, hostUuid) group_left(wwid) node_disk_wwid
  - record: ZStack:Host::CPUAllUsedUtilization
    expr: clamp((sum(100 - collectd_cpu_percent{type="idle", hostUuid!=""}) by(hostUuid) / sum(collectd_cpu_percent{hostUuid!=""}) by(hostUuid)) * 100, 0, 100)
  - record: ZStack:Host::DiskReadBytesWwid
    expr: irate(collectd_disk_disk_octets_0[10m:]) * on (disk, hostUuid) group_left(wwid) node_disk_wwid
  - record: ZStack:Host::CPUAverageIdleUtilization
    expr: avg((sum(collectd_cpu_percent{type="idle", hostUuid!=""}) by(hostUuid) / sum(collectd_cpu_percent{hostUuid!=""}) by(hostUuid)) * 100) by (hostUuid)
  - record: ZStack:Host::CPUAverageUserUtilization
    expr: avg((sum(collectd_cpu_percent{type="user", hostUuid!=""}) by(hostUuid) / sum(collectd_cpu_percent{hostUuid!=""}) by(hostUuid)) * 100) by (hostUuid)
  - record: ZStack:Host::MemoryUsedBytes
    expr: node_memory_MemTotal_bytes-node_memory_MemAvailable_bytes
  - record: ZStack:Host::MemoryFreeInPercent
    expr: 100 * (node_memory_MemAvailable_bytes/node_memory_MemTotal_bytes)
  - record: ZStack:Host::NetworkOutBytes
    expr: irate(collectd_interface_if_octets_1{hostUuid!=""}[10m])
  - record: ZStack:Host::NetworkAllOutPacketsByServiceType
    expr: sum(irate(host_network_all_out_packages_by_service_type{hostUuid!=""}[10m])) by(hostUuid, service_type)
  - record: ZStack:Host::DiskAllWriteBytes
    expr: sum(irate(collectd_disk_disk_octets_1{hostUuid!=""}[10m])) by(hostUuid)
  - record: ZStack:Host::DiskTotalCapacityInBytes
    expr: sum(node_filesystem_size{hostUuid!="", fstype!~"proc|tmpfs|rootfs|ramfs|iso9660|rpc_pipefs", mountpoint!~"/tmp/zs-.*"}) by(hostUuid)
  - record: ZStack:Host::MemoryUsedInPercent
    expr: 100 * (1 - (node_memory_MemAvailable_bytes/node_memory_MemTotal_bytes))
  - record: ZStack:Host::DiskLatency
    expr: (delta(collectd_disk_disk_io_time_0[1m]) + delta(collectd_disk_disk_io_time_1[1m])+1) / (delta(collectd_disk_disk_ops_0[1m]) + delta(collectd_disk_disk_ops_1[1m])+1)
  - record: ZStack:Host::DiskReadBytes
    expr: irate(collectd_disk_disk_octets_0{hostUuid!=""}[10m])
  - record: ZStack:Host::DiskWriteOpsWwid
    expr: irate(collectd_disk_disk_ops_1[10m]) * on (disk, hostUuid) group_left(wwid) node_disk_wwid
  - record: ZStack:Host::NetworkInPackets
    expr: irate(collectd_interface_if_packets_0{hostUuid!=""}[10m])
  - record: ZStack:Host::CPUUsedUtilization
    expr: abs(100 - collectd_cpu_percent{type="idle", hostUuid!=""})
  - record: ZStack:Host::DiskAllFreeCapacityInPercent
    expr: (sum(node_filesystem_avail{hostUuid!="", fstype!~"proc|tmpfs|rootfs|ramfs|iso9660|rpc_pipefs", mountpoint!~"/tmp/zs-.*"}) by(hostUuid) / sum(node_filesystem_size{hostUuid!="", fstype!~"proc|tmpfs|rootfs|ramfs|iso9660|rpc_pipefs", mountpoint!~"/tmp/zs-.*"}) by(hostUuid)) * 100
  - record: ZStack:Host::DiskTransUsedCapacityInPercent
    expr: (sum(node_filesystem_size{hostUuid!="", fstype!="rootfs"} - node_filesystem_avail{hostUuid!="", fstype!="rootfs"}) by(hostUuid) - sum(zstack_used_capacity_in_bytes) by(hostUuid)) / sum(node_filesystem_size{hostUuid!="", fstype!="rootfs"}) by(hostUuid) * 100
  - record: ZStack:Host::DiskAllReadOps
    expr: sum(irate(collectd_disk_disk_ops_0{hostUuid!=""}[10m])) by(hostUuid)
  - record: ZStack:Host::NetworkAllOutPackets
    expr: sum(irate(host_network_all_out_packages{hostUuid!=""}[10m])) by(hostUuid)
  - record: ZStack:Host::NetworkInBytes
    expr: irate(collectd_interface_if_octets_0{hostUuid!=""}[10m])
  - record: ZStack:Host::DiskReadOps
    expr: irate(collectd_disk_disk_ops_0{hostUuid!=""}[10m])
  - record: ZStack:Host::CPUAverageSystemUtilization
    expr: avg((sum(collectd_cpu_percent{type="system", hostUuid!=""}) by(hostUuid) / sum(collectd_cpu_percent{hostUuid!=""}) by(hostUuid)) * 100) by (hostUuid)
  - record: ZStack:Host::DiskAllUsedCapacityInPercent
    expr: (sum(node_filesystem_size{hostUuid!="", fstype!~"proc|tmpfs|rootfs|ramfs|iso9660|rpc_pipefs", mountpoint!~"/tmp/zs-.*"} - node_filesystem_avail{hostUuid!="", fstype!~"proc|tmpfs|rootfs|ramfs|iso9660|rpc_pipefs", mountpoint!~"/tmp/zs-.*"}) by(hostUuid) / sum(node_filesystem_size{hostUuid!="", fstype!~"proc|tmpfs|rootfs|ramfs|iso9660|rpc_pipefs", mountpoint!~"/tmp/zs-.*"}) by(hostUuid)) * 100
  - record: ZStack:Host::DiskAllFreeCapacityInBytes
    expr: sum(node_filesystem_avail{hostUuid!="", fstype!~"proc|tmpfs|rootfs|ramfs|iso9660|rpc_pipefs", mountpoint!~"/tmp/zs-.*"}) by(hostUuid)
  - record: ZStack:Host::NetworkAllOutErrors
    expr: sum(irate(host_network_all_out_errors{hostUuid!=""}[10m])) by(hostUuid)

Start Prometheus with docker-compose up -d
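
Once the stack is up, you can verify that the recording rules are being evaluated by querying a recorded name through the HTTP query API (the same endpoint the Java demo below calls):

http://127.0.0.1:9090/api/v1/query?query=ZStack:Host::MemoryUsedInPercent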

Some code to roughly simulate ZStack's alerting

Rule configuration enum: AlarmRuleEnum.java

package com.example.prometheus_demo;

import java.util.Collections;
import java.util.HashMap;
import java.util.Map;

public enum AlarmRuleEnum {

    CPU_UTILIZATION(
            "CPUAverageUsedUtilization",
            "ZStack/Host",
            "Important",
            "Average CPU Utilization of Hosts",
            "GreaterThanOrEqualTo",
            80,   // lower the threshold when testing, so the alarm actually fires
            300,
            1800  // shorten this repeat interval when testing, to fire more often
    ), // average host CPU utilization

    MEMORY_UTILIZATION(
            "MemoryUsedInPercent",
            "ZStack/Host",
            "Important",
            "Host Memory Utilization",
            "GreaterThanOrEqualTo",
            80,
            300,
            1800
    ), // host memory used percentage

    DISK_USAGE(
            "DiskRootUsedCapacityInPercent",
            "ZStack/Host",
            "Emergent",
            "Host Root Volume Utilization",
            "GreaterThanOrEqualTo",
            80,
            600,
            1800
    ); // host root-disk usage alarm

    private final String metricName;          // metric name
    private final String namespace;           // namespace
    private final String emergencyLevel;      // emergency level
    private final String name;                // display name
    private final String comparisonOperator;  // comparison operator: greater than, less than, etc.
    private final int threshold;              // threshold
    private final int period;                 // duration the condition must hold, in seconds
    private final int repeatInterval;         // interval between repeated alarms, in seconds

    AlarmRuleEnum(String metricName, String namespace, String emergencyLevel,
                  String name, String comparisonOperator, int threshold, int period, int repeatInterval) {
        this.metricName = metricName;
        this.namespace = namespace;
        this.emergencyLevel = emergencyLevel;
        this.name = name;
        this.comparisonOperator = comparisonOperator;
        this.threshold = threshold;
        this.period = period;
        this.repeatInterval = repeatInterval;
    }

    // TODO: getters omitted

    public static AlarmRuleEnum fromMetricName(String metricName) {
        for (AlarmRuleEnum rule : values()) {
            if (rule.getMetricName().equals(metricName)) {
                return rule;
            }
        }
        return null;
    }

    // GreaterThan > , GreaterThanOrEqualTo >= , LessThan < , LessThanOrEqualTo <=
    private static final Map<String, String> OPERATOR_MAP;
    static {
        Map<String, String> map = new HashMap<>();
        map.put("GreaterThan", ">");
        map.put("GreaterThanOrEqualTo", ">=");
        map.put("LessThan", "<");
        map.put("LessThanOrEqualTo", "<=");
        OPERATOR_MAP = Collections.unmodifiableMap(map);
    }


    // ✅ builds the PromQL expression, e.g. avg_over_time(ZStack:Host::CPUAverageUsedUtilization[5m]) >= 80
    public String getPromQl() {
        String prefix = namespace.replace("/", ":") + "::";
        int minutes = period / 60;
        // map the operator name to its symbol
        String operatorSymbol = OPERATOR_MAP.getOrDefault(comparisonOperator, ">");
        return String.format("avg_over_time(%s%s[%dm]) %s %d", prefix, metricName, minutes, operatorSymbol, threshold);
    }

    // full metric name; effectively unique
    public String getFullMetricName() {
        return namespace.replace("/", ":") + "::" + metricName;
    }

}
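
As a quick sanity check: AlarmRuleEnum.CPU_UTILIZATION.getPromQl() returns avg_over_time(ZStack:Host::CPUAverageUsedUtilization[5m]) >= 80, exactly the manual query shown earlier.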

Alarm scheduling class: AlarmScheduler.java

package com.example.prometheus_demo;

import cn.hutool.http.HttpUtil;
import cn.hutool.json.JSONArray;
import cn.hutool.json.JSONObject;
import cn.hutool.json.JSONUtil;
import com.baomidou.mybatisplus.core.conditions.query.LambdaQueryWrapper;
import com.baomidou.mybatisplus.core.toolkit.Wrappers;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.scheduling.TaskScheduler;
import org.springframework.scheduling.concurrent.ThreadPoolTaskScheduler;
import org.springframework.stereotype.Component;

import javax.annotation.PostConstruct;
import javax.annotation.Resource;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.util.Date;
import java.util.UUID;
import java.util.concurrent.ScheduledFuture;


@Component
public class AlarmScheduler {

    private static final Logger log = LoggerFactory.getLogger(AlarmScheduler.class);

    @Resource
    private SysAlarmLogService alarmLogService;

    // TODO: remove this; a properly defined config property already exists ----------------------------------
    private static final String PROMETHEUS_URL = "http://127.0.0.1:9090/api/v1/query?query=";

    /**
     * Spring's task scheduler; supports delayed and fixed-rate task execution.
     */
    private final TaskScheduler scheduler;

    public AlarmScheduler() {
        // set up the thread-pool scheduler (runs the per-rule timed tasks)
        ThreadPoolTaskScheduler taskScheduler = new ThreadPoolTaskScheduler();
        taskScheduler.setPoolSize(5); // pool size
        taskScheduler.setThreadNamePrefix("alarm-task-");
        taskScheduler.initialize();
        this.scheduler = taskScheduler;
    }

    /**
     * Automatically schedules every rule defined in the enum at startup.
     * In a multi-node deployment, take a distributed lock here to avoid duplicate scheduling.
     */
    @PostConstruct
    public void scheduleAll() {
        log.debug("Initializing alarm rule scheduling...");
        for (AlarmRuleEnum rule : AlarmRuleEnum.values()) {
            scheduleRule(rule);
        }
        log.debug("All alarm rules scheduled.");
    }

    /**
     * Starts the scheduled task for one alarm rule.
     * @param rule the alarm rule
     */
    private void scheduleRule(AlarmRuleEnum rule) {
        long intervalMs = rule.getRepeatInterval() * 1000L; // convert to milliseconds

        // 🚨 in a multi-node deployment, guard this with a distributed lock, e.g. Redis + Redisson tryLock
        // if (!DistributedLock.tryLock(rule.getMetricName())) return;
        log.debug("Scheduling alarm rule: {}, period: {} ms", rule.getMetricName(), intervalMs);
        ScheduledFuture<?> future = scheduler.scheduleAtFixedRate(() -> check(rule), intervalMs);
        // keep the future in a field and expose methods if the task needs to be cancelled or stopped later
    }

    private void check(AlarmRuleEnum rule) {
        log.info("Checking alarm rule: {}", rule.getMetricName());
        try {
            // TODO: replace this block; the codebase already has a wrapped client, see the host monitoring statistics API ----
            String encodedQuery = URLEncoder.encode(rule.getPromQl(), StandardCharsets.UTF_8.name());
            String url = PROMETHEUS_URL + encodedQuery;
            String response = HttpUtil.get(url);
            JSONObject json = JSONUtil.parseObj(response);
            // TODO: --------------- end of block to replace --------------------------------------------------------------
            log.debug("Alarm PromQL: {}", rule.getPromQl());
            JSONArray results = json.getJSONObject("data").getJSONArray("result");
            if (results != null && !results.isEmpty()) {
                for (Object obj : results) {
                    JSONObject item = (JSONObject) obj;
                    String hostUuid = item.getJSONObject("metric").getStr("hostUuid"); // hostUuid format: PM_<hostname>_<ip>
                    if (hostUuid != null && hostUuid.startsWith("PM_")) { // only uuids starting with PM_ are the extra hosts; the others are alerted by ZStack itself, so skip them to avoid duplicate alarms
                        String[] parts = hostUuid.split("_");
                        String hostName = parts[1];
                        String ip = parts[2]; // extract the IP
                        // check whether a record with the same instance and alarmUuid in Alarm status already exists
                        LambdaQueryWrapper<SysAlarmLog> query = Wrappers.<SysAlarmLog>lambdaQuery()
                                .eq(SysAlarmLog::getSourceUuid, ip)
                                .eq(SysAlarmLog::getAlarmUuid, rule.getFullMetricName())
                                .eq(SysAlarmLog::getAlarmStatus, "Alarm");

                        SysAlarmLog existing = alarmLogService.getOne(query, false);
                        if (existing != null) {
                            existing.setTimes(existing.getTimes() == null ? 2 : existing.getTimes() + 1);
                            // existing.setReadStatus(0); // 1 = unread, 0 = read; TODO: should a repeat reset it to unread?
                            existing.setFirstTime(new Date()); // refresh the timestamp
                            alarmLogService.updateById(existing);
                        } else {
                            SysAlarmLog alarmLog = new SysAlarmLog();
                            alarmLog.setUuid(UUID.randomUUID().toString());
                            alarmLog.setSource(2); // 2 = external host
                            alarmLog.setSourceUuid(ip); // uuid of the source data
                            alarmLog.setAlarmName(rule.getName());
                            alarmLog.setAlarmStatus("Alarm"); // alarm status: Alarm = firing, OK = monitoring
                            alarmLog.setAlarmUuid(rule.getFullMetricName()); // use the full metric name as the alarm uuid; effectively unique
                            alarmLog.setComparisonOperator(rule.getComparisonOperator());
                            alarmLog.setContext(item.toString()); // resource info: store the raw result payload
                            alarmLog.setEmergencyLevel(rule.getEmergencyLevel());
                            alarmLog.setMetricName(rule.getMetricName());
                            alarmLog.setNamespace(rule.getNamespace());
                            alarmLog.setPeriod(rule.getPeriod());
                            alarmLog.setReadStatus(1); // 1 = unread, 0 = read
                            alarmLog.setThreshold(rule.getThreshold());
                            alarmLog.setCreateTime(new Date());
                            alarmLog.setFirstTime(new Date());
                            alarmLog.setResourceName(hostName);
                            alarmLog.setResourceUuid(ip);
                            alarmLog.setTimes(1); // alarm count
                            alarmLog.setType("alarm"); // message type: event = event alarm, alarm = resource alarm
                            alarmLogService.save(alarmLog);
                        }
                    }
                }
            }
            log.info("Alarm rule {} check finished", rule.getMetricName());
        } catch (Exception e) {
            log.error("Alarm rule {} check aborted", rule.getMetricName(), e);
        }
    }
}