[monitor] 最高调度级别的采集数据告警恢复时监控状态联动

This commit is contained in:
tomsun28
2021-12-15 11:21:51 +08:00
parent 077f8b9d07
commit 5f211560a6
8 changed files with 257 additions and 144 deletions

View File

@@ -36,13 +36,15 @@ public class CalculateAlarm {
private AlerterDataQueue dataQueue;
private AlertDefineService alertDefineService;
private Map<String, Alert> triggeredAlertMap;
private Map<Long, CollectRep.Code> triggeredMonitorStateAlertMap;
public CalculateAlarm (AlerterProperties properties, AlerterWorkerPool workerPool,
AlerterDataQueue dataQueue, AlertDefineService alertDefineService) {
public CalculateAlarm (AlerterWorkerPool workerPool, AlerterDataQueue dataQueue,
AlertDefineService alertDefineService) {
this.workerPool = workerPool;
this.dataQueue = dataQueue;
this.alertDefineService = alertDefineService;
this.triggeredAlertMap = new ConcurrentHashMap<>(128);
this.triggeredMonitorStateAlertMap = new ConcurrentHashMap<>(128);
startCalculate();
}
@@ -68,29 +70,42 @@ public class CalculateAlarm {
long monitorId = metricsData.getId();
String app = metricsData.getApp();
String metrics = metricsData.getMetrics();
// 先判断采集响应数据状态 UN_REACHABLE/UN_CONNECTABLE 则需发最高级别告警
if (metricsData.getCode() != CollectRep.Code.SUCCESS) {
// 采集异常
Alert.AlertBuilder alertBuilder = Alert.builder()
.monitorId(monitorId)
.priority((byte) 0)
.status((byte) 0)
.times(1);
if (metricsData.getCode() == CollectRep.Code.UN_REACHABLE) {
// UN_REACHABLE 对端不可达(网络层icmp)
alertBuilder.target(CommonConstants.REACHABLE)
.content("监控紧急可达性告警: " + metricsData.getCode().name());
dataQueue.addAlertData(alertBuilder.build());
} else if (metricsData.getCode() == CollectRep.Code.UN_CONNECTABLE) {
// UN_CONNECTABLE 对端连接失败(传输层tcp,udp)
alertBuilder.target(CommonConstants.AVAILABLE)
.content("监控紧急可用性告警: " + metricsData.getCode().name());
dataQueue.addAlertData(alertBuilder.build());
} else {
// todo 其它规范异常 TIMEOUT ...
// 先判断调度优先级为0的指标组采集响应数据状态 UN_REACHABLE/UN_CONNECTABLE 则需发最高级别告警进行监控状态变更
if (metricsData.getPriority() == 0) {
if (metricsData.getCode() != CollectRep.Code.SUCCESS) {
// 采集异常
Alert.AlertBuilder alertBuilder = Alert.builder()
.monitorId(monitorId)
.priority((byte) 0)
.status((byte) 0)
.times(1);
if (metricsData.getCode() == CollectRep.Code.UN_REACHABLE) {
// UN_REACHABLE 对端不可达(网络层icmp)
alertBuilder.target(CommonConstants.REACHABLE)
.content("监控紧急可达性告警: " + metricsData.getCode().name());
triggeredMonitorStateAlertMap.put(monitorId, CollectRep.Code.UN_REACHABLE);
dataQueue.addAlertData(alertBuilder.build());
} else if (metricsData.getCode() == CollectRep.Code.UN_CONNECTABLE) {
// UN_CONNECTABLE 对端连接失败(传输层tcp,udp)
alertBuilder.target(CommonConstants.AVAILABLE)
.content("监控紧急可用性告警: " + metricsData.getCode().name());
triggeredMonitorStateAlertMap.put(monitorId, CollectRep.Code.UN_CONNECTABLE);
dataQueue.addAlertData(alertBuilder.build());
} else {
// todo 其它规范异常 TIMEOUT ...
return;
}
return;
} else {
// 判断关联监控之前是否有可用性或者不可达告警,发送恢复告警进行监控状态恢复
CollectRep.Code stateCode = triggeredMonitorStateAlertMap.remove(monitorId);
if (stateCode != null) {
// 发送告警恢复
Alert resumeAlert = Alert.builder()
.monitorId(monitorId).status((byte) 2).build();
dataQueue.addAlertData(resumeAlert);
}
}
return;
}
// 查出此监控类型下的此指标集合下关联配置的告警定义信息
// field - define[]

View File

@@ -67,7 +67,7 @@ public class Alert {
@Length(max = 1024)
private String content;
@ApiModelProperty(value = "告警状态: 0-待发送 1-已发送 2-已过期(已经超过持续时间)",
@ApiModelProperty(value = "告警状态: 0-正常告警 1-触发中:阈值触发但未达到告警次数 2-恢复告警",
example = "1", accessMode = READ_WRITE, position = 7)
@Min(0)
@Max(2)