[monitor] 最高调度级别的采集数据告警恢复时监控状态联动
This commit is contained in:
@@ -36,13 +36,15 @@ public class CalculateAlarm {
|
||||
// Queue that built alerts are handed to through addAlertData(...).
private AlerterDataQueue dataQueue;
|
||||
// Service for alert definitions; its usage is not visible in this excerpt.
private AlertDefineService alertDefineService;
|
||||
// Alerts keyed by a String; initialized as a ConcurrentHashMap in the constructor.
// NOTE(review): key format and lifecycle are not shown in this excerpt - confirm against the rest of the class.
private Map<String, Alert> triggeredAlertMap;
|
||||
// Monitor id -> collect code (UN_REACHABLE / UN_CONNECTABLE) stored when a priority-0 metrics
// collection produced a monitor-state alert. The entry is removed in the success branch, and a
// non-null removed value causes a recovery alert (status 2) to be queued for that monitor.
private Map<Long, CollectRep.Code> triggeredMonitorStateAlertMap;
|
||||
|
||||
public CalculateAlarm (AlerterProperties properties, AlerterWorkerPool workerPool,
|
||||
AlerterDataQueue dataQueue, AlertDefineService alertDefineService) {
|
||||
/**
 * Creates the alarm calculator: stores the collaborators, initializes both
 * triggered-alert maps, and then calls {@code startCalculate()}.
 *
 * @param workerPool         worker pool stored on this instance
 * @param dataQueue          queue that alerts are later added to
 * @param alertDefineService alert-definition service stored on this instance
 */
public CalculateAlarm (AlerterWorkerPool workerPool, AlerterDataQueue dataQueue,
|
||||
AlertDefineService alertDefineService) {
|
||||
this.workerPool = workerPool;
|
||||
this.dataQueue = dataQueue;
|
||||
this.alertDefineService = alertDefineService;
|
||||
// Both maps are ConcurrentHashMaps created with an initial capacity of 128.
this.triggeredAlertMap = new ConcurrentHashMap<>(128);
|
||||
this.triggeredMonitorStateAlertMap = new ConcurrentHashMap<>(128);
|
||||
// NOTE(review): startCalculate() runs inside the constructor and its body is not visible here.
// If it starts worker tasks that use this instance, they may observe it before construction finishes - confirm.
startCalculate();
|
||||
}
|
||||
|
||||
@@ -68,29 +70,42 @@ public class CalculateAlarm {
|
||||
long monitorId = metricsData.getId();
|
||||
String app = metricsData.getApp();
|
||||
String metrics = metricsData.getMetrics();
|
||||
// 先判断采集响应数据状态 UN_REACHABLE/UN_CONNECTABLE 则需发最高级别告警
|
||||
if (metricsData.getCode() != CollectRep.Code.SUCCESS) {
|
||||
// 采集异常
|
||||
Alert.AlertBuilder alertBuilder = Alert.builder()
|
||||
.monitorId(monitorId)
|
||||
.priority((byte) 0)
|
||||
.status((byte) 0)
|
||||
.times(1);
|
||||
if (metricsData.getCode() == CollectRep.Code.UN_REACHABLE) {
|
||||
// UN_REACHABLE 对端不可达(网络层icmp)
|
||||
alertBuilder.target(CommonConstants.REACHABLE)
|
||||
.content("监控紧急可达性告警: " + metricsData.getCode().name());
|
||||
dataQueue.addAlertData(alertBuilder.build());
|
||||
} else if (metricsData.getCode() == CollectRep.Code.UN_CONNECTABLE) {
|
||||
// UN_CONNECTABLE 对端连接失败(传输层tcp,udp)
|
||||
alertBuilder.target(CommonConstants.AVAILABLE)
|
||||
.content("监控紧急可用性告警: " + metricsData.getCode().name());
|
||||
dataQueue.addAlertData(alertBuilder.build());
|
||||
} else {
|
||||
// todo 其它规范异常 TIMEOUT ...
|
||||
// 先判断调度优先级为0的指标组采集响应数据状态 UN_REACHABLE/UN_CONNECTABLE 则需发最高级别告警进行监控状态变更
|
||||
if (metricsData.getPriority() == 0) {
|
||||
if (metricsData.getCode() != CollectRep.Code.SUCCESS) {
|
||||
// 采集异常
|
||||
Alert.AlertBuilder alertBuilder = Alert.builder()
|
||||
.monitorId(monitorId)
|
||||
.priority((byte) 0)
|
||||
.status((byte) 0)
|
||||
.times(1);
|
||||
if (metricsData.getCode() == CollectRep.Code.UN_REACHABLE) {
|
||||
// UN_REACHABLE 对端不可达(网络层icmp)
|
||||
alertBuilder.target(CommonConstants.REACHABLE)
|
||||
.content("监控紧急可达性告警: " + metricsData.getCode().name());
|
||||
triggeredMonitorStateAlertMap.put(monitorId, CollectRep.Code.UN_REACHABLE);
|
||||
dataQueue.addAlertData(alertBuilder.build());
|
||||
} else if (metricsData.getCode() == CollectRep.Code.UN_CONNECTABLE) {
|
||||
// UN_CONNECTABLE 对端连接失败(传输层tcp,udp)
|
||||
alertBuilder.target(CommonConstants.AVAILABLE)
|
||||
.content("监控紧急可用性告警: " + metricsData.getCode().name());
|
||||
triggeredMonitorStateAlertMap.put(monitorId, CollectRep.Code.UN_CONNECTABLE);
|
||||
dataQueue.addAlertData(alertBuilder.build());
|
||||
} else {
|
||||
// todo 其它规范异常 TIMEOUT ...
|
||||
return;
|
||||
}
|
||||
return;
|
||||
} else {
|
||||
// 判断关联监控之前是否有可用性或者不可达告警,发送恢复告警进行监控状态恢复
|
||||
CollectRep.Code stateCode = triggeredMonitorStateAlertMap.remove(monitorId);
|
||||
if (stateCode != null) {
|
||||
// 发送告警恢复
|
||||
Alert resumeAlert = Alert.builder()
|
||||
.monitorId(monitorId).status((byte) 2).build();
|
||||
dataQueue.addAlertData(resumeAlert);
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
// 查出此监控类型下的此指标集合下关联配置的告警定义信息
|
||||
// field - define[]
|
||||
|
||||
@@ -67,7 +67,7 @@ public class Alert {
|
||||
@Length(max = 1024)
|
||||
private String content;
|
||||
|
||||
@ApiModelProperty(value = "告警状态: 0-待发送 1-已发送 2-已过期(已经超过持续时间)",
|
||||
@ApiModelProperty(value = "告警状态: 0-正常告警 1-触发中:阈值触发但未达到告警次数 2-恢复告警",
|
||||
example = "1", accessMode = READ_WRITE, position = 7)
|
||||
@Min(0)
|
||||
@Max(2)
|
||||
|
||||
Reference in New Issue
Block a user