소스 검색

[monitor] 最高调度级别的采集数据告警恢复时监控状态联动

tomsun28 4 년 전
부모
커밋
a6a8bbce98

+ 38 - 23
alerter/src/main/java/com/usthe/alert/calculate/CalculateAlarm.java

@@ -36,13 +36,15 @@ public class CalculateAlarm {
     private AlerterDataQueue dataQueue;
     private AlertDefineService alertDefineService;
     private Map<String, Alert> triggeredAlertMap;
+    private Map<Long, CollectRep.Code> triggeredMonitorStateAlertMap;
 
-    public CalculateAlarm (AlerterProperties properties, AlerterWorkerPool workerPool,
-                           AlerterDataQueue dataQueue, AlertDefineService alertDefineService) {
+    public CalculateAlarm (AlerterWorkerPool workerPool, AlerterDataQueue dataQueue,
+                           AlertDefineService alertDefineService) {
         this.workerPool = workerPool;
         this.dataQueue = dataQueue;
         this.alertDefineService = alertDefineService;
         this.triggeredAlertMap = new ConcurrentHashMap<>(128);
+        this.triggeredMonitorStateAlertMap = new ConcurrentHashMap<>(128);
         startCalculate();
     }
 
@@ -68,29 +70,42 @@ public class CalculateAlarm {
         long monitorId = metricsData.getId();
         String app = metricsData.getApp();
         String metrics = metricsData.getMetrics();
-        // 先判断采集响应数据状态 UN_REACHABLE/UN_CONNECTABLE 则需发最高级别告警
-        if (metricsData.getCode() != CollectRep.Code.SUCCESS) {
-            // 采集异常
-            Alert.AlertBuilder alertBuilder = Alert.builder()
-                    .monitorId(monitorId)
-                    .priority((byte) 0)
-                    .status((byte) 0)
-                    .times(1);
-            if (metricsData.getCode() == CollectRep.Code.UN_REACHABLE) {
-                // UN_REACHABLE 对端不可达(网络层icmp)
-                alertBuilder.target(CommonConstants.REACHABLE)
-                        .content("监控紧急可达性告警: " + metricsData.getCode().name());
-                dataQueue.addAlertData(alertBuilder.build());
-            } else if (metricsData.getCode() == CollectRep.Code.UN_CONNECTABLE) {
-                // UN_CONNECTABLE 对端连接失败(传输层tcp,udp)
-                alertBuilder.target(CommonConstants.AVAILABLE)
-                        .content("监控紧急可用性告警: " + metricsData.getCode().name());
-                dataQueue.addAlertData(alertBuilder.build());
-            } else {
-                // todo 其它规范异常 TIMEOUT ...
+        // 先判断调度优先级为0的指标组采集响应数据状态 UN_REACHABLE/UN_CONNECTABLE 则需发最高级别告警进行监控状态变更
+        if (metricsData.getPriority() == 0) {
+            if (metricsData.getCode() != CollectRep.Code.SUCCESS) {
+                // 采集异常
+                Alert.AlertBuilder alertBuilder = Alert.builder()
+                        .monitorId(monitorId)
+                        .priority((byte) 0)
+                        .status((byte) 0)
+                        .times(1);
+                if (metricsData.getCode() == CollectRep.Code.UN_REACHABLE) {
+                    // UN_REACHABLE 对端不可达(网络层icmp)
+                    alertBuilder.target(CommonConstants.REACHABLE)
+                            .content("监控紧急可达性告警: " + metricsData.getCode().name());
+                    triggeredMonitorStateAlertMap.put(monitorId, CollectRep.Code.UN_REACHABLE);
+                    dataQueue.addAlertData(alertBuilder.build());
+                } else if (metricsData.getCode() == CollectRep.Code.UN_CONNECTABLE) {
+                    // UN_CONNECTABLE 对端连接失败(传输层tcp,udp)
+                    alertBuilder.target(CommonConstants.AVAILABLE)
+                            .content("监控紧急可用性告警: " + metricsData.getCode().name());
+                    triggeredMonitorStateAlertMap.put(monitorId, CollectRep.Code.UN_CONNECTABLE);
+                    dataQueue.addAlertData(alertBuilder.build());
+                } else {
+                    // todo 其它规范异常 TIMEOUT ...
+                    return;
+                }
                 return;
+            } else {
+                // 判断关联监控之前是否有可用性或者不可达告警,发送恢复告警进行监控状态恢复
+                CollectRep.Code stateCode = triggeredMonitorStateAlertMap.remove(monitorId);
+                if (stateCode != null) {
+                    // 发送告警恢复
+                    Alert resumeAlert = Alert.builder()
+                            .monitorId(monitorId).status((byte) 2).build();
+                    dataQueue.addAlertData(resumeAlert);
+                }
             }
-            return;
         }
         // 查出此监控类型下的此指标集合下关联配置的告警定义信息
         // field - define[]

+ 1 - 1
alerter/src/main/java/com/usthe/alert/pojo/entity/Alert.java

@@ -67,7 +67,7 @@ public class Alert {
     @Length(max = 1024)
     private String content;
 
-    @ApiModelProperty(value = "告警状态: 0-待发送 1-已发送 2-已过期(已经超过持续时间)",
+    @ApiModelProperty(value = "告警状态: 0-正常告警 1-触发中:阈值触发但未达到告警次数 2-恢复告警",
             example = "1", accessMode = READ_WRITE, position = 7)
     @Min(0)
     @Max(2)

+ 1 - 0
collector/server/src/main/java/com/usthe/collector/dispatch/MetricsCollect.java

@@ -149,6 +149,7 @@ public class MetricsCollect implements Runnable, Comparable<MetricsCollect> {
      * @param collectData 采集数据
      */
     private void calculateFields(Metrics metrics, CollectRep.MetricsData.Builder collectData) {
+        collectData.setPriority(metrics.getPriority());
         List<CollectRep.Field> fieldList = new LinkedList<>();
         for (Metrics.Field field : metrics.getFields()) {
             fieldList.add(CollectRep.Field.newBuilder().setName(field.getField()).setType(field.getType()).build());

파일 크기가 너무 크기 때문에 변경 상태를 표시하지 않습니다.
+ 200 - 110
common/src/main/java/com/usthe/common/entity/message/CollectRep.java


+ 7 - 5
common/src/main/message/collect_rep.proto

@@ -9,16 +9,18 @@ message MetricsData
     string app = 2;
     // 监控采集的指标集合 eg: cpu | memory | health
     string metrics = 3;
+    // 监控采集指标集合的采集优先级>=0
+    uint32 priority = 4;
     // 采集时间
-    uint64 time = 4;
+    uint64 time = 5;
     // 采集响应码
-    Code code = 5;
+    Code code = 6;
     // 采集响应信息
-    string msg = 6;
+    string msg = 7;
     // 采集指标名
-    repeated Field fields = 7;
+    repeated Field fields = 8;
     // 采集指标值集合(fields作为字段名称与ValueRow映射)
-    repeated ValueRow values = 8;
+    repeated ValueRow values = 9;
 }
 
 message Field

+ 6 - 3
manager/src/main/java/com/usthe/manager/component/alerter/DispatchAlarm.java

@@ -55,7 +55,6 @@ public class DispatchAlarm {
     }
 
     private void storeAlertData(Alert alert) {
-        // todo 过滤重复告警 使用 告警持续时间参数-duration 这个时间段的相同重复告警应该被过滤
         // todo 使用缓存不直接操作库
         Monitor monitor = monitorService.getMonitor(alert.getMonitorId());
         if (monitor == null) {
@@ -67,8 +66,7 @@ public class DispatchAlarm {
             // 当监控未管理时  忽略静默其告警信息
             return;
         }
-        if (monitor.getStatus() != CommonConstants.UN_AVAILABLE_CODE
-                && monitor.getStatus() != CommonConstants.UN_REACHABLE_CODE) {
+        if (monitor.getStatus() == CommonConstants.AVAILABLE_CODE) {
             if (CommonConstants.AVAILABLE.equals(alert.getTarget())) {
                 // 可用性告警 需变更监控状态为不可用
                 monitorService.updateMonitorStatus(monitor.getId(), CommonConstants.UN_AVAILABLE_CODE);
@@ -76,6 +74,11 @@ public class DispatchAlarm {
                 // 可达性告警 需变更监控状态为不可达
                 monitorService.updateMonitorStatus(monitor.getId(), CommonConstants.UN_REACHABLE_CODE);
             }
+        } else {
+            // 若是恢复告警 需对监控状态进行恢复
+           if (alert.getStatus() == 2) {
+               monitorService.updateMonitorStatus(alert.getMonitorId(), CommonConstants.AVAILABLE_CODE);
+           }
         }
         // 告警落库
         alertService.addAlert(alert);

+ 2 - 0
manager/src/main/java/com/usthe/manager/dao/MonitorDao.java

@@ -4,6 +4,7 @@ import com.usthe.manager.pojo.dto.AppCount;
 import com.usthe.manager.pojo.entity.Monitor;
 import org.springframework.data.jpa.repository.JpaRepository;
 import org.springframework.data.jpa.repository.JpaSpecificationExecutor;
+import org.springframework.data.jpa.repository.Modifying;
 import org.springframework.data.jpa.repository.Query;
 import org.springframework.data.repository.query.Param;
 
@@ -50,6 +51,7 @@ public interface MonitorDao extends JpaRepository<Monitor, Long>, JpaSpecificati
      * @param id 监控ID
      * @param status 监控状态
      */
+    @Modifying
     @Query("update Monitor set status = :status where id = :id")
     void updateMonitorStatus(@Param(value = "id") Long id, @Param(value = "status") byte status);
 }

+ 1 - 0
manager/src/main/java/com/usthe/manager/service/impl/MonitorServiceImpl.java

@@ -42,6 +42,7 @@ import java.util.stream.Collectors;
  * @date 2021/11/14 13:06
  */
 @Service
+@Transactional(rollbackFor = Exception.class)
 @Slf4j
 public class MonitorServiceImpl implements MonitorService {
 

+ 1 - 1
manager/src/main/resources/db/schema.sql

@@ -116,7 +116,7 @@ CREATE TABLE  alert
     alert_define_id  bigint           not null comment '告警关联的告警定义ID',
     priority         tinyint          not null default 0 comment '告警级别 0:高-emergency-紧急告警-红色 1:中-critical-严重告警-橙色 2:低-warning-警告告警-黄色',
     content          varchar(255)     not null comment '告警通知实际内容',
-    status           tinyint          not null default 0 comment '告警状态: 0-待发送 1-已发送 2-已过期(已经超过持续时间)',
+    status           tinyint          not null default 0 comment '告警状态: 0-正常告警 1-阈值触发但未达到告警次数 2-恢复告警',
     times            int              not null comment '触发次数,即达到告警定义的触发阈值次数要求后才会发告警',
     gmt_create       timestamp        default current_timestamp comment 'create time',
     primary key (id)

이 변경점에서 너무 많은 파일들이 변경되어 몇몇 파일들은 표시되지 않았습니다.