فهرست منبع

[alerter,manager] 告警信息入库,监控状态变更联动

tomsun28 4 سال پیش
والد
کامیت
ab2d4511ec

+ 19 - 13
alerter/src/main/java/com/usthe/alert/calculate/CalculateAlarm.java

@@ -68,18 +68,24 @@ public class CalculateAlarm {
         // 先判断采集响应数据状态 UN_REACHABLE/UN_CONNECTABLE 则需发最高级别告警
         // 先判断采集响应数据状态 UN_REACHABLE/UN_CONNECTABLE 则需发最高级别告警
         if (metricsData.getCode() != CollectRep.Code.SUCCESS) {
         if (metricsData.getCode() != CollectRep.Code.SUCCESS) {
             // 采集异常
             // 采集异常
-            if (metricsData.getCode() == CollectRep.Code.UN_REACHABLE
-                    || metricsData.getCode() == CollectRep.Code.UN_CONNECTABLE) {
-                // 连接型可用性异常 UN_REACHABLE 对端不可达(网络层icmp) UN_CONNECTABLE 对端连接失败(传输层tcp,udp)
-                Alert alert = Alert.builder()
-                        .monitorId(monitorId)
-                        .priority((byte) 0)
-                        .status((byte) 0)
-                        .target(CommonConstants.AVAILABLE)
-                        .duration(300)
-                        .content("监控紧急可用性告警: " + metricsData.getCode().name())
-                        .build();
-                dataQueue.addAlertData(alert);
+            Alert.AlertBuilder alertBuilder = Alert.builder()
+                    .monitorId(monitorId)
+                    .priority((byte) 0)
+                    .status((byte) 0)
+                    .duration(300);
+            if (metricsData.getCode() == CollectRep.Code.UN_REACHABLE) {
+                // UN_REACHABLE 对端不可达(网络层icmp)
+                alertBuilder.target(CommonConstants.REACHABLE)
+                        .content("监控紧急可达性告警: " + metricsData.getCode().name());
+                dataQueue.addAlertData(alertBuilder.build());
+            } else if (metricsData.getCode() == CollectRep.Code.UN_CONNECTABLE) {
+                // UN_CONNECTABLE 对端连接失败(传输层tcp,udp)
+                alertBuilder.target(CommonConstants.AVAILABLE)
+                        .content("监控紧急可用性告警: " + metricsData.getCode().name());
+                dataQueue.addAlertData(alertBuilder.build());
+            } else {
+                // todo 其它规范异常 TIMEOUT ...
+                return;
             }
             }
             return;
             return;
         }
         }
@@ -127,7 +133,7 @@ public class CalculateAlarm {
                             Expression expression = AviatorEvaluator.compile(expr, true);
                             Expression expression = AviatorEvaluator.compile(expr, true);
                             Boolean match = (Boolean) expression.execute(fieldValueMap);
                             Boolean match = (Boolean) expression.execute(fieldValueMap);
                             if (match) {
                             if (match) {
-                                // 阈值规则匹配,触发告警 todo 告警延迟delay参数实现
+                                // 阈值规则匹配,触发告警
                                 Alert alert = Alert.builder()
                                 Alert alert = Alert.builder()
                                         .monitorId(monitorId)
                                         .monitorId(monitorId)
                                         .priority(define.getPriority())
                                         .priority(define.getPriority())

+ 14 - 0
alerter/src/main/java/com/usthe/alert/dao/AlertDao.java

@@ -0,0 +1,14 @@
+package com.usthe.alert.dao;
+
+import com.usthe.alert.pojo.entity.Alert;
+import org.springframework.data.jpa.repository.JpaRepository;
+import org.springframework.data.jpa.repository.JpaSpecificationExecutor;
+
+/**
+ * Alert 数据库操作
+ * @author tom
+ * @date 2021/12/9 10:03
+ */
+public interface AlertDao extends JpaRepository<Alert, Long>, JpaSpecificationExecutor<Alert> {
+
+}

+ 1 - 1
alerter/src/main/java/com/usthe/alert/dao/AlertDefineDao.java

@@ -29,7 +29,7 @@ public interface AlertDefineDao extends JpaRepository<AlertDefine, Long>, JpaSpe
      * @return 告警定义列表
      * @return 告警定义列表
      */
      */
     @Query("select define from AlertDefine define join AlertDefineBind bind on bind.alertDefineId = define.id " +
     @Query("select define from AlertDefine define join AlertDefineBind bind on bind.alertDefineId = define.id " +
-            "where bind.monitorId = :monitorId and define.metric = :metrics")
+            "where bind.monitorId = :monitorId and define.metric = :metrics and define.enable = true")
     List<AlertDefine> queryAlertDefinesByMonitor(@Param(value = "monitorId") Long monitorId,
     List<AlertDefine> queryAlertDefinesByMonitor(@Param(value = "monitorId") Long monitorId,
                                                  @Param(value = "metrics") String metrics);
                                                  @Param(value = "metrics") String metrics);
 }
 }

+ 1 - 2
alerter/src/main/java/com/usthe/alert/pojo/entity/Alert.java

@@ -61,8 +61,7 @@ public class Alert {
 
 
     @ApiModelProperty(value = "告警目标对象: 监控可用性-available 指标-app.metrics.field",
     @ApiModelProperty(value = "告警目标对象: 监控可用性-available 指标-app.metrics.field",
             example = "1", accessMode = READ_WRITE, position = 4)
             example = "1", accessMode = READ_WRITE, position = 4)
-    @Min(0)
-    @Max(2)
+    @Length(max = 255)
     private String target;
     private String target;
 
 
     @ApiModelProperty(value = "触发告警后持续时间,单位s", example = "60", accessMode = READ_WRITE, position = 7)
     @ApiModelProperty(value = "触发告警后持续时间,单位s", example = "60", accessMode = READ_WRITE, position = 7)

+ 1 - 5
alerter/src/main/java/com/usthe/alert/pojo/entity/AlertDefine.java

@@ -70,13 +70,9 @@ public class AlertDefine {
     @Min(0)
     @Min(0)
     private int duration;
     private int duration;
 
 
-    @ApiModelProperty(value = "告警触发后是否发送", example = "true", accessMode = READ_WRITE, position = 8)
+    @ApiModelProperty(value = "告警阈值开关", example = "true", accessMode = READ_WRITE, position = 8)
     private boolean enable = true;
     private boolean enable = true;
 
 
-    @ApiModelProperty(value = "告警延迟时间,即延迟多久再发送告警,单位s", example = "300", accessMode = READ_WRITE, position = 9)
-    @Min(0)
-    private int delay;
-
     @ApiModelProperty(value = "告警通知内容", example = "linux {monitor_name}: {monitor_id} cpu usage high",
     @ApiModelProperty(value = "告警通知内容", example = "linux {monitor_name}: {monitor_id} cpu usage high",
             accessMode = READ_WRITE, position = 10)
             accessMode = READ_WRITE, position = 10)
     @Length(max = 1024)
     @Length(max = 1024)

+ 29 - 0
alerter/src/main/java/com/usthe/alert/service/AlertService.java

@@ -0,0 +1,29 @@
+package com.usthe.alert.service;
+
+import com.usthe.alert.pojo.entity.Alert;
+import org.springframework.data.domain.Page;
+import org.springframework.data.domain.PageRequest;
+import org.springframework.data.jpa.domain.Specification;
+
+/**
+ * 告警信息管理接口
+ * @author tom
+ * @date 2021/12/9 10:06
+ */
+public interface AlertService {
+
+    /**
+     * 新增告警
+     * @param alert 告警实体
+     * @throws RuntimeException 新增过程异常抛出
+     */
+    void addAlert(Alert alert) throws RuntimeException;
+
+    /**
+     * 动态条件查询
+     * @param specification 查询条件
+     * @param pageRequest 分页参数
+     * @return 查询结果
+     */
+    Page<Alert> getAlerts(Specification<Alert> specification, PageRequest pageRequest);
+}

+ 32 - 0
alerter/src/main/java/com/usthe/alert/service/impl/AlertServiceImpl.java

@@ -0,0 +1,32 @@
+package com.usthe.alert.service.impl;
+
+import com.usthe.alert.dao.AlertDao;
+import com.usthe.alert.pojo.entity.Alert;
+import com.usthe.alert.service.AlertService;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.data.domain.Page;
+import org.springframework.data.domain.PageRequest;
+import org.springframework.data.jpa.domain.Specification;
+import org.springframework.stereotype.Service;
+
+/**
+ * 告警信息服务实现
+ * @author tom
+ * @date 2021/12/10 15:39
+ */
+@Service
+public class AlertServiceImpl implements AlertService {
+
+    @Autowired
+    private AlertDao alertDao;
+
+    @Override
+    public void addAlert(Alert alert) throws RuntimeException {
+        alertDao.save(alert);
+    }
+
+    @Override
+    public Page<Alert> getAlerts(Specification<Alert> specification, PageRequest pageRequest) {
+        return alertDao.findAll(specification, pageRequest);
+    }
+}

+ 2 - 1
alerter/src/main/resources/META-INF/spring.factories

@@ -1,9 +1,10 @@
 org.springframework.boot.autoconfigure.EnableAutoConfiguration=\
 org.springframework.boot.autoconfigure.EnableAutoConfiguration=\
 com.usthe.alert.service.impl.AlertDefineServiceImpl,\
 com.usthe.alert.service.impl.AlertDefineServiceImpl,\
+com.usthe.alert.service.impl.AlertServiceImpl,\
 com.usthe.alert.controller.AlertDefineController,\
 com.usthe.alert.controller.AlertDefineController,\
 com.usthe.alert.AlerterWorkerPool,\
 com.usthe.alert.AlerterWorkerPool,\
 com.usthe.alert.AlerterProperties,\
 com.usthe.alert.AlerterProperties,\
 com.usthe.alert.AlerterDataQueue,\
 com.usthe.alert.AlerterDataQueue,\
-com.usthe.alert.entrance.KafkaDataConsume,\
 com.usthe.alert.AlerterConfiguration,\
 com.usthe.alert.AlerterConfiguration,\
+com.usthe.alert.entrance.KafkaDataConsume,\
 com.usthe.alert.calculate.CalculateAlarm
 com.usthe.alert.calculate.CalculateAlarm

+ 5 - 0
common/src/main/java/com/usthe/common/util/CommonConstants.java

@@ -87,4 +87,9 @@ public interface CommonConstants {
      * 可用性对象
      * 可用性对象
      */
      */
     String AVAILABLE = "available";
     String AVAILABLE = "available";
+
+    /**
+     * 可达性对象
+     */
+    String REACHABLE = "reachable";
 }
 }

+ 89 - 0
manager/src/main/java/com/usthe/manager/component/alerter/DispatchAlarm.java

@@ -0,0 +1,89 @@
+package com.usthe.manager.component.alerter;
+
+import com.usthe.alert.AlerterDataQueue;
+import com.usthe.alert.AlerterWorkerPool;
+import com.usthe.alert.pojo.entity.Alert;
+import com.usthe.alert.service.AlertService;
+import com.usthe.common.util.CommonConstants;
+import com.usthe.manager.pojo.entity.Monitor;
+import com.usthe.manager.service.MonitorService;
+import lombok.extern.slf4j.Slf4j;
+import org.springframework.stereotype.Component;
+
+/**
+ * 告警信息入库分发
+ * @author tom
+ * @date 2021/12/10 12:58
+ */
+@Component
+@Slf4j
+public class DispatchAlarm {
+
+    private AlerterWorkerPool workerPool;
+    private AlerterDataQueue dataQueue;
+    private AlertService alertService;
+    private MonitorService monitorService;
+
+    public DispatchAlarm(AlerterWorkerPool workerPool, AlerterDataQueue dataQueue,
+                         AlertService alertService, MonitorService monitorService) {
+        this.workerPool = workerPool;
+        this.dataQueue = dataQueue;
+        this.alertService = alertService;
+        this.monitorService = monitorService;
+        startDispatch();
+    }
+
+    private void startDispatch() {
+        Runnable runnable = () -> {
+            while (!Thread.currentThread().isInterrupted()) {
+                try {
+                    Alert alert = dataQueue.pollAlertData();
+                    if (alert != null) {
+                        // 判断告警类型入库
+                        storeAlertData(alert);
+                        // 通知分发
+                        sendAlertDataListener(alert);
+                    }
+                } catch (InterruptedException e) {
+                    log.error(e.getMessage());
+                }
+            }
+        };
+        workerPool.executeJob(runnable);
+        workerPool.executeJob(runnable);
+        workerPool.executeJob(runnable);
+    }
+
+    private void storeAlertData(Alert alert) {
+        // todo 过滤重复告警 使用 告警持续时间参数-duration 这个时间段的相同重复告警应该被过滤
+        // todo 使用缓存不直接操作库
+        Monitor monitor = monitorService.getMonitor(alert.getMonitorId());
+        if (monitor == null) {
+            log.warn("Dispatch alarm the monitorId: {} not existed, ignored.", alert.getMonitorId());
+            return;
+        }
+        alert.setMonitorName(monitor.getName());
+        if (monitor.getStatus() == CommonConstants.UN_MANAGE_CODE) {
+            // 当监控未管理时  忽略静默其告警信息
+            return;
+        }
+        if (monitor.getStatus() != CommonConstants.UN_AVAILABLE_CODE
+                && monitor.getStatus() != CommonConstants.UN_REACHABLE_CODE) {
+            if (CommonConstants.AVAILABLE.equals(alert.getTarget())) {
+                // 可用性告警 需变更监控状态为不可用
+                monitorService.updateMonitorStatus(monitor.getId(), CommonConstants.UN_AVAILABLE_CODE);
+            } else if (CommonConstants.REACHABLE.equals(alert.getTarget())) {
+                // 可达性告警 需变更监控状态为不可达
+                monitorService.updateMonitorStatus(monitor.getId(), CommonConstants.UN_REACHABLE_CODE);
+            }
+        }
+        // 告警落库
+        alertService.addAlert(alert);
+    }
+
+    private void sendAlertDataListener(Alert alert) {
+        // todo 转发配置的邮件 微信 webhook
+    }
+
+
+}

+ 1 - 1
manager/src/main/java/com/usthe/manager/controller/MonitorController.java

@@ -66,7 +66,7 @@ public class MonitorController {
     public ResponseEntity<Message<MonitorDto>> getMonitor(
     public ResponseEntity<Message<MonitorDto>> getMonitor(
             @ApiParam(value = "监控ID", example = "6565463543") @PathVariable("id") long id) {
             @ApiParam(value = "监控ID", example = "6565463543") @PathVariable("id") long id) {
         // 获取监控信息
         // 获取监控信息
-        MonitorDto monitorDto = monitorService.getMonitor(id);
+        MonitorDto monitorDto = monitorService.getMonitorDto(id);
         Message.MessageBuilder<MonitorDto> messageBuilder = Message.builder();
         Message.MessageBuilder<MonitorDto> messageBuilder = Message.builder();
         if (monitorDto == null) {
         if (monitorDto == null) {
             messageBuilder.code(MONITOR_NOT_EXIST_CODE).msg("Monitor not exist.");
             messageBuilder.code(MONITOR_NOT_EXIST_CODE).msg("Monitor not exist.");

+ 9 - 0
manager/src/main/java/com/usthe/manager/dao/MonitorDao.java

@@ -5,6 +5,7 @@ import com.usthe.manager.pojo.entity.Monitor;
 import org.springframework.data.jpa.repository.JpaRepository;
 import org.springframework.data.jpa.repository.JpaRepository;
 import org.springframework.data.jpa.repository.JpaSpecificationExecutor;
 import org.springframework.data.jpa.repository.JpaSpecificationExecutor;
 import org.springframework.data.jpa.repository.Query;
 import org.springframework.data.jpa.repository.Query;
+import org.springframework.data.repository.query.Param;
 
 
 import java.util.List;
 import java.util.List;
 import java.util.Set;
 import java.util.Set;
@@ -36,4 +37,12 @@ public interface MonitorDao extends JpaRepository<Monitor, Long>, JpaSpecificati
      */
      */
     @Query("select new com.usthe.manager.pojo.dto.AppCount(mo.app, COUNT(mo.id)) from Monitor mo group by mo.app")
     @Query("select new com.usthe.manager.pojo.dto.AppCount(mo.app, COUNT(mo.id)) from Monitor mo group by mo.app")
     List<AppCount> findAppsCount();
     List<AppCount> findAppsCount();
+
+    /**
+     * 更新指定监控的状态
+     * @param id 监控ID
+     * @param status 监控状态
+     */
+    @Query("update Monitor set status = :status where id = :id")
+    void updateMonitorStatus(@Param(value = "id") Long id, @Param(value = "status") byte status);
 }
 }

+ 15 - 1
manager/src/main/java/com/usthe/manager/service/MonitorService.java

@@ -73,7 +73,7 @@ public interface MonitorService {
      * @return MonitorDto
      * @return MonitorDto
      * @throws RuntimeException 查询过程中异常抛出
      * @throws RuntimeException 查询过程中异常抛出
      */
      */
-    MonitorDto getMonitor(long id) throws RuntimeException;
+    MonitorDto getMonitorDto(long id) throws RuntimeException;
 
 
     /**
     /**
      * 动态条件查询
      * 动态条件查询
@@ -100,4 +100,18 @@ public interface MonitorService {
      * @return 监控类别与监控数量映射
      * @return 监控类别与监控数量映射
      */
      */
     List<AppCount> getAllAppMonitorsCount();
     List<AppCount> getAllAppMonitorsCount();
+
+    /**
+     * 查询监控
+     * @param monitorId 监控ID
+     * @return 监控信息
+     */
+    Monitor getMonitor(Long monitorId);
+
+    /**
+     * 更新指定监控的状态
+     * @param monitorId 监控ID
+     * @param status 监控状态
+     */
+    void updateMonitorStatus(Long monitorId, byte status);
 }
 }

+ 11 - 1
manager/src/main/java/com/usthe/manager/service/impl/MonitorServiceImpl.java

@@ -264,7 +264,7 @@ public class MonitorServiceImpl implements MonitorService {
 
 
     @Override
     @Override
     @Transactional(readOnly = true)
     @Transactional(readOnly = true)
-    public MonitorDto getMonitor(long id) throws RuntimeException {
+    public MonitorDto getMonitorDto(long id) throws RuntimeException {
         Optional<Monitor> monitorOptional = monitorDao.findById(id);
         Optional<Monitor> monitorOptional = monitorDao.findById(id);
         if (monitorOptional.isPresent()) {
         if (monitorOptional.isPresent()) {
             Monitor monitor = monitorOptional.get();
             Monitor monitor = monitorOptional.get();
@@ -335,4 +335,14 @@ public class MonitorServiceImpl implements MonitorService {
         return monitorDao.findAppsCount();
         return monitorDao.findAppsCount();
 
 
     }
     }
+
+    @Override
+    public Monitor getMonitor(Long monitorId) {
+        return monitorDao.findById(monitorId).orElse(null);
+    }
+
+    @Override
+    public void updateMonitorStatus(Long monitorId, byte status) {
+        monitorDao.updateMonitorStatus(monitorId, status);
+    }
 }
 }

+ 18 - 2
manager/src/main/resources/db/schema.sql

@@ -78,8 +78,7 @@ CREATE TABLE  alert_define
     expr         varchar(255)     not null comment '告警触发条件表达式',
     expr         varchar(255)     not null comment '告警触发条件表达式',
     priority     tinyint          not null default 0 comment '告警级别 0:高-emergency-紧急告警-红色 1:中-critical-严重告警-橙色 2:低-warning-警告告警-黄色',
     priority     tinyint          not null default 0 comment '告警级别 0:高-emergency-紧急告警-红色 1:中-critical-严重告警-橙色 2:低-warning-警告告警-黄色',
     duration     int              not null comment '触发告警后持续时间,单位s',
     duration     int              not null comment '触发告警后持续时间,单位s',
-    enable       boolean          not null default true comment '告警触发后是否发送',
-    delay        int              not null comment '告警延迟时间,即延迟多久再发送告警,单位s',
+    enable       boolean          not null default true comment '告警阈值开关',
     template     varchar(255)     not null comment '告警通知模板内容',
     template     varchar(255)     not null comment '告警通知模板内容',
     creator      varchar(100)     comment '创建者',
     creator      varchar(100)     comment '创建者',
     modifier     varchar(100)     comment '最新修改者',
     modifier     varchar(100)     comment '最新修改者',
@@ -104,5 +103,22 @@ CREATE TABLE  alert_define_monitor_bind
     index index_bind (alert_define_id, monitor_id)
     index index_bind (alert_define_id, monitor_id)
 ) ENGINE = InnoDB DEFAULT CHARSET=utf8mb4;
 ) ENGINE = InnoDB DEFAULT CHARSET=utf8mb4;
 
 
+-- ----------------------------
+-- Table structure for alert
+-- ----------------------------
+DROP TABLE IF EXISTS  alert ;
+CREATE TABLE  alert
+(
+    id            bigint           not null auto_increment comment '告警ID',
+    monitor_id    bigint           not null comment '告警监控对象ID',
+    monitor_name  varchar(100)     comment '告警监控对象名称',
+    priority      tinyint          not null default 0 comment '告警级别 0:高-emergency-紧急告警-红色 1:中-critical-严重告警-橙色 2:低-warning-警告告警-黄色',
+    status        tinyint          not null default 0 comment '告警状态: 0-待发送 1-已发送 2-已过期(已经超过持续时间)',
+    target        varchar(255)     not null comment '告警目标对象: 监控可用性-available 指标-app.metrics.field',
+    duration      int              not null comment '触发告警后持续时间,单位s',
+    content       varchar(255)     not null comment '告警通知实际内容',
+    gmt_create    timestamp        default current_timestamp comment 'create time',
+    primary key (id)
+) ENGINE = InnoDB DEFAULT CHARSET=utf8mb4;
 
 
 COMMIT;
 COMMIT;