ソースを参照

[alerter,manager] 告警信息入库,监控状态变更联动

tomsun28 4 年 前
コミット
ab2d4511ec

+ 19 - 13
alerter/src/main/java/com/usthe/alert/calculate/CalculateAlarm.java

@@ -68,18 +68,24 @@ public class CalculateAlarm {
         // 先判断采集响应数据状态 UN_REACHABLE/UN_CONNECTABLE 则需发最高级别告警
         if (metricsData.getCode() != CollectRep.Code.SUCCESS) {
             // 采集异常
-            if (metricsData.getCode() == CollectRep.Code.UN_REACHABLE
-                    || metricsData.getCode() == CollectRep.Code.UN_CONNECTABLE) {
-                // 连接型可用性异常 UN_REACHABLE 对端不可达(网络层icmp) UN_CONNECTABLE 对端连接失败(传输层tcp,udp)
-                Alert alert = Alert.builder()
-                        .monitorId(monitorId)
-                        .priority((byte) 0)
-                        .status((byte) 0)
-                        .target(CommonConstants.AVAILABLE)
-                        .duration(300)
-                        .content("监控紧急可用性告警: " + metricsData.getCode().name())
-                        .build();
-                dataQueue.addAlertData(alert);
+            Alert.AlertBuilder alertBuilder = Alert.builder()
+                    .monitorId(monitorId)
+                    .priority((byte) 0)
+                    .status((byte) 0)
+                    .duration(300);
+            if (metricsData.getCode() == CollectRep.Code.UN_REACHABLE) {
+                // UN_REACHABLE 对端不可达(网络层icmp)
+                alertBuilder.target(CommonConstants.REACHABLE)
+                        .content("监控紧急可达性告警: " + metricsData.getCode().name());
+                dataQueue.addAlertData(alertBuilder.build());
+            } else if (metricsData.getCode() == CollectRep.Code.UN_CONNECTABLE) {
+                // UN_CONNECTABLE 对端连接失败(传输层tcp,udp)
+                alertBuilder.target(CommonConstants.AVAILABLE)
+                        .content("监控紧急可用性告警: " + metricsData.getCode().name());
+                dataQueue.addAlertData(alertBuilder.build());
+            } else {
+                // todo 其它规范异常 TIMEOUT ...
+                return;
             }
             return;
         }
@@ -127,7 +133,7 @@ public class CalculateAlarm {
                             Expression expression = AviatorEvaluator.compile(expr, true);
                             Boolean match = (Boolean) expression.execute(fieldValueMap);
                             if (match) {
-                                // 阈值规则匹配,触发告警 todo 告警延迟delay参数实现
+                                // 阈值规则匹配,触发告警
                                 Alert alert = Alert.builder()
                                         .monitorId(monitorId)
                                         .priority(define.getPriority())

+ 14 - 0
alerter/src/main/java/com/usthe/alert/dao/AlertDao.java

@@ -0,0 +1,14 @@
+package com.usthe.alert.dao;
+
+import com.usthe.alert.pojo.entity.Alert;
+import org.springframework.data.jpa.repository.JpaRepository;
+import org.springframework.data.jpa.repository.JpaSpecificationExecutor;
+
+/**
+ * Alert 数据库操作
+ * @author tom
+ * @date 2021/12/9 10:03
+ */
+public interface AlertDao extends JpaRepository<Alert, Long>, JpaSpecificationExecutor<Alert> {
+
+}

+ 1 - 1
alerter/src/main/java/com/usthe/alert/dao/AlertDefineDao.java

@@ -29,7 +29,7 @@ public interface AlertDefineDao extends JpaRepository<AlertDefine, Long>, JpaSpe
      * @return 告警定义列表
      */
     @Query("select define from AlertDefine define join AlertDefineBind bind on bind.alertDefineId = define.id " +
-            "where bind.monitorId = :monitorId and define.metric = :metrics")
+            "where bind.monitorId = :monitorId and define.metric = :metrics and define.enable = true")
     List<AlertDefine> queryAlertDefinesByMonitor(@Param(value = "monitorId") Long monitorId,
                                                  @Param(value = "metrics") String metrics);
 }

+ 1 - 2
alerter/src/main/java/com/usthe/alert/pojo/entity/Alert.java

@@ -61,8 +61,7 @@ public class Alert {
 
     @ApiModelProperty(value = "告警目标对象: 监控可用性-available 指标-app.metrics.field",
             example = "1", accessMode = READ_WRITE, position = 4)
-    @Min(0)
-    @Max(2)
+    @Length(max = 255)
     private String target;
 
     @ApiModelProperty(value = "触发告警后持续时间,单位s", example = "60", accessMode = READ_WRITE, position = 7)

+ 1 - 5
alerter/src/main/java/com/usthe/alert/pojo/entity/AlertDefine.java

@@ -70,13 +70,9 @@ public class AlertDefine {
     @Min(0)
     private int duration;
 
-    @ApiModelProperty(value = "告警触发后是否发送", example = "true", accessMode = READ_WRITE, position = 8)
+    @ApiModelProperty(value = "告警阈值开关", example = "true", accessMode = READ_WRITE, position = 8)
     private boolean enable = true;
 
-    @ApiModelProperty(value = "告警延迟时间,即延迟多久再发送告警,单位s", example = "300", accessMode = READ_WRITE, position = 9)
-    @Min(0)
-    private int delay;
-
     @ApiModelProperty(value = "告警通知内容", example = "linux {monitor_name}: {monitor_id} cpu usage high",
             accessMode = READ_WRITE, position = 10)
     @Length(max = 1024)

+ 29 - 0
alerter/src/main/java/com/usthe/alert/service/AlertService.java

@@ -0,0 +1,29 @@
+package com.usthe.alert.service;
+
+import com.usthe.alert.pojo.entity.Alert;
+import org.springframework.data.domain.Page;
+import org.springframework.data.domain.PageRequest;
+import org.springframework.data.jpa.domain.Specification;
+
+/**
+ * 告警信息管理接口
+ * @author tom
+ * @date 2021/12/9 10:06
+ */
+public interface AlertService {
+
+    /**
+     * 新增告警
+     * @param alert 告警实体
+     * @throws RuntimeException 新增过程异常抛出
+     */
+    void addAlert(Alert alert) throws RuntimeException;
+
+    /**
+     * 动态条件查询
+     * @param specification 查询条件
+     * @param pageRequest 分页参数
+     * @return 查询结果
+     */
+    Page<Alert> getAlerts(Specification<Alert> specification, PageRequest pageRequest);
+}

+ 32 - 0
alerter/src/main/java/com/usthe/alert/service/impl/AlertServiceImpl.java

@@ -0,0 +1,32 @@
+package com.usthe.alert.service.impl;
+
+import com.usthe.alert.dao.AlertDao;
+import com.usthe.alert.pojo.entity.Alert;
+import com.usthe.alert.service.AlertService;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.data.domain.Page;
+import org.springframework.data.domain.PageRequest;
+import org.springframework.data.jpa.domain.Specification;
+import org.springframework.stereotype.Service;
+
+/**
+ * 告警信息服务实现
+ * @author tom
+ * @date 2021/12/10 15:39
+ */
+@Service
+public class AlertServiceImpl implements AlertService {
+
+    @Autowired
+    private AlertDao alertDao;
+
+    @Override
+    public void addAlert(Alert alert) throws RuntimeException {
+        alertDao.save(alert);
+    }
+
+    @Override
+    public Page<Alert> getAlerts(Specification<Alert> specification, PageRequest pageRequest) {
+        return alertDao.findAll(specification, pageRequest);
+    }
+}

+ 2 - 1
alerter/src/main/resources/META-INF/spring.factories

@@ -1,9 +1,10 @@
 org.springframework.boot.autoconfigure.EnableAutoConfiguration=\
 com.usthe.alert.service.impl.AlertDefineServiceImpl,\
+com.usthe.alert.service.impl.AlertServiceImpl,\
 com.usthe.alert.controller.AlertDefineController,\
 com.usthe.alert.AlerterWorkerPool,\
 com.usthe.alert.AlerterProperties,\
 com.usthe.alert.AlerterDataQueue,\
-com.usthe.alert.entrance.KafkaDataConsume,\
 com.usthe.alert.AlerterConfiguration,\
+com.usthe.alert.entrance.KafkaDataConsume,\
 com.usthe.alert.calculate.CalculateAlarm

+ 5 - 0
common/src/main/java/com/usthe/common/util/CommonConstants.java

@@ -87,4 +87,9 @@ public interface CommonConstants {
      * 可用性对象
      */
     String AVAILABLE = "available";
+
+    /**
+     * 可达性对象
+     */
+    String REACHABLE = "reachable";
 }

+ 89 - 0
manager/src/main/java/com/usthe/manager/component/alerter/DispatchAlarm.java

@@ -0,0 +1,89 @@
+package com.usthe.manager.component.alerter;
+
+import com.usthe.alert.AlerterDataQueue;
+import com.usthe.alert.AlerterWorkerPool;
+import com.usthe.alert.pojo.entity.Alert;
+import com.usthe.alert.service.AlertService;
+import com.usthe.common.util.CommonConstants;
+import com.usthe.manager.pojo.entity.Monitor;
+import com.usthe.manager.service.MonitorService;
+import lombok.extern.slf4j.Slf4j;
+import org.springframework.stereotype.Component;
+
+/**
+ * 告警信息入库分发
+ * @author tom
+ * @date 2021/12/10 12:58
+ */
+@Component
+@Slf4j
+public class DispatchAlarm {
+
+    private AlerterWorkerPool workerPool;
+    private AlerterDataQueue dataQueue;
+    private AlertService alertService;
+    private MonitorService monitorService;
+
+    public DispatchAlarm(AlerterWorkerPool workerPool, AlerterDataQueue dataQueue,
+                         AlertService alertService, MonitorService monitorService) {
+        this.workerPool = workerPool;
+        this.dataQueue = dataQueue;
+        this.alertService = alertService;
+        this.monitorService = monitorService;
+        startDispatch();
+    }
+
+    private void startDispatch() {
+        Runnable runnable = () -> {
+            while (!Thread.currentThread().isInterrupted()) {
+                try {
+                    Alert alert = dataQueue.pollAlertData();
+                    if (alert != null) {
+                        // 判断告警类型入库
+                        storeAlertData(alert);
+                        // 通知分发
+                        sendAlertDataListener(alert);
+                    }
+                } catch (InterruptedException e) {
+                    log.error(e.getMessage());
+                }
+            }
+        };
+        workerPool.executeJob(runnable);
+        workerPool.executeJob(runnable);
+        workerPool.executeJob(runnable);
+    }
+
+    private void storeAlertData(Alert alert) {
+        // todo 过滤重复告警 使用 告警持续时间参数-duration 这个时间段的相同重复告警应该被过滤
+        // todo 使用缓存不直接操作库
+        Monitor monitor = monitorService.getMonitor(alert.getMonitorId());
+        if (monitor == null) {
+            log.warn("Dispatch alarm the monitorId: {} not existed, ignored.", alert.getMonitorId());
+            return;
+        }
+        alert.setMonitorName(monitor.getName());
+        if (monitor.getStatus() == CommonConstants.UN_MANAGE_CODE) {
+            // 当监控未管理时  忽略静默其告警信息
+            return;
+        }
+        if (monitor.getStatus() != CommonConstants.UN_AVAILABLE_CODE
+                && monitor.getStatus() != CommonConstants.UN_REACHABLE_CODE) {
+            if (CommonConstants.AVAILABLE.equals(alert.getTarget())) {
+                // 可用性告警 需变更监控状态为不可用
+                monitorService.updateMonitorStatus(monitor.getId(), CommonConstants.UN_AVAILABLE_CODE);
+            } else if (CommonConstants.REACHABLE.equals(alert.getTarget())) {
+                // 可达性告警 需变更监控状态为不可达
+                monitorService.updateMonitorStatus(monitor.getId(), CommonConstants.UN_REACHABLE_CODE);
+            }
+        }
+        // 告警落库
+        alertService.addAlert(alert);
+    }
+
+    private void sendAlertDataListener(Alert alert) {
+        // todo 转发配置的邮件 微信 webhook
+    }
+
+
+}

+ 1 - 1
manager/src/main/java/com/usthe/manager/controller/MonitorController.java

@@ -66,7 +66,7 @@ public class MonitorController {
     public ResponseEntity<Message<MonitorDto>> getMonitor(
             @ApiParam(value = "监控ID", example = "6565463543") @PathVariable("id") long id) {
         // 获取监控信息
-        MonitorDto monitorDto = monitorService.getMonitor(id);
+        MonitorDto monitorDto = monitorService.getMonitorDto(id);
         Message.MessageBuilder<MonitorDto> messageBuilder = Message.builder();
         if (monitorDto == null) {
             messageBuilder.code(MONITOR_NOT_EXIST_CODE).msg("Monitor not exist.");

+ 9 - 0
manager/src/main/java/com/usthe/manager/dao/MonitorDao.java

@@ -5,6 +5,7 @@ import com.usthe.manager.pojo.entity.Monitor;
 import org.springframework.data.jpa.repository.JpaRepository;
 import org.springframework.data.jpa.repository.JpaSpecificationExecutor;
 import org.springframework.data.jpa.repository.Query;
+import org.springframework.data.repository.query.Param;
 
 import java.util.List;
 import java.util.Set;
@@ -36,4 +37,12 @@ public interface MonitorDao extends JpaRepository<Monitor, Long>, JpaSpecificati
      */
     @Query("select new com.usthe.manager.pojo.dto.AppCount(mo.app, COUNT(mo.id)) from Monitor mo group by mo.app")
     List<AppCount> findAppsCount();
+
+    /**
+     * 更新指定监控的状态
+     * @param id 监控ID
+     * @param status 监控状态
+     */
+    @Query("update Monitor set status = :status where id = :id")
+    void updateMonitorStatus(@Param(value = "id") Long id, @Param(value = "status") byte status);
 }

+ 15 - 1
manager/src/main/java/com/usthe/manager/service/MonitorService.java

@@ -73,7 +73,7 @@ public interface MonitorService {
      * @return MonitorDto
      * @throws RuntimeException 查询过程中异常抛出
      */
-    MonitorDto getMonitor(long id) throws RuntimeException;
+    MonitorDto getMonitorDto(long id) throws RuntimeException;
 
     /**
      * 动态条件查询
@@ -100,4 +100,18 @@ public interface MonitorService {
      * @return 监控类别与监控数量映射
      */
     List<AppCount> getAllAppMonitorsCount();
+
+    /**
+     * 查询监控
+     * @param monitorId 监控ID
+     * @return 监控信息
+     */
+    Monitor getMonitor(Long monitorId);
+
+    /**
+     * 更新指定监控的状态
+     * @param monitorId 监控ID
+     * @param status 监控状态
+     */
+    void updateMonitorStatus(Long monitorId, byte status);
 }

+ 11 - 1
manager/src/main/java/com/usthe/manager/service/impl/MonitorServiceImpl.java

@@ -264,7 +264,7 @@ public class MonitorServiceImpl implements MonitorService {
 
     @Override
     @Transactional(readOnly = true)
-    public MonitorDto getMonitor(long id) throws RuntimeException {
+    public MonitorDto getMonitorDto(long id) throws RuntimeException {
         Optional<Monitor> monitorOptional = monitorDao.findById(id);
         if (monitorOptional.isPresent()) {
             Monitor monitor = monitorOptional.get();
@@ -335,4 +335,14 @@ public class MonitorServiceImpl implements MonitorService {
         return monitorDao.findAppsCount();
 
     }
+
+    @Override
+    public Monitor getMonitor(Long monitorId) {
+        return monitorDao.findById(monitorId).orElse(null);
+    }
+
+    @Override
+    public void updateMonitorStatus(Long monitorId, byte status) {
+        monitorDao.updateMonitorStatus(monitorId, status);
+    }
 }

+ 18 - 2
manager/src/main/resources/db/schema.sql

@@ -78,8 +78,7 @@ CREATE TABLE  alert_define
     expr         varchar(255)     not null comment '告警触发条件表达式',
     priority     tinyint          not null default 0 comment '告警级别 0:高-emergency-紧急告警-红色 1:中-critical-严重告警-橙色 2:低-warning-警告告警-黄色',
     duration     int              not null comment '触发告警后持续时间,单位s',
-    enable       boolean          not null default true comment '告警触发后是否发送',
-    delay        int              not null comment '告警延迟时间,即延迟多久再发送告警,单位s',
+    enable       boolean          not null default true comment '告警阈值开关',
     template     varchar(255)     not null comment '告警通知模板内容',
     creator      varchar(100)     comment '创建者',
     modifier     varchar(100)     comment '最新修改者',
@@ -104,5 +103,22 @@ CREATE TABLE  alert_define_monitor_bind
     index index_bind (alert_define_id, monitor_id)
 ) ENGINE = InnoDB DEFAULT CHARSET=utf8mb4;
 
+-- ----------------------------
+-- Table structure for alert
+-- ----------------------------
+DROP TABLE IF EXISTS  alert ;
+CREATE TABLE  alert
+(
+    id            bigint           not null auto_increment comment '告警ID',
+    monitor_id    bigint           not null comment '告警监控对象ID',
+    monitor_name  varchar(100)     comment '告警监控对象名称',
+    priority      tinyint          not null default 0 comment '告警级别 0:高-emergency-紧急告警-红色 1:中-critical-严重告警-橙色 2:低-warning-警告告警-黄色',
+    status        tinyint          not null default 0 comment '告警状态: 0-待发送 1-已发送 2-已过期(已经超过持续时间)',
+    target        varchar(255)     not null comment '告警目标对象: 监控可用性-available 指标-app.metrics.field',
+    duration      int              not null comment '触发告警后持续时间,单位s',
+    content       varchar(255)     not null comment '告警通知实际内容',
+    gmt_create    timestamp        default current_timestamp comment 'create time',
+    primary key (id)
+) ENGINE = InnoDB DEFAULT CHARSET=utf8mb4;
 
 COMMIT;