[alerter,manager] 告警信息入库,监控状态变更联动
This commit is contained in:
@@ -68,18 +68,24 @@ public class CalculateAlarm {
|
||||
// 先判断采集响应数据状态 UN_REACHABLE/UN_CONNECTABLE 则需发最高级别告警
|
||||
if (metricsData.getCode() != CollectRep.Code.SUCCESS) {
|
||||
// 采集异常
|
||||
if (metricsData.getCode() == CollectRep.Code.UN_REACHABLE
|
||||
|| metricsData.getCode() == CollectRep.Code.UN_CONNECTABLE) {
|
||||
// 连接型可用性异常 UN_REACHABLE 对端不可达(网络层icmp) UN_CONNECTABLE 对端连接失败(传输层tcp,udp)
|
||||
Alert alert = Alert.builder()
|
||||
.monitorId(monitorId)
|
||||
.priority((byte) 0)
|
||||
.status((byte) 0)
|
||||
.target(CommonConstants.AVAILABLE)
|
||||
.duration(300)
|
||||
.content("监控紧急可用性告警: " + metricsData.getCode().name())
|
||||
.build();
|
||||
dataQueue.addAlertData(alert);
|
||||
Alert.AlertBuilder alertBuilder = Alert.builder()
|
||||
.monitorId(monitorId)
|
||||
.priority((byte) 0)
|
||||
.status((byte) 0)
|
||||
.duration(300);
|
||||
if (metricsData.getCode() == CollectRep.Code.UN_REACHABLE) {
|
||||
// UN_REACHABLE 对端不可达(网络层icmp)
|
||||
alertBuilder.target(CommonConstants.REACHABLE)
|
||||
.content("监控紧急可达性告警: " + metricsData.getCode().name());
|
||||
dataQueue.addAlertData(alertBuilder.build());
|
||||
} else if (metricsData.getCode() == CollectRep.Code.UN_CONNECTABLE) {
|
||||
// UN_CONNECTABLE 对端连接失败(传输层tcp,udp)
|
||||
alertBuilder.target(CommonConstants.AVAILABLE)
|
||||
.content("监控紧急可用性告警: " + metricsData.getCode().name());
|
||||
dataQueue.addAlertData(alertBuilder.build());
|
||||
} else {
|
||||
// todo 其它规范异常 TIMEOUT ...
|
||||
return;
|
||||
}
|
||||
return;
|
||||
}
|
||||
@@ -127,7 +133,7 @@ public class CalculateAlarm {
|
||||
Expression expression = AviatorEvaluator.compile(expr, true);
|
||||
Boolean match = (Boolean) expression.execute(fieldValueMap);
|
||||
if (match) {
|
||||
// 阈值规则匹配,触发告警 todo 告警延迟delay参数实现
|
||||
// 阈值规则匹配,触发告警
|
||||
Alert alert = Alert.builder()
|
||||
.monitorId(monitorId)
|
||||
.priority(define.getPriority())
|
||||
|
||||
14
alerter/src/main/java/com/usthe/alert/dao/AlertDao.java
Normal file
14
alerter/src/main/java/com/usthe/alert/dao/AlertDao.java
Normal file
@@ -0,0 +1,14 @@
|
||||
package com.usthe.alert.dao;
|
||||
|
||||
import com.usthe.alert.pojo.entity.Alert;
|
||||
import org.springframework.data.jpa.repository.JpaRepository;
|
||||
import org.springframework.data.jpa.repository.JpaSpecificationExecutor;
|
||||
|
||||
/**
|
||||
* Alert 数据库操作
|
||||
* @author tom
|
||||
* @date 2021/12/9 10:03
|
||||
*/
|
||||
public interface AlertDao extends JpaRepository<Alert, Long>, JpaSpecificationExecutor<Alert> {
|
||||
|
||||
}
|
||||
@@ -29,7 +29,7 @@ public interface AlertDefineDao extends JpaRepository<AlertDefine, Long>, JpaSpe
|
||||
* @return 告警定义列表
|
||||
*/
|
||||
@Query("select define from AlertDefine define join AlertDefineBind bind on bind.alertDefineId = define.id " +
|
||||
"where bind.monitorId = :monitorId and define.metric = :metrics")
|
||||
"where bind.monitorId = :monitorId and define.metric = :metrics and define.enable = true")
|
||||
List<AlertDefine> queryAlertDefinesByMonitor(@Param(value = "monitorId") Long monitorId,
|
||||
@Param(value = "metrics") String metrics);
|
||||
}
|
||||
|
||||
@@ -61,8 +61,7 @@ public class Alert {
|
||||
|
||||
@ApiModelProperty(value = "告警目标对象: 监控可用性-available 指标-app.metrics.field",
|
||||
example = "1", accessMode = READ_WRITE, position = 4)
|
||||
@Min(0)
|
||||
@Max(2)
|
||||
@Length(max = 255)
|
||||
private String target;
|
||||
|
||||
@ApiModelProperty(value = "触发告警后持续时间,单位s", example = "60", accessMode = READ_WRITE, position = 7)
|
||||
|
||||
@@ -70,13 +70,9 @@ public class AlertDefine {
|
||||
@Min(0)
|
||||
private int duration;
|
||||
|
||||
@ApiModelProperty(value = "告警触发后是否发送", example = "true", accessMode = READ_WRITE, position = 8)
|
||||
@ApiModelProperty(value = "告警阈值开关", example = "true", accessMode = READ_WRITE, position = 8)
|
||||
private boolean enable = true;
|
||||
|
||||
@ApiModelProperty(value = "告警延迟时间,即延迟多久再发送告警,单位s", example = "300", accessMode = READ_WRITE, position = 9)
|
||||
@Min(0)
|
||||
private int delay;
|
||||
|
||||
@ApiModelProperty(value = "告警通知内容", example = "linux {monitor_name}: {monitor_id} cpu usage high",
|
||||
accessMode = READ_WRITE, position = 10)
|
||||
@Length(max = 1024)
|
||||
|
||||
@@ -0,0 +1,29 @@
|
||||
package com.usthe.alert.service;
|
||||
|
||||
import com.usthe.alert.pojo.entity.Alert;
|
||||
import org.springframework.data.domain.Page;
|
||||
import org.springframework.data.domain.PageRequest;
|
||||
import org.springframework.data.jpa.domain.Specification;
|
||||
|
||||
/**
|
||||
* 告警信息管理接口
|
||||
* @author tom
|
||||
* @date 2021/12/9 10:06
|
||||
*/
|
||||
public interface AlertService {
|
||||
|
||||
/**
|
||||
* 新增告警
|
||||
* @param alert 告警实体
|
||||
* @throws RuntimeException 新增过程异常抛出
|
||||
*/
|
||||
void addAlert(Alert alert) throws RuntimeException;
|
||||
|
||||
/**
|
||||
* 动态条件查询
|
||||
* @param specification 查询条件
|
||||
* @param pageRequest 分页参数
|
||||
* @return 查询结果
|
||||
*/
|
||||
Page<Alert> getAlerts(Specification<Alert> specification, PageRequest pageRequest);
|
||||
}
|
||||
@@ -0,0 +1,32 @@
|
||||
package com.usthe.alert.service.impl;
|
||||
|
||||
import com.usthe.alert.dao.AlertDao;
|
||||
import com.usthe.alert.pojo.entity.Alert;
|
||||
import com.usthe.alert.service.AlertService;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.data.domain.Page;
|
||||
import org.springframework.data.domain.PageRequest;
|
||||
import org.springframework.data.jpa.domain.Specification;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
/**
|
||||
* 告警信息服务实现
|
||||
* @author tom
|
||||
* @date 2021/12/10 15:39
|
||||
*/
|
||||
@Service
|
||||
public class AlertServiceImpl implements AlertService {
|
||||
|
||||
@Autowired
|
||||
private AlertDao alertDao;
|
||||
|
||||
@Override
|
||||
public void addAlert(Alert alert) throws RuntimeException {
|
||||
alertDao.save(alert);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Page<Alert> getAlerts(Specification<Alert> specification, PageRequest pageRequest) {
|
||||
return alertDao.findAll(specification, pageRequest);
|
||||
}
|
||||
}
|
||||
@@ -1,9 +1,10 @@
|
||||
org.springframework.boot.autoconfigure.EnableAutoConfiguration=\
|
||||
com.usthe.alert.service.impl.AlertDefineServiceImpl,\
|
||||
com.usthe.alert.service.impl.AlertServiceImpl,\
|
||||
com.usthe.alert.controller.AlertDefineController,\
|
||||
com.usthe.alert.AlerterWorkerPool,\
|
||||
com.usthe.alert.AlerterProperties,\
|
||||
com.usthe.alert.AlerterDataQueue,\
|
||||
com.usthe.alert.entrance.KafkaDataConsume,\
|
||||
com.usthe.alert.AlerterConfiguration,\
|
||||
com.usthe.alert.entrance.KafkaDataConsume,\
|
||||
com.usthe.alert.calculate.CalculateAlarm
|
||||
@@ -87,4 +87,9 @@ public interface CommonConstants {
|
||||
* 可用性对象
|
||||
*/
|
||||
String AVAILABLE = "available";
|
||||
|
||||
/**
|
||||
* 可达性对象
|
||||
*/
|
||||
String REACHABLE = "reachable";
|
||||
}
|
||||
|
||||
@@ -0,0 +1,89 @@
|
||||
package com.usthe.manager.component.alerter;
|
||||
|
||||
import com.usthe.alert.AlerterDataQueue;
|
||||
import com.usthe.alert.AlerterWorkerPool;
|
||||
import com.usthe.alert.pojo.entity.Alert;
|
||||
import com.usthe.alert.service.AlertService;
|
||||
import com.usthe.common.util.CommonConstants;
|
||||
import com.usthe.manager.pojo.entity.Monitor;
|
||||
import com.usthe.manager.service.MonitorService;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
/**
|
||||
* 告警信息入库分发
|
||||
* @author tom
|
||||
* @date 2021/12/10 12:58
|
||||
*/
|
||||
@Component
|
||||
@Slf4j
|
||||
public class DispatchAlarm {
|
||||
|
||||
private AlerterWorkerPool workerPool;
|
||||
private AlerterDataQueue dataQueue;
|
||||
private AlertService alertService;
|
||||
private MonitorService monitorService;
|
||||
|
||||
public DispatchAlarm(AlerterWorkerPool workerPool, AlerterDataQueue dataQueue,
|
||||
AlertService alertService, MonitorService monitorService) {
|
||||
this.workerPool = workerPool;
|
||||
this.dataQueue = dataQueue;
|
||||
this.alertService = alertService;
|
||||
this.monitorService = monitorService;
|
||||
startDispatch();
|
||||
}
|
||||
|
||||
private void startDispatch() {
|
||||
Runnable runnable = () -> {
|
||||
while (!Thread.currentThread().isInterrupted()) {
|
||||
try {
|
||||
Alert alert = dataQueue.pollAlertData();
|
||||
if (alert != null) {
|
||||
// 判断告警类型入库
|
||||
storeAlertData(alert);
|
||||
// 通知分发
|
||||
sendAlertDataListener(alert);
|
||||
}
|
||||
} catch (InterruptedException e) {
|
||||
log.error(e.getMessage());
|
||||
}
|
||||
}
|
||||
};
|
||||
workerPool.executeJob(runnable);
|
||||
workerPool.executeJob(runnable);
|
||||
workerPool.executeJob(runnable);
|
||||
}
|
||||
|
||||
private void storeAlertData(Alert alert) {
|
||||
// todo 过滤重复告警 使用 告警持续时间参数-duration 这个时间段的相同重复告警应该被过滤
|
||||
// todo 使用缓存不直接操作库
|
||||
Monitor monitor = monitorService.getMonitor(alert.getMonitorId());
|
||||
if (monitor == null) {
|
||||
log.warn("Dispatch alarm the monitorId: {} not existed, ignored.", alert.getMonitorId());
|
||||
return;
|
||||
}
|
||||
alert.setMonitorName(monitor.getName());
|
||||
if (monitor.getStatus() == CommonConstants.UN_MANAGE_CODE) {
|
||||
// 当监控未管理时 忽略静默其告警信息
|
||||
return;
|
||||
}
|
||||
if (monitor.getStatus() != CommonConstants.UN_AVAILABLE_CODE
|
||||
&& monitor.getStatus() != CommonConstants.UN_REACHABLE_CODE) {
|
||||
if (CommonConstants.AVAILABLE.equals(alert.getTarget())) {
|
||||
// 可用性告警 需变更监控状态为不可用
|
||||
monitorService.updateMonitorStatus(monitor.getId(), CommonConstants.UN_AVAILABLE_CODE);
|
||||
} else if (CommonConstants.REACHABLE.equals(alert.getTarget())) {
|
||||
// 可达性告警 需变更监控状态为不可达
|
||||
monitorService.updateMonitorStatus(monitor.getId(), CommonConstants.UN_REACHABLE_CODE);
|
||||
}
|
||||
}
|
||||
// 告警落库
|
||||
alertService.addAlert(alert);
|
||||
}
|
||||
|
||||
private void sendAlertDataListener(Alert alert) {
|
||||
// todo 转发配置的邮件 微信 webhook
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
@@ -66,7 +66,7 @@ public class MonitorController {
|
||||
public ResponseEntity<Message<MonitorDto>> getMonitor(
|
||||
@ApiParam(value = "监控ID", example = "6565463543") @PathVariable("id") long id) {
|
||||
// 获取监控信息
|
||||
MonitorDto monitorDto = monitorService.getMonitor(id);
|
||||
MonitorDto monitorDto = monitorService.getMonitorDto(id);
|
||||
Message.MessageBuilder<MonitorDto> messageBuilder = Message.builder();
|
||||
if (monitorDto == null) {
|
||||
messageBuilder.code(MONITOR_NOT_EXIST_CODE).msg("Monitor not exist.");
|
||||
|
||||
@@ -5,6 +5,7 @@ import com.usthe.manager.pojo.entity.Monitor;
|
||||
import org.springframework.data.jpa.repository.JpaRepository;
|
||||
import org.springframework.data.jpa.repository.JpaSpecificationExecutor;
|
||||
import org.springframework.data.jpa.repository.Query;
|
||||
import org.springframework.data.repository.query.Param;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
@@ -36,4 +37,12 @@ public interface MonitorDao extends JpaRepository<Monitor, Long>, JpaSpecificati
|
||||
*/
|
||||
@Query("select new com.usthe.manager.pojo.dto.AppCount(mo.app, COUNT(mo.id)) from Monitor mo group by mo.app")
|
||||
List<AppCount> findAppsCount();
|
||||
|
||||
/**
|
||||
* 更新指定监控的状态
|
||||
* @param id 监控ID
|
||||
* @param status 监控状态
|
||||
*/
|
||||
@Query("update Monitor set status = :status where id = :id")
|
||||
void updateMonitorStatus(@Param(value = "id") Long id, @Param(value = "status") byte status);
|
||||
}
|
||||
|
||||
@@ -73,7 +73,7 @@ public interface MonitorService {
|
||||
* @return MonitorDto
|
||||
* @throws RuntimeException 查询过程中异常抛出
|
||||
*/
|
||||
MonitorDto getMonitor(long id) throws RuntimeException;
|
||||
MonitorDto getMonitorDto(long id) throws RuntimeException;
|
||||
|
||||
/**
|
||||
* 动态条件查询
|
||||
@@ -100,4 +100,18 @@ public interface MonitorService {
|
||||
* @return 监控类别与监控数量映射
|
||||
*/
|
||||
List<AppCount> getAllAppMonitorsCount();
|
||||
|
||||
/**
|
||||
* 查询监控
|
||||
* @param monitorId 监控ID
|
||||
* @return 监控信息
|
||||
*/
|
||||
Monitor getMonitor(Long monitorId);
|
||||
|
||||
/**
|
||||
* 更新指定监控的状态
|
||||
* @param monitorId 监控ID
|
||||
* @param status 监控状态
|
||||
*/
|
||||
void updateMonitorStatus(Long monitorId, byte status);
|
||||
}
|
||||
|
||||
@@ -264,7 +264,7 @@ public class MonitorServiceImpl implements MonitorService {
|
||||
|
||||
@Override
|
||||
@Transactional(readOnly = true)
|
||||
public MonitorDto getMonitor(long id) throws RuntimeException {
|
||||
public MonitorDto getMonitorDto(long id) throws RuntimeException {
|
||||
Optional<Monitor> monitorOptional = monitorDao.findById(id);
|
||||
if (monitorOptional.isPresent()) {
|
||||
Monitor monitor = monitorOptional.get();
|
||||
@@ -335,4 +335,14 @@ public class MonitorServiceImpl implements MonitorService {
|
||||
return monitorDao.findAppsCount();
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public Monitor getMonitor(Long monitorId) {
|
||||
return monitorDao.findById(monitorId).orElse(null);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void updateMonitorStatus(Long monitorId, byte status) {
|
||||
monitorDao.updateMonitorStatus(monitorId, status);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -78,8 +78,7 @@ CREATE TABLE alert_define
|
||||
expr varchar(255) not null comment '告警触发条件表达式',
|
||||
priority tinyint not null default 0 comment '告警级别 0:高-emergency-紧急告警-红色 1:中-critical-严重告警-橙色 2:低-warning-警告告警-黄色',
|
||||
duration int not null comment '触发告警后持续时间,单位s',
|
||||
enable boolean not null default true comment '告警触发后是否发送',
|
||||
delay int not null comment '告警延迟时间,即延迟多久再发送告警,单位s',
|
||||
enable boolean not null default true comment '告警阈值开关',
|
||||
template varchar(255) not null comment '告警通知模板内容',
|
||||
creator varchar(100) comment '创建者',
|
||||
modifier varchar(100) comment '最新修改者',
|
||||
@@ -104,5 +103,22 @@ CREATE TABLE alert_define_monitor_bind
|
||||
index index_bind (alert_define_id, monitor_id)
|
||||
) ENGINE = InnoDB DEFAULT CHARSET=utf8mb4;
|
||||
|
||||
-- ----------------------------
|
||||
-- Table structure for alert
|
||||
-- ----------------------------
|
||||
DROP TABLE IF EXISTS alert ;
|
||||
CREATE TABLE alert
|
||||
(
|
||||
id bigint not null auto_increment comment '告警ID',
|
||||
monitor_id bigint not null comment '告警监控对象ID',
|
||||
monitor_name varchar(100) comment '告警监控对象名称',
|
||||
priority tinyint not null default 0 comment '告警级别 0:高-emergency-紧急告警-红色 1:中-critical-严重告警-橙色 2:低-warning-警告告警-黄色',
|
||||
status tinyint not null default 0 comment '告警状态: 0-待发送 1-已发送 2-已过期(已经超过持续时间)',
|
||||
target varchar(255) not null comment '告警目标对象: 监控可用性-available 指标-app.metrics.field',
|
||||
duration int not null comment '触发告警后持续时间,单位s',
|
||||
content varchar(255) not null comment '告警通知实际内容',
|
||||
gmt_create timestamp default current_timestamp comment 'create time',
|
||||
primary key (id)
|
||||
) ENGINE = InnoDB DEFAULT CHARSET=utf8mb4;
|
||||
|
||||
COMMIT;
|
||||
Reference in New Issue
Block a user