From 51266aab874955c6d851ed11dd3c8489e7d1696a Mon Sep 17 00:00:00 2001 From: tomsun28 Date: Tue, 5 Apr 2022 20:50:33 +0800 Subject: [PATCH] [collector,alerter]bugfix:monitors always timeout alert (#67) --- .../com/usthe/alert/calculate/CalculateAlarm.java | 2 +- .../usthe/collector/dispatch/CommonDispatcher.java | 14 +++++++++----- .../usthe/collector/dispatch/MetricsCollect.java | 14 +++++++++++--- 3 files changed, 21 insertions(+), 9 deletions(-) diff --git a/alerter/src/main/java/com/usthe/alert/calculate/CalculateAlarm.java b/alerter/src/main/java/com/usthe/alert/calculate/CalculateAlarm.java index 7b8ee72..4e7ecb7 100644 --- a/alerter/src/main/java/com/usthe/alert/calculate/CalculateAlarm.java +++ b/alerter/src/main/java/com/usthe/alert/calculate/CalculateAlarm.java @@ -110,7 +110,7 @@ public class CalculateAlarm { } else { // 其他异常 alertBuilder.target(CommonConstants.AVAILABLE) - .content("监控紧急可用性告警: " + metricsData.getCode().name()); + .content("监控可用性告警: " + metricsData.getCode().name() + " : " + metricsData.getMsg()); triggeredMonitorStateAlertMap.put(monitorId, metricsData.getCode()); dataQueue.addAlertData(alertBuilder.build()); } diff --git a/collector/src/main/java/com/usthe/collector/dispatch/CommonDispatcher.java b/collector/src/main/java/com/usthe/collector/dispatch/CommonDispatcher.java index 835c934..989920f 100644 --- a/collector/src/main/java/com/usthe/collector/dispatch/CommonDispatcher.java +++ b/collector/src/main/java/com/usthe/collector/dispatch/CommonDispatcher.java @@ -105,9 +105,13 @@ public class CommonDispatcher implements MetricsTaskDispatch, CollectDataDispatc .setId(timerJob.getJob().getMonitorId()) .setApp(timerJob.getJob().getApp()) .setMetrics(metricsTime.getMetrics().getName()) + .setPriority(metricsTime.getMetrics().getPriority()) .setTime(System.currentTimeMillis()) .setCode(CollectRep.Code.TIMEOUT).setMsg("collect timeout").build(); - dispatchCollectData(metricsTime.timeout, metricsTime.getMetrics(), metricsData); + log.error("[Collect Timeout]: \n{}", metricsData); + if (metricsData.getPriority() == 0) { + dispatchCollectData(metricsTime.timeout, metricsTime.getMetrics(), metricsData); + } metricsTimeoutMonitorMap.remove(entry.getKey()); } } @@ -165,8 +169,8 @@ public class CommonDispatcher implements MetricsTaskDispatch, CollectDataDispatc metricsSet.forEach(metricItem -> { MetricsCollect metricsCollect = new MetricsCollect(metricItem, timeout, this); jobRequestQueue.addJob(metricsCollect); - metricsTimeoutMonitorMap.put(job.getId() + "-" + metrics.getName(), - new MetricsTime(System.currentTimeMillis(), metrics, timeout)); + metricsTimeoutMonitorMap.put(job.getId() + "-" + metricItem.getName(), + new MetricsTime(System.currentTimeMillis(), metricItem, timeout)); }); } else { // 当前执行级别的指标组列表未全执行完成, @@ -185,8 +189,8 @@ public class CommonDispatcher implements MetricsTaskDispatch, CollectDataDispatc metricsSet.forEach(metricItem -> { MetricsCollect metricsCollect = new MetricsCollect(metricItem, timeout, this); jobRequestQueue.addJob(metricsCollect); - metricsTimeoutMonitorMap.put(job.getId() + "-" + metrics.getName(), - new MetricsTime(System.currentTimeMillis(), metrics, timeout)); + metricsTimeoutMonitorMap.put(job.getId() + "-" + metricItem.getName(), + new MetricsTime(System.currentTimeMillis(), metricItem, timeout)); }); } else { // 当前执行级别的指标组列表未全执行完成, diff --git a/collector/src/main/java/com/usthe/collector/dispatch/MetricsCollect.java b/collector/src/main/java/com/usthe/collector/dispatch/MetricsCollect.java index 93ad92f..6407da7 100644 --- a/collector/src/main/java/com/usthe/collector/dispatch/MetricsCollect.java +++ b/collector/src/main/java/com/usthe/collector/dispatch/MetricsCollect.java @@ -34,6 +34,10 @@ import java.util.stream.Collectors; @Slf4j @Data public class MetricsCollect implements Runnable, Comparable { + /** + * 调度告警阈值时间 100ms + */ + private static final long WARN_DISPATCH_TIME = 100; /** * 监控ID */ @@ -267,11 +271,15 @@ public class MetricsCollect implements Runnable, Comparable { private CollectRep.MetricsData validateResponse(CollectRep.MetricsData.Builder builder) { long endTime = System.currentTimeMillis(); builder.setTime(endTime); - log.debug("[Collect]: newTime: {}, startTime: {}, spendTime: {}.", newTime, startTime, endTime - startTime); + long runningTime = endTime - startTime; + long allTime = endTime - newTime; + if (startTime - newTime >= WARN_DISPATCH_TIME) { + log.warn("[Collector Dispatch Warn, Dispatch Use {}ms.", startTime - newTime); + } if (builder.getCode() != CollectRep.Code.SUCCESS) { - log.info("[Collect Fail] Reason: {}", builder.getMsg()); + log.info("[Collect Failed, Run {}ms, All {}ms] Reason: {}", runningTime, allTime, builder.getMsg()); } else { - log.info("[Collect Success]."); + log.info("[Collect Success, Run {}ms, All {}ms].", runningTime, allTime); } return builder.build(); }