Przeglądaj źródła

[collector,alerter] bugfix: monitors always raise timeout alerts (#67)

tomsun28 3 lata temu
rodzic
commit
51266aab87

+ 1 - 1
alerter/src/main/java/com/usthe/alert/calculate/CalculateAlarm.java

@@ -110,7 +110,7 @@ public class CalculateAlarm {
                 } else {
                     // 其他异常
                     alertBuilder.target(CommonConstants.AVAILABLE)
-                            .content("监控紧急可用性告警: " + metricsData.getCode().name());
+                            .content("监控可用性告警: " + metricsData.getCode().name() + " : " + metricsData.getMsg());
                     triggeredMonitorStateAlertMap.put(monitorId, metricsData.getCode());
                     dataQueue.addAlertData(alertBuilder.build());
                 }

+ 9 - 5
collector/src/main/java/com/usthe/collector/dispatch/CommonDispatcher.java

@@ -105,9 +105,13 @@ public class CommonDispatcher implements MetricsTaskDispatch, CollectDataDispatc
                                     .setId(timerJob.getJob().getMonitorId())
                                     .setApp(timerJob.getJob().getApp())
                                     .setMetrics(metricsTime.getMetrics().getName())
+                                    .setPriority(metricsTime.getMetrics().getPriority())
                                     .setTime(System.currentTimeMillis())
                                     .setCode(CollectRep.Code.TIMEOUT).setMsg("collect timeout").build();
-                            dispatchCollectData(metricsTime.timeout, metricsTime.getMetrics(), metricsData);
+                            log.error("[Collect Timeout]: \n{}", metricsData);
+                            if (metricsData.getPriority() == 0) {
+                                dispatchCollectData(metricsTime.timeout, metricsTime.getMetrics(), metricsData);
+                            }
                             metricsTimeoutMonitorMap.remove(entry.getKey());
                         }
                     }
@@ -165,8 +169,8 @@ public class CommonDispatcher implements MetricsTaskDispatch, CollectDataDispatc
                 metricsSet.forEach(metricItem -> {
                     MetricsCollect metricsCollect = new MetricsCollect(metricItem, timeout, this);
                     jobRequestQueue.addJob(metricsCollect);
-                    metricsTimeoutMonitorMap.put(job.getId() + "-" + metrics.getName(),
-                            new MetricsTime(System.currentTimeMillis(), metrics, timeout));
+                    metricsTimeoutMonitorMap.put(job.getId() + "-" + metricItem.getName(),
+                            new MetricsTime(System.currentTimeMillis(), metricItem, timeout));
                 });
             } else {
                 // 当前执行级别的指标组列表未全执行完成,
@@ -185,8 +189,8 @@ public class CommonDispatcher implements MetricsTaskDispatch, CollectDataDispatc
                 metricsSet.forEach(metricItem -> {
                     MetricsCollect metricsCollect = new MetricsCollect(metricItem, timeout, this);
                     jobRequestQueue.addJob(metricsCollect);
-                    metricsTimeoutMonitorMap.put(job.getId() + "-" + metrics.getName(),
-                            new MetricsTime(System.currentTimeMillis(), metrics, timeout));
+                    metricsTimeoutMonitorMap.put(job.getId() + "-" + metricItem.getName(),
+                            new MetricsTime(System.currentTimeMillis(), metricItem, timeout));
                 });
             } else {
                 // 当前执行级别的指标组列表未全执行完成,

+ 11 - 3
collector/src/main/java/com/usthe/collector/dispatch/MetricsCollect.java

@@ -35,6 +35,10 @@ import java.util.stream.Collectors;
 @Data
 public class MetricsCollect implements Runnable, Comparable<MetricsCollect> {
     /**
+     * 调度告警阈值时间 100ms
+     */
+    private static final long WARN_DISPATCH_TIME = 100;
+    /**
      * 监控ID
      */
     protected long monitorId;
@@ -267,11 +271,15 @@ public class MetricsCollect implements Runnable, Comparable<MetricsCollect> {
     private CollectRep.MetricsData validateResponse(CollectRep.MetricsData.Builder builder) {
         long endTime = System.currentTimeMillis();
         builder.setTime(endTime);
-        log.debug("[Collect]: newTime: {}, startTime: {}, spendTime: {}.", newTime, startTime, endTime - startTime);
+        long runningTime = endTime - startTime;
+        long allTime = endTime - newTime;
+        if (startTime - newTime >= WARN_DISPATCH_TIME) {
+            log.warn("[Collector Dispatch Warn, Dispatch Use {}ms.", startTime - newTime);
+        }
         if (builder.getCode() != CollectRep.Code.SUCCESS) {
-            log.info("[Collect Fail] Reason: {}", builder.getMsg());
+            log.info("[Collect Failed, Run {}ms, All {}ms] Reason: {}", runningTime, allTime, builder.getMsg());
         } else {
-            log.info("[Collect Success].");
+            log.info("[Collect Success, Run {}ms, All {}ms].", runningTime, allTime);
         }
         return builder.build();
     }