[collector,alerter]bugfix:monitors always timeout alert (#67)

This commit is contained in:
tomsun28
2022-04-05 20:50:33 +08:00
committed by GitHub
parent e99dd2e870
commit 51266aab87
3 changed files with 21 additions and 9 deletions

View File

@@ -110,7 +110,7 @@ public class CalculateAlarm {
} else {
// 其他异常
alertBuilder.target(CommonConstants.AVAILABLE)
.content("监控紧急可用性告警: " + metricsData.getCode().name());
.content("监控可用性告警: " + metricsData.getCode().name() + " : " + metricsData.getMsg());
triggeredMonitorStateAlertMap.put(monitorId, metricsData.getCode());
dataQueue.addAlertData(alertBuilder.build());
}

View File

@@ -105,9 +105,13 @@ public class CommonDispatcher implements MetricsTaskDispatch, CollectDataDispatc
.setId(timerJob.getJob().getMonitorId())
.setApp(timerJob.getJob().getApp())
.setMetrics(metricsTime.getMetrics().getName())
.setPriority(metricsTime.getMetrics().getPriority())
.setTime(System.currentTimeMillis())
.setCode(CollectRep.Code.TIMEOUT).setMsg("collect timeout").build();
dispatchCollectData(metricsTime.timeout, metricsTime.getMetrics(), metricsData);
log.error("[Collect Timeout]: \n{}", metricsData);
if (metricsData.getPriority() == 0) {
dispatchCollectData(metricsTime.timeout, metricsTime.getMetrics(), metricsData);
}
metricsTimeoutMonitorMap.remove(entry.getKey());
}
}
@@ -165,8 +169,8 @@ public class CommonDispatcher implements MetricsTaskDispatch, CollectDataDispatc
metricsSet.forEach(metricItem -> {
MetricsCollect metricsCollect = new MetricsCollect(metricItem, timeout, this);
jobRequestQueue.addJob(metricsCollect);
metricsTimeoutMonitorMap.put(job.getId() + "-" + metrics.getName(),
new MetricsTime(System.currentTimeMillis(), metrics, timeout));
metricsTimeoutMonitorMap.put(job.getId() + "-" + metricItem.getName(),
new MetricsTime(System.currentTimeMillis(), metricItem, timeout));
});
} else {
// 当前执行级别的指标组列表未全执行完成,
@@ -185,8 +189,8 @@ public class CommonDispatcher implements MetricsTaskDispatch, CollectDataDispatc
metricsSet.forEach(metricItem -> {
MetricsCollect metricsCollect = new MetricsCollect(metricItem, timeout, this);
jobRequestQueue.addJob(metricsCollect);
metricsTimeoutMonitorMap.put(job.getId() + "-" + metrics.getName(),
new MetricsTime(System.currentTimeMillis(), metrics, timeout));
metricsTimeoutMonitorMap.put(job.getId() + "-" + metricItem.getName(),
new MetricsTime(System.currentTimeMillis(), metricItem, timeout));
});
} else {
// 当前执行级别的指标组列表未全执行完成,

View File

@@ -34,6 +34,10 @@ import java.util.stream.Collectors;
@Slf4j
@Data
public class MetricsCollect implements Runnable, Comparable<MetricsCollect> {
/**
* 调度告警阈值时间 100ms
*/
private static final long WARN_DISPATCH_TIME = 100;
/**
* 监控ID
*/
@@ -267,11 +271,15 @@ public class MetricsCollect implements Runnable, Comparable<MetricsCollect> {
private CollectRep.MetricsData validateResponse(CollectRep.MetricsData.Builder builder) {
long endTime = System.currentTimeMillis();
builder.setTime(endTime);
log.debug("[Collect]: newTime: {}, startTime: {}, spendTime: {}.", newTime, startTime, endTime - startTime);
long runningTime = endTime - startTime;
long allTime = endTime - newTime;
if (startTime - newTime >= WARN_DISPATCH_TIME) {
log.warn("[Collector Dispatch Warn, Dispatch Use {}ms.", startTime - newTime);
}
if (builder.getCode() != CollectRep.Code.SUCCESS) {
log.info("[Collect Fail] Reason: {}", builder.getMsg());
log.info("[Collect Failed, Run {}ms, All {}ms] Reason: {}", runningTime, allTime, builder.getMsg());
} else {
log.info("[Collect Success].");
log.info("[Collect Success, Run {}ms, All {}ms].", runningTime, allTime);
}
return builder.build();
}