From ec32f93a231d8fc9f81dc06ad843ab06575da49a Mon Sep 17 00:00:00 2001 From: Manish Khandelwal Date: Mon, 17 Jun 2024 12:32:39 +0530 Subject: [PATCH] =?UTF-8?q?Issue=20397=20Metric=20reporter=20to=20log=20de?= =?UTF-8?q?bug=20messages=20if=20configured=20number=20=E2=80=A6=20(#662)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGES.md | 2 + .../application/ECChronosInternals.java | 12 ++ .../ecchronos/application/config/Config.java | 1 + .../config/metrics/StatisticsConfig.java | 67 ++++++- .../application/spring/MetricBeans.java | 8 + application/src/main/resources/ecc.yml | 18 ++ .../application/config/TestConfig.java | 25 ++- application/src/test/resources/all_set.yml | 7 + ...val_bigger_than_repair_failures_window.yml | 22 +++ .../core/metrics/MetricInspector.java | 123 +++++++++++++ .../ecchronos/core/utils/StatusLogger.java | 78 ++++++++ .../core/metrics/TestMetricInspector.java | 119 +++++++++++++ .../core/utils/TestStatusLogger.java | 167 ++++++++++++++++++ core/src/test/resources/logback-test.xml | 1 + docs/METRICS.md | 13 +- docs/autogenerated/EccYamlFile.md | 18 ++ 16 files changed, 673 insertions(+), 8 deletions(-) create mode 100644 application/src/test/resources/trigger_interval_bigger_than_repair_failures_window.yml create mode 100644 core/src/main/java/com/ericsson/bss/cassandra/ecchronos/core/metrics/MetricInspector.java create mode 100644 core/src/main/java/com/ericsson/bss/cassandra/ecchronos/core/utils/StatusLogger.java create mode 100644 core/src/test/java/com/ericsson/bss/cassandra/ecchronos/core/metrics/TestMetricInspector.java create mode 100644 core/src/test/java/com/ericsson/bss/cassandra/ecchronos/core/utils/TestStatusLogger.java diff --git a/CHANGES.md b/CHANGES.md index d3f54b9d4..264f56c1a 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,6 +1,8 @@ # Changes ## Version 5.0.5 +* Metric status logger for troubleshooting - Issue #397 + ## Version 5.0.4 * ecChronos will break if repair interval is shorter than the initial delay - Issue #667 diff --git a/application/src/main/java/com/ericsson/bss/cassandra/ecchronos/application/ECChronosInternals.java b/application/src/main/java/com/ericsson/bss/cassandra/ecchronos/application/ECChronosInternals.java index 44b4267ba..7aacb222c 100644 --- a/application/src/main/java/com/ericsson/bss/cassandra/ecchronos/application/ECChronosInternals.java +++ b/application/src/main/java/com/ericsson/bss/cassandra/ecchronos/application/ECChronosInternals.java @@ -29,6 +29,7 @@ import com.ericsson.bss.cassandra.ecchronos.core.JmxProxyFactoryImpl; import com.ericsson.bss.cassandra.ecchronos.core.TableStorageStates; import com.ericsson.bss.cassandra.ecchronos.core.TableStorageStatesImpl; +import com.ericsson.bss.cassandra.ecchronos.core.metrics.MetricInspector; import com.ericsson.bss.cassandra.ecchronos.core.metrics.TableRepairMetrics; import com.ericsson.bss.cassandra.ecchronos.core.metrics.TableRepairMetricsImpl; import com.ericsson.bss.cassandra.ecchronos.core.scheduling.RunPolicy; @@ -62,6 +63,8 @@ public class ECChronosInternals implements Closeable private final CASLockFactory myLockFactory; private final CassandraMetrics myCassandraMetrics; + private final MetricInspector myMetricInspector; + public ECChronosInternals(final Config configuration, final NativeConnectionProvider nativeConnectionProvider, final JmxConnectionProvider jmxConnectionProvider, @@ -108,11 +111,20 @@ public ECChronosInternals(final Config configuration, .withTableStorageStates(myTableStorageStatesImpl) .withMeterRegistry(meterRegistry) .build(); + + myMetricInspector = new MetricInspector(meterRegistry, + configuration.getStatisticsConfig().getRepairFailuresCount(), + configuration.getStatisticsConfig().getRepairFailuresTimeWindow() + .getInterval(TimeUnit.MINUTES), + configuration.getStatisticsConfig().getTriggerIntervalForMetricInspection() + .getInterval(TimeUnit.MILLISECONDS)); + myMetricInspector.startInspection(); } else { myTableStorageStatesImpl = null; myTableRepairMetricsImpl = null; + myMetricInspector = null; } myScheduleManagerImpl = ScheduleManagerImpl.builder() .withLockFactory(myLockFactory) diff --git a/application/src/main/java/com/ericsson/bss/cassandra/ecchronos/application/config/Config.java b/application/src/main/java/com/ericsson/bss/cassandra/ecchronos/application/config/Config.java index 8fe53c842..ae1fd2e52 100644 --- a/application/src/main/java/com/ericsson/bss/cassandra/ecchronos/application/config/Config.java +++ b/application/src/main/java/com/ericsson/bss/cassandra/ecchronos/application/config/Config.java @@ -86,6 +86,7 @@ public final void setStatisticsConfig(final StatisticsConfig statisticsConfig) if (statisticsConfig != null) { myStatisticsConfig = statisticsConfig; + myStatisticsConfig.validate(); } } diff --git a/application/src/main/java/com/ericsson/bss/cassandra/ecchronos/application/config/metrics/StatisticsConfig.java b/application/src/main/java/com/ericsson/bss/cassandra/ecchronos/application/config/metrics/StatisticsConfig.java index 59440c4df..c06dfcf9a 100644 --- a/application/src/main/java/com/ericsson/bss/cassandra/ecchronos/application/config/metrics/StatisticsConfig.java +++ b/application/src/main/java/com/ericsson/bss/cassandra/ecchronos/application/config/metrics/StatisticsConfig.java @@ -14,24 +14,32 @@ */ package com.ericsson.bss.cassandra.ecchronos.application.config.metrics; +import com.ericsson.bss.cassandra.ecchronos.application.config.Interval; import com.fasterxml.jackson.annotation.JsonProperty; import java.io.File; +import java.util.concurrent.TimeUnit; public class StatisticsConfig { + private static final int DEFAULT_FAILURES_TIME_WINDOW_IN_MINUTES = 30; + private static final int DEFAULT_TRIGGER_INTERVAL_FOR_METRIC_INSPECTION_IN_SECONDS = 5; + private static final int DEFAULT_REPAIR_FAILURES_COUNT = 5; private boolean myIsEnabled = true; + private File myOutputDirectory = new File("./statistics"); private ReportingConfigs myReportingConfigs = new ReportingConfigs(); private String myMetricsPrefix = ""; + private int myRepairFailuresCount = DEFAULT_REPAIR_FAILURES_COUNT; + private Interval myRepairFailuresTimeWindow = new Interval(DEFAULT_FAILURES_TIME_WINDOW_IN_MINUTES, + TimeUnit.MINUTES); + private Interval myTriggerIntervalForMetricInspection = new + Interval(DEFAULT_TRIGGER_INTERVAL_FOR_METRIC_INSPECTION_IN_SECONDS, TimeUnit.SECONDS); @JsonProperty("enabled") public final boolean isEnabled() { - boolean isAnyReportingEnabled = myReportingConfigs.isFileReportingEnabled() - || myReportingConfigs.isJmxReportingEnabled() - || myReportingConfigs.isHttpReportingEnabled(); - return myIsEnabled && isAnyReportingEnabled; + return myIsEnabled; } @JsonProperty("directory") @@ -52,6 +60,24 @@ public final String getMetricsPrefix() return myMetricsPrefix; } + @JsonProperty("repair_failures_count") + public final int getRepairFailuresCount() + { + return myRepairFailuresCount; + } + + @JsonProperty("repair_failures_time_window") + public final Interval getRepairFailuresTimeWindow() + { + return myRepairFailuresTimeWindow; + } + + @JsonProperty("trigger_interval_for_metric_inspection") + public final Interval getTriggerIntervalForMetricInspection() + { + return myTriggerIntervalForMetricInspection; + } + @JsonProperty("enabled") public final void setEnabled(final boolean enabled) { @@ -75,4 +101,37 @@ public final void setMetricsPrefix(final String metricsPrefix) { myMetricsPrefix = metricsPrefix; } + + @JsonProperty("repair_failures_count") + public final void setRepairFailuresCount(final int repairFailuresCount) + { + myRepairFailuresCount = repairFailuresCount; + } + + @JsonProperty("repair_failures_time_window") + public final void setRepairFailuresTimeWindow(final Interval repairFailuresTimeWindow) + { + myRepairFailuresTimeWindow = repairFailuresTimeWindow; + } + @JsonProperty("trigger_interval_for_metric_inspection") + public final void setTriggerIntervalForMetricInspection(final Interval triggerIntervalForStatusLogger) + { + myTriggerIntervalForMetricInspection = triggerIntervalForStatusLogger; + } + + public final void validate() + { + long repairTimeWindowInSeconds = getRepairFailuresTimeWindow().getInterval(TimeUnit.SECONDS); + long triggerIntervalForMetricInspection = getTriggerIntervalForMetricInspection() + .getInterval(TimeUnit.SECONDS); + if (triggerIntervalForMetricInspection >= repairTimeWindowInSeconds) + { + throw new IllegalArgumentException(String.format("Repair window time must be greater than trigger interval." + + " Current repair window time: %d seconds," + + " trigger interval for metric inspection: %d seconds", + repairTimeWindowInSeconds, triggerIntervalForMetricInspection)); + } + } + } + diff --git a/application/src/main/java/com/ericsson/bss/cassandra/ecchronos/application/spring/MetricBeans.java b/application/src/main/java/com/ericsson/bss/cassandra/ecchronos/application/spring/MetricBeans.java index dffbe7bc7..feb4db12a 100644 --- a/application/src/main/java/com/ericsson/bss/cassandra/ecchronos/application/spring/MetricBeans.java +++ b/application/src/main/java/com/ericsson/bss/cassandra/ecchronos/application/spring/MetricBeans.java @@ -23,6 +23,7 @@ import io.micrometer.core.instrument.Clock; import io.micrometer.core.instrument.composite.CompositeMeterRegistry; import io.micrometer.core.instrument.config.MeterFilter; +import io.micrometer.core.instrument.simple.SimpleMeterRegistry; import io.micrometer.jmx.JmxConfig; import io.micrometer.jmx.JmxMeterRegistry; import io.micrometer.prometheus.PrometheusConfig; @@ -66,6 +67,7 @@ public MetricBeans(final Config config) createPrometheusMeterRegistry(metricConfig); } } + createStatusLoggerMeterRegistry(); } private void createJmxMeterRegistry(final StatisticsConfig metricConfig) @@ -103,6 +105,12 @@ private void createPrometheusMeterRegistry(final StatisticsConfig metricConfig) myCompositeMeterRegistry.add(myPrometheusMeterRegistry); } + private void createStatusLoggerMeterRegistry() + { + SimpleMeterRegistry simpleMeterRegistry = new SimpleMeterRegistry(); + myCompositeMeterRegistry.add(simpleMeterRegistry); + } + @Bean public PrometheusMeterRegistry prometheusMeterRegistry() { diff --git a/application/src/main/resources/ecc.yml b/application/src/main/resources/ecc.yml index f5bf1a3e2..e2d530d13 100644 --- a/application/src/main/resources/ecc.yml +++ b/application/src/main/resources/ecc.yml @@ -276,6 +276,24 @@ statistics: ## The prefix cannot start or end with a dot or any other path separator. ## prefix: '' + ## + ## Number of repair failures before status logger logs metrics in debug logs + ## The number is used to trigger a status once number of failures is breached in a time window mentioned below + ## + repair_failures_count: 5 + ## + ## Time window over which to track repair failures in node for trigger status logger messages in debug log + ## + repair_failures_time_window: + time: 30 + unit: minutes + ## + ## Trigger interval for metric inspection. + ## This time should always be lesser than repair_failures_time_window + ## + trigger_interval_for_metric_inspection: + time: 5 + unit: seconds lock_factory: cas: diff --git a/application/src/test/java/com/ericsson/bss/cassandra/ecchronos/application/config/TestConfig.java b/application/src/test/java/com/ericsson/bss/cassandra/ecchronos/application/config/TestConfig.java index 1ba84a9a2..5f5ea05c5 100644 --- a/application/src/test/java/com/ericsson/bss/cassandra/ecchronos/application/config/TestConfig.java +++ b/application/src/test/java/com/ericsson/bss/cassandra/ecchronos/application/config/TestConfig.java @@ -160,6 +160,10 @@ public void testAllValues() throws Exception assertThat(httpReportingConfig.getExcludedMetrics()).hasSize(1); assertThat(httpReportingConfig.getExcludedMetrics()).contains(expectedHttpExcludedMetric); + assertThat(statisticsConfig.getRepairFailuresTimeWindow().getInterval(TimeUnit.MINUTES)).isEqualTo(5); + assertThat(statisticsConfig.getTriggerIntervalForMetricInspection().getInterval(TimeUnit.SECONDS)).isEqualTo(30); + assertThat(statisticsConfig.getRepairFailuresCount()).isEqualTo(5); + LockFactoryConfig lockFactoryConfig = config.getLockFactory(); assertThat(lockFactoryConfig.getCasLockFactoryConfig().getKeyspaceName()).isEqualTo("ecc"); assertThat(lockFactoryConfig.getCasLockFactoryConfig().getConsistencySerial().equals(ConsistencyType.LOCAL)).isTrue(); @@ -370,7 +374,11 @@ public void testDefault() throws Exception assertThat(httpReportingConfig.isEnabled()).isTrue(); assertThat(httpReportingConfig.getExcludedMetrics()).isEmpty(); - LockFactoryConfig lockFactoryConfig = config.getLockFactory(); + assertThat(statisticsConfig.getRepairFailuresTimeWindow().getInterval(TimeUnit.MINUTES)).isEqualTo(30); + assertThat(statisticsConfig.getTriggerIntervalForMetricInspection().getInterval(TimeUnit.SECONDS)).isEqualTo(5); + assertThat(statisticsConfig.getRepairFailuresCount()).isEqualTo(5); + + LockFactoryConfig lockFactoryConfig = config.getLockFactory(); assertThat(lockFactoryConfig.getCasLockFactoryConfig().getKeyspaceName()).isEqualTo("ecchronos"); assertThat(lockFactoryConfig.getCasLockFactoryConfig().getConsistencySerial().equals(ConsistencyType.DEFAULT)).isTrue(); @@ -408,7 +416,7 @@ public void testWarnIntervalLongerThanError() } @Test - public void testStatisticsDisabledIfNoReporting() throws Exception + public void testStatisticsEnabledIfNoReporting() throws Exception { ClassLoader classLoader = Thread.currentThread().getContextClassLoader(); File file = new File(classLoader.getResource("all_reporting_disabled.yml").getFile()); @@ -421,7 +429,18 @@ public void testStatisticsDisabledIfNoReporting() throws Exception assertThat(statisticsConfig.getReportingConfigs().isJmxReportingEnabled()).isFalse(); assertThat(statisticsConfig.getReportingConfigs().isFileReportingEnabled()).isFalse(); assertThat(statisticsConfig.getReportingConfigs().isHttpReportingEnabled()).isFalse(); - assertThat(statisticsConfig.isEnabled()).isFalse(); + assertThat(statisticsConfig.isEnabled()).isTrue(); + } + + @Test + public void testTriggerIntervalBiggerThanRepairFailuresWindow() + { + ClassLoader classLoader = Thread.currentThread().getContextClassLoader(); + File file = new File(classLoader.getResource("trigger_interval_bigger_than_repair_failures_window.yml").getFile()); + + ObjectMapper objectMapper = new ObjectMapper(new YAMLFactory()); + + assertThatExceptionOfType(JsonMappingException.class).isThrownBy(() -> objectMapper.readValue(file, Config.class)); } public static class TestNativeConnectionProvider implements NativeConnectionProvider diff --git a/application/src/test/resources/all_set.yml b/application/src/test/resources/all_set.yml index c132f0451..e707b7b8a 100644 --- a/application/src/test/resources/all_set.yml +++ b/application/src/test/resources/all_set.yml @@ -85,6 +85,13 @@ statistics: excludedMetrics: - name: '.*httpExcluded' prefix: "unittest" + repair_failures_count: 5 + repair_failures_time_window: + time: 5 + unit: minutes + trigger_interval_for_metric_inspection: + time: 30 + unit: seconds lock_factory: cas: diff --git a/application/src/test/resources/trigger_interval_bigger_than_repair_failures_window.yml b/application/src/test/resources/trigger_interval_bigger_than_repair_failures_window.yml new file mode 100644 index 000000000..b03952169 --- /dev/null +++ b/application/src/test/resources/trigger_interval_bigger_than_repair_failures_window.yml @@ -0,0 +1,22 @@ +# +# Copyright 2024 Telefonaktiebolaget LM Ericsson +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +statistics: + repair_failures_time_window: + time: 30 + unit: seconds + trigger_interval_for_metric_inspection: + time: 5 + unit: minutes \ No newline at end of file diff --git a/core/src/main/java/com/ericsson/bss/cassandra/ecchronos/core/metrics/MetricInspector.java b/core/src/main/java/com/ericsson/bss/cassandra/ecchronos/core/metrics/MetricInspector.java new file mode 100644 index 000000000..f761a8ccd --- /dev/null +++ b/core/src/main/java/com/ericsson/bss/cassandra/ecchronos/core/metrics/MetricInspector.java @@ -0,0 +1,123 @@ +/* + * Copyright 2024 Telefonaktiebolaget LM Ericsson + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.ericsson.bss.cassandra.ecchronos.core.metrics; + +import com.ericsson.bss.cassandra.ecchronos.core.utils.StatusLogger; +import com.google.common.annotations.VisibleForTesting; +import io.micrometer.core.instrument.MeterRegistry; + +import java.time.LocalDateTime; +import java.time.temporal.ChronoUnit; +import java.util.Timer; +import java.util.TimerTask; + +public final class MetricInspector +{ + private final MeterRegistry myMeterRegistry; + private long myRepairFailuresCountSinceLastReport = 0; + private long myTotalRecordFailures = 0; + private final int myRepairFailureThreshold; + private final long myRepairFailuresTimeWindow; + private final long myTriggerIntervalForMetricInspection; + private LocalDateTime myRecordingStartTimestamp = LocalDateTime.now(); + private Timer timer; + + @VisibleForTesting + long getRepairFailuresCountSinceLastReport() + { + return myRepairFailuresCountSinceLastReport; + } + + @VisibleForTesting + long getTotalRecordFailures() + { + return myTotalRecordFailures; + } + + @VisibleForTesting + LocalDateTime getRecordingStartTimestamp() + { + return myRecordingStartTimestamp; + } + + public MetricInspector(final MeterRegistry meterRegistry, + final int repairFailureThreshold, + final long repairFailuresTimeWindow, + final long triggerIntervalForMetricInspection) + { + myMeterRegistry = meterRegistry; + myRepairFailureThreshold = repairFailureThreshold; + myRepairFailuresTimeWindow = repairFailuresTimeWindow; + myTriggerIntervalForMetricInspection = triggerIntervalForMetricInspection; + } + + public void startInspection() + { + timer = new Timer(); + timer.scheduleAtFixedRate(new TimerTask() + { + @Override + public void run() + { + inspectMeterRegistryForRepairFailures(); + } + }, 0, myTriggerIntervalForMetricInspection); + } + + public void stopInspection() + { + if (timer != null) + { + timer.cancel(); + } + } + + @VisibleForTesting + void inspectMeterRegistryForRepairFailures() + { + io.micrometer.core.instrument.Timer nodeRepairSessions = myMeterRegistry + .find(TableRepairMetricsImpl.NODE_REPAIR_SESSIONS) + .tags("successful", "false") + .timer(); + if (nodeRepairSessions != null) + { + myTotalRecordFailures = nodeRepairSessions.count(); + } + if (myTotalRecordFailures - myRepairFailuresCountSinceLastReport > myRepairFailureThreshold) + { + myRepairFailuresCountSinceLastReport = myTotalRecordFailures; + myRecordingStartTimestamp = LocalDateTime.now(); + StatusLogger.log(myMeterRegistry); + } + resetRepairFailureCount(); + } + + /* + * If in defined time window, number of repair failure has not crossed the configured number, then + * reset failure count for new timed window. Reinitialize time window. + */ + @VisibleForTesting + void resetRepairFailureCount() + { + LocalDateTime currentTimeStamp = LocalDateTime.now(); + LocalDateTime timeRepairWindowMinutesAgo = currentTimeStamp. + minus(myRepairFailuresTimeWindow, ChronoUnit.MINUTES); + if (myRecordingStartTimestamp.isBefore(timeRepairWindowMinutesAgo)) + { + myRepairFailuresCountSinceLastReport = myTotalRecordFailures; + myRecordingStartTimestamp = LocalDateTime.now(); + } + } +} diff --git a/core/src/main/java/com/ericsson/bss/cassandra/ecchronos/core/utils/StatusLogger.java b/core/src/main/java/com/ericsson/bss/cassandra/ecchronos/core/utils/StatusLogger.java new file mode 100644 index 000000000..b550b457c --- /dev/null +++ b/core/src/main/java/com/ericsson/bss/cassandra/ecchronos/core/utils/StatusLogger.java @@ -0,0 +1,78 @@ +/* + * Copyright 2024 Telefonaktiebolaget LM Ericsson + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.ericsson.bss.cassandra.ecchronos.core.utils; + +import io.micrometer.core.instrument.Gauge; +import io.micrometer.core.instrument.MeterRegistry; +import io.micrometer.core.instrument.Timer; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public final class StatusLogger +{ + private static final Logger LOG = LoggerFactory.getLogger(StatusLogger.class); + static final String NODE_REPAIR_SESSIONS = "node.repair.sessions"; + static final String NODE_REMAINING_REPAIR_TIME = "node.remaining.repair.time"; + static final String NODE_TIME_SINCE_LAST_REPAIRED = "node.time.since.last.repaired"; + static final String NODE_REPAIRED_RATIO = "node.repaired.ratio"; + + private StatusLogger() + { + throw new AssertionError("Utility classes should not be instantiated"); + } + + public static void log(final MeterRegistry myMeterRegistry) + { + Timer failedRepairSessions = myMeterRegistry.find(NODE_REPAIR_SESSIONS) + .tags("successful", "false") + .timer(); + if (failedRepairSessions != null) + { + LOG.debug("Total repair failures in node till now is: {}", failedRepairSessions.count()); + } + + Timer successfulRepairSessions = myMeterRegistry.find(NODE_REPAIR_SESSIONS) + .tags("successful", "true") + .timer(); + if (successfulRepairSessions != null) + { + LOG.debug("Total repair success in node till now is: {}", successfulRepairSessions.count()); + } + + Gauge nodeTimeSinceLastRepaired = myMeterRegistry.find(NODE_TIME_SINCE_LAST_REPAIRED) + .gauge(); + + if (nodeTimeSinceLastRepaired != null) + { + LOG.debug("Node last repaired at: {}", nodeTimeSinceLastRepaired.value()); + } + + Gauge nodeRemainingRepairTime = myMeterRegistry.find(NODE_REMAINING_REPAIR_TIME) + .gauge(); + if (nodeRemainingRepairTime != null) + { + LOG.debug("Remaining time for node repair: {}", nodeRemainingRepairTime.value()); + } + + Gauge nodeRepairedRatio = myMeterRegistry.find(NODE_REPAIRED_RATIO) + .gauge(); + if (nodeRepairedRatio != null) + { + LOG.debug("Node repair ratio is: {}", nodeRepairedRatio.value()); + } + + } + +} diff --git a/core/src/test/java/com/ericsson/bss/cassandra/ecchronos/core/metrics/TestMetricInspector.java b/core/src/test/java/com/ericsson/bss/cassandra/ecchronos/core/metrics/TestMetricInspector.java new file mode 100644 index 000000000..28479056f --- /dev/null +++ b/core/src/test/java/com/ericsson/bss/cassandra/ecchronos/core/metrics/TestMetricInspector.java @@ -0,0 +1,119 @@ +/* + * Copyright 2024 Telefonaktiebolaget LM Ericsson + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.ericsson.bss.cassandra.ecchronos.core.metrics; + +import ch.qos.logback.classic.Level; +import ch.qos.logback.classic.LoggerContext; +import ch.qos.logback.classic.spi.ILoggingEvent; +import ch.qos.logback.core.read.ListAppender; +import com.ericsson.bss.cassandra.ecchronos.core.TableStorageStates; +import com.ericsson.bss.cassandra.ecchronos.core.utils.StatusLogger; +import com.ericsson.bss.cassandra.ecchronos.core.utils.TableReference; +import io.micrometer.core.instrument.MeterRegistry; +import io.micrometer.core.instrument.composite.CompositeMeterRegistry; +import io.micrometer.core.instrument.simple.SimpleMeterRegistry; +import org.junit.Before; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.mockito.Mock; +import org.mockito.runners.MockitoJUnitRunner; +import org.slf4j.LoggerFactory; +import java.util.List; +import java.util.concurrent.TimeUnit; +import static com.ericsson.bss.cassandra.ecchronos.core.MockTableReferenceFactory.tableReference; +import static org.junit.Assert.assertEquals; + + +@RunWith(MockitoJUnitRunner.class) +public class TestMetricInspector +{ + private static final String TEST_KEYSPACE = "test_keyspace"; + private static final String TEST_TABLE1 = "test_table1"; + private static final String TEST_TABLE2 = "test_table2"; + + @Mock + private TableStorageStates myTableStorageStates; + private MeterRegistry myMeterRegistry; + private TableRepairMetricsImpl myTableRepairMetricsImpl; + private MetricInspector myMetricInspector; + private LoggerContext loggerContext; + private ListAppender listAppender; + + @Before + public void init() + { + loggerContext = (LoggerContext) LoggerFactory.getILoggerFactory(); + listAppender = new ListAppender<>(); + listAppender.start(); + loggerContext.getLogger(StatusLogger.class).addAppender(listAppender); + CompositeMeterRegistry compositeMeterRegistry = new CompositeMeterRegistry(); + compositeMeterRegistry.add(new SimpleMeterRegistry()); + myMeterRegistry = compositeMeterRegistry; + myTableRepairMetricsImpl = TableRepairMetricsImpl.builder() + .withTableStorageStates(myTableStorageStates) + .withMeterRegistry(myMeterRegistry) + .build(); + myMetricInspector = new MetricInspector(myMeterRegistry, 1, 1, 5000); } + + @Test + public void testInspectMeterRegistryForRepairFailuresWhenFailureThresholdIsBroken() + { + TableReference tableReference1 = tableReference(TEST_KEYSPACE, TEST_TABLE1); + long expectedRepairTime = 12345L; + myTableRepairMetricsImpl.repairSession(tableReference1, expectedRepairTime, TimeUnit.MILLISECONDS, false); + myTableRepairMetricsImpl.repairSession(tableReference1, expectedRepairTime, TimeUnit.MILLISECONDS, false); + myMetricInspector.inspectMeterRegistryForRepairFailures(); + List logsList = listAppender.list; + long count = logsList.stream().count(); + String logMessage = logsList.get(0).getFormattedMessage(); + Level logLevel = logsList.get(0).getLevel(); + assertEquals("Total repair failures in node till now is: 2", logMessage); + assertEquals(Level.DEBUG, logLevel); + assertEquals(1, count); + } + + @Test + public void testInspectMeterRegistryForRepairFailuresWhenFailureThresholdIsIntact() + { + TableReference tableReference = tableReference(TEST_KEYSPACE, TEST_TABLE1); + long expectedRepairTime = 12345L; + myTableRepairMetricsImpl.repairSession(tableReference, expectedRepairTime, TimeUnit.MILLISECONDS, false); + myMetricInspector.inspectMeterRegistryForRepairFailures(); + List logsList = listAppender.list; + long count = logsList.stream().count(); + assertEquals(0, count); + } + + @Test + public void testStartInspectionMethod() throws InterruptedException + { + assertEquals(0, myMetricInspector.getTotalRecordFailures()); + TableReference tableReference1 = tableReference(TEST_KEYSPACE, TEST_TABLE1); + long expectedRepairTime = 12345L; + myTableRepairMetricsImpl.repairSession(tableReference1, expectedRepairTime, TimeUnit.MILLISECONDS, false); + myTableRepairMetricsImpl.repairSession(tableReference1, expectedRepairTime, TimeUnit.MILLISECONDS, false); + List logsList = listAppender.list; + myMetricInspector.startInspection(); + Thread.sleep(5000); + long count = logsList.stream().count(); + String logMessage = logsList.get(0).getFormattedMessage(); + Level logLevel = logsList.get(0).getLevel(); + assertEquals("Total repair failures in node till now is: 2", logMessage); + assertEquals(Level.DEBUG, logLevel); + assertEquals(1, count); + assertEquals(2, myMetricInspector.getTotalRecordFailures()); + myMetricInspector.stopInspection(); + } +} diff --git a/core/src/test/java/com/ericsson/bss/cassandra/ecchronos/core/utils/TestStatusLogger.java b/core/src/test/java/com/ericsson/bss/cassandra/ecchronos/core/utils/TestStatusLogger.java new file mode 100644 index 000000000..9a2cabf8d --- /dev/null +++ b/core/src/test/java/com/ericsson/bss/cassandra/ecchronos/core/utils/TestStatusLogger.java @@ -0,0 +1,167 @@ +/* + * Copyright 2024 Telefonaktiebolaget LM Ericsson + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.ericsson.bss.cassandra.ecchronos.core.utils; + +import ch.qos.logback.classic.Level; +import ch.qos.logback.classic.LoggerContext; +import ch.qos.logback.classic.spi.ILoggingEvent; +import ch.qos.logback.core.read.ListAppender; +import com.ericsson.bss.cassandra.ecchronos.core.TableStorageStates; +import com.ericsson.bss.cassandra.ecchronos.core.metrics.TableRepairMetricsImpl; +import io.micrometer.core.instrument.MeterRegistry; +import io.micrometer.core.instrument.composite.CompositeMeterRegistry; +import io.micrometer.core.instrument.simple.SimpleMeterRegistry; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.mockito.Mock; +import org.mockito.runners.MockitoJUnitRunner; +import org.slf4j.LoggerFactory; + +import java.util.List; +import java.util.concurrent.TimeUnit; + +import static com.ericsson.bss.cassandra.ecchronos.core.MockTableReferenceFactory.tableReference; +import static org.junit.Assert.assertEquals; + +@RunWith(MockitoJUnitRunner.class) +public class TestStatusLogger +{ + private static final String TEST_KEYSPACE = "test_keyspace"; + private static final String TEST_TABLE1 = "test_table1"; + private static final String TEST_TABLE2 = "test_table2"; + @Mock + private TableStorageStates myTableStorageStates; + private MeterRegistry myMeterRegistry; + private TableRepairMetricsImpl myTableRepairMetricsImpl; + + private LoggerContext loggerContext; + private ListAppender listAppender; + + @Before + public void init() + { + loggerContext = (LoggerContext) LoggerFactory.getILoggerFactory(); + listAppender = new ListAppender<>(); + listAppender.start(); + loggerContext.getLogger(StatusLogger.class).addAppender(listAppender); + // Use composite registry here to simulate real world scenario where we have multiple registries + CompositeMeterRegistry compositeMeterRegistry = new CompositeMeterRegistry(); + // Need at least one registry present in composite to record metrics + compositeMeterRegistry.add(new SimpleMeterRegistry()); + myMeterRegistry = compositeMeterRegistry; + myTableRepairMetricsImpl = TableRepairMetricsImpl.builder() + .withTableStorageStates(myTableStorageStates) + .withMeterRegistry(myMeterRegistry) + .build(); + } + + @After + public void cleanup() + { + myMeterRegistry.close(); + myTableRepairMetricsImpl.close(); + } + + @Test + public void testLogForFailedRepairSessionCountLogs() + { + TableReference tableReference = tableReference(TEST_KEYSPACE, TEST_TABLE1); + long expectedRepairTime = 12345L; + myTableRepairMetricsImpl.repairSession(tableReference, expectedRepairTime, TimeUnit.MILLISECONDS, false); + StatusLogger.log(myMeterRegistry); + List logsList = listAppender.list; + long count = logsList.stream().count(); + String logMessage = logsList.get(0).getFormattedMessage(); + Level logLevel = logsList.get(0).getLevel(); + assertEquals("Total repair failures in node till now is: 1", logMessage); + assertEquals(Level.DEBUG, logLevel); + assertEquals(1, count); + } + + @Test + public void testLogForSuccessfulRepairSessionCountLogs() + { + TableReference tableReference = tableReference(TEST_KEYSPACE, TEST_TABLE1); + long expectedRepairTime = 12345L; + myTableRepairMetricsImpl.repairSession(tableReference, expectedRepairTime, TimeUnit.MILLISECONDS, true); + StatusLogger.log(myMeterRegistry); + List logsList = listAppender.list; + long count = logsList.stream().count(); + String logMessage = logsList.get(0).getFormattedMessage(); + Level logLevel = logsList.get(0).getLevel(); + assertEquals("Total repair success in node till now is: 1", logMessage); + assertEquals(Level.DEBUG, logLevel); + assertEquals(1, count); + } + + @Test + public void testLogForRepairedRatioStatusLogs() + { + TableReference tableReference = tableReference(TEST_KEYSPACE, TEST_TABLE1); + myTableRepairMetricsImpl.repairState(tableReference, 1, 0); + StatusLogger.log(myMeterRegistry); + List logsList = listAppender.list; + long count = logsList.stream().count(); + String logMessage = logsList.get(0).getFormattedMessage(); + Level logLevel = logsList.get(0).getLevel(); + assertEquals("Node repair ratio is: 1.0", logMessage); + assertEquals(Level.DEBUG, logLevel); + assertEquals(1, count); + } + + @Test + public void testLogForLastRepairedAtStatusLogs() + { + TableReference tableReference1 = tableReference(TEST_KEYSPACE, TEST_TABLE1); + TableReference tableReference2 = tableReference(TEST_KEYSPACE, TEST_TABLE2); + long timeNow = System.currentTimeMillis(); + long timeDiff1 = 1000L; + long expectedLastRepaired1 = timeNow - timeDiff1; + long timeDiff2 = 5000L; + long expectedLastRepaired2 = timeNow - timeDiff2; + myTableRepairMetricsImpl.lastRepairedAt(tableReference1, expectedLastRepaired1); + myTableRepairMetricsImpl.lastRepairedAt(tableReference2, expectedLastRepaired2); + StatusLogger.log(myMeterRegistry); + List logsList = listAppender.list; + long count = logsList.stream().count(); + String logMessage = logsList.get(0).getFormattedMessage(); + Level logLevel = logsList.get(0).getLevel(); + assertEquals("Node last repaired at", logMessage.substring(0, 21)); + assertEquals(Level.DEBUG, logLevel); + assertEquals(1, count); + } + + @Test + public void testLogForNodeRemainingRepairTimeStatusLogs() + { + TableReference tableReference1 = tableReference(TEST_KEYSPACE, TEST_TABLE1); + TableReference tableReference2 = tableReference(TEST_KEYSPACE, TEST_TABLE2); + long expectedRemainingRepairTime1 = 10L; + long expectedRemainingRepairTime2 = 20L; + myTableRepairMetricsImpl.remainingRepairTime(tableReference1, expectedRemainingRepairTime1); + myTableRepairMetricsImpl.remainingRepairTime(tableReference2, expectedRemainingRepairTime2); + StatusLogger.log(myMeterRegistry); + List logsList = listAppender.list; + long count = logsList.stream().count(); + String logMessage = logsList.get(0).getFormattedMessage(); + Level logLevel = logsList.get(0).getLevel(); + Double expectedRepairTime = (double) (expectedRemainingRepairTime1 + expectedRemainingRepairTime2) / 1000; + assertEquals("Remaining time for node repair: ".concat(expectedRepairTime.toString()), logMessage); + assertEquals(Level.DEBUG, logLevel); + assertEquals(1, count); + } +} diff --git a/core/src/test/resources/logback-test.xml b/core/src/test/resources/logback-test.xml index 8be0b9ea7..570fac349 100644 --- a/core/src/test/resources/logback-test.xml +++ b/core/src/test/resources/logback-test.xml @@ -26,6 +26,7 @@ + diff --git a/docs/METRICS.md b/docs/METRICS.md index a844c6e97..4fdcb7b03 100644 --- a/docs/METRICS.md +++ b/docs/METRICS.md @@ -742,4 +742,15 @@ repair_sessions_seconds_max{keyspace="ks1",successful="true",table="tbl1",} 0.0 repair_sessions_seconds_count{keyspace="ks1",successful="false",table="tbl1",} 793.0 repair_sessions_seconds_sum{keyspace="ks1",successful="false",table="tbl1",} 5.317104685 repair_sessions_seconds_max{keyspace="ks1",successful="false",table="tbl1",} 0.0 -``` \ No newline at end of file +``` + +## Metric Status Logger +Whenever metric is enabled, a logger is triggered which monitor metrics for +repair failures within a defined time window. If number of repair failures +overshoot the number(`repair_failures_count`) configured in ecc.yml within the time +window then ecchronos metrics are printed in debug logs. + +Repair failures threshold is handled by a property `statistics.repair_failures_count`. The +time window can be configured via `statistics.repair_failures_time_window`. There is +another field `statistics.trigger_interval_for_metric_inspection` which is used for controlling +the repeat time in which metric inspection will take place. \ No newline at end of file diff --git a/docs/autogenerated/EccYamlFile.md b/docs/autogenerated/EccYamlFile.md index 3cfc41648..598dfcd3d 100644 --- a/docs/autogenerated/EccYamlFile.md +++ b/docs/autogenerated/EccYamlFile.md @@ -262,6 +262,24 @@ # The prefix cannot start or end with a dot or any other path separator. # * prefix: '' +# +# Number of repair failures before status logger logs metrics in debug logs +# The number is used to trigger a status once number of failures is breached in a time window mentioned below +# +* repair_failures_count: 5 +# +# Time window over which to track repair failures in node for trigger status logger messages in debug log +# +**repair_failures_time_window:** +* time: 30 +* unit: minutes +# +# Trigger interval for metric inspection. +# This time should always be lesser than repair_failures_time_window +# +**trigger_interval_for_metric_inspection:** +* time: 5 +* unit: seconds **lock_factory:** **cas:**