From dc456a078d4b074736b6e9667a383ea098517e8a Mon Sep 17 00:00:00 2001 From: Simon Chase Date: Wed, 3 Dec 2025 20:29:12 -0800 Subject: [PATCH 1/3] allocation: add timing and iteration to logging The DesiredBalanceComputer now has three groups of timing and iteration counts: the current round iteration and duration, the since-recompute iterations, round count, and duration, and the time since convergence. This logging change includes all available information in the logs, and groups them together by kind. --- .../allocator/DesiredBalanceComputer.java | 38 ++++++---- .../DesiredBalanceComputerTests.java | 74 ++++++++++++------- 2 files changed, 71 insertions(+), 41 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceComputer.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceComputer.java index c973658e1a1a3..fe2596faa0196 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceComputer.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceComputer.java @@ -403,24 +403,29 @@ public DesiredBalance compute( convergenceLogMsgLevel, () -> Strings.format( """ - Desired balance computation for [%d] converged after [%s] and [%d] iterations, \ - resumed computation [%d] times with [%d] iterations since the last resumption [%s] ago""", + Desired balance computation for [%d] converged after [%s] and [%d] iterations this round, \ + resumed computation [%s] ago with [%d] iterations over [%d] rounds since the last convergence \ + [%s] ago""", desiredBalanceInput.index(), + TimeValue.timeValueMillis(currentTime - computationStartedTime).toString(), + iterations, TimeValue.timeValueMillis(currentTime - firstComputeStartedSinceConvergedTimeMillis).toString(), numIterationsSinceLastConverged, numComputeCallsSinceLastConverged, - iterations, - TimeValue.timeValueMillis(currentTime - computationStartedTime).toString() + TimeValue.timeValueMillis(currentTime - lastConvergedTimeMillis).toString() ) ); } else { logger.log( convergenceLogMsgLevel, () -> Strings.format( - "Desired balance computation for [%d] converged after [%s] and [%d] iterations", + """ + Desired balance computation for [%d] converged after [%s] and [%d] iterations this round, \ + last convergence was [%s] ago""", desiredBalanceInput.index(), - TimeValue.timeValueMillis(currentTime - firstComputeStartedSinceConvergedTimeMillis).toString(), - numIterationsSinceLastConverged + TimeValue.timeValueMillis(currentTime - computationStartedTime).toString(), + iterations, + TimeValue.timeValueMillis(currentTime - lastConvergedTimeMillis).toString() ) ); } @@ -465,24 +470,29 @@ public DesiredBalance compute( logLevel, () -> Strings.format( """ - Desired balance computation for [%d] is still not converged after [%s] and [%d] iterations, \ - resumed computation [%d] times with [%d] iterations since the last resumption [%s] ago""", + Desired balance computation for [%d] is still not converged after [%s] and [%d] iterations this round, \ + resumed computation [%s] ago with [%d] iterations over [%d] rounds since the last convergence \ + [%s] ago""", desiredBalanceInput.index(), + TimeValue.timeValueMillis(currentTime - computationStartedTime).toString(), + iterations, TimeValue.timeValueMillis(currentTime - firstComputeStartedSinceConvergedTimeMillis).toString(), numIterationsSinceLastConverged, numComputeCallsSinceLastConverged, - iterations, - TimeValue.timeValueMillis(currentTime - computationStartedTime).toString() + TimeValue.timeValueMillis(currentTime - lastConvergedTimeMillis).toString() ) ); } else { logger.log( logLevel, () -> Strings.format( - "Desired balance computation for [%d] is still not converged after [%s] and [%d] iterations", + """ + Desired balance computation for [%d] is still not converged after [%s] and [%d] iterations this round, \ + last convergence was [%s] ago""", desiredBalanceInput.index(), - TimeValue.timeValueMillis(currentTime - firstComputeStartedSinceConvergedTimeMillis).toString(), - numIterationsSinceLastConverged + TimeValue.timeValueMillis(currentTime - computationStartedTime).toString(), + iterations, + TimeValue.timeValueMillis(currentTime - lastConvergedTimeMillis).toString() ) ); } diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceComputerTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceComputerTests.java index 0b894f5acb5fa..397a03eda0efd 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceComputerTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceComputerTests.java @@ -1453,7 +1453,7 @@ public void testShouldLogComputationIteration() { "Should report long computation based on iteration count", DesiredBalanceComputer.class.getCanonicalName(), Level.INFO, - "Desired balance computation for [*] is still not converged after [10s] and [1000] iterations" + "Desired balance computation for [*] is still not converged after [10s] and [1000] iterations *" ) ); @@ -1464,7 +1464,7 @@ public void testShouldLogComputationIteration() { "Should report long computation based on time", DesiredBalanceComputer.class.getCanonicalName(), Level.INFO, - "Desired balance computation for [*] is still not converged after [1m] and [60] iterations" + "Desired balance computation for [*] is still not converged after [1m] and [60] iterations *" ) ); } @@ -1562,36 +1562,44 @@ record ExpectedLastConvergenceInfo(int numComputeCalls, int numTotalIterations, record LogExpectationData( boolean isConverged, - String timeSinceConverged, + String currentDuration, + int currentIterations, + String timeSinceRecompute, int totalIterations, int totalComputeCalls, - int currentIterations, - String currentDuration + String timeSinceConverged ) { - LogExpectationData(boolean isConverged, String timeSinceConverged, int totalIterations) { - this(isConverged, timeSinceConverged, totalIterations, 0, 0, ""); + LogExpectationData(boolean isConverged, String currentDuration, int currentIterations, String timeSinceConverged) { + this(isConverged, currentDuration, currentIterations, "", 0, 0, timeSinceConverged); } } Function getLogExpectation = data -> { - final var singleComputeCallMsg = "Desired balance computation for [%d] " + final var message = "Desired balance computation for [%d] " + (data.isConverged ? "" : "is still not ") - + "converged after [%s] and [%d] iterations"; + + "converged after [%s] and [%d] iterations this round, "; return new MockLog.SeenEventExpectation( "expected a " + (data.isConverged ? "converged" : "not converged") + " log message", DesiredBalanceComputer.class.getCanonicalName(), Level.INFO, (data.totalComputeCalls > 1 ? Strings.format( - singleComputeCallMsg + ", resumed computation [%d] times with [%d] iterations since the last resumption [%s] ago", + message + "resumed computation [%s] ago with [%d] iterations over [%d] rounds since the last convergence [%s] ago", indexSequence.get(), - data.timeSinceConverged, + data.currentDuration, + data.currentIterations, + data.timeSinceRecompute, data.totalIterations, data.totalComputeCalls, - data.currentIterations, - data.currentDuration + data.timeSinceConverged ) - : Strings.format(singleComputeCallMsg, indexSequence.get(), data.timeSinceConverged, data.totalIterations)) + : Strings.format( + message + "last convergence was [%s] ago", + indexSequence.get(), + data.currentDuration, + data.currentIterations, + data.timeSinceConverged + )) ); }; @@ -1609,7 +1617,7 @@ record LogExpectationData( // Converges right away, verify the debug level convergence message. assertLoggerExpectationsFor( getComputeRunnableForIsFreshPredicate.apply(ignored -> true), - getLogExpectation.apply(new LogExpectationData(true, "2ms", 2)) + getLogExpectation.apply(new LogExpectationData(true, "2ms", 2, "3ms")) ); assertFinishReason.accept(DesiredBalance.ComputationFinishReason.CONVERGED); final var lastConvergenceTimestampMillis = computer.getLastConvergedTimeMillis(); @@ -1620,7 +1628,7 @@ record LogExpectationData( // This INFO is triggered from the interval since last convergence timestamp. assertLoggerExpectationsFor( getComputeRunnableForIsFreshPredicate.apply(ignored -> iterationCounter.get() < 6), - getLogExpectation.apply(new LogExpectationData(false, "5ms", 5)) + getLogExpectation.apply(new LogExpectationData(false, "5ms", 5, "6ms")) ); assertFinishReason.accept(DesiredBalance.ComputationFinishReason.YIELD_TO_NEW_INPUT); assertLastConvergenceInfo.accept(new ExpectedLastConvergenceInfo(1, 6, lastConvergenceTimestampMillis)); @@ -1629,26 +1637,26 @@ record LogExpectationData( // The next INFO is triggered from the interval since last INFO message logged, and then another after the interval period. assertLoggerExpectationsFor( getComputeRunnableForIsFreshPredicate.apply(ignored -> iterationCounter.get() < 9), - getLogExpectation.apply(new LogExpectationData(false, "10ms", 9, 2, 3, "3ms")), - getLogExpectation.apply(new LogExpectationData(false, "15ms", 14, 2, 8, "8ms")) + getLogExpectation.apply(new LogExpectationData(false, "3ms", 3, "10ms", 9, 2, "11ms")), + getLogExpectation.apply(new LogExpectationData(false, "8ms", 8, "15ms", 14, 2, "16ms")) ); assertFinishReason.accept(DesiredBalance.ComputationFinishReason.YIELD_TO_NEW_INPUT); assertLastConvergenceInfo.accept(new ExpectedLastConvergenceInfo(2, 15, lastConvergenceTimestampMillis)); assertLoggerExpectationsFor( getComputeRunnableForIsFreshPredicate.apply(ignored -> true), - getLogExpectation.apply(new LogExpectationData(false, "20ms", 18, 3, 3, "3ms")), - getLogExpectation.apply(new LogExpectationData(false, "25ms", 23, 3, 8, "8ms")), - getLogExpectation.apply(new LogExpectationData(true, "29ms", 27, 3, 12, "12ms")) + getLogExpectation.apply(new LogExpectationData(false, "3ms", 3, "20ms", 18, 3, "21ms")), + getLogExpectation.apply(new LogExpectationData(false, "8ms", 8, "25ms", 23, 3, "26ms")), + getLogExpectation.apply(new LogExpectationData(true, "12ms", 12, "29ms", 27, 3, "30ms")) ); assertFinishReason.accept(DesiredBalance.ComputationFinishReason.CONVERGED); // First INFO is triggered from interval since last converged, second is triggered from the inverval since the last INFO log. assertLoggerExpectationsFor( getComputeRunnableForIsFreshPredicate.apply(ignored -> true), - getLogExpectation.apply(new LogExpectationData(false, "5ms", 5)), - getLogExpectation.apply(new LogExpectationData(false, "10ms", 10)), - getLogExpectation.apply(new LogExpectationData(true, "12ms", 12)) + getLogExpectation.apply(new LogExpectationData(false, "5ms", 5, "6ms")), + getLogExpectation.apply(new LogExpectationData(false, "10ms", 10, "11ms")), + getLogExpectation.apply(new LogExpectationData(true, "12ms", 12, "13ms")) ); assertFinishReason.accept(DesiredBalance.ComputationFinishReason.CONVERGED); @@ -1663,7 +1671,7 @@ record LogExpectationData( requiredIterations.set(2); assertLoggerExpectationsFor( getComputeRunnableForIsFreshPredicate.apply(ignored -> iterationCounter.get() < 2), - getLogExpectation.apply(new LogExpectationData(true, "2ms", 2)) + getLogExpectation.apply(new LogExpectationData(true, "2ms", 2, "103ms")) ); // test a non-convergence and convergence message with a significant time delta, @@ -1675,8 +1683,20 @@ record LogExpectationData( assertLoggerExpectationsFor( getComputeRunnableForIsFreshPredicate.apply(ignored -> true), - getLogExpectation.apply(new LogExpectationData(false, "103ms", 2, 2, 1, "1ms")), - getLogExpectation.apply(new LogExpectationData(true, "104ms", 3, 2, 2, "2ms")) + getLogExpectation.apply(new LogExpectationData(false, "1ms", 1, "103ms", 2, 2, "204ms")), + getLogExpectation.apply(new LogExpectationData(true, "2ms", 2, "104ms", 3, 2, "205ms")) + ); + + // similarly, test the time-since-convergence and time-since-compute for logging + // caused by intervals instead of iterations + iterationCounter.set(0); + requiredIterations.set(6); + timeInMillis.addAndGet(100L); + + assertLoggerExpectationsFor( + getComputeRunnableForIsFreshPredicate.apply(ignored -> true), + getLogExpectation.apply(new LogExpectationData(false, "5ms", 5, "106ms")), + getLogExpectation.apply(new LogExpectationData(true, "6ms", 6, "107ms")) ); } From 849c1e03878008942275f1051ae0f2d002228edb Mon Sep 17 00:00:00 2001 From: Simon Chase Date: Fri, 5 Dec 2025 13:14:39 -0800 Subject: [PATCH 2/3] Test testShouldLogComputationIteration and add convergence to recompute time delay --- .../allocator/DesiredBalanceComputerTests.java | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceComputerTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceComputerTests.java index 397a03eda0efd..d65280753fd00 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceComputerTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceComputerTests.java @@ -1453,7 +1453,9 @@ public void testShouldLogComputationIteration() { "Should report long computation based on iteration count", DesiredBalanceComputer.class.getCanonicalName(), Level.INFO, - "Desired balance computation for [*] is still not converged after [10s] and [1000] iterations *" + """ + Desired balance computation for [*] is still not converged after [10s] and [1000] iterations this round, \ + last convergence was [1.1m] ago""" ) ); @@ -1464,7 +1466,9 @@ public void testShouldLogComputationIteration() { "Should report long computation based on time", DesiredBalanceComputer.class.getCanonicalName(), Level.INFO, - "Desired balance computation for [*] is still not converged after [1m] and [60] iterations *" + """ + Desired balance computation for [*] is still not converged after [1m] and [60] iterations this round, \ + last convergence was [2m] ago""" ) ); } @@ -1506,6 +1510,9 @@ public ShardAllocationDecision explainShardAllocation(ShardRouting shard, Routin } }, TEST_ONLY_EXPLAINER); + // simulate time lag between last convergence and computation start + currentTime.addAndGet(60 * 1000); + assertLoggerExpectationsFor(() -> { var iteration = new AtomicInteger(0); desiredBalanceComputer.compute( From cf29312d0ea46d39467033f474ea7e7c499c07b8 Mon Sep 17 00:00:00 2001 From: Simon Chase Date: Wed, 10 Dec 2025 19:02:12 -0800 Subject: [PATCH 3/3] Reordered log message to group total time/iterations/rounds first --- .../allocator/DesiredBalanceComputer.java | 22 +++++++++---------- .../DesiredBalanceComputerTests.java | 16 +++++++------- 2 files changed, 18 insertions(+), 20 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceComputer.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceComputer.java index fe2596faa0196..5b2e88bfa9ab0 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceComputer.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceComputer.java @@ -403,15 +403,14 @@ public DesiredBalance compute( convergenceLogMsgLevel, () -> Strings.format( """ - Desired balance computation for [%d] converged after [%s] and [%d] iterations this round, \ - resumed computation [%s] ago with [%d] iterations over [%d] rounds since the last convergence \ - [%s] ago""", + Desired balance computation for [%d] converged after [%s] and [%d] iterations in [%d] rounds, \ + [%d] iterations this past round since [%s], last convergence was [%s] ago""", desiredBalanceInput.index(), - TimeValue.timeValueMillis(currentTime - computationStartedTime).toString(), - iterations, TimeValue.timeValueMillis(currentTime - firstComputeStartedSinceConvergedTimeMillis).toString(), numIterationsSinceLastConverged, numComputeCallsSinceLastConverged, + iterations, + TimeValue.timeValueMillis(currentTime - computationStartedTime).toString(), TimeValue.timeValueMillis(currentTime - lastConvergedTimeMillis).toString() ) ); @@ -420,7 +419,7 @@ public DesiredBalance compute( convergenceLogMsgLevel, () -> Strings.format( """ - Desired balance computation for [%d] converged after [%s] and [%d] iterations this round, \ + Desired balance computation for [%d] converged after [%s] and [%d] iterations in a single round, \ last convergence was [%s] ago""", desiredBalanceInput.index(), TimeValue.timeValueMillis(currentTime - computationStartedTime).toString(), @@ -470,15 +469,14 @@ public DesiredBalance compute( logLevel, () -> Strings.format( """ - Desired balance computation for [%d] is still not converged after [%s] and [%d] iterations this round, \ - resumed computation [%s] ago with [%d] iterations over [%d] rounds since the last convergence \ - [%s] ago""", + Desired balance computation for [%d] is still not converged after [%s] and [%d] iterations in [%d] rounds, \ + [%d] iterations this past round since [%s], last convergence was [%s] ago""", desiredBalanceInput.index(), - TimeValue.timeValueMillis(currentTime - computationStartedTime).toString(), - iterations, TimeValue.timeValueMillis(currentTime - firstComputeStartedSinceConvergedTimeMillis).toString(), numIterationsSinceLastConverged, numComputeCallsSinceLastConverged, + iterations, + TimeValue.timeValueMillis(currentTime - computationStartedTime).toString(), TimeValue.timeValueMillis(currentTime - lastConvergedTimeMillis).toString() ) ); @@ -487,7 +485,7 @@ public DesiredBalance compute( logLevel, () -> Strings.format( """ - Desired balance computation for [%d] is still not converged after [%s] and [%d] iterations this round, \ + Desired balance computation for [%d] is still not converged after [%s] and [%d] iterations in a single round, \ last convergence was [%s] ago""", desiredBalanceInput.index(), TimeValue.timeValueMillis(currentTime - computationStartedTime).toString(), diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceComputerTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceComputerTests.java index d65280753fd00..eec68503245b0 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceComputerTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceComputerTests.java @@ -1442,7 +1442,7 @@ public void testShouldLogComputationIteration() { "Should not report long computation too early", DesiredBalanceComputer.class.getCanonicalName(), Level.INFO, - "Desired balance computation for [*] is still not converged after [*] and [*] iterations" + "* still not converged after *" ) ); @@ -1454,7 +1454,7 @@ public void testShouldLogComputationIteration() { DesiredBalanceComputer.class.getCanonicalName(), Level.INFO, """ - Desired balance computation for [*] is still not converged after [10s] and [1000] iterations this round, \ + Desired balance computation for [*] is still not converged after [10s] and [1000] iterations in a single round, \ last convergence was [1.1m] ago""" ) ); @@ -1467,7 +1467,7 @@ public void testShouldLogComputationIteration() { DesiredBalanceComputer.class.getCanonicalName(), Level.INFO, """ - Desired balance computation for [*] is still not converged after [1m] and [60] iterations this round, \ + Desired balance computation for [*] is still not converged after [1m] and [60] iterations in a single round, \ last convergence was [2m] ago""" ) ); @@ -1584,24 +1584,24 @@ record LogExpectationData( Function getLogExpectation = data -> { final var message = "Desired balance computation for [%d] " + (data.isConverged ? "" : "is still not ") - + "converged after [%s] and [%d] iterations this round, "; + + "converged after [%s] and [%d] iterations "; return new MockLog.SeenEventExpectation( "expected a " + (data.isConverged ? "converged" : "not converged") + " log message", DesiredBalanceComputer.class.getCanonicalName(), Level.INFO, (data.totalComputeCalls > 1 ? Strings.format( - message + "resumed computation [%s] ago with [%d] iterations over [%d] rounds since the last convergence [%s] ago", + message + "in [%d] rounds, [%d] iterations this past round since [%s], last convergence was [%s] ago", indexSequence.get(), - data.currentDuration, - data.currentIterations, data.timeSinceRecompute, data.totalIterations, data.totalComputeCalls, + data.currentIterations, + data.currentDuration, data.timeSinceConverged ) : Strings.format( - message + "last convergence was [%s] ago", + message + "in a single round, last convergence was [%s] ago", indexSequence.get(), data.currentDuration, data.currentIterations,