From 87b238aecfc1f39210e4e3a0bef54aa700ad5cf9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=C3=B8ydahl?= Date: Thu, 18 Dec 2025 09:44:38 +0100 Subject: [PATCH 1/8] Attempt to fix shutdown of LeaderTragicEventTest --- .../solr/cloud/MiniSolrCloudCluster.java | 20 ++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/solr/test-framework/src/java/org/apache/solr/cloud/MiniSolrCloudCluster.java b/solr/test-framework/src/java/org/apache/solr/cloud/MiniSolrCloudCluster.java index a632928d361..28a581ec6b2 100644 --- a/solr/test-framework/src/java/org/apache/solr/cloud/MiniSolrCloudCluster.java +++ b/solr/test-framework/src/java/org/apache/solr/cloud/MiniSolrCloudCluster.java @@ -673,7 +673,20 @@ public void shutdown() throws Exception { jettys.clear(); final ExecutorService executorCloser = ExecutorUtil.newMDCAwareCachedThreadPool(new SolrNamedThreadFactory("jetty-closer")); - Collection> futures = executorCloser.invokeAll(shutdowns); + + // Use a timeout to prevent indefinite hangs during shutdown, especially when cores + // are in a bad state (e.g., after tragic events). 60 seconds should be enough for + // parallel shutdown of all jettys in most cases. + Collection> futures; + try { + futures = executorCloser.invokeAll(shutdowns, 60, TimeUnit.SECONDS); + } catch (InterruptedException e) { + log.warn("Interrupted while shutting down jettys", e); + Thread.currentThread().interrupt(); + executorCloser.shutdownNow(); + throw e; + } + ExecutorUtil.shutdownAndAwaitTermination(executorCloser); Exception shutdownError = checkForExceptions("Error shutting down MiniSolrCloudCluster", futures); @@ -776,6 +789,11 @@ private Exception checkForExceptions(String message, Collection Date: Thu, 18 Dec 2025 12:21:51 +0100 Subject: [PATCH 2/8] Review feedback --- gradle/libs.versions.toml | 1 - .../java/org/apache/solr/cloud/MiniSolrCloudCluster.java | 6 +++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index 4a0531410bc..be018993437 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -182,7 +182,6 @@ openjdk-jmh = "1.37" opentelemetry = "1.56.0" opentelemetry-prometheus = "1.56.0-alpha" opentelemetry-runtime-telemetry = "2.22.0-alpha" -osgi-annotation = "8.1.0" oshai-logging = "7.0.13" # @keep for version alignment ow2-asm = "9.8" diff --git a/solr/test-framework/src/java/org/apache/solr/cloud/MiniSolrCloudCluster.java b/solr/test-framework/src/java/org/apache/solr/cloud/MiniSolrCloudCluster.java index 28a581ec6b2..8fc6f0a465a 100644 --- a/solr/test-framework/src/java/org/apache/solr/cloud/MiniSolrCloudCluster.java +++ b/solr/test-framework/src/java/org/apache/solr/cloud/MiniSolrCloudCluster.java @@ -43,6 +43,7 @@ import java.util.Set; import java.util.SortedMap; import java.util.concurrent.Callable; +import java.util.concurrent.CancellationException; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.CopyOnWriteArrayList; import java.util.concurrent.CountDownLatch; @@ -789,9 +790,8 @@ private Exception checkForExceptions(String message, Collection Date: Thu, 18 Dec 2025 12:43:51 +0100 Subject: [PATCH 3/8] Make MiniSolrCloudCluster shutdown behavior configurable --- .../solr/cloud/LeaderTragicEventTest.java | 2 + .../solr/cloud/MiniSolrCloudCluster.java | 61 ++++++++++++++++--- 2 files changed, 54 insertions(+), 9 deletions(-) diff --git a/solr/core/src/test/org/apache/solr/cloud/LeaderTragicEventTest.java b/solr/core/src/test/org/apache/solr/cloud/LeaderTragicEventTest.java index 01d84378474..7ca58ca5204 100644 --- a/solr/core/src/test/org/apache/solr/cloud/LeaderTragicEventTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/LeaderTragicEventTest.java @@ -56,6 +56,8 @@ public static void setupCluster() throws Exception { configureCluster(2) .addConfig( "config", TEST_PATH().resolve("configsets").resolve("cloud-minimal").resolve("conf")) + .withShutdownTimeout(60) + .withShutdownTimeoutIsError(false) .configure(); } diff --git a/solr/test-framework/src/java/org/apache/solr/cloud/MiniSolrCloudCluster.java b/solr/test-framework/src/java/org/apache/solr/cloud/MiniSolrCloudCluster.java index 8fc6f0a465a..c9f32db79fd 100644 --- a/solr/test-framework/src/java/org/apache/solr/cloud/MiniSolrCloudCluster.java +++ b/solr/test-framework/src/java/org/apache/solr/cloud/MiniSolrCloudCluster.java @@ -161,6 +161,8 @@ public class MiniSolrCloudCluster { private final JettyConfig jettyConfig; private final String solrXml; private final boolean trackJettyMetrics; + private final int shutdownTimeout; + private final boolean shutdownTimeoutIsError; private final AtomicInteger nodeIds = new AtomicInteger(); private final Map solrClientByCollection = new ConcurrentHashMap<>(); @@ -243,7 +245,9 @@ public MiniSolrCloudCluster( zkTestServer, securityJson, false, - formatZkServer); + formatZkServer, + 300, + true); } /** @@ -258,6 +262,8 @@ public MiniSolrCloudCluster( * @param zkTestServer ZkTestServer to use. If null, one will be created * @param securityJson A string representation of security.json file (optional). * @param trackJettyMetrics supply jetties with metrics registry + * @param shutdownTimeout timeout in seconds for shutdown (default 300) + * @param shutdownTimeoutIsError whether timeout during shutdown is an error (default true) * @throws Exception if there was an error starting the cluster */ MiniSolrCloudCluster( @@ -268,7 +274,9 @@ public MiniSolrCloudCluster( ZkTestServer zkTestServer, Optional securityJson, boolean trackJettyMetrics, - boolean formatZkServer) + boolean formatZkServer, + int shutdownTimeout, + boolean shutdownTimeoutIsError) throws Exception { Objects.requireNonNull(securityJson); @@ -276,6 +284,8 @@ public MiniSolrCloudCluster( this.jettyConfig = Objects.requireNonNull(jettyConfig); this.solrXml = solrXml == null ? DEFAULT_CLOUD_SOLR_XML : solrXml; this.trackJettyMetrics = trackJettyMetrics; + this.shutdownTimeout = shutdownTimeout; + this.shutdownTimeoutIsError = shutdownTimeoutIsError; log.info("Starting cluster of {} servers in {}", numServers, baseDir); @@ -676,11 +686,11 @@ public void shutdown() throws Exception { ExecutorUtil.newMDCAwareCachedThreadPool(new SolrNamedThreadFactory("jetty-closer")); // Use a timeout to prevent indefinite hangs during shutdown, especially when cores - // are in a bad state (e.g., after tragic events). 60 seconds should be enough for - // parallel shutdown of all jettys in most cases. + // are in a bad state (e.g., after tragic events). The default timeout is 300 seconds, + // but can be configured via the builder. Collection> futures; try { - futures = executorCloser.invokeAll(shutdowns, 60, TimeUnit.SECONDS); + futures = executorCloser.invokeAll(shutdowns, shutdownTimeout, TimeUnit.SECONDS); } catch (InterruptedException e) { log.warn("Interrupted while shutting down jettys", e); Thread.currentThread().interrupt(); @@ -791,9 +801,15 @@ private Exception checkForExceptions(String message, Collection Date: Thu, 18 Dec 2025 12:53:19 +0100 Subject: [PATCH 4/8] Cleanup non-stopped jettys --- .../solr/cloud/MiniSolrCloudCluster.java | 61 +++++++++++++++++-- 1 file changed, 57 insertions(+), 4 deletions(-) diff --git a/solr/test-framework/src/java/org/apache/solr/cloud/MiniSolrCloudCluster.java b/solr/test-framework/src/java/org/apache/solr/cloud/MiniSolrCloudCluster.java index c9f32db79fd..5e34ad2e814 100644 --- a/solr/test-framework/src/java/org/apache/solr/cloud/MiniSolrCloudCluster.java +++ b/solr/test-framework/src/java/org/apache/solr/cloud/MiniSolrCloudCluster.java @@ -677,18 +677,21 @@ public void shutdown() throws Exception { }); solrClientByCollection.clear(); - List> shutdowns = new ArrayList<>(jettys.size()); - for (final JettySolrRunner jetty : jettys) { + // Create a list of shutdown tasks paired with their jetty instances + // so we can force-stop any that fail to shut down cleanly + List jettyList = new ArrayList<>(jettys); + List> shutdowns = new ArrayList<>(jettyList.size()); + for (final JettySolrRunner jetty : jettyList) { shutdowns.add(() -> stopJettySolrRunner(jetty)); } - jettys.clear(); + final ExecutorService executorCloser = ExecutorUtil.newMDCAwareCachedThreadPool(new SolrNamedThreadFactory("jetty-closer")); // Use a timeout to prevent indefinite hangs during shutdown, especially when cores // are in a bad state (e.g., after tragic events). The default timeout is 300 seconds, // but can be configured via the builder. - Collection> futures; + List> futures; try { futures = executorCloser.invokeAll(shutdowns, shutdownTimeout, TimeUnit.SECONDS); } catch (InterruptedException e) { @@ -699,6 +702,12 @@ public void shutdown() throws Exception { } ExecutorUtil.shutdownAndAwaitTermination(executorCloser); + + // Force-stop any jettys that failed to shutdown cleanly + forceStopFailedJettys(jettyList, futures); + + jettys.clear(); + Exception shutdownError = checkForExceptions("Error shutting down MiniSolrCloudCluster", futures); if (shutdownError != null) { @@ -789,6 +798,50 @@ public CloudLegacySolrClient.Builder basicSolrClientBuilder() { .withConnectionTimeout(15000); } + /** + * Attempts to forcefully stop any Jetty instances that failed to shut down cleanly. This method + * is called after the normal shutdown timeout to ensure resources are properly cleaned up. + * + * @param jettyList the list of jetty instances that we attempted to shut down + * @param futures the futures corresponding to the shutdown tasks + */ + private void forceStopFailedJettys( + List jettyList, List> futures) { + for (int i = 0; i < futures.size(); i++) { + Future future = futures.get(i); + JettySolrRunner jetty = jettyList.get(i); + try { + future.get(); + // Successfully shut down, nothing to do + } catch (CancellationException e) { + // Task was cancelled due to timeout - try to force stop + log.warn( + "Jetty {} shutdown task was cancelled, attempting forceful stop", jetty.getNodeName()); + try { + jetty.stop(); + log.info("Successfully force-stopped jetty {}", jetty.getNodeName()); + } catch (Exception stopEx) { + log.error("Failed to force-stop jetty " + jetty.getNodeName(), stopEx); + } + } catch (ExecutionException e) { + // Task threw an exception - jetty may still be running, try to force stop + log.warn( + "Jetty {} shutdown task failed with exception, attempting forceful stop", + jetty.getNodeName(), + e); + try { + jetty.stop(); + log.info("Successfully force-stopped jetty {} after exception", jetty.getNodeName()); + } catch (Exception stopEx) { + log.error("Failed to force-stop jetty " + jetty.getNodeName(), stopEx); + } + } catch (InterruptedException e) { + log.warn("Interrupted while checking jetty shutdown status", e); + Thread.currentThread().interrupt(); + } + } + } + private Exception checkForExceptions(String message, Collection> futures) throws InterruptedException { Exception parsed = new Exception(message); From 7c22b04a486d15473f4f698fa27f712b6178b59a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=C3=B8ydahl?= Date: Thu, 18 Dec 2025 13:07:10 +0100 Subject: [PATCH 5/8] Do not touch libs.versions.toml in this PR --- gradle/libs.versions.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index be018993437..4a0531410bc 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -182,6 +182,7 @@ openjdk-jmh = "1.37" opentelemetry = "1.56.0" opentelemetry-prometheus = "1.56.0-alpha" opentelemetry-runtime-telemetry = "2.22.0-alpha" +osgi-annotation = "8.1.0" oshai-logging = "7.0.13" # @keep for version alignment ow2-asm = "9.8" From c6b27dfe1d33886478ca56ea313a2a85f8968d5a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=C3=B8ydahl?= Date: Thu, 18 Dec 2025 13:20:54 +0100 Subject: [PATCH 6/8] Use same error msg --- .../src/java/org/apache/solr/cloud/MiniSolrCloudCluster.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/solr/test-framework/src/java/org/apache/solr/cloud/MiniSolrCloudCluster.java b/solr/test-framework/src/java/org/apache/solr/cloud/MiniSolrCloudCluster.java index 5e34ad2e814..e529afa7017 100644 --- a/solr/test-framework/src/java/org/apache/solr/cloud/MiniSolrCloudCluster.java +++ b/solr/test-framework/src/java/org/apache/solr/cloud/MiniSolrCloudCluster.java @@ -855,7 +855,7 @@ private Exception checkForExceptions(String message, Collection Date: Thu, 18 Dec 2025 13:46:25 +0100 Subject: [PATCH 7/8] Add check for timeoutException --- .../solr/cloud/MiniSolrCloudCluster.java | 81 ++++--------------- 1 file changed, 14 insertions(+), 67 deletions(-) diff --git a/solr/test-framework/src/java/org/apache/solr/cloud/MiniSolrCloudCluster.java b/solr/test-framework/src/java/org/apache/solr/cloud/MiniSolrCloudCluster.java index e529afa7017..c9c6f68113a 100644 --- a/solr/test-framework/src/java/org/apache/solr/cloud/MiniSolrCloudCluster.java +++ b/solr/test-framework/src/java/org/apache/solr/cloud/MiniSolrCloudCluster.java @@ -246,7 +246,7 @@ public MiniSolrCloudCluster( securityJson, false, formatZkServer, - 300, + 60, true); } @@ -262,7 +262,7 @@ public MiniSolrCloudCluster( * @param zkTestServer ZkTestServer to use. If null, one will be created * @param securityJson A string representation of security.json file (optional). * @param trackJettyMetrics supply jetties with metrics registry - * @param shutdownTimeout timeout in seconds for shutdown (default 300) + * @param shutdownTimeout timeout in seconds for shutdown (default 60) * @param shutdownTimeoutIsError whether timeout during shutdown is an error (default true) * @throws Exception if there was an error starting the cluster */ @@ -677,11 +677,8 @@ public void shutdown() throws Exception { }); solrClientByCollection.clear(); - // Create a list of shutdown tasks paired with their jetty instances - // so we can force-stop any that fail to shut down cleanly - List jettyList = new ArrayList<>(jettys); - List> shutdowns = new ArrayList<>(jettyList.size()); - for (final JettySolrRunner jetty : jettyList) { + List> shutdowns = new ArrayList<>(jettys.size()); + for (final JettySolrRunner jetty : jettys) { shutdowns.add(() -> stopJettySolrRunner(jetty)); } @@ -689,7 +686,7 @@ public void shutdown() throws Exception { ExecutorUtil.newMDCAwareCachedThreadPool(new SolrNamedThreadFactory("jetty-closer")); // Use a timeout to prevent indefinite hangs during shutdown, especially when cores - // are in a bad state (e.g., after tragic events). The default timeout is 300 seconds, + // are in a bad state (e.g., after tragic events). The default timeout is 60 seconds, // but can be configured via the builder. List> futures; try { @@ -703,9 +700,6 @@ public void shutdown() throws Exception { ExecutorUtil.shutdownAndAwaitTermination(executorCloser); - // Force-stop any jettys that failed to shutdown cleanly - forceStopFailedJettys(jettyList, futures); - jettys.clear(); Exception shutdownError = @@ -798,50 +792,6 @@ public CloudLegacySolrClient.Builder basicSolrClientBuilder() { .withConnectionTimeout(15000); } - /** - * Attempts to forcefully stop any Jetty instances that failed to shut down cleanly. This method - * is called after the normal shutdown timeout to ensure resources are properly cleaned up. - * - * @param jettyList the list of jetty instances that we attempted to shut down - * @param futures the futures corresponding to the shutdown tasks - */ - private void forceStopFailedJettys( - List jettyList, List> futures) { - for (int i = 0; i < futures.size(); i++) { - Future future = futures.get(i); - JettySolrRunner jetty = jettyList.get(i); - try { - future.get(); - // Successfully shut down, nothing to do - } catch (CancellationException e) { - // Task was cancelled due to timeout - try to force stop - log.warn( - "Jetty {} shutdown task was cancelled, attempting forceful stop", jetty.getNodeName()); - try { - jetty.stop(); - log.info("Successfully force-stopped jetty {}", jetty.getNodeName()); - } catch (Exception stopEx) { - log.error("Failed to force-stop jetty " + jetty.getNodeName(), stopEx); - } - } catch (ExecutionException e) { - // Task threw an exception - jetty may still be running, try to force stop - log.warn( - "Jetty {} shutdown task failed with exception, attempting forceful stop", - jetty.getNodeName(), - e); - try { - jetty.stop(); - log.info("Successfully force-stopped jetty {} after exception", jetty.getNodeName()); - } catch (Exception stopEx) { - log.error("Failed to force-stop jetty " + jetty.getNodeName(), stopEx); - } - } catch (InterruptedException e) { - log.warn("Interrupted while checking jetty shutdown status", e); - Thread.currentThread().interrupt(); - } - } - } - private Exception checkForExceptions(String message, Collection> futures) throws InterruptedException { Exception parsed = new Exception(message); @@ -850,18 +800,15 @@ private Exception checkForExceptions(String message, Collection Date: Thu, 18 Dec 2025 13:56:40 +0100 Subject: [PATCH 8/8] Remove configurability of timeout as it was not needed --- .../solr/cloud/LeaderTragicEventTest.java | 1 - .../solr/cloud/MiniSolrCloudCluster.java | 26 +++---------------- 2 files changed, 3 insertions(+), 24 deletions(-) diff --git a/solr/core/src/test/org/apache/solr/cloud/LeaderTragicEventTest.java b/solr/core/src/test/org/apache/solr/cloud/LeaderTragicEventTest.java index 7ca58ca5204..03fcd8a7a7a 100644 --- a/solr/core/src/test/org/apache/solr/cloud/LeaderTragicEventTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/LeaderTragicEventTest.java @@ -56,7 +56,6 @@ public static void setupCluster() throws Exception { configureCluster(2) .addConfig( "config", TEST_PATH().resolve("configsets").resolve("cloud-minimal").resolve("conf")) - .withShutdownTimeout(60) .withShutdownTimeoutIsError(false) .configure(); } diff --git a/solr/test-framework/src/java/org/apache/solr/cloud/MiniSolrCloudCluster.java b/solr/test-framework/src/java/org/apache/solr/cloud/MiniSolrCloudCluster.java index c9c6f68113a..f6c8f07501d 100644 --- a/solr/test-framework/src/java/org/apache/solr/cloud/MiniSolrCloudCluster.java +++ b/solr/test-framework/src/java/org/apache/solr/cloud/MiniSolrCloudCluster.java @@ -43,7 +43,6 @@ import java.util.Set; import java.util.SortedMap; import java.util.concurrent.Callable; -import java.util.concurrent.CancellationException; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.CopyOnWriteArrayList; import java.util.concurrent.CountDownLatch; @@ -161,7 +160,6 @@ public class MiniSolrCloudCluster { private final JettyConfig jettyConfig; private final String solrXml; private final boolean trackJettyMetrics; - private final int shutdownTimeout; private final boolean shutdownTimeoutIsError; private final AtomicInteger nodeIds = new AtomicInteger(); @@ -246,7 +244,6 @@ public MiniSolrCloudCluster( securityJson, false, formatZkServer, - 60, true); } @@ -262,7 +259,6 @@ public MiniSolrCloudCluster( * @param zkTestServer ZkTestServer to use. If null, one will be created * @param securityJson A string representation of security.json file (optional). * @param trackJettyMetrics supply jetties with metrics registry - * @param shutdownTimeout timeout in seconds for shutdown (default 60) * @param shutdownTimeoutIsError whether timeout during shutdown is an error (default true) * @throws Exception if there was an error starting the cluster */ @@ -275,7 +271,6 @@ public MiniSolrCloudCluster( Optional securityJson, boolean trackJettyMetrics, boolean formatZkServer, - int shutdownTimeout, boolean shutdownTimeoutIsError) throws Exception { @@ -284,7 +279,6 @@ public MiniSolrCloudCluster( this.jettyConfig = Objects.requireNonNull(jettyConfig); this.solrXml = solrXml == null ? DEFAULT_CLOUD_SOLR_XML : solrXml; this.trackJettyMetrics = trackJettyMetrics; - this.shutdownTimeout = shutdownTimeout; this.shutdownTimeoutIsError = shutdownTimeoutIsError; log.info("Starting cluster of {} servers in {}", numServers, baseDir); @@ -685,12 +679,11 @@ public void shutdown() throws Exception { final ExecutorService executorCloser = ExecutorUtil.newMDCAwareCachedThreadPool(new SolrNamedThreadFactory("jetty-closer")); - // Use a timeout to prevent indefinite hangs during shutdown, especially when cores - // are in a bad state (e.g., after tragic events). The default timeout is 60 seconds, - // but can be configured via the builder. + // Use a 60 second timeout to prevent indefinite hangs during shutdown, especially when cores + // are in a bad state (e.g., after tragic events). This is 2x Jetty's internal timeout. List> futures; try { - futures = executorCloser.invokeAll(shutdowns, shutdownTimeout, TimeUnit.SECONDS); + futures = executorCloser.invokeAll(shutdowns, 60, TimeUnit.SECONDS); } catch (InterruptedException e) { log.warn("Interrupted while shutting down jettys", e); Thread.currentThread().interrupt(); @@ -1066,7 +1059,6 @@ public static class Builder { EnvUtils.getPropertyAsBool("solr.cloud.overseer.enabled", true); private boolean formatZkServer = true; private boolean disableTraceIdGeneration = false; - private int shutdownTimeout = 60; private boolean shutdownTimeoutIsError = true; /** @@ -1186,17 +1178,6 @@ public Builder formatZkServer(boolean formatZkServer) { return this; } - /** - * Set the timeout for shutting down jetty instances - * - * @param shutdownTimeout timeout in seconds (default 60) - * @return the instance of {@linkplain Builder} - */ - public Builder withShutdownTimeout(int shutdownTimeout) { - this.shutdownTimeout = shutdownTimeout; - return this; - } - /** * Configure whether a timeout during shutdown is treated as an error * @@ -1242,7 +1223,6 @@ public MiniSolrCloudCluster build() throws Exception { securityJson, trackJettyMetrics, formatZkServer, - shutdownTimeout, shutdownTimeoutIsError); for (Config config : configs) { cluster.uploadConfigSet(config.path, config.name);