From ee61d42dc3b0422a3dfe918d4e958e08ac6ce45c Mon Sep 17 00:00:00 2001 From: DJ Gregor Date: Wed, 7 Jan 2026 18:30:45 -0800 Subject: [PATCH 1/3] Warn when long running trace feature is enabled but not supported Log a warning message when the long running traces feature is enabled but the tracer is not connected to a Datadog Agent that supports receiving long running traces. Previously the long running traces buffer would always be empty, even though the feature was enabled with dd.trace.experimental.long-running.enabled=true. This led to a good amount of confusion when I was initially developing a feature to dump long running traces without a local Datadog Agent running. --- .../trace/core/LongRunningTracesTracker.java | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/dd-trace-core/src/main/java/datadog/trace/core/LongRunningTracesTracker.java b/dd-trace-core/src/main/java/datadog/trace/core/LongRunningTracesTracker.java index 5d2e32fecf6..dae16125735 100644 --- a/dd-trace-core/src/main/java/datadog/trace/core/LongRunningTracesTracker.java +++ b/dd-trace-core/src/main/java/datadog/trace/core/LongRunningTracesTracker.java @@ -3,12 +3,17 @@ import datadog.communication.ddagent.DDAgentFeaturesDiscovery; import datadog.communication.ddagent.SharedCommunicationObjects; import datadog.trace.api.Config; +import datadog.trace.api.config.TracerConfig; import datadog.trace.core.monitor.HealthMetrics; import java.util.ArrayList; import java.util.List; import java.util.concurrent.TimeUnit; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; public class LongRunningTracesTracker { + private static final Logger LOGGER = LoggerFactory.getLogger(LongRunningTracesTracker.class); + private final DDAgentFeaturesDiscovery features; private final HealthMetrics healthMetrics; private long lastFlushMilli = 0; @@ -41,6 +46,14 @@ public LongRunningTracesTracker( (int) TimeUnit.SECONDS.toMillis(config.getLongRunningTraceFlushInterval()); this.features = sharedCommunicationObjects.featuresDiscovery(config); this.healthMetrics = healthMetrics; + + if (!features.supportsLongRunning()) { + LOGGER.warn( + "Long running trace tracking is enabled via {}, however the Datadog Agent version {} does not support receiving long running traces. " + + "Long running traces will not be tracked.", + "dd." + TracerConfig.TRACE_LONG_RUNNING_ENABLED, + features.getVersion() != null ? features.getVersion() : "unknown"); + } } public boolean add(PendingTraceBuffer.Element element) { From eb0d36d1b67a8dc67df3beaf51e3f50cfbb797b0 Mon Sep 17 00:00:00 2001 From: DJ Gregor Date: Wed, 7 Jan 2026 18:37:01 -0800 Subject: [PATCH 2/3] Always track long running traces when feature is enabled This allows dumping long running traces via the new JMX flare feature when not connected to a Datadog Agent. This introduces a change to the state handling for long-running traces. Previously, if features.supportsLongRunning() was false, the trace's slot was cleaned (but note that the state would never transition; see the discussion below). With this commit, these traces stay in their slot in the TRACKED state until another condition removes them, which is the same as what would happen if features.supportsLongRunning() returned true. A note about state transitions: When the "if (trace.empty() || !features.supportsLongRunning())" block was entered previously, the trace's state would be transitioned to NOT_TRACKED, but only if the state was WRITE_RUNNING_SPANS. 
This would only be true when traces were empty AND they had passed through a flush cycle (which would transition them from TRACKED to WRITE_RUNNING_SPANS). Previously, when features.supportsLongRunning() was false, traces never made that transition, so they would always be in the TRACKED state when cleanSlot was called. The only other consumer of the state is in PendingTrace, and it only checks for the WRITE_RUNNING_SPANS state, so I think this is not a problem. I think there might be a similar state transition issue if the sampling priority is ever reduced after a flush cycle, but it also looks innocuous. --- .../trace/core/LongRunningTracesTracker.java | 16 ++++++++++------ .../core/LongRunningTracesTrackerTest.groovy | 6 ++++-- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/dd-trace-core/src/main/java/datadog/trace/core/LongRunningTracesTracker.java b/dd-trace-core/src/main/java/datadog/trace/core/LongRunningTracesTracker.java index dae16125735..e6dc092cebd 100644 --- a/dd-trace-core/src/main/java/datadog/trace/core/LongRunningTracesTracker.java +++ b/dd-trace-core/src/main/java/datadog/trace/core/LongRunningTracesTracker.java @@ -50,9 +50,11 @@ public LongRunningTracesTracker( if (!features.supportsLongRunning()) { LOGGER.warn( "Long running trace tracking is enabled via {}, however the Datadog Agent version {} does not support receiving long running traces. " - + "Long running traces will not be tracked.", + + "Long running traces will be tracked locally in memory (up to {} traces) but will NOT be sent to the agent. " + + "Long running traces are included in tracer flares.", "dd." + TracerConfig.TRACE_LONG_RUNNING_ENABLED, - features.getVersion() != null ? features.getVersion() : "unknown"); + features.getVersion() != null ? features.getVersion() : "unknown", + maxTrackedTraces); } } @@ -91,7 +93,7 @@ public void flushAndCompact(long nowMilli) { cleanSlot(i); continue; } - if (trace.empty() || !features.supportsLongRunning()) { + if (trace.empty()) { trace.compareAndSetLongRunningState(WRITE_RUNNING_SPANS, NOT_TRACKED); cleanSlot(i); continue; } @@ -108,9 +110,11 @@ public void flushAndCompact(long nowMilli) { cleanSlot(i); continue; } - trace.compareAndSetLongRunningState(TRACKED, WRITE_RUNNING_SPANS); - write++; - trace.write(); + if (features.supportsLongRunning()) { + trace.compareAndSetLongRunningState(TRACKED, WRITE_RUNNING_SPANS); + write++; + trace.write(); + } } i++; } diff --git a/dd-trace-core/src/test/groovy/datadog/trace/core/LongRunningTracesTrackerTest.groovy b/dd-trace-core/src/test/groovy/datadog/trace/core/LongRunningTracesTrackerTest.groovy index 07313ee4282..47780604ee1 100644 --- a/dd-trace-core/src/test/groovy/datadog/trace/core/LongRunningTracesTrackerTest.groovy +++ b/dd-trace-core/src/test/groovy/datadog/trace/core/LongRunningTracesTrackerTest.groovy @@ -123,7 +123,7 @@ class LongRunningTracesTrackerTest extends DDSpecification { trace.longRunningTrackedState == LongRunningTracesTracker.EXPIRED } - def "agent disabled feature"() { + def "trace remains tracked but not written when agent long running feature not available"() { given: def trace = newTraceToTrack() tracker.add(trace) @@ -133,7 +133,9 @@ class LongRunningTracesTrackerTest extends DDSpecification { then: 1 * features.supportsLongRunning() >> false - tracker.traceArray.size() == 0 + tracker.traceArray.size() == 1 + tracker.traceArray[0].longRunningTrackedState == LongRunningTracesTracker.TRACKED + tracker.traceArray[0].getLastWriteTime() == 0 } def flushAt(long timeMilli) { From 
8be81a393c29774feb1d11553ee9bc9ea3254c88 Mon Sep 17 00:00:00 2001 From: DJ Gregor Date: Tue, 28 Oct 2025 09:32:57 -0700 Subject: [PATCH 3/3] Add long-running traces metric for drops due to sampling priority This likely isn't an important metric to track, but I noticed that long-running traces dropped due to sampling priority were the only ones not reflected in existing metrics when they are dropped from the tracker, so I thought it might be good to add a metric for completeness. This change introduces a new metric, "long-running.dropped_sampling", to count traces that are dropped when negativeOrNullPriority(trace) is true. There is an existing metric, "long-running.dropped", for long-running traces that are dropped on input to the tracker when there are no slots free. That metric name was kept as-is so as not to disturb any existing downstream consumers. If that is not a concern, it might be good to rename the existing metric to clarify that it captures traces dropped on input. --- .../datadog/trace/core/LongRunningTracesTracker.java | 5 ++++- .../datadog/trace/core/monitor/HealthMetrics.java | 3 ++- .../trace/core/monitor/TracerHealthMetrics.java | 12 +++++++++++- .../datadog/trace/core/PendingTraceBufferTest.groovy | 2 +- .../trace/core/monitor/HealthMetricsTest.groovy | 3 ++- 5 files changed, 20 insertions(+), 5 deletions(-) diff --git a/dd-trace-core/src/main/java/datadog/trace/core/LongRunningTracesTracker.java b/dd-trace-core/src/main/java/datadog/trace/core/LongRunningTracesTracker.java index e6dc092cebd..8314f9cfb91 100644 --- a/dd-trace-core/src/main/java/datadog/trace/core/LongRunningTracesTracker.java +++ b/dd-trace-core/src/main/java/datadog/trace/core/LongRunningTracesTracker.java @@ -26,6 +26,7 @@ public class LongRunningTracesTracker { private int dropped = 0; private int write = 0; private int expired = 0; + private int droppedSampling = 0; public static final int NOT_TRACKED = -1; public static final int UNDEFINED = 0; @@ -107,6 +108,7 @@ public void flushAndCompact(long nowMilli) { if (shouldFlush(nowMilli, trace)) { if (negativeOrNullPriority(trace)) { trace.compareAndSetLongRunningState(TRACKED, NOT_TRACKED); + droppedSampling++; cleanSlot(i); continue; } @@ -151,9 +153,10 @@ private boolean negativeOrNullPriority(PendingTrace trace) { } private void flushStats() { - healthMetrics.onLongRunningUpdate(dropped, write, expired); + healthMetrics.onLongRunningUpdate(dropped, write, expired, droppedSampling); dropped = 0; write = 0; expired = 0; + droppedSampling = 0; } } diff --git a/dd-trace-core/src/main/java/datadog/trace/core/monitor/HealthMetrics.java b/dd-trace-core/src/main/java/datadog/trace/core/monitor/HealthMetrics.java index d0531a330cf..e315ce4e88b 100644 --- a/dd-trace-core/src/main/java/datadog/trace/core/monitor/HealthMetrics.java +++ b/dd-trace-core/src/main/java/datadog/trace/core/monitor/HealthMetrics.java @@ -71,7 +71,8 @@ public void onSend( public void onFailedSend( final int traceCount, final int sizeInBytes, final RemoteApi.Response response) {} - public void onLongRunningUpdate(final int dropped, final int write, final int expired) {} + public void onLongRunningUpdate( + final int dropped, final int write, final int expired, final int droppedSampling) {} /** * Report that a trace has been used to compute client stats. 
diff --git a/dd-trace-core/src/main/java/datadog/trace/core/monitor/TracerHealthMetrics.java b/dd-trace-core/src/main/java/datadog/trace/core/monitor/TracerHealthMetrics.java index 54024e85721..cecc158007e 100644 --- a/dd-trace-core/src/main/java/datadog/trace/core/monitor/TracerHealthMetrics.java +++ b/dd-trace-core/src/main/java/datadog/trace/core/monitor/TracerHealthMetrics.java @@ -88,6 +88,7 @@ public class TracerHealthMetrics extends HealthMetrics implements AutoCloseable private final LongAdder longRunningTracesWrite = new LongAdder(); private final LongAdder longRunningTracesDropped = new LongAdder(); private final LongAdder longRunningTracesExpired = new LongAdder(); + private final LongAdder longRunningTracesDroppedSampling = new LongAdder(); private final LongAdder clientStatsProcessedSpans = new LongAdder(); private final LongAdder clientStatsProcessedTraces = new LongAdder(); @@ -296,10 +297,12 @@ public void onFailedSend( } @Override - public void onLongRunningUpdate(final int dropped, final int write, final int expired) { + public void onLongRunningUpdate( + final int dropped, final int write, final int expired, final int droppedSampling) { longRunningTracesWrite.add(write); longRunningTracesDropped.add(dropped); longRunningTracesExpired.add(expired); + longRunningTracesDroppedSampling.add(droppedSampling); } private void onSendAttempt( @@ -479,6 +482,11 @@ public void run(TracerHealthMetrics target) { target.statsd, "long-running.dropped", target.longRunningTracesDropped, NO_TAGS); reportIfChanged( target.statsd, "long-running.expired", target.longRunningTracesExpired, NO_TAGS); + reportIfChanged( + target.statsd, + "long-running.dropped_sampling", + target.longRunningTracesDroppedSampling, + NO_TAGS); reportIfChanged( target.statsd, "stats.traces_in", target.clientStatsProcessedTraces, NO_TAGS); @@ -608,6 +616,8 @@ public String summary() { + longRunningTracesDropped.sum() + "\nlongRunningTracesExpired=" + longRunningTracesExpired.sum() + + "\nlongRunningTracesDroppedSampling=" + + longRunningTracesDroppedSampling.sum() + "\n" + "\nclientStatsRequests=" + clientStatsRequests.sum() diff --git a/dd-trace-core/src/test/groovy/datadog/trace/core/PendingTraceBufferTest.groovy b/dd-trace-core/src/test/groovy/datadog/trace/core/PendingTraceBufferTest.groovy index b6ca943fa6c..d0140460ae0 100644 --- a/dd-trace-core/src/test/groovy/datadog/trace/core/PendingTraceBufferTest.groovy +++ b/dd-trace-core/src/test/groovy/datadog/trace/core/PendingTraceBufferTest.groovy @@ -40,7 +40,7 @@ This can manifest when creating mocks. 
@Timeout(5) class PendingTraceBufferTest extends DDSpecification { @Subject - def buffer = PendingTraceBuffer.delaying(SystemTimeSource.INSTANCE, Mock(Config), null, null) + def buffer = PendingTraceBuffer.delaying(SystemTimeSource.INSTANCE, Mock(Config), null, HealthMetrics.NO_OP) def bufferSpy = Spy(buffer) def tracer = Mock(CoreTracer) diff --git a/dd-trace-core/src/test/groovy/datadog/trace/core/monitor/HealthMetricsTest.groovy b/dd-trace-core/src/test/groovy/datadog/trace/core/monitor/HealthMetricsTest.groovy index 7c6154876b0..406d839a28e 100644 --- a/dd-trace-core/src/test/groovy/datadog/trace/core/monitor/HealthMetricsTest.groovy +++ b/dd-trace-core/src/test/groovy/datadog/trace/core/monitor/HealthMetricsTest.groovy @@ -400,12 +400,13 @@ class HealthMetricsTest extends Specification { def healthMetrics = new TracerHealthMetrics(new Latched(statsD, latch), 100, TimeUnit.MILLISECONDS) healthMetrics.start() when: - healthMetrics.onLongRunningUpdate(3,10,1) + healthMetrics.onLongRunningUpdate(3,10,1,5) latch.await(10, TimeUnit.SECONDS) then: 1 * statsD.count("long-running.write", 10, _) 1 * statsD.count("long-running.dropped", 3, _) 1 * statsD.count("long-running.expired", 1, _) + 1 * statsD.count("long-running.dropped_sampling", 5, _) cleanup: healthMetrics.close() }
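
For reviewers: below is a condensed sketch of the per-slot handling in flushAndCompact() as it ends up after all three patches. It is reassembled from the hunks above, so the surrounding loop, the elided checks, and the exact indentation are assumed rather than copied verbatim.

    // Sketch only: reassembled from the hunks in patches 2 and 3; context
    // outside the hunks is assumed, not a verbatim copy of the file.
    if (trace.empty()) {
      // Empty traces leave the buffer regardless of agent support.
      trace.compareAndSetLongRunningState(WRITE_RUNNING_SPANS, NOT_TRACKED);
      cleanSlot(i);
      continue;
    }
    // ... other per-slot checks (e.g. expiration) elided ...
    if (shouldFlush(nowMilli, trace)) {
      if (negativeOrNullPriority(trace)) {
        // Patch 3: count traces dropped because of their sampling priority.
        trace.compareAndSetLongRunningState(TRACKED, NOT_TRACKED);
        droppedSampling++;
        cleanSlot(i);
        continue;
      }
      // Patch 2: only write to the agent when it supports long running traces;
      // otherwise the trace stays in its slot in the TRACKED state so it can
      // still be dumped via a tracer flare.
      if (features.supportsLongRunning()) {
        trace.compareAndSetLongRunningState(TRACKED, WRITE_RUNNING_SPANS);
        write++;
        trace.write();
      }
    }
    i++;

The new droppedSampling counter flows through the widened HealthMetrics.onLongRunningUpdate(dropped, write, expired, droppedSampling) callback and is reported as the "long-running.dropped_sampling" count, alongside the existing "long-running.dropped", "long-running.write", and "long-running.expired" metrics.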