Skip to content

Commit e6eac4a

Browse files
committed
A bunch of improvements thanks to Claude's team of review agents
1 parent ed7e891 commit e6eac4a

File tree

8 files changed

+44
-24
lines changed

8 files changed

+44
-24
lines changed

apps/webapp/app/env.server.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1233,6 +1233,9 @@ const EnvironmentSchema = z
12331233
EVENTS_CLICKHOUSE_COMPRESSION_REQUEST: z.string().default("1"),
12341234
EVENTS_CLICKHOUSE_BATCH_SIZE: z.coerce.number().int().default(1000),
12351235
EVENTS_CLICKHOUSE_FLUSH_INTERVAL_MS: z.coerce.number().int().default(1000),
1236+
METRICS_CLICKHOUSE_BATCH_SIZE: z.coerce.number().int().default(10000),
1237+
METRICS_CLICKHOUSE_FLUSH_INTERVAL_MS: z.coerce.number().int().default(1000),
1238+
METRICS_CLICKHOUSE_MAX_CONCURRENCY: z.coerce.number().int().default(3),
12361239
EVENTS_CLICKHOUSE_INSERT_STRATEGY: z.enum(["insert", "insert_async"]).default("insert"),
12371240
EVENTS_CLICKHOUSE_WAIT_FOR_ASYNC_INSERT: z.string().default("1"),
12381241
EVENTS_CLICKHOUSE_ASYNC_INSERT_MAX_DATA_SIZE: z.coerce.number().int().default(10485760),

apps/webapp/app/v3/otlpExporter.server.ts

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ import {
2121
import type { MetricsV1Input } from "@internal/clickhouse";
2222
import { logger } from "~/services/logger.server";
2323
import { clickhouseClient } from "~/services/clickhouseInstance.server";
24+
import { DynamicFlushScheduler } from "./dynamicFlushScheduler.server";
2425
import { ClickhouseEventRepository } from "./eventRepository/clickhouseEventRepository.server";
2526
import {
2627
clickhouseEventRepository,
@@ -47,6 +48,7 @@ class OTLPExporter {
4748
private readonly _eventRepository: EventRepository,
4849
private readonly _clickhouseEventRepository: ClickhouseEventRepository,
4950
private readonly _clickhouseEventRepositoryV2: ClickhouseEventRepository,
51+
private readonly _metricsFlushScheduler: DynamicFlushScheduler<MetricsV1Input>,
5052
private readonly _verbose: boolean,
5153
private readonly _spanAttributeValueLengthLimit: number
5254
) {
@@ -87,7 +89,7 @@ class OTLPExporter {
8789
span.setAttribute("metric_row_count", rows.length);
8890

8991
if (rows.length > 0) {
90-
await clickhouseClient.metrics.insert(rows);
92+
this._metricsFlushScheduler.addToBatch(rows);
9193
}
9294

9395
return ExportMetricsServiceResponse.create();
@@ -490,7 +492,7 @@ function convertMetricsToClickhouseRows(
490492
if (metric.gauge) {
491493
for (const dp of metric.gauge.dataPoints) {
492494
const value: number =
493-
(dp.asDouble ?? 0) !== 0 ? dp.asDouble! : dp.asInt !== BigInt(0) ? Number(dp.asInt) : 0;
495+
dp.asDouble !== undefined ? dp.asDouble : dp.asInt !== undefined ? Number(dp.asInt) : 0;
494496
const resolved = resolveDataPointContext(dp.attributes ?? [], resourceCtx);
495497

496498
rows.push({
@@ -515,7 +517,7 @@ function convertMetricsToClickhouseRows(
515517
if (metric.sum) {
516518
for (const dp of metric.sum.dataPoints) {
517519
const value: number =
518-
(dp.asDouble ?? 0) !== 0 ? dp.asDouble! : dp.asInt !== BigInt(0) ? Number(dp.asInt) : 0;
520+
dp.asDouble !== undefined ? dp.asDouble : dp.asInt !== undefined ? Number(dp.asInt) : 0;
519521
const resolved = resolveDataPointContext(dp.attributes ?? [], resourceCtx);
520522

521523
rows.push({
@@ -1133,10 +1135,22 @@ function hasUnpairedSurrogateAtEnd(str: string): boolean {
11331135
export const otlpExporter = singleton("otlpExporter", initializeOTLPExporter);
11341136

11351137
function initializeOTLPExporter() {
1138+
const metricsFlushScheduler = new DynamicFlushScheduler<MetricsV1Input>({
1139+
batchSize: env.METRICS_CLICKHOUSE_BATCH_SIZE,
1140+
flushInterval: env.METRICS_CLICKHOUSE_FLUSH_INTERVAL_MS,
1141+
callback: async (_flushId, batch) => {
1142+
await clickhouseClient.metrics.insert(batch);
1143+
},
1144+
minConcurrency: 1,
1145+
maxConcurrency: env.METRICS_CLICKHOUSE_MAX_CONCURRENCY,
1146+
loadSheddingEnabled: false,
1147+
});
1148+
11361149
return new OTLPExporter(
11371150
eventRepository,
11381151
clickhouseEventRepository,
11391152
clickhouseEventRepositoryV2,
1153+
metricsFlushScheduler,
11401154
process.env.OTLP_EXPORTER_VERBOSE === "1",
11411155
process.env.SERVER_OTEL_SPAN_ATTRIBUTE_VALUE_LENGTH_LIMIT
11421156
? parseInt(process.env.SERVER_OTEL_SPAN_ATTRIBUTE_VALUE_LENGTH_LIMIT, 10)

apps/webapp/app/v3/querySchemas.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -557,7 +557,7 @@ export const metricsSchema: TableSchema = {
557557
},
558558
attempt_number: {
559559
name: "attempt_number",
560-
...column("String", {
560+
...column("UInt64", {
561561
description: "The attempt number for this metric",
562562
example: "1",
563563
}),

apps/webapp/app/v3/services/aiQueryService.server.ts

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -453,13 +453,13 @@ Only use explicit \`toStartOfHour\`/\`toStartOfDay\` etc. if the user specifical
453453
- Filter by metric name: WHERE metric_name = 'process.cpu.utilization'
454454
- Filter by run: WHERE run_id = 'run_abc123'
455455
- Filter by task: WHERE task_identifier = 'my-task'
456-
- Available metric names: process.cpu.utilization, process.cpu.time, process.memory.usage, system.memory.usage, system.memory.utilization, system.network.io, system.network.dropped, system.network.errors
456+
- Available metric names: process.cpu.utilization, process.cpu.time, process.memory.usage, system.memory.usage, system.memory.utilization, system.network.io, system.network.dropped, system.network.errors, nodejs.event_loop.utilization, nodejs.event_loop.delay.p50, nodejs.event_loop.delay.p99, nodejs.event_loop.delay.max, nodejs.heap.used, nodejs.heap.total
457457
- Use max_value or last_value for gauges (CPU utilization, memory usage), sum_value for counters (CPU time, network IO)
458458
- Use prettyFormat(expr, 'bytes') to tell the UI to format values as bytes (e.g., "1.50 GiB") — keeps values numeric for charts
459459
- Use prettyFormat(expr, 'percent') for percentage values
460460
- prettyFormat does NOT change the SQL — it only adds a display hint
461461
- Available format types: bytes, decimalBytes, percent, quantity, duration, durationSeconds, costInDollars
462-
- For memory metrics, always use prettyFormat with 'bytes'
462+
- For memory metrics (including nodejs.heap.*), always use prettyFormat with 'bytes'
463463
- For CPU utilization, consider prettyFormat with 'percent'
464464
465465
\`\`\`sql
@@ -588,9 +588,9 @@ LIMIT 1000
588588
589589
### Common Metrics Patterns
590590
- Filter by metric: WHERE metric_name = 'process.cpu.utilization'
591-
- Available metric names: process.cpu.utilization, process.cpu.time, process.memory.usage, system.memory.usage, system.memory.utilization, system.network.io, system.network.dropped, system.network.errors
591+
- Available metric names: process.cpu.utilization, process.cpu.time, process.memory.usage, system.memory.usage, system.memory.utilization, system.network.io, system.network.dropped, system.network.errors, nodejs.event_loop.utilization, nodejs.event_loop.delay.p50, nodejs.event_loop.delay.p99, nodejs.event_loop.delay.max, nodejs.heap.used, nodejs.heap.total
592592
- Use max_value or last_value for gauges (CPU utilization, memory usage), sum_value for counters (CPU time, network IO)
593-
- Use prettyFormat(expr, 'bytes') for memory metrics, prettyFormat(expr, 'percent') for CPU utilization
593+
- Use prettyFormat(expr, 'bytes') for memory metrics (including nodejs.heap.*), prettyFormat(expr, 'percent') for CPU utilization
594594
- prettyFormat does NOT change the SQL — it only adds a display hint for the UI
595595
596596
## Important Rules

internal-packages/clickhouse/schema/016_create_metrics_v1.sql renamed to internal-packages/clickhouse/schema/017_create_metrics_v1.sql

Lines changed: 14 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -3,16 +3,16 @@ CREATE TABLE IF NOT EXISTS trigger_dev.metrics_v1
33
(
44
organization_id LowCardinality(String),
55
project_id LowCardinality(String),
6-
environment_id String,
6+
environment_id String CODEC(ZSTD(1)),
77
metric_name LowCardinality(String),
88
metric_type LowCardinality(String),
9-
metric_subject String,
10-
bucket_start DateTime,
11-
count UInt64 DEFAULT 0,
12-
sum_value Float64 DEFAULT 0,
13-
max_value Float64 DEFAULT 0,
14-
min_value Float64 DEFAULT 0,
15-
last_value Float64 DEFAULT 0,
9+
metric_subject String CODEC(ZSTD(1)),
10+
bucket_start DateTime CODEC(Delta(4), ZSTD(1)),
11+
count UInt64 DEFAULT 0 CODEC(ZSTD(1)),
12+
sum_value Float64 DEFAULT 0 CODEC(ZSTD(1)),
13+
max_value Float64 DEFAULT 0 CODEC(ZSTD(1)),
14+
min_value Float64 DEFAULT 0 CODEC(ZSTD(1)),
15+
last_value Float64 DEFAULT 0 CODEC(ZSTD(1)),
1616
attributes JSON(
1717
`trigger.run_id` String,
1818
`trigger.task_slug` String,
@@ -29,12 +29,15 @@ CREATE TABLE IF NOT EXISTS trigger_dev.metrics_v1
2929
`process.cpu.state` LowCardinality(String),
3030
`network.io.direction` LowCardinality(String),
3131
max_dynamic_paths=8
32-
)
32+
),
33+
INDEX idx_run_id attributes.trigger.run_id TYPE bloom_filter(0.001) GRANULARITY 1,
34+
INDEX idx_task_slug attributes.trigger.task_slug TYPE bloom_filter(0.001) GRANULARITY 1
3335
)
3436
ENGINE = MergeTree()
35-
PARTITION BY toYYYYMM(bucket_start)
37+
PARTITION BY toDate(bucket_start)
3638
ORDER BY (organization_id, project_id, environment_id, metric_name, metric_subject, bucket_start)
37-
TTL bucket_start + INTERVAL 30 DAY;
39+
TTL bucket_start + INTERVAL 60 DAY
40+
SETTINGS ttl_only_drop_parts = 1;
3841

3942
-- +goose Down
4043
DROP TABLE IF EXISTS trigger_dev.metrics_v1;

packages/cli-v3/src/entryPoints/managed-run-worker.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -184,7 +184,7 @@ async function doBootstrap() {
184184
const tracingSDK = new TracingSDK({
185185
url: env.TRIGGER_OTEL_EXPORTER_OTLP_ENDPOINT ?? "http://0.0.0.0:4318",
186186
metricsUrl: env.TRIGGER_OTEL_METRICS_ENDPOINT,
187-
instrumentations: config.instrumentations ?? [],
187+
instrumentations: config.telemetry?.instrumentations ?? config.instrumentations ?? [],
188188
diagLogLevel: (env.TRIGGER_OTEL_LOG_LEVEL as TracingDiagnosticLogLevel) ?? "none",
189189
forceFlushTimeoutMillis: 30_000,
190190
exporters: config.telemetry?.exporters ?? [],

packages/core/src/v3/otel/tracingSDK.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -308,8 +308,8 @@ export class TracingSDK {
308308
const metricReaders: MetricReader[] = [
309309
new PeriodicExportingMetricReader({
310310
exporter: metricExporter,
311-
exportIntervalMillis: Math.max(collectionIntervalMs, exportTimeoutMillis),
312-
exportTimeoutMillis,
311+
exportIntervalMillis: collectionIntervalMs,
312+
exportTimeoutMillis: Math.min(exportTimeoutMillis, collectionIntervalMs),
313313
}),
314314
...(config.metricReaders ?? []),
315315
];

packages/core/src/v3/taskContext/otelProcessors.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -131,8 +131,8 @@ export class TaskContextMetricExporter implements PushMetricExporter {
131131

132132
export(metrics: ResourceMetrics, resultCallback: (result: ExportResult) => void): void {
133133
if (!taskContext.ctx) {
134-
// No context at all — drop metrics
135-
resultCallback({ code: ExportResultCode.SUCCESS });
134+
// No task context yet — pass through without adding context attributes
135+
this._innerExporter.export(metrics, resultCallback);
136136
return;
137137
}
138138

0 commit comments

Comments (0)