Skip to content

Commit e6eac4a

Browse files
committed
A bunch of improvements thanks to Claude's team of review agents
1 parent ed7e891 commit e6eac4a

File tree

8 files changed

+44
-24
lines changed

8 files changed

+44
-24
lines changed

apps/webapp/app/env.server.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1233,6 +1233,9 @@ const EnvironmentSchema = z
12331233
EVENTS_CLICKHOUSE_COMPRESSION_REQUEST: z.string().default("1"),
12341234
EVENTS_CLICKHOUSE_BATCH_SIZE: z.coerce.number().int().default(1000),
12351235
EVENTS_CLICKHOUSE_FLUSH_INTERVAL_MS: z.coerce.number().int().default(1000),
1236+
METRICS_CLICKHOUSE_BATCH_SIZE: z.coerce.number().int().default(10000),
1237+
METRICS_CLICKHOUSE_FLUSH_INTERVAL_MS: z.coerce.number().int().default(1000),
1238+
METRICS_CLICKHOUSE_MAX_CONCURRENCY: z.coerce.number().int().default(3),
12361239
EVENTS_CLICKHOUSE_INSERT_STRATEGY: z.enum(["insert", "insert_async"]).default("insert"),
12371240
EVENTS_CLICKHOUSE_WAIT_FOR_ASYNC_INSERT: z.string().default("1"),
12381241
EVENTS_CLICKHOUSE_ASYNC_INSERT_MAX_DATA_SIZE: z.coerce.number().int().default(10485760),

apps/webapp/app/v3/otlpExporter.server.ts

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ import {
2121
import type { MetricsV1Input } from "@internal/clickhouse";
2222
import { logger } from "~/services/logger.server";
2323
import { clickhouseClient } from "~/services/clickhouseInstance.server";
24+
import { DynamicFlushScheduler } from "./dynamicFlushScheduler.server";
2425
import { ClickhouseEventRepository } from "./eventRepository/clickhouseEventRepository.server";
2526
import {
2627
clickhouseEventRepository,
@@ -47,6 +48,7 @@ class OTLPExporter {
4748
private readonly _eventRepository: EventRepository,
4849
private readonly _clickhouseEventRepository: ClickhouseEventRepository,
4950
private readonly _clickhouseEventRepositoryV2: ClickhouseEventRepository,
51+
private readonly _metricsFlushScheduler: DynamicFlushScheduler<MetricsV1Input>,
5052
private readonly _verbose: boolean,
5153
private readonly _spanAttributeValueLengthLimit: number
5254
) {
@@ -87,7 +89,7 @@ class OTLPExporter {
8789
span.setAttribute("metric_row_count", rows.length);
8890

8991
if (rows.length > 0) {
90-
await clickhouseClient.metrics.insert(rows);
92+
this._metricsFlushScheduler.addToBatch(rows);
9193
}
9294

9395
return ExportMetricsServiceResponse.create();
@@ -490,7 +492,7 @@ function convertMetricsToClickhouseRows(
490492
if (metric.gauge) {
491493
for (const dp of metric.gauge.dataPoints) {
492494
const value: number =
493-
(dp.asDouble ?? 0) !== 0 ? dp.asDouble! : dp.asInt !== BigInt(0) ? Number(dp.asInt) : 0;
495+
dp.asDouble !== undefined ? dp.asDouble : dp.asInt !== undefined ? Number(dp.asInt) : 0;
494496
const resolved = resolveDataPointContext(dp.attributes ?? [], resourceCtx);
495497

496498
rows.push({
@@ -515,7 +517,7 @@ function convertMetricsToClickhouseRows(
515517
if (metric.sum) {
516518
for (const dp of metric.sum.dataPoints) {
517519
const value: number =
518-
(dp.asDouble ?? 0) !== 0 ? dp.asDouble! : dp.asInt !== BigInt(0) ? Number(dp.asInt) : 0;
520+
dp.asDouble !== undefined ? dp.asDouble : dp.asInt !== undefined ? Number(dp.asInt) : 0;
519521
const resolved = resolveDataPointContext(dp.attributes ?? [], resourceCtx);
520522

521523
rows.push({
@@ -1133,10 +1135,22 @@ function hasUnpairedSurrogateAtEnd(str: string): boolean {
11331135
export const otlpExporter = singleton("otlpExporter", initializeOTLPExporter);
11341136

11351137
function initializeOTLPExporter() {
1138+
const metricsFlushScheduler = new DynamicFlushScheduler<MetricsV1Input>({
1139+
batchSize: env.METRICS_CLICKHOUSE_BATCH_SIZE,
1140+
flushInterval: env.METRICS_CLICKHOUSE_FLUSH_INTERVAL_MS,
1141+
callback: async (_flushId, batch) => {
1142+
await clickhouseClient.metrics.insert(batch);
1143+
},
1144+
minConcurrency: 1,
1145+
maxConcurrency: env.METRICS_CLICKHOUSE_MAX_CONCURRENCY,
1146+
loadSheddingEnabled: false,
1147+
});
1148+
11361149
return new OTLPExporter(
11371150
eventRepository,
11381151
clickhouseEventRepository,
11391152
clickhouseEventRepositoryV2,
1153+
metricsFlushScheduler,
11401154
process.env.OTLP_EXPORTER_VERBOSE === "1",
11411155
process.env.SERVER_OTEL_SPAN_ATTRIBUTE_VALUE_LENGTH_LIMIT
11421156
? parseInt(process.env.SERVER_OTEL_SPAN_ATTRIBUTE_VALUE_LENGTH_LIMIT, 10)

apps/webapp/app/v3/querySchemas.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -557,7 +557,7 @@ export const metricsSchema: TableSchema = {
557557
},
558558
attempt_number: {
559559
name: "attempt_number",
560-
...column("String", {
560+
...column("UInt64", {
561561
description: "The attempt number for this metric",
562562
example: "1",
563563
}),

apps/webapp/app/v3/services/aiQueryService.server.ts

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -453,13 +453,13 @@ Only use explicit \`toStartOfHour\`/\`toStartOfDay\` etc. if the user specifical
453453
- Filter by metric name: WHERE metric_name = 'process.cpu.utilization'
454454
- Filter by run: WHERE run_id = 'run_abc123'
455455
- Filter by task: WHERE task_identifier = 'my-task'
456-
- Available metric names: process.cpu.utilization, process.cpu.time, process.memory.usage, system.memory.usage, system.memory.utilization, system.network.io, system.network.dropped, system.network.errors
456+
- Available metric names: process.cpu.utilization, process.cpu.time, process.memory.usage, system.memory.usage, system.memory.utilization, system.network.io, system.network.dropped, system.network.errors, nodejs.event_loop.utilization, nodejs.event_loop.delay.p50, nodejs.event_loop.delay.p99, nodejs.event_loop.delay.max, nodejs.heap.used, nodejs.heap.total
457457
- Use max_value or last_value for gauges (CPU utilization, memory usage), sum_value for counters (CPU time, network IO)
458458
- Use prettyFormat(expr, 'bytes') to tell the UI to format values as bytes (e.g., "1.50 GiB") — keeps values numeric for charts
459459
- Use prettyFormat(expr, 'percent') for percentage values
460460
- prettyFormat does NOT change the SQL — it only adds a display hint
461461
- Available format types: bytes, decimalBytes, percent, quantity, duration, durationSeconds, costInDollars
462-
- For memory metrics, always use prettyFormat with 'bytes'
462+
- For memory metrics (including nodejs.heap.*), always use prettyFormat with 'bytes'
463463
- For CPU utilization, consider prettyFormat with 'percent'
464464
465465
\`\`\`sql
@@ -588,9 +588,9 @@ LIMIT 1000
588588
589589
### Common Metrics Patterns
590590
- Filter by metric: WHERE metric_name = 'process.cpu.utilization'
591-
- Available metric names: process.cpu.utilization, process.cpu.time, process.memory.usage, system.memory.usage, system.memory.utilization, system.network.io, system.network.dropped, system.network.errors
591+
- Available metric names: process.cpu.utilization, process.cpu.time, process.memory.usage, system.memory.usage, system.memory.utilization, system.network.io, system.network.dropped, system.network.errors, nodejs.event_loop.utilization, nodejs.event_loop.delay.p50, nodejs.event_loop.delay.p99, nodejs.event_loop.delay.max, nodejs.heap.used, nodejs.heap.total
592592
- Use max_value or last_value for gauges (CPU utilization, memory usage), sum_value for counters (CPU time, network IO)
593-
- Use prettyFormat(expr, 'bytes') for memory metrics, prettyFormat(expr, 'percent') for CPU utilization
593+
- Use prettyFormat(expr, 'bytes') for memory metrics (including nodejs.heap.*), prettyFormat(expr, 'percent') for CPU utilization
594594
- prettyFormat does NOT change the SQL — it only adds a display hint for the UI
595595
596596
## Important Rules

internal-packages/clickhouse/schema/016_create_metrics_v1.sql renamed to internal-packages/clickhouse/schema/017_create_metrics_v1.sql

Lines changed: 14 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -3,16 +3,16 @@ CREATE TABLE IF NOT EXISTS trigger_dev.metrics_v1
33
(
44
organization_id LowCardinality(String),
55
project_id LowCardinality(String),
6-
environment_id String,
6+
environment_id String CODEC(ZSTD(1)),
77
metric_name LowCardinality(String),
88
metric_type LowCardinality(String),
9-
metric_subject String,
10-
bucket_start DateTime,
11-
count UInt64 DEFAULT 0,
12-
sum_value Float64 DEFAULT 0,
13-
max_value Float64 DEFAULT 0,
14-
min_value Float64 DEFAULT 0,
15-
last_value Float64 DEFAULT 0,
9+
metric_subject String CODEC(ZSTD(1)),
10+
bucket_start DateTime CODEC(Delta(4), ZSTD(1)),
11+
count UInt64 DEFAULT 0 CODEC(ZSTD(1)),
12+
sum_value Float64 DEFAULT 0 CODEC(ZSTD(1)),
13+
max_value Float64 DEFAULT 0 CODEC(ZSTD(1)),
14+
min_value Float64 DEFAULT 0 CODEC(ZSTD(1)),
15+
last_value Float64 DEFAULT 0 CODEC(ZSTD(1)),
1616
attributes JSON(
1717
`trigger.run_id` String,
1818
`trigger.task_slug` String,
@@ -29,12 +29,15 @@ CREATE TABLE IF NOT EXISTS trigger_dev.metrics_v1
2929
`process.cpu.state` LowCardinality(String),
3030
`network.io.direction` LowCardinality(String),
3131
max_dynamic_paths=8
32-
)
32+
),
33+
INDEX idx_run_id attributes.trigger.run_id TYPE bloom_filter(0.001) GRANULARITY 1,
34+
INDEX idx_task_slug attributes.trigger.task_slug TYPE bloom_filter(0.001) GRANULARITY 1
3335
)
3436
ENGINE = MergeTree()
35-
PARTITION BY toYYYYMM(bucket_start)
37+
PARTITION BY toDate(bucket_start)
3638
ORDER BY (organization_id, project_id, environment_id, metric_name, metric_subject, bucket_start)
37-
TTL bucket_start + INTERVAL 30 DAY;
39+
TTL bucket_start + INTERVAL 60 DAY
40+
SETTINGS ttl_only_drop_parts = 1;
3841

3942
-- +goose Down
4043
DROP TABLE IF EXISTS trigger_dev.metrics_v1;

packages/cli-v3/src/entryPoints/managed-run-worker.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -184,7 +184,7 @@ async function doBootstrap() {
184184
const tracingSDK = new TracingSDK({
185185
url: env.TRIGGER_OTEL_EXPORTER_OTLP_ENDPOINT ?? "http://0.0.0.0:4318",
186186
metricsUrl: env.TRIGGER_OTEL_METRICS_ENDPOINT,
187-
instrumentations: config.instrumentations ?? [],
187+
instrumentations: config.telemetry?.instrumentations ?? config.instrumentations ?? [],
188188
diagLogLevel: (env.TRIGGER_OTEL_LOG_LEVEL as TracingDiagnosticLogLevel) ?? "none",
189189
forceFlushTimeoutMillis: 30_000,
190190
exporters: config.telemetry?.exporters ?? [],

packages/core/src/v3/otel/tracingSDK.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -308,8 +308,8 @@ export class TracingSDK {
308308
const metricReaders: MetricReader[] = [
309309
new PeriodicExportingMetricReader({
310310
exporter: metricExporter,
311-
exportIntervalMillis: Math.max(collectionIntervalMs, exportTimeoutMillis),
312-
exportTimeoutMillis,
311+
exportIntervalMillis: collectionIntervalMs,
312+
exportTimeoutMillis: Math.min(exportTimeoutMillis, collectionIntervalMs),
313313
}),
314314
...(config.metricReaders ?? []),
315315
];

packages/core/src/v3/taskContext/otelProcessors.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -131,8 +131,8 @@ export class TaskContextMetricExporter implements PushMetricExporter {
131131

132132
export(metrics: ResourceMetrics, resultCallback: (result: ExportResult) => void): void {
133133
if (!taskContext.ctx) {
134-
// No context at all — drop metrics
135-
resultCallback({ code: ExportResultCode.SUCCESS });
134+
// No task context yet — pass through without adding context attributes
135+
this._innerExporter.export(metrics, resultCallback);
136136
return;
137137
}
138138

0 commit comments

Comments (0)