Skip to content

Commit 8778403

Browse files
committed
speed up deduplicating of runs by making it more efficient
1 parent 777db7c commit 8778403

File tree

3 files changed

+492
-42
lines changed

3 files changed

+492
-42
lines changed

apps/webapp/app/services/runsReplicationService.server.ts

Lines changed: 38 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ type TaskRunInsert = {
8282
export type RunsReplicationServiceEvents = {
8383
message: [{ lsn: string; message: PgoutputMessage; service: RunsReplicationService }];
8484
batchFlushed: [
85-
{ flushId: string; taskRunInserts: TaskRunInsertArray[]; payloadInserts: PayloadInsertArray[] },
85+
{ flushId: string; taskRunInserts: TaskRunInsertArray[]; payloadInserts: PayloadInsertArray[] }
8686
];
8787
};
8888

@@ -214,34 +214,14 @@ export class RunsReplicationService {
214214
flushInterval: options.flushIntervalMs ?? 100,
215215
maxConcurrency: options.maxFlushConcurrency ?? 100,
216216
callback: this.#flushBatch.bind(this),
217-
// we can do some pre-merging to reduce the amount of data we need to send to clickhouse
218-
mergeBatch: (existingBatch: TaskRunInsert[], newBatch: TaskRunInsert[]) => {
219-
const merged = new Map<string, TaskRunInsert>();
220-
221-
for (const item of existingBatch) {
222-
const key = `${item.event}_${item.run.id}`;
223-
merged.set(key, item);
224-
}
225-
226-
for (const item of newBatch) {
227-
if (!item?.run?.id) {
228-
this.logger.warn("Skipping replication event with null run", { event: item });
229-
continue;
230-
}
231-
232-
const key = `${item.event}_${item.run.id}`;
233-
const existingItem = merged.get(key);
234-
235-
// Keep the run with the higher version (latest)
236-
// and take the last occurrence for that version.
237-
// Items originating from the same DB transaction have the same version.
238-
if (!existingItem || item._version >= existingItem._version) {
239-
merged.set(key, item);
240-
}
241-
}
242-
243-
return Array.from(merged.values());
217+
// Key-based deduplication to reduce duplicates sent to ClickHouse
218+
getKey: (item) => {
219+
return `${item.event}_${item.run.id}`;
244220
},
221+
// Keep the run with the higher version (latest)
222+
// and take the last occurrence for that version.
223+
// Items originating from the same DB transaction have the same version.
224+
shouldReplace: (existing, incoming) => incoming._version >= existing._version,
245225
logger: new Logger("ConcurrentFlushScheduler", options.logLevel ?? "info"),
246226
tracer: options.tracer,
247227
});
@@ -504,7 +484,7 @@ export class RunsReplicationService {
504484
this._eventsProcessedCounter.add(1, { event_type: event.tag });
505485
}
506486

507-
this.logger.info("handle_transaction", {
487+
this.logger.debug("handle_transaction", {
508488
transaction: {
509489
xid: transaction.xid,
510490
commitLsn: transaction.commitLsn,
@@ -974,13 +954,16 @@ export type ConcurrentFlushSchedulerConfig<T> = {
974954
flushInterval: number;
975955
maxConcurrency?: number;
976956
callback: (flushId: string, batch: T[]) => Promise<void>;
977-
mergeBatch?: (existingBatch: T[], newBatch: T[]) => T[];
957+
/** Key-based deduplication. Return null to skip the item. */
958+
getKey: (item: T) => string | null;
959+
/** Determine if incoming item should replace existing. */
960+
shouldReplace: (existing: T, incoming: T) => boolean;
978961
tracer?: Tracer;
979962
logger?: Logger;
980963
};
981964

982965
export class ConcurrentFlushScheduler<T> {
983-
private currentBatch: T[];
966+
private batch = new Map<string, T>();
984967
private readonly BATCH_SIZE: number;
985968
private readonly flushInterval: number;
986969
private readonly MAX_CONCURRENCY: number;
@@ -995,7 +978,6 @@ export class ConcurrentFlushScheduler<T> {
995978
this.logger = config.logger ?? new Logger("ConcurrentFlushScheduler", "info");
996979
this._tracer = config.tracer ?? trace.getTracer("concurrent-flush-scheduler");
997980

998-
this.currentBatch = [];
999981
this.BATCH_SIZE = config.batchSize;
1000982
this.flushInterval = config.flushInterval;
1001983
this.MAX_CONCURRENCY = config.maxConcurrency || 1;
@@ -1005,9 +987,17 @@ export class ConcurrentFlushScheduler<T> {
1005987
}
1006988

1007989
addToBatch(items: T[]): void {
1008-
this.currentBatch = this.config.mergeBatch
1009-
? this.config.mergeBatch(this.currentBatch, items)
1010-
: this.currentBatch.concat(items);
990+
for (const item of items) {
991+
const key = this.config.getKey(item);
992+
if (key === null) {
993+
continue;
994+
}
995+
996+
const existing = this.batch.get(key);
997+
if (!existing || this.config.shouldReplace(existing, item)) {
998+
this.batch.set(key, item);
999+
}
1000+
}
10111001

10121002
this.#flushNextBatchIfNeeded();
10131003
}
@@ -1031,11 +1021,16 @@ export class ConcurrentFlushScheduler<T> {
10311021
this.#flushNextBatchIfNeeded();
10321022
}
10331023

1024+
#getBatchSize(): number {
1025+
return this.batch.size;
1026+
}
1027+
10341028
#flushNextBatchIfNeeded(): void {
1035-
if (this.currentBatch.length >= this.BATCH_SIZE || this._isShutDown) {
1029+
const currentSize = this.#getBatchSize();
1030+
if (currentSize >= this.BATCH_SIZE || this._isShutDown) {
10361031
this.logger.debug("Batch size threshold reached, initiating flush", {
10371032
batchSize: this.BATCH_SIZE,
1038-
currentSize: this.currentBatch.length,
1033+
currentSize,
10391034
isShutDown: this._isShutDown,
10401035
});
10411036

@@ -1060,19 +1055,20 @@ export class ConcurrentFlushScheduler<T> {
10601055
}
10611056

10621057
async #checkAndFlush(): Promise<void> {
1063-
if (this.currentBatch.length > 0) {
1058+
const currentSize = this.#getBatchSize();
1059+
if (currentSize > 0) {
10641060
this.logger.debug("Periodic flush check triggered", {
1065-
currentBatchSize: this.currentBatch.length,
1061+
currentBatchSize: currentSize,
10661062
});
10671063
await this.#flushNextBatch();
10681064
}
10691065
}
10701066

10711067
async #flushNextBatch(): Promise<void> {
1072-
if (this.currentBatch.length === 0) return;
1068+
if (this.batch.size === 0) return;
10731069

1074-
const batch = this.currentBatch;
1075-
this.currentBatch = [];
1070+
const batch = Array.from(this.batch.values());
1071+
this.batch.clear();
10761072

10771073
const callback = this.config.callback;
10781074

Lines changed: 161 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,161 @@
1+
import { ConcurrentFlushScheduler } from "~/services/runsReplicationService.server";
2+
3+
vi.setConfig({ testTimeout: 10_000 });
4+
5+
type TestItem = {
6+
id: string;
7+
event: "insert" | "update";
8+
version: number;
9+
};
10+
11+
describe("ConcurrentFlushScheduler", () => {
12+
it("should deduplicate items by key, keeping the latest version", async () => {
13+
const flushedBatches: TestItem[][] = [];
14+
15+
const scheduler = new ConcurrentFlushScheduler<TestItem>({
16+
batchSize: 100,
17+
flushInterval: 50,
18+
maxConcurrency: 1,
19+
callback: async (_flushId, batch) => {
20+
flushedBatches.push([...batch]);
21+
},
22+
getKey: (item) => `${item.event}_${item.id}`,
23+
shouldReplace: (existing, incoming) => incoming.version >= existing.version,
24+
});
25+
26+
scheduler.start();
27+
28+
// Add items with duplicate keys but different versions
29+
scheduler.addToBatch([
30+
{ id: "run_1", event: "insert", version: 1 },
31+
{ id: "run_1", event: "update", version: 2 },
32+
{ id: "run_2", event: "insert", version: 1 },
33+
]);
34+
35+
// Add more items - should merge with existing
36+
scheduler.addToBatch([
37+
{ id: "run_1", event: "insert", version: 3 }, // Higher version, should replace
38+
{ id: "run_1", event: "update", version: 1 }, // Lower version, should NOT replace
39+
{ id: "run_2", event: "update", version: 4 },
40+
]);
41+
42+
// Wait for flush
43+
await new Promise((resolve) => setTimeout(resolve, 100));
44+
45+
scheduler.shutdown();
46+
47+
// Should have flushed once with deduplicated items
48+
expect(flushedBatches.length).toBeGreaterThanOrEqual(1);
49+
50+
const allFlushed = flushedBatches.flat();
51+
52+
// Find items by their key
53+
const insertRun1 = allFlushed.find((i) => i.id === "run_1" && i.event === "insert");
54+
const updateRun1 = allFlushed.find((i) => i.id === "run_1" && i.event === "update");
55+
const insertRun2 = allFlushed.find((i) => i.id === "run_2" && i.event === "insert");
56+
const updateRun2 = allFlushed.find((i) => i.id === "run_2" && i.event === "update");
57+
58+
// Verify correct versions were kept
59+
expect(insertRun1?.version).toBe(3); // Latest version for insert_run_1
60+
expect(updateRun1?.version).toBe(2); // Original update_run_1 (v1 didn't replace v2)
61+
expect(insertRun2?.version).toBe(1); // Only version for insert_run_2
62+
expect(updateRun2?.version).toBe(4); // Only version for update_run_2
63+
});
64+
65+
it("should skip items where getKey returns null", async () => {
66+
const flushedBatches: TestItem[][] = [];
67+
68+
const scheduler = new ConcurrentFlushScheduler<TestItem>({
69+
batchSize: 100,
70+
flushInterval: 50,
71+
maxConcurrency: 1,
72+
callback: async (_flushId, batch) => {
73+
flushedBatches.push([...batch]);
74+
},
75+
getKey: (item) => {
76+
if (!item.id) {
77+
return null;
78+
}
79+
return `${item.event}_${item.id}`;
80+
},
81+
shouldReplace: (existing, incoming) => incoming.version >= existing.version,
82+
});
83+
84+
scheduler.start();
85+
86+
scheduler.addToBatch([
87+
{ id: "run_1", event: "insert", version: 1 },
88+
{ id: "", event: "insert", version: 2 }, // Should be skipped (null key)
89+
{ id: "run_2", event: "insert", version: 1 },
90+
]);
91+
92+
await new Promise((resolve) => setTimeout(resolve, 100));
93+
94+
scheduler.shutdown();
95+
96+
const allFlushed = flushedBatches.flat();
97+
expect(allFlushed).toHaveLength(2);
98+
expect(allFlushed.map((i) => i.id).sort()).toEqual(["run_1", "run_2"]);
99+
});
100+
101+
it("should flush when batch size threshold is reached", async () => {
102+
const flushedBatches: TestItem[][] = [];
103+
104+
const scheduler = new ConcurrentFlushScheduler<TestItem>({
105+
batchSize: 3,
106+
flushInterval: 10000, // Long interval so timer doesn't trigger
107+
maxConcurrency: 1,
108+
callback: async (_flushId, batch) => {
109+
flushedBatches.push([...batch]);
110+
},
111+
getKey: (item) => `${item.event}_${item.id}`,
112+
shouldReplace: (existing, incoming) => incoming.version >= existing.version,
113+
});
114+
115+
scheduler.start();
116+
117+
// Add 3 unique items - should trigger flush
118+
scheduler.addToBatch([
119+
{ id: "run_1", event: "insert", version: 1 },
120+
{ id: "run_2", event: "insert", version: 1 },
121+
{ id: "run_3", event: "insert", version: 1 },
122+
]);
123+
124+
await new Promise((resolve) => setTimeout(resolve, 50));
125+
126+
expect(flushedBatches.length).toBe(1);
127+
expect(flushedBatches[0]).toHaveLength(3);
128+
129+
scheduler.shutdown();
130+
});
131+
132+
it("should respect shouldReplace returning false", async () => {
133+
const flushedBatches: TestItem[][] = [];
134+
135+
const scheduler = new ConcurrentFlushScheduler<TestItem>({
136+
batchSize: 100,
137+
flushInterval: 50,
138+
maxConcurrency: 1,
139+
callback: async (_flushId, batch) => {
140+
flushedBatches.push([...batch]);
141+
},
142+
getKey: (item) => `${item.event}_${item.id}`,
143+
// Never replace - first item wins
144+
shouldReplace: () => false,
145+
});
146+
147+
scheduler.start();
148+
149+
scheduler.addToBatch([{ id: "run_1", event: "insert", version: 10 }]);
150+
151+
scheduler.addToBatch([{ id: "run_1", event: "insert", version: 999 }]);
152+
153+
await new Promise((resolve) => setTimeout(resolve, 100));
154+
155+
scheduler.shutdown();
156+
157+
const allFlushed = flushedBatches.flat();
158+
const insertRun1 = allFlushed.find((i) => i.id === "run_1" && i.event === "insert");
159+
expect(insertRun1?.version).toBe(10); // First one wins
160+
});
161+
});

0 commit comments

Comments
 (0)