Skip to content

Commit 689d531

Browse files
committed
test: simplify reasoning tests to two cases
1 parent f071a88 commit 689d531

File tree

1 file changed

+42
-185
lines changed

1 file changed

+42
-185
lines changed
Lines changed: 42 additions & 185 deletions
Original file line number | Diff line number | Diff line change
@@ -1,28 +1,15 @@
11
/**
2-
* Integration tests for reasoning/thinking functionality across Anthropic models
3-
*
4-
* These tests verify that:
5-
* 1. Opus 4.5 uses the `effort` parameter correctly
6-
* 2. Sonnet 4.5 uses the `thinking.budgetTokens` parameter correctly
7-
* 3. Both models can successfully stream responses with reasoning enabled
8-
*
9-
* This prevents regressions where the wrong parameter is used for a model.
2+
* Integration tests for reasoning/thinking functionality across Anthropic models.
3+
* Verifies Opus 4.5 uses `effort` and Sonnet 4.5 uses `thinking.budgetTokens`.
104
*/
115

126
import { shouldRunIntegrationTests, validateApiKeys } from "./setup";
13-
import {
14-
sendMessage,
15-
assertStreamSuccess,
16-
waitForStreamSuccess,
17-
configureTestRetries,
18-
} from "./helpers";
7+
import { sendMessage, assertStreamSuccess, waitForStreamSuccess } from "./helpers";
198
import { createSharedRepo, cleanupSharedRepo, withSharedWorkspace } from "./sendMessageTestHelpers";
209
import { KNOWN_MODELS } from "@/common/constants/knownModels";
2110

22-
// Skip all tests if TEST_INTEGRATION is not set
2311
const describeIntegration = shouldRunIntegrationTests() ? describe : describe.skip;
2412

25-
// Validate API keys before running tests
2613
if (shouldRunIntegrationTests()) {
2714
validateApiKeys(["ANTHROPIC_API_KEY"]);
2815
}
@@ -31,173 +18,43 @@ beforeAll(createSharedRepo);
3118
afterAll(cleanupSharedRepo);
3219

3320
describeIntegration("Anthropic reasoning parameter tests", () => {
34-
configureTestRetries(3);
35-
36-
describe("Sonnet 4.5 (thinking.budgetTokens)", () => {
37-
test.concurrent(
38-
"should successfully send message with low thinking level",
39-
async () => {
40-
await withSharedWorkspace("anthropic", async ({ env, workspaceId }) => {
41-
// Send a message with low thinking level
42-
// Sonnet 4.5 should use thinking.budgetTokens=4000
43-
const result = await sendMessage(
44-
env.mockIpcRenderer,
45-
workspaceId,
46-
"What is 2+2? Answer in one word.",
47-
{
48-
model: KNOWN_MODELS.SONNET.id,
49-
thinkingLevel: "low",
50-
}
51-
);
52-
53-
expect(result.success).toBe(true);
54-
55-
// Wait for stream to complete
56-
const collector = await waitForStreamSuccess(env.sentEvents, workspaceId, 30000);
57-
58-
// Verify we got a successful response
59-
assertStreamSuccess(collector);
60-
61-
// Verify we received deltas (actual response content)
62-
const deltas = collector.getDeltas();
63-
expect(deltas.length).toBeGreaterThan(0);
64-
65-
// Verify reasoning occurred (Sonnet 4.5 with thinking enabled should produce reasoning)
66-
const events = collector.getEvents();
67-
const hasReasoning = events.some((e) => "type" in e && e.type === "reasoning-delta");
68-
expect(hasReasoning).toBe(true);
69-
});
70-
},
71-
60000
72-
);
73-
74-
test.concurrent(
75-
"should successfully send message with medium thinking level",
76-
async () => {
77-
await withSharedWorkspace("anthropic", async ({ env, workspaceId }) => {
78-
// Send a message with medium thinking level
79-
// Sonnet 4.5 should use thinking.budgetTokens=10000
80-
const result = await sendMessage(
81-
env.mockIpcRenderer,
82-
workspaceId,
83-
"What is 3+3? Answer in one word.",
84-
{
85-
model: KNOWN_MODELS.SONNET.id,
86-
thinkingLevel: "medium",
87-
}
88-
);
89-
90-
expect(result.success).toBe(true);
91-
92-
// Wait for stream to complete
93-
const collector = await waitForStreamSuccess(env.sentEvents, workspaceId, 30000);
94-
95-
assertStreamSuccess(collector);
96-
97-
const deltas = collector.getDeltas();
98-
expect(deltas.length).toBeGreaterThan(0);
99-
});
100-
},
101-
60000
102-
);
103-
});
104-
105-
describe("Opus 4.5 (effort parameter)", () => {
106-
test.concurrent(
107-
"should successfully send message with low effort level",
108-
async () => {
109-
await withSharedWorkspace("anthropic", async ({ env, workspaceId }) => {
110-
// Send a message with low thinking level
111-
// Opus 4.5 should use effort="low" (NOT thinking.budgetTokens)
112-
const result = await sendMessage(
113-
env.mockIpcRenderer,
114-
workspaceId,
115-
"What is 4+4? Answer in one word.",
116-
{
117-
model: KNOWN_MODELS.OPUS.id,
118-
thinkingLevel: "low",
119-
}
120-
);
121-
122-
expect(result.success).toBe(true);
123-
124-
// Wait for stream to complete
125-
const collector = await waitForStreamSuccess(env.sentEvents, workspaceId, 60000);
126-
127-
// Verify we got a successful response
128-
assertStreamSuccess(collector);
129-
130-
// Verify we received deltas (actual response content)
131-
const deltas = collector.getDeltas();
132-
expect(deltas.length).toBeGreaterThan(0);
133-
});
134-
},
135-
90000 // Opus is slower, give more time
136-
);
137-
138-
test.concurrent(
139-
"should successfully send message with medium effort level",
140-
async () => {
141-
await withSharedWorkspace("anthropic", async ({ env, workspaceId }) => {
142-
// Send a message with medium thinking level
143-
// Opus 4.5 should use effort="medium"
144-
const result = await sendMessage(
145-
env.mockIpcRenderer,
146-
workspaceId,
147-
"What is 5+5? Answer in one word.",
148-
{
149-
model: KNOWN_MODELS.OPUS.id,
150-
thinkingLevel: "medium",
151-
}
152-
);
153-
154-
expect(result.success).toBe(true);
155-
156-
// Wait for stream to complete
157-
const collector = await waitForStreamSuccess(env.sentEvents, workspaceId, 60000);
158-
159-
assertStreamSuccess(collector);
160-
161-
const deltas = collector.getDeltas();
162-
expect(deltas.length).toBeGreaterThan(0);
163-
});
164-
},
165-
90000
166-
);
167-
168-
test.concurrent(
169-
"should successfully send message with thinking off",
170-
async () => {
171-
await withSharedWorkspace("anthropic", async ({ env, workspaceId }) => {
172-
// Send a message with thinking off
173-
// Opus 4.5 should NOT include effort parameter
174-
const result = await sendMessage(
175-
env.mockIpcRenderer,
176-
workspaceId,
177-
"What is 6+6? Answer in one word.",
178-
{
179-
model: KNOWN_MODELS.OPUS.id,
180-
thinkingLevel: "off",
181-
}
182-
);
183-
184-
expect(result.success).toBe(true);
185-
186-
// Wait for stream to complete
187-
const collector = await waitForStreamSuccess(env.sentEvents, workspaceId, 60000);
188-
189-
assertStreamSuccess(collector);
190-
191-
const deltas = collector.getDeltas();
192-
expect(deltas.length).toBeGreaterThan(0);
193-
194-
// With thinking off, we should NOT have reasoning events
195-
const events = collector.getEvents();
196-
const hasReasoning = events.some((e) => "type" in e && e.type === "reasoning-delta");
197-
expect(hasReasoning).toBe(false);
198-
});
199-
},
200-
90000
201-
);
202-
});
21+
test.concurrent(
22+
"Sonnet 4.5 with thinking (budgetTokens)",
23+
async () => {
24+
await withSharedWorkspace("anthropic", async ({ env, workspaceId }) => {
25+
const result = await sendMessage(
26+
env.mockIpcRenderer,
27+
workspaceId,
28+
"What is 2+2? Answer in one word.",
29+
{ model: KNOWN_MODELS.SONNET.id, thinkingLevel: "low" }
30+
);
31+
expect(result.success).toBe(true);
32+
33+
const collector = await waitForStreamSuccess(env.sentEvents, workspaceId, 30000);
34+
assertStreamSuccess(collector);
35+
expect(collector.getDeltas().length).toBeGreaterThan(0);
36+
});
37+
},
38+
60000
39+
);
40+
41+
test.concurrent(
42+
"Opus 4.5 with thinking (effort)",
43+
async () => {
44+
await withSharedWorkspace("anthropic", async ({ env, workspaceId }) => {
45+
const result = await sendMessage(
46+
env.mockIpcRenderer,
47+
workspaceId,
48+
"What is 4+4? Answer in one word.",
49+
{ model: KNOWN_MODELS.OPUS.id, thinkingLevel: "low" }
50+
);
51+
expect(result.success).toBe(true);
52+
53+
const collector = await waitForStreamSuccess(env.sentEvents, workspaceId, 60000);
54+
assertStreamSuccess(collector);
55+
expect(collector.getDeltas().length).toBeGreaterThan(0);
56+
});
57+
},
58+
90000
59+
);
20360
});

0 commit comments

Comments (0)