11/**
2- * Integration tests for reasoning/thinking functionality across Anthropic models
3- *
4- * These tests verify that:
5- * 1. Opus 4.5 uses the `effort` parameter correctly
6- * 2. Sonnet 4.5 uses the `thinking.budgetTokens` parameter correctly
7- * 3. Both models can successfully stream responses with reasoning enabled
8- *
9- * This prevents regressions where the wrong parameter is used for a model.
2+ * Integration tests for reasoning/thinking functionality across Anthropic models.
3+ * Smoke-tests that Sonnet 4.5 (`thinking.budgetTokens`) and Opus 4.5 (`effort`) can stream a response with thinking enabled; the request parameters themselves are not inspected.
104 */
115
126import { shouldRunIntegrationTests , validateApiKeys } from "./setup" ;
13- import {
14- sendMessage ,
15- assertStreamSuccess ,
16- waitForStreamSuccess ,
17- configureTestRetries ,
18- } from "./helpers" ;
7+ import { sendMessage , assertStreamSuccess , waitForStreamSuccess } from "./helpers" ;
198import { createSharedRepo , cleanupSharedRepo , withSharedWorkspace } from "./sendMessageTestHelpers" ;
209import { KNOWN_MODELS } from "@/common/constants/knownModels" ;
2110
22- // Skip all tests if TEST_INTEGRATION is not set
// Pick the suite registrar: the real `describe` when integration tests are
// enabled, `describe.skip` otherwise, so the suite is still listed but not run.
const describeIntegration = shouldRunIntegrationTests() ? describe : describe.skip;

// The API-key check only runs when the suite will actually run.
if (shouldRunIntegrationTests()) {
  validateApiKeys(["ANTHROPIC_API_KEY"]);
}

// File-level fixture: created before the tests, cleaned up after them.
beforeAll(createSharedRepo);
afterAll(cleanupSharedRepo);
3219
describeIntegration("Anthropic reasoning parameter tests", () => {
  /**
   * One row per model under test.
   *
   * - `title`: Jest test name. The parenthesised part names the request
   *   parameter the model is expected to use; that mapping is done outside
   *   this file and is only exercised indirectly here.
   * - `modelKey`: key into KNOWN_MODELS, resolved inside the test body so the
   *   lookup only happens when the test actually runs.
   * - `prompt`: message sent to the model.
   * - `streamWaitTimeout`: third argument to waitForStreamSuccess
   *   (presumably milliseconds; confirm against the helper).
   * - `testTimeout`: per-test Jest timeout in milliseconds.
   */
  const reasoningCases = [
    {
      title: "Sonnet 4.5 with thinking (budgetTokens)",
      modelKey: "SONNET",
      prompt: "What is 2+2? Answer in one word.",
      streamWaitTimeout: 30000,
      testTimeout: 60000,
    },
    {
      title: "Opus 4.5 with thinking (effort)",
      modelKey: "OPUS",
      prompt: "What is 4+4? Answer in one word.",
      streamWaitTimeout: 60000,
      testTimeout: 90000,
    },
  ];

  for (const { title, modelKey, prompt, streamWaitTimeout, testTimeout } of reasoningCases) {
    test.concurrent(
      title,
      async () => {
        await withSharedWorkspace("anthropic", async ({ env, workspaceId }) => {
          // Every case sends its prompt with the "low" thinking level.
          const sendOptions = { model: KNOWN_MODELS[modelKey].id, thinkingLevel: "low" };
          const sendResult = await sendMessage(
            env.mockIpcRenderer,
            workspaceId,
            prompt,
            sendOptions
          );
          expect(sendResult.success).toBe(true);

          // Wait for the stream, then require a successful, non-empty response.
          const streamCollector = await waitForStreamSuccess(
            env.sentEvents,
            workspaceId,
            streamWaitTimeout
          );
          assertStreamSuccess(streamCollector);
          const deltaCount = streamCollector.getDeltas().length;
          expect(deltaCount).toBeGreaterThan(0);
        });
      },
      testTimeout
    );
  }
});
0 commit comments