
Commit 6a5c43c

adapt tests
1 parent 120da10 commit 6a5c43c

File tree: 1 file changed (+93 -55 lines changed)

tests/integrations/anthropic/test_anthropic.py

Lines changed: 93 additions & 55 deletions
@@ -2265,66 +2265,107 @@ def test_cache_tokens_nonstreaming(sentry_init, capture_events):
     assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHE_WRITE] == 20
 
 
-def test_input_tokens_include_cached_nonstreaming(sentry_init, capture_events):
+def test_input_tokens_include_cache_write_nonstreaming(sentry_init, capture_events):
     """
-    Test that gen_ai.usage.input_tokens includes cached tokens.
+    Test that gen_ai.usage.input_tokens includes cache_write tokens (non-streaming).
 
-    Anthropic's usage.input_tokens excludes cached/cache_write tokens,
-    but gen_ai.usage.input_tokens should be the TOTAL input tokens
-    (including cached + cache_write) so that downstream cost calculations
-    don't produce negative values.
+    Reproduces a real Anthropic cache-write response. Anthropic's usage.input_tokens
+    only counts non-cached tokens, but gen_ai.usage.input_tokens should be the TOTAL
+    so downstream cost calculations don't produce negative values.
 
-    See: negative gen_ai.cost.input_tokens bug when cache_read > input_tokens.
+    Real Anthropic response (from E2E test):
+        Usage(input_tokens=19, output_tokens=14,
+              cache_creation_input_tokens=2846, cache_read_input_tokens=0)
     """
     sentry_init(integrations=[AnthropicIntegration()], traces_sample_rate=1.0)
     events = capture_events()
     client = Anthropic(api_key="z")
 
-    # Simulate Anthropic response where input_tokens=100 EXCLUDES cached tokens
-    # cache_read=80 and cache_write=20 are separate
-    # Total input tokens processed = 100 + 80 + 20 = 200
     client.messages._post = mock.Mock(
         return_value=Message(
             id="id",
-            model="claude-3-5-sonnet-20241022",
+            model="claude-sonnet-4-20250514",
             role="assistant",
-            content=[TextBlock(type="text", text="Response")],
+            content=[TextBlock(type="text", text="3 + 3 equals 6.")],
             type="message",
             usage=Usage(
-                input_tokens=100,
-                output_tokens=50,
-                cache_read_input_tokens=80,
-                cache_creation_input_tokens=20,
+                input_tokens=19,
+                output_tokens=14,
+                cache_read_input_tokens=0,
+                cache_creation_input_tokens=2846,
             ),
         )
     )
 
     with start_transaction(name="anthropic"):
         client.messages.create(
             max_tokens=1024,
-            messages=[{"role": "user", "content": "Hello"}],
-            model="claude-3-5-sonnet-20241022",
+            messages=[{"role": "user", "content": "What is 3+3?"}],
+            model="claude-sonnet-4-20250514",
         )
 
     (span,) = events[0]["spans"]
 
-    # input_tokens should be total: 100 (non-cached) + 80 (cache_read) + 20 (cache_write) = 200
-    assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 200
+    # input_tokens should be total: 19 (non-cached) + 2846 (cache_write) = 2865
+    assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 2865
+    assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 2879  # 2865 + 14
+    assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHED] == 0
+    assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHE_WRITE] == 2846
 
-    # total_tokens should include the full input count
-    assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 250  # 200 + 50
 
-    # Cache fields should still be reported correctly
-    assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHED] == 80
-    assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHE_WRITE] == 20
+def test_input_tokens_include_cache_read_nonstreaming(sentry_init, capture_events):
+    """
+    Test that gen_ai.usage.input_tokens includes cache_read tokens (non-streaming).
+
+    Reproduces a real Anthropic cache-hit response. This is the scenario that
+    caused negative gen_ai.cost.input_tokens: input_tokens=19 but cached=2846,
+    so the backend computed 19 - 2846 = -2827 "regular" tokens.
+
+    Real Anthropic response (from E2E test):
+        Usage(input_tokens=19, output_tokens=14,
+              cache_creation_input_tokens=0, cache_read_input_tokens=2846)
+    """
+    sentry_init(integrations=[AnthropicIntegration()], traces_sample_rate=1.0)
+    events = capture_events()
+    client = Anthropic(api_key="z")
+
+    client.messages._post = mock.Mock(
+        return_value=Message(
+            id="id",
+            model="claude-sonnet-4-20250514",
+            role="assistant",
+            content=[TextBlock(type="text", text="5 + 5 = 10.")],
+            type="message",
+            usage=Usage(
+                input_tokens=19,
+                output_tokens=14,
+                cache_read_input_tokens=2846,
+                cache_creation_input_tokens=0,
+            ),
+        )
+    )
 
+    with start_transaction(name="anthropic"):
+        client.messages.create(
+            max_tokens=1024,
+            messages=[{"role": "user", "content": "What is 5+5?"}],
+            model="claude-sonnet-4-20250514",
+        )
+
+    (span,) = events[0]["spans"]
 
-def test_input_tokens_include_cached_streaming(sentry_init, capture_events):
+    # input_tokens should be total: 19 (non-cached) + 2846 (cache_read) = 2865
+    assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 2865
+    assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 2879  # 2865 + 14
+    assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHED] == 2846
+    assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHE_WRITE] == 0
+
+
+def test_input_tokens_include_cache_read_streaming(sentry_init, capture_events):
     """
-    Test that gen_ai.usage.input_tokens includes cached tokens for streaming responses.
+    Test that gen_ai.usage.input_tokens includes cache_read tokens (streaming).
 
-    Same bug as non-streaming: Anthropic's input_tokens excludes cached tokens,
-    leading to negative cost calculations when cache_read > input_tokens.
+    Same cache-hit scenario as non-streaming, using realistic streaming events.
     """
     client = Anthropic(api_key="z")
     returned_stream = Stream(cast_to=None, response=None, client=client)
@@ -2333,22 +2374,22 @@ def test_input_tokens_include_cached_streaming(sentry_init, capture_events):
             type="message_start",
             message=Message(
                 id="id",
-                model="claude-3-5-sonnet-20241022",
+                model="claude-sonnet-4-20250514",
                 role="assistant",
                 content=[],
                 type="message",
                 usage=Usage(
-                    input_tokens=100,
+                    input_tokens=19,
                     output_tokens=0,
-                    cache_read_input_tokens=80,
-                    cache_creation_input_tokens=20,
+                    cache_read_input_tokens=2846,
+                    cache_creation_input_tokens=0,
                 ),
             ),
         ),
         MessageDeltaEvent(
             type="message_delta",
             delta=Delta(stop_reason="end_turn"),
-            usage=MessageDeltaUsage(output_tokens=50),
+            usage=MessageDeltaUsage(output_tokens=14),
         ),
     ]
 
@@ -2359,29 +2400,27 @@ def test_input_tokens_include_cached_streaming(sentry_init, capture_events):
     with start_transaction(name="anthropic"):
         for _ in client.messages.create(
             max_tokens=1024,
-            messages=[{"role": "user", "content": "Hello"}],
-            model="claude-3-5-sonnet-20241022",
+            messages=[{"role": "user", "content": "What is 5+5?"}],
+            model="claude-sonnet-4-20250514",
             stream=True,
         ):
             pass
 
     (span,) = events[0]["spans"]
 
-    # input_tokens should be total: 100 + 80 + 20 = 200
-    assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 200
-
-    # total_tokens should include the full input count
-    assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 250  # 200 + 50
-
-    # Cache fields should still be reported correctly
-    assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHED] == 80
-    assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHE_WRITE] == 20
+    # input_tokens should be total: 19 + 2846 = 2865
+    assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 2865
+    assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 2879  # 2865 + 14
+    assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHED] == 2846
+    assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHE_WRITE] == 0
 
 
 def test_input_tokens_unchanged_without_caching(sentry_init, capture_events):
     """
     Test that input_tokens is unchanged when there are no cached tokens.
-    Ensures the fix doesn't break the non-caching case.
+
+    Real Anthropic response (from E2E test, simple call without caching):
+        Usage(input_tokens=20, output_tokens=12)
     """
     sentry_init(integrations=[AnthropicIntegration()], traces_sample_rate=1.0)
     events = capture_events()
@@ -2390,29 +2429,28 @@ def test_input_tokens_unchanged_without_caching(sentry_init, capture_events):
     client.messages._post = mock.Mock(
         return_value=Message(
            id="id",
-            model="claude-3-5-sonnet-20241022",
+            model="claude-sonnet-4-20250514",
             role="assistant",
-            content=[TextBlock(type="text", text="Response")],
+            content=[TextBlock(type="text", text="2+2 equals 4.")],
             type="message",
             usage=Usage(
-                input_tokens=100,
-                output_tokens=50,
+                input_tokens=20,
+                output_tokens=12,
             ),
         )
     )
 
     with start_transaction(name="anthropic"):
         client.messages.create(
             max_tokens=1024,
-            messages=[{"role": "user", "content": "Hello"}],
-            model="claude-3-5-sonnet-20241022",
+            messages=[{"role": "user", "content": "What is 2+2?"}],
+            model="claude-sonnet-4-20250514",
         )
 
     (span,) = events[0]["spans"]
 
-    # Without caching, input_tokens should remain as-is
-    assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 100
-    assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 150  # 100 + 50
+    assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 20
+    assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 32  # 20 + 12
 
 
 def test_cache_tokens_streaming(sentry_init, capture_events):
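
For reference, the invariant these tests pin down can be expressed as a small sketch. The helper below is hypothetical, not sentry-sdk's actual implementation; it only illustrates the normalization the asserts expect: gen_ai.usage.input_tokens must count cache reads and cache writes on top of Anthropic's non-cached usage.input_tokens.

    # Hypothetical sketch of the normalization under test; not the SDK's code.
    def total_input_tokens(usage):
        """Total input tokens: non-cached + cache_read + cache_write."""
        # Anthropic's usage.input_tokens counts only non-cached tokens;
        # cache reads and writes are reported in separate fields.
        return (
            (getattr(usage, "input_tokens", 0) or 0)
            + (getattr(usage, "cache_read_input_tokens", 0) or 0)
            + (getattr(usage, "cache_creation_input_tokens", 0) or 0)
        )

    # Cache-hit case from the tests above: 19 + 2846 + 0 == 2865 input tokens,
    # and 2865 + 14 output == 2879 total. Reporting the raw input_tokens=19
    # instead lets a cost backend compute 19 - 2846 = -2827 "regular" tokens.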

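The "real Anthropic response (from E2E test)" numbers in the docstrings come from prompt caching. A minimal sketch of how such a scenario can be produced against the live API, assuming a system prompt long enough to be cacheable (prompt caching has a minimum cacheable length); the first call should write the cache, and an identical follow-up call within the cache TTL should read it:

    from anthropic import Anthropic

    client = Anthropic()  # reads ANTHROPIC_API_KEY from the environment
    long_context = "Background material. " * 1000  # placeholder long prompt

    def ask(question):
        # Marking the system block with cache_control makes it cacheable:
        # the first call reports cache_creation_input_tokens (cache write),
        # a repeat call reports cache_read_input_tokens (cache hit).
        return client.messages.create(
            model="claude-sonnet-4-20250514",
            max_tokens=1024,
            system=[
                {
                    "type": "text",
                    "text": long_context,
                    "cache_control": {"type": "ephemeral"},
                }
            ],
            messages=[{"role": "user", "content": question}],
        )

    first = ask("What is 3+3?")   # expect cache_creation_input_tokens > 0
    second = ask("What is 5+5?")  # expect cache_read_input_tokens > 0
    print(first.usage, second.usage)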