diff --git a/.release-please-manifest.json b/.release-please-manifest.json
index ed9acd29..1ae25264 100644
--- a/.release-please-manifest.json
+++ b/.release-please-manifest.json
@@ -1,3 +1,3 @@
{
- ".": "0.2.23-alpha.1"
+ ".": "0.3.0-alpha.1"
}
diff --git a/.stats.yml b/.stats.yml
index fa9edfc7..755df453 100644
--- a/.stats.yml
+++ b/.stats.yml
@@ -1,4 +1,4 @@
-configured_endpoints: 111
-openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/llamastack%2Fllama-stack-client-f252873ea1e1f38fd207331ef2621c511154d5be3f4076e59cc15754fc58eee4.yml
-openapi_spec_hash: 10cbb4337a06a9fdd7d08612dd6044c3
-config_hash: 0358112cc0f3d880b4d55debdbe1cfa3
+configured_endpoints: 105
+openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/llamastack%2Fllama-stack-client-d7bea816190382a93511491e33d1f37f707620926ab133ae8ce0883d763df741.yml
+openapi_spec_hash: f73b3af77108625edae3f25972b9e665
+config_hash: 548f336ac1b68ab1dfe385b79df764dd
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0011c19f..93d68692 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,36 @@
# Changelog
+## 0.3.0-alpha.1 (2025-09-30)
+
+Full Changelog: [v0.2.23-alpha.1...v0.3.0-alpha.1](https://github.com/llamastack/llama-stack-client-python/compare/v0.2.23-alpha.1...v0.3.0-alpha.1)
+
+### ⚠ BREAKING CHANGES
+
+* **api:** fixes to remove deprecated inference resources
+
+### Features
+
+* **api:** expires_after changes for /files ([7f24c43](https://github.com/llamastack/llama-stack-client-python/commit/7f24c432dc1859312710a4a1ff4a80f6f861bee8))
+* **api:** fixes to remove deprecated inference resources ([04834d2](https://github.com/llamastack/llama-stack-client-python/commit/04834d2189ae4e4b8cd2c9370d1d39857bc6e9ec))
+* **api:** removing openai/v1 ([a918b43](https://github.com/llamastack/llama-stack-client-python/commit/a918b4323118c18f77c2abe7e1a3054c1eebeaac))
+* **api:** updating post /v1/files to have correct multipart/form-data ([433a996](https://github.com/llamastack/llama-stack-client-python/commit/433a996527bcca131ada4730376d8993f34ad6f5))
+
+
+### Bug Fixes
+
+* clean up deprecated code ([f10ead0](https://github.com/llamastack/llama-stack-client-python/commit/f10ead00522b7ca803cd7dc3617da0d451efa7da))
+* Don't retry for non-recoverable server http errors ([#212](https://github.com/llamastack/llama-stack-client-python/issues/212)) ([6782e8f](https://github.com/llamastack/llama-stack-client-python/commit/6782e8fc5931369223ed4446f8e7732f62712eff))
+
+
+### Documentation
+
+* update examples ([f896747](https://github.com/llamastack/llama-stack-client-python/commit/f89674726f55915a8cda0e2b4284be3c92978121))
+
+
+### Build System
+
+* Bump version to 0.2.23 ([0d4dc64](https://github.com/llamastack/llama-stack-client-python/commit/0d4dc6449224fa2a0f6d20f6229dd9d1a5427861))
+
## 0.2.23-alpha.1 (2025-09-26)
Full Changelog: [v0.2.19-alpha.1...v0.2.23-alpha.1](https://github.com/llamastack/llama-stack-client-python/compare/v0.2.19-alpha.1...v0.2.23-alpha.1)
diff --git a/README.md b/README.md
index 928458d2..c8cebcc3 100644
--- a/README.md
+++ b/README.md
@@ -109,6 +109,50 @@ asyncio.run(main())
Functionality between the synchronous and asynchronous clients is otherwise identical.
+## Streaming responses
+
+We provide support for streaming responses using Server-Sent Events (SSE).
+
+```python
+from llama_stack_client import LlamaStackClient
+
+client = LlamaStackClient()
+
+stream = client.chat.completions.create(
+ messages=[
+ {
+ "content": "string",
+ "role": "user",
+ }
+ ],
+ model="model",
+ stream=True,
+)
+for completion in stream:
+ print(completion)
+```
+
+The async client uses the exact same interface.
+
+```python
+from llama_stack_client import AsyncLlamaStackClient
+
+client = AsyncLlamaStackClient()
+
+stream = await client.chat.completions.create(
+ messages=[
+ {
+ "content": "string",
+ "role": "user",
+ }
+ ],
+ model="model",
+ stream=True,
+)
+async for completion in stream:
+ print(completion)
+```
+
## Using types
Nested request parameters are [TypedDicts](https://docs.python.org/3/library/typing.html#typing.TypedDict). Responses are [Pydantic models](https://docs.pydantic.dev) which also provide helper methods for things like:
@@ -118,6 +162,40 @@ Nested request parameters are [TypedDicts](https://docs.python.org/3/library/typ
Typed requests and responses provide autocomplete and documentation within your editor. If you would like to see type errors in VS Code to help catch bugs earlier, set `python.analysis.typeCheckingMode` to `basic`.
+## Nested params
+
+Nested parameters are dictionaries, typed using `TypedDict`, for example:
+
+```python
+from llama_stack_client import LlamaStackClient
+
+client = LlamaStackClient()
+
+client.toolgroups.register(
+ provider_id="provider_id",
+ toolgroup_id="toolgroup_id",
+ mcp_endpoint={"uri": "uri"},
+)
+```
+
+## File uploads
+
+Request parameters that correspond to file uploads can be passed as `bytes`, a [`PathLike`](https://docs.python.org/3/library/os.html#os.PathLike) instance, or a tuple of `(filename, contents, media type)`.
+
+```python
+from pathlib import Path
+from llama_stack_client import LlamaStackClient
+
+client = LlamaStackClient()
+
+client.files.create(
+ file=Path("/path/to/file"),
+ purpose="assistants",
+)
+```
+
+The async client uses the exact same interface. If you pass a [`PathLike`](https://docs.python.org/3/library/os.html#os.PathLike) instance, the file contents will automatically be read asynchronously.
+
## Handling errors
When the library is unable to connect to the API (for example, due to network connection problems or a timeout), a subclass of `llama_stack_client.APIConnectionError` is raised.
@@ -134,9 +212,14 @@ from llama_stack_client import LlamaStackClient
client = LlamaStackClient()
try:
- client.agents.sessions.create(
- agent_id="agent_id",
- session_name="session_name",
+ client.chat.completions.create(
+ messages=[
+ {
+ "content": "string",
+ "role": "user",
+ }
+ ],
+ model="model",
)
except llama_stack_client.APIConnectionError as e:
print("The server could not be reached")
@@ -180,9 +263,14 @@ client = LlamaStackClient(
)
# Or, configure per-request:
-client.with_options(max_retries=5).agents.sessions.create(
- agent_id="agent_id",
- session_name="session_name",
+client.with_options(max_retries=5).chat.completions.create(
+ messages=[
+ {
+ "content": "string",
+ "role": "user",
+ }
+ ],
+ model="model",
)
```
@@ -206,9 +294,14 @@ client = LlamaStackClient(
)
# Override per-request:
-client.with_options(timeout=5.0).agents.sessions.create(
- agent_id="agent_id",
- session_name="session_name",
+client.with_options(timeout=5.0).chat.completions.create(
+ messages=[
+ {
+ "content": "string",
+ "role": "user",
+ }
+ ],
+ model="model",
)
```
@@ -248,14 +341,17 @@ The "raw" Response object can be accessed by prefixing `.with_raw_response.` to
from llama_stack_client import LlamaStackClient
client = LlamaStackClient()
-response = client.agents.sessions.with_raw_response.create(
- agent_id="agent_id",
- session_name="session_name",
+response = client.chat.completions.with_raw_response.create(
+ messages=[{
+ "content": "string",
+ "role": "user",
+ }],
+ model="model",
)
print(response.headers.get('X-My-Header'))
-session = response.parse() # get the object that `agents.sessions.create()` would have returned
-print(session.session_id)
+completion = response.parse() # get the object that `chat.completions.create()` would have returned
+print(completion)
```
These methods return an [`APIResponse`](https://github.com/meta-llama/llama-stack-python/tree/main/src/llama_stack_client/_response.py) object.
@@ -269,9 +365,14 @@ The above interface eagerly reads the full response body when you make the reque
To stream the response body, use `.with_streaming_response` instead, which requires a context manager and only reads the response body once you call `.read()`, `.text()`, `.json()`, `.iter_bytes()`, `.iter_text()`, `.iter_lines()` or `.parse()`. In the async client, these are async methods.
```python
-with client.agents.sessions.with_streaming_response.create(
- agent_id="agent_id",
- session_name="session_name",
+with client.chat.completions.with_streaming_response.create(
+ messages=[
+ {
+ "content": "string",
+ "role": "user",
+ }
+ ],
+ model="model",
) as response:
print(response.headers.get("X-My-Header"))
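The `.with_streaming_response` example above is truncated by the diff context; once inside the context manager, the body can be consumed with the helpers the README lists (`.read()`, `.iter_lines()`, and so on). A minimal sketch of reading the body line by line, with a placeholder model name:

```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient()

# Defer reading the response body until we explicitly consume it.
with client.chat.completions.with_streaming_response.create(
    messages=[{"content": "string", "role": "user"}],
    model="model",
) as response:
    print(response.headers.get("X-My-Header"))

    # .iter_lines() is one of the body-reading helpers described in the README.
    for line in response.iter_lines():
        print(line)
```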
diff --git a/api.md b/api.md
index 22c2120f..c246f4c1 100644
--- a/api.md
+++ b/api.md
@@ -3,10 +3,8 @@
```python
from llama_stack_client.types import (
AgentConfig,
- BatchCompletion,
ChatCompletionResponse,
CompletionMessage,
- ContentDelta,
Document,
InterleavedContent,
InterleavedContentItem,
@@ -14,7 +12,6 @@ from llama_stack_client.types import (
Metric,
ParamType,
QueryConfig,
- QueryGeneratorConfig,
QueryResult,
ResponseFormat,
SafetyViolation,
@@ -91,10 +88,10 @@ from llama_stack_client.types import (
Methods:
-- client.responses.create(\*\*params) -> ResponseObject
-- client.responses.retrieve(response_id) -> ResponseObject
-- client.responses.list(\*\*params) -> SyncOpenAICursorPage[ResponseListResponse]
-- client.responses.delete(response_id) -> ResponseDeleteResponse
+- client.responses.create(\*\*params) -> ResponseObject
+- client.responses.retrieve(response_id) -> ResponseObject
+- client.responses.list(\*\*params) -> SyncOpenAICursorPage[ResponseListResponse]
+- client.responses.delete(response_id) -> ResponseDeleteResponse
## InputItems
@@ -106,7 +103,7 @@ from llama_stack_client.types.responses import InputItemListResponse
Methods:
-- client.responses.input_items.list(response_id, \*\*params) -> InputItemListResponse
+- client.responses.input_items.list(response_id, \*\*params) -> InputItemListResponse
# Agents
@@ -164,12 +161,7 @@ Methods:
Types:
```python
-from llama_stack_client.types.agents import (
- AgentTurnResponseStreamChunk,
- Turn,
- TurnResponseEvent,
- TurnResponseEventPayload,
-)
+from llama_stack_client.types.agents import AgentTurnResponseStreamChunk, Turn, TurnResponseEvent
```
Methods:
@@ -206,7 +198,7 @@ Methods:
Types:
```python
-from llama_stack_client.types import BenchmarkConfig, EvalCandidate, EvaluateResponse, Job
+from llama_stack_client.types import BenchmarkConfig, EvaluateResponse, Job
```
Methods:
@@ -242,24 +234,12 @@ Methods:
Types:
```python
-from llama_stack_client.types import (
- ChatCompletionResponseStreamChunk,
- CompletionResponse,
- EmbeddingsResponse,
- TokenLogProbs,
- InferenceBatchChatCompletionResponse,
- InferenceRerankResponse,
-)
+from llama_stack_client.types import InferenceRerankResponse
```
Methods:
-- client.inference.batch_chat_completion(\*\*params) -> InferenceBatchChatCompletionResponse
-- client.inference.batch_completion(\*\*params) -> BatchCompletion
-- client.inference.chat_completion(\*\*params) -> ChatCompletionResponse
-- client.inference.completion(\*\*params) -> CompletionResponse
-- client.inference.embeddings(\*\*params) -> EmbeddingsResponse
-- client.inference.rerank(\*\*params) -> InferenceRerankResponse
+- client.inference.rerank(\*\*params) -> InferenceRerankResponse
# Embeddings
@@ -271,7 +251,7 @@ from llama_stack_client.types import CreateEmbeddingsResponse
Methods:
-- client.embeddings.create(\*\*params) -> CreateEmbeddingsResponse
+- client.embeddings.create(\*\*params) -> CreateEmbeddingsResponse
# Chat
@@ -295,9 +275,9 @@ from llama_stack_client.types.chat import (
Methods:
-- client.chat.completions.create(\*\*params) -> CompletionCreateResponse
-- client.chat.completions.retrieve(completion_id) -> CompletionRetrieveResponse
-- client.chat.completions.list(\*\*params) -> SyncOpenAICursorPage[CompletionListResponse]
+- client.chat.completions.create(\*\*params) -> CompletionCreateResponse
+- client.chat.completions.retrieve(completion_id) -> CompletionRetrieveResponse
+- client.chat.completions.list(\*\*params) -> SyncOpenAICursorPage[CompletionListResponse]
# Completions
@@ -309,7 +289,7 @@ from llama_stack_client.types import CompletionCreateResponse
Methods:
-- client.completions.create(\*\*params) -> CompletionCreateResponse
+- client.completions.create(\*\*params) -> CompletionCreateResponse
# VectorIo
@@ -359,12 +339,12 @@ from llama_stack_client.types import (
Methods:
-- client.vector_stores.create(\*\*params) -> VectorStore
-- client.vector_stores.retrieve(vector_store_id) -> VectorStore
-- client.vector_stores.update(vector_store_id, \*\*params) -> VectorStore
-- client.vector_stores.list(\*\*params) -> SyncOpenAICursorPage[VectorStore]
-- client.vector_stores.delete(vector_store_id) -> VectorStoreDeleteResponse
-- client.vector_stores.search(vector_store_id, \*\*params) -> VectorStoreSearchResponse
+- client.vector_stores.create(\*\*params) -> VectorStore
+- client.vector_stores.retrieve(vector_store_id) -> VectorStore
+- client.vector_stores.update(vector_store_id, \*\*params) -> VectorStore
+- client.vector_stores.list(\*\*params) -> SyncOpenAICursorPage[VectorStore]
+- client.vector_stores.delete(vector_store_id) -> VectorStoreDeleteResponse
+- client.vector_stores.search(vector_store_id, \*\*params) -> VectorStoreSearchResponse
## Files
@@ -380,12 +360,12 @@ from llama_stack_client.types.vector_stores import (
Methods:
-- client.vector_stores.files.create(vector_store_id, \*\*params) -> VectorStoreFile
-- client.vector_stores.files.retrieve(file_id, \*, vector_store_id) -> VectorStoreFile
-- client.vector_stores.files.update(file_id, \*, vector_store_id, \*\*params) -> VectorStoreFile
-- client.vector_stores.files.list(vector_store_id, \*\*params) -> SyncOpenAICursorPage[VectorStoreFile]
-- client.vector_stores.files.delete(file_id, \*, vector_store_id) -> FileDeleteResponse
-- client.vector_stores.files.content(file_id, \*, vector_store_id) -> FileContentResponse
+- client.vector_stores.files.create(vector_store_id, \*\*params) -> VectorStoreFile
+- client.vector_stores.files.retrieve(file_id, \*, vector_store_id) -> VectorStoreFile
+- client.vector_stores.files.update(file_id, \*, vector_store_id, \*\*params) -> VectorStoreFile
+- client.vector_stores.files.list(vector_store_id, \*\*params) -> SyncOpenAICursorPage[VectorStoreFile]
+- client.vector_stores.files.delete(file_id, \*, vector_store_id) -> FileDeleteResponse
+- client.vector_stores.files.content(file_id, \*, vector_store_id) -> FileContentResponse
# Models
@@ -412,7 +392,7 @@ from llama_stack_client.types.models import OpenAIListResponse
Methods:
-- client.models.openai.list() -> OpenAIListResponse
+- client.models.openai.list() -> ModelListResponse
# PostTraining
@@ -481,7 +461,7 @@ from llama_stack_client.types import CreateResponse
Methods:
-- client.moderations.create(\*\*params) -> CreateResponse
+- client.moderations.create(\*\*params) -> CreateResponse
# Safety
@@ -608,8 +588,8 @@ from llama_stack_client.types import DeleteFileResponse, File, ListFilesResponse
Methods:
-- client.files.create(\*\*params) -> File
-- client.files.retrieve(file_id) -> File
-- client.files.list(\*\*params) -> SyncOpenAICursorPage[File]
-- client.files.delete(file_id) -> DeleteFileResponse
-- client.files.content(file_id) -> object
+- client.files.create(\*\*params) -> File
+- client.files.retrieve(file_id) -> File
+- client.files.list(\*\*params) -> SyncOpenAICursorPage[File]
+- client.files.delete(file_id) -> DeleteFileResponse
+- client.files.content(file_id) -> object
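Of the inference methods previously listed here, only `client.inference.rerank` remains. A hedged usage sketch, with parameter names taken from the `rerank()` signature retained in `resources/inference.py` (the model identifier and document strings are placeholders):

```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient()

# Rerank a small list of documents against a query. Each item may be a plain
# string, per the rerank() docstring in resources/inference.py.
results = client.inference.rerank(
    model="reranker-model",  # placeholder model identifier
    query="What is the capital of France?",
    items=[
        "Paris is the capital of France.",
        "Berlin is the capital of Germany.",
    ],
    max_num_results=2,  # optional; by default all items are returned
)
print(results)
```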
diff --git a/pyproject.toml b/pyproject.toml
index 843dd9b7..3b50518e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
[project]
name = "llama_stack_client"
-version = "0.2.23"
+version = "0.3.0-alpha.1"
description = "The official Python library for the llama-stack-client API"
dynamic = ["readme"]
license = "MIT"
diff --git a/src/llama_stack_client/lib/inference/event_logger.py b/src/llama_stack_client/lib/inference/event_logger.py
index 14b46372..cbf5f680 100644
--- a/src/llama_stack_client/lib/inference/event_logger.py
+++ b/src/llama_stack_client/lib/inference/event_logger.py
@@ -5,7 +5,7 @@
# the root directory of this source tree.
from typing import Generator
from termcolor import cprint
-from llama_stack_client.types import ChatCompletionResponseStreamChunk, ChatCompletionChunk
+from llama_stack_client.types import ChatCompletionChunk
class InferenceStreamPrintableEvent:
@@ -28,35 +28,11 @@ def __init__(self):
self.is_thinking = False
def yield_printable_events(
- self, chunk: ChatCompletionResponseStreamChunk | ChatCompletionChunk
+ self, chunk: ChatCompletionChunk
) -> Generator[InferenceStreamPrintableEvent, None, None]:
- # Check if the chunk has event attribute (ChatCompletionResponseStreamChunk)
- if hasattr(chunk, "event"):
- yield from self._handle_inference_stream_chunk(chunk)
- # Check if the chunk has choices attribute (ChatCompletionChunk)
- elif hasattr(chunk, "choices") and len(chunk.choices) > 0:
+ if hasattr(chunk, "choices") and len(chunk.choices) > 0:
yield from self._handle_chat_completion_chunk(chunk)
- def _handle_inference_stream_chunk(
- self, chunk: ChatCompletionResponseStreamChunk
- ) -> Generator[InferenceStreamPrintableEvent, None, None]:
- event = chunk.event
- if event.event_type == "start":
- yield InferenceStreamPrintableEvent("Assistant> ", color="cyan", end="")
- elif event.event_type == "progress":
- if event.delta.type == "reasoning":
- if not self.is_thinking:
- yield InferenceStreamPrintableEvent(" ", color="magenta", end="")
- self.is_thinking = True
- yield InferenceStreamPrintableEvent(event.delta.reasoning, color="magenta", end="")
- else:
- if self.is_thinking:
- yield InferenceStreamPrintableEvent("", color="magenta", end="")
- self.is_thinking = False
- yield InferenceStreamPrintableEvent(event.delta.text, color="yellow", end="")
- elif event.event_type == "complete":
- yield InferenceStreamPrintableEvent("")
-
def _handle_chat_completion_chunk(
self, chunk: ChatCompletionChunk
) -> Generator[InferenceStreamPrintableEvent, None, None]:
diff --git a/src/llama_stack_client/resources/chat/completions.py b/src/llama_stack_client/resources/chat/completions.py
index 5445a2d1..2fb19980 100644
--- a/src/llama_stack_client/resources/chat/completions.py
+++ b/src/llama_stack_client/resources/chat/completions.py
@@ -372,7 +372,7 @@ def create(
timeout: float | httpx.Timeout | None | NotGiven = not_given,
) -> CompletionCreateResponse | Stream[ChatCompletionChunk]:
return self._post(
- "/v1/openai/v1/chat/completions",
+ "/v1/chat/completions",
body=maybe_transform(
{
"messages": messages,
@@ -439,7 +439,7 @@ def retrieve(
if not completion_id:
raise ValueError(f"Expected a non-empty value for `completion_id` but received {completion_id!r}")
return self._get(
- f"/v1/openai/v1/chat/completions/{completion_id}",
+ f"/v1/chat/completions/{completion_id}",
options=make_request_options(
extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
),
@@ -481,7 +481,7 @@ def list(
timeout: Override the client-level default timeout for this request, in seconds
"""
return self._get_api_list(
- "/v1/openai/v1/chat/completions",
+ "/v1/chat/completions",
page=SyncOpenAICursorPage[CompletionListResponse],
options=make_request_options(
extra_headers=extra_headers,
@@ -845,7 +845,7 @@ async def create(
timeout: float | httpx.Timeout | None | NotGiven = not_given,
) -> CompletionCreateResponse | AsyncStream[ChatCompletionChunk]:
return await self._post(
- "/v1/openai/v1/chat/completions",
+ "/v1/chat/completions",
body=await async_maybe_transform(
{
"messages": messages,
@@ -912,7 +912,7 @@ async def retrieve(
if not completion_id:
raise ValueError(f"Expected a non-empty value for `completion_id` but received {completion_id!r}")
return await self._get(
- f"/v1/openai/v1/chat/completions/{completion_id}",
+ f"/v1/chat/completions/{completion_id}",
options=make_request_options(
extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
),
@@ -954,7 +954,7 @@ def list(
timeout: Override the client-level default timeout for this request, in seconds
"""
return self._get_api_list(
- "/v1/openai/v1/chat/completions",
+ "/v1/chat/completions",
page=AsyncOpenAICursorPage[CompletionListResponse],
options=make_request_options(
extra_headers=extra_headers,
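With the deprecated `inference.chat_completion` helpers deleted later in this diff, callers move to `chat.completions.create`, which now posts to `/v1/chat/completions`. A rough migration sketch; the model name is a placeholder and the old call is shown only as a comment for comparison:

```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient()

# Before (removed in this release):
#   client.inference.chat_completion(
#       messages=[{"content": "Hello", "role": "user"}],
#       model_id="model",
#   )

# After: the OpenAI-compatible chat completions resource. Note `model`
# replaces the old `model_id` parameter.
completion = client.chat.completions.create(
    messages=[{"content": "Hello", "role": "user"}],
    model="model",
)
print(completion)
```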
diff --git a/src/llama_stack_client/resources/completions.py b/src/llama_stack_client/resources/completions.py
index 2c1475de..caeab7a1 100644
--- a/src/llama_stack_client/resources/completions.py
+++ b/src/llama_stack_client/resources/completions.py
@@ -326,7 +326,7 @@ def create(
timeout: float | httpx.Timeout | None | NotGiven = not_given,
) -> CompletionCreateResponse | Stream[CompletionCreateResponse]:
return self._post(
- "/v1/openai/v1/completions",
+ "/v1/completions",
body=maybe_transform(
{
"model": model,
@@ -664,7 +664,7 @@ async def create(
timeout: float | httpx.Timeout | None | NotGiven = not_given,
) -> CompletionCreateResponse | AsyncStream[CompletionCreateResponse]:
return await self._post(
- "/v1/openai/v1/completions",
+ "/v1/completions",
body=await async_maybe_transform(
{
"model": model,
diff --git a/src/llama_stack_client/resources/embeddings.py b/src/llama_stack_client/resources/embeddings.py
index 60c38cb2..29cd69d8 100644
--- a/src/llama_stack_client/resources/embeddings.py
+++ b/src/llama_stack_client/resources/embeddings.py
@@ -87,7 +87,7 @@ def create(
timeout: Override the client-level default timeout for this request, in seconds
"""
return self._post(
- "/v1/openai/v1/embeddings",
+ "/v1/embeddings",
body=maybe_transform(
{
"input": input,
@@ -169,7 +169,7 @@ async def create(
timeout: Override the client-level default timeout for this request, in seconds
"""
return await self._post(
- "/v1/openai/v1/embeddings",
+ "/v1/embeddings",
body=await async_maybe_transform(
{
"input": input,
diff --git a/src/llama_stack_client/resources/files.py b/src/llama_stack_client/resources/files.py
index 6b395e52..39add811 100644
--- a/src/llama_stack_client/resources/files.py
+++ b/src/llama_stack_client/resources/files.py
@@ -51,6 +51,7 @@ def create(
*,
file: FileTypes,
purpose: Literal["assistants", "batch"],
+ expires_after: file_create_params.ExpiresAfter | Omit = omit,
# Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
# The extra values given here take precedence over values defined on the client or passed to this method.
extra_headers: Headers | None = None,
@@ -65,10 +66,17 @@ def create(
- file: The File object (not file name) to be uploaded.
- purpose: The intended purpose of the uploaded file.
+ - expires_after: Optional form values describing expiration for the file.
Args:
purpose: Valid purpose values for OpenAI Files API.
+ expires_after:
+ Control expiration of uploaded files. Params:
+
+ - anchor, must be "created_at"
+ - seconds, must be int between 3600 and 2592000 (1 hour to 30 days)
+
extra_headers: Send extra headers
extra_query: Add additional query parameters to the request
@@ -81,6 +89,7 @@ def create(
{
"file": file,
"purpose": purpose,
+ "expires_after": expires_after,
}
)
files = extract_files(cast(Mapping[str, object], body), paths=[["file"]])
@@ -89,7 +98,7 @@ def create(
# multipart/form-data; boundary=---abc--
extra_headers = {"Content-Type": "multipart/form-data", **(extra_headers or {})}
return self._post(
- "/v1/openai/v1/files",
+ "/v1/files",
body=maybe_transform(body, file_create_params.FileCreateParams),
files=files,
options=make_request_options(
@@ -124,7 +133,7 @@ def retrieve(
if not file_id:
raise ValueError(f"Expected a non-empty value for `file_id` but received {file_id!r}")
return self._get(
- f"/v1/openai/v1/files/{file_id}",
+ f"/v1/files/{file_id}",
options=make_request_options(
extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
),
@@ -171,7 +180,7 @@ def list(
timeout: Override the client-level default timeout for this request, in seconds
"""
return self._get_api_list(
- "/v1/openai/v1/files",
+ "/v1/files",
page=SyncOpenAICursorPage[File],
options=make_request_options(
extra_headers=extra_headers,
@@ -217,7 +226,7 @@ def delete(
if not file_id:
raise ValueError(f"Expected a non-empty value for `file_id` but received {file_id!r}")
return self._delete(
- f"/v1/openai/v1/files/{file_id}",
+ f"/v1/files/{file_id}",
options=make_request_options(
extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
),
@@ -250,7 +259,7 @@ def content(
if not file_id:
raise ValueError(f"Expected a non-empty value for `file_id` but received {file_id!r}")
return self._get(
- f"/v1/openai/v1/files/{file_id}/content",
+ f"/v1/files/{file_id}/content",
options=make_request_options(
extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
),
@@ -283,6 +292,7 @@ async def create(
*,
file: FileTypes,
purpose: Literal["assistants", "batch"],
+ expires_after: file_create_params.ExpiresAfter | Omit = omit,
# Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
# The extra values given here take precedence over values defined on the client or passed to this method.
extra_headers: Headers | None = None,
@@ -297,10 +307,17 @@ async def create(
- file: The File object (not file name) to be uploaded.
- purpose: The intended purpose of the uploaded file.
+ - expires_after: Optional form values describing expiration for the file.
Args:
purpose: Valid purpose values for OpenAI Files API.
+ expires_after:
+ Control expiration of uploaded files. Params:
+
+ - anchor, must be "created_at"
+ - seconds, must be int between 3600 and 2592000 (1 hour to 30 days)
+
extra_headers: Send extra headers
extra_query: Add additional query parameters to the request
@@ -313,6 +330,7 @@ async def create(
{
"file": file,
"purpose": purpose,
+ "expires_after": expires_after,
}
)
files = extract_files(cast(Mapping[str, object], body), paths=[["file"]])
@@ -321,7 +339,7 @@ async def create(
# multipart/form-data; boundary=---abc--
extra_headers = {"Content-Type": "multipart/form-data", **(extra_headers or {})}
return await self._post(
- "/v1/openai/v1/files",
+ "/v1/files",
body=await async_maybe_transform(body, file_create_params.FileCreateParams),
files=files,
options=make_request_options(
@@ -356,7 +374,7 @@ async def retrieve(
if not file_id:
raise ValueError(f"Expected a non-empty value for `file_id` but received {file_id!r}")
return await self._get(
- f"/v1/openai/v1/files/{file_id}",
+ f"/v1/files/{file_id}",
options=make_request_options(
extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
),
@@ -403,7 +421,7 @@ def list(
timeout: Override the client-level default timeout for this request, in seconds
"""
return self._get_api_list(
- "/v1/openai/v1/files",
+ "/v1/files",
page=AsyncOpenAICursorPage[File],
options=make_request_options(
extra_headers=extra_headers,
@@ -449,7 +467,7 @@ async def delete(
if not file_id:
raise ValueError(f"Expected a non-empty value for `file_id` but received {file_id!r}")
return await self._delete(
- f"/v1/openai/v1/files/{file_id}",
+ f"/v1/files/{file_id}",
options=make_request_options(
extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
),
@@ -482,7 +500,7 @@ async def content(
if not file_id:
raise ValueError(f"Expected a non-empty value for `file_id` but received {file_id!r}")
return await self._get(
- f"/v1/openai/v1/files/{file_id}/content",
+ f"/v1/files/{file_id}/content",
options=make_request_options(
extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
),
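The new `expires_after` form field documented above accepts an anchor of `"created_at"` and a `seconds` value between 3600 and 2592000. A minimal sketch of an upload that expires one day after creation; the file path is a placeholder:

```python
from pathlib import Path

from llama_stack_client import LlamaStackClient

client = LlamaStackClient()

# Upload a file and ask the server to expire it 24 hours after creation.
uploaded = client.files.create(
    file=Path("/path/to/file"),
    purpose="assistants",
    expires_after={
        "anchor": "created_at",  # must be "created_at" per the docstring above
        "seconds": 86400,  # 24 hours; allowed range is 3600 to 2592000
    },
)
print(uploaded)
```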
diff --git a/src/llama_stack_client/resources/inference.py b/src/llama_stack_client/resources/inference.py
index 732025cc..e5cf7b6b 100644
--- a/src/llama_stack_client/resources/inference.py
+++ b/src/llama_stack_client/resources/inference.py
@@ -2,1106 +2,76 @@
from __future__ import annotations
-import typing_extensions
-from typing import Type, Union, Iterable, cast
-from typing_extensions import Literal, overload
+from typing import Type, cast
import httpx
-from ..types import (
- inference_rerank_params,
- inference_completion_params,
- inference_embeddings_params,
- inference_chat_completion_params,
- inference_batch_completion_params,
- inference_batch_chat_completion_params,
-)
-from .._types import Body, Omit, Query, Headers, NotGiven, SequenceNotStr, omit, not_given
-from .._utils import required_args, maybe_transform, async_maybe_transform
-from .._compat import cached_property
-from .._resource import SyncAPIResource, AsyncAPIResource
-from .._response import (
- to_raw_response_wrapper,
- to_streamed_response_wrapper,
- async_to_raw_response_wrapper,
- async_to_streamed_response_wrapper,
-)
-from .._wrappers import DataWrapper
-from .._streaming import Stream, AsyncStream
-from .._base_client import make_request_options
-from ..types.completion_response import CompletionResponse
-from ..types.embeddings_response import EmbeddingsResponse
-from ..types.shared_params.message import Message
-from ..types.shared.batch_completion import BatchCompletion
-from ..types.inference_rerank_response import InferenceRerankResponse
-from ..types.shared_params.response_format import ResponseFormat
-from ..types.shared_params.sampling_params import SamplingParams
-from ..types.shared.chat_completion_response import ChatCompletionResponse
-from ..types.shared_params.interleaved_content import InterleavedContent
-from ..types.chat_completion_response_stream_chunk import ChatCompletionResponseStreamChunk
-from ..types.shared_params.interleaved_content_item import InterleavedContentItem
-from ..types.inference_batch_chat_completion_response import InferenceBatchChatCompletionResponse
-
-__all__ = ["InferenceResource", "AsyncInferenceResource"]
-
-
-class InferenceResource(SyncAPIResource):
- @cached_property
- def with_raw_response(self) -> InferenceResourceWithRawResponse:
- """
- This property can be used as a prefix for any HTTP method call to return
- the raw response object instead of the parsed content.
-
- For more information, see https://www.github.com/llamastack/llama-stack-client-python#accessing-raw-response-data-eg-headers
- """
- return InferenceResourceWithRawResponse(self)
-
- @cached_property
- def with_streaming_response(self) -> InferenceResourceWithStreamingResponse:
- """
- An alternative to `.with_raw_response` that doesn't eagerly read the response body.
-
- For more information, see https://www.github.com/llamastack/llama-stack-client-python#with_streaming_response
- """
- return InferenceResourceWithStreamingResponse(self)
-
- def batch_chat_completion(
- self,
- *,
- messages_batch: Iterable[Iterable[Message]],
- model_id: str,
- logprobs: inference_batch_chat_completion_params.Logprobs | Omit = omit,
- response_format: ResponseFormat | Omit = omit,
- sampling_params: SamplingParams | Omit = omit,
- tool_config: inference_batch_chat_completion_params.ToolConfig | Omit = omit,
- tools: Iterable[inference_batch_chat_completion_params.Tool] | Omit = omit,
- # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
- # The extra values given here take precedence over values defined on the client or passed to this method.
- extra_headers: Headers | None = None,
- extra_query: Query | None = None,
- extra_body: Body | None = None,
- timeout: float | httpx.Timeout | None | NotGiven = not_given,
- ) -> InferenceBatchChatCompletionResponse:
- """
- Generate chat completions for a batch of messages using the specified model.
-
- Args:
- messages_batch: The messages to generate completions for.
-
- model_id: The identifier of the model to use. The model must be registered with Llama
- Stack and available via the /models endpoint.
-
- logprobs: (Optional) If specified, log probabilities for each token position will be
- returned.
-
- response_format: (Optional) Grammar specification for guided (structured) decoding.
-
- sampling_params: (Optional) Parameters to control the sampling strategy.
-
- tool_config: (Optional) Configuration for tool use.
-
- tools: (Optional) List of tool definitions available to the model.
-
- extra_headers: Send extra headers
-
- extra_query: Add additional query parameters to the request
-
- extra_body: Add additional JSON properties to the request
-
- timeout: Override the client-level default timeout for this request, in seconds
- """
- return self._post(
- "/v1/inference/batch-chat-completion",
- body=maybe_transform(
- {
- "messages_batch": messages_batch,
- "model_id": model_id,
- "logprobs": logprobs,
- "response_format": response_format,
- "sampling_params": sampling_params,
- "tool_config": tool_config,
- "tools": tools,
- },
- inference_batch_chat_completion_params.InferenceBatchChatCompletionParams,
- ),
- options=make_request_options(
- extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
- ),
- cast_to=InferenceBatchChatCompletionResponse,
- )
-
- def batch_completion(
- self,
- *,
- content_batch: SequenceNotStr[InterleavedContent],
- model_id: str,
- logprobs: inference_batch_completion_params.Logprobs | Omit = omit,
- response_format: ResponseFormat | Omit = omit,
- sampling_params: SamplingParams | Omit = omit,
- # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
- # The extra values given here take precedence over values defined on the client or passed to this method.
- extra_headers: Headers | None = None,
- extra_query: Query | None = None,
- extra_body: Body | None = None,
- timeout: float | httpx.Timeout | None | NotGiven = not_given,
- ) -> BatchCompletion:
- """
- Generate completions for a batch of content using the specified model.
-
- Args:
- content_batch: The content to generate completions for.
-
- model_id: The identifier of the model to use. The model must be registered with Llama
- Stack and available via the /models endpoint.
-
- logprobs: (Optional) If specified, log probabilities for each token position will be
- returned.
-
- response_format: (Optional) Grammar specification for guided (structured) decoding.
-
- sampling_params: (Optional) Parameters to control the sampling strategy.
-
- extra_headers: Send extra headers
-
- extra_query: Add additional query parameters to the request
-
- extra_body: Add additional JSON properties to the request
-
- timeout: Override the client-level default timeout for this request, in seconds
- """
- return self._post(
- "/v1/inference/batch-completion",
- body=maybe_transform(
- {
- "content_batch": content_batch,
- "model_id": model_id,
- "logprobs": logprobs,
- "response_format": response_format,
- "sampling_params": sampling_params,
- },
- inference_batch_completion_params.InferenceBatchCompletionParams,
- ),
- options=make_request_options(
- extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
- ),
- cast_to=BatchCompletion,
- )
-
- @typing_extensions.deprecated(
- "/v1/inference/chat-completion is deprecated. Please use /v1/openai/v1/chat/completions."
- )
- @overload
- def chat_completion(
- self,
- *,
- messages: Iterable[Message],
- model_id: str,
- logprobs: inference_chat_completion_params.Logprobs | Omit = omit,
- response_format: ResponseFormat | Omit = omit,
- sampling_params: SamplingParams | Omit = omit,
- stream: Literal[False] | Omit = omit,
- tool_choice: Literal["auto", "required", "none"] | Omit = omit,
- tool_config: inference_chat_completion_params.ToolConfig | Omit = omit,
- tool_prompt_format: Literal["json", "function_tag", "python_list"] | Omit = omit,
- tools: Iterable[inference_chat_completion_params.Tool] | Omit = omit,
- # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
- # The extra values given here take precedence over values defined on the client or passed to this method.
- extra_headers: Headers | None = None,
- extra_query: Query | None = None,
- extra_body: Body | None = None,
- timeout: float | httpx.Timeout | None | NotGiven = not_given,
- ) -> ChatCompletionResponse:
- """
- Generate a chat completion for the given messages using the specified model.
-
- Args:
- messages: List of messages in the conversation.
-
- model_id: The identifier of the model to use. The model must be registered with Llama
- Stack and available via the /models endpoint.
-
- logprobs: (Optional) If specified, log probabilities for each token position will be
- returned.
-
- response_format: (Optional) Grammar specification for guided (structured) decoding. There are two
- options: - `ResponseFormat.json_schema`: The grammar is a JSON schema. Most
- providers support this format. - `ResponseFormat.grammar`: The grammar is a BNF
- grammar. This format is more flexible, but not all providers support it.
-
- sampling_params: Parameters to control the sampling strategy.
-
- stream: (Optional) If True, generate an SSE event stream of the response. Defaults to
- False.
-
- tool_choice: (Optional) Whether tool use is required or automatic. Defaults to
- ToolChoice.auto. .. deprecated:: Use tool_config instead.
-
- tool_config: (Optional) Configuration for tool use.
-
- tool_prompt_format: (Optional) Instructs the model how to format tool calls. By default, Llama Stack
- will attempt to use a format that is best adapted to the model. -
- `ToolPromptFormat.json`: The tool calls are formatted as a JSON object. -
- `ToolPromptFormat.function_tag`: The tool calls are enclosed in a
- tag. - `ToolPromptFormat.python_list`: The tool calls
- are output as Python syntax -- a list of function calls. .. deprecated:: Use
- tool_config instead.
-
- tools: (Optional) List of tool definitions available to the model.
-
- extra_headers: Send extra headers
-
- extra_query: Add additional query parameters to the request
-
- extra_body: Add additional JSON properties to the request
-
- timeout: Override the client-level default timeout for this request, in seconds
- """
- ...
-
- @typing_extensions.deprecated(
- "/v1/inference/chat-completion is deprecated. Please use /v1/openai/v1/chat/completions."
- )
- @overload
- def chat_completion(
- self,
- *,
- messages: Iterable[Message],
- model_id: str,
- stream: Literal[True],
- logprobs: inference_chat_completion_params.Logprobs | Omit = omit,
- response_format: ResponseFormat | Omit = omit,
- sampling_params: SamplingParams | Omit = omit,
- tool_choice: Literal["auto", "required", "none"] | Omit = omit,
- tool_config: inference_chat_completion_params.ToolConfig | Omit = omit,
- tool_prompt_format: Literal["json", "function_tag", "python_list"] | Omit = omit,
- tools: Iterable[inference_chat_completion_params.Tool] | Omit = omit,
- # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
- # The extra values given here take precedence over values defined on the client or passed to this method.
- extra_headers: Headers | None = None,
- extra_query: Query | None = None,
- extra_body: Body | None = None,
- timeout: float | httpx.Timeout | None | NotGiven = not_given,
- ) -> Stream[ChatCompletionResponseStreamChunk]:
- """
- Generate a chat completion for the given messages using the specified model.
-
- Args:
- messages: List of messages in the conversation.
-
- model_id: The identifier of the model to use. The model must be registered with Llama
- Stack and available via the /models endpoint.
-
- stream: (Optional) If True, generate an SSE event stream of the response. Defaults to
- False.
-
- logprobs: (Optional) If specified, log probabilities for each token position will be
- returned.
-
- response_format: (Optional) Grammar specification for guided (structured) decoding. There are two
- options: - `ResponseFormat.json_schema`: The grammar is a JSON schema. Most
- providers support this format. - `ResponseFormat.grammar`: The grammar is a BNF
- grammar. This format is more flexible, but not all providers support it.
-
- sampling_params: Parameters to control the sampling strategy.
-
- tool_choice: (Optional) Whether tool use is required or automatic. Defaults to
- ToolChoice.auto. .. deprecated:: Use tool_config instead.
-
- tool_config: (Optional) Configuration for tool use.
-
- tool_prompt_format: (Optional) Instructs the model how to format tool calls. By default, Llama Stack
- will attempt to use a format that is best adapted to the model. -
- `ToolPromptFormat.json`: The tool calls are formatted as a JSON object. -
- `ToolPromptFormat.function_tag`: The tool calls are enclosed in a
- tag. - `ToolPromptFormat.python_list`: The tool calls
- are output as Python syntax -- a list of function calls. .. deprecated:: Use
- tool_config instead.
-
- tools: (Optional) List of tool definitions available to the model.
-
- extra_headers: Send extra headers
-
- extra_query: Add additional query parameters to the request
-
- extra_body: Add additional JSON properties to the request
-
- timeout: Override the client-level default timeout for this request, in seconds
- """
- ...
-
- @typing_extensions.deprecated(
- "/v1/inference/chat-completion is deprecated. Please use /v1/openai/v1/chat/completions."
- )
- @overload
- def chat_completion(
- self,
- *,
- messages: Iterable[Message],
- model_id: str,
- stream: bool,
- logprobs: inference_chat_completion_params.Logprobs | Omit = omit,
- response_format: ResponseFormat | Omit = omit,
- sampling_params: SamplingParams | Omit = omit,
- tool_choice: Literal["auto", "required", "none"] | Omit = omit,
- tool_config: inference_chat_completion_params.ToolConfig | Omit = omit,
- tool_prompt_format: Literal["json", "function_tag", "python_list"] | Omit = omit,
- tools: Iterable[inference_chat_completion_params.Tool] | Omit = omit,
- # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
- # The extra values given here take precedence over values defined on the client or passed to this method.
- extra_headers: Headers | None = None,
- extra_query: Query | None = None,
- extra_body: Body | None = None,
- timeout: float | httpx.Timeout | None | NotGiven = not_given,
- ) -> ChatCompletionResponse | Stream[ChatCompletionResponseStreamChunk]:
- """
- Generate a chat completion for the given messages using the specified model.
-
- Args:
- messages: List of messages in the conversation.
-
- model_id: The identifier of the model to use. The model must be registered with Llama
- Stack and available via the /models endpoint.
-
- stream: (Optional) If True, generate an SSE event stream of the response. Defaults to
- False.
-
- logprobs: (Optional) If specified, log probabilities for each token position will be
- returned.
-
- response_format: (Optional) Grammar specification for guided (structured) decoding. There are two
- options: - `ResponseFormat.json_schema`: The grammar is a JSON schema. Most
- providers support this format. - `ResponseFormat.grammar`: The grammar is a BNF
- grammar. This format is more flexible, but not all providers support it.
-
- sampling_params: Parameters to control the sampling strategy.
-
- tool_choice: (Optional) Whether tool use is required or automatic. Defaults to
- ToolChoice.auto. .. deprecated:: Use tool_config instead.
-
- tool_config: (Optional) Configuration for tool use.
-
- tool_prompt_format: (Optional) Instructs the model how to format tool calls. By default, Llama Stack
- will attempt to use a format that is best adapted to the model. -
- `ToolPromptFormat.json`: The tool calls are formatted as a JSON object. -
- `ToolPromptFormat.function_tag`: The tool calls are enclosed in a
- tag. - `ToolPromptFormat.python_list`: The tool calls
- are output as Python syntax -- a list of function calls. .. deprecated:: Use
- tool_config instead.
-
- tools: (Optional) List of tool definitions available to the model.
-
- extra_headers: Send extra headers
-
- extra_query: Add additional query parameters to the request
-
- extra_body: Add additional JSON properties to the request
-
- timeout: Override the client-level default timeout for this request, in seconds
- """
- ...
-
- @typing_extensions.deprecated(
- "/v1/inference/chat-completion is deprecated. Please use /v1/openai/v1/chat/completions."
- )
- @required_args(["messages", "model_id"], ["messages", "model_id", "stream"])
- def chat_completion(
- self,
- *,
- messages: Iterable[Message],
- model_id: str,
- logprobs: inference_chat_completion_params.Logprobs | Omit = omit,
- response_format: ResponseFormat | Omit = omit,
- sampling_params: SamplingParams | Omit = omit,
- stream: Literal[False] | Literal[True] | Omit = omit,
- tool_choice: Literal["auto", "required", "none"] | Omit = omit,
- tool_config: inference_chat_completion_params.ToolConfig | Omit = omit,
- tool_prompt_format: Literal["json", "function_tag", "python_list"] | Omit = omit,
- tools: Iterable[inference_chat_completion_params.Tool] | Omit = omit,
- # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
- # The extra values given here take precedence over values defined on the client or passed to this method.
- extra_headers: Headers | None = None,
- extra_query: Query | None = None,
- extra_body: Body | None = None,
- timeout: float | httpx.Timeout | None | NotGiven = not_given,
- ) -> ChatCompletionResponse | Stream[ChatCompletionResponseStreamChunk]:
- if stream:
- extra_headers = {"Accept": "text/event-stream", **(extra_headers or {})}
- return self._post(
- "/v1/inference/chat-completion",
- body=maybe_transform(
- {
- "messages": messages,
- "model_id": model_id,
- "logprobs": logprobs,
- "response_format": response_format,
- "sampling_params": sampling_params,
- "stream": stream,
- "tool_choice": tool_choice,
- "tool_config": tool_config,
- "tool_prompt_format": tool_prompt_format,
- "tools": tools,
- },
- inference_chat_completion_params.InferenceChatCompletionParamsStreaming
- if stream
- else inference_chat_completion_params.InferenceChatCompletionParamsNonStreaming,
- ),
- options=make_request_options(
- extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
- ),
- cast_to=ChatCompletionResponse,
- stream=stream or False,
- stream_cls=Stream[ChatCompletionResponseStreamChunk],
- )
-
- @typing_extensions.deprecated("/v1/inference/completion is deprecated. Please use /v1/openai/v1/completions.")
- @overload
- def completion(
- self,
- *,
- content: InterleavedContent,
- model_id: str,
- logprobs: inference_completion_params.Logprobs | Omit = omit,
- response_format: ResponseFormat | Omit = omit,
- sampling_params: SamplingParams | Omit = omit,
- stream: Literal[False] | Omit = omit,
- # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
- # The extra values given here take precedence over values defined on the client or passed to this method.
- extra_headers: Headers | None = None,
- extra_query: Query | None = None,
- extra_body: Body | None = None,
- timeout: float | httpx.Timeout | None | NotGiven = not_given,
- ) -> CompletionResponse:
- """
- Generate a completion for the given content using the specified model.
-
- Args:
- content: The content to generate a completion for.
-
- model_id: The identifier of the model to use. The model must be registered with Llama
- Stack and available via the /models endpoint.
-
- logprobs: (Optional) If specified, log probabilities for each token position will be
- returned.
-
- response_format: (Optional) Grammar specification for guided (structured) decoding.
-
- sampling_params: (Optional) Parameters to control the sampling strategy.
-
- stream: (Optional) If True, generate an SSE event stream of the response. Defaults to
- False.
-
- extra_headers: Send extra headers
-
- extra_query: Add additional query parameters to the request
-
- extra_body: Add additional JSON properties to the request
-
- timeout: Override the client-level default timeout for this request, in seconds
- """
- ...
-
- @typing_extensions.deprecated("/v1/inference/completion is deprecated. Please use /v1/openai/v1/completions.")
- @overload
- def completion(
- self,
- *,
- content: InterleavedContent,
- model_id: str,
- stream: Literal[True],
- logprobs: inference_completion_params.Logprobs | Omit = omit,
- response_format: ResponseFormat | Omit = omit,
- sampling_params: SamplingParams | Omit = omit,
- # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
- # The extra values given here take precedence over values defined on the client or passed to this method.
- extra_headers: Headers | None = None,
- extra_query: Query | None = None,
- extra_body: Body | None = None,
- timeout: float | httpx.Timeout | None | NotGiven = not_given,
- ) -> Stream[CompletionResponse]:
- """
- Generate a completion for the given content using the specified model.
-
- Args:
- content: The content to generate a completion for.
-
- model_id: The identifier of the model to use. The model must be registered with Llama
- Stack and available via the /models endpoint.
-
- stream: (Optional) If True, generate an SSE event stream of the response. Defaults to
- False.
-
- logprobs: (Optional) If specified, log probabilities for each token position will be
- returned.
-
- response_format: (Optional) Grammar specification for guided (structured) decoding.
-
- sampling_params: (Optional) Parameters to control the sampling strategy.
-
- extra_headers: Send extra headers
-
- extra_query: Add additional query parameters to the request
-
- extra_body: Add additional JSON properties to the request
-
- timeout: Override the client-level default timeout for this request, in seconds
- """
- ...
-
- @typing_extensions.deprecated("/v1/inference/completion is deprecated. Please use /v1/openai/v1/completions.")
- @overload
- def completion(
- self,
- *,
- content: InterleavedContent,
- model_id: str,
- stream: bool,
- logprobs: inference_completion_params.Logprobs | Omit = omit,
- response_format: ResponseFormat | Omit = omit,
- sampling_params: SamplingParams | Omit = omit,
- # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
- # The extra values given here take precedence over values defined on the client or passed to this method.
- extra_headers: Headers | None = None,
- extra_query: Query | None = None,
- extra_body: Body | None = None,
- timeout: float | httpx.Timeout | None | NotGiven = not_given,
- ) -> CompletionResponse | Stream[CompletionResponse]:
- """
- Generate a completion for the given content using the specified model.
-
- Args:
- content: The content to generate a completion for.
-
- model_id: The identifier of the model to use. The model must be registered with Llama
- Stack and available via the /models endpoint.
-
- stream: (Optional) If True, generate an SSE event stream of the response. Defaults to
- False.
-
- logprobs: (Optional) If specified, log probabilities for each token position will be
- returned.
-
- response_format: (Optional) Grammar specification for guided (structured) decoding.
-
- sampling_params: (Optional) Parameters to control the sampling strategy.
-
- extra_headers: Send extra headers
-
- extra_query: Add additional query parameters to the request
-
- extra_body: Add additional JSON properties to the request
-
- timeout: Override the client-level default timeout for this request, in seconds
- """
- ...
-
- @typing_extensions.deprecated("/v1/inference/completion is deprecated. Please use /v1/openai/v1/completions.")
- @required_args(["content", "model_id"], ["content", "model_id", "stream"])
- def completion(
- self,
- *,
- content: InterleavedContent,
- model_id: str,
- logprobs: inference_completion_params.Logprobs | Omit = omit,
- response_format: ResponseFormat | Omit = omit,
- sampling_params: SamplingParams | Omit = omit,
- stream: Literal[False] | Literal[True] | Omit = omit,
- # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
- # The extra values given here take precedence over values defined on the client or passed to this method.
- extra_headers: Headers | None = None,
- extra_query: Query | None = None,
- extra_body: Body | None = None,
- timeout: float | httpx.Timeout | None | NotGiven = not_given,
- ) -> CompletionResponse | Stream[CompletionResponse]:
- if stream:
- extra_headers = {"Accept": "text/event-stream", **(extra_headers or {})}
- return self._post(
- "/v1/inference/completion",
- body=maybe_transform(
- {
- "content": content,
- "model_id": model_id,
- "logprobs": logprobs,
- "response_format": response_format,
- "sampling_params": sampling_params,
- "stream": stream,
- },
- inference_completion_params.InferenceCompletionParamsStreaming
- if stream
- else inference_completion_params.InferenceCompletionParamsNonStreaming,
- ),
- options=make_request_options(
- extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
- ),
- cast_to=CompletionResponse,
- stream=stream or False,
- stream_cls=Stream[CompletionResponse],
- )
-
- @typing_extensions.deprecated("/v1/inference/embeddings is deprecated. Please use /v1/openai/v1/embeddings.")
- def embeddings(
- self,
- *,
- contents: Union[SequenceNotStr[str], Iterable[InterleavedContentItem]],
- model_id: str,
- output_dimension: int | Omit = omit,
- task_type: Literal["query", "document"] | Omit = omit,
- text_truncation: Literal["none", "start", "end"] | Omit = omit,
- # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
- # The extra values given here take precedence over values defined on the client or passed to this method.
- extra_headers: Headers | None = None,
- extra_query: Query | None = None,
- extra_body: Body | None = None,
- timeout: float | httpx.Timeout | None | NotGiven = not_given,
- ) -> EmbeddingsResponse:
- """
- Generate embeddings for content pieces using the specified model.
-
- Args:
- contents: List of contents to generate embeddings for. Each content can be a string or an
- InterleavedContentItem (and hence can be multimodal). The behavior depends on
- the model and provider. Some models may only support text.
-
- model_id: The identifier of the model to use. The model must be an embedding model
- registered with Llama Stack and available via the /models endpoint.
-
- output_dimension: (Optional) Output dimensionality for the embeddings. Only supported by
- Matryoshka models.
-
- task_type: (Optional) How is the embedding being used? This is only supported by asymmetric
- embedding models.
-
- text_truncation: (Optional) Config for how to truncate text for embedding when text is longer
- than the model's max sequence length.
-
- extra_headers: Send extra headers
-
- extra_query: Add additional query parameters to the request
-
- extra_body: Add additional JSON properties to the request
-
- timeout: Override the client-level default timeout for this request, in seconds
- """
- return self._post(
- "/v1/inference/embeddings",
- body=maybe_transform(
- {
- "contents": contents,
- "model_id": model_id,
- "output_dimension": output_dimension,
- "task_type": task_type,
- "text_truncation": text_truncation,
- },
- inference_embeddings_params.InferenceEmbeddingsParams,
- ),
- options=make_request_options(
- extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
- ),
- cast_to=EmbeddingsResponse,
- )
-
- def rerank(
- self,
- *,
- items: SequenceNotStr[inference_rerank_params.Item],
- model: str,
- query: inference_rerank_params.Query,
- max_num_results: int | Omit = omit,
- # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
- # The extra values given here take precedence over values defined on the client or passed to this method.
- extra_headers: Headers | None = None,
- extra_query: Query | None = None,
- extra_body: Body | None = None,
- timeout: float | httpx.Timeout | None | NotGiven = not_given,
- ) -> InferenceRerankResponse:
- """
- Rerank a list of documents based on their relevance to a query.
-
- Args:
- items: List of items to rerank. Each item can be a string, text content part, or image
- content part. Each input must not exceed the model's max input token length.
-
- model: The identifier of the reranking model to use.
-
- query: The search query to rank items against. Can be a string, text content part, or
- image content part. The input must not exceed the model's max input token
- length.
-
- max_num_results: (Optional) Maximum number of results to return. Default: returns all.
-
- extra_headers: Send extra headers
-
- extra_query: Add additional query parameters to the request
-
- extra_body: Add additional JSON properties to the request
-
- timeout: Override the client-level default timeout for this request, in seconds
- """
- return self._post(
- "/v1/inference/rerank",
- body=maybe_transform(
- {
- "items": items,
- "model": model,
- "query": query,
- "max_num_results": max_num_results,
- },
- inference_rerank_params.InferenceRerankParams,
- ),
- options=make_request_options(
- extra_headers=extra_headers,
- extra_query=extra_query,
- extra_body=extra_body,
- timeout=timeout,
- post_parser=DataWrapper[InferenceRerankResponse]._unwrapper,
- ),
- cast_to=cast(Type[InferenceRerankResponse], DataWrapper[InferenceRerankResponse]),
- )
-
-
-class AsyncInferenceResource(AsyncAPIResource):
- @cached_property
- def with_raw_response(self) -> AsyncInferenceResourceWithRawResponse:
- """
- This property can be used as a prefix for any HTTP method call to return
- the raw response object instead of the parsed content.
-
- For more information, see https://www.github.com/llamastack/llama-stack-client-python#accessing-raw-response-data-eg-headers
- """
- return AsyncInferenceResourceWithRawResponse(self)
-
- @cached_property
- def with_streaming_response(self) -> AsyncInferenceResourceWithStreamingResponse:
- """
- An alternative to `.with_raw_response` that doesn't eagerly read the response body.
-
- For more information, see https://www.github.com/llamastack/llama-stack-client-python#with_streaming_response
- """
- return AsyncInferenceResourceWithStreamingResponse(self)
-
- async def batch_chat_completion(
- self,
- *,
- messages_batch: Iterable[Iterable[Message]],
- model_id: str,
- logprobs: inference_batch_chat_completion_params.Logprobs | Omit = omit,
- response_format: ResponseFormat | Omit = omit,
- sampling_params: SamplingParams | Omit = omit,
- tool_config: inference_batch_chat_completion_params.ToolConfig | Omit = omit,
- tools: Iterable[inference_batch_chat_completion_params.Tool] | Omit = omit,
- # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
- # The extra values given here take precedence over values defined on the client or passed to this method.
- extra_headers: Headers | None = None,
- extra_query: Query | None = None,
- extra_body: Body | None = None,
- timeout: float | httpx.Timeout | None | NotGiven = not_given,
- ) -> InferenceBatchChatCompletionResponse:
- """
- Generate chat completions for a batch of messages using the specified model.
-
- Args:
- messages_batch: The messages to generate completions for.
-
- model_id: The identifier of the model to use. The model must be registered with Llama
- Stack and available via the /models endpoint.
-
- logprobs: (Optional) If specified, log probabilities for each token position will be
- returned.
-
- response_format: (Optional) Grammar specification for guided (structured) decoding.
-
- sampling_params: (Optional) Parameters to control the sampling strategy.
-
- tool_config: (Optional) Configuration for tool use.
-
- tools: (Optional) List of tool definitions available to the model.
-
- extra_headers: Send extra headers
-
- extra_query: Add additional query parameters to the request
-
- extra_body: Add additional JSON properties to the request
-
- timeout: Override the client-level default timeout for this request, in seconds
- """
- return await self._post(
- "/v1/inference/batch-chat-completion",
- body=await async_maybe_transform(
- {
- "messages_batch": messages_batch,
- "model_id": model_id,
- "logprobs": logprobs,
- "response_format": response_format,
- "sampling_params": sampling_params,
- "tool_config": tool_config,
- "tools": tools,
- },
- inference_batch_chat_completion_params.InferenceBatchChatCompletionParams,
- ),
- options=make_request_options(
- extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
- ),
- cast_to=InferenceBatchChatCompletionResponse,
- )
-
- async def batch_completion(
- self,
- *,
- content_batch: SequenceNotStr[InterleavedContent],
- model_id: str,
- logprobs: inference_batch_completion_params.Logprobs | Omit = omit,
- response_format: ResponseFormat | Omit = omit,
- sampling_params: SamplingParams | Omit = omit,
- # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
- # The extra values given here take precedence over values defined on the client or passed to this method.
- extra_headers: Headers | None = None,
- extra_query: Query | None = None,
- extra_body: Body | None = None,
- timeout: float | httpx.Timeout | None | NotGiven = not_given,
- ) -> BatchCompletion:
- """
- Generate completions for a batch of content using the specified model.
-
- Args:
- content_batch: The content to generate completions for.
-
- model_id: The identifier of the model to use. The model must be registered with Llama
- Stack and available via the /models endpoint.
-
- logprobs: (Optional) If specified, log probabilities for each token position will be
- returned.
-
- response_format: (Optional) Grammar specification for guided (structured) decoding.
-
- sampling_params: (Optional) Parameters to control the sampling strategy.
-
- extra_headers: Send extra headers
-
- extra_query: Add additional query parameters to the request
-
- extra_body: Add additional JSON properties to the request
-
- timeout: Override the client-level default timeout for this request, in seconds
- """
- return await self._post(
- "/v1/inference/batch-completion",
- body=await async_maybe_transform(
- {
- "content_batch": content_batch,
- "model_id": model_id,
- "logprobs": logprobs,
- "response_format": response_format,
- "sampling_params": sampling_params,
- },
- inference_batch_completion_params.InferenceBatchCompletionParams,
- ),
- options=make_request_options(
- extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
- ),
- cast_to=BatchCompletion,
- )
-
- @typing_extensions.deprecated(
- "/v1/inference/chat-completion is deprecated. Please use /v1/openai/v1/chat/completions."
- )
- @overload
- async def chat_completion(
- self,
- *,
- messages: Iterable[Message],
- model_id: str,
- logprobs: inference_chat_completion_params.Logprobs | Omit = omit,
- response_format: ResponseFormat | Omit = omit,
- sampling_params: SamplingParams | Omit = omit,
- stream: Literal[False] | Omit = omit,
- tool_choice: Literal["auto", "required", "none"] | Omit = omit,
- tool_config: inference_chat_completion_params.ToolConfig | Omit = omit,
- tool_prompt_format: Literal["json", "function_tag", "python_list"] | Omit = omit,
- tools: Iterable[inference_chat_completion_params.Tool] | Omit = omit,
- # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
- # The extra values given here take precedence over values defined on the client or passed to this method.
- extra_headers: Headers | None = None,
- extra_query: Query | None = None,
- extra_body: Body | None = None,
- timeout: float | httpx.Timeout | None | NotGiven = not_given,
- ) -> ChatCompletionResponse:
- """
- Generate a chat completion for the given messages using the specified model.
-
- Args:
- messages: List of messages in the conversation.
-
- model_id: The identifier of the model to use. The model must be registered with Llama
- Stack and available via the /models endpoint.
-
- logprobs: (Optional) If specified, log probabilities for each token position will be
- returned.
-
- response_format: (Optional) Grammar specification for guided (structured) decoding. There are two
- options: - `ResponseFormat.json_schema`: The grammar is a JSON schema. Most
- providers support this format. - `ResponseFormat.grammar`: The grammar is a BNF
- grammar. This format is more flexible, but not all providers support it.
-
- sampling_params: Parameters to control the sampling strategy.
-
- stream: (Optional) If True, generate an SSE event stream of the response. Defaults to
- False.
-
- tool_choice: (Optional) Whether tool use is required or automatic. Defaults to
- ToolChoice.auto. .. deprecated:: Use tool_config instead.
-
- tool_config: (Optional) Configuration for tool use.
-
- tool_prompt_format: (Optional) Instructs the model how to format tool calls. By default, Llama Stack
- will attempt to use a format that is best adapted to the model. -
- `ToolPromptFormat.json`: The tool calls are formatted as a JSON object. -
- `ToolPromptFormat.function_tag`: The tool calls are enclosed in a
-              <function=function_name> tag. - `ToolPromptFormat.python_list`: The tool calls
- are output as Python syntax -- a list of function calls. .. deprecated:: Use
- tool_config instead.
-
- tools: (Optional) List of tool definitions available to the model.
-
- extra_headers: Send extra headers
+from ..types import inference_rerank_params
+from .._types import Body, Omit, Query, Headers, NotGiven, SequenceNotStr, omit, not_given
+from .._utils import maybe_transform, async_maybe_transform
+from .._compat import cached_property
+from .._resource import SyncAPIResource, AsyncAPIResource
+from .._response import (
+ to_raw_response_wrapper,
+ to_streamed_response_wrapper,
+ async_to_raw_response_wrapper,
+ async_to_streamed_response_wrapper,
+)
+from .._wrappers import DataWrapper
+from .._base_client import make_request_options
+from ..types.inference_rerank_response import InferenceRerankResponse
- extra_query: Add additional query parameters to the request
+__all__ = ["InferenceResource", "AsyncInferenceResource"]
- extra_body: Add additional JSON properties to the request
- timeout: Override the client-level default timeout for this request, in seconds
+class InferenceResource(SyncAPIResource):
+ @cached_property
+ def with_raw_response(self) -> InferenceResourceWithRawResponse:
"""
- ...
+ This property can be used as a prefix for any HTTP method call to return
+ the raw response object instead of the parsed content.
- @typing_extensions.deprecated(
- "/v1/inference/chat-completion is deprecated. Please use /v1/openai/v1/chat/completions."
- )
- @overload
- async def chat_completion(
- self,
- *,
- messages: Iterable[Message],
- model_id: str,
- stream: Literal[True],
- logprobs: inference_chat_completion_params.Logprobs | Omit = omit,
- response_format: ResponseFormat | Omit = omit,
- sampling_params: SamplingParams | Omit = omit,
- tool_choice: Literal["auto", "required", "none"] | Omit = omit,
- tool_config: inference_chat_completion_params.ToolConfig | Omit = omit,
- tool_prompt_format: Literal["json", "function_tag", "python_list"] | Omit = omit,
- tools: Iterable[inference_chat_completion_params.Tool] | Omit = omit,
- # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
- # The extra values given here take precedence over values defined on the client or passed to this method.
- extra_headers: Headers | None = None,
- extra_query: Query | None = None,
- extra_body: Body | None = None,
- timeout: float | httpx.Timeout | None | NotGiven = not_given,
- ) -> AsyncStream[ChatCompletionResponseStreamChunk]:
+ For more information, see https://www.github.com/llamastack/llama-stack-client-python#accessing-raw-response-data-eg-headers
"""
- Generate a chat completion for the given messages using the specified model.
-
- Args:
- messages: List of messages in the conversation.
-
- model_id: The identifier of the model to use. The model must be registered with Llama
- Stack and available via the /models endpoint.
-
- stream: (Optional) If True, generate an SSE event stream of the response. Defaults to
- False.
-
- logprobs: (Optional) If specified, log probabilities for each token position will be
- returned.
-
- response_format: (Optional) Grammar specification for guided (structured) decoding. There are two
- options: - `ResponseFormat.json_schema`: The grammar is a JSON schema. Most
- providers support this format. - `ResponseFormat.grammar`: The grammar is a BNF
- grammar. This format is more flexible, but not all providers support it.
-
- sampling_params: Parameters to control the sampling strategy.
-
- tool_choice: (Optional) Whether tool use is required or automatic. Defaults to
- ToolChoice.auto. .. deprecated:: Use tool_config instead.
-
- tool_config: (Optional) Configuration for tool use.
-
- tool_prompt_format: (Optional) Instructs the model how to format tool calls. By default, Llama Stack
- will attempt to use a format that is best adapted to the model. -
- `ToolPromptFormat.json`: The tool calls are formatted as a JSON object. -
- `ToolPromptFormat.function_tag`: The tool calls are enclosed in a
-              <function=function_name> tag. - `ToolPromptFormat.python_list`: The tool calls
- are output as Python syntax -- a list of function calls. .. deprecated:: Use
- tool_config instead.
-
- tools: (Optional) List of tool definitions available to the model.
-
- extra_headers: Send extra headers
-
- extra_query: Add additional query parameters to the request
+ return InferenceResourceWithRawResponse(self)
- extra_body: Add additional JSON properties to the request
+ @cached_property
+ def with_streaming_response(self) -> InferenceResourceWithStreamingResponse:
+ """
+ An alternative to `.with_raw_response` that doesn't eagerly read the response body.
- timeout: Override the client-level default timeout for this request, in seconds
+ For more information, see https://www.github.com/llamastack/llama-stack-client-python#with_streaming_response
"""
- ...
+ return InferenceResourceWithStreamingResponse(self)
- @typing_extensions.deprecated(
- "/v1/inference/chat-completion is deprecated. Please use /v1/openai/v1/chat/completions."
- )
- @overload
- async def chat_completion(
+ def rerank(
self,
*,
- messages: Iterable[Message],
- model_id: str,
- stream: bool,
- logprobs: inference_chat_completion_params.Logprobs | Omit = omit,
- response_format: ResponseFormat | Omit = omit,
- sampling_params: SamplingParams | Omit = omit,
- tool_choice: Literal["auto", "required", "none"] | Omit = omit,
- tool_config: inference_chat_completion_params.ToolConfig | Omit = omit,
- tool_prompt_format: Literal["json", "function_tag", "python_list"] | Omit = omit,
- tools: Iterable[inference_chat_completion_params.Tool] | Omit = omit,
+ items: SequenceNotStr[inference_rerank_params.Item],
+ model: str,
+ query: inference_rerank_params.Query,
+ max_num_results: int | Omit = omit,
# Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
# The extra values given here take precedence over values defined on the client or passed to this method.
extra_headers: Headers | None = None,
extra_query: Query | None = None,
extra_body: Body | None = None,
timeout: float | httpx.Timeout | None | NotGiven = not_given,
- ) -> ChatCompletionResponse | AsyncStream[ChatCompletionResponseStreamChunk]:
+ ) -> InferenceRerankResponse:
"""
- Generate a chat completion for the given messages using the specified model.
+ Rerank a list of documents based on their relevance to a query.
Args:
- messages: List of messages in the conversation.
-
- model_id: The identifier of the model to use. The model must be registered with Llama
- Stack and available via the /models endpoint.
-
- stream: (Optional) If True, generate an SSE event stream of the response. Defaults to
- False.
-
- logprobs: (Optional) If specified, log probabilities for each token position will be
- returned.
-
- response_format: (Optional) Grammar specification for guided (structured) decoding. There are two
- options: - `ResponseFormat.json_schema`: The grammar is a JSON schema. Most
- providers support this format. - `ResponseFormat.grammar`: The grammar is a BNF
- grammar. This format is more flexible, but not all providers support it.
-
- sampling_params: Parameters to control the sampling strategy.
-
- tool_choice: (Optional) Whether tool use is required or automatic. Defaults to
- ToolChoice.auto. .. deprecated:: Use tool_config instead.
+ items: List of items to rerank. Each item can be a string, text content part, or image
+ content part. Each input must not exceed the model's max input token length.
- tool_config: (Optional) Configuration for tool use.
+ model: The identifier of the reranking model to use.
- tool_prompt_format: (Optional) Instructs the model how to format tool calls. By default, Llama Stack
- will attempt to use a format that is best adapted to the model. -
- `ToolPromptFormat.json`: The tool calls are formatted as a JSON object. -
- `ToolPromptFormat.function_tag`: The tool calls are enclosed in a
-              <function=function_name> tag. - `ToolPromptFormat.python_list`: The tool calls
- are output as Python syntax -- a list of function calls. .. deprecated:: Use
- tool_config instead.
+ query: The search query to rank items against. Can be a string, text content part, or
+ image content part. The input must not exceed the model's max input token
+ length.
- tools: (Optional) List of tool definitions available to the model.
+ max_num_results: (Optional) Maximum number of results to return. Default: returns all.
extra_headers: Send extra headers
@@ -1111,306 +81,47 @@ async def chat_completion(
timeout: Override the client-level default timeout for this request, in seconds
"""
- ...
-
- @typing_extensions.deprecated(
- "/v1/inference/chat-completion is deprecated. Please use /v1/openai/v1/chat/completions."
- )
- @required_args(["messages", "model_id"], ["messages", "model_id", "stream"])
- async def chat_completion(
- self,
- *,
- messages: Iterable[Message],
- model_id: str,
- logprobs: inference_chat_completion_params.Logprobs | Omit = omit,
- response_format: ResponseFormat | Omit = omit,
- sampling_params: SamplingParams | Omit = omit,
- stream: Literal[False] | Literal[True] | Omit = omit,
- tool_choice: Literal["auto", "required", "none"] | Omit = omit,
- tool_config: inference_chat_completion_params.ToolConfig | Omit = omit,
- tool_prompt_format: Literal["json", "function_tag", "python_list"] | Omit = omit,
- tools: Iterable[inference_chat_completion_params.Tool] | Omit = omit,
- # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
- # The extra values given here take precedence over values defined on the client or passed to this method.
- extra_headers: Headers | None = None,
- extra_query: Query | None = None,
- extra_body: Body | None = None,
- timeout: float | httpx.Timeout | None | NotGiven = not_given,
- ) -> ChatCompletionResponse | AsyncStream[ChatCompletionResponseStreamChunk]:
- if stream:
- extra_headers = {"Accept": "text/event-stream", **(extra_headers or {})}
- return await self._post(
- "/v1/inference/chat-completion",
- body=await async_maybe_transform(
+ return self._post(
+ "/v1alpha/inference/rerank",
+ body=maybe_transform(
{
- "messages": messages,
- "model_id": model_id,
- "logprobs": logprobs,
- "response_format": response_format,
- "sampling_params": sampling_params,
- "stream": stream,
- "tool_choice": tool_choice,
- "tool_config": tool_config,
- "tool_prompt_format": tool_prompt_format,
- "tools": tools,
+ "items": items,
+ "model": model,
+ "query": query,
+ "max_num_results": max_num_results,
},
- inference_chat_completion_params.InferenceChatCompletionParamsStreaming
- if stream
- else inference_chat_completion_params.InferenceChatCompletionParamsNonStreaming,
+ inference_rerank_params.InferenceRerankParams,
),
options=make_request_options(
- extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
+ extra_headers=extra_headers,
+ extra_query=extra_query,
+ extra_body=extra_body,
+ timeout=timeout,
+ post_parser=DataWrapper[InferenceRerankResponse]._unwrapper,
),
- cast_to=ChatCompletionResponse,
- stream=stream or False,
- stream_cls=AsyncStream[ChatCompletionResponseStreamChunk],
+ cast_to=cast(Type[InferenceRerankResponse], DataWrapper[InferenceRerankResponse]),
)
- @typing_extensions.deprecated("/v1/inference/completion is deprecated. Please use /v1/openai/v1/completions.")
- @overload
- async def completion(
- self,
- *,
- content: InterleavedContent,
- model_id: str,
- logprobs: inference_completion_params.Logprobs | Omit = omit,
- response_format: ResponseFormat | Omit = omit,
- sampling_params: SamplingParams | Omit = omit,
- stream: Literal[False] | Omit = omit,
- # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
- # The extra values given here take precedence over values defined on the client or passed to this method.
- extra_headers: Headers | None = None,
- extra_query: Query | None = None,
- extra_body: Body | None = None,
- timeout: float | httpx.Timeout | None | NotGiven = not_given,
- ) -> CompletionResponse:
- """
- Generate a completion for the given content using the specified model.
-
- Args:
- content: The content to generate a completion for.
-
- model_id: The identifier of the model to use. The model must be registered with Llama
- Stack and available via the /models endpoint.
-
- logprobs: (Optional) If specified, log probabilities for each token position will be
- returned.
-
- response_format: (Optional) Grammar specification for guided (structured) decoding.
-
- sampling_params: (Optional) Parameters to control the sampling strategy.
-
- stream: (Optional) If True, generate an SSE event stream of the response. Defaults to
- False.
-
- extra_headers: Send extra headers
-
- extra_query: Add additional query parameters to the request
-
- extra_body: Add additional JSON properties to the request
-
- timeout: Override the client-level default timeout for this request, in seconds
- """
- ...
-
- @typing_extensions.deprecated("/v1/inference/completion is deprecated. Please use /v1/openai/v1/completions.")
- @overload
- async def completion(
- self,
- *,
- content: InterleavedContent,
- model_id: str,
- stream: Literal[True],
- logprobs: inference_completion_params.Logprobs | Omit = omit,
- response_format: ResponseFormat | Omit = omit,
- sampling_params: SamplingParams | Omit = omit,
- # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
- # The extra values given here take precedence over values defined on the client or passed to this method.
- extra_headers: Headers | None = None,
- extra_query: Query | None = None,
- extra_body: Body | None = None,
- timeout: float | httpx.Timeout | None | NotGiven = not_given,
- ) -> AsyncStream[CompletionResponse]:
- """
- Generate a completion for the given content using the specified model.
-
- Args:
- content: The content to generate a completion for.
-
- model_id: The identifier of the model to use. The model must be registered with Llama
- Stack and available via the /models endpoint.
-
- stream: (Optional) If True, generate an SSE event stream of the response. Defaults to
- False.
- logprobs: (Optional) If specified, log probabilities for each token position will be
- returned.
-
- response_format: (Optional) Grammar specification for guided (structured) decoding.
-
- sampling_params: (Optional) Parameters to control the sampling strategy.
-
- extra_headers: Send extra headers
-
- extra_query: Add additional query parameters to the request
-
- extra_body: Add additional JSON properties to the request
-
- timeout: Override the client-level default timeout for this request, in seconds
- """
- ...
-
- @typing_extensions.deprecated("/v1/inference/completion is deprecated. Please use /v1/openai/v1/completions.")
- @overload
- async def completion(
- self,
- *,
- content: InterleavedContent,
- model_id: str,
- stream: bool,
- logprobs: inference_completion_params.Logprobs | Omit = omit,
- response_format: ResponseFormat | Omit = omit,
- sampling_params: SamplingParams | Omit = omit,
- # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
- # The extra values given here take precedence over values defined on the client or passed to this method.
- extra_headers: Headers | None = None,
- extra_query: Query | None = None,
- extra_body: Body | None = None,
- timeout: float | httpx.Timeout | None | NotGiven = not_given,
- ) -> CompletionResponse | AsyncStream[CompletionResponse]:
+class AsyncInferenceResource(AsyncAPIResource):
+ @cached_property
+ def with_raw_response(self) -> AsyncInferenceResourceWithRawResponse:
"""
- Generate a completion for the given content using the specified model.
-
- Args:
- content: The content to generate a completion for.
-
- model_id: The identifier of the model to use. The model must be registered with Llama
- Stack and available via the /models endpoint.
-
- stream: (Optional) If True, generate an SSE event stream of the response. Defaults to
- False.
-
- logprobs: (Optional) If specified, log probabilities for each token position will be
- returned.
-
- response_format: (Optional) Grammar specification for guided (structured) decoding.
-
- sampling_params: (Optional) Parameters to control the sampling strategy.
-
- extra_headers: Send extra headers
-
- extra_query: Add additional query parameters to the request
-
- extra_body: Add additional JSON properties to the request
+ This property can be used as a prefix for any HTTP method call to return
+ the raw response object instead of the parsed content.
- timeout: Override the client-level default timeout for this request, in seconds
+ For more information, see https://www.github.com/llamastack/llama-stack-client-python#accessing-raw-response-data-eg-headers
"""
- ...
-
- @typing_extensions.deprecated("/v1/inference/completion is deprecated. Please use /v1/openai/v1/completions.")
- @required_args(["content", "model_id"], ["content", "model_id", "stream"])
- async def completion(
- self,
- *,
- content: InterleavedContent,
- model_id: str,
- logprobs: inference_completion_params.Logprobs | Omit = omit,
- response_format: ResponseFormat | Omit = omit,
- sampling_params: SamplingParams | Omit = omit,
- stream: Literal[False] | Literal[True] | Omit = omit,
- # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
- # The extra values given here take precedence over values defined on the client or passed to this method.
- extra_headers: Headers | None = None,
- extra_query: Query | None = None,
- extra_body: Body | None = None,
- timeout: float | httpx.Timeout | None | NotGiven = not_given,
- ) -> CompletionResponse | AsyncStream[CompletionResponse]:
- if stream:
- extra_headers = {"Accept": "text/event-stream", **(extra_headers or {})}
- return await self._post(
- "/v1/inference/completion",
- body=await async_maybe_transform(
- {
- "content": content,
- "model_id": model_id,
- "logprobs": logprobs,
- "response_format": response_format,
- "sampling_params": sampling_params,
- "stream": stream,
- },
- inference_completion_params.InferenceCompletionParamsStreaming
- if stream
- else inference_completion_params.InferenceCompletionParamsNonStreaming,
- ),
- options=make_request_options(
- extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
- ),
- cast_to=CompletionResponse,
- stream=stream or False,
- stream_cls=AsyncStream[CompletionResponse],
- )
+ return AsyncInferenceResourceWithRawResponse(self)
- @typing_extensions.deprecated("/v1/inference/embeddings is deprecated. Please use /v1/openai/v1/embeddings.")
- async def embeddings(
- self,
- *,
- contents: Union[SequenceNotStr[str], Iterable[InterleavedContentItem]],
- model_id: str,
- output_dimension: int | Omit = omit,
- task_type: Literal["query", "document"] | Omit = omit,
- text_truncation: Literal["none", "start", "end"] | Omit = omit,
- # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
- # The extra values given here take precedence over values defined on the client or passed to this method.
- extra_headers: Headers | None = None,
- extra_query: Query | None = None,
- extra_body: Body | None = None,
- timeout: float | httpx.Timeout | None | NotGiven = not_given,
- ) -> EmbeddingsResponse:
+ @cached_property
+ def with_streaming_response(self) -> AsyncInferenceResourceWithStreamingResponse:
"""
- Generate embeddings for content pieces using the specified model.
-
- Args:
- contents: List of contents to generate embeddings for. Each content can be a string or an
- InterleavedContentItem (and hence can be multimodal). The behavior depends on
- the model and provider. Some models may only support text.
-
- model_id: The identifier of the model to use. The model must be an embedding model
- registered with Llama Stack and available via the /models endpoint.
-
- output_dimension: (Optional) Output dimensionality for the embeddings. Only supported by
- Matryoshka models.
-
- task_type: (Optional) How is the embedding being used? This is only supported by asymmetric
- embedding models.
-
- text_truncation: (Optional) Config for how to truncate text for embedding when text is longer
- than the model's max sequence length.
-
- extra_headers: Send extra headers
-
- extra_query: Add additional query parameters to the request
-
- extra_body: Add additional JSON properties to the request
+ An alternative to `.with_raw_response` that doesn't eagerly read the response body.
- timeout: Override the client-level default timeout for this request, in seconds
+ For more information, see https://www.github.com/llamastack/llama-stack-client-python#with_streaming_response
"""
- return await self._post(
- "/v1/inference/embeddings",
- body=await async_maybe_transform(
- {
- "contents": contents,
- "model_id": model_id,
- "output_dimension": output_dimension,
- "task_type": task_type,
- "text_truncation": text_truncation,
- },
- inference_embeddings_params.InferenceEmbeddingsParams,
- ),
- options=make_request_options(
- extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
- ),
- cast_to=EmbeddingsResponse,
- )
+ return AsyncInferenceResourceWithStreamingResponse(self)
async def rerank(
self,
@@ -1450,7 +161,7 @@ async def rerank(
timeout: Override the client-level default timeout for this request, in seconds
"""
return await self._post(
- "/v1/inference/rerank",
+ "/v1alpha/inference/rerank",
body=await async_maybe_transform(
{
"items": items,
@@ -1475,27 +186,6 @@ class InferenceResourceWithRawResponse:
def __init__(self, inference: InferenceResource) -> None:
self._inference = inference
- self.batch_chat_completion = to_raw_response_wrapper(
- inference.batch_chat_completion,
- )
- self.batch_completion = to_raw_response_wrapper(
- inference.batch_completion,
- )
- self.chat_completion = ( # pyright: ignore[reportDeprecated]
- to_raw_response_wrapper(
- inference.chat_completion, # pyright: ignore[reportDeprecated],
- )
- )
- self.completion = ( # pyright: ignore[reportDeprecated]
- to_raw_response_wrapper(
- inference.completion, # pyright: ignore[reportDeprecated],
- )
- )
- self.embeddings = ( # pyright: ignore[reportDeprecated]
- to_raw_response_wrapper(
- inference.embeddings, # pyright: ignore[reportDeprecated],
- )
- )
self.rerank = to_raw_response_wrapper(
inference.rerank,
)
@@ -1505,27 +195,6 @@ class AsyncInferenceResourceWithRawResponse:
def __init__(self, inference: AsyncInferenceResource) -> None:
self._inference = inference
- self.batch_chat_completion = async_to_raw_response_wrapper(
- inference.batch_chat_completion,
- )
- self.batch_completion = async_to_raw_response_wrapper(
- inference.batch_completion,
- )
- self.chat_completion = ( # pyright: ignore[reportDeprecated]
- async_to_raw_response_wrapper(
- inference.chat_completion, # pyright: ignore[reportDeprecated],
- )
- )
- self.completion = ( # pyright: ignore[reportDeprecated]
- async_to_raw_response_wrapper(
- inference.completion, # pyright: ignore[reportDeprecated],
- )
- )
- self.embeddings = ( # pyright: ignore[reportDeprecated]
- async_to_raw_response_wrapper(
- inference.embeddings, # pyright: ignore[reportDeprecated],
- )
- )
self.rerank = async_to_raw_response_wrapper(
inference.rerank,
)
@@ -1535,27 +204,6 @@ class InferenceResourceWithStreamingResponse:
def __init__(self, inference: InferenceResource) -> None:
self._inference = inference
- self.batch_chat_completion = to_streamed_response_wrapper(
- inference.batch_chat_completion,
- )
- self.batch_completion = to_streamed_response_wrapper(
- inference.batch_completion,
- )
- self.chat_completion = ( # pyright: ignore[reportDeprecated]
- to_streamed_response_wrapper(
- inference.chat_completion, # pyright: ignore[reportDeprecated],
- )
- )
- self.completion = ( # pyright: ignore[reportDeprecated]
- to_streamed_response_wrapper(
- inference.completion, # pyright: ignore[reportDeprecated],
- )
- )
- self.embeddings = ( # pyright: ignore[reportDeprecated]
- to_streamed_response_wrapper(
- inference.embeddings, # pyright: ignore[reportDeprecated],
- )
- )
self.rerank = to_streamed_response_wrapper(
inference.rerank,
)
@@ -1565,27 +213,6 @@ class AsyncInferenceResourceWithStreamingResponse:
def __init__(self, inference: AsyncInferenceResource) -> None:
self._inference = inference
- self.batch_chat_completion = async_to_streamed_response_wrapper(
- inference.batch_chat_completion,
- )
- self.batch_completion = async_to_streamed_response_wrapper(
- inference.batch_completion,
- )
- self.chat_completion = ( # pyright: ignore[reportDeprecated]
- async_to_streamed_response_wrapper(
- inference.chat_completion, # pyright: ignore[reportDeprecated],
- )
- )
- self.completion = ( # pyright: ignore[reportDeprecated]
- async_to_streamed_response_wrapper(
- inference.completion, # pyright: ignore[reportDeprecated],
- )
- )
- self.embeddings = ( # pyright: ignore[reportDeprecated]
- async_to_streamed_response_wrapper(
- inference.embeddings, # pyright: ignore[reportDeprecated],
- )
- )
self.rerank = async_to_streamed_response_wrapper(
inference.rerank,
)
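With the deprecated completion, chat-completion, batch, and embeddings methods gone, the inference resource is reduced to `rerank`, now served from `/v1alpha/inference/rerank`. A minimal usage sketch based on the signature shown above; the model id is a placeholder:

```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient()

# Rank candidate passages against a query; the SDK now issues POST /v1alpha/inference/rerank.
ranking = client.inference.rerank(
    model="my-reranker",  # hypothetical reranking model id registered on your stack
    query="What is the capital of France?",
    items=[
        "Paris is the capital of France.",
        "Berlin is the capital of Germany.",
    ],
    max_num_results=1,  # optional; omit to return all results
)
print(ranking)
```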
diff --git a/src/llama_stack_client/resources/models/openai.py b/src/llama_stack_client/resources/models/openai.py
index e4b2fbd8..ab4b4038 100644
--- a/src/llama_stack_client/resources/models/openai.py
+++ b/src/llama_stack_client/resources/models/openai.py
@@ -17,7 +17,7 @@
)
from ..._wrappers import DataWrapper
from ..._base_client import make_request_options
-from ...types.models.openai_list_response import OpenAIListResponse
+from ...types.model_list_response import ModelListResponse
__all__ = ["OpenAIResource", "AsyncOpenAIResource"]
@@ -51,18 +51,18 @@ def list(
extra_query: Query | None = None,
extra_body: Body | None = None,
timeout: float | httpx.Timeout | None | NotGiven = not_given,
- ) -> OpenAIListResponse:
- """List models using the OpenAI API."""
+ ) -> ModelListResponse:
+ """List all models."""
return self._get(
- "/v1/openai/v1/models",
+ "/v1/models",
options=make_request_options(
extra_headers=extra_headers,
extra_query=extra_query,
extra_body=extra_body,
timeout=timeout,
- post_parser=DataWrapper[OpenAIListResponse]._unwrapper,
+ post_parser=DataWrapper[ModelListResponse]._unwrapper,
),
- cast_to=cast(Type[OpenAIListResponse], DataWrapper[OpenAIListResponse]),
+ cast_to=cast(Type[ModelListResponse], DataWrapper[ModelListResponse]),
)
@@ -95,18 +95,18 @@ async def list(
extra_query: Query | None = None,
extra_body: Body | None = None,
timeout: float | httpx.Timeout | None | NotGiven = not_given,
- ) -> OpenAIListResponse:
- """List models using the OpenAI API."""
+ ) -> ModelListResponse:
+ """List all models."""
return await self._get(
- "/v1/openai/v1/models",
+ "/v1/models",
options=make_request_options(
extra_headers=extra_headers,
extra_query=extra_query,
extra_body=extra_body,
timeout=timeout,
- post_parser=DataWrapper[OpenAIListResponse]._unwrapper,
+ post_parser=DataWrapper[ModelListResponse]._unwrapper,
),
- cast_to=cast(Type[OpenAIListResponse], DataWrapper[OpenAIListResponse]),
+ cast_to=cast(Type[ModelListResponse], DataWrapper[ModelListResponse]),
)
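The OpenAI-compat models listing now shares the plain `/v1/models` route and the `ModelListResponse` type. A small sketch; the attribute path `client.models.openai` is inferred from the package layout rather than shown in this diff:

```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient()

# Resolves to GET /v1/models and is unwrapped into a ModelListResponse.
models = client.models.openai.list()
print(models)
```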
diff --git a/src/llama_stack_client/resources/moderations.py b/src/llama_stack_client/resources/moderations.py
index a016b5b0..a73dc85a 100644
--- a/src/llama_stack_client/resources/moderations.py
+++ b/src/llama_stack_client/resources/moderations.py
@@ -73,7 +73,7 @@ def create(
timeout: Override the client-level default timeout for this request, in seconds
"""
return self._post(
- "/v1/openai/v1/moderations",
+ "/v1/moderations",
body=maybe_transform(
{
"input": input,
@@ -138,7 +138,7 @@ async def create(
timeout: Override the client-level default timeout for this request, in seconds
"""
return await self._post(
- "/v1/openai/v1/moderations",
+ "/v1/moderations",
body=await async_maybe_transform(
{
"input": input,
diff --git a/src/llama_stack_client/resources/responses/input_items.py b/src/llama_stack_client/resources/responses/input_items.py
index da06debd..a5836ba7 100644
--- a/src/llama_stack_client/resources/responses/input_items.py
+++ b/src/llama_stack_client/resources/responses/input_items.py
@@ -85,7 +85,7 @@ def list(
if not response_id:
raise ValueError(f"Expected a non-empty value for `response_id` but received {response_id!r}")
return self._get(
- f"/v1/openai/v1/responses/{response_id}/input_items",
+ f"/v1/responses/{response_id}/input_items",
options=make_request_options(
extra_headers=extra_headers,
extra_query=extra_query,
@@ -168,7 +168,7 @@ async def list(
if not response_id:
raise ValueError(f"Expected a non-empty value for `response_id` but received {response_id!r}")
return await self._get(
- f"/v1/openai/v1/responses/{response_id}/input_items",
+ f"/v1/responses/{response_id}/input_items",
options=make_request_options(
extra_headers=extra_headers,
extra_query=extra_query,
diff --git a/src/llama_stack_client/resources/responses/responses.py b/src/llama_stack_client/resources/responses/responses.py
index 7f21f3ea..16e38fd0 100644
--- a/src/llama_stack_client/resources/responses/responses.py
+++ b/src/llama_stack_client/resources/responses/responses.py
@@ -228,7 +228,7 @@ def create(
timeout: float | httpx.Timeout | None | NotGiven = not_given,
) -> ResponseObject | Stream[ResponseObjectStream]:
return self._post(
- "/v1/openai/v1/responses",
+ "/v1/responses",
body=maybe_transform(
{
"input": input,
@@ -281,7 +281,7 @@ def retrieve(
if not response_id:
raise ValueError(f"Expected a non-empty value for `response_id` but received {response_id!r}")
return self._get(
- f"/v1/openai/v1/responses/{response_id}",
+ f"/v1/responses/{response_id}",
options=make_request_options(
extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
),
@@ -323,7 +323,7 @@ def list(
timeout: Override the client-level default timeout for this request, in seconds
"""
return self._get_api_list(
- "/v1/openai/v1/responses",
+ "/v1/responses",
page=SyncOpenAICursorPage[ResponseListResponse],
options=make_request_options(
extra_headers=extra_headers,
@@ -369,7 +369,7 @@ def delete(
if not response_id:
raise ValueError(f"Expected a non-empty value for `response_id` but received {response_id!r}")
return self._delete(
- f"/v1/openai/v1/responses/{response_id}",
+ f"/v1/responses/{response_id}",
options=make_request_options(
extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
),
@@ -568,7 +568,7 @@ async def create(
timeout: float | httpx.Timeout | None | NotGiven = not_given,
) -> ResponseObject | AsyncStream[ResponseObjectStream]:
return await self._post(
- "/v1/openai/v1/responses",
+ "/v1/responses",
body=await async_maybe_transform(
{
"input": input,
@@ -621,7 +621,7 @@ async def retrieve(
if not response_id:
raise ValueError(f"Expected a non-empty value for `response_id` but received {response_id!r}")
return await self._get(
- f"/v1/openai/v1/responses/{response_id}",
+ f"/v1/responses/{response_id}",
options=make_request_options(
extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
),
@@ -663,7 +663,7 @@ def list(
timeout: Override the client-level default timeout for this request, in seconds
"""
return self._get_api_list(
- "/v1/openai/v1/responses",
+ "/v1/responses",
page=AsyncOpenAICursorPage[ResponseListResponse],
options=make_request_options(
extra_headers=extra_headers,
@@ -709,7 +709,7 @@ async def delete(
if not response_id:
raise ValueError(f"Expected a non-empty value for `response_id` but received {response_id!r}")
return await self._delete(
- f"/v1/openai/v1/responses/{response_id}",
+ f"/v1/responses/{response_id}",
options=make_request_options(
extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
),
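Responses follow the same route migration with unchanged method signatures. A short round trip as a sketch; `model` and `input` on `create` are assumed from the OpenAI-style surface (only `input` appears in the hunks above), and the placeholder model id is illustrative:

```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient()

# POST /v1/responses
response = client.responses.create(
    model="my-model",  # hypothetical model id
    input="Write a one-line haiku about the sea.",
)

# GET /v1/responses/{response_id}
fetched = client.responses.retrieve(response.id)
print(fetched)

# DELETE /v1/responses/{response_id}
client.responses.delete(response.id)
```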
diff --git a/src/llama_stack_client/resources/vector_stores/files.py b/src/llama_stack_client/resources/vector_stores/files.py
index 39f16a66..f9a1ef31 100644
--- a/src/llama_stack_client/resources/vector_stores/files.py
+++ b/src/llama_stack_client/resources/vector_stores/files.py
@@ -82,7 +82,7 @@ def create(
if not vector_store_id:
raise ValueError(f"Expected a non-empty value for `vector_store_id` but received {vector_store_id!r}")
return self._post(
- f"/v1/openai/v1/vector_stores/{vector_store_id}/files",
+ f"/v1/vector_stores/{vector_store_id}/files",
body=maybe_transform(
{
"file_id": file_id,
@@ -126,7 +126,7 @@ def retrieve(
if not file_id:
raise ValueError(f"Expected a non-empty value for `file_id` but received {file_id!r}")
return self._get(
- f"/v1/openai/v1/vector_stores/{vector_store_id}/files/{file_id}",
+ f"/v1/vector_stores/{vector_store_id}/files/{file_id}",
options=make_request_options(
extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
),
@@ -165,7 +165,7 @@ def update(
if not file_id:
raise ValueError(f"Expected a non-empty value for `file_id` but received {file_id!r}")
return self._post(
- f"/v1/openai/v1/vector_stores/{vector_store_id}/files/{file_id}",
+ f"/v1/vector_stores/{vector_store_id}/files/{file_id}",
body=maybe_transform({"attributes": attributes}, file_update_params.FileUpdateParams),
options=make_request_options(
extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
@@ -218,7 +218,7 @@ def list(
if not vector_store_id:
raise ValueError(f"Expected a non-empty value for `vector_store_id` but received {vector_store_id!r}")
return self._get_api_list(
- f"/v1/openai/v1/vector_stores/{vector_store_id}/files",
+ f"/v1/vector_stores/{vector_store_id}/files",
page=SyncOpenAICursorPage[VectorStoreFile],
options=make_request_options(
extra_headers=extra_headers,
@@ -268,7 +268,7 @@ def delete(
if not file_id:
raise ValueError(f"Expected a non-empty value for `file_id` but received {file_id!r}")
return self._delete(
- f"/v1/openai/v1/vector_stores/{vector_store_id}/files/{file_id}",
+ f"/v1/vector_stores/{vector_store_id}/files/{file_id}",
options=make_request_options(
extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
),
@@ -304,7 +304,7 @@ def content(
if not file_id:
raise ValueError(f"Expected a non-empty value for `file_id` but received {file_id!r}")
return self._get(
- f"/v1/openai/v1/vector_stores/{vector_store_id}/files/{file_id}/content",
+ f"/v1/vector_stores/{vector_store_id}/files/{file_id}/content",
options=make_request_options(
extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
),
@@ -367,7 +367,7 @@ async def create(
if not vector_store_id:
raise ValueError(f"Expected a non-empty value for `vector_store_id` but received {vector_store_id!r}")
return await self._post(
- f"/v1/openai/v1/vector_stores/{vector_store_id}/files",
+ f"/v1/vector_stores/{vector_store_id}/files",
body=await async_maybe_transform(
{
"file_id": file_id,
@@ -411,7 +411,7 @@ async def retrieve(
if not file_id:
raise ValueError(f"Expected a non-empty value for `file_id` but received {file_id!r}")
return await self._get(
- f"/v1/openai/v1/vector_stores/{vector_store_id}/files/{file_id}",
+ f"/v1/vector_stores/{vector_store_id}/files/{file_id}",
options=make_request_options(
extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
),
@@ -450,7 +450,7 @@ async def update(
if not file_id:
raise ValueError(f"Expected a non-empty value for `file_id` but received {file_id!r}")
return await self._post(
- f"/v1/openai/v1/vector_stores/{vector_store_id}/files/{file_id}",
+ f"/v1/vector_stores/{vector_store_id}/files/{file_id}",
body=await async_maybe_transform({"attributes": attributes}, file_update_params.FileUpdateParams),
options=make_request_options(
extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
@@ -503,7 +503,7 @@ def list(
if not vector_store_id:
raise ValueError(f"Expected a non-empty value for `vector_store_id` but received {vector_store_id!r}")
return self._get_api_list(
- f"/v1/openai/v1/vector_stores/{vector_store_id}/files",
+ f"/v1/vector_stores/{vector_store_id}/files",
page=AsyncOpenAICursorPage[VectorStoreFile],
options=make_request_options(
extra_headers=extra_headers,
@@ -553,7 +553,7 @@ async def delete(
if not file_id:
raise ValueError(f"Expected a non-empty value for `file_id` but received {file_id!r}")
return await self._delete(
- f"/v1/openai/v1/vector_stores/{vector_store_id}/files/{file_id}",
+ f"/v1/vector_stores/{vector_store_id}/files/{file_id}",
options=make_request_options(
extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
),
@@ -589,7 +589,7 @@ async def content(
if not file_id:
raise ValueError(f"Expected a non-empty value for `file_id` but received {file_id!r}")
return await self._get(
- f"/v1/openai/v1/vector_stores/{vector_store_id}/files/{file_id}/content",
+ f"/v1/vector_stores/{vector_store_id}/files/{file_id}/content",
options=make_request_options(
extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
),
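Vector-store file operations migrate the same way. A sketch with placeholder ids, assuming the nested resource is exposed as `client.vector_stores.files`:

```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient()

vector_store_id = "vs_123"  # placeholder
file_id = "file_456"  # placeholder for an already-uploaded file

# POST /v1/vector_stores/{vector_store_id}/files
vs_file = client.vector_stores.files.create(
    vector_store_id=vector_store_id,
    file_id=file_id,
)
print(vs_file)

# GET /v1/vector_stores/{vector_store_id}/files (cursor-paginated)
for f in client.vector_stores.files.list(vector_store_id=vector_store_id):
    print(f.id)
```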
diff --git a/src/llama_stack_client/resources/vector_stores/vector_stores.py b/src/llama_stack_client/resources/vector_stores/vector_stores.py
index f3ab01f2..f858100b 100644
--- a/src/llama_stack_client/resources/vector_stores/vector_stores.py
+++ b/src/llama_stack_client/resources/vector_stores/vector_stores.py
@@ -112,7 +112,7 @@ def create(
timeout: Override the client-level default timeout for this request, in seconds
"""
return self._post(
- "/v1/openai/v1/vector_stores",
+ "/v1/vector_stores",
body=maybe_transform(
{
"chunking_strategy": chunking_strategy,
@@ -158,7 +158,7 @@ def retrieve(
if not vector_store_id:
raise ValueError(f"Expected a non-empty value for `vector_store_id` but received {vector_store_id!r}")
return self._get(
- f"/v1/openai/v1/vector_stores/{vector_store_id}",
+ f"/v1/vector_stores/{vector_store_id}",
options=make_request_options(
extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
),
@@ -200,7 +200,7 @@ def update(
if not vector_store_id:
raise ValueError(f"Expected a non-empty value for `vector_store_id` but received {vector_store_id!r}")
return self._post(
- f"/v1/openai/v1/vector_stores/{vector_store_id}",
+ f"/v1/vector_stores/{vector_store_id}",
body=maybe_transform(
{
"expires_after": expires_after,
@@ -255,7 +255,7 @@ def list(
timeout: Override the client-level default timeout for this request, in seconds
"""
return self._get_api_list(
- "/v1/openai/v1/vector_stores",
+ "/v1/vector_stores",
page=SyncOpenAICursorPage[VectorStore],
options=make_request_options(
extra_headers=extra_headers,
@@ -301,7 +301,7 @@ def delete(
if not vector_store_id:
raise ValueError(f"Expected a non-empty value for `vector_store_id` but received {vector_store_id!r}")
return self._delete(
- f"/v1/openai/v1/vector_stores/{vector_store_id}",
+ f"/v1/vector_stores/{vector_store_id}",
options=make_request_options(
extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
),
@@ -354,7 +354,7 @@ def search(
if not vector_store_id:
raise ValueError(f"Expected a non-empty value for `vector_store_id` but received {vector_store_id!r}")
return self._post(
- f"/v1/openai/v1/vector_stores/{vector_store_id}/search",
+ f"/v1/vector_stores/{vector_store_id}/search",
body=maybe_transform(
{
"query": query,
@@ -446,7 +446,7 @@ async def create(
timeout: Override the client-level default timeout for this request, in seconds
"""
return await self._post(
- "/v1/openai/v1/vector_stores",
+ "/v1/vector_stores",
body=await async_maybe_transform(
{
"chunking_strategy": chunking_strategy,
@@ -492,7 +492,7 @@ async def retrieve(
if not vector_store_id:
raise ValueError(f"Expected a non-empty value for `vector_store_id` but received {vector_store_id!r}")
return await self._get(
- f"/v1/openai/v1/vector_stores/{vector_store_id}",
+ f"/v1/vector_stores/{vector_store_id}",
options=make_request_options(
extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
),
@@ -534,7 +534,7 @@ async def update(
if not vector_store_id:
raise ValueError(f"Expected a non-empty value for `vector_store_id` but received {vector_store_id!r}")
return await self._post(
- f"/v1/openai/v1/vector_stores/{vector_store_id}",
+ f"/v1/vector_stores/{vector_store_id}",
body=await async_maybe_transform(
{
"expires_after": expires_after,
@@ -589,7 +589,7 @@ def list(
timeout: Override the client-level default timeout for this request, in seconds
"""
return self._get_api_list(
- "/v1/openai/v1/vector_stores",
+ "/v1/vector_stores",
page=AsyncOpenAICursorPage[VectorStore],
options=make_request_options(
extra_headers=extra_headers,
@@ -635,7 +635,7 @@ async def delete(
if not vector_store_id:
raise ValueError(f"Expected a non-empty value for `vector_store_id` but received {vector_store_id!r}")
return await self._delete(
- f"/v1/openai/v1/vector_stores/{vector_store_id}",
+ f"/v1/vector_stores/{vector_store_id}",
options=make_request_options(
extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
),
@@ -688,7 +688,7 @@ async def search(
if not vector_store_id:
raise ValueError(f"Expected a non-empty value for `vector_store_id` but received {vector_store_id!r}")
return await self._post(
- f"/v1/openai/v1/vector_stores/{vector_store_id}/search",
+ f"/v1/vector_stores/{vector_store_id}/search",
body=await async_maybe_transform(
{
"query": query,
diff --git a/src/llama_stack_client/types/__init__.py b/src/llama_stack_client/types/__init__.py
index 56b7f887..f81ada61 100644
--- a/src/llama_stack_client/types/__init__.py
+++ b/src/llama_stack_client/types/__init__.py
@@ -17,18 +17,15 @@
QueryConfig as QueryConfig,
QueryResult as QueryResult,
UserMessage as UserMessage,
- ContentDelta as ContentDelta,
ScoringResult as ScoringResult,
SystemMessage as SystemMessage,
ResponseFormat as ResponseFormat,
SamplingParams as SamplingParams,
- BatchCompletion as BatchCompletion,
SafetyViolation as SafetyViolation,
CompletionMessage as CompletionMessage,
InterleavedContent as InterleavedContent,
ToolParamDefinition as ToolParamDefinition,
ToolResponseMessage as ToolResponseMessage,
- QueryGeneratorConfig as QueryGeneratorConfig,
ChatCompletionResponse as ChatCompletionResponse,
InterleavedContentItem as InterleavedContentItem,
)
@@ -48,7 +45,6 @@
from .tool_def_param import ToolDefParam as ToolDefParam
from .create_response import CreateResponse as CreateResponse
from .response_object import ResponseObject as ResponseObject
-from .token_log_probs import TokenLogProbs as TokenLogProbs
from .file_list_params import FileListParams as FileListParams
from .shield_call_step import ShieldCallStep as ShieldCallStep
from .span_with_status import SpanWithStatus as SpanWithStatus
@@ -61,8 +57,6 @@
from .tool_list_response import ToolListResponse as ToolListResponse
from .agent_create_params import AgentCreateParams as AgentCreateParams
from .agent_list_response import AgentListResponse as AgentListResponse
-from .completion_response import CompletionResponse as CompletionResponse
-from .embeddings_response import EmbeddingsResponse as EmbeddingsResponse
from .list_files_response import ListFilesResponse as ListFilesResponse
from .list_tools_response import ListToolsResponse as ListToolsResponse
from .model_list_response import ModelListResponse as ModelListResponse
@@ -71,7 +65,6 @@
from .tool_execution_step import ToolExecutionStep as ToolExecutionStep
from .tool_response_param import ToolResponseParam as ToolResponseParam
from .delete_file_response import DeleteFileResponse as DeleteFileResponse
-from .eval_candidate_param import EvalCandidateParam as EvalCandidateParam
from .eval_run_eval_params import EvalRunEvalParams as EvalRunEvalParams
from .list_models_response import ListModelsResponse as ListModelsResponse
from .list_routes_response import ListRoutesResponse as ListRoutesResponse
@@ -134,8 +127,6 @@
from .vector_store_create_params import VectorStoreCreateParams as VectorStoreCreateParams
from .vector_store_search_params import VectorStoreSearchParams as VectorStoreSearchParams
from .vector_store_update_params import VectorStoreUpdateParams as VectorStoreUpdateParams
-from .inference_completion_params import InferenceCompletionParams as InferenceCompletionParams
-from .inference_embeddings_params import InferenceEmbeddingsParams as InferenceEmbeddingsParams
from .list_vector_stores_response import ListVectorStoresResponse as ListVectorStoresResponse
from .telemetry_get_span_response import TelemetryGetSpanResponse as TelemetryGetSpanResponse
from .vector_db_register_response import VectorDBRegisterResponse as VectorDBRegisterResponse
@@ -154,26 +145,15 @@
from .list_scoring_functions_response import ListScoringFunctionsResponse as ListScoringFunctionsResponse
from .telemetry_query_traces_response import TelemetryQueryTracesResponse as TelemetryQueryTracesResponse
from .tool_runtime_invoke_tool_params import ToolRuntimeInvokeToolParams as ToolRuntimeInvokeToolParams
-from .inference_chat_completion_params import InferenceChatCompletionParams as InferenceChatCompletionParams
from .list_post_training_jobs_response import ListPostTrainingJobsResponse as ListPostTrainingJobsResponse
from .scoring_function_register_params import ScoringFunctionRegisterParams as ScoringFunctionRegisterParams
from .telemetry_get_span_tree_response import TelemetryGetSpanTreeResponse as TelemetryGetSpanTreeResponse
from .telemetry_query_metrics_response import TelemetryQueryMetricsResponse as TelemetryQueryMetricsResponse
from .tool_runtime_list_tools_response import ToolRuntimeListToolsResponse as ToolRuntimeListToolsResponse
-from .inference_batch_completion_params import InferenceBatchCompletionParams as InferenceBatchCompletionParams
from .synthetic_data_generation_response import SyntheticDataGenerationResponse as SyntheticDataGenerationResponse
-from .chat_completion_response_stream_chunk import (
- ChatCompletionResponseStreamChunk as ChatCompletionResponseStreamChunk,
-)
-from .inference_batch_chat_completion_params import (
- InferenceBatchChatCompletionParams as InferenceBatchChatCompletionParams,
-)
from .telemetry_save_spans_to_dataset_params import (
TelemetrySaveSpansToDatasetParams as TelemetrySaveSpansToDatasetParams,
)
-from .inference_batch_chat_completion_response import (
- InferenceBatchChatCompletionResponse as InferenceBatchChatCompletionResponse,
-)
from .post_training_preference_optimize_params import (
PostTrainingPreferenceOptimizeParams as PostTrainingPreferenceOptimizeParams,
)
diff --git a/src/llama_stack_client/types/agents/__init__.py b/src/llama_stack_client/types/agents/__init__.py
index f4f48353..3a144840 100644
--- a/src/llama_stack_client/types/agents/__init__.py
+++ b/src/llama_stack_client/types/agents/__init__.py
@@ -13,5 +13,4 @@
from .step_retrieve_response import StepRetrieveResponse as StepRetrieveResponse
from .session_create_response import SessionCreateResponse as SessionCreateResponse
from .session_retrieve_params import SessionRetrieveParams as SessionRetrieveParams
-from .turn_response_event_payload import TurnResponseEventPayload as TurnResponseEventPayload
from .agent_turn_response_stream_chunk import AgentTurnResponseStreamChunk as AgentTurnResponseStreamChunk
diff --git a/src/llama_stack_client/types/agents/turn_response_event.py b/src/llama_stack_client/types/agents/turn_response_event.py
index df213246..c52121ab 100644
--- a/src/llama_stack_client/types/agents/turn_response_event.py
+++ b/src/llama_stack_client/types/agents/turn_response_event.py
@@ -1,11 +1,160 @@
# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+from typing import Dict, List, Union, Optional
+from typing_extensions import Literal, Annotated, TypeAlias
+
+from .turn import Turn
+from ..._utils import PropertyInfo
from ..._models import BaseModel
-from .turn_response_event_payload import TurnResponseEventPayload
+from ..inference_step import InferenceStep
+from ..shared.tool_call import ToolCall
+from ..shield_call_step import ShieldCallStep
+from ..tool_execution_step import ToolExecutionStep
+from ..memory_retrieval_step import MemoryRetrievalStep
+
+__all__ = [
+ "TurnResponseEvent",
+ "Payload",
+ "PayloadAgentTurnResponseStepStartPayload",
+ "PayloadAgentTurnResponseStepProgressPayload",
+ "PayloadAgentTurnResponseStepProgressPayloadDelta",
+ "PayloadAgentTurnResponseStepProgressPayloadDeltaTextDelta",
+ "PayloadAgentTurnResponseStepProgressPayloadDeltaImageDelta",
+ "PayloadAgentTurnResponseStepProgressPayloadDeltaToolCallDelta",
+ "PayloadAgentTurnResponseStepProgressPayloadDeltaToolCallDeltaToolCall",
+ "PayloadAgentTurnResponseStepCompletePayload",
+ "PayloadAgentTurnResponseStepCompletePayloadStepDetails",
+ "PayloadAgentTurnResponseTurnStartPayload",
+ "PayloadAgentTurnResponseTurnCompletePayload",
+ "PayloadAgentTurnResponseTurnAwaitingInputPayload",
+]
+
+
+class PayloadAgentTurnResponseStepStartPayload(BaseModel):
+ event_type: Literal["step_start"]
+ """Type of event being reported"""
+
+ step_id: str
+ """Unique identifier for the step within a turn"""
+
+ step_type: Literal["inference", "tool_execution", "shield_call", "memory_retrieval"]
+ """Type of step being executed"""
+
+ metadata: Optional[Dict[str, Union[bool, float, str, List[object], object, None]]] = None
+ """(Optional) Additional metadata for the step"""
+
+
+class PayloadAgentTurnResponseStepProgressPayloadDeltaTextDelta(BaseModel):
+ text: str
+ """The incremental text content"""
+
+ type: Literal["text"]
+ """Discriminator type of the delta. Always "text" """
+
+
+class PayloadAgentTurnResponseStepProgressPayloadDeltaImageDelta(BaseModel):
+ image: str
+ """The incremental image data as bytes"""
+
+ type: Literal["image"]
+ """Discriminator type of the delta. Always "image" """
+
+
+PayloadAgentTurnResponseStepProgressPayloadDeltaToolCallDeltaToolCall: TypeAlias = Union[str, ToolCall]
+
+
+class PayloadAgentTurnResponseStepProgressPayloadDeltaToolCallDelta(BaseModel):
+ parse_status: Literal["started", "in_progress", "failed", "succeeded"]
+ """Current parsing status of the tool call"""
+
+ tool_call: PayloadAgentTurnResponseStepProgressPayloadDeltaToolCallDeltaToolCall
+ """Either an in-progress tool call string or the final parsed tool call"""
+
+ type: Literal["tool_call"]
+ """Discriminator type of the delta. Always "tool_call" """
+
+
+PayloadAgentTurnResponseStepProgressPayloadDelta: TypeAlias = Annotated[
+ Union[
+ PayloadAgentTurnResponseStepProgressPayloadDeltaTextDelta,
+ PayloadAgentTurnResponseStepProgressPayloadDeltaImageDelta,
+ PayloadAgentTurnResponseStepProgressPayloadDeltaToolCallDelta,
+ ],
+ PropertyInfo(discriminator="type"),
+]
+
+
+class PayloadAgentTurnResponseStepProgressPayload(BaseModel):
+ delta: PayloadAgentTurnResponseStepProgressPayloadDelta
+ """Incremental content changes during step execution"""
+
+ event_type: Literal["step_progress"]
+ """Type of event being reported"""
+
+ step_id: str
+ """Unique identifier for the step within a turn"""
+
+ step_type: Literal["inference", "tool_execution", "shield_call", "memory_retrieval"]
+ """Type of step being executed"""
+
+
+PayloadAgentTurnResponseStepCompletePayloadStepDetails: TypeAlias = Annotated[
+ Union[InferenceStep, ToolExecutionStep, ShieldCallStep, MemoryRetrievalStep],
+ PropertyInfo(discriminator="step_type"),
+]
+
+
+class PayloadAgentTurnResponseStepCompletePayload(BaseModel):
+ event_type: Literal["step_complete"]
+ """Type of event being reported"""
+
+ step_details: PayloadAgentTurnResponseStepCompletePayloadStepDetails
+ """Complete details of the executed step"""
+
+ step_id: str
+ """Unique identifier for the step within a turn"""
+
+ step_type: Literal["inference", "tool_execution", "shield_call", "memory_retrieval"]
+ """Type of step being executed"""
+
+
+class PayloadAgentTurnResponseTurnStartPayload(BaseModel):
+ event_type: Literal["turn_start"]
+ """Type of event being reported"""
+
+ turn_id: str
+ """Unique identifier for the turn within a session"""
+
+
+class PayloadAgentTurnResponseTurnCompletePayload(BaseModel):
+ event_type: Literal["turn_complete"]
+ """Type of event being reported"""
+
+ turn: Turn
+ """Complete turn data including all steps and results"""
+
+
+class PayloadAgentTurnResponseTurnAwaitingInputPayload(BaseModel):
+ event_type: Literal["turn_awaiting_input"]
+ """Type of event being reported"""
+
+ turn: Turn
+ """Turn data when waiting for external tool responses"""
+
-__all__ = ["TurnResponseEvent"]
+Payload: TypeAlias = Annotated[
+ Union[
+ PayloadAgentTurnResponseStepStartPayload,
+ PayloadAgentTurnResponseStepProgressPayload,
+ PayloadAgentTurnResponseStepCompletePayload,
+ PayloadAgentTurnResponseTurnStartPayload,
+ PayloadAgentTurnResponseTurnCompletePayload,
+ PayloadAgentTurnResponseTurnAwaitingInputPayload,
+ ],
+ PropertyInfo(discriminator="event_type"),
+]
class TurnResponseEvent(BaseModel):
- payload: TurnResponseEventPayload
+ payload: Payload
"""Event-specific payload containing event data"""
diff --git a/src/llama_stack_client/types/agents/turn_response_event_payload.py b/src/llama_stack_client/types/agents/turn_response_event_payload.py
deleted file mode 100644
index 1844c61e..00000000
--- a/src/llama_stack_client/types/agents/turn_response_event_payload.py
+++ /dev/null
@@ -1,109 +0,0 @@
-# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
-
-from typing import Dict, List, Union, Optional
-from typing_extensions import Literal, Annotated, TypeAlias
-
-from .turn import Turn
-from ..._utils import PropertyInfo
-from ..._models import BaseModel
-from ..inference_step import InferenceStep
-from ..shield_call_step import ShieldCallStep
-from ..tool_execution_step import ToolExecutionStep
-from ..shared.content_delta import ContentDelta
-from ..memory_retrieval_step import MemoryRetrievalStep
-
-__all__ = [
- "TurnResponseEventPayload",
- "AgentTurnResponseStepStartPayload",
- "AgentTurnResponseStepProgressPayload",
- "AgentTurnResponseStepCompletePayload",
- "AgentTurnResponseStepCompletePayloadStepDetails",
- "AgentTurnResponseTurnStartPayload",
- "AgentTurnResponseTurnCompletePayload",
- "AgentTurnResponseTurnAwaitingInputPayload",
-]
-
-
-class AgentTurnResponseStepStartPayload(BaseModel):
- event_type: Literal["step_start"]
- """Type of event being reported"""
-
- step_id: str
- """Unique identifier for the step within a turn"""
-
- step_type: Literal["inference", "tool_execution", "shield_call", "memory_retrieval"]
- """Type of step being executed"""
-
- metadata: Optional[Dict[str, Union[bool, float, str, List[object], object, None]]] = None
- """(Optional) Additional metadata for the step"""
-
-
-class AgentTurnResponseStepProgressPayload(BaseModel):
- delta: ContentDelta
- """Incremental content changes during step execution"""
-
- event_type: Literal["step_progress"]
- """Type of event being reported"""
-
- step_id: str
- """Unique identifier for the step within a turn"""
-
- step_type: Literal["inference", "tool_execution", "shield_call", "memory_retrieval"]
- """Type of step being executed"""
-
-
-AgentTurnResponseStepCompletePayloadStepDetails: TypeAlias = Annotated[
- Union[InferenceStep, ToolExecutionStep, ShieldCallStep, MemoryRetrievalStep],
- PropertyInfo(discriminator="step_type"),
-]
-
-
-class AgentTurnResponseStepCompletePayload(BaseModel):
- event_type: Literal["step_complete"]
- """Type of event being reported"""
-
- step_details: AgentTurnResponseStepCompletePayloadStepDetails
- """Complete details of the executed step"""
-
- step_id: str
- """Unique identifier for the step within a turn"""
-
- step_type: Literal["inference", "tool_execution", "shield_call", "memory_retrieval"]
- """Type of step being executed"""
-
-
-class AgentTurnResponseTurnStartPayload(BaseModel):
- event_type: Literal["turn_start"]
- """Type of event being reported"""
-
- turn_id: str
- """Unique identifier for the turn within a session"""
-
-
-class AgentTurnResponseTurnCompletePayload(BaseModel):
- event_type: Literal["turn_complete"]
- """Type of event being reported"""
-
- turn: Turn
- """Complete turn data including all steps and results"""
-
-
-class AgentTurnResponseTurnAwaitingInputPayload(BaseModel):
- event_type: Literal["turn_awaiting_input"]
- """Type of event being reported"""
-
- turn: Turn
- """Turn data when waiting for external tool responses"""
-
-
-TurnResponseEventPayload: TypeAlias = Annotated[
- Union[
- AgentTurnResponseStepStartPayload,
- AgentTurnResponseStepProgressPayload,
- AgentTurnResponseStepCompletePayload,
- AgentTurnResponseTurnStartPayload,
- AgentTurnResponseTurnCompletePayload,
- AgentTurnResponseTurnAwaitingInputPayload,
- ],
- PropertyInfo(discriminator="event_type"),
-]
diff --git a/src/llama_stack_client/types/benchmark_config_param.py b/src/llama_stack_client/types/benchmark_config_param.py
index 740bf99b..dc968521 100644
--- a/src/llama_stack_client/types/benchmark_config_param.py
+++ b/src/llama_stack_client/types/benchmark_config_param.py
@@ -2,17 +2,42 @@
from __future__ import annotations
-from typing import Dict
-from typing_extensions import Required, TypedDict
+from typing import Dict, Union
+from typing_extensions import Literal, Required, TypeAlias, TypedDict
-from .eval_candidate_param import EvalCandidateParam
from .scoring_fn_params_param import ScoringFnParamsParam
+from .shared_params.agent_config import AgentConfig
+from .shared_params.system_message import SystemMessage
+from .shared_params.sampling_params import SamplingParams
-__all__ = ["BenchmarkConfigParam"]
+__all__ = ["BenchmarkConfigParam", "EvalCandidate", "EvalCandidateModelCandidate", "EvalCandidateAgentCandidate"]
+
+
+class EvalCandidateModelCandidate(TypedDict, total=False):
+ model: Required[str]
+ """The model ID to evaluate."""
+
+ sampling_params: Required[SamplingParams]
+ """The sampling parameters for the model."""
+
+ type: Required[Literal["model"]]
+
+ system_message: SystemMessage
+ """(Optional) The system message providing instructions or context to the model."""
+
+
+class EvalCandidateAgentCandidate(TypedDict, total=False):
+ config: Required[AgentConfig]
+ """The configuration for the agent candidate."""
+
+ type: Required[Literal["agent"]]
+
+
+EvalCandidate: TypeAlias = Union[EvalCandidateModelCandidate, EvalCandidateAgentCandidate]
class BenchmarkConfigParam(TypedDict, total=False):
- eval_candidate: Required[EvalCandidateParam]
+ eval_candidate: Required[EvalCandidate]
"""The candidate to evaluate."""
scoring_params: Required[Dict[str, ScoringFnParamsParam]]
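
With `EvalCandidateParam` inlined as the `EvalCandidate` union above, a benchmark config is still passed as a plain dict. A hedged sketch; the model ID is a placeholder, the sampling strategy mirrors the test payloads later in this diff, and `scoring_params` is left empty for brevity:

```python
# Illustrative only: "my-model-id" is a placeholder model identifier.
benchmark_config = {
    "eval_candidate": {
        "type": "model",
        "model": "my-model-id",
        "sampling_params": {"strategy": {"type": "greedy"}},
    },
    "scoring_params": {},
}
```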
diff --git a/src/llama_stack_client/types/chat_completion_response_stream_chunk.py b/src/llama_stack_client/types/chat_completion_response_stream_chunk.py
deleted file mode 100644
index 1a55f3d1..00000000
--- a/src/llama_stack_client/types/chat_completion_response_stream_chunk.py
+++ /dev/null
@@ -1,36 +0,0 @@
-# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
-
-from typing import List, Optional
-from typing_extensions import Literal
-
-from .._models import BaseModel
-from .shared.metric import Metric
-from .token_log_probs import TokenLogProbs
-from .shared.content_delta import ContentDelta
-
-__all__ = ["ChatCompletionResponseStreamChunk", "Event"]
-
-
-class Event(BaseModel):
- delta: ContentDelta
- """Content generated since last event.
-
- This can be one or more tokens, or a tool call.
- """
-
- event_type: Literal["start", "complete", "progress"]
- """Type of the event"""
-
- logprobs: Optional[List[TokenLogProbs]] = None
- """Optional log probabilities for generated tokens"""
-
- stop_reason: Optional[Literal["end_of_turn", "end_of_message", "out_of_tokens"]] = None
- """Optional reason why generation stopped, if complete"""
-
-
-class ChatCompletionResponseStreamChunk(BaseModel):
- event: Event
- """The event containing the new content"""
-
- metrics: Optional[List[Metric]] = None
- """(Optional) List of metrics associated with the API response"""
diff --git a/src/llama_stack_client/types/completion_response.py b/src/llama_stack_client/types/completion_response.py
deleted file mode 100644
index 9718be8a..00000000
--- a/src/llama_stack_client/types/completion_response.py
+++ /dev/null
@@ -1,24 +0,0 @@
-# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
-
-from typing import List, Optional
-from typing_extensions import Literal
-
-from .._models import BaseModel
-from .shared.metric import Metric
-from .token_log_probs import TokenLogProbs
-
-__all__ = ["CompletionResponse"]
-
-
-class CompletionResponse(BaseModel):
- content: str
- """The generated completion text"""
-
- stop_reason: Literal["end_of_turn", "end_of_message", "out_of_tokens"]
- """Reason why generation stopped"""
-
- logprobs: Optional[List[TokenLogProbs]] = None
- """Optional log probabilities for generated tokens"""
-
- metrics: Optional[List[Metric]] = None
- """(Optional) List of metrics associated with the API response"""
diff --git a/src/llama_stack_client/types/embeddings_response.py b/src/llama_stack_client/types/embeddings_response.py
deleted file mode 100644
index f36c6b97..00000000
--- a/src/llama_stack_client/types/embeddings_response.py
+++ /dev/null
@@ -1,16 +0,0 @@
-# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
-
-from typing import List
-
-from .._models import BaseModel
-
-__all__ = ["EmbeddingsResponse"]
-
-
-class EmbeddingsResponse(BaseModel):
- embeddings: List[List[float]]
- """List of embedding vectors, one per input content.
-
- Each embedding is a list of floats. The dimensionality of the embedding is
- model-specific; you can check model metadata using /models/{model_id}
- """
diff --git a/src/llama_stack_client/types/eval_candidate_param.py b/src/llama_stack_client/types/eval_candidate_param.py
deleted file mode 100644
index be1b21c8..00000000
--- a/src/llama_stack_client/types/eval_candidate_param.py
+++ /dev/null
@@ -1,35 +0,0 @@
-# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
-
-from __future__ import annotations
-
-from typing import Union
-from typing_extensions import Literal, Required, TypeAlias, TypedDict
-
-from .shared_params.agent_config import AgentConfig
-from .shared_params.system_message import SystemMessage
-from .shared_params.sampling_params import SamplingParams
-
-__all__ = ["EvalCandidateParam", "ModelCandidate", "AgentCandidate"]
-
-
-class ModelCandidate(TypedDict, total=False):
- model: Required[str]
- """The model ID to evaluate."""
-
- sampling_params: Required[SamplingParams]
- """The sampling parameters for the model."""
-
- type: Required[Literal["model"]]
-
- system_message: SystemMessage
- """(Optional) The system message providing instructions or context to the model."""
-
-
-class AgentCandidate(TypedDict, total=False):
- config: Required[AgentConfig]
- """The configuration for the agent candidate."""
-
- type: Required[Literal["agent"]]
-
-
-EvalCandidateParam: TypeAlias = Union[ModelCandidate, AgentCandidate]
diff --git a/src/llama_stack_client/types/file_create_params.py b/src/llama_stack_client/types/file_create_params.py
index 8322c0a9..2be39a7a 100644
--- a/src/llama_stack_client/types/file_create_params.py
+++ b/src/llama_stack_client/types/file_create_params.py
@@ -6,7 +6,7 @@
from .._types import FileTypes
-__all__ = ["FileCreateParams"]
+__all__ = ["FileCreateParams", "ExpiresAfter"]
class FileCreateParams(TypedDict, total=False):
@@ -14,3 +14,16 @@ class FileCreateParams(TypedDict, total=False):
purpose: Required[Literal["assistants", "batch"]]
"""Valid purpose values for OpenAI Files API."""
+
+ expires_after: ExpiresAfter
+ """Control expiration of uploaded files. Params:
+
+ - anchor, must be "created_at"
+ - seconds, must be int between 3600 and 2592000 (1 hour to 30 days)
+ """
+
+
+class ExpiresAfter(TypedDict, total=False):
+ anchor: Required[Literal["created_at"]]
+
+ seconds: Required[int]
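
The new `expires_after` parameter matches the test payload added later in this diff. A short upload sketch; the one-day value is just an example inside the allowed 3600–2592000 second range:

```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient()

file = client.files.create(
    file=b"raw file contents",
    purpose="assistants",
    # anchor must be "created_at"; seconds must fall between 1 hour and 30 days.
    expires_after={"anchor": "created_at", "seconds": 86400},
)
```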
diff --git a/src/llama_stack_client/types/inference_batch_chat_completion_params.py b/src/llama_stack_client/types/inference_batch_chat_completion_params.py
deleted file mode 100644
index b5da0f0e..00000000
--- a/src/llama_stack_client/types/inference_batch_chat_completion_params.py
+++ /dev/null
@@ -1,85 +0,0 @@
-# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
-
-from __future__ import annotations
-
-from typing import Dict, Union, Iterable
-from typing_extensions import Literal, Required, TypedDict
-
-from .shared_params.message import Message
-from .shared_params.response_format import ResponseFormat
-from .shared_params.sampling_params import SamplingParams
-from .shared_params.tool_param_definition import ToolParamDefinition
-
-__all__ = ["InferenceBatchChatCompletionParams", "Logprobs", "ToolConfig", "Tool"]
-
-
-class InferenceBatchChatCompletionParams(TypedDict, total=False):
- messages_batch: Required[Iterable[Iterable[Message]]]
- """The messages to generate completions for."""
-
- model_id: Required[str]
- """The identifier of the model to use.
-
- The model must be registered with Llama Stack and available via the /models
- endpoint.
- """
-
- logprobs: Logprobs
- """
- (Optional) If specified, log probabilities for each token position will be
- returned.
- """
-
- response_format: ResponseFormat
- """(Optional) Grammar specification for guided (structured) decoding."""
-
- sampling_params: SamplingParams
- """(Optional) Parameters to control the sampling strategy."""
-
- tool_config: ToolConfig
- """(Optional) Configuration for tool use."""
-
- tools: Iterable[Tool]
- """(Optional) List of tool definitions available to the model."""
-
-
-class Logprobs(TypedDict, total=False):
- top_k: int
- """How many tokens (for each position) to return log probabilities for."""
-
-
-class ToolConfig(TypedDict, total=False):
- system_message_behavior: Literal["append", "replace"]
- """(Optional) Config for how to override the default system prompt.
-
- - `SystemMessageBehavior.append`: Appends the provided system message to the
- default system prompt. - `SystemMessageBehavior.replace`: Replaces the default
- system prompt with the provided system message. The system message can include
- the string '{{function_definitions}}' to indicate where the function
- definitions should be inserted.
- """
-
- tool_choice: Union[Literal["auto", "required", "none"], str]
- """(Optional) Whether tool use is automatic, required, or none.
-
- Can also specify a tool name to use a specific tool. Defaults to
- ToolChoice.auto.
- """
-
- tool_prompt_format: Literal["json", "function_tag", "python_list"]
- """(Optional) Instructs the model how to format tool calls.
-
- By default, Llama Stack will attempt to use a format that is best adapted to the
- model. - `ToolPromptFormat.json`: The tool calls are formatted as a JSON
- object. - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a
- <function=function_name> tag. - `ToolPromptFormat.python_list`: The tool calls
- are output as Python syntax -- a list of function calls.
- """
-
-
-class Tool(TypedDict, total=False):
- tool_name: Required[Union[Literal["brave_search", "wolfram_alpha", "photogen", "code_interpreter"], str]]
-
- description: str
-
- parameters: Dict[str, ToolParamDefinition]
diff --git a/src/llama_stack_client/types/inference_batch_chat_completion_response.py b/src/llama_stack_client/types/inference_batch_chat_completion_response.py
deleted file mode 100644
index ed24908d..00000000
--- a/src/llama_stack_client/types/inference_batch_chat_completion_response.py
+++ /dev/null
@@ -1,13 +0,0 @@
-# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
-
-from typing import List
-
-from .._models import BaseModel
-from .shared.chat_completion_response import ChatCompletionResponse
-
-__all__ = ["InferenceBatchChatCompletionResponse"]
-
-
-class InferenceBatchChatCompletionResponse(BaseModel):
- batch: List[ChatCompletionResponse]
- """List of chat completion responses, one for each conversation in the batch"""
diff --git a/src/llama_stack_client/types/inference_batch_completion_params.py b/src/llama_stack_client/types/inference_batch_completion_params.py
deleted file mode 100644
index b225b883..00000000
--- a/src/llama_stack_client/types/inference_batch_completion_params.py
+++ /dev/null
@@ -1,41 +0,0 @@
-# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
-
-from __future__ import annotations
-
-from typing_extensions import Required, TypedDict
-
-from .._types import SequenceNotStr
-from .shared_params.response_format import ResponseFormat
-from .shared_params.sampling_params import SamplingParams
-from .shared_params.interleaved_content import InterleavedContent
-
-__all__ = ["InferenceBatchCompletionParams", "Logprobs"]
-
-
-class InferenceBatchCompletionParams(TypedDict, total=False):
- content_batch: Required[SequenceNotStr[InterleavedContent]]
- """The content to generate completions for."""
-
- model_id: Required[str]
- """The identifier of the model to use.
-
- The model must be registered with Llama Stack and available via the /models
- endpoint.
- """
-
- logprobs: Logprobs
- """
- (Optional) If specified, log probabilities for each token position will be
- returned.
- """
-
- response_format: ResponseFormat
- """(Optional) Grammar specification for guided (structured) decoding."""
-
- sampling_params: SamplingParams
- """(Optional) Parameters to control the sampling strategy."""
-
-
-class Logprobs(TypedDict, total=False):
- top_k: int
- """How many tokens (for each position) to return log probabilities for."""
diff --git a/src/llama_stack_client/types/inference_chat_completion_params.py b/src/llama_stack_client/types/inference_chat_completion_params.py
deleted file mode 100644
index 746d3dee..00000000
--- a/src/llama_stack_client/types/inference_chat_completion_params.py
+++ /dev/null
@@ -1,134 +0,0 @@
-# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
-
-from __future__ import annotations
-
-from typing import Dict, Union, Iterable
-from typing_extensions import Literal, Required, TypedDict
-
-from .shared_params.message import Message
-from .shared_params.response_format import ResponseFormat
-from .shared_params.sampling_params import SamplingParams
-from .shared_params.tool_param_definition import ToolParamDefinition
-
-__all__ = [
- "InferenceChatCompletionParamsBase",
- "Logprobs",
- "ToolConfig",
- "Tool",
- "InferenceChatCompletionParamsNonStreaming",
- "InferenceChatCompletionParamsStreaming",
-]
-
-
-class InferenceChatCompletionParamsBase(TypedDict, total=False):
- messages: Required[Iterable[Message]]
- """List of messages in the conversation."""
-
- model_id: Required[str]
- """The identifier of the model to use.
-
- The model must be registered with Llama Stack and available via the /models
- endpoint.
- """
-
- logprobs: Logprobs
- """
- (Optional) If specified, log probabilities for each token position will be
- returned.
- """
-
- response_format: ResponseFormat
- """(Optional) Grammar specification for guided (structured) decoding.
-
- There are two options: - `ResponseFormat.json_schema`: The grammar is a JSON
- schema. Most providers support this format. - `ResponseFormat.grammar`: The
- grammar is a BNF grammar. This format is more flexible, but not all providers
- support it.
- """
-
- sampling_params: SamplingParams
- """Parameters to control the sampling strategy."""
-
- tool_choice: Literal["auto", "required", "none"]
- """(Optional) Whether tool use is required or automatic.
-
- Defaults to ToolChoice.auto. .. deprecated:: Use tool_config instead.
- """
-
- tool_config: ToolConfig
- """(Optional) Configuration for tool use."""
-
- tool_prompt_format: Literal["json", "function_tag", "python_list"]
- """(Optional) Instructs the model how to format tool calls.
-
- By default, Llama Stack will attempt to use a format that is best adapted to the
- model. - `ToolPromptFormat.json`: The tool calls are formatted as a JSON
- object. - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a
- <function=function_name> tag. - `ToolPromptFormat.python_list`: The tool calls
- are output as Python syntax -- a list of function calls. .. deprecated:: Use
- tool_config instead.
- """
-
- tools: Iterable[Tool]
- """(Optional) List of tool definitions available to the model."""
-
-
-class Logprobs(TypedDict, total=False):
- top_k: int
- """How many tokens (for each position) to return log probabilities for."""
-
-
-class ToolConfig(TypedDict, total=False):
- system_message_behavior: Literal["append", "replace"]
- """(Optional) Config for how to override the default system prompt.
-
- - `SystemMessageBehavior.append`: Appends the provided system message to the
- default system prompt. - `SystemMessageBehavior.replace`: Replaces the default
- system prompt with the provided system message. The system message can include
- the string '{{function_definitions}}' to indicate where the function
- definitions should be inserted.
- """
-
- tool_choice: Union[Literal["auto", "required", "none"], str]
- """(Optional) Whether tool use is automatic, required, or none.
-
- Can also specify a tool name to use a specific tool. Defaults to
- ToolChoice.auto.
- """
-
- tool_prompt_format: Literal["json", "function_tag", "python_list"]
- """(Optional) Instructs the model how to format tool calls.
-
- By default, Llama Stack will attempt to use a format that is best adapted to the
- model. - `ToolPromptFormat.json`: The tool calls are formatted as a JSON
- object. - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a
- <function=function_name> tag. - `ToolPromptFormat.python_list`: The tool calls
- are output as Python syntax -- a list of function calls.
- """
-
-
-class Tool(TypedDict, total=False):
- tool_name: Required[Union[Literal["brave_search", "wolfram_alpha", "photogen", "code_interpreter"], str]]
-
- description: str
-
- parameters: Dict[str, ToolParamDefinition]
-
-
-class InferenceChatCompletionParamsNonStreaming(InferenceChatCompletionParamsBase, total=False):
- stream: Literal[False]
- """(Optional) If True, generate an SSE event stream of the response.
-
- Defaults to False.
- """
-
-
-class InferenceChatCompletionParamsStreaming(InferenceChatCompletionParamsBase):
- stream: Required[Literal[True]]
- """(Optional) If True, generate an SSE event stream of the response.
-
- Defaults to False.
- """
-
-
-InferenceChatCompletionParams = Union[InferenceChatCompletionParamsNonStreaming, InferenceChatCompletionParamsStreaming]
diff --git a/src/llama_stack_client/types/inference_completion_params.py b/src/llama_stack_client/types/inference_completion_params.py
deleted file mode 100644
index c122f017..00000000
--- a/src/llama_stack_client/types/inference_completion_params.py
+++ /dev/null
@@ -1,65 +0,0 @@
-# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
-
-from __future__ import annotations
-
-from typing import Union
-from typing_extensions import Literal, Required, TypedDict
-
-from .shared_params.response_format import ResponseFormat
-from .shared_params.sampling_params import SamplingParams
-from .shared_params.interleaved_content import InterleavedContent
-
-__all__ = [
- "InferenceCompletionParamsBase",
- "Logprobs",
- "InferenceCompletionParamsNonStreaming",
- "InferenceCompletionParamsStreaming",
-]
-
-
-class InferenceCompletionParamsBase(TypedDict, total=False):
- content: Required[InterleavedContent]
- """The content to generate a completion for."""
-
- model_id: Required[str]
- """The identifier of the model to use.
-
- The model must be registered with Llama Stack and available via the /models
- endpoint.
- """
-
- logprobs: Logprobs
- """
- (Optional) If specified, log probabilities for each token position will be
- returned.
- """
-
- response_format: ResponseFormat
- """(Optional) Grammar specification for guided (structured) decoding."""
-
- sampling_params: SamplingParams
- """(Optional) Parameters to control the sampling strategy."""
-
-
-class Logprobs(TypedDict, total=False):
- top_k: int
- """How many tokens (for each position) to return log probabilities for."""
-
-
-class InferenceCompletionParamsNonStreaming(InferenceCompletionParamsBase, total=False):
- stream: Literal[False]
- """(Optional) If True, generate an SSE event stream of the response.
-
- Defaults to False.
- """
-
-
-class InferenceCompletionParamsStreaming(InferenceCompletionParamsBase):
- stream: Required[Literal[True]]
- """(Optional) If True, generate an SSE event stream of the response.
-
- Defaults to False.
- """
-
-
-InferenceCompletionParams = Union[InferenceCompletionParamsNonStreaming, InferenceCompletionParamsStreaming]
diff --git a/src/llama_stack_client/types/inference_embeddings_params.py b/src/llama_stack_client/types/inference_embeddings_params.py
deleted file mode 100644
index a1be545b..00000000
--- a/src/llama_stack_client/types/inference_embeddings_params.py
+++ /dev/null
@@ -1,46 +0,0 @@
-# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
-
-from __future__ import annotations
-
-from typing import Union, Iterable
-from typing_extensions import Literal, Required, TypedDict
-
-from .._types import SequenceNotStr
-from .shared_params.interleaved_content_item import InterleavedContentItem
-
-__all__ = ["InferenceEmbeddingsParams"]
-
-
-class InferenceEmbeddingsParams(TypedDict, total=False):
- contents: Required[Union[SequenceNotStr[str], Iterable[InterleavedContentItem]]]
- """List of contents to generate embeddings for.
-
- Each content can be a string or an InterleavedContentItem (and hence can be
- multimodal). The behavior depends on the model and provider. Some models may
- only support text.
- """
-
- model_id: Required[str]
- """The identifier of the model to use.
-
- The model must be an embedding model registered with Llama Stack and available
- via the /models endpoint.
- """
-
- output_dimension: int
- """(Optional) Output dimensionality for the embeddings.
-
- Only supported by Matryoshka models.
- """
-
- task_type: Literal["query", "document"]
- """
- (Optional) How is the embedding being used? This is only supported by asymmetric
- embedding models.
- """
-
- text_truncation: Literal["none", "start", "end"]
- """
- (Optional) Config for how to truncate text for embedding when text is longer
- than the model's max sequence length.
- """
diff --git a/src/llama_stack_client/types/models/openai_list_response.py b/src/llama_stack_client/types/models/openai_list_response.py
index f14845d5..5b6c0358 100644
--- a/src/llama_stack_client/types/models/openai_list_response.py
+++ b/src/llama_stack_client/types/models/openai_list_response.py
@@ -1,21 +1,10 @@
# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
from typing import List
-from typing_extensions import Literal, TypeAlias
+from typing_extensions import TypeAlias
-from ..._models import BaseModel
+from ..model import Model
-__all__ = ["OpenAIListResponse", "OpenAIListResponseItem"]
+__all__ = ["OpenAIListResponse"]
-
-class OpenAIListResponseItem(BaseModel):
- id: str
-
- created: int
-
- object: Literal["model"]
-
- owned_by: str
-
-
-OpenAIListResponse: TypeAlias = List[OpenAIListResponseItem]
+OpenAIListResponse: TypeAlias = List[Model]
diff --git a/src/llama_stack_client/types/response_list_response.py b/src/llama_stack_client/types/response_list_response.py
index ae50d44a..ac7ec1b1 100644
--- a/src/llama_stack_client/types/response_list_response.py
+++ b/src/llama_stack_client/types/response_list_response.py
@@ -570,6 +570,3 @@ class ResponseListResponse(BaseModel):
truncation: Optional[str] = None
"""(Optional) Truncation strategy applied to the response"""
-
- user: Optional[str] = None
- """(Optional) User identifier associated with the request"""
diff --git a/src/llama_stack_client/types/response_object.py b/src/llama_stack_client/types/response_object.py
index c0f348a9..b618ddf5 100644
--- a/src/llama_stack_client/types/response_object.py
+++ b/src/llama_stack_client/types/response_object.py
@@ -361,6 +361,3 @@ def output_text(self) -> str:
truncation: Optional[str] = None
"""(Optional) Truncation strategy applied to the response"""
-
- user: Optional[str] = None
- """(Optional) User identifier associated with the request"""
diff --git a/src/llama_stack_client/types/shared/__init__.py b/src/llama_stack_client/types/shared/__init__.py
index fb14d8a6..f346cda7 100644
--- a/src/llama_stack_client/types/shared/__init__.py
+++ b/src/llama_stack_client/types/shared/__init__.py
@@ -9,17 +9,14 @@
from .query_config import QueryConfig as QueryConfig
from .query_result import QueryResult as QueryResult
from .user_message import UserMessage as UserMessage
-from .content_delta import ContentDelta as ContentDelta
from .scoring_result import ScoringResult as ScoringResult
from .system_message import SystemMessage as SystemMessage
from .response_format import ResponseFormat as ResponseFormat
from .sampling_params import SamplingParams as SamplingParams
-from .batch_completion import BatchCompletion as BatchCompletion
from .safety_violation import SafetyViolation as SafetyViolation
from .completion_message import CompletionMessage as CompletionMessage
from .interleaved_content import InterleavedContent as InterleavedContent
from .tool_param_definition import ToolParamDefinition as ToolParamDefinition
from .tool_response_message import ToolResponseMessage as ToolResponseMessage
-from .query_generator_config import QueryGeneratorConfig as QueryGeneratorConfig
from .chat_completion_response import ChatCompletionResponse as ChatCompletionResponse
from .interleaved_content_item import InterleavedContentItem as InterleavedContentItem
diff --git a/src/llama_stack_client/types/shared/batch_completion.py b/src/llama_stack_client/types/shared/batch_completion.py
deleted file mode 100644
index 43a0a735..00000000
--- a/src/llama_stack_client/types/shared/batch_completion.py
+++ /dev/null
@@ -1,13 +0,0 @@
-# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
-
-from typing import List
-
-from ..._models import BaseModel
-from ..completion_response import CompletionResponse
-
-__all__ = ["BatchCompletion"]
-
-
-class BatchCompletion(BaseModel):
- batch: List[CompletionResponse]
- """List of completion responses, one for each input in the batch"""
diff --git a/src/llama_stack_client/types/shared/chat_completion_response.py b/src/llama_stack_client/types/shared/chat_completion_response.py
index 30191439..eb78a109 100644
--- a/src/llama_stack_client/types/shared/chat_completion_response.py
+++ b/src/llama_stack_client/types/shared/chat_completion_response.py
@@ -1,20 +1,24 @@
# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
-from typing import List, Optional
+from typing import Dict, List, Optional
from .metric import Metric
from ..._models import BaseModel
-from ..token_log_probs import TokenLogProbs
from .completion_message import CompletionMessage
-__all__ = ["ChatCompletionResponse"]
+__all__ = ["ChatCompletionResponse", "Logprob"]
+
+
+class Logprob(BaseModel):
+ logprobs_by_token: Dict[str, float]
+ """Dictionary mapping tokens to their log probabilities"""
class ChatCompletionResponse(BaseModel):
completion_message: CompletionMessage
"""The complete response message"""
- logprobs: Optional[List[TokenLogProbs]] = None
+ logprobs: Optional[List[Logprob]] = None
"""Optional log probabilities for generated tokens"""
metrics: Optional[List[Metric]] = None
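
With `TokenLogProbs` removed, per-token log probabilities now come back as the inline `Logprob` model. A small sketch of reading them from a response object; the printing loop is illustrative:

```python
from llama_stack_client.types.shared import ChatCompletionResponse


def dump_logprobs(response: ChatCompletionResponse) -> None:
    # logprobs is optional; each entry maps tokens to their log probabilities.
    for entry in response.logprobs or []:
        for token, logprob in entry.logprobs_by_token.items():
            print(f"{token!r}: {logprob:.4f}")
```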
diff --git a/src/llama_stack_client/types/shared/content_delta.py b/src/llama_stack_client/types/shared/content_delta.py
deleted file mode 100644
index 7ed58d13..00000000
--- a/src/llama_stack_client/types/shared/content_delta.py
+++ /dev/null
@@ -1,43 +0,0 @@
-# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
-
-from typing import Union
-from typing_extensions import Literal, Annotated, TypeAlias
-
-from ..._utils import PropertyInfo
-from ..._models import BaseModel
-from .tool_call import ToolCall
-
-__all__ = ["ContentDelta", "TextDelta", "ImageDelta", "ToolCallDelta", "ToolCallDeltaToolCall"]
-
-
-class TextDelta(BaseModel):
- text: str
- """The incremental text content"""
-
- type: Literal["text"]
- """Discriminator type of the delta. Always "text" """
-
-
-class ImageDelta(BaseModel):
- image: str
- """The incremental image data as bytes"""
-
- type: Literal["image"]
- """Discriminator type of the delta. Always "image" """
-
-
-ToolCallDeltaToolCall: TypeAlias = Union[str, ToolCall]
-
-
-class ToolCallDelta(BaseModel):
- parse_status: Literal["started", "in_progress", "failed", "succeeded"]
- """Current parsing status of the tool call"""
-
- tool_call: ToolCallDeltaToolCall
- """Either an in-progress tool call string or the final parsed tool call"""
-
- type: Literal["tool_call"]
- """Discriminator type of the delta. Always "tool_call" """
-
-
-ContentDelta: TypeAlias = Annotated[Union[TextDelta, ImageDelta, ToolCallDelta], PropertyInfo(discriminator="type")]
diff --git a/src/llama_stack_client/types/shared/query_config.py b/src/llama_stack_client/types/shared/query_config.py
index 389514c7..a4a1f741 100644
--- a/src/llama_stack_client/types/shared/query_config.py
+++ b/src/llama_stack_client/types/shared/query_config.py
@@ -5,9 +5,41 @@
from ..._utils import PropertyInfo
from ..._models import BaseModel
-from .query_generator_config import QueryGeneratorConfig
-__all__ = ["QueryConfig", "Ranker", "RankerRrfRanker", "RankerWeightedRanker"]
+__all__ = [
+ "QueryConfig",
+ "QueryGeneratorConfig",
+ "QueryGeneratorConfigDefaultRagQueryGeneratorConfig",
+ "QueryGeneratorConfigLlmragQueryGeneratorConfig",
+ "Ranker",
+ "RankerRrfRanker",
+ "RankerWeightedRanker",
+]
+
+
+class QueryGeneratorConfigDefaultRagQueryGeneratorConfig(BaseModel):
+ separator: str
+ """String separator used to join query terms"""
+
+ type: Literal["default"]
+ """Type of query generator, always 'default'"""
+
+
+class QueryGeneratorConfigLlmragQueryGeneratorConfig(BaseModel):
+ model: str
+ """Name of the language model to use for query generation"""
+
+ template: str
+ """Template string for formatting the query generation prompt"""
+
+ type: Literal["llm"]
+ """Type of query generator, always 'llm'"""
+
+
+QueryGeneratorConfig: TypeAlias = Annotated[
+ Union[QueryGeneratorConfigDefaultRagQueryGeneratorConfig, QueryGeneratorConfigLlmragQueryGeneratorConfig],
+ PropertyInfo(discriminator="type"),
+]
class RankerRrfRanker(BaseModel):
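
Because the shared `query_generator_config` module is removed, its variants now live alongside `QueryConfig`. A minimal construction sketch under that assumption; the separator value is arbitrary:

```python
from llama_stack_client.types.shared.query_config import (
    QueryGeneratorConfigDefaultRagQueryGeneratorConfig,
)

# Build the "default" generator variant directly; the union discriminates on `type`.
generator = QueryGeneratorConfigDefaultRagQueryGeneratorConfig(
    separator=" ",
    type="default",
)
print(generator.type)  # -> "default"
```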
diff --git a/src/llama_stack_client/types/shared/query_generator_config.py b/src/llama_stack_client/types/shared/query_generator_config.py
deleted file mode 100644
index 624fc190..00000000
--- a/src/llama_stack_client/types/shared/query_generator_config.py
+++ /dev/null
@@ -1,33 +0,0 @@
-# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
-
-from typing import Union
-from typing_extensions import Literal, Annotated, TypeAlias
-
-from ..._utils import PropertyInfo
-from ..._models import BaseModel
-
-__all__ = ["QueryGeneratorConfig", "DefaultRagQueryGeneratorConfig", "LlmragQueryGeneratorConfig"]
-
-
-class DefaultRagQueryGeneratorConfig(BaseModel):
- separator: str
- """String separator used to join query terms"""
-
- type: Literal["default"]
- """Type of query generator, always 'default'"""
-
-
-class LlmragQueryGeneratorConfig(BaseModel):
- model: str
- """Name of the language model to use for query generation"""
-
- template: str
- """Template string for formatting the query generation prompt"""
-
- type: Literal["llm"]
- """Type of query generator, always 'llm'"""
-
-
-QueryGeneratorConfig: TypeAlias = Annotated[
- Union[DefaultRagQueryGeneratorConfig, LlmragQueryGeneratorConfig], PropertyInfo(discriminator="type")
-]
diff --git a/src/llama_stack_client/types/shared/tool_param_definition.py b/src/llama_stack_client/types/shared/tool_param_definition.py
index 1466c1f9..316f1e01 100644
--- a/src/llama_stack_client/types/shared/tool_param_definition.py
+++ b/src/llama_stack_client/types/shared/tool_param_definition.py
@@ -14,4 +14,8 @@ class ToolParamDefinition(BaseModel):
description: Optional[str] = None
+ items: Union[bool, float, str, List[object], object, None] = None
+
required: Optional[bool] = None
+
+ title: Optional[str] = None
diff --git a/src/llama_stack_client/types/shared_params/__init__.py b/src/llama_stack_client/types/shared_params/__init__.py
index 3a0842e8..894d8a8d 100644
--- a/src/llama_stack_client/types/shared_params/__init__.py
+++ b/src/llama_stack_client/types/shared_params/__init__.py
@@ -11,7 +11,5 @@
from .sampling_params import SamplingParams as SamplingParams
from .completion_message import CompletionMessage as CompletionMessage
from .interleaved_content import InterleavedContent as InterleavedContent
-from .tool_param_definition import ToolParamDefinition as ToolParamDefinition
from .tool_response_message import ToolResponseMessage as ToolResponseMessage
-from .query_generator_config import QueryGeneratorConfig as QueryGeneratorConfig
from .interleaved_content_item import InterleavedContentItem as InterleavedContentItem
diff --git a/src/llama_stack_client/types/shared_params/query_config.py b/src/llama_stack_client/types/shared_params/query_config.py
index d008c48c..91a5b596 100644
--- a/src/llama_stack_client/types/shared_params/query_config.py
+++ b/src/llama_stack_client/types/shared_params/query_config.py
@@ -5,9 +5,39 @@
from typing import Union
from typing_extensions import Literal, Required, TypeAlias, TypedDict
-from .query_generator_config import QueryGeneratorConfig
+__all__ = [
+ "QueryConfig",
+ "QueryGeneratorConfig",
+ "QueryGeneratorConfigDefaultRagQueryGeneratorConfig",
+ "QueryGeneratorConfigLlmragQueryGeneratorConfig",
+ "Ranker",
+ "RankerRrfRanker",
+ "RankerWeightedRanker",
+]
-__all__ = ["QueryConfig", "Ranker", "RankerRrfRanker", "RankerWeightedRanker"]
+
+class QueryGeneratorConfigDefaultRagQueryGeneratorConfig(TypedDict, total=False):
+ separator: Required[str]
+ """String separator used to join query terms"""
+
+ type: Required[Literal["default"]]
+ """Type of query generator, always 'default'"""
+
+
+class QueryGeneratorConfigLlmragQueryGeneratorConfig(TypedDict, total=False):
+ model: Required[str]
+ """Name of the language model to use for query generation"""
+
+ template: Required[str]
+ """Template string for formatting the query generation prompt"""
+
+ type: Required[Literal["llm"]]
+ """Type of query generator, always 'llm'"""
+
+
+QueryGeneratorConfig: TypeAlias = Union[
+ QueryGeneratorConfigDefaultRagQueryGeneratorConfig, QueryGeneratorConfigLlmragQueryGeneratorConfig
+]
class RankerRrfRanker(TypedDict, total=False):
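
On the request side the generator config is now a plain TypedDict union defined in this module, so either variant can be written as a dict wherever the params-side `QueryGeneratorConfig` is expected. The model ID and template below are placeholders:

```python
default_generator = {"type": "default", "separator": " "}

llm_generator = {
    "type": "llm",
    "model": "my-model-id",  # placeholder model identifier
    "template": "Rewrite as a search query: {messages}",  # placeholder template
}
```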
diff --git a/src/llama_stack_client/types/shared_params/query_generator_config.py b/src/llama_stack_client/types/shared_params/query_generator_config.py
deleted file mode 100644
index 8c589bf9..00000000
--- a/src/llama_stack_client/types/shared_params/query_generator_config.py
+++ /dev/null
@@ -1,30 +0,0 @@
-# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
-
-from __future__ import annotations
-
-from typing import Union
-from typing_extensions import Literal, Required, TypeAlias, TypedDict
-
-__all__ = ["QueryGeneratorConfig", "DefaultRagQueryGeneratorConfig", "LlmragQueryGeneratorConfig"]
-
-
-class DefaultRagQueryGeneratorConfig(TypedDict, total=False):
- separator: Required[str]
- """String separator used to join query terms"""
-
- type: Required[Literal["default"]]
- """Type of query generator, always 'default'"""
-
-
-class LlmragQueryGeneratorConfig(TypedDict, total=False):
- model: Required[str]
- """Name of the language model to use for query generation"""
-
- template: Required[str]
- """Template string for formatting the query generation prompt"""
-
- type: Required[Literal["llm"]]
- """Type of query generator, always 'llm'"""
-
-
-QueryGeneratorConfig: TypeAlias = Union[DefaultRagQueryGeneratorConfig, LlmragQueryGeneratorConfig]
diff --git a/src/llama_stack_client/types/shared_params/tool_param_definition.py b/src/llama_stack_client/types/shared_params/tool_param_definition.py
deleted file mode 100644
index 2d7805fe..00000000
--- a/src/llama_stack_client/types/shared_params/tool_param_definition.py
+++ /dev/null
@@ -1,18 +0,0 @@
-# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
-
-from __future__ import annotations
-
-from typing import Union, Iterable
-from typing_extensions import Required, TypedDict
-
-__all__ = ["ToolParamDefinition"]
-
-
-class ToolParamDefinition(TypedDict, total=False):
- param_type: Required[str]
-
- default: Union[bool, float, str, Iterable[object], object, None]
-
- description: str
-
- required: bool
diff --git a/src/llama_stack_client/types/token_log_probs.py b/src/llama_stack_client/types/token_log_probs.py
deleted file mode 100644
index b1a0a2b4..00000000
--- a/src/llama_stack_client/types/token_log_probs.py
+++ /dev/null
@@ -1,12 +0,0 @@
-# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
-
-from typing import Dict
-
-from .._models import BaseModel
-
-__all__ = ["TokenLogProbs"]
-
-
-class TokenLogProbs(BaseModel):
- logprobs_by_token: Dict[str, float]
- """Dictionary mapping tokens to their log probabilities"""
diff --git a/src/llama_stack_client/types/tool.py b/src/llama_stack_client/types/tool.py
index c6994268..a7243b64 100644
--- a/src/llama_stack_client/types/tool.py
+++ b/src/llama_stack_client/types/tool.py
@@ -24,6 +24,12 @@ class Parameter(BaseModel):
default: Union[bool, float, str, List[object], object, None] = None
"""(Optional) Default value for the parameter if not provided"""
+ items: Optional[object] = None
+ """Type of the elements when parameter_type is array"""
+
+ title: Optional[str] = None
+ """(Optional) Title of the parameter"""
+
class Tool(BaseModel):
description: str
diff --git a/src/llama_stack_client/types/tool_def.py b/src/llama_stack_client/types/tool_def.py
index c82a9b8a..21949b41 100644
--- a/src/llama_stack_client/types/tool_def.py
+++ b/src/llama_stack_client/types/tool_def.py
@@ -23,6 +23,12 @@ class Parameter(BaseModel):
default: Union[bool, float, str, List[object], object, None] = None
"""(Optional) Default value for the parameter if not provided"""
+ items: Optional[object] = None
+ """Type of the elements when parameter_type is array"""
+
+ title: Optional[str] = None
+ """(Optional) Title of the parameter"""
+
class ToolDef(BaseModel):
name: str
diff --git a/src/llama_stack_client/types/tool_def_param.py b/src/llama_stack_client/types/tool_def_param.py
index 93ad8285..a50437b2 100644
--- a/src/llama_stack_client/types/tool_def_param.py
+++ b/src/llama_stack_client/types/tool_def_param.py
@@ -24,6 +24,12 @@ class Parameter(TypedDict, total=False):
default: Union[bool, float, str, Iterable[object], object, None]
"""(Optional) Default value for the parameter if not provided"""
+ items: object
+ """Type of the elements when parameter_type is array"""
+
+ title: str
+ """(Optional) Title of the parameter"""
+
class ToolDefParam(TypedDict, total=False):
name: Required[str]
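
The new `items` and `title` fields added to `Tool`, `ToolDef`, and `ToolDefParam` parameters above can be supplied from a params dict. A hedged sketch; the keys follow the `Parameter` TypedDict and the test payload later in this diff, while the tool parameter itself is hypothetical:

```python
# Hypothetical array-typed parameter for a custom tool definition.
parameter = {
    "name": "order_ids",
    "parameter_type": "array",
    "description": "Order identifiers to look up",
    "required": True,
    "default": None,
    "items": {"type": "string"},  # element schema when parameter_type is "array"
    "title": "Order IDs",
}
```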
diff --git a/tests/api_resources/models/test_openai.py b/tests/api_resources/models/test_openai.py
index ea64cce2..f94d2bf6 100644
--- a/tests/api_resources/models/test_openai.py
+++ b/tests/api_resources/models/test_openai.py
@@ -9,7 +9,7 @@
from tests.utils import assert_matches_type
from llama_stack_client import LlamaStackClient, AsyncLlamaStackClient
-from llama_stack_client.types.models import OpenAIListResponse
+from llama_stack_client.types import ModelListResponse
base_url = os.environ.get("TEST_API_BASE_URL", "http://127.0.0.1:4010")
@@ -20,7 +20,7 @@ class TestOpenAI:
@parametrize
def test_method_list(self, client: LlamaStackClient) -> None:
openai = client.models.openai.list()
- assert_matches_type(OpenAIListResponse, openai, path=["response"])
+ assert_matches_type(ModelListResponse, openai, path=["response"])
@parametrize
def test_raw_response_list(self, client: LlamaStackClient) -> None:
@@ -29,7 +29,7 @@ def test_raw_response_list(self, client: LlamaStackClient) -> None:
assert response.is_closed is True
assert response.http_request.headers.get("X-Stainless-Lang") == "python"
openai = response.parse()
- assert_matches_type(OpenAIListResponse, openai, path=["response"])
+ assert_matches_type(ModelListResponse, openai, path=["response"])
@parametrize
def test_streaming_response_list(self, client: LlamaStackClient) -> None:
@@ -38,7 +38,7 @@ def test_streaming_response_list(self, client: LlamaStackClient) -> None:
assert response.http_request.headers.get("X-Stainless-Lang") == "python"
openai = response.parse()
- assert_matches_type(OpenAIListResponse, openai, path=["response"])
+ assert_matches_type(ModelListResponse, openai, path=["response"])
assert cast(Any, response.is_closed) is True
@@ -51,7 +51,7 @@ class TestAsyncOpenAI:
@parametrize
async def test_method_list(self, async_client: AsyncLlamaStackClient) -> None:
openai = await async_client.models.openai.list()
- assert_matches_type(OpenAIListResponse, openai, path=["response"])
+ assert_matches_type(ModelListResponse, openai, path=["response"])
@parametrize
async def test_raw_response_list(self, async_client: AsyncLlamaStackClient) -> None:
@@ -60,7 +60,7 @@ async def test_raw_response_list(self, async_client: AsyncLlamaStackClient) -> N
assert response.is_closed is True
assert response.http_request.headers.get("X-Stainless-Lang") == "python"
openai = await response.parse()
- assert_matches_type(OpenAIListResponse, openai, path=["response"])
+ assert_matches_type(ModelListResponse, openai, path=["response"])
@parametrize
async def test_streaming_response_list(self, async_client: AsyncLlamaStackClient) -> None:
@@ -69,6 +69,6 @@ async def test_streaming_response_list(self, async_client: AsyncLlamaStackClient
assert response.http_request.headers.get("X-Stainless-Lang") == "python"
openai = await response.parse()
- assert_matches_type(OpenAIListResponse, openai, path=["response"])
+ assert_matches_type(ModelListResponse, openai, path=["response"])
assert cast(Any, response.is_closed) is True
diff --git a/tests/api_resources/test_agents.py b/tests/api_resources/test_agents.py
index 18b34012..c19bc9bf 100644
--- a/tests/api_resources/test_agents.py
+++ b/tests/api_resources/test_agents.py
@@ -49,6 +49,8 @@ def test_method_create_with_all_params(self, client: LlamaStackClient) -> None:
"parameter_type": "parameter_type",
"required": True,
"default": True,
+ "items": {},
+ "title": "title",
}
],
}
@@ -253,6 +255,8 @@ async def test_method_create_with_all_params(self, async_client: AsyncLlamaStack
"parameter_type": "parameter_type",
"required": True,
"default": True,
+ "items": {},
+ "title": "title",
}
],
}
diff --git a/tests/api_resources/test_files.py b/tests/api_resources/test_files.py
index d9b29ffc..83b763ab 100644
--- a/tests/api_resources/test_files.py
+++ b/tests/api_resources/test_files.py
@@ -26,6 +26,18 @@ def test_method_create(self, client: LlamaStackClient) -> None:
)
assert_matches_type(File, file, path=["response"])
+ @parametrize
+ def test_method_create_with_all_params(self, client: LlamaStackClient) -> None:
+ file = client.files.create(
+ file=b"raw file contents",
+ purpose="assistants",
+ expires_after={
+ "anchor": "created_at",
+ "seconds": 0,
+ },
+ )
+ assert_matches_type(File, file, path=["response"])
+
@parametrize
def test_raw_response_create(self, client: LlamaStackClient) -> None:
response = client.files.with_raw_response.create(
@@ -215,6 +227,18 @@ async def test_method_create(self, async_client: AsyncLlamaStackClient) -> None:
)
assert_matches_type(File, file, path=["response"])
+ @parametrize
+ async def test_method_create_with_all_params(self, async_client: AsyncLlamaStackClient) -> None:
+ file = await async_client.files.create(
+ file=b"raw file contents",
+ purpose="assistants",
+ expires_after={
+ "anchor": "created_at",
+ "seconds": 0,
+ },
+ )
+ assert_matches_type(File, file, path=["response"])
+
@parametrize
async def test_raw_response_create(self, async_client: AsyncLlamaStackClient) -> None:
response = await async_client.files.with_raw_response.create(
diff --git a/tests/api_resources/test_inference.py b/tests/api_resources/test_inference.py
index 474ff7cf..f26802c2 100644
--- a/tests/api_resources/test_inference.py
+++ b/tests/api_resources/test_inference.py
@@ -9,15 +9,7 @@
from tests.utils import assert_matches_type
from llama_stack_client import LlamaStackClient, AsyncLlamaStackClient
-from llama_stack_client.types import (
- CompletionResponse,
- EmbeddingsResponse,
- InferenceRerankResponse,
- InferenceBatchChatCompletionResponse,
-)
-from llama_stack_client.types.shared import BatchCompletion, ChatCompletionResponse
-
-# pyright: reportDeprecated=false
+from llama_stack_client.types import InferenceRerankResponse
base_url = os.environ.get("TEST_API_BASE_URL", "http://127.0.0.1:4010")
@@ -25,539 +17,6 @@
class TestInference:
parametrize = pytest.mark.parametrize("client", [False, True], indirect=True, ids=["loose", "strict"])
- @parametrize
- def test_method_batch_chat_completion(self, client: LlamaStackClient) -> None:
- inference = client.inference.batch_chat_completion(
- messages_batch=[
- [
- {
- "content": "string",
- "role": "user",
- }
- ]
- ],
- model_id="model_id",
- )
- assert_matches_type(InferenceBatchChatCompletionResponse, inference, path=["response"])
-
- @parametrize
- def test_method_batch_chat_completion_with_all_params(self, client: LlamaStackClient) -> None:
- inference = client.inference.batch_chat_completion(
- messages_batch=[
- [
- {
- "content": "string",
- "role": "user",
- "context": "string",
- }
- ]
- ],
- model_id="model_id",
- logprobs={"top_k": 0},
- response_format={
- "json_schema": {"foo": True},
- "type": "json_schema",
- },
- sampling_params={
- "strategy": {"type": "greedy"},
- "max_tokens": 0,
- "repetition_penalty": 0,
- "stop": ["string"],
- },
- tool_config={
- "system_message_behavior": "append",
- "tool_choice": "auto",
- "tool_prompt_format": "json",
- },
- tools=[
- {
- "tool_name": "brave_search",
- "description": "description",
- "parameters": {
- "foo": {
- "param_type": "param_type",
- "default": True,
- "description": "description",
- "required": True,
- }
- },
- }
- ],
- )
- assert_matches_type(InferenceBatchChatCompletionResponse, inference, path=["response"])
-
- @parametrize
- def test_raw_response_batch_chat_completion(self, client: LlamaStackClient) -> None:
- response = client.inference.with_raw_response.batch_chat_completion(
- messages_batch=[
- [
- {
- "content": "string",
- "role": "user",
- }
- ]
- ],
- model_id="model_id",
- )
-
- assert response.is_closed is True
- assert response.http_request.headers.get("X-Stainless-Lang") == "python"
- inference = response.parse()
- assert_matches_type(InferenceBatchChatCompletionResponse, inference, path=["response"])
-
- @parametrize
- def test_streaming_response_batch_chat_completion(self, client: LlamaStackClient) -> None:
- with client.inference.with_streaming_response.batch_chat_completion(
- messages_batch=[
- [
- {
- "content": "string",
- "role": "user",
- }
- ]
- ],
- model_id="model_id",
- ) as response:
- assert not response.is_closed
- assert response.http_request.headers.get("X-Stainless-Lang") == "python"
-
- inference = response.parse()
- assert_matches_type(InferenceBatchChatCompletionResponse, inference, path=["response"])
-
- assert cast(Any, response.is_closed) is True
-
- @parametrize
- def test_method_batch_completion(self, client: LlamaStackClient) -> None:
- inference = client.inference.batch_completion(
- content_batch=["string"],
- model_id="model_id",
- )
- assert_matches_type(BatchCompletion, inference, path=["response"])
-
- @parametrize
- def test_method_batch_completion_with_all_params(self, client: LlamaStackClient) -> None:
- inference = client.inference.batch_completion(
- content_batch=["string"],
- model_id="model_id",
- logprobs={"top_k": 0},
- response_format={
- "json_schema": {"foo": True},
- "type": "json_schema",
- },
- sampling_params={
- "strategy": {"type": "greedy"},
- "max_tokens": 0,
- "repetition_penalty": 0,
- "stop": ["string"],
- },
- )
- assert_matches_type(BatchCompletion, inference, path=["response"])
-
- @parametrize
- def test_raw_response_batch_completion(self, client: LlamaStackClient) -> None:
- response = client.inference.with_raw_response.batch_completion(
- content_batch=["string"],
- model_id="model_id",
- )
-
- assert response.is_closed is True
- assert response.http_request.headers.get("X-Stainless-Lang") == "python"
- inference = response.parse()
- assert_matches_type(BatchCompletion, inference, path=["response"])
-
- @parametrize
- def test_streaming_response_batch_completion(self, client: LlamaStackClient) -> None:
- with client.inference.with_streaming_response.batch_completion(
- content_batch=["string"],
- model_id="model_id",
- ) as response:
- assert not response.is_closed
- assert response.http_request.headers.get("X-Stainless-Lang") == "python"
-
- inference = response.parse()
- assert_matches_type(BatchCompletion, inference, path=["response"])
-
- assert cast(Any, response.is_closed) is True
-
- @parametrize
- def test_method_chat_completion_overload_1(self, client: LlamaStackClient) -> None:
- with pytest.warns(DeprecationWarning):
- inference = client.inference.chat_completion(
- messages=[
- {
- "content": "string",
- "role": "user",
- }
- ],
- model_id="model_id",
- )
-
- assert_matches_type(ChatCompletionResponse, inference, path=["response"])
-
- @parametrize
- def test_method_chat_completion_with_all_params_overload_1(self, client: LlamaStackClient) -> None:
- with pytest.warns(DeprecationWarning):
- inference = client.inference.chat_completion(
- messages=[
- {
- "content": "string",
- "role": "user",
- "context": "string",
- }
- ],
- model_id="model_id",
- logprobs={"top_k": 0},
- response_format={
- "json_schema": {"foo": True},
- "type": "json_schema",
- },
- sampling_params={
- "strategy": {"type": "greedy"},
- "max_tokens": 0,
- "repetition_penalty": 0,
- "stop": ["string"],
- },
- stream=False,
- tool_choice="auto",
- tool_config={
- "system_message_behavior": "append",
- "tool_choice": "auto",
- "tool_prompt_format": "json",
- },
- tool_prompt_format="json",
- tools=[
- {
- "tool_name": "brave_search",
- "description": "description",
- "parameters": {
- "foo": {
- "param_type": "param_type",
- "default": True,
- "description": "description",
- "required": True,
- }
- },
- }
- ],
- )
-
- assert_matches_type(ChatCompletionResponse, inference, path=["response"])
-
- @parametrize
- def test_raw_response_chat_completion_overload_1(self, client: LlamaStackClient) -> None:
- with pytest.warns(DeprecationWarning):
- response = client.inference.with_raw_response.chat_completion(
- messages=[
- {
- "content": "string",
- "role": "user",
- }
- ],
- model_id="model_id",
- )
-
- assert response.is_closed is True
- assert response.http_request.headers.get("X-Stainless-Lang") == "python"
- inference = response.parse()
- assert_matches_type(ChatCompletionResponse, inference, path=["response"])
-
- @parametrize
- def test_streaming_response_chat_completion_overload_1(self, client: LlamaStackClient) -> None:
- with pytest.warns(DeprecationWarning):
- with client.inference.with_streaming_response.chat_completion(
- messages=[
- {
- "content": "string",
- "role": "user",
- }
- ],
- model_id="model_id",
- ) as response:
- assert not response.is_closed
- assert response.http_request.headers.get("X-Stainless-Lang") == "python"
-
- inference = response.parse()
- assert_matches_type(ChatCompletionResponse, inference, path=["response"])
-
- assert cast(Any, response.is_closed) is True
-
- @parametrize
- def test_method_chat_completion_overload_2(self, client: LlamaStackClient) -> None:
- with pytest.warns(DeprecationWarning):
- inference_stream = client.inference.chat_completion(
- messages=[
- {
- "content": "string",
- "role": "user",
- }
- ],
- model_id="model_id",
- stream=True,
- )
-
- inference_stream.response.close()
-
- @parametrize
- def test_method_chat_completion_with_all_params_overload_2(self, client: LlamaStackClient) -> None:
- with pytest.warns(DeprecationWarning):
- inference_stream = client.inference.chat_completion(
- messages=[
- {
- "content": "string",
- "role": "user",
- "context": "string",
- }
- ],
- model_id="model_id",
- stream=True,
- logprobs={"top_k": 0},
- response_format={
- "json_schema": {"foo": True},
- "type": "json_schema",
- },
- sampling_params={
- "strategy": {"type": "greedy"},
- "max_tokens": 0,
- "repetition_penalty": 0,
- "stop": ["string"],
- },
- tool_choice="auto",
- tool_config={
- "system_message_behavior": "append",
- "tool_choice": "auto",
- "tool_prompt_format": "json",
- },
- tool_prompt_format="json",
- tools=[
- {
- "tool_name": "brave_search",
- "description": "description",
- "parameters": {
- "foo": {
- "param_type": "param_type",
- "default": True,
- "description": "description",
- "required": True,
- }
- },
- }
- ],
- )
-
- inference_stream.response.close()
-
- @parametrize
- def test_raw_response_chat_completion_overload_2(self, client: LlamaStackClient) -> None:
- with pytest.warns(DeprecationWarning):
- response = client.inference.with_raw_response.chat_completion(
- messages=[
- {
- "content": "string",
- "role": "user",
- }
- ],
- model_id="model_id",
- stream=True,
- )
-
- assert response.http_request.headers.get("X-Stainless-Lang") == "python"
- stream = response.parse()
- stream.close()
-
- @parametrize
- def test_streaming_response_chat_completion_overload_2(self, client: LlamaStackClient) -> None:
- with pytest.warns(DeprecationWarning):
- with client.inference.with_streaming_response.chat_completion(
- messages=[
- {
- "content": "string",
- "role": "user",
- }
- ],
- model_id="model_id",
- stream=True,
- ) as response:
- assert not response.is_closed
- assert response.http_request.headers.get("X-Stainless-Lang") == "python"
-
- stream = response.parse()
- stream.close()
-
- assert cast(Any, response.is_closed) is True
-
- @parametrize
- def test_method_completion_overload_1(self, client: LlamaStackClient) -> None:
- with pytest.warns(DeprecationWarning):
- inference = client.inference.completion(
- content="string",
- model_id="model_id",
- )
-
- assert_matches_type(CompletionResponse, inference, path=["response"])
-
- @parametrize
- def test_method_completion_with_all_params_overload_1(self, client: LlamaStackClient) -> None:
- with pytest.warns(DeprecationWarning):
- inference = client.inference.completion(
- content="string",
- model_id="model_id",
- logprobs={"top_k": 0},
- response_format={
- "json_schema": {"foo": True},
- "type": "json_schema",
- },
- sampling_params={
- "strategy": {"type": "greedy"},
- "max_tokens": 0,
- "repetition_penalty": 0,
- "stop": ["string"],
- },
- stream=False,
- )
-
- assert_matches_type(CompletionResponse, inference, path=["response"])
-
- @parametrize
- def test_raw_response_completion_overload_1(self, client: LlamaStackClient) -> None:
- with pytest.warns(DeprecationWarning):
- response = client.inference.with_raw_response.completion(
- content="string",
- model_id="model_id",
- )
-
- assert response.is_closed is True
- assert response.http_request.headers.get("X-Stainless-Lang") == "python"
- inference = response.parse()
- assert_matches_type(CompletionResponse, inference, path=["response"])
-
- @parametrize
- def test_streaming_response_completion_overload_1(self, client: LlamaStackClient) -> None:
- with pytest.warns(DeprecationWarning):
- with client.inference.with_streaming_response.completion(
- content="string",
- model_id="model_id",
- ) as response:
- assert not response.is_closed
- assert response.http_request.headers.get("X-Stainless-Lang") == "python"
-
- inference = response.parse()
- assert_matches_type(CompletionResponse, inference, path=["response"])
-
- assert cast(Any, response.is_closed) is True
-
- @parametrize
- def test_method_completion_overload_2(self, client: LlamaStackClient) -> None:
- with pytest.warns(DeprecationWarning):
- inference_stream = client.inference.completion(
- content="string",
- model_id="model_id",
- stream=True,
- )
-
- inference_stream.response.close()
-
- @parametrize
- def test_method_completion_with_all_params_overload_2(self, client: LlamaStackClient) -> None:
- with pytest.warns(DeprecationWarning):
- inference_stream = client.inference.completion(
- content="string",
- model_id="model_id",
- stream=True,
- logprobs={"top_k": 0},
- response_format={
- "json_schema": {"foo": True},
- "type": "json_schema",
- },
- sampling_params={
- "strategy": {"type": "greedy"},
- "max_tokens": 0,
- "repetition_penalty": 0,
- "stop": ["string"],
- },
- )
-
- inference_stream.response.close()
-
- @parametrize
- def test_raw_response_completion_overload_2(self, client: LlamaStackClient) -> None:
- with pytest.warns(DeprecationWarning):
- response = client.inference.with_raw_response.completion(
- content="string",
- model_id="model_id",
- stream=True,
- )
-
- assert response.http_request.headers.get("X-Stainless-Lang") == "python"
- stream = response.parse()
- stream.close()
-
- @parametrize
- def test_streaming_response_completion_overload_2(self, client: LlamaStackClient) -> None:
- with pytest.warns(DeprecationWarning):
- with client.inference.with_streaming_response.completion(
- content="string",
- model_id="model_id",
- stream=True,
- ) as response:
- assert not response.is_closed
- assert response.http_request.headers.get("X-Stainless-Lang") == "python"
-
- stream = response.parse()
- stream.close()
-
- assert cast(Any, response.is_closed) is True
-
- @parametrize
- def test_method_embeddings(self, client: LlamaStackClient) -> None:
- with pytest.warns(DeprecationWarning):
- inference = client.inference.embeddings(
- contents=["string"],
- model_id="model_id",
- )
-
- assert_matches_type(EmbeddingsResponse, inference, path=["response"])
-
- @parametrize
- def test_method_embeddings_with_all_params(self, client: LlamaStackClient) -> None:
- with pytest.warns(DeprecationWarning):
- inference = client.inference.embeddings(
- contents=["string"],
- model_id="model_id",
- output_dimension=0,
- task_type="query",
- text_truncation="none",
- )
-
- assert_matches_type(EmbeddingsResponse, inference, path=["response"])
-
- @parametrize
- def test_raw_response_embeddings(self, client: LlamaStackClient) -> None:
- with pytest.warns(DeprecationWarning):
- response = client.inference.with_raw_response.embeddings(
- contents=["string"],
- model_id="model_id",
- )
-
- assert response.is_closed is True
- assert response.http_request.headers.get("X-Stainless-Lang") == "python"
- inference = response.parse()
- assert_matches_type(EmbeddingsResponse, inference, path=["response"])
-
- @parametrize
- def test_streaming_response_embeddings(self, client: LlamaStackClient) -> None:
- with pytest.warns(DeprecationWarning):
- with client.inference.with_streaming_response.embeddings(
- contents=["string"],
- model_id="model_id",
- ) as response:
- assert not response.is_closed
- assert response.http_request.headers.get("X-Stainless-Lang") == "python"
-
- inference = response.parse()
- assert_matches_type(EmbeddingsResponse, inference, path=["response"])
-
- assert cast(Any, response.is_closed) is True
-
@parametrize
def test_method_rerank(self, client: LlamaStackClient) -> None:
inference = client.inference.rerank(
@@ -611,539 +70,6 @@ class TestAsyncInference:
"async_client", [False, True, {"http_client": "aiohttp"}], indirect=True, ids=["loose", "strict", "aiohttp"]
)
- @parametrize
- async def test_method_batch_chat_completion(self, async_client: AsyncLlamaStackClient) -> None:
- inference = await async_client.inference.batch_chat_completion(
- messages_batch=[
- [
- {
- "content": "string",
- "role": "user",
- }
- ]
- ],
- model_id="model_id",
- )
- assert_matches_type(InferenceBatchChatCompletionResponse, inference, path=["response"])
-
- @parametrize
- async def test_method_batch_chat_completion_with_all_params(self, async_client: AsyncLlamaStackClient) -> None:
- inference = await async_client.inference.batch_chat_completion(
- messages_batch=[
- [
- {
- "content": "string",
- "role": "user",
- "context": "string",
- }
- ]
- ],
- model_id="model_id",
- logprobs={"top_k": 0},
- response_format={
- "json_schema": {"foo": True},
- "type": "json_schema",
- },
- sampling_params={
- "strategy": {"type": "greedy"},
- "max_tokens": 0,
- "repetition_penalty": 0,
- "stop": ["string"],
- },
- tool_config={
- "system_message_behavior": "append",
- "tool_choice": "auto",
- "tool_prompt_format": "json",
- },
- tools=[
- {
- "tool_name": "brave_search",
- "description": "description",
- "parameters": {
- "foo": {
- "param_type": "param_type",
- "default": True,
- "description": "description",
- "required": True,
- }
- },
- }
- ],
- )
- assert_matches_type(InferenceBatchChatCompletionResponse, inference, path=["response"])
-
- @parametrize
- async def test_raw_response_batch_chat_completion(self, async_client: AsyncLlamaStackClient) -> None:
- response = await async_client.inference.with_raw_response.batch_chat_completion(
- messages_batch=[
- [
- {
- "content": "string",
- "role": "user",
- }
- ]
- ],
- model_id="model_id",
- )
-
- assert response.is_closed is True
- assert response.http_request.headers.get("X-Stainless-Lang") == "python"
- inference = await response.parse()
- assert_matches_type(InferenceBatchChatCompletionResponse, inference, path=["response"])
-
- @parametrize
- async def test_streaming_response_batch_chat_completion(self, async_client: AsyncLlamaStackClient) -> None:
- async with async_client.inference.with_streaming_response.batch_chat_completion(
- messages_batch=[
- [
- {
- "content": "string",
- "role": "user",
- }
- ]
- ],
- model_id="model_id",
- ) as response:
- assert not response.is_closed
- assert response.http_request.headers.get("X-Stainless-Lang") == "python"
-
- inference = await response.parse()
- assert_matches_type(InferenceBatchChatCompletionResponse, inference, path=["response"])
-
- assert cast(Any, response.is_closed) is True
-
- @parametrize
- async def test_method_batch_completion(self, async_client: AsyncLlamaStackClient) -> None:
- inference = await async_client.inference.batch_completion(
- content_batch=["string"],
- model_id="model_id",
- )
- assert_matches_type(BatchCompletion, inference, path=["response"])
-
- @parametrize
- async def test_method_batch_completion_with_all_params(self, async_client: AsyncLlamaStackClient) -> None:
- inference = await async_client.inference.batch_completion(
- content_batch=["string"],
- model_id="model_id",
- logprobs={"top_k": 0},
- response_format={
- "json_schema": {"foo": True},
- "type": "json_schema",
- },
- sampling_params={
- "strategy": {"type": "greedy"},
- "max_tokens": 0,
- "repetition_penalty": 0,
- "stop": ["string"],
- },
- )
- assert_matches_type(BatchCompletion, inference, path=["response"])
-
- @parametrize
- async def test_raw_response_batch_completion(self, async_client: AsyncLlamaStackClient) -> None:
- response = await async_client.inference.with_raw_response.batch_completion(
- content_batch=["string"],
- model_id="model_id",
- )
-
- assert response.is_closed is True
- assert response.http_request.headers.get("X-Stainless-Lang") == "python"
- inference = await response.parse()
- assert_matches_type(BatchCompletion, inference, path=["response"])
-
- @parametrize
- async def test_streaming_response_batch_completion(self, async_client: AsyncLlamaStackClient) -> None:
- async with async_client.inference.with_streaming_response.batch_completion(
- content_batch=["string"],
- model_id="model_id",
- ) as response:
- assert not response.is_closed
- assert response.http_request.headers.get("X-Stainless-Lang") == "python"
-
- inference = await response.parse()
- assert_matches_type(BatchCompletion, inference, path=["response"])
-
- assert cast(Any, response.is_closed) is True
-
- @parametrize
- async def test_method_chat_completion_overload_1(self, async_client: AsyncLlamaStackClient) -> None:
- with pytest.warns(DeprecationWarning):
- inference = await async_client.inference.chat_completion(
- messages=[
- {
- "content": "string",
- "role": "user",
- }
- ],
- model_id="model_id",
- )
-
- assert_matches_type(ChatCompletionResponse, inference, path=["response"])
-
- @parametrize
- async def test_method_chat_completion_with_all_params_overload_1(self, async_client: AsyncLlamaStackClient) -> None:
- with pytest.warns(DeprecationWarning):
- inference = await async_client.inference.chat_completion(
- messages=[
- {
- "content": "string",
- "role": "user",
- "context": "string",
- }
- ],
- model_id="model_id",
- logprobs={"top_k": 0},
- response_format={
- "json_schema": {"foo": True},
- "type": "json_schema",
- },
- sampling_params={
- "strategy": {"type": "greedy"},
- "max_tokens": 0,
- "repetition_penalty": 0,
- "stop": ["string"],
- },
- stream=False,
- tool_choice="auto",
- tool_config={
- "system_message_behavior": "append",
- "tool_choice": "auto",
- "tool_prompt_format": "json",
- },
- tool_prompt_format="json",
- tools=[
- {
- "tool_name": "brave_search",
- "description": "description",
- "parameters": {
- "foo": {
- "param_type": "param_type",
- "default": True,
- "description": "description",
- "required": True,
- }
- },
- }
- ],
- )
-
- assert_matches_type(ChatCompletionResponse, inference, path=["response"])
-
- @parametrize
- async def test_raw_response_chat_completion_overload_1(self, async_client: AsyncLlamaStackClient) -> None:
- with pytest.warns(DeprecationWarning):
- response = await async_client.inference.with_raw_response.chat_completion(
- messages=[
- {
- "content": "string",
- "role": "user",
- }
- ],
- model_id="model_id",
- )
-
- assert response.is_closed is True
- assert response.http_request.headers.get("X-Stainless-Lang") == "python"
- inference = await response.parse()
- assert_matches_type(ChatCompletionResponse, inference, path=["response"])
-
- @parametrize
- async def test_streaming_response_chat_completion_overload_1(self, async_client: AsyncLlamaStackClient) -> None:
- with pytest.warns(DeprecationWarning):
- async with async_client.inference.with_streaming_response.chat_completion(
- messages=[
- {
- "content": "string",
- "role": "user",
- }
- ],
- model_id="model_id",
- ) as response:
- assert not response.is_closed
- assert response.http_request.headers.get("X-Stainless-Lang") == "python"
-
- inference = await response.parse()
- assert_matches_type(ChatCompletionResponse, inference, path=["response"])
-
- assert cast(Any, response.is_closed) is True
-
- @parametrize
- async def test_method_chat_completion_overload_2(self, async_client: AsyncLlamaStackClient) -> None:
- with pytest.warns(DeprecationWarning):
- inference_stream = await async_client.inference.chat_completion(
- messages=[
- {
- "content": "string",
- "role": "user",
- }
- ],
- model_id="model_id",
- stream=True,
- )
-
- await inference_stream.response.aclose()
-
- @parametrize
- async def test_method_chat_completion_with_all_params_overload_2(self, async_client: AsyncLlamaStackClient) -> None:
- with pytest.warns(DeprecationWarning):
- inference_stream = await async_client.inference.chat_completion(
- messages=[
- {
- "content": "string",
- "role": "user",
- "context": "string",
- }
- ],
- model_id="model_id",
- stream=True,
- logprobs={"top_k": 0},
- response_format={
- "json_schema": {"foo": True},
- "type": "json_schema",
- },
- sampling_params={
- "strategy": {"type": "greedy"},
- "max_tokens": 0,
- "repetition_penalty": 0,
- "stop": ["string"],
- },
- tool_choice="auto",
- tool_config={
- "system_message_behavior": "append",
- "tool_choice": "auto",
- "tool_prompt_format": "json",
- },
- tool_prompt_format="json",
- tools=[
- {
- "tool_name": "brave_search",
- "description": "description",
- "parameters": {
- "foo": {
- "param_type": "param_type",
- "default": True,
- "description": "description",
- "required": True,
- }
- },
- }
- ],
- )
-
- await inference_stream.response.aclose()
-
- @parametrize
- async def test_raw_response_chat_completion_overload_2(self, async_client: AsyncLlamaStackClient) -> None:
- with pytest.warns(DeprecationWarning):
- response = await async_client.inference.with_raw_response.chat_completion(
- messages=[
- {
- "content": "string",
- "role": "user",
- }
- ],
- model_id="model_id",
- stream=True,
- )
-
- assert response.http_request.headers.get("X-Stainless-Lang") == "python"
- stream = await response.parse()
- await stream.close()
-
- @parametrize
- async def test_streaming_response_chat_completion_overload_2(self, async_client: AsyncLlamaStackClient) -> None:
- with pytest.warns(DeprecationWarning):
- async with async_client.inference.with_streaming_response.chat_completion(
- messages=[
- {
- "content": "string",
- "role": "user",
- }
- ],
- model_id="model_id",
- stream=True,
- ) as response:
- assert not response.is_closed
- assert response.http_request.headers.get("X-Stainless-Lang") == "python"
-
- stream = await response.parse()
- await stream.close()
-
- assert cast(Any, response.is_closed) is True
-
- @parametrize
- async def test_method_completion_overload_1(self, async_client: AsyncLlamaStackClient) -> None:
- with pytest.warns(DeprecationWarning):
- inference = await async_client.inference.completion(
- content="string",
- model_id="model_id",
- )
-
- assert_matches_type(CompletionResponse, inference, path=["response"])
-
- @parametrize
- async def test_method_completion_with_all_params_overload_1(self, async_client: AsyncLlamaStackClient) -> None:
- with pytest.warns(DeprecationWarning):
- inference = await async_client.inference.completion(
- content="string",
- model_id="model_id",
- logprobs={"top_k": 0},
- response_format={
- "json_schema": {"foo": True},
- "type": "json_schema",
- },
- sampling_params={
- "strategy": {"type": "greedy"},
- "max_tokens": 0,
- "repetition_penalty": 0,
- "stop": ["string"],
- },
- stream=False,
- )
-
- assert_matches_type(CompletionResponse, inference, path=["response"])
-
- @parametrize
- async def test_raw_response_completion_overload_1(self, async_client: AsyncLlamaStackClient) -> None:
- with pytest.warns(DeprecationWarning):
- response = await async_client.inference.with_raw_response.completion(
- content="string",
- model_id="model_id",
- )
-
- assert response.is_closed is True
- assert response.http_request.headers.get("X-Stainless-Lang") == "python"
- inference = await response.parse()
- assert_matches_type(CompletionResponse, inference, path=["response"])
-
- @parametrize
- async def test_streaming_response_completion_overload_1(self, async_client: AsyncLlamaStackClient) -> None:
- with pytest.warns(DeprecationWarning):
- async with async_client.inference.with_streaming_response.completion(
- content="string",
- model_id="model_id",
- ) as response:
- assert not response.is_closed
- assert response.http_request.headers.get("X-Stainless-Lang") == "python"
-
- inference = await response.parse()
- assert_matches_type(CompletionResponse, inference, path=["response"])
-
- assert cast(Any, response.is_closed) is True
-
- @parametrize
- async def test_method_completion_overload_2(self, async_client: AsyncLlamaStackClient) -> None:
- with pytest.warns(DeprecationWarning):
- inference_stream = await async_client.inference.completion(
- content="string",
- model_id="model_id",
- stream=True,
- )
-
- await inference_stream.response.aclose()
-
- @parametrize
- async def test_method_completion_with_all_params_overload_2(self, async_client: AsyncLlamaStackClient) -> None:
- with pytest.warns(DeprecationWarning):
- inference_stream = await async_client.inference.completion(
- content="string",
- model_id="model_id",
- stream=True,
- logprobs={"top_k": 0},
- response_format={
- "json_schema": {"foo": True},
- "type": "json_schema",
- },
- sampling_params={
- "strategy": {"type": "greedy"},
- "max_tokens": 0,
- "repetition_penalty": 0,
- "stop": ["string"],
- },
- )
-
- await inference_stream.response.aclose()
-
- @parametrize
- async def test_raw_response_completion_overload_2(self, async_client: AsyncLlamaStackClient) -> None:
- with pytest.warns(DeprecationWarning):
- response = await async_client.inference.with_raw_response.completion(
- content="string",
- model_id="model_id",
- stream=True,
- )
-
- assert response.http_request.headers.get("X-Stainless-Lang") == "python"
- stream = await response.parse()
- await stream.close()
-
- @parametrize
- async def test_streaming_response_completion_overload_2(self, async_client: AsyncLlamaStackClient) -> None:
- with pytest.warns(DeprecationWarning):
- async with async_client.inference.with_streaming_response.completion(
- content="string",
- model_id="model_id",
- stream=True,
- ) as response:
- assert not response.is_closed
- assert response.http_request.headers.get("X-Stainless-Lang") == "python"
-
- stream = await response.parse()
- await stream.close()
-
- assert cast(Any, response.is_closed) is True
-
- @parametrize
- async def test_method_embeddings(self, async_client: AsyncLlamaStackClient) -> None:
- with pytest.warns(DeprecationWarning):
- inference = await async_client.inference.embeddings(
- contents=["string"],
- model_id="model_id",
- )
-
- assert_matches_type(EmbeddingsResponse, inference, path=["response"])
-
- @parametrize
- async def test_method_embeddings_with_all_params(self, async_client: AsyncLlamaStackClient) -> None:
- with pytest.warns(DeprecationWarning):
- inference = await async_client.inference.embeddings(
- contents=["string"],
- model_id="model_id",
- output_dimension=0,
- task_type="query",
- text_truncation="none",
- )
-
- assert_matches_type(EmbeddingsResponse, inference, path=["response"])
-
- @parametrize
- async def test_raw_response_embeddings(self, async_client: AsyncLlamaStackClient) -> None:
- with pytest.warns(DeprecationWarning):
- response = await async_client.inference.with_raw_response.embeddings(
- contents=["string"],
- model_id="model_id",
- )
-
- assert response.is_closed is True
- assert response.http_request.headers.get("X-Stainless-Lang") == "python"
- inference = await response.parse()
- assert_matches_type(EmbeddingsResponse, inference, path=["response"])
-
- @parametrize
- async def test_streaming_response_embeddings(self, async_client: AsyncLlamaStackClient) -> None:
- with pytest.warns(DeprecationWarning):
- async with async_client.inference.with_streaming_response.embeddings(
- contents=["string"],
- model_id="model_id",
- ) as response:
- assert not response.is_closed
- assert response.http_request.headers.get("X-Stainless-Lang") == "python"
-
- inference = await response.parse()
- assert_matches_type(EmbeddingsResponse, inference, path=["response"])
-
- assert cast(Any, response.is_closed) is True
-
@parametrize
async def test_method_rerank(self, async_client: AsyncLlamaStackClient) -> None:
inference = await async_client.inference.rerank(
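Note for reviewers: the deleted blocks above drop the sync and async test coverage for the deprecated `inference.batch_chat_completion`, `inference.batch_completion`, `inference.chat_completion`, `inference.completion`, and `inference.embeddings` methods. A minimal migration sketch for the chat path, mirroring the replacement calls exercised in the `tests/test_client.py` changes below (the model identifier is a placeholder, not a real model name):

```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient()

# Deprecated and removed in this change:
# client.inference.chat_completion(
#     messages=[{"content": "string", "role": "user"}],
#     model_id="model_id",
# )

# OpenAI-compatible replacement exercised by the updated tests:
completion = client.chat.completions.create(
    messages=[{"content": "string", "role": "user"}],
    model="model",  # placeholder model identifier
)
```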
diff --git a/tests/test_client.py b/tests/test_client.py
index a5bce12c..708c7420 100644
--- a/tests/test_client.py
+++ b/tests/test_client.py
@@ -678,17 +678,17 @@ def test_parse_retry_after_header(self, remaining_retries: int, retry_after: str
@mock.patch("llama_stack_client._base_client.BaseClient._calculate_retry_timeout", _low_retry_timeout)
@pytest.mark.respx(base_url=base_url)
def test_retrying_timeout_errors_doesnt_leak(self, respx_mock: MockRouter, client: LlamaStackClient) -> None:
- respx_mock.post("/v1/inference/chat-completion").mock(side_effect=httpx.TimeoutException("Test timeout error"))
+ respx_mock.post("/v1/chat/completions").mock(side_effect=httpx.TimeoutException("Test timeout error"))
with pytest.raises(APITimeoutError):
- client.inference.with_streaming_response.chat_completion(
+ client.chat.completions.with_streaming_response.create(
messages=[
{
"content": "string",
"role": "user",
}
],
- model_id="model_id",
+ model="model",
).__enter__()
assert _get_open_connections(self.client) == 0
@@ -696,17 +696,17 @@ def test_retrying_timeout_errors_doesnt_leak(self, respx_mock: MockRouter, clien
@mock.patch("llama_stack_client._base_client.BaseClient._calculate_retry_timeout", _low_retry_timeout)
@pytest.mark.respx(base_url=base_url)
def test_retrying_status_errors_doesnt_leak(self, respx_mock: MockRouter, client: LlamaStackClient) -> None:
- respx_mock.post("/v1/inference/chat-completion").mock(return_value=httpx.Response(500))
+ respx_mock.post("/v1/chat/completions").mock(return_value=httpx.Response(500))
with pytest.raises(APIStatusError):
- client.inference.with_streaming_response.chat_completion(
+ client.chat.completions.with_streaming_response.create(
messages=[
{
"content": "string",
"role": "user",
}
],
- model_id="model_id",
+ model="model",
).__enter__()
assert _get_open_connections(self.client) == 0
@@ -734,16 +734,16 @@ def retry_handler(_request: httpx.Request) -> httpx.Response:
return httpx.Response(500)
return httpx.Response(200)
- respx_mock.post("/v1/inference/chat-completion").mock(side_effect=retry_handler)
+ respx_mock.post("/v1/chat/completions").mock(side_effect=retry_handler)
- response = client.inference.with_raw_response.chat_completion(
+ response = client.chat.completions.with_raw_response.create(
messages=[
{
"content": "string",
"role": "user",
}
],
- model_id="model_id",
+ model="model",
)
assert response.retries_taken == failures_before_success
@@ -766,16 +766,16 @@ def retry_handler(_request: httpx.Request) -> httpx.Response:
return httpx.Response(500)
return httpx.Response(200)
- respx_mock.post("/v1/inference/chat-completion").mock(side_effect=retry_handler)
+ respx_mock.post("/v1/chat/completions").mock(side_effect=retry_handler)
- response = client.inference.with_raw_response.chat_completion(
+ response = client.chat.completions.with_raw_response.create(
messages=[
{
"content": "string",
"role": "user",
}
],
- model_id="model_id",
+ model="model",
extra_headers={"x-stainless-retry-count": Omit()},
)
@@ -798,16 +798,16 @@ def retry_handler(_request: httpx.Request) -> httpx.Response:
return httpx.Response(500)
return httpx.Response(200)
- respx_mock.post("/v1/inference/chat-completion").mock(side_effect=retry_handler)
+ respx_mock.post("/v1/chat/completions").mock(side_effect=retry_handler)
- response = client.inference.with_raw_response.chat_completion(
+ response = client.chat.completions.with_raw_response.create(
messages=[
{
"content": "string",
"role": "user",
}
],
- model_id="model_id",
+ model="model",
extra_headers={"x-stainless-retry-count": "42"},
)
@@ -1498,17 +1498,17 @@ async def test_parse_retry_after_header(self, remaining_retries: int, retry_afte
async def test_retrying_timeout_errors_doesnt_leak(
self, respx_mock: MockRouter, async_client: AsyncLlamaStackClient
) -> None:
- respx_mock.post("/v1/inference/chat-completion").mock(side_effect=httpx.TimeoutException("Test timeout error"))
+ respx_mock.post("/v1/chat/completions").mock(side_effect=httpx.TimeoutException("Test timeout error"))
with pytest.raises(APITimeoutError):
- await async_client.inference.with_streaming_response.chat_completion(
+ await async_client.chat.completions.with_streaming_response.create(
messages=[
{
"content": "string",
"role": "user",
}
],
- model_id="model_id",
+ model="model",
).__aenter__()
assert _get_open_connections(self.client) == 0
@@ -1518,17 +1518,17 @@ async def test_retrying_timeout_errors_doesnt_leak(
async def test_retrying_status_errors_doesnt_leak(
self, respx_mock: MockRouter, async_client: AsyncLlamaStackClient
) -> None:
- respx_mock.post("/v1/inference/chat-completion").mock(return_value=httpx.Response(500))
+ respx_mock.post("/v1/chat/completions").mock(return_value=httpx.Response(500))
with pytest.raises(APIStatusError):
- await async_client.inference.with_streaming_response.chat_completion(
+ await async_client.chat.completions.with_streaming_response.create(
messages=[
{
"content": "string",
"role": "user",
}
],
- model_id="model_id",
+ model="model",
).__aenter__()
assert _get_open_connections(self.client) == 0
@@ -1557,16 +1557,16 @@ def retry_handler(_request: httpx.Request) -> httpx.Response:
return httpx.Response(500)
return httpx.Response(200)
- respx_mock.post("/v1/inference/chat-completion").mock(side_effect=retry_handler)
+ respx_mock.post("/v1/chat/completions").mock(side_effect=retry_handler)
- response = await client.inference.with_raw_response.chat_completion(
+ response = await client.chat.completions.with_raw_response.create(
messages=[
{
"content": "string",
"role": "user",
}
],
- model_id="model_id",
+ model="model",
)
assert response.retries_taken == failures_before_success
@@ -1590,16 +1590,16 @@ def retry_handler(_request: httpx.Request) -> httpx.Response:
return httpx.Response(500)
return httpx.Response(200)
- respx_mock.post("/v1/inference/chat-completion").mock(side_effect=retry_handler)
+ respx_mock.post("/v1/chat/completions").mock(side_effect=retry_handler)
- response = await client.inference.with_raw_response.chat_completion(
+ response = await client.chat.completions.with_raw_response.create(
messages=[
{
"content": "string",
"role": "user",
}
],
- model_id="model_id",
+ model="model",
extra_headers={"x-stainless-retry-count": Omit()},
)
@@ -1623,16 +1623,16 @@ def retry_handler(_request: httpx.Request) -> httpx.Response:
return httpx.Response(500)
return httpx.Response(200)
- respx_mock.post("/v1/inference/chat-completion").mock(side_effect=retry_handler)
+ respx_mock.post("/v1/chat/completions").mock(side_effect=retry_handler)
- response = await client.inference.with_raw_response.chat_completion(
+ response = await client.chat.completions.with_raw_response.create(
messages=[
{
"content": "string",
"role": "user",
}
],
- model_id="model_id",
+ model="model",
extra_headers={"x-stainless-retry-count": "42"},
)
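The mocked route in these retry tests moves from `/v1/inference/chat-completion` to `/v1/chat/completions`. For context, a standalone sketch of the same mocking pattern outside the repository's fixtures, assuming respx's decorator form, a local base URL, and `max_retries=0` to keep the assertion single-shot (all assumptions, not part of this diff):

```python
import httpx
import pytest
import respx

from llama_stack_client import APIStatusError, LlamaStackClient

BASE_URL = "http://127.0.0.1:4010"  # assumed local test server address


@respx.mock(base_url=BASE_URL)
def test_chat_completions_500_raises(respx_mock: respx.MockRouter) -> None:
    # Mock the renamed OpenAI-compatible route and force a server error.
    respx_mock.post("/v1/chat/completions").mock(return_value=httpx.Response(500))

    client = LlamaStackClient(base_url=BASE_URL, max_retries=0)
    with pytest.raises(APIStatusError):
        client.chat.completions.create(
            messages=[{"content": "string", "role": "user"}],
            model="model",  # placeholder model identifier
        )
```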