diff --git a/.release-please-manifest.json b/.release-please-manifest.json index ed9acd29..1ae25264 100644 --- a/.release-please-manifest.json +++ b/.release-please-manifest.json @@ -1,3 +1,3 @@ { - ".": "0.2.23-alpha.1" + ".": "0.3.0-alpha.1" } diff --git a/.stats.yml b/.stats.yml index fa9edfc7..755df453 100644 --- a/.stats.yml +++ b/.stats.yml @@ -1,4 +1,4 @@ -configured_endpoints: 111 -openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/llamastack%2Fllama-stack-client-f252873ea1e1f38fd207331ef2621c511154d5be3f4076e59cc15754fc58eee4.yml -openapi_spec_hash: 10cbb4337a06a9fdd7d08612dd6044c3 -config_hash: 0358112cc0f3d880b4d55debdbe1cfa3 +configured_endpoints: 105 +openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/llamastack%2Fllama-stack-client-d7bea816190382a93511491e33d1f37f707620926ab133ae8ce0883d763df741.yml +openapi_spec_hash: f73b3af77108625edae3f25972b9e665 +config_hash: 548f336ac1b68ab1dfe385b79df764dd diff --git a/CHANGELOG.md b/CHANGELOG.md index 0011c19f..93d68692 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,36 @@ # Changelog +## 0.3.0-alpha.1 (2025-09-30) + +Full Changelog: [v0.2.23-alpha.1...v0.3.0-alpha.1](https://github.com/llamastack/llama-stack-client-python/compare/v0.2.23-alpha.1...v0.3.0-alpha.1) + +### ⚠ BREAKING CHANGES + +* **api:** fixes to remove deprecated inference resources + +### Features + +* **api:** expires_after changes for /files ([7f24c43](https://github.com/llamastack/llama-stack-client-python/commit/7f24c432dc1859312710a4a1ff4a80f6f861bee8)) +* **api:** fixes to remove deprecated inference resources ([04834d2](https://github.com/llamastack/llama-stack-client-python/commit/04834d2189ae4e4b8cd2c9370d1d39857bc6e9ec)) +* **api:** removing openai/v1 ([a918b43](https://github.com/llamastack/llama-stack-client-python/commit/a918b4323118c18f77c2abe7e1a3054c1eebeaac)) +* **api:** updating post /v1/files to have correct multipart/form-data ([433a996](https://github.com/llamastack/llama-stack-client-python/commit/433a996527bcca131ada4730376d8993f34ad6f5)) + + +### Bug Fixes + +* clean up deprecated code ([f10ead0](https://github.com/llamastack/llama-stack-client-python/commit/f10ead00522b7ca803cd7dc3617da0d451efa7da)) +* Don't retry for non-recoverable server http errors ([#212](https://github.com/llamastack/llama-stack-client-python/issues/212)) ([6782e8f](https://github.com/llamastack/llama-stack-client-python/commit/6782e8fc5931369223ed4446f8e7732f62712eff)) + + +### Documentation + +* update examples ([f896747](https://github.com/llamastack/llama-stack-client-python/commit/f89674726f55915a8cda0e2b4284be3c92978121)) + + +### Build System + +* Bump version to 0.2.23 ([0d4dc64](https://github.com/llamastack/llama-stack-client-python/commit/0d4dc6449224fa2a0f6d20f6229dd9d1a5427861)) + ## 0.2.23-alpha.1 (2025-09-26) Full Changelog: [v0.2.19-alpha.1...v0.2.23-alpha.1](https://github.com/llamastack/llama-stack-client-python/compare/v0.2.19-alpha.1...v0.2.23-alpha.1) diff --git a/README.md b/README.md index 928458d2..c8cebcc3 100644 --- a/README.md +++ b/README.md @@ -109,6 +109,50 @@ asyncio.run(main()) Functionality between the synchronous and asynchronous clients is otherwise identical. +## Streaming responses + +We provide support for streaming responses using Server Side Events (SSE). 
+ +```python +from llama_stack_client import LlamaStackClient + +client = LlamaStackClient() + +stream = client.chat.completions.create( + messages=[ + { + "content": "string", + "role": "user", + } + ], + model="model", + stream=True, +) +for completion in stream: + print(completion) +``` + +The async client uses the exact same interface. + +```python +from llama_stack_client import AsyncLlamaStackClient + +client = AsyncLlamaStackClient() + +stream = await client.chat.completions.create( + messages=[ + { + "content": "string", + "role": "user", + } + ], + model="model", + stream=True, +) +async for completion in stream: + print(completion) +``` + ## Using types Nested request parameters are [TypedDicts](https://docs.python.org/3/library/typing.html#typing.TypedDict). Responses are [Pydantic models](https://docs.pydantic.dev) which also provide helper methods for things like: @@ -118,6 +162,40 @@ Nested request parameters are [TypedDicts](https://docs.python.org/3/library/typ Typed requests and responses provide autocomplete and documentation within your editor. If you would like to see type errors in VS Code to help catch bugs earlier, set `python.analysis.typeCheckingMode` to `basic`. +## Nested params + +Nested parameters are dictionaries, typed using `TypedDict`, for example: + +```python +from llama_stack_client import LlamaStackClient + +client = LlamaStackClient() + +client.toolgroups.register( + provider_id="provider_id", + toolgroup_id="toolgroup_id", + mcp_endpoint={"uri": "uri"}, +) +``` + +## File uploads + +Request parameters that correspond to file uploads can be passed as `bytes`, or a [`PathLike`](https://docs.python.org/3/library/os.html#os.PathLike) instance or a tuple of `(filename, contents, media type)`. + +```python +from pathlib import Path +from llama_stack_client import LlamaStackClient + +client = LlamaStackClient() + +client.files.create( + file=Path("/path/to/file"), + purpose="assistants", +) +``` + +The async client uses the exact same interface. If you pass a [`PathLike`](https://docs.python.org/3/library/os.html#os.PathLike) instance, the file contents will be read asynchronously automatically. + ## Handling errors When the library is unable to connect to the API (for example, due to network connection problems or a timeout), a subclass of `llama_stack_client.APIConnectionError` is raised. 
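As a companion to the file-upload example above, here is a minimal sketch of the optional `expires_after` parameter that this release adds to `client.files.create` (see the `files.py` hunks further down). It assumes `expires_after` is passed as a dict with `anchor` and `seconds` keys, matching the constraints described in the new docstring (`anchor` must be `"created_at"`, `seconds` between 3600 and 2592000); adjust if the typed params differ.

```python
from pathlib import Path

from llama_stack_client import LlamaStackClient

client = LlamaStackClient()

# Upload a file that the server may expire 24 hours after creation.
# The expires_after shape (anchor/seconds keys) is assumed from the
# docstring added in this release.
file = client.files.create(
    file=Path("/path/to/file"),
    purpose="assistants",
    expires_after={
        "anchor": "created_at",  # must be "created_at"
        "seconds": 86400,  # int between 3600 (1 hour) and 2592000 (30 days)
    },
)
print(file)
```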
@@ -134,9 +212,14 @@ from llama_stack_client import LlamaStackClient client = LlamaStackClient() try: - client.agents.sessions.create( - agent_id="agent_id", - session_name="session_name", + client.chat.completions.create( + messages=[ + { + "content": "string", + "role": "user", + } + ], + model="model", ) except llama_stack_client.APIConnectionError as e: print("The server could not be reached") @@ -180,9 +263,14 @@ client = LlamaStackClient( ) # Or, configure per-request: -client.with_options(max_retries=5).agents.sessions.create( - agent_id="agent_id", - session_name="session_name", +client.with_options(max_retries=5).chat.completions.create( + messages=[ + { + "content": "string", + "role": "user", + } + ], + model="model", ) ``` @@ -206,9 +294,14 @@ client = LlamaStackClient( ) # Override per-request: -client.with_options(timeout=5.0).agents.sessions.create( - agent_id="agent_id", - session_name="session_name", +client.with_options(timeout=5.0).chat.completions.create( + messages=[ + { + "content": "string", + "role": "user", + } + ], + model="model", ) ``` @@ -248,14 +341,17 @@ The "raw" Response object can be accessed by prefixing `.with_raw_response.` to from llama_stack_client import LlamaStackClient client = LlamaStackClient() -response = client.agents.sessions.with_raw_response.create( - agent_id="agent_id", - session_name="session_name", +response = client.chat.completions.with_raw_response.create( + messages=[{ + "content": "string", + "role": "user", + }], + model="model", ) print(response.headers.get('X-My-Header')) -session = response.parse() # get the object that `agents.sessions.create()` would have returned -print(session.session_id) +completion = response.parse() # get the object that `chat.completions.create()` would have returned +print(completion) ``` These methods return an [`APIResponse`](https://github.com/meta-llama/llama-stack-python/tree/main/src/llama_stack_client/_response.py) object. @@ -269,9 +365,14 @@ The above interface eagerly reads the full response body when you make the reque To stream the response body, use `.with_streaming_response` instead, which requires a context manager and only reads the response body once you call `.read()`, `.text()`, `.json()`, `.iter_bytes()`, `.iter_text()`, `.iter_lines()` or `.parse()`. In the async client, these are async methods. 
```python -with client.agents.sessions.with_streaming_response.create( - agent_id="agent_id", - session_name="session_name", +with client.chat.completions.with_streaming_response.create( + messages=[ + { + "content": "string", + "role": "user", + } + ], + model="model", ) as response: print(response.headers.get("X-My-Header")) diff --git a/api.md b/api.md index 22c2120f..c246f4c1 100644 --- a/api.md +++ b/api.md @@ -3,10 +3,8 @@ ```python from llama_stack_client.types import ( AgentConfig, - BatchCompletion, ChatCompletionResponse, CompletionMessage, - ContentDelta, Document, InterleavedContent, InterleavedContentItem, @@ -14,7 +12,6 @@ from llama_stack_client.types import ( Metric, ParamType, QueryConfig, - QueryGeneratorConfig, QueryResult, ResponseFormat, SafetyViolation, @@ -91,10 +88,10 @@ from llama_stack_client.types import ( Methods: -- client.responses.create(\*\*params) -> ResponseObject -- client.responses.retrieve(response_id) -> ResponseObject -- client.responses.list(\*\*params) -> SyncOpenAICursorPage[ResponseListResponse] -- client.responses.delete(response_id) -> ResponseDeleteResponse +- client.responses.create(\*\*params) -> ResponseObject +- client.responses.retrieve(response_id) -> ResponseObject +- client.responses.list(\*\*params) -> SyncOpenAICursorPage[ResponseListResponse] +- client.responses.delete(response_id) -> ResponseDeleteResponse ## InputItems @@ -106,7 +103,7 @@ from llama_stack_client.types.responses import InputItemListResponse Methods: -- client.responses.input_items.list(response_id, \*\*params) -> InputItemListResponse +- client.responses.input_items.list(response_id, \*\*params) -> InputItemListResponse # Agents @@ -164,12 +161,7 @@ Methods: Types: ```python -from llama_stack_client.types.agents import ( - AgentTurnResponseStreamChunk, - Turn, - TurnResponseEvent, - TurnResponseEventPayload, -) +from llama_stack_client.types.agents import AgentTurnResponseStreamChunk, Turn, TurnResponseEvent ``` Methods: @@ -206,7 +198,7 @@ Methods: Types: ```python -from llama_stack_client.types import BenchmarkConfig, EvalCandidate, EvaluateResponse, Job +from llama_stack_client.types import BenchmarkConfig, EvaluateResponse, Job ``` Methods: @@ -242,24 +234,12 @@ Methods: Types: ```python -from llama_stack_client.types import ( - ChatCompletionResponseStreamChunk, - CompletionResponse, - EmbeddingsResponse, - TokenLogProbs, - InferenceBatchChatCompletionResponse, - InferenceRerankResponse, -) +from llama_stack_client.types import InferenceRerankResponse ``` Methods: -- client.inference.batch_chat_completion(\*\*params) -> InferenceBatchChatCompletionResponse -- client.inference.batch_completion(\*\*params) -> BatchCompletion -- client.inference.chat_completion(\*\*params) -> ChatCompletionResponse -- client.inference.completion(\*\*params) -> CompletionResponse -- client.inference.embeddings(\*\*params) -> EmbeddingsResponse -- client.inference.rerank(\*\*params) -> InferenceRerankResponse +- client.inference.rerank(\*\*params) -> InferenceRerankResponse # Embeddings @@ -271,7 +251,7 @@ from llama_stack_client.types import CreateEmbeddingsResponse Methods: -- client.embeddings.create(\*\*params) -> CreateEmbeddingsResponse +- client.embeddings.create(\*\*params) -> CreateEmbeddingsResponse # Chat @@ -295,9 +275,9 @@ from llama_stack_client.types.chat import ( Methods: -- client.chat.completions.create(\*\*params) -> CompletionCreateResponse -- client.chat.completions.retrieve(completion_id) -> CompletionRetrieveResponse -- client.chat.completions.list(\*\*params) 
-> SyncOpenAICursorPage[CompletionListResponse] +- client.chat.completions.create(\*\*params) -> CompletionCreateResponse +- client.chat.completions.retrieve(completion_id) -> CompletionRetrieveResponse +- client.chat.completions.list(\*\*params) -> SyncOpenAICursorPage[CompletionListResponse] # Completions @@ -309,7 +289,7 @@ from llama_stack_client.types import CompletionCreateResponse Methods: -- client.completions.create(\*\*params) -> CompletionCreateResponse +- client.completions.create(\*\*params) -> CompletionCreateResponse # VectorIo @@ -359,12 +339,12 @@ from llama_stack_client.types import ( Methods: -- client.vector_stores.create(\*\*params) -> VectorStore -- client.vector_stores.retrieve(vector_store_id) -> VectorStore -- client.vector_stores.update(vector_store_id, \*\*params) -> VectorStore -- client.vector_stores.list(\*\*params) -> SyncOpenAICursorPage[VectorStore] -- client.vector_stores.delete(vector_store_id) -> VectorStoreDeleteResponse -- client.vector_stores.search(vector_store_id, \*\*params) -> VectorStoreSearchResponse +- client.vector_stores.create(\*\*params) -> VectorStore +- client.vector_stores.retrieve(vector_store_id) -> VectorStore +- client.vector_stores.update(vector_store_id, \*\*params) -> VectorStore +- client.vector_stores.list(\*\*params) -> SyncOpenAICursorPage[VectorStore] +- client.vector_stores.delete(vector_store_id) -> VectorStoreDeleteResponse +- client.vector_stores.search(vector_store_id, \*\*params) -> VectorStoreSearchResponse ## Files @@ -380,12 +360,12 @@ from llama_stack_client.types.vector_stores import ( Methods: -- client.vector_stores.files.create(vector_store_id, \*\*params) -> VectorStoreFile -- client.vector_stores.files.retrieve(file_id, \*, vector_store_id) -> VectorStoreFile -- client.vector_stores.files.update(file_id, \*, vector_store_id, \*\*params) -> VectorStoreFile -- client.vector_stores.files.list(vector_store_id, \*\*params) -> SyncOpenAICursorPage[VectorStoreFile] -- client.vector_stores.files.delete(file_id, \*, vector_store_id) -> FileDeleteResponse -- client.vector_stores.files.content(file_id, \*, vector_store_id) -> FileContentResponse +- client.vector_stores.files.create(vector_store_id, \*\*params) -> VectorStoreFile +- client.vector_stores.files.retrieve(file_id, \*, vector_store_id) -> VectorStoreFile +- client.vector_stores.files.update(file_id, \*, vector_store_id, \*\*params) -> VectorStoreFile +- client.vector_stores.files.list(vector_store_id, \*\*params) -> SyncOpenAICursorPage[VectorStoreFile] +- client.vector_stores.files.delete(file_id, \*, vector_store_id) -> FileDeleteResponse +- client.vector_stores.files.content(file_id, \*, vector_store_id) -> FileContentResponse # Models @@ -412,7 +392,7 @@ from llama_stack_client.types.models import OpenAIListResponse Methods: -- client.models.openai.list() -> OpenAIListResponse +- client.models.openai.list() -> ModelListResponse # PostTraining @@ -481,7 +461,7 @@ from llama_stack_client.types import CreateResponse Methods: -- client.moderations.create(\*\*params) -> CreateResponse +- client.moderations.create(\*\*params) -> CreateResponse # Safety @@ -608,8 +588,8 @@ from llama_stack_client.types import DeleteFileResponse, File, ListFilesResponse Methods: -- client.files.create(\*\*params) -> File -- client.files.retrieve(file_id) -> File -- client.files.list(\*\*params) -> SyncOpenAICursorPage[File] -- client.files.delete(file_id) -> DeleteFileResponse -- client.files.content(file_id) -> object +- client.files.create(\*\*params) -> File +- 
client.files.retrieve(file_id) -> File +- client.files.list(\*\*params) -> SyncOpenAICursorPage[File] +- client.files.delete(file_id) -> DeleteFileResponse +- client.files.content(file_id) -> object diff --git a/pyproject.toml b/pyproject.toml index 843dd9b7..3b50518e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "llama_stack_client" -version = "0.2.23" +version = "0.3.0-alpha.1" description = "The official Python library for the llama-stack-client API" dynamic = ["readme"] license = "MIT" diff --git a/src/llama_stack_client/lib/inference/event_logger.py b/src/llama_stack_client/lib/inference/event_logger.py index 14b46372..cbf5f680 100644 --- a/src/llama_stack_client/lib/inference/event_logger.py +++ b/src/llama_stack_client/lib/inference/event_logger.py @@ -5,7 +5,7 @@ # the root directory of this source tree. from typing import Generator from termcolor import cprint -from llama_stack_client.types import ChatCompletionResponseStreamChunk, ChatCompletionChunk +from llama_stack_client.types import ChatCompletionChunk class InferenceStreamPrintableEvent: @@ -28,35 +28,11 @@ def __init__(self): self.is_thinking = False def yield_printable_events( - self, chunk: ChatCompletionResponseStreamChunk | ChatCompletionChunk + self, chunk: ChatCompletionChunk ) -> Generator[InferenceStreamPrintableEvent, None, None]: - # Check if the chunk has event attribute (ChatCompletionResponseStreamChunk) - if hasattr(chunk, "event"): - yield from self._handle_inference_stream_chunk(chunk) - # Check if the chunk has choices attribute (ChatCompletionChunk) - elif hasattr(chunk, "choices") and len(chunk.choices) > 0: + if hasattr(chunk, "choices") and len(chunk.choices) > 0: yield from self._handle_chat_completion_chunk(chunk) - def _handle_inference_stream_chunk( - self, chunk: ChatCompletionResponseStreamChunk - ) -> Generator[InferenceStreamPrintableEvent, None, None]: - event = chunk.event - if event.event_type == "start": - yield InferenceStreamPrintableEvent("Assistant> ", color="cyan", end="") - elif event.event_type == "progress": - if event.delta.type == "reasoning": - if not self.is_thinking: - yield InferenceStreamPrintableEvent(" ", color="magenta", end="") - self.is_thinking = True - yield InferenceStreamPrintableEvent(event.delta.reasoning, color="magenta", end="") - else: - if self.is_thinking: - yield InferenceStreamPrintableEvent("", color="magenta", end="") - self.is_thinking = False - yield InferenceStreamPrintableEvent(event.delta.text, color="yellow", end="") - elif event.event_type == "complete": - yield InferenceStreamPrintableEvent("") - def _handle_chat_completion_chunk( self, chunk: ChatCompletionChunk ) -> Generator[InferenceStreamPrintableEvent, None, None]: diff --git a/src/llama_stack_client/resources/chat/completions.py b/src/llama_stack_client/resources/chat/completions.py index 5445a2d1..2fb19980 100644 --- a/src/llama_stack_client/resources/chat/completions.py +++ b/src/llama_stack_client/resources/chat/completions.py @@ -372,7 +372,7 @@ def create( timeout: float | httpx.Timeout | None | NotGiven = not_given, ) -> CompletionCreateResponse | Stream[ChatCompletionChunk]: return self._post( - "/v1/openai/v1/chat/completions", + "/v1/chat/completions", body=maybe_transform( { "messages": messages, @@ -439,7 +439,7 @@ def retrieve( if not completion_id: raise ValueError(f"Expected a non-empty value for `completion_id` but received {completion_id!r}") return self._get( - f"/v1/openai/v1/chat/completions/{completion_id}", + 
f"/v1/chat/completions/{completion_id}", options=make_request_options( extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout ), @@ -481,7 +481,7 @@ def list( timeout: Override the client-level default timeout for this request, in seconds """ return self._get_api_list( - "/v1/openai/v1/chat/completions", + "/v1/chat/completions", page=SyncOpenAICursorPage[CompletionListResponse], options=make_request_options( extra_headers=extra_headers, @@ -845,7 +845,7 @@ async def create( timeout: float | httpx.Timeout | None | NotGiven = not_given, ) -> CompletionCreateResponse | AsyncStream[ChatCompletionChunk]: return await self._post( - "/v1/openai/v1/chat/completions", + "/v1/chat/completions", body=await async_maybe_transform( { "messages": messages, @@ -912,7 +912,7 @@ async def retrieve( if not completion_id: raise ValueError(f"Expected a non-empty value for `completion_id` but received {completion_id!r}") return await self._get( - f"/v1/openai/v1/chat/completions/{completion_id}", + f"/v1/chat/completions/{completion_id}", options=make_request_options( extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout ), @@ -954,7 +954,7 @@ def list( timeout: Override the client-level default timeout for this request, in seconds """ return self._get_api_list( - "/v1/openai/v1/chat/completions", + "/v1/chat/completions", page=AsyncOpenAICursorPage[CompletionListResponse], options=make_request_options( extra_headers=extra_headers, diff --git a/src/llama_stack_client/resources/completions.py b/src/llama_stack_client/resources/completions.py index 2c1475de..caeab7a1 100644 --- a/src/llama_stack_client/resources/completions.py +++ b/src/llama_stack_client/resources/completions.py @@ -326,7 +326,7 @@ def create( timeout: float | httpx.Timeout | None | NotGiven = not_given, ) -> CompletionCreateResponse | Stream[CompletionCreateResponse]: return self._post( - "/v1/openai/v1/completions", + "/v1/completions", body=maybe_transform( { "model": model, @@ -664,7 +664,7 @@ async def create( timeout: float | httpx.Timeout | None | NotGiven = not_given, ) -> CompletionCreateResponse | AsyncStream[CompletionCreateResponse]: return await self._post( - "/v1/openai/v1/completions", + "/v1/completions", body=await async_maybe_transform( { "model": model, diff --git a/src/llama_stack_client/resources/embeddings.py b/src/llama_stack_client/resources/embeddings.py index 60c38cb2..29cd69d8 100644 --- a/src/llama_stack_client/resources/embeddings.py +++ b/src/llama_stack_client/resources/embeddings.py @@ -87,7 +87,7 @@ def create( timeout: Override the client-level default timeout for this request, in seconds """ return self._post( - "/v1/openai/v1/embeddings", + "/v1/embeddings", body=maybe_transform( { "input": input, @@ -169,7 +169,7 @@ async def create( timeout: Override the client-level default timeout for this request, in seconds """ return await self._post( - "/v1/openai/v1/embeddings", + "/v1/embeddings", body=await async_maybe_transform( { "input": input, diff --git a/src/llama_stack_client/resources/files.py b/src/llama_stack_client/resources/files.py index 6b395e52..39add811 100644 --- a/src/llama_stack_client/resources/files.py +++ b/src/llama_stack_client/resources/files.py @@ -51,6 +51,7 @@ def create( *, file: FileTypes, purpose: Literal["assistants", "batch"], + expires_after: file_create_params.ExpiresAfter | Omit = omit, # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. 
# The extra values given here take precedence over values defined on the client or passed to this method. extra_headers: Headers | None = None, @@ -65,10 +66,17 @@ def create( - file: The File object (not file name) to be uploaded. - purpose: The intended purpose of the uploaded file. + - expires_after: Optional form values describing expiration for the file. Args: purpose: Valid purpose values for OpenAI Files API. + expires_after: + Control expiration of uploaded files. Params: + + - anchor, must be "created_at" + - seconds, must be int between 3600 and 2592000 (1 hour to 30 days) + extra_headers: Send extra headers extra_query: Add additional query parameters to the request @@ -81,6 +89,7 @@ def create( { "file": file, "purpose": purpose, + "expires_after": expires_after, } ) files = extract_files(cast(Mapping[str, object], body), paths=[["file"]]) @@ -89,7 +98,7 @@ def create( # multipart/form-data; boundary=---abc-- extra_headers = {"Content-Type": "multipart/form-data", **(extra_headers or {})} return self._post( - "/v1/openai/v1/files", + "/v1/files", body=maybe_transform(body, file_create_params.FileCreateParams), files=files, options=make_request_options( @@ -124,7 +133,7 @@ def retrieve( if not file_id: raise ValueError(f"Expected a non-empty value for `file_id` but received {file_id!r}") return self._get( - f"/v1/openai/v1/files/{file_id}", + f"/v1/files/{file_id}", options=make_request_options( extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout ), @@ -171,7 +180,7 @@ def list( timeout: Override the client-level default timeout for this request, in seconds """ return self._get_api_list( - "/v1/openai/v1/files", + "/v1/files", page=SyncOpenAICursorPage[File], options=make_request_options( extra_headers=extra_headers, @@ -217,7 +226,7 @@ def delete( if not file_id: raise ValueError(f"Expected a non-empty value for `file_id` but received {file_id!r}") return self._delete( - f"/v1/openai/v1/files/{file_id}", + f"/v1/files/{file_id}", options=make_request_options( extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout ), @@ -250,7 +259,7 @@ def content( if not file_id: raise ValueError(f"Expected a non-empty value for `file_id` but received {file_id!r}") return self._get( - f"/v1/openai/v1/files/{file_id}/content", + f"/v1/files/{file_id}/content", options=make_request_options( extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout ), @@ -283,6 +292,7 @@ async def create( *, file: FileTypes, purpose: Literal["assistants", "batch"], + expires_after: file_create_params.ExpiresAfter | Omit = omit, # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. # The extra values given here take precedence over values defined on the client or passed to this method. extra_headers: Headers | None = None, @@ -297,10 +307,17 @@ async def create( - file: The File object (not file name) to be uploaded. - purpose: The intended purpose of the uploaded file. + - expires_after: Optional form values describing expiration for the file. Args: purpose: Valid purpose values for OpenAI Files API. + expires_after: + Control expiration of uploaded files. 
Params: + + - anchor, must be "created_at" + - seconds, must be int between 3600 and 2592000 (1 hour to 30 days) + extra_headers: Send extra headers extra_query: Add additional query parameters to the request @@ -313,6 +330,7 @@ async def create( { "file": file, "purpose": purpose, + "expires_after": expires_after, } ) files = extract_files(cast(Mapping[str, object], body), paths=[["file"]]) @@ -321,7 +339,7 @@ async def create( # multipart/form-data; boundary=---abc-- extra_headers = {"Content-Type": "multipart/form-data", **(extra_headers or {})} return await self._post( - "/v1/openai/v1/files", + "/v1/files", body=await async_maybe_transform(body, file_create_params.FileCreateParams), files=files, options=make_request_options( @@ -356,7 +374,7 @@ async def retrieve( if not file_id: raise ValueError(f"Expected a non-empty value for `file_id` but received {file_id!r}") return await self._get( - f"/v1/openai/v1/files/{file_id}", + f"/v1/files/{file_id}", options=make_request_options( extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout ), @@ -403,7 +421,7 @@ def list( timeout: Override the client-level default timeout for this request, in seconds """ return self._get_api_list( - "/v1/openai/v1/files", + "/v1/files", page=AsyncOpenAICursorPage[File], options=make_request_options( extra_headers=extra_headers, @@ -449,7 +467,7 @@ async def delete( if not file_id: raise ValueError(f"Expected a non-empty value for `file_id` but received {file_id!r}") return await self._delete( - f"/v1/openai/v1/files/{file_id}", + f"/v1/files/{file_id}", options=make_request_options( extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout ), @@ -482,7 +500,7 @@ async def content( if not file_id: raise ValueError(f"Expected a non-empty value for `file_id` but received {file_id!r}") return await self._get( - f"/v1/openai/v1/files/{file_id}/content", + f"/v1/files/{file_id}/content", options=make_request_options( extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout ), diff --git a/src/llama_stack_client/resources/inference.py b/src/llama_stack_client/resources/inference.py index 732025cc..e5cf7b6b 100644 --- a/src/llama_stack_client/resources/inference.py +++ b/src/llama_stack_client/resources/inference.py @@ -2,1106 +2,76 @@ from __future__ import annotations -import typing_extensions -from typing import Type, Union, Iterable, cast -from typing_extensions import Literal, overload +from typing import Type, cast import httpx -from ..types import ( - inference_rerank_params, - inference_completion_params, - inference_embeddings_params, - inference_chat_completion_params, - inference_batch_completion_params, - inference_batch_chat_completion_params, -) -from .._types import Body, Omit, Query, Headers, NotGiven, SequenceNotStr, omit, not_given -from .._utils import required_args, maybe_transform, async_maybe_transform -from .._compat import cached_property -from .._resource import SyncAPIResource, AsyncAPIResource -from .._response import ( - to_raw_response_wrapper, - to_streamed_response_wrapper, - async_to_raw_response_wrapper, - async_to_streamed_response_wrapper, -) -from .._wrappers import DataWrapper -from .._streaming import Stream, AsyncStream -from .._base_client import make_request_options -from ..types.completion_response import CompletionResponse -from ..types.embeddings_response import EmbeddingsResponse -from ..types.shared_params.message import Message -from 
..types.shared.batch_completion import BatchCompletion -from ..types.inference_rerank_response import InferenceRerankResponse -from ..types.shared_params.response_format import ResponseFormat -from ..types.shared_params.sampling_params import SamplingParams -from ..types.shared.chat_completion_response import ChatCompletionResponse -from ..types.shared_params.interleaved_content import InterleavedContent -from ..types.chat_completion_response_stream_chunk import ChatCompletionResponseStreamChunk -from ..types.shared_params.interleaved_content_item import InterleavedContentItem -from ..types.inference_batch_chat_completion_response import InferenceBatchChatCompletionResponse - -__all__ = ["InferenceResource", "AsyncInferenceResource"] - - -class InferenceResource(SyncAPIResource): - @cached_property - def with_raw_response(self) -> InferenceResourceWithRawResponse: - """ - This property can be used as a prefix for any HTTP method call to return - the raw response object instead of the parsed content. - - For more information, see https://www.github.com/llamastack/llama-stack-client-python#accessing-raw-response-data-eg-headers - """ - return InferenceResourceWithRawResponse(self) - - @cached_property - def with_streaming_response(self) -> InferenceResourceWithStreamingResponse: - """ - An alternative to `.with_raw_response` that doesn't eagerly read the response body. - - For more information, see https://www.github.com/llamastack/llama-stack-client-python#with_streaming_response - """ - return InferenceResourceWithStreamingResponse(self) - - def batch_chat_completion( - self, - *, - messages_batch: Iterable[Iterable[Message]], - model_id: str, - logprobs: inference_batch_chat_completion_params.Logprobs | Omit = omit, - response_format: ResponseFormat | Omit = omit, - sampling_params: SamplingParams | Omit = omit, - tool_config: inference_batch_chat_completion_params.ToolConfig | Omit = omit, - tools: Iterable[inference_batch_chat_completion_params.Tool] | Omit = omit, - # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. - # The extra values given here take precedence over values defined on the client or passed to this method. - extra_headers: Headers | None = None, - extra_query: Query | None = None, - extra_body: Body | None = None, - timeout: float | httpx.Timeout | None | NotGiven = not_given, - ) -> InferenceBatchChatCompletionResponse: - """ - Generate chat completions for a batch of messages using the specified model. - - Args: - messages_batch: The messages to generate completions for. - - model_id: The identifier of the model to use. The model must be registered with Llama - Stack and available via the /models endpoint. - - logprobs: (Optional) If specified, log probabilities for each token position will be - returned. - - response_format: (Optional) Grammar specification for guided (structured) decoding. - - sampling_params: (Optional) Parameters to control the sampling strategy. - - tool_config: (Optional) Configuration for tool use. - - tools: (Optional) List of tool definitions available to the model. 
- - extra_headers: Send extra headers - - extra_query: Add additional query parameters to the request - - extra_body: Add additional JSON properties to the request - - timeout: Override the client-level default timeout for this request, in seconds - """ - return self._post( - "/v1/inference/batch-chat-completion", - body=maybe_transform( - { - "messages_batch": messages_batch, - "model_id": model_id, - "logprobs": logprobs, - "response_format": response_format, - "sampling_params": sampling_params, - "tool_config": tool_config, - "tools": tools, - }, - inference_batch_chat_completion_params.InferenceBatchChatCompletionParams, - ), - options=make_request_options( - extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout - ), - cast_to=InferenceBatchChatCompletionResponse, - ) - - def batch_completion( - self, - *, - content_batch: SequenceNotStr[InterleavedContent], - model_id: str, - logprobs: inference_batch_completion_params.Logprobs | Omit = omit, - response_format: ResponseFormat | Omit = omit, - sampling_params: SamplingParams | Omit = omit, - # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. - # The extra values given here take precedence over values defined on the client or passed to this method. - extra_headers: Headers | None = None, - extra_query: Query | None = None, - extra_body: Body | None = None, - timeout: float | httpx.Timeout | None | NotGiven = not_given, - ) -> BatchCompletion: - """ - Generate completions for a batch of content using the specified model. - - Args: - content_batch: The content to generate completions for. - - model_id: The identifier of the model to use. The model must be registered with Llama - Stack and available via the /models endpoint. - - logprobs: (Optional) If specified, log probabilities for each token position will be - returned. - - response_format: (Optional) Grammar specification for guided (structured) decoding. - - sampling_params: (Optional) Parameters to control the sampling strategy. - - extra_headers: Send extra headers - - extra_query: Add additional query parameters to the request - - extra_body: Add additional JSON properties to the request - - timeout: Override the client-level default timeout for this request, in seconds - """ - return self._post( - "/v1/inference/batch-completion", - body=maybe_transform( - { - "content_batch": content_batch, - "model_id": model_id, - "logprobs": logprobs, - "response_format": response_format, - "sampling_params": sampling_params, - }, - inference_batch_completion_params.InferenceBatchCompletionParams, - ), - options=make_request_options( - extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout - ), - cast_to=BatchCompletion, - ) - - @typing_extensions.deprecated( - "/v1/inference/chat-completion is deprecated. Please use /v1/openai/v1/chat/completions." 
- ) - @overload - def chat_completion( - self, - *, - messages: Iterable[Message], - model_id: str, - logprobs: inference_chat_completion_params.Logprobs | Omit = omit, - response_format: ResponseFormat | Omit = omit, - sampling_params: SamplingParams | Omit = omit, - stream: Literal[False] | Omit = omit, - tool_choice: Literal["auto", "required", "none"] | Omit = omit, - tool_config: inference_chat_completion_params.ToolConfig | Omit = omit, - tool_prompt_format: Literal["json", "function_tag", "python_list"] | Omit = omit, - tools: Iterable[inference_chat_completion_params.Tool] | Omit = omit, - # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. - # The extra values given here take precedence over values defined on the client or passed to this method. - extra_headers: Headers | None = None, - extra_query: Query | None = None, - extra_body: Body | None = None, - timeout: float | httpx.Timeout | None | NotGiven = not_given, - ) -> ChatCompletionResponse: - """ - Generate a chat completion for the given messages using the specified model. - - Args: - messages: List of messages in the conversation. - - model_id: The identifier of the model to use. The model must be registered with Llama - Stack and available via the /models endpoint. - - logprobs: (Optional) If specified, log probabilities for each token position will be - returned. - - response_format: (Optional) Grammar specification for guided (structured) decoding. There are two - options: - `ResponseFormat.json_schema`: The grammar is a JSON schema. Most - providers support this format. - `ResponseFormat.grammar`: The grammar is a BNF - grammar. This format is more flexible, but not all providers support it. - - sampling_params: Parameters to control the sampling strategy. - - stream: (Optional) If True, generate an SSE event stream of the response. Defaults to - False. - - tool_choice: (Optional) Whether tool use is required or automatic. Defaults to - ToolChoice.auto. .. deprecated:: Use tool_config instead. - - tool_config: (Optional) Configuration for tool use. - - tool_prompt_format: (Optional) Instructs the model how to format tool calls. By default, Llama Stack - will attempt to use a format that is best adapted to the model. - - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object. - - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a - tag. - `ToolPromptFormat.python_list`: The tool calls - are output as Python syntax -- a list of function calls. .. deprecated:: Use - tool_config instead. - - tools: (Optional) List of tool definitions available to the model. - - extra_headers: Send extra headers - - extra_query: Add additional query parameters to the request - - extra_body: Add additional JSON properties to the request - - timeout: Override the client-level default timeout for this request, in seconds - """ - ... - - @typing_extensions.deprecated( - "/v1/inference/chat-completion is deprecated. Please use /v1/openai/v1/chat/completions." 
- ) - @overload - def chat_completion( - self, - *, - messages: Iterable[Message], - model_id: str, - stream: Literal[True], - logprobs: inference_chat_completion_params.Logprobs | Omit = omit, - response_format: ResponseFormat | Omit = omit, - sampling_params: SamplingParams | Omit = omit, - tool_choice: Literal["auto", "required", "none"] | Omit = omit, - tool_config: inference_chat_completion_params.ToolConfig | Omit = omit, - tool_prompt_format: Literal["json", "function_tag", "python_list"] | Omit = omit, - tools: Iterable[inference_chat_completion_params.Tool] | Omit = omit, - # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. - # The extra values given here take precedence over values defined on the client or passed to this method. - extra_headers: Headers | None = None, - extra_query: Query | None = None, - extra_body: Body | None = None, - timeout: float | httpx.Timeout | None | NotGiven = not_given, - ) -> Stream[ChatCompletionResponseStreamChunk]: - """ - Generate a chat completion for the given messages using the specified model. - - Args: - messages: List of messages in the conversation. - - model_id: The identifier of the model to use. The model must be registered with Llama - Stack and available via the /models endpoint. - - stream: (Optional) If True, generate an SSE event stream of the response. Defaults to - False. - - logprobs: (Optional) If specified, log probabilities for each token position will be - returned. - - response_format: (Optional) Grammar specification for guided (structured) decoding. There are two - options: - `ResponseFormat.json_schema`: The grammar is a JSON schema. Most - providers support this format. - `ResponseFormat.grammar`: The grammar is a BNF - grammar. This format is more flexible, but not all providers support it. - - sampling_params: Parameters to control the sampling strategy. - - tool_choice: (Optional) Whether tool use is required or automatic. Defaults to - ToolChoice.auto. .. deprecated:: Use tool_config instead. - - tool_config: (Optional) Configuration for tool use. - - tool_prompt_format: (Optional) Instructs the model how to format tool calls. By default, Llama Stack - will attempt to use a format that is best adapted to the model. - - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object. - - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a - tag. - `ToolPromptFormat.python_list`: The tool calls - are output as Python syntax -- a list of function calls. .. deprecated:: Use - tool_config instead. - - tools: (Optional) List of tool definitions available to the model. - - extra_headers: Send extra headers - - extra_query: Add additional query parameters to the request - - extra_body: Add additional JSON properties to the request - - timeout: Override the client-level default timeout for this request, in seconds - """ - ... - - @typing_extensions.deprecated( - "/v1/inference/chat-completion is deprecated. Please use /v1/openai/v1/chat/completions." 
- ) - @overload - def chat_completion( - self, - *, - messages: Iterable[Message], - model_id: str, - stream: bool, - logprobs: inference_chat_completion_params.Logprobs | Omit = omit, - response_format: ResponseFormat | Omit = omit, - sampling_params: SamplingParams | Omit = omit, - tool_choice: Literal["auto", "required", "none"] | Omit = omit, - tool_config: inference_chat_completion_params.ToolConfig | Omit = omit, - tool_prompt_format: Literal["json", "function_tag", "python_list"] | Omit = omit, - tools: Iterable[inference_chat_completion_params.Tool] | Omit = omit, - # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. - # The extra values given here take precedence over values defined on the client or passed to this method. - extra_headers: Headers | None = None, - extra_query: Query | None = None, - extra_body: Body | None = None, - timeout: float | httpx.Timeout | None | NotGiven = not_given, - ) -> ChatCompletionResponse | Stream[ChatCompletionResponseStreamChunk]: - """ - Generate a chat completion for the given messages using the specified model. - - Args: - messages: List of messages in the conversation. - - model_id: The identifier of the model to use. The model must be registered with Llama - Stack and available via the /models endpoint. - - stream: (Optional) If True, generate an SSE event stream of the response. Defaults to - False. - - logprobs: (Optional) If specified, log probabilities for each token position will be - returned. - - response_format: (Optional) Grammar specification for guided (structured) decoding. There are two - options: - `ResponseFormat.json_schema`: The grammar is a JSON schema. Most - providers support this format. - `ResponseFormat.grammar`: The grammar is a BNF - grammar. This format is more flexible, but not all providers support it. - - sampling_params: Parameters to control the sampling strategy. - - tool_choice: (Optional) Whether tool use is required or automatic. Defaults to - ToolChoice.auto. .. deprecated:: Use tool_config instead. - - tool_config: (Optional) Configuration for tool use. - - tool_prompt_format: (Optional) Instructs the model how to format tool calls. By default, Llama Stack - will attempt to use a format that is best adapted to the model. - - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object. - - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a - tag. - `ToolPromptFormat.python_list`: The tool calls - are output as Python syntax -- a list of function calls. .. deprecated:: Use - tool_config instead. - - tools: (Optional) List of tool definitions available to the model. - - extra_headers: Send extra headers - - extra_query: Add additional query parameters to the request - - extra_body: Add additional JSON properties to the request - - timeout: Override the client-level default timeout for this request, in seconds - """ - ... - - @typing_extensions.deprecated( - "/v1/inference/chat-completion is deprecated. Please use /v1/openai/v1/chat/completions." 
- ) - @required_args(["messages", "model_id"], ["messages", "model_id", "stream"]) - def chat_completion( - self, - *, - messages: Iterable[Message], - model_id: str, - logprobs: inference_chat_completion_params.Logprobs | Omit = omit, - response_format: ResponseFormat | Omit = omit, - sampling_params: SamplingParams | Omit = omit, - stream: Literal[False] | Literal[True] | Omit = omit, - tool_choice: Literal["auto", "required", "none"] | Omit = omit, - tool_config: inference_chat_completion_params.ToolConfig | Omit = omit, - tool_prompt_format: Literal["json", "function_tag", "python_list"] | Omit = omit, - tools: Iterable[inference_chat_completion_params.Tool] | Omit = omit, - # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. - # The extra values given here take precedence over values defined on the client or passed to this method. - extra_headers: Headers | None = None, - extra_query: Query | None = None, - extra_body: Body | None = None, - timeout: float | httpx.Timeout | None | NotGiven = not_given, - ) -> ChatCompletionResponse | Stream[ChatCompletionResponseStreamChunk]: - if stream: - extra_headers = {"Accept": "text/event-stream", **(extra_headers or {})} - return self._post( - "/v1/inference/chat-completion", - body=maybe_transform( - { - "messages": messages, - "model_id": model_id, - "logprobs": logprobs, - "response_format": response_format, - "sampling_params": sampling_params, - "stream": stream, - "tool_choice": tool_choice, - "tool_config": tool_config, - "tool_prompt_format": tool_prompt_format, - "tools": tools, - }, - inference_chat_completion_params.InferenceChatCompletionParamsStreaming - if stream - else inference_chat_completion_params.InferenceChatCompletionParamsNonStreaming, - ), - options=make_request_options( - extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout - ), - cast_to=ChatCompletionResponse, - stream=stream or False, - stream_cls=Stream[ChatCompletionResponseStreamChunk], - ) - - @typing_extensions.deprecated("/v1/inference/completion is deprecated. Please use /v1/openai/v1/completions.") - @overload - def completion( - self, - *, - content: InterleavedContent, - model_id: str, - logprobs: inference_completion_params.Logprobs | Omit = omit, - response_format: ResponseFormat | Omit = omit, - sampling_params: SamplingParams | Omit = omit, - stream: Literal[False] | Omit = omit, - # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. - # The extra values given here take precedence over values defined on the client or passed to this method. - extra_headers: Headers | None = None, - extra_query: Query | None = None, - extra_body: Body | None = None, - timeout: float | httpx.Timeout | None | NotGiven = not_given, - ) -> CompletionResponse: - """ - Generate a completion for the given content using the specified model. - - Args: - content: The content to generate a completion for. - - model_id: The identifier of the model to use. The model must be registered with Llama - Stack and available via the /models endpoint. - - logprobs: (Optional) If specified, log probabilities for each token position will be - returned. - - response_format: (Optional) Grammar specification for guided (structured) decoding. - - sampling_params: (Optional) Parameters to control the sampling strategy. - - stream: (Optional) If True, generate an SSE event stream of the response. Defaults to - False. 
- - extra_headers: Send extra headers - - extra_query: Add additional query parameters to the request - - extra_body: Add additional JSON properties to the request - - timeout: Override the client-level default timeout for this request, in seconds - """ - ... - - @typing_extensions.deprecated("/v1/inference/completion is deprecated. Please use /v1/openai/v1/completions.") - @overload - def completion( - self, - *, - content: InterleavedContent, - model_id: str, - stream: Literal[True], - logprobs: inference_completion_params.Logprobs | Omit = omit, - response_format: ResponseFormat | Omit = omit, - sampling_params: SamplingParams | Omit = omit, - # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. - # The extra values given here take precedence over values defined on the client or passed to this method. - extra_headers: Headers | None = None, - extra_query: Query | None = None, - extra_body: Body | None = None, - timeout: float | httpx.Timeout | None | NotGiven = not_given, - ) -> Stream[CompletionResponse]: - """ - Generate a completion for the given content using the specified model. - - Args: - content: The content to generate a completion for. - - model_id: The identifier of the model to use. The model must be registered with Llama - Stack and available via the /models endpoint. - - stream: (Optional) If True, generate an SSE event stream of the response. Defaults to - False. - - logprobs: (Optional) If specified, log probabilities for each token position will be - returned. - - response_format: (Optional) Grammar specification for guided (structured) decoding. - - sampling_params: (Optional) Parameters to control the sampling strategy. - - extra_headers: Send extra headers - - extra_query: Add additional query parameters to the request - - extra_body: Add additional JSON properties to the request - - timeout: Override the client-level default timeout for this request, in seconds - """ - ... - - @typing_extensions.deprecated("/v1/inference/completion is deprecated. Please use /v1/openai/v1/completions.") - @overload - def completion( - self, - *, - content: InterleavedContent, - model_id: str, - stream: bool, - logprobs: inference_completion_params.Logprobs | Omit = omit, - response_format: ResponseFormat | Omit = omit, - sampling_params: SamplingParams | Omit = omit, - # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. - # The extra values given here take precedence over values defined on the client or passed to this method. - extra_headers: Headers | None = None, - extra_query: Query | None = None, - extra_body: Body | None = None, - timeout: float | httpx.Timeout | None | NotGiven = not_given, - ) -> CompletionResponse | Stream[CompletionResponse]: - """ - Generate a completion for the given content using the specified model. - - Args: - content: The content to generate a completion for. - - model_id: The identifier of the model to use. The model must be registered with Llama - Stack and available via the /models endpoint. - - stream: (Optional) If True, generate an SSE event stream of the response. Defaults to - False. - - logprobs: (Optional) If specified, log probabilities for each token position will be - returned. - - response_format: (Optional) Grammar specification for guided (structured) decoding. - - sampling_params: (Optional) Parameters to control the sampling strategy. 
- - extra_headers: Send extra headers - - extra_query: Add additional query parameters to the request - - extra_body: Add additional JSON properties to the request - - timeout: Override the client-level default timeout for this request, in seconds - """ - ... - - @typing_extensions.deprecated("/v1/inference/completion is deprecated. Please use /v1/openai/v1/completions.") - @required_args(["content", "model_id"], ["content", "model_id", "stream"]) - def completion( - self, - *, - content: InterleavedContent, - model_id: str, - logprobs: inference_completion_params.Logprobs | Omit = omit, - response_format: ResponseFormat | Omit = omit, - sampling_params: SamplingParams | Omit = omit, - stream: Literal[False] | Literal[True] | Omit = omit, - # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. - # The extra values given here take precedence over values defined on the client or passed to this method. - extra_headers: Headers | None = None, - extra_query: Query | None = None, - extra_body: Body | None = None, - timeout: float | httpx.Timeout | None | NotGiven = not_given, - ) -> CompletionResponse | Stream[CompletionResponse]: - if stream: - extra_headers = {"Accept": "text/event-stream", **(extra_headers or {})} - return self._post( - "/v1/inference/completion", - body=maybe_transform( - { - "content": content, - "model_id": model_id, - "logprobs": logprobs, - "response_format": response_format, - "sampling_params": sampling_params, - "stream": stream, - }, - inference_completion_params.InferenceCompletionParamsStreaming - if stream - else inference_completion_params.InferenceCompletionParamsNonStreaming, - ), - options=make_request_options( - extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout - ), - cast_to=CompletionResponse, - stream=stream or False, - stream_cls=Stream[CompletionResponse], - ) - - @typing_extensions.deprecated("/v1/inference/embeddings is deprecated. Please use /v1/openai/v1/embeddings.") - def embeddings( - self, - *, - contents: Union[SequenceNotStr[str], Iterable[InterleavedContentItem]], - model_id: str, - output_dimension: int | Omit = omit, - task_type: Literal["query", "document"] | Omit = omit, - text_truncation: Literal["none", "start", "end"] | Omit = omit, - # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. - # The extra values given here take precedence over values defined on the client or passed to this method. - extra_headers: Headers | None = None, - extra_query: Query | None = None, - extra_body: Body | None = None, - timeout: float | httpx.Timeout | None | NotGiven = not_given, - ) -> EmbeddingsResponse: - """ - Generate embeddings for content pieces using the specified model. - - Args: - contents: List of contents to generate embeddings for. Each content can be a string or an - InterleavedContentItem (and hence can be multimodal). The behavior depends on - the model and provider. Some models may only support text. - - model_id: The identifier of the model to use. The model must be an embedding model - registered with Llama Stack and available via the /models endpoint. - - output_dimension: (Optional) Output dimensionality for the embeddings. Only supported by - Matryoshka models. - - task_type: (Optional) How is the embedding being used? This is only supported by asymmetric - embedding models. 
- - text_truncation: (Optional) Config for how to truncate text for embedding when text is longer - than the model's max sequence length. - - extra_headers: Send extra headers - - extra_query: Add additional query parameters to the request - - extra_body: Add additional JSON properties to the request - - timeout: Override the client-level default timeout for this request, in seconds - """ - return self._post( - "/v1/inference/embeddings", - body=maybe_transform( - { - "contents": contents, - "model_id": model_id, - "output_dimension": output_dimension, - "task_type": task_type, - "text_truncation": text_truncation, - }, - inference_embeddings_params.InferenceEmbeddingsParams, - ), - options=make_request_options( - extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout - ), - cast_to=EmbeddingsResponse, - ) - - def rerank( - self, - *, - items: SequenceNotStr[inference_rerank_params.Item], - model: str, - query: inference_rerank_params.Query, - max_num_results: int | Omit = omit, - # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. - # The extra values given here take precedence over values defined on the client or passed to this method. - extra_headers: Headers | None = None, - extra_query: Query | None = None, - extra_body: Body | None = None, - timeout: float | httpx.Timeout | None | NotGiven = not_given, - ) -> InferenceRerankResponse: - """ - Rerank a list of documents based on their relevance to a query. - - Args: - items: List of items to rerank. Each item can be a string, text content part, or image - content part. Each input must not exceed the model's max input token length. - - model: The identifier of the reranking model to use. - - query: The search query to rank items against. Can be a string, text content part, or - image content part. The input must not exceed the model's max input token - length. - - max_num_results: (Optional) Maximum number of results to return. Default: returns all. - - extra_headers: Send extra headers - - extra_query: Add additional query parameters to the request - - extra_body: Add additional JSON properties to the request - - timeout: Override the client-level default timeout for this request, in seconds - """ - return self._post( - "/v1/inference/rerank", - body=maybe_transform( - { - "items": items, - "model": model, - "query": query, - "max_num_results": max_num_results, - }, - inference_rerank_params.InferenceRerankParams, - ), - options=make_request_options( - extra_headers=extra_headers, - extra_query=extra_query, - extra_body=extra_body, - timeout=timeout, - post_parser=DataWrapper[InferenceRerankResponse]._unwrapper, - ), - cast_to=cast(Type[InferenceRerankResponse], DataWrapper[InferenceRerankResponse]), - ) - - -class AsyncInferenceResource(AsyncAPIResource): - @cached_property - def with_raw_response(self) -> AsyncInferenceResourceWithRawResponse: - """ - This property can be used as a prefix for any HTTP method call to return - the raw response object instead of the parsed content. - - For more information, see https://www.github.com/llamastack/llama-stack-client-python#accessing-raw-response-data-eg-headers - """ - return AsyncInferenceResourceWithRawResponse(self) - - @cached_property - def with_streaming_response(self) -> AsyncInferenceResourceWithStreamingResponse: - """ - An alternative to `.with_raw_response` that doesn't eagerly read the response body. 
- - For more information, see https://www.github.com/llamastack/llama-stack-client-python#with_streaming_response - """ - return AsyncInferenceResourceWithStreamingResponse(self) - - async def batch_chat_completion( - self, - *, - messages_batch: Iterable[Iterable[Message]], - model_id: str, - logprobs: inference_batch_chat_completion_params.Logprobs | Omit = omit, - response_format: ResponseFormat | Omit = omit, - sampling_params: SamplingParams | Omit = omit, - tool_config: inference_batch_chat_completion_params.ToolConfig | Omit = omit, - tools: Iterable[inference_batch_chat_completion_params.Tool] | Omit = omit, - # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. - # The extra values given here take precedence over values defined on the client or passed to this method. - extra_headers: Headers | None = None, - extra_query: Query | None = None, - extra_body: Body | None = None, - timeout: float | httpx.Timeout | None | NotGiven = not_given, - ) -> InferenceBatchChatCompletionResponse: - """ - Generate chat completions for a batch of messages using the specified model. - - Args: - messages_batch: The messages to generate completions for. - - model_id: The identifier of the model to use. The model must be registered with Llama - Stack and available via the /models endpoint. - - logprobs: (Optional) If specified, log probabilities for each token position will be - returned. - - response_format: (Optional) Grammar specification for guided (structured) decoding. - - sampling_params: (Optional) Parameters to control the sampling strategy. - - tool_config: (Optional) Configuration for tool use. - - tools: (Optional) List of tool definitions available to the model. - - extra_headers: Send extra headers - - extra_query: Add additional query parameters to the request - - extra_body: Add additional JSON properties to the request - - timeout: Override the client-level default timeout for this request, in seconds - """ - return await self._post( - "/v1/inference/batch-chat-completion", - body=await async_maybe_transform( - { - "messages_batch": messages_batch, - "model_id": model_id, - "logprobs": logprobs, - "response_format": response_format, - "sampling_params": sampling_params, - "tool_config": tool_config, - "tools": tools, - }, - inference_batch_chat_completion_params.InferenceBatchChatCompletionParams, - ), - options=make_request_options( - extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout - ), - cast_to=InferenceBatchChatCompletionResponse, - ) - - async def batch_completion( - self, - *, - content_batch: SequenceNotStr[InterleavedContent], - model_id: str, - logprobs: inference_batch_completion_params.Logprobs | Omit = omit, - response_format: ResponseFormat | Omit = omit, - sampling_params: SamplingParams | Omit = omit, - # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. - # The extra values given here take precedence over values defined on the client or passed to this method. - extra_headers: Headers | None = None, - extra_query: Query | None = None, - extra_body: Body | None = None, - timeout: float | httpx.Timeout | None | NotGiven = not_given, - ) -> BatchCompletion: - """ - Generate completions for a batch of content using the specified model. - - Args: - content_batch: The content to generate completions for. - - model_id: The identifier of the model to use. 
The model must be registered with Llama - Stack and available via the /models endpoint. - - logprobs: (Optional) If specified, log probabilities for each token position will be - returned. - - response_format: (Optional) Grammar specification for guided (structured) decoding. - - sampling_params: (Optional) Parameters to control the sampling strategy. - - extra_headers: Send extra headers - - extra_query: Add additional query parameters to the request - - extra_body: Add additional JSON properties to the request - - timeout: Override the client-level default timeout for this request, in seconds - """ - return await self._post( - "/v1/inference/batch-completion", - body=await async_maybe_transform( - { - "content_batch": content_batch, - "model_id": model_id, - "logprobs": logprobs, - "response_format": response_format, - "sampling_params": sampling_params, - }, - inference_batch_completion_params.InferenceBatchCompletionParams, - ), - options=make_request_options( - extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout - ), - cast_to=BatchCompletion, - ) - - @typing_extensions.deprecated( - "/v1/inference/chat-completion is deprecated. Please use /v1/openai/v1/chat/completions." - ) - @overload - async def chat_completion( - self, - *, - messages: Iterable[Message], - model_id: str, - logprobs: inference_chat_completion_params.Logprobs | Omit = omit, - response_format: ResponseFormat | Omit = omit, - sampling_params: SamplingParams | Omit = omit, - stream: Literal[False] | Omit = omit, - tool_choice: Literal["auto", "required", "none"] | Omit = omit, - tool_config: inference_chat_completion_params.ToolConfig | Omit = omit, - tool_prompt_format: Literal["json", "function_tag", "python_list"] | Omit = omit, - tools: Iterable[inference_chat_completion_params.Tool] | Omit = omit, - # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. - # The extra values given here take precedence over values defined on the client or passed to this method. - extra_headers: Headers | None = None, - extra_query: Query | None = None, - extra_body: Body | None = None, - timeout: float | httpx.Timeout | None | NotGiven = not_given, - ) -> ChatCompletionResponse: - """ - Generate a chat completion for the given messages using the specified model. - - Args: - messages: List of messages in the conversation. - - model_id: The identifier of the model to use. The model must be registered with Llama - Stack and available via the /models endpoint. - - logprobs: (Optional) If specified, log probabilities for each token position will be - returned. - - response_format: (Optional) Grammar specification for guided (structured) decoding. There are two - options: - `ResponseFormat.json_schema`: The grammar is a JSON schema. Most - providers support this format. - `ResponseFormat.grammar`: The grammar is a BNF - grammar. This format is more flexible, but not all providers support it. - - sampling_params: Parameters to control the sampling strategy. - - stream: (Optional) If True, generate an SSE event stream of the response. Defaults to - False. - - tool_choice: (Optional) Whether tool use is required or automatic. Defaults to - ToolChoice.auto. .. deprecated:: Use tool_config instead. - - tool_config: (Optional) Configuration for tool use. - - tool_prompt_format: (Optional) Instructs the model how to format tool calls. By default, Llama Stack - will attempt to use a format that is best adapted to the model. 
- - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object. - - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a - tag. - `ToolPromptFormat.python_list`: The tool calls - are output as Python syntax -- a list of function calls. .. deprecated:: Use - tool_config instead. - - tools: (Optional) List of tool definitions available to the model. - - extra_headers: Send extra headers +from ..types import inference_rerank_params +from .._types import Body, Omit, Query, Headers, NotGiven, SequenceNotStr, omit, not_given +from .._utils import maybe_transform, async_maybe_transform +from .._compat import cached_property +from .._resource import SyncAPIResource, AsyncAPIResource +from .._response import ( + to_raw_response_wrapper, + to_streamed_response_wrapper, + async_to_raw_response_wrapper, + async_to_streamed_response_wrapper, +) +from .._wrappers import DataWrapper +from .._base_client import make_request_options +from ..types.inference_rerank_response import InferenceRerankResponse - extra_query: Add additional query parameters to the request +__all__ = ["InferenceResource", "AsyncInferenceResource"] - extra_body: Add additional JSON properties to the request - timeout: Override the client-level default timeout for this request, in seconds +class InferenceResource(SyncAPIResource): + @cached_property + def with_raw_response(self) -> InferenceResourceWithRawResponse: """ - ... + This property can be used as a prefix for any HTTP method call to return + the raw response object instead of the parsed content. - @typing_extensions.deprecated( - "/v1/inference/chat-completion is deprecated. Please use /v1/openai/v1/chat/completions." - ) - @overload - async def chat_completion( - self, - *, - messages: Iterable[Message], - model_id: str, - stream: Literal[True], - logprobs: inference_chat_completion_params.Logprobs | Omit = omit, - response_format: ResponseFormat | Omit = omit, - sampling_params: SamplingParams | Omit = omit, - tool_choice: Literal["auto", "required", "none"] | Omit = omit, - tool_config: inference_chat_completion_params.ToolConfig | Omit = omit, - tool_prompt_format: Literal["json", "function_tag", "python_list"] | Omit = omit, - tools: Iterable[inference_chat_completion_params.Tool] | Omit = omit, - # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. - # The extra values given here take precedence over values defined on the client or passed to this method. - extra_headers: Headers | None = None, - extra_query: Query | None = None, - extra_body: Body | None = None, - timeout: float | httpx.Timeout | None | NotGiven = not_given, - ) -> AsyncStream[ChatCompletionResponseStreamChunk]: + For more information, see https://www.github.com/llamastack/llama-stack-client-python#accessing-raw-response-data-eg-headers """ - Generate a chat completion for the given messages using the specified model. - - Args: - messages: List of messages in the conversation. - - model_id: The identifier of the model to use. The model must be registered with Llama - Stack and available via the /models endpoint. - - stream: (Optional) If True, generate an SSE event stream of the response. Defaults to - False. - - logprobs: (Optional) If specified, log probabilities for each token position will be - returned. - - response_format: (Optional) Grammar specification for guided (structured) decoding. There are two - options: - `ResponseFormat.json_schema`: The grammar is a JSON schema. 
Most - providers support this format. - `ResponseFormat.grammar`: The grammar is a BNF - grammar. This format is more flexible, but not all providers support it. - - sampling_params: Parameters to control the sampling strategy. - - tool_choice: (Optional) Whether tool use is required or automatic. Defaults to - ToolChoice.auto. .. deprecated:: Use tool_config instead. - - tool_config: (Optional) Configuration for tool use. - - tool_prompt_format: (Optional) Instructs the model how to format tool calls. By default, Llama Stack - will attempt to use a format that is best adapted to the model. - - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object. - - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a - tag. - `ToolPromptFormat.python_list`: The tool calls - are output as Python syntax -- a list of function calls. .. deprecated:: Use - tool_config instead. - - tools: (Optional) List of tool definitions available to the model. - - extra_headers: Send extra headers - - extra_query: Add additional query parameters to the request + return InferenceResourceWithRawResponse(self) - extra_body: Add additional JSON properties to the request + @cached_property + def with_streaming_response(self) -> InferenceResourceWithStreamingResponse: + """ + An alternative to `.with_raw_response` that doesn't eagerly read the response body. - timeout: Override the client-level default timeout for this request, in seconds + For more information, see https://www.github.com/llamastack/llama-stack-client-python#with_streaming_response """ - ... + return InferenceResourceWithStreamingResponse(self) - @typing_extensions.deprecated( - "/v1/inference/chat-completion is deprecated. Please use /v1/openai/v1/chat/completions." - ) - @overload - async def chat_completion( + def rerank( self, *, - messages: Iterable[Message], - model_id: str, - stream: bool, - logprobs: inference_chat_completion_params.Logprobs | Omit = omit, - response_format: ResponseFormat | Omit = omit, - sampling_params: SamplingParams | Omit = omit, - tool_choice: Literal["auto", "required", "none"] | Omit = omit, - tool_config: inference_chat_completion_params.ToolConfig | Omit = omit, - tool_prompt_format: Literal["json", "function_tag", "python_list"] | Omit = omit, - tools: Iterable[inference_chat_completion_params.Tool] | Omit = omit, + items: SequenceNotStr[inference_rerank_params.Item], + model: str, + query: inference_rerank_params.Query, + max_num_results: int | Omit = omit, # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. # The extra values given here take precedence over values defined on the client or passed to this method. extra_headers: Headers | None = None, extra_query: Query | None = None, extra_body: Body | None = None, timeout: float | httpx.Timeout | None | NotGiven = not_given, - ) -> ChatCompletionResponse | AsyncStream[ChatCompletionResponseStreamChunk]: + ) -> InferenceRerankResponse: """ - Generate a chat completion for the given messages using the specified model. + Rerank a list of documents based on their relevance to a query. Args: - messages: List of messages in the conversation. - - model_id: The identifier of the model to use. The model must be registered with Llama - Stack and available via the /models endpoint. - - stream: (Optional) If True, generate an SSE event stream of the response. Defaults to - False. - - logprobs: (Optional) If specified, log probabilities for each token position will be - returned. 
- - response_format: (Optional) Grammar specification for guided (structured) decoding. There are two - options: - `ResponseFormat.json_schema`: The grammar is a JSON schema. Most - providers support this format. - `ResponseFormat.grammar`: The grammar is a BNF - grammar. This format is more flexible, but not all providers support it. - - sampling_params: Parameters to control the sampling strategy. - - tool_choice: (Optional) Whether tool use is required or automatic. Defaults to - ToolChoice.auto. .. deprecated:: Use tool_config instead. + items: List of items to rerank. Each item can be a string, text content part, or image + content part. Each input must not exceed the model's max input token length. - tool_config: (Optional) Configuration for tool use. + model: The identifier of the reranking model to use. - tool_prompt_format: (Optional) Instructs the model how to format tool calls. By default, Llama Stack - will attempt to use a format that is best adapted to the model. - - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object. - - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a - tag. - `ToolPromptFormat.python_list`: The tool calls - are output as Python syntax -- a list of function calls. .. deprecated:: Use - tool_config instead. + query: The search query to rank items against. Can be a string, text content part, or + image content part. The input must not exceed the model's max input token + length. - tools: (Optional) List of tool definitions available to the model. + max_num_results: (Optional) Maximum number of results to return. Default: returns all. extra_headers: Send extra headers @@ -1111,306 +81,47 @@ async def chat_completion( timeout: Override the client-level default timeout for this request, in seconds """ - ... - - @typing_extensions.deprecated( - "/v1/inference/chat-completion is deprecated. Please use /v1/openai/v1/chat/completions." - ) - @required_args(["messages", "model_id"], ["messages", "model_id", "stream"]) - async def chat_completion( - self, - *, - messages: Iterable[Message], - model_id: str, - logprobs: inference_chat_completion_params.Logprobs | Omit = omit, - response_format: ResponseFormat | Omit = omit, - sampling_params: SamplingParams | Omit = omit, - stream: Literal[False] | Literal[True] | Omit = omit, - tool_choice: Literal["auto", "required", "none"] | Omit = omit, - tool_config: inference_chat_completion_params.ToolConfig | Omit = omit, - tool_prompt_format: Literal["json", "function_tag", "python_list"] | Omit = omit, - tools: Iterable[inference_chat_completion_params.Tool] | Omit = omit, - # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. - # The extra values given here take precedence over values defined on the client or passed to this method. 
- extra_headers: Headers | None = None, - extra_query: Query | None = None, - extra_body: Body | None = None, - timeout: float | httpx.Timeout | None | NotGiven = not_given, - ) -> ChatCompletionResponse | AsyncStream[ChatCompletionResponseStreamChunk]: - if stream: - extra_headers = {"Accept": "text/event-stream", **(extra_headers or {})} - return await self._post( - "/v1/inference/chat-completion", - body=await async_maybe_transform( + return self._post( + "/v1alpha/inference/rerank", + body=maybe_transform( { - "messages": messages, - "model_id": model_id, - "logprobs": logprobs, - "response_format": response_format, - "sampling_params": sampling_params, - "stream": stream, - "tool_choice": tool_choice, - "tool_config": tool_config, - "tool_prompt_format": tool_prompt_format, - "tools": tools, + "items": items, + "model": model, + "query": query, + "max_num_results": max_num_results, }, - inference_chat_completion_params.InferenceChatCompletionParamsStreaming - if stream - else inference_chat_completion_params.InferenceChatCompletionParamsNonStreaming, + inference_rerank_params.InferenceRerankParams, ), options=make_request_options( - extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout + extra_headers=extra_headers, + extra_query=extra_query, + extra_body=extra_body, + timeout=timeout, + post_parser=DataWrapper[InferenceRerankResponse]._unwrapper, ), - cast_to=ChatCompletionResponse, - stream=stream or False, - stream_cls=AsyncStream[ChatCompletionResponseStreamChunk], + cast_to=cast(Type[InferenceRerankResponse], DataWrapper[InferenceRerankResponse]), ) - @typing_extensions.deprecated("/v1/inference/completion is deprecated. Please use /v1/openai/v1/completions.") - @overload - async def completion( - self, - *, - content: InterleavedContent, - model_id: str, - logprobs: inference_completion_params.Logprobs | Omit = omit, - response_format: ResponseFormat | Omit = omit, - sampling_params: SamplingParams | Omit = omit, - stream: Literal[False] | Omit = omit, - # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. - # The extra values given here take precedence over values defined on the client or passed to this method. - extra_headers: Headers | None = None, - extra_query: Query | None = None, - extra_body: Body | None = None, - timeout: float | httpx.Timeout | None | NotGiven = not_given, - ) -> CompletionResponse: - """ - Generate a completion for the given content using the specified model. - - Args: - content: The content to generate a completion for. - - model_id: The identifier of the model to use. The model must be registered with Llama - Stack and available via the /models endpoint. - - logprobs: (Optional) If specified, log probabilities for each token position will be - returned. - - response_format: (Optional) Grammar specification for guided (structured) decoding. - - sampling_params: (Optional) Parameters to control the sampling strategy. - - stream: (Optional) If True, generate an SSE event stream of the response. Defaults to - False. - - extra_headers: Send extra headers - - extra_query: Add additional query parameters to the request - - extra_body: Add additional JSON properties to the request - - timeout: Override the client-level default timeout for this request, in seconds - """ - ... - - @typing_extensions.deprecated("/v1/inference/completion is deprecated. 
Please use /v1/openai/v1/completions.") - @overload - async def completion( - self, - *, - content: InterleavedContent, - model_id: str, - stream: Literal[True], - logprobs: inference_completion_params.Logprobs | Omit = omit, - response_format: ResponseFormat | Omit = omit, - sampling_params: SamplingParams | Omit = omit, - # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. - # The extra values given here take precedence over values defined on the client or passed to this method. - extra_headers: Headers | None = None, - extra_query: Query | None = None, - extra_body: Body | None = None, - timeout: float | httpx.Timeout | None | NotGiven = not_given, - ) -> AsyncStream[CompletionResponse]: - """ - Generate a completion for the given content using the specified model. - - Args: - content: The content to generate a completion for. - - model_id: The identifier of the model to use. The model must be registered with Llama - Stack and available via the /models endpoint. - - stream: (Optional) If True, generate an SSE event stream of the response. Defaults to - False. - logprobs: (Optional) If specified, log probabilities for each token position will be - returned. - - response_format: (Optional) Grammar specification for guided (structured) decoding. - - sampling_params: (Optional) Parameters to control the sampling strategy. - - extra_headers: Send extra headers - - extra_query: Add additional query parameters to the request - - extra_body: Add additional JSON properties to the request - - timeout: Override the client-level default timeout for this request, in seconds - """ - ... - - @typing_extensions.deprecated("/v1/inference/completion is deprecated. Please use /v1/openai/v1/completions.") - @overload - async def completion( - self, - *, - content: InterleavedContent, - model_id: str, - stream: bool, - logprobs: inference_completion_params.Logprobs | Omit = omit, - response_format: ResponseFormat | Omit = omit, - sampling_params: SamplingParams | Omit = omit, - # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. - # The extra values given here take precedence over values defined on the client or passed to this method. - extra_headers: Headers | None = None, - extra_query: Query | None = None, - extra_body: Body | None = None, - timeout: float | httpx.Timeout | None | NotGiven = not_given, - ) -> CompletionResponse | AsyncStream[CompletionResponse]: +class AsyncInferenceResource(AsyncAPIResource): + @cached_property + def with_raw_response(self) -> AsyncInferenceResourceWithRawResponse: """ - Generate a completion for the given content using the specified model. - - Args: - content: The content to generate a completion for. - - model_id: The identifier of the model to use. The model must be registered with Llama - Stack and available via the /models endpoint. - - stream: (Optional) If True, generate an SSE event stream of the response. Defaults to - False. - - logprobs: (Optional) If specified, log probabilities for each token position will be - returned. - - response_format: (Optional) Grammar specification for guided (structured) decoding. - - sampling_params: (Optional) Parameters to control the sampling strategy. 
- - extra_headers: Send extra headers - - extra_query: Add additional query parameters to the request - - extra_body: Add additional JSON properties to the request + This property can be used as a prefix for any HTTP method call to return + the raw response object instead of the parsed content. - timeout: Override the client-level default timeout for this request, in seconds + For more information, see https://www.github.com/llamastack/llama-stack-client-python#accessing-raw-response-data-eg-headers """ - ... - - @typing_extensions.deprecated("/v1/inference/completion is deprecated. Please use /v1/openai/v1/completions.") - @required_args(["content", "model_id"], ["content", "model_id", "stream"]) - async def completion( - self, - *, - content: InterleavedContent, - model_id: str, - logprobs: inference_completion_params.Logprobs | Omit = omit, - response_format: ResponseFormat | Omit = omit, - sampling_params: SamplingParams | Omit = omit, - stream: Literal[False] | Literal[True] | Omit = omit, - # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. - # The extra values given here take precedence over values defined on the client or passed to this method. - extra_headers: Headers | None = None, - extra_query: Query | None = None, - extra_body: Body | None = None, - timeout: float | httpx.Timeout | None | NotGiven = not_given, - ) -> CompletionResponse | AsyncStream[CompletionResponse]: - if stream: - extra_headers = {"Accept": "text/event-stream", **(extra_headers or {})} - return await self._post( - "/v1/inference/completion", - body=await async_maybe_transform( - { - "content": content, - "model_id": model_id, - "logprobs": logprobs, - "response_format": response_format, - "sampling_params": sampling_params, - "stream": stream, - }, - inference_completion_params.InferenceCompletionParamsStreaming - if stream - else inference_completion_params.InferenceCompletionParamsNonStreaming, - ), - options=make_request_options( - extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout - ), - cast_to=CompletionResponse, - stream=stream or False, - stream_cls=AsyncStream[CompletionResponse], - ) + return AsyncInferenceResourceWithRawResponse(self) - @typing_extensions.deprecated("/v1/inference/embeddings is deprecated. Please use /v1/openai/v1/embeddings.") - async def embeddings( - self, - *, - contents: Union[SequenceNotStr[str], Iterable[InterleavedContentItem]], - model_id: str, - output_dimension: int | Omit = omit, - task_type: Literal["query", "document"] | Omit = omit, - text_truncation: Literal["none", "start", "end"] | Omit = omit, - # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. - # The extra values given here take precedence over values defined on the client or passed to this method. - extra_headers: Headers | None = None, - extra_query: Query | None = None, - extra_body: Body | None = None, - timeout: float | httpx.Timeout | None | NotGiven = not_given, - ) -> EmbeddingsResponse: + @cached_property + def with_streaming_response(self) -> AsyncInferenceResourceWithStreamingResponse: """ - Generate embeddings for content pieces using the specified model. - - Args: - contents: List of contents to generate embeddings for. Each content can be a string or an - InterleavedContentItem (and hence can be multimodal). The behavior depends on - the model and provider. Some models may only support text. 
- - model_id: The identifier of the model to use. The model must be an embedding model - registered with Llama Stack and available via the /models endpoint. - - output_dimension: (Optional) Output dimensionality for the embeddings. Only supported by - Matryoshka models. - - task_type: (Optional) How is the embedding being used? This is only supported by asymmetric - embedding models. - - text_truncation: (Optional) Config for how to truncate text for embedding when text is longer - than the model's max sequence length. - - extra_headers: Send extra headers - - extra_query: Add additional query parameters to the request - - extra_body: Add additional JSON properties to the request + An alternative to `.with_raw_response` that doesn't eagerly read the response body. - timeout: Override the client-level default timeout for this request, in seconds + For more information, see https://www.github.com/llamastack/llama-stack-client-python#with_streaming_response """ - return await self._post( - "/v1/inference/embeddings", - body=await async_maybe_transform( - { - "contents": contents, - "model_id": model_id, - "output_dimension": output_dimension, - "task_type": task_type, - "text_truncation": text_truncation, - }, - inference_embeddings_params.InferenceEmbeddingsParams, - ), - options=make_request_options( - extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout - ), - cast_to=EmbeddingsResponse, - ) + return AsyncInferenceResourceWithStreamingResponse(self) async def rerank( self, @@ -1450,7 +161,7 @@ async def rerank( timeout: Override the client-level default timeout for this request, in seconds """ return await self._post( - "/v1/inference/rerank", + "/v1alpha/inference/rerank", body=await async_maybe_transform( { "items": items, @@ -1475,27 +186,6 @@ class InferenceResourceWithRawResponse: def __init__(self, inference: InferenceResource) -> None: self._inference = inference - self.batch_chat_completion = to_raw_response_wrapper( - inference.batch_chat_completion, - ) - self.batch_completion = to_raw_response_wrapper( - inference.batch_completion, - ) - self.chat_completion = ( # pyright: ignore[reportDeprecated] - to_raw_response_wrapper( - inference.chat_completion, # pyright: ignore[reportDeprecated], - ) - ) - self.completion = ( # pyright: ignore[reportDeprecated] - to_raw_response_wrapper( - inference.completion, # pyright: ignore[reportDeprecated], - ) - ) - self.embeddings = ( # pyright: ignore[reportDeprecated] - to_raw_response_wrapper( - inference.embeddings, # pyright: ignore[reportDeprecated], - ) - ) self.rerank = to_raw_response_wrapper( inference.rerank, ) @@ -1505,27 +195,6 @@ class AsyncInferenceResourceWithRawResponse: def __init__(self, inference: AsyncInferenceResource) -> None: self._inference = inference - self.batch_chat_completion = async_to_raw_response_wrapper( - inference.batch_chat_completion, - ) - self.batch_completion = async_to_raw_response_wrapper( - inference.batch_completion, - ) - self.chat_completion = ( # pyright: ignore[reportDeprecated] - async_to_raw_response_wrapper( - inference.chat_completion, # pyright: ignore[reportDeprecated], - ) - ) - self.completion = ( # pyright: ignore[reportDeprecated] - async_to_raw_response_wrapper( - inference.completion, # pyright: ignore[reportDeprecated], - ) - ) - self.embeddings = ( # pyright: ignore[reportDeprecated] - async_to_raw_response_wrapper( - inference.embeddings, # pyright: ignore[reportDeprecated], - ) - ) self.rerank = async_to_raw_response_wrapper( 
inference.rerank, ) @@ -1535,27 +204,6 @@ class InferenceResourceWithStreamingResponse: def __init__(self, inference: InferenceResource) -> None: self._inference = inference - self.batch_chat_completion = to_streamed_response_wrapper( - inference.batch_chat_completion, - ) - self.batch_completion = to_streamed_response_wrapper( - inference.batch_completion, - ) - self.chat_completion = ( # pyright: ignore[reportDeprecated] - to_streamed_response_wrapper( - inference.chat_completion, # pyright: ignore[reportDeprecated], - ) - ) - self.completion = ( # pyright: ignore[reportDeprecated] - to_streamed_response_wrapper( - inference.completion, # pyright: ignore[reportDeprecated], - ) - ) - self.embeddings = ( # pyright: ignore[reportDeprecated] - to_streamed_response_wrapper( - inference.embeddings, # pyright: ignore[reportDeprecated], - ) - ) self.rerank = to_streamed_response_wrapper( inference.rerank, ) @@ -1565,27 +213,6 @@ class AsyncInferenceResourceWithStreamingResponse: def __init__(self, inference: AsyncInferenceResource) -> None: self._inference = inference - self.batch_chat_completion = async_to_streamed_response_wrapper( - inference.batch_chat_completion, - ) - self.batch_completion = async_to_streamed_response_wrapper( - inference.batch_completion, - ) - self.chat_completion = ( # pyright: ignore[reportDeprecated] - async_to_streamed_response_wrapper( - inference.chat_completion, # pyright: ignore[reportDeprecated], - ) - ) - self.completion = ( # pyright: ignore[reportDeprecated] - async_to_streamed_response_wrapper( - inference.completion, # pyright: ignore[reportDeprecated], - ) - ) - self.embeddings = ( # pyright: ignore[reportDeprecated] - async_to_streamed_response_wrapper( - inference.embeddings, # pyright: ignore[reportDeprecated], - ) - ) self.rerank = async_to_streamed_response_wrapper( inference.rerank, ) diff --git a/src/llama_stack_client/resources/models/openai.py b/src/llama_stack_client/resources/models/openai.py index e4b2fbd8..ab4b4038 100644 --- a/src/llama_stack_client/resources/models/openai.py +++ b/src/llama_stack_client/resources/models/openai.py @@ -17,7 +17,7 @@ ) from ..._wrappers import DataWrapper from ..._base_client import make_request_options -from ...types.models.openai_list_response import OpenAIListResponse +from ...types.model_list_response import ModelListResponse __all__ = ["OpenAIResource", "AsyncOpenAIResource"] @@ -51,18 +51,18 @@ def list( extra_query: Query | None = None, extra_body: Body | None = None, timeout: float | httpx.Timeout | None | NotGiven = not_given, - ) -> OpenAIListResponse: - """List models using the OpenAI API.""" + ) -> ModelListResponse: + """List all models.""" return self._get( - "/v1/openai/v1/models", + "/v1/models", options=make_request_options( extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout, - post_parser=DataWrapper[OpenAIListResponse]._unwrapper, + post_parser=DataWrapper[ModelListResponse]._unwrapper, ), - cast_to=cast(Type[OpenAIListResponse], DataWrapper[OpenAIListResponse]), + cast_to=cast(Type[ModelListResponse], DataWrapper[ModelListResponse]), ) @@ -95,18 +95,18 @@ async def list( extra_query: Query | None = None, extra_body: Body | None = None, timeout: float | httpx.Timeout | None | NotGiven = not_given, - ) -> OpenAIListResponse: - """List models using the OpenAI API.""" + ) -> ModelListResponse: + """List all models.""" return await self._get( - "/v1/openai/v1/models", + "/v1/models", options=make_request_options( extra_headers=extra_headers, 
extra_query=extra_query, extra_body=extra_body, timeout=timeout, - post_parser=DataWrapper[OpenAIListResponse]._unwrapper, + post_parser=DataWrapper[ModelListResponse]._unwrapper, ), - cast_to=cast(Type[OpenAIListResponse], DataWrapper[OpenAIListResponse]), + cast_to=cast(Type[ModelListResponse], DataWrapper[ModelListResponse]), ) diff --git a/src/llama_stack_client/resources/moderations.py b/src/llama_stack_client/resources/moderations.py index a016b5b0..a73dc85a 100644 --- a/src/llama_stack_client/resources/moderations.py +++ b/src/llama_stack_client/resources/moderations.py @@ -73,7 +73,7 @@ def create( timeout: Override the client-level default timeout for this request, in seconds """ return self._post( - "/v1/openai/v1/moderations", + "/v1/moderations", body=maybe_transform( { "input": input, @@ -138,7 +138,7 @@ async def create( timeout: Override the client-level default timeout for this request, in seconds """ return await self._post( - "/v1/openai/v1/moderations", + "/v1/moderations", body=await async_maybe_transform( { "input": input, diff --git a/src/llama_stack_client/resources/responses/input_items.py b/src/llama_stack_client/resources/responses/input_items.py index da06debd..a5836ba7 100644 --- a/src/llama_stack_client/resources/responses/input_items.py +++ b/src/llama_stack_client/resources/responses/input_items.py @@ -85,7 +85,7 @@ def list( if not response_id: raise ValueError(f"Expected a non-empty value for `response_id` but received {response_id!r}") return self._get( - f"/v1/openai/v1/responses/{response_id}/input_items", + f"/v1/responses/{response_id}/input_items", options=make_request_options( extra_headers=extra_headers, extra_query=extra_query, @@ -168,7 +168,7 @@ async def list( if not response_id: raise ValueError(f"Expected a non-empty value for `response_id` but received {response_id!r}") return await self._get( - f"/v1/openai/v1/responses/{response_id}/input_items", + f"/v1/responses/{response_id}/input_items", options=make_request_options( extra_headers=extra_headers, extra_query=extra_query, diff --git a/src/llama_stack_client/resources/responses/responses.py b/src/llama_stack_client/resources/responses/responses.py index 7f21f3ea..16e38fd0 100644 --- a/src/llama_stack_client/resources/responses/responses.py +++ b/src/llama_stack_client/resources/responses/responses.py @@ -228,7 +228,7 @@ def create( timeout: float | httpx.Timeout | None | NotGiven = not_given, ) -> ResponseObject | Stream[ResponseObjectStream]: return self._post( - "/v1/openai/v1/responses", + "/v1/responses", body=maybe_transform( { "input": input, @@ -281,7 +281,7 @@ def retrieve( if not response_id: raise ValueError(f"Expected a non-empty value for `response_id` but received {response_id!r}") return self._get( - f"/v1/openai/v1/responses/{response_id}", + f"/v1/responses/{response_id}", options=make_request_options( extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout ), @@ -323,7 +323,7 @@ def list( timeout: Override the client-level default timeout for this request, in seconds """ return self._get_api_list( - "/v1/openai/v1/responses", + "/v1/responses", page=SyncOpenAICursorPage[ResponseListResponse], options=make_request_options( extra_headers=extra_headers, @@ -369,7 +369,7 @@ def delete( if not response_id: raise ValueError(f"Expected a non-empty value for `response_id` but received {response_id!r}") return self._delete( - f"/v1/openai/v1/responses/{response_id}", + f"/v1/responses/{response_id}", options=make_request_options( 
extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout ), @@ -568,7 +568,7 @@ async def create( timeout: float | httpx.Timeout | None | NotGiven = not_given, ) -> ResponseObject | AsyncStream[ResponseObjectStream]: return await self._post( - "/v1/openai/v1/responses", + "/v1/responses", body=await async_maybe_transform( { "input": input, @@ -621,7 +621,7 @@ async def retrieve( if not response_id: raise ValueError(f"Expected a non-empty value for `response_id` but received {response_id!r}") return await self._get( - f"/v1/openai/v1/responses/{response_id}", + f"/v1/responses/{response_id}", options=make_request_options( extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout ), @@ -663,7 +663,7 @@ def list( timeout: Override the client-level default timeout for this request, in seconds """ return self._get_api_list( - "/v1/openai/v1/responses", + "/v1/responses", page=AsyncOpenAICursorPage[ResponseListResponse], options=make_request_options( extra_headers=extra_headers, @@ -709,7 +709,7 @@ async def delete( if not response_id: raise ValueError(f"Expected a non-empty value for `response_id` but received {response_id!r}") return await self._delete( - f"/v1/openai/v1/responses/{response_id}", + f"/v1/responses/{response_id}", options=make_request_options( extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout ), diff --git a/src/llama_stack_client/resources/vector_stores/files.py b/src/llama_stack_client/resources/vector_stores/files.py index 39f16a66..f9a1ef31 100644 --- a/src/llama_stack_client/resources/vector_stores/files.py +++ b/src/llama_stack_client/resources/vector_stores/files.py @@ -82,7 +82,7 @@ def create( if not vector_store_id: raise ValueError(f"Expected a non-empty value for `vector_store_id` but received {vector_store_id!r}") return self._post( - f"/v1/openai/v1/vector_stores/{vector_store_id}/files", + f"/v1/vector_stores/{vector_store_id}/files", body=maybe_transform( { "file_id": file_id, @@ -126,7 +126,7 @@ def retrieve( if not file_id: raise ValueError(f"Expected a non-empty value for `file_id` but received {file_id!r}") return self._get( - f"/v1/openai/v1/vector_stores/{vector_store_id}/files/{file_id}", + f"/v1/vector_stores/{vector_store_id}/files/{file_id}", options=make_request_options( extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout ), @@ -165,7 +165,7 @@ def update( if not file_id: raise ValueError(f"Expected a non-empty value for `file_id` but received {file_id!r}") return self._post( - f"/v1/openai/v1/vector_stores/{vector_store_id}/files/{file_id}", + f"/v1/vector_stores/{vector_store_id}/files/{file_id}", body=maybe_transform({"attributes": attributes}, file_update_params.FileUpdateParams), options=make_request_options( extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout @@ -218,7 +218,7 @@ def list( if not vector_store_id: raise ValueError(f"Expected a non-empty value for `vector_store_id` but received {vector_store_id!r}") return self._get_api_list( - f"/v1/openai/v1/vector_stores/{vector_store_id}/files", + f"/v1/vector_stores/{vector_store_id}/files", page=SyncOpenAICursorPage[VectorStoreFile], options=make_request_options( extra_headers=extra_headers, @@ -268,7 +268,7 @@ def delete( if not file_id: raise ValueError(f"Expected a non-empty value for `file_id` but received {file_id!r}") return self._delete( - 
f"/v1/openai/v1/vector_stores/{vector_store_id}/files/{file_id}", + f"/v1/vector_stores/{vector_store_id}/files/{file_id}", options=make_request_options( extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout ), @@ -304,7 +304,7 @@ def content( if not file_id: raise ValueError(f"Expected a non-empty value for `file_id` but received {file_id!r}") return self._get( - f"/v1/openai/v1/vector_stores/{vector_store_id}/files/{file_id}/content", + f"/v1/vector_stores/{vector_store_id}/files/{file_id}/content", options=make_request_options( extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout ), @@ -367,7 +367,7 @@ async def create( if not vector_store_id: raise ValueError(f"Expected a non-empty value for `vector_store_id` but received {vector_store_id!r}") return await self._post( - f"/v1/openai/v1/vector_stores/{vector_store_id}/files", + f"/v1/vector_stores/{vector_store_id}/files", body=await async_maybe_transform( { "file_id": file_id, @@ -411,7 +411,7 @@ async def retrieve( if not file_id: raise ValueError(f"Expected a non-empty value for `file_id` but received {file_id!r}") return await self._get( - f"/v1/openai/v1/vector_stores/{vector_store_id}/files/{file_id}", + f"/v1/vector_stores/{vector_store_id}/files/{file_id}", options=make_request_options( extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout ), @@ -450,7 +450,7 @@ async def update( if not file_id: raise ValueError(f"Expected a non-empty value for `file_id` but received {file_id!r}") return await self._post( - f"/v1/openai/v1/vector_stores/{vector_store_id}/files/{file_id}", + f"/v1/vector_stores/{vector_store_id}/files/{file_id}", body=await async_maybe_transform({"attributes": attributes}, file_update_params.FileUpdateParams), options=make_request_options( extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout @@ -503,7 +503,7 @@ def list( if not vector_store_id: raise ValueError(f"Expected a non-empty value for `vector_store_id` but received {vector_store_id!r}") return self._get_api_list( - f"/v1/openai/v1/vector_stores/{vector_store_id}/files", + f"/v1/vector_stores/{vector_store_id}/files", page=AsyncOpenAICursorPage[VectorStoreFile], options=make_request_options( extra_headers=extra_headers, @@ -553,7 +553,7 @@ async def delete( if not file_id: raise ValueError(f"Expected a non-empty value for `file_id` but received {file_id!r}") return await self._delete( - f"/v1/openai/v1/vector_stores/{vector_store_id}/files/{file_id}", + f"/v1/vector_stores/{vector_store_id}/files/{file_id}", options=make_request_options( extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout ), @@ -589,7 +589,7 @@ async def content( if not file_id: raise ValueError(f"Expected a non-empty value for `file_id` but received {file_id!r}") return await self._get( - f"/v1/openai/v1/vector_stores/{vector_store_id}/files/{file_id}/content", + f"/v1/vector_stores/{vector_store_id}/files/{file_id}/content", options=make_request_options( extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout ), diff --git a/src/llama_stack_client/resources/vector_stores/vector_stores.py b/src/llama_stack_client/resources/vector_stores/vector_stores.py index f3ab01f2..f858100b 100644 --- a/src/llama_stack_client/resources/vector_stores/vector_stores.py +++ b/src/llama_stack_client/resources/vector_stores/vector_stores.py @@ -112,7 +112,7 @@ def create( 
timeout: Override the client-level default timeout for this request, in seconds """ return self._post( - "/v1/openai/v1/vector_stores", + "/v1/vector_stores", body=maybe_transform( { "chunking_strategy": chunking_strategy, @@ -158,7 +158,7 @@ def retrieve( if not vector_store_id: raise ValueError(f"Expected a non-empty value for `vector_store_id` but received {vector_store_id!r}") return self._get( - f"/v1/openai/v1/vector_stores/{vector_store_id}", + f"/v1/vector_stores/{vector_store_id}", options=make_request_options( extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout ), @@ -200,7 +200,7 @@ def update( if not vector_store_id: raise ValueError(f"Expected a non-empty value for `vector_store_id` but received {vector_store_id!r}") return self._post( - f"/v1/openai/v1/vector_stores/{vector_store_id}", + f"/v1/vector_stores/{vector_store_id}", body=maybe_transform( { "expires_after": expires_after, @@ -255,7 +255,7 @@ def list( timeout: Override the client-level default timeout for this request, in seconds """ return self._get_api_list( - "/v1/openai/v1/vector_stores", + "/v1/vector_stores", page=SyncOpenAICursorPage[VectorStore], options=make_request_options( extra_headers=extra_headers, @@ -301,7 +301,7 @@ def delete( if not vector_store_id: raise ValueError(f"Expected a non-empty value for `vector_store_id` but received {vector_store_id!r}") return self._delete( - f"/v1/openai/v1/vector_stores/{vector_store_id}", + f"/v1/vector_stores/{vector_store_id}", options=make_request_options( extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout ), @@ -354,7 +354,7 @@ def search( if not vector_store_id: raise ValueError(f"Expected a non-empty value for `vector_store_id` but received {vector_store_id!r}") return self._post( - f"/v1/openai/v1/vector_stores/{vector_store_id}/search", + f"/v1/vector_stores/{vector_store_id}/search", body=maybe_transform( { "query": query, @@ -446,7 +446,7 @@ async def create( timeout: Override the client-level default timeout for this request, in seconds """ return await self._post( - "/v1/openai/v1/vector_stores", + "/v1/vector_stores", body=await async_maybe_transform( { "chunking_strategy": chunking_strategy, @@ -492,7 +492,7 @@ async def retrieve( if not vector_store_id: raise ValueError(f"Expected a non-empty value for `vector_store_id` but received {vector_store_id!r}") return await self._get( - f"/v1/openai/v1/vector_stores/{vector_store_id}", + f"/v1/vector_stores/{vector_store_id}", options=make_request_options( extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout ), @@ -534,7 +534,7 @@ async def update( if not vector_store_id: raise ValueError(f"Expected a non-empty value for `vector_store_id` but received {vector_store_id!r}") return await self._post( - f"/v1/openai/v1/vector_stores/{vector_store_id}", + f"/v1/vector_stores/{vector_store_id}", body=await async_maybe_transform( { "expires_after": expires_after, @@ -589,7 +589,7 @@ def list( timeout: Override the client-level default timeout for this request, in seconds """ return self._get_api_list( - "/v1/openai/v1/vector_stores", + "/v1/vector_stores", page=AsyncOpenAICursorPage[VectorStore], options=make_request_options( extra_headers=extra_headers, @@ -635,7 +635,7 @@ async def delete( if not vector_store_id: raise ValueError(f"Expected a non-empty value for `vector_store_id` but received {vector_store_id!r}") return await self._delete( - f"/v1/openai/v1/vector_stores/{vector_store_id}", 
+ f"/v1/vector_stores/{vector_store_id}", options=make_request_options( extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout ), @@ -688,7 +688,7 @@ async def search( if not vector_store_id: raise ValueError(f"Expected a non-empty value for `vector_store_id` but received {vector_store_id!r}") return await self._post( - f"/v1/openai/v1/vector_stores/{vector_store_id}/search", + f"/v1/vector_stores/{vector_store_id}/search", body=await async_maybe_transform( { "query": query, diff --git a/src/llama_stack_client/types/__init__.py b/src/llama_stack_client/types/__init__.py index 56b7f887..f81ada61 100644 --- a/src/llama_stack_client/types/__init__.py +++ b/src/llama_stack_client/types/__init__.py @@ -17,18 +17,15 @@ QueryConfig as QueryConfig, QueryResult as QueryResult, UserMessage as UserMessage, - ContentDelta as ContentDelta, ScoringResult as ScoringResult, SystemMessage as SystemMessage, ResponseFormat as ResponseFormat, SamplingParams as SamplingParams, - BatchCompletion as BatchCompletion, SafetyViolation as SafetyViolation, CompletionMessage as CompletionMessage, InterleavedContent as InterleavedContent, ToolParamDefinition as ToolParamDefinition, ToolResponseMessage as ToolResponseMessage, - QueryGeneratorConfig as QueryGeneratorConfig, ChatCompletionResponse as ChatCompletionResponse, InterleavedContentItem as InterleavedContentItem, ) @@ -48,7 +45,6 @@ from .tool_def_param import ToolDefParam as ToolDefParam from .create_response import CreateResponse as CreateResponse from .response_object import ResponseObject as ResponseObject -from .token_log_probs import TokenLogProbs as TokenLogProbs from .file_list_params import FileListParams as FileListParams from .shield_call_step import ShieldCallStep as ShieldCallStep from .span_with_status import SpanWithStatus as SpanWithStatus @@ -61,8 +57,6 @@ from .tool_list_response import ToolListResponse as ToolListResponse from .agent_create_params import AgentCreateParams as AgentCreateParams from .agent_list_response import AgentListResponse as AgentListResponse -from .completion_response import CompletionResponse as CompletionResponse -from .embeddings_response import EmbeddingsResponse as EmbeddingsResponse from .list_files_response import ListFilesResponse as ListFilesResponse from .list_tools_response import ListToolsResponse as ListToolsResponse from .model_list_response import ModelListResponse as ModelListResponse @@ -71,7 +65,6 @@ from .tool_execution_step import ToolExecutionStep as ToolExecutionStep from .tool_response_param import ToolResponseParam as ToolResponseParam from .delete_file_response import DeleteFileResponse as DeleteFileResponse -from .eval_candidate_param import EvalCandidateParam as EvalCandidateParam from .eval_run_eval_params import EvalRunEvalParams as EvalRunEvalParams from .list_models_response import ListModelsResponse as ListModelsResponse from .list_routes_response import ListRoutesResponse as ListRoutesResponse @@ -134,8 +127,6 @@ from .vector_store_create_params import VectorStoreCreateParams as VectorStoreCreateParams from .vector_store_search_params import VectorStoreSearchParams as VectorStoreSearchParams from .vector_store_update_params import VectorStoreUpdateParams as VectorStoreUpdateParams -from .inference_completion_params import InferenceCompletionParams as InferenceCompletionParams -from .inference_embeddings_params import InferenceEmbeddingsParams as InferenceEmbeddingsParams from .list_vector_stores_response import ListVectorStoresResponse as 
ListVectorStoresResponse from .telemetry_get_span_response import TelemetryGetSpanResponse as TelemetryGetSpanResponse from .vector_db_register_response import VectorDBRegisterResponse as VectorDBRegisterResponse @@ -154,26 +145,15 @@ from .list_scoring_functions_response import ListScoringFunctionsResponse as ListScoringFunctionsResponse from .telemetry_query_traces_response import TelemetryQueryTracesResponse as TelemetryQueryTracesResponse from .tool_runtime_invoke_tool_params import ToolRuntimeInvokeToolParams as ToolRuntimeInvokeToolParams -from .inference_chat_completion_params import InferenceChatCompletionParams as InferenceChatCompletionParams from .list_post_training_jobs_response import ListPostTrainingJobsResponse as ListPostTrainingJobsResponse from .scoring_function_register_params import ScoringFunctionRegisterParams as ScoringFunctionRegisterParams from .telemetry_get_span_tree_response import TelemetryGetSpanTreeResponse as TelemetryGetSpanTreeResponse from .telemetry_query_metrics_response import TelemetryQueryMetricsResponse as TelemetryQueryMetricsResponse from .tool_runtime_list_tools_response import ToolRuntimeListToolsResponse as ToolRuntimeListToolsResponse -from .inference_batch_completion_params import InferenceBatchCompletionParams as InferenceBatchCompletionParams from .synthetic_data_generation_response import SyntheticDataGenerationResponse as SyntheticDataGenerationResponse -from .chat_completion_response_stream_chunk import ( - ChatCompletionResponseStreamChunk as ChatCompletionResponseStreamChunk, -) -from .inference_batch_chat_completion_params import ( - InferenceBatchChatCompletionParams as InferenceBatchChatCompletionParams, -) from .telemetry_save_spans_to_dataset_params import ( TelemetrySaveSpansToDatasetParams as TelemetrySaveSpansToDatasetParams, ) -from .inference_batch_chat_completion_response import ( - InferenceBatchChatCompletionResponse as InferenceBatchChatCompletionResponse, -) from .post_training_preference_optimize_params import ( PostTrainingPreferenceOptimizeParams as PostTrainingPreferenceOptimizeParams, ) diff --git a/src/llama_stack_client/types/agents/__init__.py b/src/llama_stack_client/types/agents/__init__.py index f4f48353..3a144840 100644 --- a/src/llama_stack_client/types/agents/__init__.py +++ b/src/llama_stack_client/types/agents/__init__.py @@ -13,5 +13,4 @@ from .step_retrieve_response import StepRetrieveResponse as StepRetrieveResponse from .session_create_response import SessionCreateResponse as SessionCreateResponse from .session_retrieve_params import SessionRetrieveParams as SessionRetrieveParams -from .turn_response_event_payload import TurnResponseEventPayload as TurnResponseEventPayload from .agent_turn_response_stream_chunk import AgentTurnResponseStreamChunk as AgentTurnResponseStreamChunk diff --git a/src/llama_stack_client/types/agents/turn_response_event.py b/src/llama_stack_client/types/agents/turn_response_event.py index df213246..c52121ab 100644 --- a/src/llama_stack_client/types/agents/turn_response_event.py +++ b/src/llama_stack_client/types/agents/turn_response_event.py @@ -1,11 +1,160 @@ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. 
+from typing import Dict, List, Union, Optional +from typing_extensions import Literal, Annotated, TypeAlias + +from .turn import Turn +from ..._utils import PropertyInfo from ..._models import BaseModel -from .turn_response_event_payload import TurnResponseEventPayload +from ..inference_step import InferenceStep +from ..shared.tool_call import ToolCall +from ..shield_call_step import ShieldCallStep +from ..tool_execution_step import ToolExecutionStep +from ..memory_retrieval_step import MemoryRetrievalStep + +__all__ = [ + "TurnResponseEvent", + "Payload", + "PayloadAgentTurnResponseStepStartPayload", + "PayloadAgentTurnResponseStepProgressPayload", + "PayloadAgentTurnResponseStepProgressPayloadDelta", + "PayloadAgentTurnResponseStepProgressPayloadDeltaTextDelta", + "PayloadAgentTurnResponseStepProgressPayloadDeltaImageDelta", + "PayloadAgentTurnResponseStepProgressPayloadDeltaToolCallDelta", + "PayloadAgentTurnResponseStepProgressPayloadDeltaToolCallDeltaToolCall", + "PayloadAgentTurnResponseStepCompletePayload", + "PayloadAgentTurnResponseStepCompletePayloadStepDetails", + "PayloadAgentTurnResponseTurnStartPayload", + "PayloadAgentTurnResponseTurnCompletePayload", + "PayloadAgentTurnResponseTurnAwaitingInputPayload", +] + + +class PayloadAgentTurnResponseStepStartPayload(BaseModel): + event_type: Literal["step_start"] + """Type of event being reported""" + + step_id: str + """Unique identifier for the step within a turn""" + + step_type: Literal["inference", "tool_execution", "shield_call", "memory_retrieval"] + """Type of step being executed""" + + metadata: Optional[Dict[str, Union[bool, float, str, List[object], object, None]]] = None + """(Optional) Additional metadata for the step""" + + +class PayloadAgentTurnResponseStepProgressPayloadDeltaTextDelta(BaseModel): + text: str + """The incremental text content""" + + type: Literal["text"] + """Discriminator type of the delta. Always "text" """ + + +class PayloadAgentTurnResponseStepProgressPayloadDeltaImageDelta(BaseModel): + image: str + """The incremental image data as bytes""" + + type: Literal["image"] + """Discriminator type of the delta. Always "image" """ + + +PayloadAgentTurnResponseStepProgressPayloadDeltaToolCallDeltaToolCall: TypeAlias = Union[str, ToolCall] + + +class PayloadAgentTurnResponseStepProgressPayloadDeltaToolCallDelta(BaseModel): + parse_status: Literal["started", "in_progress", "failed", "succeeded"] + """Current parsing status of the tool call""" + + tool_call: PayloadAgentTurnResponseStepProgressPayloadDeltaToolCallDeltaToolCall + """Either an in-progress tool call string or the final parsed tool call""" + + type: Literal["tool_call"] + """Discriminator type of the delta. 
Always "tool_call" """ + + +PayloadAgentTurnResponseStepProgressPayloadDelta: TypeAlias = Annotated[ + Union[ + PayloadAgentTurnResponseStepProgressPayloadDeltaTextDelta, + PayloadAgentTurnResponseStepProgressPayloadDeltaImageDelta, + PayloadAgentTurnResponseStepProgressPayloadDeltaToolCallDelta, + ], + PropertyInfo(discriminator="type"), +] + + +class PayloadAgentTurnResponseStepProgressPayload(BaseModel): + delta: PayloadAgentTurnResponseStepProgressPayloadDelta + """Incremental content changes during step execution""" + + event_type: Literal["step_progress"] + """Type of event being reported""" + + step_id: str + """Unique identifier for the step within a turn""" + + step_type: Literal["inference", "tool_execution", "shield_call", "memory_retrieval"] + """Type of step being executed""" + + +PayloadAgentTurnResponseStepCompletePayloadStepDetails: TypeAlias = Annotated[ + Union[InferenceStep, ToolExecutionStep, ShieldCallStep, MemoryRetrievalStep], + PropertyInfo(discriminator="step_type"), +] + + +class PayloadAgentTurnResponseStepCompletePayload(BaseModel): + event_type: Literal["step_complete"] + """Type of event being reported""" + + step_details: PayloadAgentTurnResponseStepCompletePayloadStepDetails + """Complete details of the executed step""" + + step_id: str + """Unique identifier for the step within a turn""" + + step_type: Literal["inference", "tool_execution", "shield_call", "memory_retrieval"] + """Type of step being executed""" + + +class PayloadAgentTurnResponseTurnStartPayload(BaseModel): + event_type: Literal["turn_start"] + """Type of event being reported""" + + turn_id: str + """Unique identifier for the turn within a session""" + + +class PayloadAgentTurnResponseTurnCompletePayload(BaseModel): + event_type: Literal["turn_complete"] + """Type of event being reported""" + + turn: Turn + """Complete turn data including all steps and results""" + + +class PayloadAgentTurnResponseTurnAwaitingInputPayload(BaseModel): + event_type: Literal["turn_awaiting_input"] + """Type of event being reported""" + + turn: Turn + """Turn data when waiting for external tool responses""" + -__all__ = ["TurnResponseEvent"] +Payload: TypeAlias = Annotated[ + Union[ + PayloadAgentTurnResponseStepStartPayload, + PayloadAgentTurnResponseStepProgressPayload, + PayloadAgentTurnResponseStepCompletePayload, + PayloadAgentTurnResponseTurnStartPayload, + PayloadAgentTurnResponseTurnCompletePayload, + PayloadAgentTurnResponseTurnAwaitingInputPayload, + ], + PropertyInfo(discriminator="event_type"), +] class TurnResponseEvent(BaseModel): - payload: TurnResponseEventPayload + payload: Payload """Event-specific payload containing event data""" diff --git a/src/llama_stack_client/types/agents/turn_response_event_payload.py b/src/llama_stack_client/types/agents/turn_response_event_payload.py deleted file mode 100644 index 1844c61e..00000000 --- a/src/llama_stack_client/types/agents/turn_response_event_payload.py +++ /dev/null @@ -1,109 +0,0 @@ -# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. 
- -from typing import Dict, List, Union, Optional -from typing_extensions import Literal, Annotated, TypeAlias - -from .turn import Turn -from ..._utils import PropertyInfo -from ..._models import BaseModel -from ..inference_step import InferenceStep -from ..shield_call_step import ShieldCallStep -from ..tool_execution_step import ToolExecutionStep -from ..shared.content_delta import ContentDelta -from ..memory_retrieval_step import MemoryRetrievalStep - -__all__ = [ - "TurnResponseEventPayload", - "AgentTurnResponseStepStartPayload", - "AgentTurnResponseStepProgressPayload", - "AgentTurnResponseStepCompletePayload", - "AgentTurnResponseStepCompletePayloadStepDetails", - "AgentTurnResponseTurnStartPayload", - "AgentTurnResponseTurnCompletePayload", - "AgentTurnResponseTurnAwaitingInputPayload", -] - - -class AgentTurnResponseStepStartPayload(BaseModel): - event_type: Literal["step_start"] - """Type of event being reported""" - - step_id: str - """Unique identifier for the step within a turn""" - - step_type: Literal["inference", "tool_execution", "shield_call", "memory_retrieval"] - """Type of step being executed""" - - metadata: Optional[Dict[str, Union[bool, float, str, List[object], object, None]]] = None - """(Optional) Additional metadata for the step""" - - -class AgentTurnResponseStepProgressPayload(BaseModel): - delta: ContentDelta - """Incremental content changes during step execution""" - - event_type: Literal["step_progress"] - """Type of event being reported""" - - step_id: str - """Unique identifier for the step within a turn""" - - step_type: Literal["inference", "tool_execution", "shield_call", "memory_retrieval"] - """Type of step being executed""" - - -AgentTurnResponseStepCompletePayloadStepDetails: TypeAlias = Annotated[ - Union[InferenceStep, ToolExecutionStep, ShieldCallStep, MemoryRetrievalStep], - PropertyInfo(discriminator="step_type"), -] - - -class AgentTurnResponseStepCompletePayload(BaseModel): - event_type: Literal["step_complete"] - """Type of event being reported""" - - step_details: AgentTurnResponseStepCompletePayloadStepDetails - """Complete details of the executed step""" - - step_id: str - """Unique identifier for the step within a turn""" - - step_type: Literal["inference", "tool_execution", "shield_call", "memory_retrieval"] - """Type of step being executed""" - - -class AgentTurnResponseTurnStartPayload(BaseModel): - event_type: Literal["turn_start"] - """Type of event being reported""" - - turn_id: str - """Unique identifier for the turn within a session""" - - -class AgentTurnResponseTurnCompletePayload(BaseModel): - event_type: Literal["turn_complete"] - """Type of event being reported""" - - turn: Turn - """Complete turn data including all steps and results""" - - -class AgentTurnResponseTurnAwaitingInputPayload(BaseModel): - event_type: Literal["turn_awaiting_input"] - """Type of event being reported""" - - turn: Turn - """Turn data when waiting for external tool responses""" - - -TurnResponseEventPayload: TypeAlias = Annotated[ - Union[ - AgentTurnResponseStepStartPayload, - AgentTurnResponseStepProgressPayload, - AgentTurnResponseStepCompletePayload, - AgentTurnResponseTurnStartPayload, - AgentTurnResponseTurnCompletePayload, - AgentTurnResponseTurnAwaitingInputPayload, - ], - PropertyInfo(discriminator="event_type"), -] diff --git a/src/llama_stack_client/types/benchmark_config_param.py b/src/llama_stack_client/types/benchmark_config_param.py index 740bf99b..dc968521 100644 --- a/src/llama_stack_client/types/benchmark_config_param.py +++ 
b/src/llama_stack_client/types/benchmark_config_param.py @@ -2,17 +2,42 @@ from __future__ import annotations -from typing import Dict -from typing_extensions import Required, TypedDict +from typing import Dict, Union +from typing_extensions import Literal, Required, TypeAlias, TypedDict -from .eval_candidate_param import EvalCandidateParam from .scoring_fn_params_param import ScoringFnParamsParam +from .shared_params.agent_config import AgentConfig +from .shared_params.system_message import SystemMessage +from .shared_params.sampling_params import SamplingParams -__all__ = ["BenchmarkConfigParam"] +__all__ = ["BenchmarkConfigParam", "EvalCandidate", "EvalCandidateModelCandidate", "EvalCandidateAgentCandidate"] + + +class EvalCandidateModelCandidate(TypedDict, total=False): + model: Required[str] + """The model ID to evaluate.""" + + sampling_params: Required[SamplingParams] + """The sampling parameters for the model.""" + + type: Required[Literal["model"]] + + system_message: SystemMessage + """(Optional) The system message providing instructions or context to the model.""" + + +class EvalCandidateAgentCandidate(TypedDict, total=False): + config: Required[AgentConfig] + """The configuration for the agent candidate.""" + + type: Required[Literal["agent"]] + + +EvalCandidate: TypeAlias = Union[EvalCandidateModelCandidate, EvalCandidateAgentCandidate] class BenchmarkConfigParam(TypedDict, total=False): - eval_candidate: Required[EvalCandidateParam] + eval_candidate: Required[EvalCandidate] """The candidate to evaluate.""" scoring_params: Required[Dict[str, ScoringFnParamsParam]] diff --git a/src/llama_stack_client/types/chat_completion_response_stream_chunk.py b/src/llama_stack_client/types/chat_completion_response_stream_chunk.py deleted file mode 100644 index 1a55f3d1..00000000 --- a/src/llama_stack_client/types/chat_completion_response_stream_chunk.py +++ /dev/null @@ -1,36 +0,0 @@ -# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. - -from typing import List, Optional -from typing_extensions import Literal - -from .._models import BaseModel -from .shared.metric import Metric -from .token_log_probs import TokenLogProbs -from .shared.content_delta import ContentDelta - -__all__ = ["ChatCompletionResponseStreamChunk", "Event"] - - -class Event(BaseModel): - delta: ContentDelta - """Content generated since last event. - - This can be one or more tokens, or a tool call. - """ - - event_type: Literal["start", "complete", "progress"] - """Type of the event""" - - logprobs: Optional[List[TokenLogProbs]] = None - """Optional log probabilities for generated tokens""" - - stop_reason: Optional[Literal["end_of_turn", "end_of_message", "out_of_tokens"]] = None - """Optional reason why generation stopped, if complete""" - - -class ChatCompletionResponseStreamChunk(BaseModel): - event: Event - """The event containing the new content""" - - metrics: Optional[List[Metric]] = None - """(Optional) List of metrics associated with the API response""" diff --git a/src/llama_stack_client/types/completion_response.py b/src/llama_stack_client/types/completion_response.py deleted file mode 100644 index 9718be8a..00000000 --- a/src/llama_stack_client/types/completion_response.py +++ /dev/null @@ -1,24 +0,0 @@ -# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. 
- -from typing import List, Optional -from typing_extensions import Literal - -from .._models import BaseModel -from .shared.metric import Metric -from .token_log_probs import TokenLogProbs - -__all__ = ["CompletionResponse"] - - -class CompletionResponse(BaseModel): - content: str - """The generated completion text""" - - stop_reason: Literal["end_of_turn", "end_of_message", "out_of_tokens"] - """Reason why generation stopped""" - - logprobs: Optional[List[TokenLogProbs]] = None - """Optional log probabilities for generated tokens""" - - metrics: Optional[List[Metric]] = None - """(Optional) List of metrics associated with the API response""" diff --git a/src/llama_stack_client/types/embeddings_response.py b/src/llama_stack_client/types/embeddings_response.py deleted file mode 100644 index f36c6b97..00000000 --- a/src/llama_stack_client/types/embeddings_response.py +++ /dev/null @@ -1,16 +0,0 @@ -# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. - -from typing import List - -from .._models import BaseModel - -__all__ = ["EmbeddingsResponse"] - - -class EmbeddingsResponse(BaseModel): - embeddings: List[List[float]] - """List of embedding vectors, one per input content. - - Each embedding is a list of floats. The dimensionality of the embedding is - model-specific; you can check model metadata using /models/{model_id} - """ diff --git a/src/llama_stack_client/types/eval_candidate_param.py b/src/llama_stack_client/types/eval_candidate_param.py deleted file mode 100644 index be1b21c8..00000000 --- a/src/llama_stack_client/types/eval_candidate_param.py +++ /dev/null @@ -1,35 +0,0 @@ -# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. - -from __future__ import annotations - -from typing import Union -from typing_extensions import Literal, Required, TypeAlias, TypedDict - -from .shared_params.agent_config import AgentConfig -from .shared_params.system_message import SystemMessage -from .shared_params.sampling_params import SamplingParams - -__all__ = ["EvalCandidateParam", "ModelCandidate", "AgentCandidate"] - - -class ModelCandidate(TypedDict, total=False): - model: Required[str] - """The model ID to evaluate.""" - - sampling_params: Required[SamplingParams] - """The sampling parameters for the model.""" - - type: Required[Literal["model"]] - - system_message: SystemMessage - """(Optional) The system message providing instructions or context to the model.""" - - -class AgentCandidate(TypedDict, total=False): - config: Required[AgentConfig] - """The configuration for the agent candidate.""" - - type: Required[Literal["agent"]] - - -EvalCandidateParam: TypeAlias = Union[ModelCandidate, AgentCandidate] diff --git a/src/llama_stack_client/types/file_create_params.py b/src/llama_stack_client/types/file_create_params.py index 8322c0a9..2be39a7a 100644 --- a/src/llama_stack_client/types/file_create_params.py +++ b/src/llama_stack_client/types/file_create_params.py @@ -6,7 +6,7 @@ from .._types import FileTypes -__all__ = ["FileCreateParams"] +__all__ = ["FileCreateParams", "ExpiresAfter"] class FileCreateParams(TypedDict, total=False): @@ -14,3 +14,16 @@ class FileCreateParams(TypedDict, total=False): purpose: Required[Literal["assistants", "batch"]] """Valid purpose values for OpenAI Files API.""" + + expires_after: ExpiresAfter + """Control expiration of uploaded files. 
Params: + + - anchor, must be "created_at" + - seconds, must be int between 3600 and 2592000 (1 hour to 30 days) + """ + + +class ExpiresAfter(TypedDict, total=False): + anchor: Required[Literal["created_at"]] + + seconds: Required[int] diff --git a/src/llama_stack_client/types/inference_batch_chat_completion_params.py b/src/llama_stack_client/types/inference_batch_chat_completion_params.py deleted file mode 100644 index b5da0f0e..00000000 --- a/src/llama_stack_client/types/inference_batch_chat_completion_params.py +++ /dev/null @@ -1,85 +0,0 @@ -# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. - -from __future__ import annotations - -from typing import Dict, Union, Iterable -from typing_extensions import Literal, Required, TypedDict - -from .shared_params.message import Message -from .shared_params.response_format import ResponseFormat -from .shared_params.sampling_params import SamplingParams -from .shared_params.tool_param_definition import ToolParamDefinition - -__all__ = ["InferenceBatchChatCompletionParams", "Logprobs", "ToolConfig", "Tool"] - - -class InferenceBatchChatCompletionParams(TypedDict, total=False): - messages_batch: Required[Iterable[Iterable[Message]]] - """The messages to generate completions for.""" - - model_id: Required[str] - """The identifier of the model to use. - - The model must be registered with Llama Stack and available via the /models - endpoint. - """ - - logprobs: Logprobs - """ - (Optional) If specified, log probabilities for each token position will be - returned. - """ - - response_format: ResponseFormat - """(Optional) Grammar specification for guided (structured) decoding.""" - - sampling_params: SamplingParams - """(Optional) Parameters to control the sampling strategy.""" - - tool_config: ToolConfig - """(Optional) Configuration for tool use.""" - - tools: Iterable[Tool] - """(Optional) List of tool definitions available to the model.""" - - -class Logprobs(TypedDict, total=False): - top_k: int - """How many tokens (for each position) to return log probabilities for.""" - - -class ToolConfig(TypedDict, total=False): - system_message_behavior: Literal["append", "replace"] - """(Optional) Config for how to override the default system prompt. - - - `SystemMessageBehavior.append`: Appends the provided system message to the - default system prompt. - `SystemMessageBehavior.replace`: Replaces the default - system prompt with the provided system message. The system message can include - the string '{{function_definitions}}' to indicate where the function - definitions should be inserted. - """ - - tool_choice: Union[Literal["auto", "required", "none"], str] - """(Optional) Whether tool use is automatic, required, or none. - - Can also specify a tool name to use a specific tool. Defaults to - ToolChoice.auto. - """ - - tool_prompt_format: Literal["json", "function_tag", "python_list"] - """(Optional) Instructs the model how to format tool calls. - - By default, Llama Stack will attempt to use a format that is best adapted to the - model. - `ToolPromptFormat.json`: The tool calls are formatted as a JSON - object. - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a - tag. - `ToolPromptFormat.python_list`: The tool calls - are output as Python syntax -- a list of function calls. 
- """ - - -class Tool(TypedDict, total=False): - tool_name: Required[Union[Literal["brave_search", "wolfram_alpha", "photogen", "code_interpreter"], str]] - - description: str - - parameters: Dict[str, ToolParamDefinition] diff --git a/src/llama_stack_client/types/inference_batch_chat_completion_response.py b/src/llama_stack_client/types/inference_batch_chat_completion_response.py deleted file mode 100644 index ed24908d..00000000 --- a/src/llama_stack_client/types/inference_batch_chat_completion_response.py +++ /dev/null @@ -1,13 +0,0 @@ -# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. - -from typing import List - -from .._models import BaseModel -from .shared.chat_completion_response import ChatCompletionResponse - -__all__ = ["InferenceBatchChatCompletionResponse"] - - -class InferenceBatchChatCompletionResponse(BaseModel): - batch: List[ChatCompletionResponse] - """List of chat completion responses, one for each conversation in the batch""" diff --git a/src/llama_stack_client/types/inference_batch_completion_params.py b/src/llama_stack_client/types/inference_batch_completion_params.py deleted file mode 100644 index b225b883..00000000 --- a/src/llama_stack_client/types/inference_batch_completion_params.py +++ /dev/null @@ -1,41 +0,0 @@ -# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. - -from __future__ import annotations - -from typing_extensions import Required, TypedDict - -from .._types import SequenceNotStr -from .shared_params.response_format import ResponseFormat -from .shared_params.sampling_params import SamplingParams -from .shared_params.interleaved_content import InterleavedContent - -__all__ = ["InferenceBatchCompletionParams", "Logprobs"] - - -class InferenceBatchCompletionParams(TypedDict, total=False): - content_batch: Required[SequenceNotStr[InterleavedContent]] - """The content to generate completions for.""" - - model_id: Required[str] - """The identifier of the model to use. - - The model must be registered with Llama Stack and available via the /models - endpoint. - """ - - logprobs: Logprobs - """ - (Optional) If specified, log probabilities for each token position will be - returned. - """ - - response_format: ResponseFormat - """(Optional) Grammar specification for guided (structured) decoding.""" - - sampling_params: SamplingParams - """(Optional) Parameters to control the sampling strategy.""" - - -class Logprobs(TypedDict, total=False): - top_k: int - """How many tokens (for each position) to return log probabilities for.""" diff --git a/src/llama_stack_client/types/inference_chat_completion_params.py b/src/llama_stack_client/types/inference_chat_completion_params.py deleted file mode 100644 index 746d3dee..00000000 --- a/src/llama_stack_client/types/inference_chat_completion_params.py +++ /dev/null @@ -1,134 +0,0 @@ -# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. 
- -from __future__ import annotations - -from typing import Dict, Union, Iterable -from typing_extensions import Literal, Required, TypedDict - -from .shared_params.message import Message -from .shared_params.response_format import ResponseFormat -from .shared_params.sampling_params import SamplingParams -from .shared_params.tool_param_definition import ToolParamDefinition - -__all__ = [ - "InferenceChatCompletionParamsBase", - "Logprobs", - "ToolConfig", - "Tool", - "InferenceChatCompletionParamsNonStreaming", - "InferenceChatCompletionParamsStreaming", -] - - -class InferenceChatCompletionParamsBase(TypedDict, total=False): - messages: Required[Iterable[Message]] - """List of messages in the conversation.""" - - model_id: Required[str] - """The identifier of the model to use. - - The model must be registered with Llama Stack and available via the /models - endpoint. - """ - - logprobs: Logprobs - """ - (Optional) If specified, log probabilities for each token position will be - returned. - """ - - response_format: ResponseFormat - """(Optional) Grammar specification for guided (structured) decoding. - - There are two options: - `ResponseFormat.json_schema`: The grammar is a JSON - schema. Most providers support this format. - `ResponseFormat.grammar`: The - grammar is a BNF grammar. This format is more flexible, but not all providers - support it. - """ - - sampling_params: SamplingParams - """Parameters to control the sampling strategy.""" - - tool_choice: Literal["auto", "required", "none"] - """(Optional) Whether tool use is required or automatic. - - Defaults to ToolChoice.auto. .. deprecated:: Use tool_config instead. - """ - - tool_config: ToolConfig - """(Optional) Configuration for tool use.""" - - tool_prompt_format: Literal["json", "function_tag", "python_list"] - """(Optional) Instructs the model how to format tool calls. - - By default, Llama Stack will attempt to use a format that is best adapted to the - model. - `ToolPromptFormat.json`: The tool calls are formatted as a JSON - object. - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a - tag. - `ToolPromptFormat.python_list`: The tool calls - are output as Python syntax -- a list of function calls. .. deprecated:: Use - tool_config instead. - """ - - tools: Iterable[Tool] - """(Optional) List of tool definitions available to the model.""" - - -class Logprobs(TypedDict, total=False): - top_k: int - """How many tokens (for each position) to return log probabilities for.""" - - -class ToolConfig(TypedDict, total=False): - system_message_behavior: Literal["append", "replace"] - """(Optional) Config for how to override the default system prompt. - - - `SystemMessageBehavior.append`: Appends the provided system message to the - default system prompt. - `SystemMessageBehavior.replace`: Replaces the default - system prompt with the provided system message. The system message can include - the string '{{function_definitions}}' to indicate where the function - definitions should be inserted. - """ - - tool_choice: Union[Literal["auto", "required", "none"], str] - """(Optional) Whether tool use is automatic, required, or none. - - Can also specify a tool name to use a specific tool. Defaults to - ToolChoice.auto. - """ - - tool_prompt_format: Literal["json", "function_tag", "python_list"] - """(Optional) Instructs the model how to format tool calls. - - By default, Llama Stack will attempt to use a format that is best adapted to the - model. - `ToolPromptFormat.json`: The tool calls are formatted as a JSON - object. 
- `ToolPromptFormat.function_tag`: The tool calls are enclosed in a - tag. - `ToolPromptFormat.python_list`: The tool calls - are output as Python syntax -- a list of function calls. - """ - - -class Tool(TypedDict, total=False): - tool_name: Required[Union[Literal["brave_search", "wolfram_alpha", "photogen", "code_interpreter"], str]] - - description: str - - parameters: Dict[str, ToolParamDefinition] - - -class InferenceChatCompletionParamsNonStreaming(InferenceChatCompletionParamsBase, total=False): - stream: Literal[False] - """(Optional) If True, generate an SSE event stream of the response. - - Defaults to False. - """ - - -class InferenceChatCompletionParamsStreaming(InferenceChatCompletionParamsBase): - stream: Required[Literal[True]] - """(Optional) If True, generate an SSE event stream of the response. - - Defaults to False. - """ - - -InferenceChatCompletionParams = Union[InferenceChatCompletionParamsNonStreaming, InferenceChatCompletionParamsStreaming] diff --git a/src/llama_stack_client/types/inference_completion_params.py b/src/llama_stack_client/types/inference_completion_params.py deleted file mode 100644 index c122f017..00000000 --- a/src/llama_stack_client/types/inference_completion_params.py +++ /dev/null @@ -1,65 +0,0 @@ -# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. - -from __future__ import annotations - -from typing import Union -from typing_extensions import Literal, Required, TypedDict - -from .shared_params.response_format import ResponseFormat -from .shared_params.sampling_params import SamplingParams -from .shared_params.interleaved_content import InterleavedContent - -__all__ = [ - "InferenceCompletionParamsBase", - "Logprobs", - "InferenceCompletionParamsNonStreaming", - "InferenceCompletionParamsStreaming", -] - - -class InferenceCompletionParamsBase(TypedDict, total=False): - content: Required[InterleavedContent] - """The content to generate a completion for.""" - - model_id: Required[str] - """The identifier of the model to use. - - The model must be registered with Llama Stack and available via the /models - endpoint. - """ - - logprobs: Logprobs - """ - (Optional) If specified, log probabilities for each token position will be - returned. - """ - - response_format: ResponseFormat - """(Optional) Grammar specification for guided (structured) decoding.""" - - sampling_params: SamplingParams - """(Optional) Parameters to control the sampling strategy.""" - - -class Logprobs(TypedDict, total=False): - top_k: int - """How many tokens (for each position) to return log probabilities for.""" - - -class InferenceCompletionParamsNonStreaming(InferenceCompletionParamsBase, total=False): - stream: Literal[False] - """(Optional) If True, generate an SSE event stream of the response. - - Defaults to False. - """ - - -class InferenceCompletionParamsStreaming(InferenceCompletionParamsBase): - stream: Required[Literal[True]] - """(Optional) If True, generate an SSE event stream of the response. - - Defaults to False. - """ - - -InferenceCompletionParams = Union[InferenceCompletionParamsNonStreaming, InferenceCompletionParamsStreaming] diff --git a/src/llama_stack_client/types/inference_embeddings_params.py b/src/llama_stack_client/types/inference_embeddings_params.py deleted file mode 100644 index a1be545b..00000000 --- a/src/llama_stack_client/types/inference_embeddings_params.py +++ /dev/null @@ -1,46 +0,0 @@ -# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. 
- -from __future__ import annotations - -from typing import Union, Iterable -from typing_extensions import Literal, Required, TypedDict - -from .._types import SequenceNotStr -from .shared_params.interleaved_content_item import InterleavedContentItem - -__all__ = ["InferenceEmbeddingsParams"] - - -class InferenceEmbeddingsParams(TypedDict, total=False): - contents: Required[Union[SequenceNotStr[str], Iterable[InterleavedContentItem]]] - """List of contents to generate embeddings for. - - Each content can be a string or an InterleavedContentItem (and hence can be - multimodal). The behavior depends on the model and provider. Some models may - only support text. - """ - - model_id: Required[str] - """The identifier of the model to use. - - The model must be an embedding model registered with Llama Stack and available - via the /models endpoint. - """ - - output_dimension: int - """(Optional) Output dimensionality for the embeddings. - - Only supported by Matryoshka models. - """ - - task_type: Literal["query", "document"] - """ - (Optional) How is the embedding being used? This is only supported by asymmetric - embedding models. - """ - - text_truncation: Literal["none", "start", "end"] - """ - (Optional) Config for how to truncate text for embedding when text is longer - than the model's max sequence length. - """ diff --git a/src/llama_stack_client/types/models/openai_list_response.py b/src/llama_stack_client/types/models/openai_list_response.py index f14845d5..5b6c0358 100644 --- a/src/llama_stack_client/types/models/openai_list_response.py +++ b/src/llama_stack_client/types/models/openai_list_response.py @@ -1,21 +1,10 @@ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. from typing import List -from typing_extensions import Literal, TypeAlias +from typing_extensions import TypeAlias -from ..._models import BaseModel +from ..model import Model -__all__ = ["OpenAIListResponse", "OpenAIListResponseItem"] +__all__ = ["OpenAIListResponse"] - -class OpenAIListResponseItem(BaseModel): - id: str - - created: int - - object: Literal["model"] - - owned_by: str - - -OpenAIListResponse: TypeAlias = List[OpenAIListResponseItem] +OpenAIListResponse: TypeAlias = List[Model] diff --git a/src/llama_stack_client/types/response_list_response.py b/src/llama_stack_client/types/response_list_response.py index ae50d44a..ac7ec1b1 100644 --- a/src/llama_stack_client/types/response_list_response.py +++ b/src/llama_stack_client/types/response_list_response.py @@ -570,6 +570,3 @@ class ResponseListResponse(BaseModel): truncation: Optional[str] = None """(Optional) Truncation strategy applied to the response""" - - user: Optional[str] = None - """(Optional) User identifier associated with the request""" diff --git a/src/llama_stack_client/types/response_object.py b/src/llama_stack_client/types/response_object.py index c0f348a9..b618ddf5 100644 --- a/src/llama_stack_client/types/response_object.py +++ b/src/llama_stack_client/types/response_object.py @@ -361,6 +361,3 @@ def output_text(self) -> str: truncation: Optional[str] = None """(Optional) Truncation strategy applied to the response""" - - user: Optional[str] = None - """(Optional) User identifier associated with the request""" diff --git a/src/llama_stack_client/types/shared/__init__.py b/src/llama_stack_client/types/shared/__init__.py index fb14d8a6..f346cda7 100644 --- a/src/llama_stack_client/types/shared/__init__.py +++ b/src/llama_stack_client/types/shared/__init__.py @@ -9,17 +9,14 @@ from .query_config import 
QueryConfig as QueryConfig from .query_result import QueryResult as QueryResult from .user_message import UserMessage as UserMessage -from .content_delta import ContentDelta as ContentDelta from .scoring_result import ScoringResult as ScoringResult from .system_message import SystemMessage as SystemMessage from .response_format import ResponseFormat as ResponseFormat from .sampling_params import SamplingParams as SamplingParams -from .batch_completion import BatchCompletion as BatchCompletion from .safety_violation import SafetyViolation as SafetyViolation from .completion_message import CompletionMessage as CompletionMessage from .interleaved_content import InterleavedContent as InterleavedContent from .tool_param_definition import ToolParamDefinition as ToolParamDefinition from .tool_response_message import ToolResponseMessage as ToolResponseMessage -from .query_generator_config import QueryGeneratorConfig as QueryGeneratorConfig from .chat_completion_response import ChatCompletionResponse as ChatCompletionResponse from .interleaved_content_item import InterleavedContentItem as InterleavedContentItem diff --git a/src/llama_stack_client/types/shared/batch_completion.py b/src/llama_stack_client/types/shared/batch_completion.py deleted file mode 100644 index 43a0a735..00000000 --- a/src/llama_stack_client/types/shared/batch_completion.py +++ /dev/null @@ -1,13 +0,0 @@ -# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. - -from typing import List - -from ..._models import BaseModel -from ..completion_response import CompletionResponse - -__all__ = ["BatchCompletion"] - - -class BatchCompletion(BaseModel): - batch: List[CompletionResponse] - """List of completion responses, one for each input in the batch""" diff --git a/src/llama_stack_client/types/shared/chat_completion_response.py b/src/llama_stack_client/types/shared/chat_completion_response.py index 30191439..eb78a109 100644 --- a/src/llama_stack_client/types/shared/chat_completion_response.py +++ b/src/llama_stack_client/types/shared/chat_completion_response.py @@ -1,20 +1,24 @@ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. -from typing import List, Optional +from typing import Dict, List, Optional from .metric import Metric from ..._models import BaseModel -from ..token_log_probs import TokenLogProbs from .completion_message import CompletionMessage -__all__ = ["ChatCompletionResponse"] +__all__ = ["ChatCompletionResponse", "Logprob"] + + +class Logprob(BaseModel): + logprobs_by_token: Dict[str, float] + """Dictionary mapping tokens to their log probabilities""" class ChatCompletionResponse(BaseModel): completion_message: CompletionMessage """The complete response message""" - logprobs: Optional[List[TokenLogProbs]] = None + logprobs: Optional[List[Logprob]] = None """Optional log probabilities for generated tokens""" metrics: Optional[List[Metric]] = None diff --git a/src/llama_stack_client/types/shared/content_delta.py b/src/llama_stack_client/types/shared/content_delta.py deleted file mode 100644 index 7ed58d13..00000000 --- a/src/llama_stack_client/types/shared/content_delta.py +++ /dev/null @@ -1,43 +0,0 @@ -# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. 
- -from typing import Union -from typing_extensions import Literal, Annotated, TypeAlias - -from ..._utils import PropertyInfo -from ..._models import BaseModel -from .tool_call import ToolCall - -__all__ = ["ContentDelta", "TextDelta", "ImageDelta", "ToolCallDelta", "ToolCallDeltaToolCall"] - - -class TextDelta(BaseModel): - text: str - """The incremental text content""" - - type: Literal["text"] - """Discriminator type of the delta. Always "text" """ - - -class ImageDelta(BaseModel): - image: str - """The incremental image data as bytes""" - - type: Literal["image"] - """Discriminator type of the delta. Always "image" """ - - -ToolCallDeltaToolCall: TypeAlias = Union[str, ToolCall] - - -class ToolCallDelta(BaseModel): - parse_status: Literal["started", "in_progress", "failed", "succeeded"] - """Current parsing status of the tool call""" - - tool_call: ToolCallDeltaToolCall - """Either an in-progress tool call string or the final parsed tool call""" - - type: Literal["tool_call"] - """Discriminator type of the delta. Always "tool_call" """ - - -ContentDelta: TypeAlias = Annotated[Union[TextDelta, ImageDelta, ToolCallDelta], PropertyInfo(discriminator="type")] diff --git a/src/llama_stack_client/types/shared/query_config.py b/src/llama_stack_client/types/shared/query_config.py index 389514c7..a4a1f741 100644 --- a/src/llama_stack_client/types/shared/query_config.py +++ b/src/llama_stack_client/types/shared/query_config.py @@ -5,9 +5,41 @@ from ..._utils import PropertyInfo from ..._models import BaseModel -from .query_generator_config import QueryGeneratorConfig -__all__ = ["QueryConfig", "Ranker", "RankerRrfRanker", "RankerWeightedRanker"] +__all__ = [ + "QueryConfig", + "QueryGeneratorConfig", + "QueryGeneratorConfigDefaultRagQueryGeneratorConfig", + "QueryGeneratorConfigLlmragQueryGeneratorConfig", + "Ranker", + "RankerRrfRanker", + "RankerWeightedRanker", +] + + +class QueryGeneratorConfigDefaultRagQueryGeneratorConfig(BaseModel): + separator: str + """String separator used to join query terms""" + + type: Literal["default"] + """Type of query generator, always 'default'""" + + +class QueryGeneratorConfigLlmragQueryGeneratorConfig(BaseModel): + model: str + """Name of the language model to use for query generation""" + + template: str + """Template string for formatting the query generation prompt""" + + type: Literal["llm"] + """Type of query generator, always 'llm'""" + + +QueryGeneratorConfig: TypeAlias = Annotated[ + Union[QueryGeneratorConfigDefaultRagQueryGeneratorConfig, QueryGeneratorConfigLlmragQueryGeneratorConfig], + PropertyInfo(discriminator="type"), +] class RankerRrfRanker(BaseModel): diff --git a/src/llama_stack_client/types/shared/query_generator_config.py b/src/llama_stack_client/types/shared/query_generator_config.py deleted file mode 100644 index 624fc190..00000000 --- a/src/llama_stack_client/types/shared/query_generator_config.py +++ /dev/null @@ -1,33 +0,0 @@ -# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. 
- -from typing import Union -from typing_extensions import Literal, Annotated, TypeAlias - -from ..._utils import PropertyInfo -from ..._models import BaseModel - -__all__ = ["QueryGeneratorConfig", "DefaultRagQueryGeneratorConfig", "LlmragQueryGeneratorConfig"] - - -class DefaultRagQueryGeneratorConfig(BaseModel): - separator: str - """String separator used to join query terms""" - - type: Literal["default"] - """Type of query generator, always 'default'""" - - -class LlmragQueryGeneratorConfig(BaseModel): - model: str - """Name of the language model to use for query generation""" - - template: str - """Template string for formatting the query generation prompt""" - - type: Literal["llm"] - """Type of query generator, always 'llm'""" - - -QueryGeneratorConfig: TypeAlias = Annotated[ - Union[DefaultRagQueryGeneratorConfig, LlmragQueryGeneratorConfig], PropertyInfo(discriminator="type") -] diff --git a/src/llama_stack_client/types/shared/tool_param_definition.py b/src/llama_stack_client/types/shared/tool_param_definition.py index 1466c1f9..316f1e01 100644 --- a/src/llama_stack_client/types/shared/tool_param_definition.py +++ b/src/llama_stack_client/types/shared/tool_param_definition.py @@ -14,4 +14,8 @@ class ToolParamDefinition(BaseModel): description: Optional[str] = None + items: Union[bool, float, str, List[object], object, None] = None + required: Optional[bool] = None + + title: Optional[str] = None diff --git a/src/llama_stack_client/types/shared_params/__init__.py b/src/llama_stack_client/types/shared_params/__init__.py index 3a0842e8..894d8a8d 100644 --- a/src/llama_stack_client/types/shared_params/__init__.py +++ b/src/llama_stack_client/types/shared_params/__init__.py @@ -11,7 +11,5 @@ from .sampling_params import SamplingParams as SamplingParams from .completion_message import CompletionMessage as CompletionMessage from .interleaved_content import InterleavedContent as InterleavedContent -from .tool_param_definition import ToolParamDefinition as ToolParamDefinition from .tool_response_message import ToolResponseMessage as ToolResponseMessage -from .query_generator_config import QueryGeneratorConfig as QueryGeneratorConfig from .interleaved_content_item import InterleavedContentItem as InterleavedContentItem diff --git a/src/llama_stack_client/types/shared_params/query_config.py b/src/llama_stack_client/types/shared_params/query_config.py index d008c48c..91a5b596 100644 --- a/src/llama_stack_client/types/shared_params/query_config.py +++ b/src/llama_stack_client/types/shared_params/query_config.py @@ -5,9 +5,39 @@ from typing import Union from typing_extensions import Literal, Required, TypeAlias, TypedDict -from .query_generator_config import QueryGeneratorConfig +__all__ = [ + "QueryConfig", + "QueryGeneratorConfig", + "QueryGeneratorConfigDefaultRagQueryGeneratorConfig", + "QueryGeneratorConfigLlmragQueryGeneratorConfig", + "Ranker", + "RankerRrfRanker", + "RankerWeightedRanker", +] -__all__ = ["QueryConfig", "Ranker", "RankerRrfRanker", "RankerWeightedRanker"] + +class QueryGeneratorConfigDefaultRagQueryGeneratorConfig(TypedDict, total=False): + separator: Required[str] + """String separator used to join query terms""" + + type: Required[Literal["default"]] + """Type of query generator, always 'default'""" + + +class QueryGeneratorConfigLlmragQueryGeneratorConfig(TypedDict, total=False): + model: Required[str] + """Name of the language model to use for query generation""" + + template: Required[str] + """Template string for formatting the query generation prompt""" + + type: 
Required[Literal["llm"]] + """Type of query generator, always 'llm'""" + + +QueryGeneratorConfig: TypeAlias = Union[ + QueryGeneratorConfigDefaultRagQueryGeneratorConfig, QueryGeneratorConfigLlmragQueryGeneratorConfig +] class RankerRrfRanker(TypedDict, total=False): diff --git a/src/llama_stack_client/types/shared_params/query_generator_config.py b/src/llama_stack_client/types/shared_params/query_generator_config.py deleted file mode 100644 index 8c589bf9..00000000 --- a/src/llama_stack_client/types/shared_params/query_generator_config.py +++ /dev/null @@ -1,30 +0,0 @@ -# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. - -from __future__ import annotations - -from typing import Union -from typing_extensions import Literal, Required, TypeAlias, TypedDict - -__all__ = ["QueryGeneratorConfig", "DefaultRagQueryGeneratorConfig", "LlmragQueryGeneratorConfig"] - - -class DefaultRagQueryGeneratorConfig(TypedDict, total=False): - separator: Required[str] - """String separator used to join query terms""" - - type: Required[Literal["default"]] - """Type of query generator, always 'default'""" - - -class LlmragQueryGeneratorConfig(TypedDict, total=False): - model: Required[str] - """Name of the language model to use for query generation""" - - template: Required[str] - """Template string for formatting the query generation prompt""" - - type: Required[Literal["llm"]] - """Type of query generator, always 'llm'""" - - -QueryGeneratorConfig: TypeAlias = Union[DefaultRagQueryGeneratorConfig, LlmragQueryGeneratorConfig] diff --git a/src/llama_stack_client/types/shared_params/tool_param_definition.py b/src/llama_stack_client/types/shared_params/tool_param_definition.py deleted file mode 100644 index 2d7805fe..00000000 --- a/src/llama_stack_client/types/shared_params/tool_param_definition.py +++ /dev/null @@ -1,18 +0,0 @@ -# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. - -from __future__ import annotations - -from typing import Union, Iterable -from typing_extensions import Required, TypedDict - -__all__ = ["ToolParamDefinition"] - - -class ToolParamDefinition(TypedDict, total=False): - param_type: Required[str] - - default: Union[bool, float, str, Iterable[object], object, None] - - description: str - - required: bool diff --git a/src/llama_stack_client/types/token_log_probs.py b/src/llama_stack_client/types/token_log_probs.py deleted file mode 100644 index b1a0a2b4..00000000 --- a/src/llama_stack_client/types/token_log_probs.py +++ /dev/null @@ -1,12 +0,0 @@ -# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. 
- -from typing import Dict - -from .._models import BaseModel - -__all__ = ["TokenLogProbs"] - - -class TokenLogProbs(BaseModel): - logprobs_by_token: Dict[str, float] - """Dictionary mapping tokens to their log probabilities""" diff --git a/src/llama_stack_client/types/tool.py b/src/llama_stack_client/types/tool.py index c6994268..a7243b64 100644 --- a/src/llama_stack_client/types/tool.py +++ b/src/llama_stack_client/types/tool.py @@ -24,6 +24,12 @@ class Parameter(BaseModel): default: Union[bool, float, str, List[object], object, None] = None """(Optional) Default value for the parameter if not provided""" + items: Optional[object] = None + """Type of the elements when parameter_type is array""" + + title: Optional[str] = None + """(Optional) Title of the parameter""" + class Tool(BaseModel): description: str diff --git a/src/llama_stack_client/types/tool_def.py b/src/llama_stack_client/types/tool_def.py index c82a9b8a..21949b41 100644 --- a/src/llama_stack_client/types/tool_def.py +++ b/src/llama_stack_client/types/tool_def.py @@ -23,6 +23,12 @@ class Parameter(BaseModel): default: Union[bool, float, str, List[object], object, None] = None """(Optional) Default value for the parameter if not provided""" + items: Optional[object] = None + """Type of the elements when parameter_type is array""" + + title: Optional[str] = None + """(Optional) Title of the parameter""" + class ToolDef(BaseModel): name: str diff --git a/src/llama_stack_client/types/tool_def_param.py b/src/llama_stack_client/types/tool_def_param.py index 93ad8285..a50437b2 100644 --- a/src/llama_stack_client/types/tool_def_param.py +++ b/src/llama_stack_client/types/tool_def_param.py @@ -24,6 +24,12 @@ class Parameter(TypedDict, total=False): default: Union[bool, float, str, Iterable[object], object, None] """(Optional) Default value for the parameter if not provided""" + items: object + """Type of the elements when parameter_type is array""" + + title: str + """(Optional) Title of the parameter""" + class ToolDefParam(TypedDict, total=False): name: Required[str] diff --git a/tests/api_resources/models/test_openai.py b/tests/api_resources/models/test_openai.py index ea64cce2..f94d2bf6 100644 --- a/tests/api_resources/models/test_openai.py +++ b/tests/api_resources/models/test_openai.py @@ -9,7 +9,7 @@ from tests.utils import assert_matches_type from llama_stack_client import LlamaStackClient, AsyncLlamaStackClient -from llama_stack_client.types.models import OpenAIListResponse +from llama_stack_client.types import ModelListResponse base_url = os.environ.get("TEST_API_BASE_URL", "http://127.0.0.1:4010") @@ -20,7 +20,7 @@ class TestOpenAI: @parametrize def test_method_list(self, client: LlamaStackClient) -> None: openai = client.models.openai.list() - assert_matches_type(OpenAIListResponse, openai, path=["response"]) + assert_matches_type(ModelListResponse, openai, path=["response"]) @parametrize def test_raw_response_list(self, client: LlamaStackClient) -> None: @@ -29,7 +29,7 @@ def test_raw_response_list(self, client: LlamaStackClient) -> None: assert response.is_closed is True assert response.http_request.headers.get("X-Stainless-Lang") == "python" openai = response.parse() - assert_matches_type(OpenAIListResponse, openai, path=["response"]) + assert_matches_type(ModelListResponse, openai, path=["response"]) @parametrize def test_streaming_response_list(self, client: LlamaStackClient) -> None: @@ -38,7 +38,7 @@ def test_streaming_response_list(self, client: LlamaStackClient) -> None: assert 
response.http_request.headers.get("X-Stainless-Lang") == "python" openai = response.parse() - assert_matches_type(OpenAIListResponse, openai, path=["response"]) + assert_matches_type(ModelListResponse, openai, path=["response"]) assert cast(Any, response.is_closed) is True @@ -51,7 +51,7 @@ class TestAsyncOpenAI: @parametrize async def test_method_list(self, async_client: AsyncLlamaStackClient) -> None: openai = await async_client.models.openai.list() - assert_matches_type(OpenAIListResponse, openai, path=["response"]) + assert_matches_type(ModelListResponse, openai, path=["response"]) @parametrize async def test_raw_response_list(self, async_client: AsyncLlamaStackClient) -> None: @@ -60,7 +60,7 @@ async def test_raw_response_list(self, async_client: AsyncLlamaStackClient) -> N assert response.is_closed is True assert response.http_request.headers.get("X-Stainless-Lang") == "python" openai = await response.parse() - assert_matches_type(OpenAIListResponse, openai, path=["response"]) + assert_matches_type(ModelListResponse, openai, path=["response"]) @parametrize async def test_streaming_response_list(self, async_client: AsyncLlamaStackClient) -> None: @@ -69,6 +69,6 @@ async def test_streaming_response_list(self, async_client: AsyncLlamaStackClient assert response.http_request.headers.get("X-Stainless-Lang") == "python" openai = await response.parse() - assert_matches_type(OpenAIListResponse, openai, path=["response"]) + assert_matches_type(ModelListResponse, openai, path=["response"]) assert cast(Any, response.is_closed) is True diff --git a/tests/api_resources/test_agents.py b/tests/api_resources/test_agents.py index 18b34012..c19bc9bf 100644 --- a/tests/api_resources/test_agents.py +++ b/tests/api_resources/test_agents.py @@ -49,6 +49,8 @@ def test_method_create_with_all_params(self, client: LlamaStackClient) -> None: "parameter_type": "parameter_type", "required": True, "default": True, + "items": {}, + "title": "title", } ], } @@ -253,6 +255,8 @@ async def test_method_create_with_all_params(self, async_client: AsyncLlamaStack "parameter_type": "parameter_type", "required": True, "default": True, + "items": {}, + "title": "title", } ], } diff --git a/tests/api_resources/test_files.py b/tests/api_resources/test_files.py index d9b29ffc..83b763ab 100644 --- a/tests/api_resources/test_files.py +++ b/tests/api_resources/test_files.py @@ -26,6 +26,18 @@ def test_method_create(self, client: LlamaStackClient) -> None: ) assert_matches_type(File, file, path=["response"]) + @parametrize + def test_method_create_with_all_params(self, client: LlamaStackClient) -> None: + file = client.files.create( + file=b"raw file contents", + purpose="assistants", + expires_after={ + "anchor": "created_at", + "seconds": 0, + }, + ) + assert_matches_type(File, file, path=["response"]) + @parametrize def test_raw_response_create(self, client: LlamaStackClient) -> None: response = client.files.with_raw_response.create( @@ -215,6 +227,18 @@ async def test_method_create(self, async_client: AsyncLlamaStackClient) -> None: ) assert_matches_type(File, file, path=["response"]) + @parametrize + async def test_method_create_with_all_params(self, async_client: AsyncLlamaStackClient) -> None: + file = await async_client.files.create( + file=b"raw file contents", + purpose="assistants", + expires_after={ + "anchor": "created_at", + "seconds": 0, + }, + ) + assert_matches_type(File, file, path=["response"]) + @parametrize async def test_raw_response_create(self, async_client: AsyncLlamaStackClient) -> None: response = await 
async_client.files.with_raw_response.create( diff --git a/tests/api_resources/test_inference.py b/tests/api_resources/test_inference.py index 474ff7cf..f26802c2 100644 --- a/tests/api_resources/test_inference.py +++ b/tests/api_resources/test_inference.py @@ -9,15 +9,7 @@ from tests.utils import assert_matches_type from llama_stack_client import LlamaStackClient, AsyncLlamaStackClient -from llama_stack_client.types import ( - CompletionResponse, - EmbeddingsResponse, - InferenceRerankResponse, - InferenceBatchChatCompletionResponse, -) -from llama_stack_client.types.shared import BatchCompletion, ChatCompletionResponse - -# pyright: reportDeprecated=false +from llama_stack_client.types import InferenceRerankResponse base_url = os.environ.get("TEST_API_BASE_URL", "http://127.0.0.1:4010") @@ -25,539 +17,6 @@ class TestInference: parametrize = pytest.mark.parametrize("client", [False, True], indirect=True, ids=["loose", "strict"]) - @parametrize - def test_method_batch_chat_completion(self, client: LlamaStackClient) -> None: - inference = client.inference.batch_chat_completion( - messages_batch=[ - [ - { - "content": "string", - "role": "user", - } - ] - ], - model_id="model_id", - ) - assert_matches_type(InferenceBatchChatCompletionResponse, inference, path=["response"]) - - @parametrize - def test_method_batch_chat_completion_with_all_params(self, client: LlamaStackClient) -> None: - inference = client.inference.batch_chat_completion( - messages_batch=[ - [ - { - "content": "string", - "role": "user", - "context": "string", - } - ] - ], - model_id="model_id", - logprobs={"top_k": 0}, - response_format={ - "json_schema": {"foo": True}, - "type": "json_schema", - }, - sampling_params={ - "strategy": {"type": "greedy"}, - "max_tokens": 0, - "repetition_penalty": 0, - "stop": ["string"], - }, - tool_config={ - "system_message_behavior": "append", - "tool_choice": "auto", - "tool_prompt_format": "json", - }, - tools=[ - { - "tool_name": "brave_search", - "description": "description", - "parameters": { - "foo": { - "param_type": "param_type", - "default": True, - "description": "description", - "required": True, - } - }, - } - ], - ) - assert_matches_type(InferenceBatchChatCompletionResponse, inference, path=["response"]) - - @parametrize - def test_raw_response_batch_chat_completion(self, client: LlamaStackClient) -> None: - response = client.inference.with_raw_response.batch_chat_completion( - messages_batch=[ - [ - { - "content": "string", - "role": "user", - } - ] - ], - model_id="model_id", - ) - - assert response.is_closed is True - assert response.http_request.headers.get("X-Stainless-Lang") == "python" - inference = response.parse() - assert_matches_type(InferenceBatchChatCompletionResponse, inference, path=["response"]) - - @parametrize - def test_streaming_response_batch_chat_completion(self, client: LlamaStackClient) -> None: - with client.inference.with_streaming_response.batch_chat_completion( - messages_batch=[ - [ - { - "content": "string", - "role": "user", - } - ] - ], - model_id="model_id", - ) as response: - assert not response.is_closed - assert response.http_request.headers.get("X-Stainless-Lang") == "python" - - inference = response.parse() - assert_matches_type(InferenceBatchChatCompletionResponse, inference, path=["response"]) - - assert cast(Any, response.is_closed) is True - - @parametrize - def test_method_batch_completion(self, client: LlamaStackClient) -> None: - inference = client.inference.batch_completion( - content_batch=["string"], - model_id="model_id", - ) - 
assert_matches_type(BatchCompletion, inference, path=["response"]) - - @parametrize - def test_method_batch_completion_with_all_params(self, client: LlamaStackClient) -> None: - inference = client.inference.batch_completion( - content_batch=["string"], - model_id="model_id", - logprobs={"top_k": 0}, - response_format={ - "json_schema": {"foo": True}, - "type": "json_schema", - }, - sampling_params={ - "strategy": {"type": "greedy"}, - "max_tokens": 0, - "repetition_penalty": 0, - "stop": ["string"], - }, - ) - assert_matches_type(BatchCompletion, inference, path=["response"]) - - @parametrize - def test_raw_response_batch_completion(self, client: LlamaStackClient) -> None: - response = client.inference.with_raw_response.batch_completion( - content_batch=["string"], - model_id="model_id", - ) - - assert response.is_closed is True - assert response.http_request.headers.get("X-Stainless-Lang") == "python" - inference = response.parse() - assert_matches_type(BatchCompletion, inference, path=["response"]) - - @parametrize - def test_streaming_response_batch_completion(self, client: LlamaStackClient) -> None: - with client.inference.with_streaming_response.batch_completion( - content_batch=["string"], - model_id="model_id", - ) as response: - assert not response.is_closed - assert response.http_request.headers.get("X-Stainless-Lang") == "python" - - inference = response.parse() - assert_matches_type(BatchCompletion, inference, path=["response"]) - - assert cast(Any, response.is_closed) is True - - @parametrize - def test_method_chat_completion_overload_1(self, client: LlamaStackClient) -> None: - with pytest.warns(DeprecationWarning): - inference = client.inference.chat_completion( - messages=[ - { - "content": "string", - "role": "user", - } - ], - model_id="model_id", - ) - - assert_matches_type(ChatCompletionResponse, inference, path=["response"]) - - @parametrize - def test_method_chat_completion_with_all_params_overload_1(self, client: LlamaStackClient) -> None: - with pytest.warns(DeprecationWarning): - inference = client.inference.chat_completion( - messages=[ - { - "content": "string", - "role": "user", - "context": "string", - } - ], - model_id="model_id", - logprobs={"top_k": 0}, - response_format={ - "json_schema": {"foo": True}, - "type": "json_schema", - }, - sampling_params={ - "strategy": {"type": "greedy"}, - "max_tokens": 0, - "repetition_penalty": 0, - "stop": ["string"], - }, - stream=False, - tool_choice="auto", - tool_config={ - "system_message_behavior": "append", - "tool_choice": "auto", - "tool_prompt_format": "json", - }, - tool_prompt_format="json", - tools=[ - { - "tool_name": "brave_search", - "description": "description", - "parameters": { - "foo": { - "param_type": "param_type", - "default": True, - "description": "description", - "required": True, - } - }, - } - ], - ) - - assert_matches_type(ChatCompletionResponse, inference, path=["response"]) - - @parametrize - def test_raw_response_chat_completion_overload_1(self, client: LlamaStackClient) -> None: - with pytest.warns(DeprecationWarning): - response = client.inference.with_raw_response.chat_completion( - messages=[ - { - "content": "string", - "role": "user", - } - ], - model_id="model_id", - ) - - assert response.is_closed is True - assert response.http_request.headers.get("X-Stainless-Lang") == "python" - inference = response.parse() - assert_matches_type(ChatCompletionResponse, inference, path=["response"]) - - @parametrize - def test_streaming_response_chat_completion_overload_1(self, client: 
LlamaStackClient) -> None: - with pytest.warns(DeprecationWarning): - with client.inference.with_streaming_response.chat_completion( - messages=[ - { - "content": "string", - "role": "user", - } - ], - model_id="model_id", - ) as response: - assert not response.is_closed - assert response.http_request.headers.get("X-Stainless-Lang") == "python" - - inference = response.parse() - assert_matches_type(ChatCompletionResponse, inference, path=["response"]) - - assert cast(Any, response.is_closed) is True - - @parametrize - def test_method_chat_completion_overload_2(self, client: LlamaStackClient) -> None: - with pytest.warns(DeprecationWarning): - inference_stream = client.inference.chat_completion( - messages=[ - { - "content": "string", - "role": "user", - } - ], - model_id="model_id", - stream=True, - ) - - inference_stream.response.close() - - @parametrize - def test_method_chat_completion_with_all_params_overload_2(self, client: LlamaStackClient) -> None: - with pytest.warns(DeprecationWarning): - inference_stream = client.inference.chat_completion( - messages=[ - { - "content": "string", - "role": "user", - "context": "string", - } - ], - model_id="model_id", - stream=True, - logprobs={"top_k": 0}, - response_format={ - "json_schema": {"foo": True}, - "type": "json_schema", - }, - sampling_params={ - "strategy": {"type": "greedy"}, - "max_tokens": 0, - "repetition_penalty": 0, - "stop": ["string"], - }, - tool_choice="auto", - tool_config={ - "system_message_behavior": "append", - "tool_choice": "auto", - "tool_prompt_format": "json", - }, - tool_prompt_format="json", - tools=[ - { - "tool_name": "brave_search", - "description": "description", - "parameters": { - "foo": { - "param_type": "param_type", - "default": True, - "description": "description", - "required": True, - } - }, - } - ], - ) - - inference_stream.response.close() - - @parametrize - def test_raw_response_chat_completion_overload_2(self, client: LlamaStackClient) -> None: - with pytest.warns(DeprecationWarning): - response = client.inference.with_raw_response.chat_completion( - messages=[ - { - "content": "string", - "role": "user", - } - ], - model_id="model_id", - stream=True, - ) - - assert response.http_request.headers.get("X-Stainless-Lang") == "python" - stream = response.parse() - stream.close() - - @parametrize - def test_streaming_response_chat_completion_overload_2(self, client: LlamaStackClient) -> None: - with pytest.warns(DeprecationWarning): - with client.inference.with_streaming_response.chat_completion( - messages=[ - { - "content": "string", - "role": "user", - } - ], - model_id="model_id", - stream=True, - ) as response: - assert not response.is_closed - assert response.http_request.headers.get("X-Stainless-Lang") == "python" - - stream = response.parse() - stream.close() - - assert cast(Any, response.is_closed) is True - - @parametrize - def test_method_completion_overload_1(self, client: LlamaStackClient) -> None: - with pytest.warns(DeprecationWarning): - inference = client.inference.completion( - content="string", - model_id="model_id", - ) - - assert_matches_type(CompletionResponse, inference, path=["response"]) - - @parametrize - def test_method_completion_with_all_params_overload_1(self, client: LlamaStackClient) -> None: - with pytest.warns(DeprecationWarning): - inference = client.inference.completion( - content="string", - model_id="model_id", - logprobs={"top_k": 0}, - response_format={ - "json_schema": {"foo": True}, - "type": "json_schema", - }, - sampling_params={ - "strategy": {"type": 
"greedy"}, - "max_tokens": 0, - "repetition_penalty": 0, - "stop": ["string"], - }, - stream=False, - ) - - assert_matches_type(CompletionResponse, inference, path=["response"]) - - @parametrize - def test_raw_response_completion_overload_1(self, client: LlamaStackClient) -> None: - with pytest.warns(DeprecationWarning): - response = client.inference.with_raw_response.completion( - content="string", - model_id="model_id", - ) - - assert response.is_closed is True - assert response.http_request.headers.get("X-Stainless-Lang") == "python" - inference = response.parse() - assert_matches_type(CompletionResponse, inference, path=["response"]) - - @parametrize - def test_streaming_response_completion_overload_1(self, client: LlamaStackClient) -> None: - with pytest.warns(DeprecationWarning): - with client.inference.with_streaming_response.completion( - content="string", - model_id="model_id", - ) as response: - assert not response.is_closed - assert response.http_request.headers.get("X-Stainless-Lang") == "python" - - inference = response.parse() - assert_matches_type(CompletionResponse, inference, path=["response"]) - - assert cast(Any, response.is_closed) is True - - @parametrize - def test_method_completion_overload_2(self, client: LlamaStackClient) -> None: - with pytest.warns(DeprecationWarning): - inference_stream = client.inference.completion( - content="string", - model_id="model_id", - stream=True, - ) - - inference_stream.response.close() - - @parametrize - def test_method_completion_with_all_params_overload_2(self, client: LlamaStackClient) -> None: - with pytest.warns(DeprecationWarning): - inference_stream = client.inference.completion( - content="string", - model_id="model_id", - stream=True, - logprobs={"top_k": 0}, - response_format={ - "json_schema": {"foo": True}, - "type": "json_schema", - }, - sampling_params={ - "strategy": {"type": "greedy"}, - "max_tokens": 0, - "repetition_penalty": 0, - "stop": ["string"], - }, - ) - - inference_stream.response.close() - - @parametrize - def test_raw_response_completion_overload_2(self, client: LlamaStackClient) -> None: - with pytest.warns(DeprecationWarning): - response = client.inference.with_raw_response.completion( - content="string", - model_id="model_id", - stream=True, - ) - - assert response.http_request.headers.get("X-Stainless-Lang") == "python" - stream = response.parse() - stream.close() - - @parametrize - def test_streaming_response_completion_overload_2(self, client: LlamaStackClient) -> None: - with pytest.warns(DeprecationWarning): - with client.inference.with_streaming_response.completion( - content="string", - model_id="model_id", - stream=True, - ) as response: - assert not response.is_closed - assert response.http_request.headers.get("X-Stainless-Lang") == "python" - - stream = response.parse() - stream.close() - - assert cast(Any, response.is_closed) is True - - @parametrize - def test_method_embeddings(self, client: LlamaStackClient) -> None: - with pytest.warns(DeprecationWarning): - inference = client.inference.embeddings( - contents=["string"], - model_id="model_id", - ) - - assert_matches_type(EmbeddingsResponse, inference, path=["response"]) - - @parametrize - def test_method_embeddings_with_all_params(self, client: LlamaStackClient) -> None: - with pytest.warns(DeprecationWarning): - inference = client.inference.embeddings( - contents=["string"], - model_id="model_id", - output_dimension=0, - task_type="query", - text_truncation="none", - ) - - assert_matches_type(EmbeddingsResponse, inference, 
path=["response"]) - - @parametrize - def test_raw_response_embeddings(self, client: LlamaStackClient) -> None: - with pytest.warns(DeprecationWarning): - response = client.inference.with_raw_response.embeddings( - contents=["string"], - model_id="model_id", - ) - - assert response.is_closed is True - assert response.http_request.headers.get("X-Stainless-Lang") == "python" - inference = response.parse() - assert_matches_type(EmbeddingsResponse, inference, path=["response"]) - - @parametrize - def test_streaming_response_embeddings(self, client: LlamaStackClient) -> None: - with pytest.warns(DeprecationWarning): - with client.inference.with_streaming_response.embeddings( - contents=["string"], - model_id="model_id", - ) as response: - assert not response.is_closed - assert response.http_request.headers.get("X-Stainless-Lang") == "python" - - inference = response.parse() - assert_matches_type(EmbeddingsResponse, inference, path=["response"]) - - assert cast(Any, response.is_closed) is True - @parametrize def test_method_rerank(self, client: LlamaStackClient) -> None: inference = client.inference.rerank( @@ -611,539 +70,6 @@ class TestAsyncInference: "async_client", [False, True, {"http_client": "aiohttp"}], indirect=True, ids=["loose", "strict", "aiohttp"] ) - @parametrize - async def test_method_batch_chat_completion(self, async_client: AsyncLlamaStackClient) -> None: - inference = await async_client.inference.batch_chat_completion( - messages_batch=[ - [ - { - "content": "string", - "role": "user", - } - ] - ], - model_id="model_id", - ) - assert_matches_type(InferenceBatchChatCompletionResponse, inference, path=["response"]) - - @parametrize - async def test_method_batch_chat_completion_with_all_params(self, async_client: AsyncLlamaStackClient) -> None: - inference = await async_client.inference.batch_chat_completion( - messages_batch=[ - [ - { - "content": "string", - "role": "user", - "context": "string", - } - ] - ], - model_id="model_id", - logprobs={"top_k": 0}, - response_format={ - "json_schema": {"foo": True}, - "type": "json_schema", - }, - sampling_params={ - "strategy": {"type": "greedy"}, - "max_tokens": 0, - "repetition_penalty": 0, - "stop": ["string"], - }, - tool_config={ - "system_message_behavior": "append", - "tool_choice": "auto", - "tool_prompt_format": "json", - }, - tools=[ - { - "tool_name": "brave_search", - "description": "description", - "parameters": { - "foo": { - "param_type": "param_type", - "default": True, - "description": "description", - "required": True, - } - }, - } - ], - ) - assert_matches_type(InferenceBatchChatCompletionResponse, inference, path=["response"]) - - @parametrize - async def test_raw_response_batch_chat_completion(self, async_client: AsyncLlamaStackClient) -> None: - response = await async_client.inference.with_raw_response.batch_chat_completion( - messages_batch=[ - [ - { - "content": "string", - "role": "user", - } - ] - ], - model_id="model_id", - ) - - assert response.is_closed is True - assert response.http_request.headers.get("X-Stainless-Lang") == "python" - inference = await response.parse() - assert_matches_type(InferenceBatchChatCompletionResponse, inference, path=["response"]) - - @parametrize - async def test_streaming_response_batch_chat_completion(self, async_client: AsyncLlamaStackClient) -> None: - async with async_client.inference.with_streaming_response.batch_chat_completion( - messages_batch=[ - [ - { - "content": "string", - "role": "user", - } - ] - ], - model_id="model_id", - ) as response: - assert not 
response.is_closed - assert response.http_request.headers.get("X-Stainless-Lang") == "python" - - inference = await response.parse() - assert_matches_type(InferenceBatchChatCompletionResponse, inference, path=["response"]) - - assert cast(Any, response.is_closed) is True - - @parametrize - async def test_method_batch_completion(self, async_client: AsyncLlamaStackClient) -> None: - inference = await async_client.inference.batch_completion( - content_batch=["string"], - model_id="model_id", - ) - assert_matches_type(BatchCompletion, inference, path=["response"]) - - @parametrize - async def test_method_batch_completion_with_all_params(self, async_client: AsyncLlamaStackClient) -> None: - inference = await async_client.inference.batch_completion( - content_batch=["string"], - model_id="model_id", - logprobs={"top_k": 0}, - response_format={ - "json_schema": {"foo": True}, - "type": "json_schema", - }, - sampling_params={ - "strategy": {"type": "greedy"}, - "max_tokens": 0, - "repetition_penalty": 0, - "stop": ["string"], - }, - ) - assert_matches_type(BatchCompletion, inference, path=["response"]) - - @parametrize - async def test_raw_response_batch_completion(self, async_client: AsyncLlamaStackClient) -> None: - response = await async_client.inference.with_raw_response.batch_completion( - content_batch=["string"], - model_id="model_id", - ) - - assert response.is_closed is True - assert response.http_request.headers.get("X-Stainless-Lang") == "python" - inference = await response.parse() - assert_matches_type(BatchCompletion, inference, path=["response"]) - - @parametrize - async def test_streaming_response_batch_completion(self, async_client: AsyncLlamaStackClient) -> None: - async with async_client.inference.with_streaming_response.batch_completion( - content_batch=["string"], - model_id="model_id", - ) as response: - assert not response.is_closed - assert response.http_request.headers.get("X-Stainless-Lang") == "python" - - inference = await response.parse() - assert_matches_type(BatchCompletion, inference, path=["response"]) - - assert cast(Any, response.is_closed) is True - - @parametrize - async def test_method_chat_completion_overload_1(self, async_client: AsyncLlamaStackClient) -> None: - with pytest.warns(DeprecationWarning): - inference = await async_client.inference.chat_completion( - messages=[ - { - "content": "string", - "role": "user", - } - ], - model_id="model_id", - ) - - assert_matches_type(ChatCompletionResponse, inference, path=["response"]) - - @parametrize - async def test_method_chat_completion_with_all_params_overload_1(self, async_client: AsyncLlamaStackClient) -> None: - with pytest.warns(DeprecationWarning): - inference = await async_client.inference.chat_completion( - messages=[ - { - "content": "string", - "role": "user", - "context": "string", - } - ], - model_id="model_id", - logprobs={"top_k": 0}, - response_format={ - "json_schema": {"foo": True}, - "type": "json_schema", - }, - sampling_params={ - "strategy": {"type": "greedy"}, - "max_tokens": 0, - "repetition_penalty": 0, - "stop": ["string"], - }, - stream=False, - tool_choice="auto", - tool_config={ - "system_message_behavior": "append", - "tool_choice": "auto", - "tool_prompt_format": "json", - }, - tool_prompt_format="json", - tools=[ - { - "tool_name": "brave_search", - "description": "description", - "parameters": { - "foo": { - "param_type": "param_type", - "default": True, - "description": "description", - "required": True, - } - }, - } - ], - ) - - assert_matches_type(ChatCompletionResponse, 
inference, path=["response"]) - - @parametrize - async def test_raw_response_chat_completion_overload_1(self, async_client: AsyncLlamaStackClient) -> None: - with pytest.warns(DeprecationWarning): - response = await async_client.inference.with_raw_response.chat_completion( - messages=[ - { - "content": "string", - "role": "user", - } - ], - model_id="model_id", - ) - - assert response.is_closed is True - assert response.http_request.headers.get("X-Stainless-Lang") == "python" - inference = await response.parse() - assert_matches_type(ChatCompletionResponse, inference, path=["response"]) - - @parametrize - async def test_streaming_response_chat_completion_overload_1(self, async_client: AsyncLlamaStackClient) -> None: - with pytest.warns(DeprecationWarning): - async with async_client.inference.with_streaming_response.chat_completion( - messages=[ - { - "content": "string", - "role": "user", - } - ], - model_id="model_id", - ) as response: - assert not response.is_closed - assert response.http_request.headers.get("X-Stainless-Lang") == "python" - - inference = await response.parse() - assert_matches_type(ChatCompletionResponse, inference, path=["response"]) - - assert cast(Any, response.is_closed) is True - - @parametrize - async def test_method_chat_completion_overload_2(self, async_client: AsyncLlamaStackClient) -> None: - with pytest.warns(DeprecationWarning): - inference_stream = await async_client.inference.chat_completion( - messages=[ - { - "content": "string", - "role": "user", - } - ], - model_id="model_id", - stream=True, - ) - - await inference_stream.response.aclose() - - @parametrize - async def test_method_chat_completion_with_all_params_overload_2(self, async_client: AsyncLlamaStackClient) -> None: - with pytest.warns(DeprecationWarning): - inference_stream = await async_client.inference.chat_completion( - messages=[ - { - "content": "string", - "role": "user", - "context": "string", - } - ], - model_id="model_id", - stream=True, - logprobs={"top_k": 0}, - response_format={ - "json_schema": {"foo": True}, - "type": "json_schema", - }, - sampling_params={ - "strategy": {"type": "greedy"}, - "max_tokens": 0, - "repetition_penalty": 0, - "stop": ["string"], - }, - tool_choice="auto", - tool_config={ - "system_message_behavior": "append", - "tool_choice": "auto", - "tool_prompt_format": "json", - }, - tool_prompt_format="json", - tools=[ - { - "tool_name": "brave_search", - "description": "description", - "parameters": { - "foo": { - "param_type": "param_type", - "default": True, - "description": "description", - "required": True, - } - }, - } - ], - ) - - await inference_stream.response.aclose() - - @parametrize - async def test_raw_response_chat_completion_overload_2(self, async_client: AsyncLlamaStackClient) -> None: - with pytest.warns(DeprecationWarning): - response = await async_client.inference.with_raw_response.chat_completion( - messages=[ - { - "content": "string", - "role": "user", - } - ], - model_id="model_id", - stream=True, - ) - - assert response.http_request.headers.get("X-Stainless-Lang") == "python" - stream = await response.parse() - await stream.close() - - @parametrize - async def test_streaming_response_chat_completion_overload_2(self, async_client: AsyncLlamaStackClient) -> None: - with pytest.warns(DeprecationWarning): - async with async_client.inference.with_streaming_response.chat_completion( - messages=[ - { - "content": "string", - "role": "user", - } - ], - model_id="model_id", - stream=True, - ) as response: - assert not response.is_closed - assert 
response.http_request.headers.get("X-Stainless-Lang") == "python" - - stream = await response.parse() - await stream.close() - - assert cast(Any, response.is_closed) is True - - @parametrize - async def test_method_completion_overload_1(self, async_client: AsyncLlamaStackClient) -> None: - with pytest.warns(DeprecationWarning): - inference = await async_client.inference.completion( - content="string", - model_id="model_id", - ) - - assert_matches_type(CompletionResponse, inference, path=["response"]) - - @parametrize - async def test_method_completion_with_all_params_overload_1(self, async_client: AsyncLlamaStackClient) -> None: - with pytest.warns(DeprecationWarning): - inference = await async_client.inference.completion( - content="string", - model_id="model_id", - logprobs={"top_k": 0}, - response_format={ - "json_schema": {"foo": True}, - "type": "json_schema", - }, - sampling_params={ - "strategy": {"type": "greedy"}, - "max_tokens": 0, - "repetition_penalty": 0, - "stop": ["string"], - }, - stream=False, - ) - - assert_matches_type(CompletionResponse, inference, path=["response"]) - - @parametrize - async def test_raw_response_completion_overload_1(self, async_client: AsyncLlamaStackClient) -> None: - with pytest.warns(DeprecationWarning): - response = await async_client.inference.with_raw_response.completion( - content="string", - model_id="model_id", - ) - - assert response.is_closed is True - assert response.http_request.headers.get("X-Stainless-Lang") == "python" - inference = await response.parse() - assert_matches_type(CompletionResponse, inference, path=["response"]) - - @parametrize - async def test_streaming_response_completion_overload_1(self, async_client: AsyncLlamaStackClient) -> None: - with pytest.warns(DeprecationWarning): - async with async_client.inference.with_streaming_response.completion( - content="string", - model_id="model_id", - ) as response: - assert not response.is_closed - assert response.http_request.headers.get("X-Stainless-Lang") == "python" - - inference = await response.parse() - assert_matches_type(CompletionResponse, inference, path=["response"]) - - assert cast(Any, response.is_closed) is True - - @parametrize - async def test_method_completion_overload_2(self, async_client: AsyncLlamaStackClient) -> None: - with pytest.warns(DeprecationWarning): - inference_stream = await async_client.inference.completion( - content="string", - model_id="model_id", - stream=True, - ) - - await inference_stream.response.aclose() - - @parametrize - async def test_method_completion_with_all_params_overload_2(self, async_client: AsyncLlamaStackClient) -> None: - with pytest.warns(DeprecationWarning): - inference_stream = await async_client.inference.completion( - content="string", - model_id="model_id", - stream=True, - logprobs={"top_k": 0}, - response_format={ - "json_schema": {"foo": True}, - "type": "json_schema", - }, - sampling_params={ - "strategy": {"type": "greedy"}, - "max_tokens": 0, - "repetition_penalty": 0, - "stop": ["string"], - }, - ) - - await inference_stream.response.aclose() - - @parametrize - async def test_raw_response_completion_overload_2(self, async_client: AsyncLlamaStackClient) -> None: - with pytest.warns(DeprecationWarning): - response = await async_client.inference.with_raw_response.completion( - content="string", - model_id="model_id", - stream=True, - ) - - assert response.http_request.headers.get("X-Stainless-Lang") == "python" - stream = await response.parse() - await stream.close() - - @parametrize - async def 
test_streaming_response_completion_overload_2(self, async_client: AsyncLlamaStackClient) -> None: - with pytest.warns(DeprecationWarning): - async with async_client.inference.with_streaming_response.completion( - content="string", - model_id="model_id", - stream=True, - ) as response: - assert not response.is_closed - assert response.http_request.headers.get("X-Stainless-Lang") == "python" - - stream = await response.parse() - await stream.close() - - assert cast(Any, response.is_closed) is True - - @parametrize - async def test_method_embeddings(self, async_client: AsyncLlamaStackClient) -> None: - with pytest.warns(DeprecationWarning): - inference = await async_client.inference.embeddings( - contents=["string"], - model_id="model_id", - ) - - assert_matches_type(EmbeddingsResponse, inference, path=["response"]) - - @parametrize - async def test_method_embeddings_with_all_params(self, async_client: AsyncLlamaStackClient) -> None: - with pytest.warns(DeprecationWarning): - inference = await async_client.inference.embeddings( - contents=["string"], - model_id="model_id", - output_dimension=0, - task_type="query", - text_truncation="none", - ) - - assert_matches_type(EmbeddingsResponse, inference, path=["response"]) - - @parametrize - async def test_raw_response_embeddings(self, async_client: AsyncLlamaStackClient) -> None: - with pytest.warns(DeprecationWarning): - response = await async_client.inference.with_raw_response.embeddings( - contents=["string"], - model_id="model_id", - ) - - assert response.is_closed is True - assert response.http_request.headers.get("X-Stainless-Lang") == "python" - inference = await response.parse() - assert_matches_type(EmbeddingsResponse, inference, path=["response"]) - - @parametrize - async def test_streaming_response_embeddings(self, async_client: AsyncLlamaStackClient) -> None: - with pytest.warns(DeprecationWarning): - async with async_client.inference.with_streaming_response.embeddings( - contents=["string"], - model_id="model_id", - ) as response: - assert not response.is_closed - assert response.http_request.headers.get("X-Stainless-Lang") == "python" - - inference = await response.parse() - assert_matches_type(EmbeddingsResponse, inference, path=["response"]) - - assert cast(Any, response.is_closed) is True - @parametrize async def test_method_rerank(self, async_client: AsyncLlamaStackClient) -> None: inference = await async_client.inference.rerank( diff --git a/tests/test_client.py b/tests/test_client.py index a5bce12c..708c7420 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -678,17 +678,17 @@ def test_parse_retry_after_header(self, remaining_retries: int, retry_after: str @mock.patch("llama_stack_client._base_client.BaseClient._calculate_retry_timeout", _low_retry_timeout) @pytest.mark.respx(base_url=base_url) def test_retrying_timeout_errors_doesnt_leak(self, respx_mock: MockRouter, client: LlamaStackClient) -> None: - respx_mock.post("/v1/inference/chat-completion").mock(side_effect=httpx.TimeoutException("Test timeout error")) + respx_mock.post("/v1/chat/completions").mock(side_effect=httpx.TimeoutException("Test timeout error")) with pytest.raises(APITimeoutError): - client.inference.with_streaming_response.chat_completion( + client.chat.completions.with_streaming_response.create( messages=[ { "content": "string", "role": "user", } ], - model_id="model_id", + model="model", ).__enter__() assert _get_open_connections(self.client) == 0 @@ -696,17 +696,17 @@ def test_retrying_timeout_errors_doesnt_leak(self, respx_mock: 
MockRouter, clien @mock.patch("llama_stack_client._base_client.BaseClient._calculate_retry_timeout", _low_retry_timeout) @pytest.mark.respx(base_url=base_url) def test_retrying_status_errors_doesnt_leak(self, respx_mock: MockRouter, client: LlamaStackClient) -> None: - respx_mock.post("/v1/inference/chat-completion").mock(return_value=httpx.Response(500)) + respx_mock.post("/v1/chat/completions").mock(return_value=httpx.Response(500)) with pytest.raises(APIStatusError): - client.inference.with_streaming_response.chat_completion( + client.chat.completions.with_streaming_response.create( messages=[ { "content": "string", "role": "user", } ], - model_id="model_id", + model="model", ).__enter__() assert _get_open_connections(self.client) == 0 @@ -734,16 +734,16 @@ def retry_handler(_request: httpx.Request) -> httpx.Response: return httpx.Response(500) return httpx.Response(200) - respx_mock.post("/v1/inference/chat-completion").mock(side_effect=retry_handler) + respx_mock.post("/v1/chat/completions").mock(side_effect=retry_handler) - response = client.inference.with_raw_response.chat_completion( + response = client.chat.completions.with_raw_response.create( messages=[ { "content": "string", "role": "user", } ], - model_id="model_id", + model="model", ) assert response.retries_taken == failures_before_success @@ -766,16 +766,16 @@ def retry_handler(_request: httpx.Request) -> httpx.Response: return httpx.Response(500) return httpx.Response(200) - respx_mock.post("/v1/inference/chat-completion").mock(side_effect=retry_handler) + respx_mock.post("/v1/chat/completions").mock(side_effect=retry_handler) - response = client.inference.with_raw_response.chat_completion( + response = client.chat.completions.with_raw_response.create( messages=[ { "content": "string", "role": "user", } ], - model_id="model_id", + model="model", extra_headers={"x-stainless-retry-count": Omit()}, ) @@ -798,16 +798,16 @@ def retry_handler(_request: httpx.Request) -> httpx.Response: return httpx.Response(500) return httpx.Response(200) - respx_mock.post("/v1/inference/chat-completion").mock(side_effect=retry_handler) + respx_mock.post("/v1/chat/completions").mock(side_effect=retry_handler) - response = client.inference.with_raw_response.chat_completion( + response = client.chat.completions.with_raw_response.create( messages=[ { "content": "string", "role": "user", } ], - model_id="model_id", + model="model", extra_headers={"x-stainless-retry-count": "42"}, ) @@ -1498,17 +1498,17 @@ async def test_parse_retry_after_header(self, remaining_retries: int, retry_afte async def test_retrying_timeout_errors_doesnt_leak( self, respx_mock: MockRouter, async_client: AsyncLlamaStackClient ) -> None: - respx_mock.post("/v1/inference/chat-completion").mock(side_effect=httpx.TimeoutException("Test timeout error")) + respx_mock.post("/v1/chat/completions").mock(side_effect=httpx.TimeoutException("Test timeout error")) with pytest.raises(APITimeoutError): - await async_client.inference.with_streaming_response.chat_completion( + await async_client.chat.completions.with_streaming_response.create( messages=[ { "content": "string", "role": "user", } ], - model_id="model_id", + model="model", ).__aenter__() assert _get_open_connections(self.client) == 0 @@ -1518,17 +1518,17 @@ async def test_retrying_timeout_errors_doesnt_leak( async def test_retrying_status_errors_doesnt_leak( self, respx_mock: MockRouter, async_client: AsyncLlamaStackClient ) -> None: - respx_mock.post("/v1/inference/chat-completion").mock(return_value=httpx.Response(500)) + 
respx_mock.post("/v1/chat/completions").mock(return_value=httpx.Response(500)) with pytest.raises(APIStatusError): - await async_client.inference.with_streaming_response.chat_completion( + await async_client.chat.completions.with_streaming_response.create( messages=[ { "content": "string", "role": "user", } ], - model_id="model_id", + model="model", ).__aenter__() assert _get_open_connections(self.client) == 0 @@ -1557,16 +1557,16 @@ def retry_handler(_request: httpx.Request) -> httpx.Response: return httpx.Response(500) return httpx.Response(200) - respx_mock.post("/v1/inference/chat-completion").mock(side_effect=retry_handler) + respx_mock.post("/v1/chat/completions").mock(side_effect=retry_handler) - response = await client.inference.with_raw_response.chat_completion( + response = await client.chat.completions.with_raw_response.create( messages=[ { "content": "string", "role": "user", } ], - model_id="model_id", + model="model", ) assert response.retries_taken == failures_before_success @@ -1590,16 +1590,16 @@ def retry_handler(_request: httpx.Request) -> httpx.Response: return httpx.Response(500) return httpx.Response(200) - respx_mock.post("/v1/inference/chat-completion").mock(side_effect=retry_handler) + respx_mock.post("/v1/chat/completions").mock(side_effect=retry_handler) - response = await client.inference.with_raw_response.chat_completion( + response = await client.chat.completions.with_raw_response.create( messages=[ { "content": "string", "role": "user", } ], - model_id="model_id", + model="model", extra_headers={"x-stainless-retry-count": Omit()}, ) @@ -1623,16 +1623,16 @@ def retry_handler(_request: httpx.Request) -> httpx.Response: return httpx.Response(500) return httpx.Response(200) - respx_mock.post("/v1/inference/chat-completion").mock(side_effect=retry_handler) + respx_mock.post("/v1/chat/completions").mock(side_effect=retry_handler) - response = await client.inference.with_raw_response.chat_completion( + response = await client.chat.completions.with_raw_response.create( messages=[ { "content": "string", "role": "user", } ], - model_id="model_id", + model="model", extra_headers={"x-stainless-retry-count": "42"}, )