diff --git a/.release-please-manifest.json b/.release-please-manifest.json
index ed9acd29..1ae25264 100644
--- a/.release-please-manifest.json
+++ b/.release-please-manifest.json
@@ -1,3 +1,3 @@
{
- ".": "0.2.23-alpha.1"
+ ".": "0.3.0-alpha.1"
}
diff --git a/.stats.yml b/.stats.yml
index fa9edfc7..755df453 100644
--- a/.stats.yml
+++ b/.stats.yml
@@ -1,4 +1,4 @@
-configured_endpoints: 111
-openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/llamastack%2Fllama-stack-client-f252873ea1e1f38fd207331ef2621c511154d5be3f4076e59cc15754fc58eee4.yml
-openapi_spec_hash: 10cbb4337a06a9fdd7d08612dd6044c3
-config_hash: 0358112cc0f3d880b4d55debdbe1cfa3
+configured_endpoints: 105
+openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/llamastack%2Fllama-stack-client-d7bea816190382a93511491e33d1f37f707620926ab133ae8ce0883d763df741.yml
+openapi_spec_hash: f73b3af77108625edae3f25972b9e665
+config_hash: 548f336ac1b68ab1dfe385b79df764dd
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0011c19f..93d68692 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,36 @@
# Changelog
+## 0.3.0-alpha.1 (2025-09-30)
+
+Full Changelog: [v0.2.23-alpha.1...v0.3.0-alpha.1](https://github.com/llamastack/llama-stack-client-python/compare/v0.2.23-alpha.1...v0.3.0-alpha.1)
+
+### ⚠ BREAKING CHANGES
+
+* **api:** fixes to remove deprecated inference resources
+
+### Features
+
+* **api:** expires_after changes for /files ([7f24c43](https://github.com/llamastack/llama-stack-client-python/commit/7f24c432dc1859312710a4a1ff4a80f6f861bee8))
+* **api:** fixes to remove deprecated inference resources ([04834d2](https://github.com/llamastack/llama-stack-client-python/commit/04834d2189ae4e4b8cd2c9370d1d39857bc6e9ec))
+* **api:** removing openai/v1 ([a918b43](https://github.com/llamastack/llama-stack-client-python/commit/a918b4323118c18f77c2abe7e1a3054c1eebeaac))
+* **api:** updating post /v1/files to have correct multipart/form-data ([433a996](https://github.com/llamastack/llama-stack-client-python/commit/433a996527bcca131ada4730376d8993f34ad6f5))
+
+
+### Bug Fixes
+
+* clean up deprecated code ([f10ead0](https://github.com/llamastack/llama-stack-client-python/commit/f10ead00522b7ca803cd7dc3617da0d451efa7da))
+* Don't retry for non-recoverable server http errors ([#212](https://github.com/llamastack/llama-stack-client-python/issues/212)) ([6782e8f](https://github.com/llamastack/llama-stack-client-python/commit/6782e8fc5931369223ed4446f8e7732f62712eff))
+
+
+### Documentation
+
+* update examples ([f896747](https://github.com/llamastack/llama-stack-client-python/commit/f89674726f55915a8cda0e2b4284be3c92978121))
+
+
+### Build System
+
+* Bump version to 0.2.23 ([0d4dc64](https://github.com/llamastack/llama-stack-client-python/commit/0d4dc6449224fa2a0f6d20f6229dd9d1a5427861))
+
## 0.2.23-alpha.1 (2025-09-26)
Full Changelog: [v0.2.19-alpha.1...v0.2.23-alpha.1](https://github.com/llamastack/llama-stack-client-python/compare/v0.2.19-alpha.1...v0.2.23-alpha.1)
diff --git a/README.md b/README.md
index 928458d2..c8cebcc3 100644
--- a/README.md
+++ b/README.md
@@ -109,6 +109,50 @@ asyncio.run(main())
Functionality between the synchronous and asynchronous clients is otherwise identical.
+## Streaming responses
+
+We provide support for streaming responses using Server-Sent Events (SSE).
+
+```python
+from llama_stack_client import LlamaStackClient
+
+client = LlamaStackClient()
+
+stream = client.chat.completions.create(
+ messages=[
+ {
+ "content": "string",
+ "role": "user",
+ }
+ ],
+ model="model",
+ stream=True,
+)
+for completion in stream:
+ print(completion)
+```
+
+The async client uses the exact same interface.
+
+```python
+from llama_stack_client import AsyncLlamaStackClient
+
+client = AsyncLlamaStackClient()
+
+stream = await client.chat.completions.create(
+ messages=[
+ {
+ "content": "string",
+ "role": "user",
+ }
+ ],
+ model="model",
+ stream=True,
+)
+async for completion in stream:
+ print(completion)
+```
+
## Using types
Nested request parameters are [TypedDicts](https://docs.python.org/3/library/typing.html#typing.TypedDict). Responses are [Pydantic models](https://docs.pydantic.dev) which also provide helper methods for things like:
@@ -118,6 +162,40 @@ Nested request parameters are [TypedDicts](https://docs.python.org/3/library/typ
Typed requests and responses provide autocomplete and documentation within your editor. If you would like to see type errors in VS Code to help catch bugs earlier, set `python.analysis.typeCheckingMode` to `basic`.
+## Nested params
+
+Nested parameters are dictionaries, typed using `TypedDict`, for example:
+
+```python
+from llama_stack_client import LlamaStackClient
+
+client = LlamaStackClient()
+
+client.toolgroups.register(
+ provider_id="provider_id",
+ toolgroup_id="toolgroup_id",
+ mcp_endpoint={"uri": "uri"},
+)
+```
+
+## File uploads
+
+Request parameters that correspond to file uploads can be passed as `bytes`, a [`PathLike`](https://docs.python.org/3/library/os.html#os.PathLike) instance, or a tuple of `(filename, contents, media type)`.
+
+```python
+from pathlib import Path
+from llama_stack_client import LlamaStackClient
+
+client = LlamaStackClient()
+
+client.files.create(
+ file=Path("/path/to/file"),
+ purpose="assistants",
+)
+```
+
+The async client uses the exact same interface. If you pass a [`PathLike`](https://docs.python.org/3/library/os.html#os.PathLike) instance, the file contents will automatically be read asynchronously.
+
## Handling errors
When the library is unable to connect to the API (for example, due to network connection problems or a timeout), a subclass of `llama_stack_client.APIConnectionError` is raised.
@@ -134,9 +212,14 @@ from llama_stack_client import LlamaStackClient
client = LlamaStackClient()
try:
- client.agents.sessions.create(
- agent_id="agent_id",
- session_name="session_name",
+ client.chat.completions.create(
+ messages=[
+ {
+ "content": "string",
+ "role": "user",
+ }
+ ],
+ model="model",
)
except llama_stack_client.APIConnectionError as e:
print("The server could not be reached")
@@ -180,9 +263,14 @@ client = LlamaStackClient(
)
# Or, configure per-request:
-client.with_options(max_retries=5).agents.sessions.create(
- agent_id="agent_id",
- session_name="session_name",
+client.with_options(max_retries=5).chat.completions.create(
+ messages=[
+ {
+ "content": "string",
+ "role": "user",
+ }
+ ],
+ model="model",
)
```
@@ -206,9 +294,14 @@ client = LlamaStackClient(
)
# Override per-request:
-client.with_options(timeout=5.0).agents.sessions.create(
- agent_id="agent_id",
- session_name="session_name",
+client.with_options(timeout=5.0).chat.completions.create(
+ messages=[
+ {
+ "content": "string",
+ "role": "user",
+ }
+ ],
+ model="model",
)
```
@@ -248,14 +341,17 @@ The "raw" Response object can be accessed by prefixing `.with_raw_response.` to
from llama_stack_client import LlamaStackClient
client = LlamaStackClient()
-response = client.agents.sessions.with_raw_response.create(
- agent_id="agent_id",
- session_name="session_name",
+response = client.chat.completions.with_raw_response.create(
+ messages=[{
+ "content": "string",
+ "role": "user",
+ }],
+ model="model",
)
print(response.headers.get('X-My-Header'))
-session = response.parse() # get the object that `agents.sessions.create()` would have returned
-print(session.session_id)
+completion = response.parse() # get the object that `chat.completions.create()` would have returned
+print(completion)
```
These methods return an [`APIResponse`](https://github.com/meta-llama/llama-stack-python/tree/main/src/llama_stack_client/_response.py) object.
@@ -269,9 +365,14 @@ The above interface eagerly reads the full response body when you make the reque
To stream the response body, use `.with_streaming_response` instead, which requires a context manager and only reads the response body once you call `.read()`, `.text()`, `.json()`, `.iter_bytes()`, `.iter_text()`, `.iter_lines()` or `.parse()`. In the async client, these are async methods.
```python
-with client.agents.sessions.with_streaming_response.create(
- agent_id="agent_id",
- session_name="session_name",
+with client.chat.completions.with_streaming_response.create(
+ messages=[
+ {
+ "content": "string",
+ "role": "user",
+ }
+ ],
+ model="model",
) as response:
print(response.headers.get("X-My-Header"))
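The `.with_streaming_response` example above is truncated by the diff context; once inside the context manager, the body can be consumed with the helpers the README lists (`.read()`, `.iter_lines()`, and so on). A minimal sketch of reading the body line by line, with a placeholder model name:

```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient()

# Defer reading the response body until we explicitly consume it.
with client.chat.completions.with_streaming_response.create(
    messages=[{"content": "string", "role": "user"}],
    model="model",
) as response:
    print(response.headers.get("X-My-Header"))

    # .iter_lines() is one of the body-reading helpers described in the README.
    for line in response.iter_lines():
        print(line)
```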
diff --git a/api.md b/api.md
index 22c2120f..c246f4c1 100644
--- a/api.md
+++ b/api.md
@@ -3,10 +3,8 @@
```python
from llama_stack_client.types import (
AgentConfig,
- BatchCompletion,
ChatCompletionResponse,
CompletionMessage,
- ContentDelta,
Document,
InterleavedContent,
InterleavedContentItem,
@@ -14,7 +12,6 @@ from llama_stack_client.types import (
Metric,
ParamType,
QueryConfig,
- QueryGeneratorConfig,
QueryResult,
ResponseFormat,
SafetyViolation,
@@ -91,10 +88,10 @@ from llama_stack_client.types import (
Methods:
-- client.responses.create(\*\*params) -> ResponseObject
-- client.responses.retrieve(response_id) -> ResponseObject
-- client.responses.list(\*\*params) -> SyncOpenAICursorPage[ResponseListResponse]
-- client.responses.delete(response_id) -> ResponseDeleteResponse
+- client.responses.create(\*\*params) -> ResponseObject
+- client.responses.retrieve(response_id) -> ResponseObject
+- client.responses.list(\*\*params) -> SyncOpenAICursorPage[ResponseListResponse]
+- client.responses.delete(response_id) -> ResponseDeleteResponse
## InputItems
@@ -106,7 +103,7 @@ from llama_stack_client.types.responses import InputItemListResponse
Methods:
-- client.responses.input_items.list(response_id, \*\*params) -> InputItemListResponse
+- client.responses.input_items.list(response_id, \*\*params) -> InputItemListResponse
# Agents
@@ -164,12 +161,7 @@ Methods:
Types:
```python
-from llama_stack_client.types.agents import (
- AgentTurnResponseStreamChunk,
- Turn,
- TurnResponseEvent,
- TurnResponseEventPayload,
-)
+from llama_stack_client.types.agents import AgentTurnResponseStreamChunk, Turn, TurnResponseEvent
```
Methods:
@@ -206,7 +198,7 @@ Methods:
Types:
```python
-from llama_stack_client.types import BenchmarkConfig, EvalCandidate, EvaluateResponse, Job
+from llama_stack_client.types import BenchmarkConfig, EvaluateResponse, Job
```
Methods:
@@ -242,24 +234,12 @@ Methods:
Types:
```python
-from llama_stack_client.types import (
- ChatCompletionResponseStreamChunk,
- CompletionResponse,
- EmbeddingsResponse,
- TokenLogProbs,
- InferenceBatchChatCompletionResponse,
- InferenceRerankResponse,
-)
+from llama_stack_client.types import InferenceRerankResponse
```
Methods:
-- client.inference.batch_chat_completion(\*\*params) -> InferenceBatchChatCompletionResponse
-- client.inference.batch_completion(\*\*params) -> BatchCompletion
-- client.inference.chat_completion(\*\*params) -> ChatCompletionResponse
-- client.inference.completion(\*\*params) -> CompletionResponse
-- client.inference.embeddings(\*\*params) -> EmbeddingsResponse
-- client.inference.rerank(\*\*params) -> InferenceRerankResponse
+- client.inference.rerank(\*\*params) -> InferenceRerankResponse
# Embeddings
@@ -271,7 +251,7 @@ from llama_stack_client.types import CreateEmbeddingsResponse
Methods:
-- client.embeddings.create(\*\*params) -> CreateEmbeddingsResponse
+- client.embeddings.create(\*\*params) -> CreateEmbeddingsResponse
# Chat
@@ -295,9 +275,9 @@ from llama_stack_client.types.chat import (
Methods:
-- client.chat.completions.create(\*\*params) -> CompletionCreateResponse
-- client.chat.completions.retrieve(completion_id) -> CompletionRetrieveResponse
-- client.chat.completions.list(\*\*params) -> SyncOpenAICursorPage[CompletionListResponse]
+- client.chat.completions.create(\*\*params) -> CompletionCreateResponse
+- client.chat.completions.retrieve(completion_id) -> CompletionRetrieveResponse
+- client.chat.completions.list(\*\*params) -> SyncOpenAICursorPage[CompletionListResponse]
# Completions
@@ -309,7 +289,7 @@ from llama_stack_client.types import CompletionCreateResponse
Methods:
-- client.completions.create(\*\*params) -> CompletionCreateResponse
+- client.completions.create(\*\*params) -> CompletionCreateResponse
# VectorIo
@@ -359,12 +339,12 @@ from llama_stack_client.types import (
Methods:
-- client.vector_stores.create(\*\*params) -> VectorStore
-- client.vector_stores.retrieve(vector_store_id) -> VectorStore
-- client.vector_stores.update(vector_store_id, \*\*params) -> VectorStore
-- client.vector_stores.list(\*\*params) -> SyncOpenAICursorPage[VectorStore]
-- client.vector_stores.delete(vector_store_id) -> VectorStoreDeleteResponse
-- client.vector_stores.search(vector_store_id, \*\*params) -> VectorStoreSearchResponse
+- client.vector_stores.create(\*\*params) -> VectorStore
+- client.vector_stores.retrieve(vector_store_id) -> VectorStore
+- client.vector_stores.update(vector_store_id, \*\*params) -> VectorStore
+- client.vector_stores.list(\*\*params) -> SyncOpenAICursorPage[VectorStore]
+- client.vector_stores.delete(vector_store_id) -> VectorStoreDeleteResponse
+- client.vector_stores.search(vector_store_id, \*\*params) -> VectorStoreSearchResponse
## Files
@@ -380,12 +360,12 @@ from llama_stack_client.types.vector_stores import (
Methods:
-- client.vector_stores.files.create(vector_store_id, \*\*params) -> VectorStoreFile
-- client.vector_stores.files.retrieve(file_id, \*, vector_store_id) -> VectorStoreFile
-- client.vector_stores.files.update(file_id, \*, vector_store_id, \*\*params) -> VectorStoreFile
-- client.vector_stores.files.list(vector_store_id, \*\*params) -> SyncOpenAICursorPage[VectorStoreFile]
-- client.vector_stores.files.delete(file_id, \*, vector_store_id) -> FileDeleteResponse
-- client.vector_stores.files.content(file_id, \*, vector_store_id) -> FileContentResponse
+- client.vector_stores.files.create(vector_store_id, \*\*params) -> VectorStoreFile
+- client.vector_stores.files.retrieve(file_id, \*, vector_store_id) -> VectorStoreFile
+- client.vector_stores.files.update(file_id, \*, vector_store_id, \*\*params) -> VectorStoreFile
+- client.vector_stores.files.list(vector_store_id, \*\*params) -> SyncOpenAICursorPage[VectorStoreFile]
+- client.vector_stores.files.delete(file_id, \*, vector_store_id) -> FileDeleteResponse
+- client.vector_stores.files.content(file_id, \*, vector_store_id) -> FileContentResponse
# Models
@@ -412,7 +392,7 @@ from llama_stack_client.types.models import OpenAIListResponse
Methods:
-- client.models.openai.list() -> OpenAIListResponse
+- client.models.openai.list() -> ModelListResponse
# PostTraining
@@ -481,7 +461,7 @@ from llama_stack_client.types import CreateResponse
Methods:
-- client.moderations.create(\*\*params) -> CreateResponse
+- client.moderations.create(\*\*params) -> CreateResponse
# Safety
@@ -608,8 +588,8 @@ from llama_stack_client.types import DeleteFileResponse, File, ListFilesResponse
Methods:
-- client.files.create(\*\*params) -> File
-- client.files.retrieve(file_id) -> File
-- client.files.list(\*\*params) -> SyncOpenAICursorPage[File]
-- client.files.delete(file_id) -> DeleteFileResponse
-- client.files.content(file_id) -> object
+- client.files.create(\*\*params) -> File
+- client.files.retrieve(file_id) -> File
+- client.files.list(\*\*params) -> SyncOpenAICursorPage[File]
+- client.files.delete(file_id) -> DeleteFileResponse
+- client.files.content(file_id) -> object
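Of the inference methods previously listed here, only `client.inference.rerank` remains. A hedged usage sketch, with parameter names taken from the `rerank()` signature retained in `resources/inference.py` (the model identifier and document strings are placeholders):

```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient()

# Rerank a small list of documents against a query. Each item may be a plain
# string, per the rerank() docstring in resources/inference.py.
results = client.inference.rerank(
    model="reranker-model",  # placeholder model identifier
    query="What is the capital of France?",
    items=[
        "Paris is the capital of France.",
        "Berlin is the capital of Germany.",
    ],
    max_num_results=2,  # optional; by default all items are returned
)
print(results)
```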
diff --git a/pyproject.toml b/pyproject.toml
index 843dd9b7..3b50518e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
[project]
name = "llama_stack_client"
-version = "0.2.23"
+version = "0.3.0-alpha.1"
description = "The official Python library for the llama-stack-client API"
dynamic = ["readme"]
license = "MIT"
diff --git a/src/llama_stack_client/lib/inference/event_logger.py b/src/llama_stack_client/lib/inference/event_logger.py
index 14b46372..cbf5f680 100644
--- a/src/llama_stack_client/lib/inference/event_logger.py
+++ b/src/llama_stack_client/lib/inference/event_logger.py
@@ -5,7 +5,7 @@
# the root directory of this source tree.
from typing import Generator
from termcolor import cprint
-from llama_stack_client.types import ChatCompletionResponseStreamChunk, ChatCompletionChunk
+from llama_stack_client.types import ChatCompletionChunk
class InferenceStreamPrintableEvent:
@@ -28,35 +28,11 @@ def __init__(self):
self.is_thinking = False
def yield_printable_events(
- self, chunk: ChatCompletionResponseStreamChunk | ChatCompletionChunk
+ self, chunk: ChatCompletionChunk
) -> Generator[InferenceStreamPrintableEvent, None, None]:
- # Check if the chunk has event attribute (ChatCompletionResponseStreamChunk)
- if hasattr(chunk, "event"):
- yield from self._handle_inference_stream_chunk(chunk)
- # Check if the chunk has choices attribute (ChatCompletionChunk)
- elif hasattr(chunk, "choices") and len(chunk.choices) > 0:
+ if hasattr(chunk, "choices") and len(chunk.choices) > 0:
yield from self._handle_chat_completion_chunk(chunk)
- def _handle_inference_stream_chunk(
- self, chunk: ChatCompletionResponseStreamChunk
- ) -> Generator[InferenceStreamPrintableEvent, None, None]:
- event = chunk.event
- if event.event_type == "start":
- yield InferenceStreamPrintableEvent("Assistant> ", color="cyan", end="")
- elif event.event_type == "progress":
- if event.delta.type == "reasoning":
- if not self.is_thinking:
- yield InferenceStreamPrintableEvent(" ", color="magenta", end="")
- self.is_thinking = True
- yield InferenceStreamPrintableEvent(event.delta.reasoning, color="magenta", end="")
- else:
- if self.is_thinking:
- yield InferenceStreamPrintableEvent("", color="magenta", end="")
- self.is_thinking = False
- yield InferenceStreamPrintableEvent(event.delta.text, color="yellow", end="")
- elif event.event_type == "complete":
- yield InferenceStreamPrintableEvent("")
-
def _handle_chat_completion_chunk(
self, chunk: ChatCompletionChunk
) -> Generator[InferenceStreamPrintableEvent, None, None]:
diff --git a/src/llama_stack_client/resources/chat/completions.py b/src/llama_stack_client/resources/chat/completions.py
index 5445a2d1..2fb19980 100644
--- a/src/llama_stack_client/resources/chat/completions.py
+++ b/src/llama_stack_client/resources/chat/completions.py
@@ -372,7 +372,7 @@ def create(
timeout: float | httpx.Timeout | None | NotGiven = not_given,
) -> CompletionCreateResponse | Stream[ChatCompletionChunk]:
return self._post(
- "/v1/openai/v1/chat/completions",
+ "/v1/chat/completions",
body=maybe_transform(
{
"messages": messages,
@@ -439,7 +439,7 @@ def retrieve(
if not completion_id:
raise ValueError(f"Expected a non-empty value for `completion_id` but received {completion_id!r}")
return self._get(
- f"/v1/openai/v1/chat/completions/{completion_id}",
+ f"/v1/chat/completions/{completion_id}",
options=make_request_options(
extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
),
@@ -481,7 +481,7 @@ def list(
timeout: Override the client-level default timeout for this request, in seconds
"""
return self._get_api_list(
- "/v1/openai/v1/chat/completions",
+ "/v1/chat/completions",
page=SyncOpenAICursorPage[CompletionListResponse],
options=make_request_options(
extra_headers=extra_headers,
@@ -845,7 +845,7 @@ async def create(
timeout: float | httpx.Timeout | None | NotGiven = not_given,
) -> CompletionCreateResponse | AsyncStream[ChatCompletionChunk]:
return await self._post(
- "/v1/openai/v1/chat/completions",
+ "/v1/chat/completions",
body=await async_maybe_transform(
{
"messages": messages,
@@ -912,7 +912,7 @@ async def retrieve(
if not completion_id:
raise ValueError(f"Expected a non-empty value for `completion_id` but received {completion_id!r}")
return await self._get(
- f"/v1/openai/v1/chat/completions/{completion_id}",
+ f"/v1/chat/completions/{completion_id}",
options=make_request_options(
extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
),
@@ -954,7 +954,7 @@ def list(
timeout: Override the client-level default timeout for this request, in seconds
"""
return self._get_api_list(
- "/v1/openai/v1/chat/completions",
+ "/v1/chat/completions",
page=AsyncOpenAICursorPage[CompletionListResponse],
options=make_request_options(
extra_headers=extra_headers,
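With the deprecated `inference.chat_completion` helpers deleted later in this diff, callers move to `chat.completions.create`, which now posts to `/v1/chat/completions`. A rough migration sketch; the model name is a placeholder and the old call is shown only as a comment for comparison:

```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient()

# Before (removed in this release):
#   client.inference.chat_completion(
#       messages=[{"content": "Hello", "role": "user"}],
#       model_id="model",
#   )

# After: the OpenAI-compatible chat completions resource. Note `model`
# replaces the old `model_id` parameter.
completion = client.chat.completions.create(
    messages=[{"content": "Hello", "role": "user"}],
    model="model",
)
print(completion)
```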
diff --git a/src/llama_stack_client/resources/completions.py b/src/llama_stack_client/resources/completions.py
index 2c1475de..caeab7a1 100644
--- a/src/llama_stack_client/resources/completions.py
+++ b/src/llama_stack_client/resources/completions.py
@@ -326,7 +326,7 @@ def create(
timeout: float | httpx.Timeout | None | NotGiven = not_given,
) -> CompletionCreateResponse | Stream[CompletionCreateResponse]:
return self._post(
- "/v1/openai/v1/completions",
+ "/v1/completions",
body=maybe_transform(
{
"model": model,
@@ -664,7 +664,7 @@ async def create(
timeout: float | httpx.Timeout | None | NotGiven = not_given,
) -> CompletionCreateResponse | AsyncStream[CompletionCreateResponse]:
return await self._post(
- "/v1/openai/v1/completions",
+ "/v1/completions",
body=await async_maybe_transform(
{
"model": model,
diff --git a/src/llama_stack_client/resources/embeddings.py b/src/llama_stack_client/resources/embeddings.py
index 60c38cb2..29cd69d8 100644
--- a/src/llama_stack_client/resources/embeddings.py
+++ b/src/llama_stack_client/resources/embeddings.py
@@ -87,7 +87,7 @@ def create(
timeout: Override the client-level default timeout for this request, in seconds
"""
return self._post(
- "/v1/openai/v1/embeddings",
+ "/v1/embeddings",
body=maybe_transform(
{
"input": input,
@@ -169,7 +169,7 @@ async def create(
timeout: Override the client-level default timeout for this request, in seconds
"""
return await self._post(
- "/v1/openai/v1/embeddings",
+ "/v1/embeddings",
body=await async_maybe_transform(
{
"input": input,
diff --git a/src/llama_stack_client/resources/files.py b/src/llama_stack_client/resources/files.py
index 6b395e52..39add811 100644
--- a/src/llama_stack_client/resources/files.py
+++ b/src/llama_stack_client/resources/files.py
@@ -51,6 +51,7 @@ def create(
*,
file: FileTypes,
purpose: Literal["assistants", "batch"],
+ expires_after: file_create_params.ExpiresAfter | Omit = omit,
# Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
# The extra values given here take precedence over values defined on the client or passed to this method.
extra_headers: Headers | None = None,
@@ -65,10 +66,17 @@ def create(
- file: The File object (not file name) to be uploaded.
- purpose: The intended purpose of the uploaded file.
+ - expires_after: Optional form values describing expiration for the file.
Args:
purpose: Valid purpose values for OpenAI Files API.
+ expires_after:
+ Control expiration of uploaded files. Params:
+
+ - anchor, must be "created_at"
+ - seconds, must be int between 3600 and 2592000 (1 hour to 30 days)
+
extra_headers: Send extra headers
extra_query: Add additional query parameters to the request
@@ -81,6 +89,7 @@ def create(
{
"file": file,
"purpose": purpose,
+ "expires_after": expires_after,
}
)
files = extract_files(cast(Mapping[str, object], body), paths=[["file"]])
@@ -89,7 +98,7 @@ def create(
# multipart/form-data; boundary=---abc--
extra_headers = {"Content-Type": "multipart/form-data", **(extra_headers or {})}
return self._post(
- "/v1/openai/v1/files",
+ "/v1/files",
body=maybe_transform(body, file_create_params.FileCreateParams),
files=files,
options=make_request_options(
@@ -124,7 +133,7 @@ def retrieve(
if not file_id:
raise ValueError(f"Expected a non-empty value for `file_id` but received {file_id!r}")
return self._get(
- f"/v1/openai/v1/files/{file_id}",
+ f"/v1/files/{file_id}",
options=make_request_options(
extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
),
@@ -171,7 +180,7 @@ def list(
timeout: Override the client-level default timeout for this request, in seconds
"""
return self._get_api_list(
- "/v1/openai/v1/files",
+ "/v1/files",
page=SyncOpenAICursorPage[File],
options=make_request_options(
extra_headers=extra_headers,
@@ -217,7 +226,7 @@ def delete(
if not file_id:
raise ValueError(f"Expected a non-empty value for `file_id` but received {file_id!r}")
return self._delete(
- f"/v1/openai/v1/files/{file_id}",
+ f"/v1/files/{file_id}",
options=make_request_options(
extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
),
@@ -250,7 +259,7 @@ def content(
if not file_id:
raise ValueError(f"Expected a non-empty value for `file_id` but received {file_id!r}")
return self._get(
- f"/v1/openai/v1/files/{file_id}/content",
+ f"/v1/files/{file_id}/content",
options=make_request_options(
extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
),
@@ -283,6 +292,7 @@ async def create(
*,
file: FileTypes,
purpose: Literal["assistants", "batch"],
+ expires_after: file_create_params.ExpiresAfter | Omit = omit,
# Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
# The extra values given here take precedence over values defined on the client or passed to this method.
extra_headers: Headers | None = None,
@@ -297,10 +307,17 @@ async def create(
- file: The File object (not file name) to be uploaded.
- purpose: The intended purpose of the uploaded file.
+ - expires_after: Optional form values describing expiration for the file.
Args:
purpose: Valid purpose values for OpenAI Files API.
+ expires_after:
+ Control expiration of uploaded files. Params:
+
+ - anchor, must be "created_at"
+ - seconds, must be int between 3600 and 2592000 (1 hour to 30 days)
+
extra_headers: Send extra headers
extra_query: Add additional query parameters to the request
@@ -313,6 +330,7 @@ async def create(
{
"file": file,
"purpose": purpose,
+ "expires_after": expires_after,
}
)
files = extract_files(cast(Mapping[str, object], body), paths=[["file"]])
@@ -321,7 +339,7 @@ async def create(
# multipart/form-data; boundary=---abc--
extra_headers = {"Content-Type": "multipart/form-data", **(extra_headers or {})}
return await self._post(
- "/v1/openai/v1/files",
+ "/v1/files",
body=await async_maybe_transform(body, file_create_params.FileCreateParams),
files=files,
options=make_request_options(
@@ -356,7 +374,7 @@ async def retrieve(
if not file_id:
raise ValueError(f"Expected a non-empty value for `file_id` but received {file_id!r}")
return await self._get(
- f"/v1/openai/v1/files/{file_id}",
+ f"/v1/files/{file_id}",
options=make_request_options(
extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
),
@@ -403,7 +421,7 @@ def list(
timeout: Override the client-level default timeout for this request, in seconds
"""
return self._get_api_list(
- "/v1/openai/v1/files",
+ "/v1/files",
page=AsyncOpenAICursorPage[File],
options=make_request_options(
extra_headers=extra_headers,
@@ -449,7 +467,7 @@ async def delete(
if not file_id:
raise ValueError(f"Expected a non-empty value for `file_id` but received {file_id!r}")
return await self._delete(
- f"/v1/openai/v1/files/{file_id}",
+ f"/v1/files/{file_id}",
options=make_request_options(
extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
),
@@ -482,7 +500,7 @@ async def content(
if not file_id:
raise ValueError(f"Expected a non-empty value for `file_id` but received {file_id!r}")
return await self._get(
- f"/v1/openai/v1/files/{file_id}/content",
+ f"/v1/files/{file_id}/content",
options=make_request_options(
extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
),
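The new `expires_after` form field documented above accepts an anchor of `"created_at"` and a `seconds` value between 3600 and 2592000. A minimal sketch of an upload that expires one day after creation; the file path is a placeholder:

```python
from pathlib import Path

from llama_stack_client import LlamaStackClient

client = LlamaStackClient()

# Upload a file and ask the server to expire it 24 hours after creation.
uploaded = client.files.create(
    file=Path("/path/to/file"),
    purpose="assistants",
    expires_after={
        "anchor": "created_at",  # must be "created_at" per the docstring above
        "seconds": 86400,  # 24 hours; allowed range is 3600 to 2592000
    },
)
print(uploaded)
```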
diff --git a/src/llama_stack_client/resources/inference.py b/src/llama_stack_client/resources/inference.py
index 732025cc..e5cf7b6b 100644
--- a/src/llama_stack_client/resources/inference.py
+++ b/src/llama_stack_client/resources/inference.py
@@ -2,1106 +2,76 @@
from __future__ import annotations
-import typing_extensions
-from typing import Type, Union, Iterable, cast
-from typing_extensions import Literal, overload
+from typing import Type, cast
import httpx
-from ..types import (
- inference_rerank_params,
- inference_completion_params,
- inference_embeddings_params,
- inference_chat_completion_params,
- inference_batch_completion_params,
- inference_batch_chat_completion_params,
-)
-from .._types import Body, Omit, Query, Headers, NotGiven, SequenceNotStr, omit, not_given
-from .._utils import required_args, maybe_transform, async_maybe_transform
-from .._compat import cached_property
-from .._resource import SyncAPIResource, AsyncAPIResource
-from .._response import (
- to_raw_response_wrapper,
- to_streamed_response_wrapper,
- async_to_raw_response_wrapper,
- async_to_streamed_response_wrapper,
-)
-from .._wrappers import DataWrapper
-from .._streaming import Stream, AsyncStream
-from .._base_client import make_request_options
-from ..types.completion_response import CompletionResponse
-from ..types.embeddings_response import EmbeddingsResponse
-from ..types.shared_params.message import Message
-from ..types.shared.batch_completion import BatchCompletion
-from ..types.inference_rerank_response import InferenceRerankResponse
-from ..types.shared_params.response_format import ResponseFormat
-from ..types.shared_params.sampling_params import SamplingParams
-from ..types.shared.chat_completion_response import ChatCompletionResponse
-from ..types.shared_params.interleaved_content import InterleavedContent
-from ..types.chat_completion_response_stream_chunk import ChatCompletionResponseStreamChunk
-from ..types.shared_params.interleaved_content_item import InterleavedContentItem
-from ..types.inference_batch_chat_completion_response import InferenceBatchChatCompletionResponse
-
-__all__ = ["InferenceResource", "AsyncInferenceResource"]
-
-
-class InferenceResource(SyncAPIResource):
- @cached_property
- def with_raw_response(self) -> InferenceResourceWithRawResponse:
- """
- This property can be used as a prefix for any HTTP method call to return
- the raw response object instead of the parsed content.
-
- For more information, see https://www.github.com/llamastack/llama-stack-client-python#accessing-raw-response-data-eg-headers
- """
- return InferenceResourceWithRawResponse(self)
-
- @cached_property
- def with_streaming_response(self) -> InferenceResourceWithStreamingResponse:
- """
- An alternative to `.with_raw_response` that doesn't eagerly read the response body.
-
- For more information, see https://www.github.com/llamastack/llama-stack-client-python#with_streaming_response
- """
- return InferenceResourceWithStreamingResponse(self)
-
- def batch_chat_completion(
- self,
- *,
- messages_batch: Iterable[Iterable[Message]],
- model_id: str,
- logprobs: inference_batch_chat_completion_params.Logprobs | Omit = omit,
- response_format: ResponseFormat | Omit = omit,
- sampling_params: SamplingParams | Omit = omit,
- tool_config: inference_batch_chat_completion_params.ToolConfig | Omit = omit,
- tools: Iterable[inference_batch_chat_completion_params.Tool] | Omit = omit,
- # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
- # The extra values given here take precedence over values defined on the client or passed to this method.
- extra_headers: Headers | None = None,
- extra_query: Query | None = None,
- extra_body: Body | None = None,
- timeout: float | httpx.Timeout | None | NotGiven = not_given,
- ) -> InferenceBatchChatCompletionResponse:
- """
- Generate chat completions for a batch of messages using the specified model.
-
- Args:
- messages_batch: The messages to generate completions for.
-
- model_id: The identifier of the model to use. The model must be registered with Llama
- Stack and available via the /models endpoint.
-
- logprobs: (Optional) If specified, log probabilities for each token position will be
- returned.
-
- response_format: (Optional) Grammar specification for guided (structured) decoding.
-
- sampling_params: (Optional) Parameters to control the sampling strategy.
-
- tool_config: (Optional) Configuration for tool use.
-
- tools: (Optional) List of tool definitions available to the model.
-
- extra_headers: Send extra headers
-
- extra_query: Add additional query parameters to the request
-
- extra_body: Add additional JSON properties to the request
-
- timeout: Override the client-level default timeout for this request, in seconds
- """
- return self._post(
- "/v1/inference/batch-chat-completion",
- body=maybe_transform(
- {
- "messages_batch": messages_batch,
- "model_id": model_id,
- "logprobs": logprobs,
- "response_format": response_format,
- "sampling_params": sampling_params,
- "tool_config": tool_config,
- "tools": tools,
- },
- inference_batch_chat_completion_params.InferenceBatchChatCompletionParams,
- ),
- options=make_request_options(
- extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
- ),
- cast_to=InferenceBatchChatCompletionResponse,
- )
-
- def batch_completion(
- self,
- *,
- content_batch: SequenceNotStr[InterleavedContent],
- model_id: str,
- logprobs: inference_batch_completion_params.Logprobs | Omit = omit,
- response_format: ResponseFormat | Omit = omit,
- sampling_params: SamplingParams | Omit = omit,
- # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
- # The extra values given here take precedence over values defined on the client or passed to this method.
- extra_headers: Headers | None = None,
- extra_query: Query | None = None,
- extra_body: Body | None = None,
- timeout: float | httpx.Timeout | None | NotGiven = not_given,
- ) -> BatchCompletion:
- """
- Generate completions for a batch of content using the specified model.
-
- Args:
- content_batch: The content to generate completions for.
-
- model_id: The identifier of the model to use. The model must be registered with Llama
- Stack and available via the /models endpoint.
-
- logprobs: (Optional) If specified, log probabilities for each token position will be
- returned.
-
- response_format: (Optional) Grammar specification for guided (structured) decoding.
-
- sampling_params: (Optional) Parameters to control the sampling strategy.
-
- extra_headers: Send extra headers
-
- extra_query: Add additional query parameters to the request
-
- extra_body: Add additional JSON properties to the request
-
- timeout: Override the client-level default timeout for this request, in seconds
- """
- return self._post(
- "/v1/inference/batch-completion",
- body=maybe_transform(
- {
- "content_batch": content_batch,
- "model_id": model_id,
- "logprobs": logprobs,
- "response_format": response_format,
- "sampling_params": sampling_params,
- },
- inference_batch_completion_params.InferenceBatchCompletionParams,
- ),
- options=make_request_options(
- extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
- ),
- cast_to=BatchCompletion,
- )
-
- @typing_extensions.deprecated(
- "/v1/inference/chat-completion is deprecated. Please use /v1/openai/v1/chat/completions."
- )
- @overload
- def chat_completion(
- self,
- *,
- messages: Iterable[Message],
- model_id: str,
- logprobs: inference_chat_completion_params.Logprobs | Omit = omit,
- response_format: ResponseFormat | Omit = omit,
- sampling_params: SamplingParams | Omit = omit,
- stream: Literal[False] | Omit = omit,
- tool_choice: Literal["auto", "required", "none"] | Omit = omit,
- tool_config: inference_chat_completion_params.ToolConfig | Omit = omit,
- tool_prompt_format: Literal["json", "function_tag", "python_list"] | Omit = omit,
- tools: Iterable[inference_chat_completion_params.Tool] | Omit = omit,
- # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
- # The extra values given here take precedence over values defined on the client or passed to this method.
- extra_headers: Headers | None = None,
- extra_query: Query | None = None,
- extra_body: Body | None = None,
- timeout: float | httpx.Timeout | None | NotGiven = not_given,
- ) -> ChatCompletionResponse:
- """
- Generate a chat completion for the given messages using the specified model.
-
- Args:
- messages: List of messages in the conversation.
-
- model_id: The identifier of the model to use. The model must be registered with Llama
- Stack and available via the /models endpoint.
-
- logprobs: (Optional) If specified, log probabilities for each token position will be
- returned.
-
- response_format: (Optional) Grammar specification for guided (structured) decoding. There are two
- options: - `ResponseFormat.json_schema`: The grammar is a JSON schema. Most
- providers support this format. - `ResponseFormat.grammar`: The grammar is a BNF
- grammar. This format is more flexible, but not all providers support it.
-
- sampling_params: Parameters to control the sampling strategy.
-
- stream: (Optional) If True, generate an SSE event stream of the response. Defaults to
- False.
-
- tool_choice: (Optional) Whether tool use is required or automatic. Defaults to
- ToolChoice.auto. .. deprecated:: Use tool_config instead.
-
- tool_config: (Optional) Configuration for tool use.
-
- tool_prompt_format: (Optional) Instructs the model how to format tool calls. By default, Llama Stack
- will attempt to use a format that is best adapted to the model. -
- `ToolPromptFormat.json`: The tool calls are formatted as a JSON object. -
- `ToolPromptFormat.function_tag`: The tool calls are enclosed in a
- tag. - `ToolPromptFormat.python_list`: The tool calls
- are output as Python syntax -- a list of function calls. .. deprecated:: Use
- tool_config instead.
-
- tools: (Optional) List of tool definitions available to the model.
-
- extra_headers: Send extra headers
-
- extra_query: Add additional query parameters to the request
-
- extra_body: Add additional JSON properties to the request
-
- timeout: Override the client-level default timeout for this request, in seconds
- """
- ...
-
- @typing_extensions.deprecated(
- "/v1/inference/chat-completion is deprecated. Please use /v1/openai/v1/chat/completions."
- )
- @overload
- def chat_completion(
- self,
- *,
- messages: Iterable[Message],
- model_id: str,
- stream: Literal[True],
- logprobs: inference_chat_completion_params.Logprobs | Omit = omit,
- response_format: ResponseFormat | Omit = omit,
- sampling_params: SamplingParams | Omit = omit,
- tool_choice: Literal["auto", "required", "none"] | Omit = omit,
- tool_config: inference_chat_completion_params.ToolConfig | Omit = omit,
- tool_prompt_format: Literal["json", "function_tag", "python_list"] | Omit = omit,
- tools: Iterable[inference_chat_completion_params.Tool] | Omit = omit,
- # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
- # The extra values given here take precedence over values defined on the client or passed to this method.
- extra_headers: Headers | None = None,
- extra_query: Query | None = None,
- extra_body: Body | None = None,
- timeout: float | httpx.Timeout | None | NotGiven = not_given,
- ) -> Stream[ChatCompletionResponseStreamChunk]:
- """
- Generate a chat completion for the given messages using the specified model.
-
- Args:
- messages: List of messages in the conversation.
-
- model_id: The identifier of the model to use. The model must be registered with Llama
- Stack and available via the /models endpoint.
-
- stream: (Optional) If True, generate an SSE event stream of the response. Defaults to
- False.
-
- logprobs: (Optional) If specified, log probabilities for each token position will be
- returned.
-
- response_format: (Optional) Grammar specification for guided (structured) decoding. There are two
- options: - `ResponseFormat.json_schema`: The grammar is a JSON schema. Most
- providers support this format. - `ResponseFormat.grammar`: The grammar is a BNF
- grammar. This format is more flexible, but not all providers support it.
-
- sampling_params: Parameters to control the sampling strategy.
-
- tool_choice: (Optional) Whether tool use is required or automatic. Defaults to
- ToolChoice.auto. .. deprecated:: Use tool_config instead.
-
- tool_config: (Optional) Configuration for tool use.
-
- tool_prompt_format: (Optional) Instructs the model how to format tool calls. By default, Llama Stack
- will attempt to use a format that is best adapted to the model. -
- `ToolPromptFormat.json`: The tool calls are formatted as a JSON object. -
- `ToolPromptFormat.function_tag`: The tool calls are enclosed in a
- tag. - `ToolPromptFormat.python_list`: The tool calls
- are output as Python syntax -- a list of function calls. .. deprecated:: Use
- tool_config instead.
-
- tools: (Optional) List of tool definitions available to the model.
-
- extra_headers: Send extra headers
-
- extra_query: Add additional query parameters to the request
-
- extra_body: Add additional JSON properties to the request
-
- timeout: Override the client-level default timeout for this request, in seconds
- """
- ...
-
- @typing_extensions.deprecated(
- "/v1/inference/chat-completion is deprecated. Please use /v1/openai/v1/chat/completions."
- )
- @overload
- def chat_completion(
- self,
- *,
- messages: Iterable[Message],
- model_id: str,
- stream: bool,
- logprobs: inference_chat_completion_params.Logprobs | Omit = omit,
- response_format: ResponseFormat | Omit = omit,
- sampling_params: SamplingParams | Omit = omit,
- tool_choice: Literal["auto", "required", "none"] | Omit = omit,
- tool_config: inference_chat_completion_params.ToolConfig | Omit = omit,
- tool_prompt_format: Literal["json", "function_tag", "python_list"] | Omit = omit,
- tools: Iterable[inference_chat_completion_params.Tool] | Omit = omit,
- # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
- # The extra values given here take precedence over values defined on the client or passed to this method.
- extra_headers: Headers | None = None,
- extra_query: Query | None = None,
- extra_body: Body | None = None,
- timeout: float | httpx.Timeout | None | NotGiven = not_given,
- ) -> ChatCompletionResponse | Stream[ChatCompletionResponseStreamChunk]:
- """
- Generate a chat completion for the given messages using the specified model.
-
- Args:
- messages: List of messages in the conversation.
-
- model_id: The identifier of the model to use. The model must be registered with Llama
- Stack and available via the /models endpoint.
-
- stream: (Optional) If True, generate an SSE event stream of the response. Defaults to
- False.
-
- logprobs: (Optional) If specified, log probabilities for each token position will be
- returned.
-
- response_format: (Optional) Grammar specification for guided (structured) decoding. There are two
- options: - `ResponseFormat.json_schema`: The grammar is a JSON schema. Most
- providers support this format. - `ResponseFormat.grammar`: The grammar is a BNF
- grammar. This format is more flexible, but not all providers support it.
-
- sampling_params: Parameters to control the sampling strategy.
-
- tool_choice: (Optional) Whether tool use is required or automatic. Defaults to
- ToolChoice.auto. .. deprecated:: Use tool_config instead.
-
- tool_config: (Optional) Configuration for tool use.
-
- tool_prompt_format: (Optional) Instructs the model how to format tool calls. By default, Llama Stack
- will attempt to use a format that is best adapted to the model. -
- `ToolPromptFormat.json`: The tool calls are formatted as a JSON object. -
- `ToolPromptFormat.function_tag`: The tool calls are enclosed in a
- tag. - `ToolPromptFormat.python_list`: The tool calls
- are output as Python syntax -- a list of function calls. .. deprecated:: Use
- tool_config instead.
-
- tools: (Optional) List of tool definitions available to the model.
-
- extra_headers: Send extra headers
-
- extra_query: Add additional query parameters to the request
-
- extra_body: Add additional JSON properties to the request
-
- timeout: Override the client-level default timeout for this request, in seconds
- """
- ...
-
- @typing_extensions.deprecated(
- "/v1/inference/chat-completion is deprecated. Please use /v1/openai/v1/chat/completions."
- )
- @required_args(["messages", "model_id"], ["messages", "model_id", "stream"])
- def chat_completion(
- self,
- *,
- messages: Iterable[Message],
- model_id: str,
- logprobs: inference_chat_completion_params.Logprobs | Omit = omit,
- response_format: ResponseFormat | Omit = omit,
- sampling_params: SamplingParams | Omit = omit,
- stream: Literal[False] | Literal[True] | Omit = omit,
- tool_choice: Literal["auto", "required", "none"] | Omit = omit,
- tool_config: inference_chat_completion_params.ToolConfig | Omit = omit,
- tool_prompt_format: Literal["json", "function_tag", "python_list"] | Omit = omit,
- tools: Iterable[inference_chat_completion_params.Tool] | Omit = omit,
- # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
- # The extra values given here take precedence over values defined on the client or passed to this method.
- extra_headers: Headers | None = None,
- extra_query: Query | None = None,
- extra_body: Body | None = None,
- timeout: float | httpx.Timeout | None | NotGiven = not_given,
- ) -> ChatCompletionResponse | Stream[ChatCompletionResponseStreamChunk]:
- if stream:
- extra_headers = {"Accept": "text/event-stream", **(extra_headers or {})}
- return self._post(
- "/v1/inference/chat-completion",
- body=maybe_transform(
- {
- "messages": messages,
- "model_id": model_id,
- "logprobs": logprobs,
- "response_format": response_format,
- "sampling_params": sampling_params,
- "stream": stream,
- "tool_choice": tool_choice,
- "tool_config": tool_config,
- "tool_prompt_format": tool_prompt_format,
- "tools": tools,
- },
- inference_chat_completion_params.InferenceChatCompletionParamsStreaming
- if stream
- else inference_chat_completion_params.InferenceChatCompletionParamsNonStreaming,
- ),
- options=make_request_options(
- extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
- ),
- cast_to=ChatCompletionResponse,
- stream=stream or False,
- stream_cls=Stream[ChatCompletionResponseStreamChunk],
- )
-
- @typing_extensions.deprecated("/v1/inference/completion is deprecated. Please use /v1/openai/v1/completions.")
- @overload
- def completion(
- self,
- *,
- content: InterleavedContent,
- model_id: str,
- logprobs: inference_completion_params.Logprobs | Omit = omit,
- response_format: ResponseFormat | Omit = omit,
- sampling_params: SamplingParams | Omit = omit,
- stream: Literal[False] | Omit = omit,
- # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
- # The extra values given here take precedence over values defined on the client or passed to this method.
- extra_headers: Headers | None = None,
- extra_query: Query | None = None,
- extra_body: Body | None = None,
- timeout: float | httpx.Timeout | None | NotGiven = not_given,
- ) -> CompletionResponse:
- """
- Generate a completion for the given content using the specified model.
-
- Args:
- content: The content to generate a completion for.
-
- model_id: The identifier of the model to use. The model must be registered with Llama
- Stack and available via the /models endpoint.
-
- logprobs: (Optional) If specified, log probabilities for each token position will be
- returned.
-
- response_format: (Optional) Grammar specification for guided (structured) decoding.
-
- sampling_params: (Optional) Parameters to control the sampling strategy.
-
- stream: (Optional) If True, generate an SSE event stream of the response. Defaults to
- False.
-
- extra_headers: Send extra headers
-
- extra_query: Add additional query parameters to the request
-
- extra_body: Add additional JSON properties to the request
-
- timeout: Override the client-level default timeout for this request, in seconds
- """
- ...
-
- @typing_extensions.deprecated("/v1/inference/completion is deprecated. Please use /v1/openai/v1/completions.")
- @overload
- def completion(
- self,
- *,
- content: InterleavedContent,
- model_id: str,
- stream: Literal[True],
- logprobs: inference_completion_params.Logprobs | Omit = omit,
- response_format: ResponseFormat | Omit = omit,
- sampling_params: SamplingParams | Omit = omit,
- # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
- # The extra values given here take precedence over values defined on the client or passed to this method.
- extra_headers: Headers | None = None,
- extra_query: Query | None = None,
- extra_body: Body | None = None,
- timeout: float | httpx.Timeout | None | NotGiven = not_given,
- ) -> Stream[CompletionResponse]:
- """
- Generate a completion for the given content using the specified model.
-
- Args:
- content: The content to generate a completion for.
-
- model_id: The identifier of the model to use. The model must be registered with Llama
- Stack and available via the /models endpoint.
-
- stream: (Optional) If True, generate an SSE event stream of the response. Defaults to
- False.
-
- logprobs: (Optional) If specified, log probabilities for each token position will be
- returned.
-
- response_format: (Optional) Grammar specification for guided (structured) decoding.
-
- sampling_params: (Optional) Parameters to control the sampling strategy.
-
- extra_headers: Send extra headers
-
- extra_query: Add additional query parameters to the request
-
- extra_body: Add additional JSON properties to the request
-
- timeout: Override the client-level default timeout for this request, in seconds
- """
- ...
-
- @typing_extensions.deprecated("/v1/inference/completion is deprecated. Please use /v1/openai/v1/completions.")
- @overload
- def completion(
- self,
- *,
- content: InterleavedContent,
- model_id: str,
- stream: bool,
- logprobs: inference_completion_params.Logprobs | Omit = omit,
- response_format: ResponseFormat | Omit = omit,
- sampling_params: SamplingParams | Omit = omit,
- # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
- # The extra values given here take precedence over values defined on the client or passed to this method.
- extra_headers: Headers | None = None,
- extra_query: Query | None = None,
- extra_body: Body | None = None,
- timeout: float | httpx.Timeout | None | NotGiven = not_given,
- ) -> CompletionResponse | Stream[CompletionResponse]:
- """
- Generate a completion for the given content using the specified model.
-
- Args:
- content: The content to generate a completion for.
-
- model_id: The identifier of the model to use. The model must be registered with Llama
- Stack and available via the /models endpoint.
-
- stream: (Optional) If True, generate an SSE event stream of the response. Defaults to
- False.
-
- logprobs: (Optional) If specified, log probabilities for each token position will be
- returned.
-
- response_format: (Optional) Grammar specification for guided (structured) decoding.
-
- sampling_params: (Optional) Parameters to control the sampling strategy.
-
- extra_headers: Send extra headers
-
- extra_query: Add additional query parameters to the request
-
- extra_body: Add additional JSON properties to the request
-
- timeout: Override the client-level default timeout for this request, in seconds
- """
- ...
-
- @typing_extensions.deprecated("/v1/inference/completion is deprecated. Please use /v1/openai/v1/completions.")
- @required_args(["content", "model_id"], ["content", "model_id", "stream"])
- def completion(
- self,
- *,
- content: InterleavedContent,
- model_id: str,
- logprobs: inference_completion_params.Logprobs | Omit = omit,
- response_format: ResponseFormat | Omit = omit,
- sampling_params: SamplingParams | Omit = omit,
- stream: Literal[False] | Literal[True] | Omit = omit,
- # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
- # The extra values given here take precedence over values defined on the client or passed to this method.
- extra_headers: Headers | None = None,
- extra_query: Query | None = None,
- extra_body: Body | None = None,
- timeout: float | httpx.Timeout | None | NotGiven = not_given,
- ) -> CompletionResponse | Stream[CompletionResponse]:
- if stream:
- extra_headers = {"Accept": "text/event-stream", **(extra_headers or {})}
- return self._post(
- "/v1/inference/completion",
- body=maybe_transform(
- {
- "content": content,
- "model_id": model_id,
- "logprobs": logprobs,
- "response_format": response_format,
- "sampling_params": sampling_params,
- "stream": stream,
- },
- inference_completion_params.InferenceCompletionParamsStreaming
- if stream
- else inference_completion_params.InferenceCompletionParamsNonStreaming,
- ),
- options=make_request_options(
- extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
- ),
- cast_to=CompletionResponse,
- stream=stream or False,
- stream_cls=Stream[CompletionResponse],
- )
-
- @typing_extensions.deprecated("/v1/inference/embeddings is deprecated. Please use /v1/openai/v1/embeddings.")
- def embeddings(
- self,
- *,
- contents: Union[SequenceNotStr[str], Iterable[InterleavedContentItem]],
- model_id: str,
- output_dimension: int | Omit = omit,
- task_type: Literal["query", "document"] | Omit = omit,
- text_truncation: Literal["none", "start", "end"] | Omit = omit,
- # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
- # The extra values given here take precedence over values defined on the client or passed to this method.
- extra_headers: Headers | None = None,
- extra_query: Query | None = None,
- extra_body: Body | None = None,
- timeout: float | httpx.Timeout | None | NotGiven = not_given,
- ) -> EmbeddingsResponse:
- """
- Generate embeddings for content pieces using the specified model.
-
- Args:
- contents: List of contents to generate embeddings for. Each content can be a string or an
- InterleavedContentItem (and hence can be multimodal). The behavior depends on
- the model and provider. Some models may only support text.
-
- model_id: The identifier of the model to use. The model must be an embedding model
- registered with Llama Stack and available via the /models endpoint.
-
- output_dimension: (Optional) Output dimensionality for the embeddings. Only supported by
- Matryoshka models.
-
- task_type: (Optional) How is the embedding being used? This is only supported by asymmetric
- embedding models.
-
- text_truncation: (Optional) Config for how to truncate text for embedding when text is longer
- than the model's max sequence length.
-
- extra_headers: Send extra headers
-
- extra_query: Add additional query parameters to the request
-
- extra_body: Add additional JSON properties to the request
-
- timeout: Override the client-level default timeout for this request, in seconds
- """
- return self._post(
- "/v1/inference/embeddings",
- body=maybe_transform(
- {
- "contents": contents,
- "model_id": model_id,
- "output_dimension": output_dimension,
- "task_type": task_type,
- "text_truncation": text_truncation,
- },
- inference_embeddings_params.InferenceEmbeddingsParams,
- ),
- options=make_request_options(
- extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
- ),
- cast_to=EmbeddingsResponse,
- )
-
- def rerank(
- self,
- *,
- items: SequenceNotStr[inference_rerank_params.Item],
- model: str,
- query: inference_rerank_params.Query,
- max_num_results: int | Omit = omit,
- # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
- # The extra values given here take precedence over values defined on the client or passed to this method.
- extra_headers: Headers | None = None,
- extra_query: Query | None = None,
- extra_body: Body | None = None,
- timeout: float | httpx.Timeout | None | NotGiven = not_given,
- ) -> InferenceRerankResponse:
- """
- Rerank a list of documents based on their relevance to a query.
-
- Args:
- items: List of items to rerank. Each item can be a string, text content part, or image
- content part. Each input must not exceed the model's max input token length.
-
- model: The identifier of the reranking model to use.
-
- query: The search query to rank items against. Can be a string, text content part, or
- image content part. The input must not exceed the model's max input token
- length.
-
- max_num_results: (Optional) Maximum number of results to return. Default: returns all.
-
- extra_headers: Send extra headers
-
- extra_query: Add additional query parameters to the request
-
- extra_body: Add additional JSON properties to the request
-
- timeout: Override the client-level default timeout for this request, in seconds
- """
- return self._post(
- "/v1/inference/rerank",
- body=maybe_transform(
- {
- "items": items,
- "model": model,
- "query": query,
- "max_num_results": max_num_results,
- },
- inference_rerank_params.InferenceRerankParams,
- ),
- options=make_request_options(
- extra_headers=extra_headers,
- extra_query=extra_query,
- extra_body=extra_body,
- timeout=timeout,
- post_parser=DataWrapper[InferenceRerankResponse]._unwrapper,
- ),
- cast_to=cast(Type[InferenceRerankResponse], DataWrapper[InferenceRerankResponse]),
- )
-
-
-class AsyncInferenceResource(AsyncAPIResource):
- @cached_property
- def with_raw_response(self) -> AsyncInferenceResourceWithRawResponse:
- """
- This property can be used as a prefix for any HTTP method call to return
- the raw response object instead of the parsed content.
-
- For more information, see https://www.github.com/llamastack/llama-stack-client-python#accessing-raw-response-data-eg-headers
- """
- return AsyncInferenceResourceWithRawResponse(self)
-
- @cached_property
- def with_streaming_response(self) -> AsyncInferenceResourceWithStreamingResponse:
- """
- An alternative to `.with_raw_response` that doesn't eagerly read the response body.
-
- For more information, see https://www.github.com/llamastack/llama-stack-client-python#with_streaming_response
- """
- return AsyncInferenceResourceWithStreamingResponse(self)
-
- async def batch_chat_completion(
- self,
- *,
- messages_batch: Iterable[Iterable[Message]],
- model_id: str,
- logprobs: inference_batch_chat_completion_params.Logprobs | Omit = omit,
- response_format: ResponseFormat | Omit = omit,
- sampling_params: SamplingParams | Omit = omit,
- tool_config: inference_batch_chat_completion_params.ToolConfig | Omit = omit,
- tools: Iterable[inference_batch_chat_completion_params.Tool] | Omit = omit,
- # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
- # The extra values given here take precedence over values defined on the client or passed to this method.
- extra_headers: Headers | None = None,
- extra_query: Query | None = None,
- extra_body: Body | None = None,
- timeout: float | httpx.Timeout | None | NotGiven = not_given,
- ) -> InferenceBatchChatCompletionResponse:
- """
- Generate chat completions for a batch of messages using the specified model.
-
- Args:
- messages_batch: The messages to generate completions for.
-
- model_id: The identifier of the model to use. The model must be registered with Llama
- Stack and available via the /models endpoint.
-
- logprobs: (Optional) If specified, log probabilities for each token position will be
- returned.
-
- response_format: (Optional) Grammar specification for guided (structured) decoding.
-
- sampling_params: (Optional) Parameters to control the sampling strategy.
-
- tool_config: (Optional) Configuration for tool use.
-
- tools: (Optional) List of tool definitions available to the model.
-
- extra_headers: Send extra headers
-
- extra_query: Add additional query parameters to the request
-
- extra_body: Add additional JSON properties to the request
-
- timeout: Override the client-level default timeout for this request, in seconds
- """
- return await self._post(
- "/v1/inference/batch-chat-completion",
- body=await async_maybe_transform(
- {
- "messages_batch": messages_batch,
- "model_id": model_id,
- "logprobs": logprobs,
- "response_format": response_format,
- "sampling_params": sampling_params,
- "tool_config": tool_config,
- "tools": tools,
- },
- inference_batch_chat_completion_params.InferenceBatchChatCompletionParams,
- ),
- options=make_request_options(
- extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
- ),
- cast_to=InferenceBatchChatCompletionResponse,
- )
-
- async def batch_completion(
- self,
- *,
- content_batch: SequenceNotStr[InterleavedContent],
- model_id: str,
- logprobs: inference_batch_completion_params.Logprobs | Omit = omit,
- response_format: ResponseFormat | Omit = omit,
- sampling_params: SamplingParams | Omit = omit,
- # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
- # The extra values given here take precedence over values defined on the client or passed to this method.
- extra_headers: Headers | None = None,
- extra_query: Query | None = None,
- extra_body: Body | None = None,
- timeout: float | httpx.Timeout | None | NotGiven = not_given,
- ) -> BatchCompletion:
- """
- Generate completions for a batch of content using the specified model.
-
- Args:
- content_batch: The content to generate completions for.
-
- model_id: The identifier of the model to use. The model must be registered with Llama
- Stack and available via the /models endpoint.
-
- logprobs: (Optional) If specified, log probabilities for each token position will be
- returned.
-
- response_format: (Optional) Grammar specification for guided (structured) decoding.
-
- sampling_params: (Optional) Parameters to control the sampling strategy.
-
- extra_headers: Send extra headers
-
- extra_query: Add additional query parameters to the request
-
- extra_body: Add additional JSON properties to the request
-
- timeout: Override the client-level default timeout for this request, in seconds
- """
- return await self._post(
- "/v1/inference/batch-completion",
- body=await async_maybe_transform(
- {
- "content_batch": content_batch,
- "model_id": model_id,
- "logprobs": logprobs,
- "response_format": response_format,
- "sampling_params": sampling_params,
- },
- inference_batch_completion_params.InferenceBatchCompletionParams,
- ),
- options=make_request_options(
- extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
- ),
- cast_to=BatchCompletion,
- )
-
- @typing_extensions.deprecated(
- "/v1/inference/chat-completion is deprecated. Please use /v1/openai/v1/chat/completions."
- )
- @overload
- async def chat_completion(
- self,
- *,
- messages: Iterable[Message],
- model_id: str,
- logprobs: inference_chat_completion_params.Logprobs | Omit = omit,
- response_format: ResponseFormat | Omit = omit,
- sampling_params: SamplingParams | Omit = omit,
- stream: Literal[False] | Omit = omit,
- tool_choice: Literal["auto", "required", "none"] | Omit = omit,
- tool_config: inference_chat_completion_params.ToolConfig | Omit = omit,
- tool_prompt_format: Literal["json", "function_tag", "python_list"] | Omit = omit,
- tools: Iterable[inference_chat_completion_params.Tool] | Omit = omit,
- # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
- # The extra values given here take precedence over values defined on the client or passed to this method.
- extra_headers: Headers | None = None,
- extra_query: Query | None = None,
- extra_body: Body | None = None,
- timeout: float | httpx.Timeout | None | NotGiven = not_given,
- ) -> ChatCompletionResponse:
- """
- Generate a chat completion for the given messages using the specified model.
-
- Args:
- messages: List of messages in the conversation.
-
- model_id: The identifier of the model to use. The model must be registered with Llama
- Stack and available via the /models endpoint.
-
- logprobs: (Optional) If specified, log probabilities for each token position will be
- returned.
-
- response_format: (Optional) Grammar specification for guided (structured) decoding. There are two
- options: - `ResponseFormat.json_schema`: The grammar is a JSON schema. Most
- providers support this format. - `ResponseFormat.grammar`: The grammar is a BNF
- grammar. This format is more flexible, but not all providers support it.
-
- sampling_params: Parameters to control the sampling strategy.
-
- stream: (Optional) If True, generate an SSE event stream of the response. Defaults to
- False.
-
- tool_choice: (Optional) Whether tool use is required or automatic. Defaults to
- ToolChoice.auto. .. deprecated:: Use tool_config instead.
-
- tool_config: (Optional) Configuration for tool use.
-
- tool_prompt_format: (Optional) Instructs the model how to format tool calls. By default, Llama Stack
- will attempt to use a format that is best adapted to the model. -
- `ToolPromptFormat.json`: The tool calls are formatted as a JSON object. -
- `ToolPromptFormat.function_tag`: The tool calls are enclosed in a
-              <function=function_name> tag. - `ToolPromptFormat.python_list`: The tool calls
- are output as Python syntax -- a list of function calls. .. deprecated:: Use
- tool_config instead.
-
- tools: (Optional) List of tool definitions available to the model.
-
- extra_headers: Send extra headers
+from ..types import inference_rerank_params
+from .._types import Body, Omit, Query, Headers, NotGiven, SequenceNotStr, omit, not_given
+from .._utils import maybe_transform, async_maybe_transform
+from .._compat import cached_property
+from .._resource import SyncAPIResource, AsyncAPIResource
+from .._response import (
+ to_raw_response_wrapper,
+ to_streamed_response_wrapper,
+ async_to_raw_response_wrapper,
+ async_to_streamed_response_wrapper,
+)
+from .._wrappers import DataWrapper
+from .._base_client import make_request_options
+from ..types.inference_rerank_response import InferenceRerankResponse
- extra_query: Add additional query parameters to the request
+__all__ = ["InferenceResource", "AsyncInferenceResource"]
- extra_body: Add additional JSON properties to the request
- timeout: Override the client-level default timeout for this request, in seconds
+class InferenceResource(SyncAPIResource):
+ @cached_property
+ def with_raw_response(self) -> InferenceResourceWithRawResponse:
"""
- ...
+ This property can be used as a prefix for any HTTP method call to return
+ the raw response object instead of the parsed content.
- @typing_extensions.deprecated(
- "/v1/inference/chat-completion is deprecated. Please use /v1/openai/v1/chat/completions."
- )
- @overload
- async def chat_completion(
- self,
- *,
- messages: Iterable[Message],
- model_id: str,
- stream: Literal[True],
- logprobs: inference_chat_completion_params.Logprobs | Omit = omit,
- response_format: ResponseFormat | Omit = omit,
- sampling_params: SamplingParams | Omit = omit,
- tool_choice: Literal["auto", "required", "none"] | Omit = omit,
- tool_config: inference_chat_completion_params.ToolConfig | Omit = omit,
- tool_prompt_format: Literal["json", "function_tag", "python_list"] | Omit = omit,
- tools: Iterable[inference_chat_completion_params.Tool] | Omit = omit,
- # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
- # The extra values given here take precedence over values defined on the client or passed to this method.
- extra_headers: Headers | None = None,
- extra_query: Query | None = None,
- extra_body: Body | None = None,
- timeout: float | httpx.Timeout | None | NotGiven = not_given,
- ) -> AsyncStream[ChatCompletionResponseStreamChunk]:
+ For more information, see https://www.github.com/llamastack/llama-stack-client-python#accessing-raw-response-data-eg-headers
"""
- Generate a chat completion for the given messages using the specified model.
-
- Args:
- messages: List of messages in the conversation.
-
- model_id: The identifier of the model to use. The model must be registered with Llama
- Stack and available via the /models endpoint.
-
- stream: (Optional) If True, generate an SSE event stream of the response. Defaults to
- False.
-
- logprobs: (Optional) If specified, log probabilities for each token position will be
- returned.
-
- response_format: (Optional) Grammar specification for guided (structured) decoding. There are two
- options: - `ResponseFormat.json_schema`: The grammar is a JSON schema. Most
- providers support this format. - `ResponseFormat.grammar`: The grammar is a BNF
- grammar. This format is more flexible, but not all providers support it.
-
- sampling_params: Parameters to control the sampling strategy.
-
- tool_choice: (Optional) Whether tool use is required or automatic. Defaults to
- ToolChoice.auto. .. deprecated:: Use tool_config instead.
-
- tool_config: (Optional) Configuration for tool use.
-
- tool_prompt_format: (Optional) Instructs the model how to format tool calls. By default, Llama Stack
- will attempt to use a format that is best adapted to the model. -
- `ToolPromptFormat.json`: The tool calls are formatted as a JSON object. -
- `ToolPromptFormat.function_tag`: The tool calls are enclosed in a
-              <function=function_name> tag. - `ToolPromptFormat.python_list`: The tool calls
- are output as Python syntax -- a list of function calls. .. deprecated:: Use
- tool_config instead.
-
- tools: (Optional) List of tool definitions available to the model.
-
- extra_headers: Send extra headers
-
- extra_query: Add additional query parameters to the request
+ return InferenceResourceWithRawResponse(self)
- extra_body: Add additional JSON properties to the request
+ @cached_property
+ def with_streaming_response(self) -> InferenceResourceWithStreamingResponse:
+ """
+ An alternative to `.with_raw_response` that doesn't eagerly read the response body.
- timeout: Override the client-level default timeout for this request, in seconds
+ For more information, see https://www.github.com/llamastack/llama-stack-client-python#with_streaming_response
"""
- ...
+ return InferenceResourceWithStreamingResponse(self)
- @typing_extensions.deprecated(
- "/v1/inference/chat-completion is deprecated. Please use /v1/openai/v1/chat/completions."
- )
- @overload
- async def chat_completion(
+ def rerank(
self,
*,
- messages: Iterable[Message],
- model_id: str,
- stream: bool,
- logprobs: inference_chat_completion_params.Logprobs | Omit = omit,
- response_format: ResponseFormat | Omit = omit,
- sampling_params: SamplingParams | Omit = omit,
- tool_choice: Literal["auto", "required", "none"] | Omit = omit,
- tool_config: inference_chat_completion_params.ToolConfig | Omit = omit,
- tool_prompt_format: Literal["json", "function_tag", "python_list"] | Omit = omit,
- tools: Iterable[inference_chat_completion_params.Tool] | Omit = omit,
+ items: SequenceNotStr[inference_rerank_params.Item],
+ model: str,
+ query: inference_rerank_params.Query,
+ max_num_results: int | Omit = omit,
# Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
# The extra values given here take precedence over values defined on the client or passed to this method.
extra_headers: Headers | None = None,
extra_query: Query | None = None,
extra_body: Body | None = None,
timeout: float | httpx.Timeout | None | NotGiven = not_given,
- ) -> ChatCompletionResponse | AsyncStream[ChatCompletionResponseStreamChunk]:
+ ) -> InferenceRerankResponse:
"""
- Generate a chat completion for the given messages using the specified model.
+ Rerank a list of documents based on their relevance to a query.
Args:
- messages: List of messages in the conversation.
-
- model_id: The identifier of the model to use. The model must be registered with Llama
- Stack and available via the /models endpoint.
-
- stream: (Optional) If True, generate an SSE event stream of the response. Defaults to
- False.
-
- logprobs: (Optional) If specified, log probabilities for each token position will be
- returned.
-
- response_format: (Optional) Grammar specification for guided (structured) decoding. There are two
- options: - `ResponseFormat.json_schema`: The grammar is a JSON schema. Most
- providers support this format. - `ResponseFormat.grammar`: The grammar is a BNF
- grammar. This format is more flexible, but not all providers support it.
-
- sampling_params: Parameters to control the sampling strategy.
-
- tool_choice: (Optional) Whether tool use is required or automatic. Defaults to
- ToolChoice.auto. .. deprecated:: Use tool_config instead.
+ items: List of items to rerank. Each item can be a string, text content part, or image
+ content part. Each input must not exceed the model's max input token length.
- tool_config: (Optional) Configuration for tool use.
+ model: The identifier of the reranking model to use.
- tool_prompt_format: (Optional) Instructs the model how to format tool calls. By default, Llama Stack
- will attempt to use a format that is best adapted to the model. -
- `ToolPromptFormat.json`: The tool calls are formatted as a JSON object. -
- `ToolPromptFormat.function_tag`: The tool calls are enclosed in a
-              <function=function_name> tag. - `ToolPromptFormat.python_list`: The tool calls
- are output as Python syntax -- a list of function calls. .. deprecated:: Use
- tool_config instead.
+ query: The search query to rank items against. Can be a string, text content part, or
+ image content part. The input must not exceed the model's max input token
+ length.
- tools: (Optional) List of tool definitions available to the model.
+ max_num_results: (Optional) Maximum number of results to return. Default: returns all.
extra_headers: Send extra headers
@@ -1111,306 +81,47 @@ async def chat_completion(
timeout: Override the client-level default timeout for this request, in seconds
"""
- ...
-
- @typing_extensions.deprecated(
- "/v1/inference/chat-completion is deprecated. Please use /v1/openai/v1/chat/completions."
- )
- @required_args(["messages", "model_id"], ["messages", "model_id", "stream"])
- async def chat_completion(
- self,
- *,
- messages: Iterable[Message],
- model_id: str,
- logprobs: inference_chat_completion_params.Logprobs | Omit = omit,
- response_format: ResponseFormat | Omit = omit,
- sampling_params: SamplingParams | Omit = omit,
- stream: Literal[False] | Literal[True] | Omit = omit,
- tool_choice: Literal["auto", "required", "none"] | Omit = omit,
- tool_config: inference_chat_completion_params.ToolConfig | Omit = omit,
- tool_prompt_format: Literal["json", "function_tag", "python_list"] | Omit = omit,
- tools: Iterable[inference_chat_completion_params.Tool] | Omit = omit,
- # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
- # The extra values given here take precedence over values defined on the client or passed to this method.
- extra_headers: Headers | None = None,
- extra_query: Query | None = None,
- extra_body: Body | None = None,
- timeout: float | httpx.Timeout | None | NotGiven = not_given,
- ) -> ChatCompletionResponse | AsyncStream[ChatCompletionResponseStreamChunk]:
- if stream:
- extra_headers = {"Accept": "text/event-stream", **(extra_headers or {})}
- return await self._post(
- "/v1/inference/chat-completion",
- body=await async_maybe_transform(
+ return self._post(
+ "/v1alpha/inference/rerank",
+ body=maybe_transform(
{
- "messages": messages,
- "model_id": model_id,
- "logprobs": logprobs,
- "response_format": response_format,
- "sampling_params": sampling_params,
- "stream": stream,
- "tool_choice": tool_choice,
- "tool_config": tool_config,
- "tool_prompt_format": tool_prompt_format,
- "tools": tools,
+ "items": items,
+ "model": model,
+ "query": query,
+ "max_num_results": max_num_results,
},
- inference_chat_completion_params.InferenceChatCompletionParamsStreaming
- if stream
- else inference_chat_completion_params.InferenceChatCompletionParamsNonStreaming,
+ inference_rerank_params.InferenceRerankParams,
),
options=make_request_options(
- extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
+ extra_headers=extra_headers,
+ extra_query=extra_query,
+ extra_body=extra_body,
+ timeout=timeout,
+ post_parser=DataWrapper[InferenceRerankResponse]._unwrapper,
),
- cast_to=ChatCompletionResponse,
- stream=stream or False,
- stream_cls=AsyncStream[ChatCompletionResponseStreamChunk],
+ cast_to=cast(Type[InferenceRerankResponse], DataWrapper[InferenceRerankResponse]),
)
- @typing_extensions.deprecated("/v1/inference/completion is deprecated. Please use /v1/openai/v1/completions.")
- @overload
- async def completion(
- self,
- *,
- content: InterleavedContent,
- model_id: str,
- logprobs: inference_completion_params.Logprobs | Omit = omit,
- response_format: ResponseFormat | Omit = omit,
- sampling_params: SamplingParams | Omit = omit,
- stream: Literal[False] | Omit = omit,
- # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
- # The extra values given here take precedence over values defined on the client or passed to this method.
- extra_headers: Headers | None = None,
- extra_query: Query | None = None,
- extra_body: Body | None = None,
- timeout: float | httpx.Timeout | None | NotGiven = not_given,
- ) -> CompletionResponse:
- """
- Generate a completion for the given content using the specified model.
-
- Args:
- content: The content to generate a completion for.
-
- model_id: The identifier of the model to use. The model must be registered with Llama
- Stack and available via the /models endpoint.
-
- logprobs: (Optional) If specified, log probabilities for each token position will be
- returned.
-
- response_format: (Optional) Grammar specification for guided (structured) decoding.
-
- sampling_params: (Optional) Parameters to control the sampling strategy.
-
- stream: (Optional) If True, generate an SSE event stream of the response. Defaults to
- False.
-
- extra_headers: Send extra headers
-
- extra_query: Add additional query parameters to the request
-
- extra_body: Add additional JSON properties to the request
-
- timeout: Override the client-level default timeout for this request, in seconds
- """
- ...
-
- @typing_extensions.deprecated("/v1/inference/completion is deprecated. Please use /v1/openai/v1/completions.")
- @overload
- async def completion(
- self,
- *,
- content: InterleavedContent,
- model_id: str,
- stream: Literal[True],
- logprobs: inference_completion_params.Logprobs | Omit = omit,
- response_format: ResponseFormat | Omit = omit,
- sampling_params: SamplingParams | Omit = omit,
- # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
- # The extra values given here take precedence over values defined on the client or passed to this method.
- extra_headers: Headers | None = None,
- extra_query: Query | None = None,
- extra_body: Body | None = None,
- timeout: float | httpx.Timeout | None | NotGiven = not_given,
- ) -> AsyncStream[CompletionResponse]:
- """
- Generate a completion for the given content using the specified model.
-
- Args:
- content: The content to generate a completion for.
-
- model_id: The identifier of the model to use. The model must be registered with Llama
- Stack and available via the /models endpoint.
-
- stream: (Optional) If True, generate an SSE event stream of the response. Defaults to
- False.
- logprobs: (Optional) If specified, log probabilities for each token position will be
- returned.
-
- response_format: (Optional) Grammar specification for guided (structured) decoding.
-
- sampling_params: (Optional) Parameters to control the sampling strategy.
-
- extra_headers: Send extra headers
-
- extra_query: Add additional query parameters to the request
-
- extra_body: Add additional JSON properties to the request
-
- timeout: Override the client-level default timeout for this request, in seconds
- """
- ...
-
- @typing_extensions.deprecated("/v1/inference/completion is deprecated. Please use /v1/openai/v1/completions.")
- @overload
- async def completion(
- self,
- *,
- content: InterleavedContent,
- model_id: str,
- stream: bool,
- logprobs: inference_completion_params.Logprobs | Omit = omit,
- response_format: ResponseFormat | Omit = omit,
- sampling_params: SamplingParams | Omit = omit,
- # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
- # The extra values given here take precedence over values defined on the client or passed to this method.
- extra_headers: Headers | None = None,
- extra_query: Query | None = None,
- extra_body: Body | None = None,
- timeout: float | httpx.Timeout | None | NotGiven = not_given,
- ) -> CompletionResponse | AsyncStream[CompletionResponse]:
+class AsyncInferenceResource(AsyncAPIResource):
+ @cached_property
+ def with_raw_response(self) -> AsyncInferenceResourceWithRawResponse:
"""
- Generate a completion for the given content using the specified model.
-
- Args:
- content: The content to generate a completion for.
-
- model_id: The identifier of the model to use. The model must be registered with Llama
- Stack and available via the /models endpoint.
-
- stream: (Optional) If True, generate an SSE event stream of the response. Defaults to
- False.
-
- logprobs: (Optional) If specified, log probabilities for each token position will be
- returned.
-
- response_format: (Optional) Grammar specification for guided (structured) decoding.
-
- sampling_params: (Optional) Parameters to control the sampling strategy.
-
- extra_headers: Send extra headers
-
- extra_query: Add additional query parameters to the request
-
- extra_body: Add additional JSON properties to the request
+ This property can be used as a prefix for any HTTP method call to return
+ the raw response object instead of the parsed content.
- timeout: Override the client-level default timeout for this request, in seconds
+ For more information, see https://www.github.com/llamastack/llama-stack-client-python#accessing-raw-response-data-eg-headers
"""
- ...
-
- @typing_extensions.deprecated("/v1/inference/completion is deprecated. Please use /v1/openai/v1/completions.")
- @required_args(["content", "model_id"], ["content", "model_id", "stream"])
- async def completion(
- self,
- *,
- content: InterleavedContent,
- model_id: str,
- logprobs: inference_completion_params.Logprobs | Omit = omit,
- response_format: ResponseFormat | Omit = omit,
- sampling_params: SamplingParams | Omit = omit,
- stream: Literal[False] | Literal[True] | Omit = omit,
- # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
- # The extra values given here take precedence over values defined on the client or passed to this method.
- extra_headers: Headers | None = None,
- extra_query: Query | None = None,
- extra_body: Body | None = None,
- timeout: float | httpx.Timeout | None | NotGiven = not_given,
- ) -> CompletionResponse | AsyncStream[CompletionResponse]:
- if stream:
- extra_headers = {"Accept": "text/event-stream", **(extra_headers or {})}
- return await self._post(
- "/v1/inference/completion",
- body=await async_maybe_transform(
- {
- "content": content,
- "model_id": model_id,
- "logprobs": logprobs,
- "response_format": response_format,
- "sampling_params": sampling_params,
- "stream": stream,
- },
- inference_completion_params.InferenceCompletionParamsStreaming
- if stream
- else inference_completion_params.InferenceCompletionParamsNonStreaming,
- ),
- options=make_request_options(
- extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
- ),
- cast_to=CompletionResponse,
- stream=stream or False,
- stream_cls=AsyncStream[CompletionResponse],
- )
+ return AsyncInferenceResourceWithRawResponse(self)
- @typing_extensions.deprecated("/v1/inference/embeddings is deprecated. Please use /v1/openai/v1/embeddings.")
- async def embeddings(
- self,
- *,
- contents: Union[SequenceNotStr[str], Iterable[InterleavedContentItem]],
- model_id: str,
- output_dimension: int | Omit = omit,
- task_type: Literal["query", "document"] | Omit = omit,
- text_truncation: Literal["none", "start", "end"] | Omit = omit,
- # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
- # The extra values given here take precedence over values defined on the client or passed to this method.
- extra_headers: Headers | None = None,
- extra_query: Query | None = None,
- extra_body: Body | None = None,
- timeout: float | httpx.Timeout | None | NotGiven = not_given,
- ) -> EmbeddingsResponse:
+ @cached_property
+ def with_streaming_response(self) -> AsyncInferenceResourceWithStreamingResponse:
"""
- Generate embeddings for content pieces using the specified model.
-
- Args:
- contents: List of contents to generate embeddings for. Each content can be a string or an
- InterleavedContentItem (and hence can be multimodal). The behavior depends on
- the model and provider. Some models may only support text.
-
- model_id: The identifier of the model to use. The model must be an embedding model
- registered with Llama Stack and available via the /models endpoint.
-
- output_dimension: (Optional) Output dimensionality for the embeddings. Only supported by
- Matryoshka models.
-
- task_type: (Optional) How is the embedding being used? This is only supported by asymmetric
- embedding models.
-
- text_truncation: (Optional) Config for how to truncate text for embedding when text is longer
- than the model's max sequence length.
-
- extra_headers: Send extra headers
-
- extra_query: Add additional query parameters to the request
-
- extra_body: Add additional JSON properties to the request
+ An alternative to `.with_raw_response` that doesn't eagerly read the response body.
- timeout: Override the client-level default timeout for this request, in seconds
+ For more information, see https://www.github.com/llamastack/llama-stack-client-python#with_streaming_response
"""
- return await self._post(
- "/v1/inference/embeddings",
- body=await async_maybe_transform(
- {
- "contents": contents,
- "model_id": model_id,
- "output_dimension": output_dimension,
- "task_type": task_type,
- "text_truncation": text_truncation,
- },
- inference_embeddings_params.InferenceEmbeddingsParams,
- ),
- options=make_request_options(
- extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
- ),
- cast_to=EmbeddingsResponse,
- )
+ return AsyncInferenceResourceWithStreamingResponse(self)
async def rerank(
self,
@@ -1450,7 +161,7 @@ async def rerank(
timeout: Override the client-level default timeout for this request, in seconds
"""
return await self._post(
- "/v1/inference/rerank",
+ "/v1alpha/inference/rerank",
body=await async_maybe_transform(
{
"items": items,
@@ -1475,27 +186,6 @@ class InferenceResourceWithRawResponse:
def __init__(self, inference: InferenceResource) -> None:
self._inference = inference
- self.batch_chat_completion = to_raw_response_wrapper(
- inference.batch_chat_completion,
- )
- self.batch_completion = to_raw_response_wrapper(
- inference.batch_completion,
- )
- self.chat_completion = ( # pyright: ignore[reportDeprecated]
- to_raw_response_wrapper(
- inference.chat_completion, # pyright: ignore[reportDeprecated],
- )
- )
- self.completion = ( # pyright: ignore[reportDeprecated]
- to_raw_response_wrapper(
- inference.completion, # pyright: ignore[reportDeprecated],
- )
- )
- self.embeddings = ( # pyright: ignore[reportDeprecated]
- to_raw_response_wrapper(
- inference.embeddings, # pyright: ignore[reportDeprecated],
- )
- )
self.rerank = to_raw_response_wrapper(
inference.rerank,
)
@@ -1505,27 +195,6 @@ class AsyncInferenceResourceWithRawResponse:
def __init__(self, inference: AsyncInferenceResource) -> None:
self._inference = inference
- self.batch_chat_completion = async_to_raw_response_wrapper(
- inference.batch_chat_completion,
- )
- self.batch_completion = async_to_raw_response_wrapper(
- inference.batch_completion,
- )
- self.chat_completion = ( # pyright: ignore[reportDeprecated]
- async_to_raw_response_wrapper(
- inference.chat_completion, # pyright: ignore[reportDeprecated],
- )
- )
- self.completion = ( # pyright: ignore[reportDeprecated]
- async_to_raw_response_wrapper(
- inference.completion, # pyright: ignore[reportDeprecated],
- )
- )
- self.embeddings = ( # pyright: ignore[reportDeprecated]
- async_to_raw_response_wrapper(
- inference.embeddings, # pyright: ignore[reportDeprecated],
- )
- )
self.rerank = async_to_raw_response_wrapper(
inference.rerank,
)
@@ -1535,27 +204,6 @@ class InferenceResourceWithStreamingResponse:
def __init__(self, inference: InferenceResource) -> None:
self._inference = inference
- self.batch_chat_completion = to_streamed_response_wrapper(
- inference.batch_chat_completion,
- )
- self.batch_completion = to_streamed_response_wrapper(
- inference.batch_completion,
- )
- self.chat_completion = ( # pyright: ignore[reportDeprecated]
- to_streamed_response_wrapper(
- inference.chat_completion, # pyright: ignore[reportDeprecated],
- )
- )
- self.completion = ( # pyright: ignore[reportDeprecated]
- to_streamed_response_wrapper(
- inference.completion, # pyright: ignore[reportDeprecated],
- )
- )
- self.embeddings = ( # pyright: ignore[reportDeprecated]
- to_streamed_response_wrapper(
- inference.embeddings, # pyright: ignore[reportDeprecated],
- )
- )
self.rerank = to_streamed_response_wrapper(
inference.rerank,
)
@@ -1565,27 +213,6 @@ class AsyncInferenceResourceWithStreamingResponse:
def __init__(self, inference: AsyncInferenceResource) -> None:
self._inference = inference
- self.batch_chat_completion = async_to_streamed_response_wrapper(
- inference.batch_chat_completion,
- )
- self.batch_completion = async_to_streamed_response_wrapper(
- inference.batch_completion,
- )
- self.chat_completion = ( # pyright: ignore[reportDeprecated]
- async_to_streamed_response_wrapper(
- inference.chat_completion, # pyright: ignore[reportDeprecated],
- )
- )
- self.completion = ( # pyright: ignore[reportDeprecated]
- async_to_streamed_response_wrapper(
- inference.completion, # pyright: ignore[reportDeprecated],
- )
- )
- self.embeddings = ( # pyright: ignore[reportDeprecated]
- async_to_streamed_response_wrapper(
- inference.embeddings, # pyright: ignore[reportDeprecated],
- )
- )
self.rerank = async_to_streamed_response_wrapper(
inference.rerank,
)
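With the deprecated completion, chat-completion, batch, and embeddings methods gone, the inference resource is reduced to `rerank`, now served from `/v1alpha/inference/rerank`. A minimal usage sketch based on the signature shown above; the model id is a placeholder:

```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient()

# Rank candidate passages against a query; the SDK now issues POST /v1alpha/inference/rerank.
ranking = client.inference.rerank(
    model="my-reranker",  # hypothetical reranking model id registered on your stack
    query="What is the capital of France?",
    items=[
        "Paris is the capital of France.",
        "Berlin is the capital of Germany.",
    ],
    max_num_results=1,  # optional; omit to return all results
)
print(ranking)
```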
diff --git a/src/llama_stack_client/resources/models/openai.py b/src/llama_stack_client/resources/models/openai.py
index e4b2fbd8..ab4b4038 100644
--- a/src/llama_stack_client/resources/models/openai.py
+++ b/src/llama_stack_client/resources/models/openai.py
@@ -17,7 +17,7 @@
)
from ..._wrappers import DataWrapper
from ..._base_client import make_request_options
-from ...types.models.openai_list_response import OpenAIListResponse
+from ...types.model_list_response import ModelListResponse
__all__ = ["OpenAIResource", "AsyncOpenAIResource"]
@@ -51,18 +51,18 @@ def list(
extra_query: Query | None = None,
extra_body: Body | None = None,
timeout: float | httpx.Timeout | None | NotGiven = not_given,
- ) -> OpenAIListResponse:
- """List models using the OpenAI API."""
+ ) -> ModelListResponse:
+ """List all models."""
return self._get(
- "/v1/openai/v1/models",
+ "/v1/models",
options=make_request_options(
extra_headers=extra_headers,
extra_query=extra_query,
extra_body=extra_body,
timeout=timeout,
- post_parser=DataWrapper[OpenAIListResponse]._unwrapper,
+ post_parser=DataWrapper[ModelListResponse]._unwrapper,
),
- cast_to=cast(Type[OpenAIListResponse], DataWrapper[OpenAIListResponse]),
+ cast_to=cast(Type[ModelListResponse], DataWrapper[ModelListResponse]),
)
@@ -95,18 +95,18 @@ async def list(
extra_query: Query | None = None,
extra_body: Body | None = None,
timeout: float | httpx.Timeout | None | NotGiven = not_given,
- ) -> OpenAIListResponse:
- """List models using the OpenAI API."""
+ ) -> ModelListResponse:
+ """List all models."""
return await self._get(
- "/v1/openai/v1/models",
+ "/v1/models",
options=make_request_options(
extra_headers=extra_headers,
extra_query=extra_query,
extra_body=extra_body,
timeout=timeout,
- post_parser=DataWrapper[OpenAIListResponse]._unwrapper,
+ post_parser=DataWrapper[ModelListResponse]._unwrapper,
),
- cast_to=cast(Type[OpenAIListResponse], DataWrapper[OpenAIListResponse]),
+ cast_to=cast(Type[ModelListResponse], DataWrapper[ModelListResponse]),
)
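The OpenAI-compat models listing now shares the plain `/v1/models` route and the `ModelListResponse` type. A small sketch; the attribute path `client.models.openai` is inferred from the package layout rather than shown in this diff:

```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient()

# Resolves to GET /v1/models and is unwrapped into a ModelListResponse.
models = client.models.openai.list()
print(models)
```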
diff --git a/src/llama_stack_client/resources/moderations.py b/src/llama_stack_client/resources/moderations.py
index a016b5b0..a73dc85a 100644
--- a/src/llama_stack_client/resources/moderations.py
+++ b/src/llama_stack_client/resources/moderations.py
@@ -73,7 +73,7 @@ def create(
timeout: Override the client-level default timeout for this request, in seconds
"""
return self._post(
- "/v1/openai/v1/moderations",
+ "/v1/moderations",
body=maybe_transform(
{
"input": input,
@@ -138,7 +138,7 @@ async def create(
timeout: Override the client-level default timeout for this request, in seconds
"""
return await self._post(
- "/v1/openai/v1/moderations",
+ "/v1/moderations",
body=await async_maybe_transform(
{
"input": input,
diff --git a/src/llama_stack_client/resources/responses/input_items.py b/src/llama_stack_client/resources/responses/input_items.py
index da06debd..a5836ba7 100644
--- a/src/llama_stack_client/resources/responses/input_items.py
+++ b/src/llama_stack_client/resources/responses/input_items.py
@@ -85,7 +85,7 @@ def list(
if not response_id:
raise ValueError(f"Expected a non-empty value for `response_id` but received {response_id!r}")
return self._get(
- f"/v1/openai/v1/responses/{response_id}/input_items",
+ f"/v1/responses/{response_id}/input_items",
options=make_request_options(
extra_headers=extra_headers,
extra_query=extra_query,
@@ -168,7 +168,7 @@ async def list(
if not response_id:
raise ValueError(f"Expected a non-empty value for `response_id` but received {response_id!r}")
return await self._get(
- f"/v1/openai/v1/responses/{response_id}/input_items",
+ f"/v1/responses/{response_id}/input_items",
options=make_request_options(
extra_headers=extra_headers,
extra_query=extra_query,
diff --git a/src/llama_stack_client/resources/responses/responses.py b/src/llama_stack_client/resources/responses/responses.py
index 7f21f3ea..16e38fd0 100644
--- a/src/llama_stack_client/resources/responses/responses.py
+++ b/src/llama_stack_client/resources/responses/responses.py
@@ -228,7 +228,7 @@ def create(
timeout: float | httpx.Timeout | None | NotGiven = not_given,
) -> ResponseObject | Stream[ResponseObjectStream]:
return self._post(
- "/v1/openai/v1/responses",
+ "/v1/responses",
body=maybe_transform(
{
"input": input,
@@ -281,7 +281,7 @@ def retrieve(
if not response_id:
raise ValueError(f"Expected a non-empty value for `response_id` but received {response_id!r}")
return self._get(
- f"/v1/openai/v1/responses/{response_id}",
+ f"/v1/responses/{response_id}",
options=make_request_options(
extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
),
@@ -323,7 +323,7 @@ def list(
timeout: Override the client-level default timeout for this request, in seconds
"""
return self._get_api_list(
- "/v1/openai/v1/responses",
+ "/v1/responses",
page=SyncOpenAICursorPage[ResponseListResponse],
options=make_request_options(
extra_headers=extra_headers,
@@ -369,7 +369,7 @@ def delete(
if not response_id:
raise ValueError(f"Expected a non-empty value for `response_id` but received {response_id!r}")
return self._delete(
- f"/v1/openai/v1/responses/{response_id}",
+ f"/v1/responses/{response_id}",
options=make_request_options(
extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
),
@@ -568,7 +568,7 @@ async def create(
timeout: float | httpx.Timeout | None | NotGiven = not_given,
) -> ResponseObject | AsyncStream[ResponseObjectStream]:
return await self._post(
- "/v1/openai/v1/responses",
+ "/v1/responses",
body=await async_maybe_transform(
{
"input": input,
@@ -621,7 +621,7 @@ async def retrieve(
if not response_id:
raise ValueError(f"Expected a non-empty value for `response_id` but received {response_id!r}")
return await self._get(
- f"/v1/openai/v1/responses/{response_id}",
+ f"/v1/responses/{response_id}",
options=make_request_options(
extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
),
@@ -663,7 +663,7 @@ def list(
timeout: Override the client-level default timeout for this request, in seconds
"""
return self._get_api_list(
- "/v1/openai/v1/responses",
+ "/v1/responses",
page=AsyncOpenAICursorPage[ResponseListResponse],
options=make_request_options(
extra_headers=extra_headers,
@@ -709,7 +709,7 @@ async def delete(
if not response_id:
raise ValueError(f"Expected a non-empty value for `response_id` but received {response_id!r}")
return await self._delete(
- f"/v1/openai/v1/responses/{response_id}",
+ f"/v1/responses/{response_id}",
options=make_request_options(
extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
),
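Responses follow the same route migration with unchanged method signatures. A short round trip as a sketch; `model` and `input` on `create` are assumed from the OpenAI-style surface (only `input` appears in the hunks above), and the placeholder model id is illustrative:

```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient()

# POST /v1/responses
response = client.responses.create(
    model="my-model",  # hypothetical model id
    input="Write a one-line haiku about the sea.",
)

# GET /v1/responses/{response_id}
fetched = client.responses.retrieve(response.id)
print(fetched)

# DELETE /v1/responses/{response_id}
client.responses.delete(response.id)
```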
diff --git a/src/llama_stack_client/resources/vector_stores/files.py b/src/llama_stack_client/resources/vector_stores/files.py
index 39f16a66..f9a1ef31 100644
--- a/src/llama_stack_client/resources/vector_stores/files.py
+++ b/src/llama_stack_client/resources/vector_stores/files.py
@@ -82,7 +82,7 @@ def create(
if not vector_store_id:
raise ValueError(f"Expected a non-empty value for `vector_store_id` but received {vector_store_id!r}")
return self._post(
- f"/v1/openai/v1/vector_stores/{vector_store_id}/files",
+ f"/v1/vector_stores/{vector_store_id}/files",
body=maybe_transform(
{
"file_id": file_id,
@@ -126,7 +126,7 @@ def retrieve(
if not file_id:
raise ValueError(f"Expected a non-empty value for `file_id` but received {file_id!r}")
return self._get(
- f"/v1/openai/v1/vector_stores/{vector_store_id}/files/{file_id}",
+ f"/v1/vector_stores/{vector_store_id}/files/{file_id}",
options=make_request_options(
extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
),
@@ -165,7 +165,7 @@ def update(
if not file_id:
raise ValueError(f"Expected a non-empty value for `file_id` but received {file_id!r}")
return self._post(
- f"/v1/openai/v1/vector_stores/{vector_store_id}/files/{file_id}",
+ f"/v1/vector_stores/{vector_store_id}/files/{file_id}",
body=maybe_transform({"attributes": attributes}, file_update_params.FileUpdateParams),
options=make_request_options(
extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
@@ -218,7 +218,7 @@ def list(
if not vector_store_id:
raise ValueError(f"Expected a non-empty value for `vector_store_id` but received {vector_store_id!r}")
return self._get_api_list(
- f"/v1/openai/v1/vector_stores/{vector_store_id}/files",
+ f"/v1/vector_stores/{vector_store_id}/files",
page=SyncOpenAICursorPage[VectorStoreFile],
options=make_request_options(
extra_headers=extra_headers,
@@ -268,7 +268,7 @@ def delete(
if not file_id:
raise ValueError(f"Expected a non-empty value for `file_id` but received {file_id!r}")
return self._delete(
- f"/v1/openai/v1/vector_stores/{vector_store_id}/files/{file_id}",
+ f"/v1/vector_stores/{vector_store_id}/files/{file_id}",
options=make_request_options(
extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
),
@@ -304,7 +304,7 @@ def content(
if not file_id:
raise ValueError(f"Expected a non-empty value for `file_id` but received {file_id!r}")
return self._get(
- f"/v1/openai/v1/vector_stores/{vector_store_id}/files/{file_id}/content",
+ f"/v1/vector_stores/{vector_store_id}/files/{file_id}/content",
options=make_request_options(
extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
),
@@ -367,7 +367,7 @@ async def create(
if not vector_store_id:
raise ValueError(f"Expected a non-empty value for `vector_store_id` but received {vector_store_id!r}")
return await self._post(
- f"/v1/openai/v1/vector_stores/{vector_store_id}/files",
+ f"/v1/vector_stores/{vector_store_id}/files",
body=await async_maybe_transform(
{
"file_id": file_id,
@@ -411,7 +411,7 @@ async def retrieve(
if not file_id:
raise ValueError(f"Expected a non-empty value for `file_id` but received {file_id!r}")
return await self._get(
- f"/v1/openai/v1/vector_stores/{vector_store_id}/files/{file_id}",
+ f"/v1/vector_stores/{vector_store_id}/files/{file_id}",
options=make_request_options(
extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
),
@@ -450,7 +450,7 @@ async def update(
if not file_id:
raise ValueError(f"Expected a non-empty value for `file_id` but received {file_id!r}")
return await self._post(
- f"/v1/openai/v1/vector_stores/{vector_store_id}/files/{file_id}",
+ f"/v1/vector_stores/{vector_store_id}/files/{file_id}",
body=await async_maybe_transform({"attributes": attributes}, file_update_params.FileUpdateParams),
options=make_request_options(
extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
@@ -503,7 +503,7 @@ def list(
if not vector_store_id:
raise ValueError(f"Expected a non-empty value for `vector_store_id` but received {vector_store_id!r}")
return self._get_api_list(
- f"/v1/openai/v1/vector_stores/{vector_store_id}/files",
+ f"/v1/vector_stores/{vector_store_id}/files",
page=AsyncOpenAICursorPage[VectorStoreFile],
options=make_request_options(
extra_headers=extra_headers,
@@ -553,7 +553,7 @@ async def delete(
if not file_id:
raise ValueError(f"Expected a non-empty value for `file_id` but received {file_id!r}")
return await self._delete(
- f"/v1/openai/v1/vector_stores/{vector_store_id}/files/{file_id}",
+ f"/v1/vector_stores/{vector_store_id}/files/{file_id}",
options=make_request_options(
extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
),
@@ -589,7 +589,7 @@ async def content(
if not file_id:
raise ValueError(f"Expected a non-empty value for `file_id` but received {file_id!r}")
return await self._get(
- f"/v1/openai/v1/vector_stores/{vector_store_id}/files/{file_id}/content",
+ f"/v1/vector_stores/{vector_store_id}/files/{file_id}/content",
options=make_request_options(
extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
),
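Vector-store file operations migrate the same way. A sketch with placeholder ids, assuming the nested resource is exposed as `client.vector_stores.files`:

```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient()

vector_store_id = "vs_123"  # placeholder
file_id = "file_456"  # placeholder for an already-uploaded file

# POST /v1/vector_stores/{vector_store_id}/files
vs_file = client.vector_stores.files.create(
    vector_store_id=vector_store_id,
    file_id=file_id,
)
print(vs_file)

# GET /v1/vector_stores/{vector_store_id}/files (cursor-paginated)
for f in client.vector_stores.files.list(vector_store_id=vector_store_id):
    print(f.id)
```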
diff --git a/src/llama_stack_client/resources/vector_stores/vector_stores.py b/src/llama_stack_client/resources/vector_stores/vector_stores.py
index f3ab01f2..f858100b 100644
--- a/src/llama_stack_client/resources/vector_stores/vector_stores.py
+++ b/src/llama_stack_client/resources/vector_stores/vector_stores.py
@@ -112,7 +112,7 @@ def create(
timeout: Override the client-level default timeout for this request, in seconds
"""
return self._post(
- "/v1/openai/v1/vector_stores",
+ "/v1/vector_stores",
body=maybe_transform(
{
"chunking_strategy": chunking_strategy,
@@ -158,7 +158,7 @@ def retrieve(
if not vector_store_id:
raise ValueError(f"Expected a non-empty value for `vector_store_id` but received {vector_store_id!r}")
return self._get(
- f"/v1/openai/v1/vector_stores/{vector_store_id}",
+ f"/v1/vector_stores/{vector_store_id}",
options=make_request_options(
extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
),
@@ -200,7 +200,7 @@ def update(
if not vector_store_id:
raise ValueError(f"Expected a non-empty value for `vector_store_id` but received {vector_store_id!r}")
return self._post(
- f"/v1/openai/v1/vector_stores/{vector_store_id}",
+ f"/v1/vector_stores/{vector_store_id}",
body=maybe_transform(
{
"expires_after": expires_after,
@@ -255,7 +255,7 @@ def list(
timeout: Override the client-level default timeout for this request, in seconds
"""
return self._get_api_list(
- "/v1/openai/v1/vector_stores",
+ "/v1/vector_stores",
page=SyncOpenAICursorPage[VectorStore],
options=make_request_options(
extra_headers=extra_headers,
@@ -301,7 +301,7 @@ def delete(
if not vector_store_id:
raise ValueError(f"Expected a non-empty value for `vector_store_id` but received {vector_store_id!r}")
return self._delete(
- f"/v1/openai/v1/vector_stores/{vector_store_id}",
+ f"/v1/vector_stores/{vector_store_id}",
options=make_request_options(
extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
),
@@ -354,7 +354,7 @@ def search(
if not vector_store_id:
raise ValueError(f"Expected a non-empty value for `vector_store_id` but received {vector_store_id!r}")
return self._post(
- f"/v1/openai/v1/vector_stores/{vector_store_id}/search",
+ f"/v1/vector_stores/{vector_store_id}/search",
body=maybe_transform(
{
"query": query,
@@ -446,7 +446,7 @@ async def create(
timeout: Override the client-level default timeout for this request, in seconds
"""
return await self._post(
- "/v1/openai/v1/vector_stores",
+ "/v1/vector_stores",
body=await async_maybe_transform(
{
"chunking_strategy": chunking_strategy,
@@ -492,7 +492,7 @@ async def retrieve(
if not vector_store_id:
raise ValueError(f"Expected a non-empty value for `vector_store_id` but received {vector_store_id!r}")
return await self._get(
- f"/v1/openai/v1/vector_stores/{vector_store_id}",
+ f"/v1/vector_stores/{vector_store_id}",
options=make_request_options(
extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
),
@@ -534,7 +534,7 @@ async def update(
if not vector_store_id:
raise ValueError(f"Expected a non-empty value for `vector_store_id` but received {vector_store_id!r}")
return await self._post(
- f"/v1/openai/v1/vector_stores/{vector_store_id}",
+ f"/v1/vector_stores/{vector_store_id}",
body=await async_maybe_transform(
{
"expires_after": expires_after,
@@ -589,7 +589,7 @@ def list(
timeout: Override the client-level default timeout for this request, in seconds
"""
return self._get_api_list(
- "/v1/openai/v1/vector_stores",
+ "/v1/vector_stores",
page=AsyncOpenAICursorPage[VectorStore],
options=make_request_options(
extra_headers=extra_headers,
@@ -635,7 +635,7 @@ async def delete(
if not vector_store_id:
raise ValueError(f"Expected a non-empty value for `vector_store_id` but received {vector_store_id!r}")
return await self._delete(
- f"/v1/openai/v1/vector_stores/{vector_store_id}",
+ f"/v1/vector_stores/{vector_store_id}",
options=make_request_options(
extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
),
@@ -688,7 +688,7 @@ async def search(
if not vector_store_id:
raise ValueError(f"Expected a non-empty value for `vector_store_id` but received {vector_store_id!r}")
return await self._post(
- f"/v1/openai/v1/vector_stores/{vector_store_id}/search",
+ f"/v1/vector_stores/{vector_store_id}/search",
body=await async_maybe_transform(
{
"query": query,
diff --git a/src/llama_stack_client/types/__init__.py b/src/llama_stack_client/types/__init__.py
index 56b7f887..f81ada61 100644
--- a/src/llama_stack_client/types/__init__.py
+++ b/src/llama_stack_client/types/__init__.py
@@ -17,18 +17,15 @@
QueryConfig as QueryConfig,
QueryResult as QueryResult,
UserMessage as UserMessage,
- ContentDelta as ContentDelta,
ScoringResult as ScoringResult,
SystemMessage as SystemMessage,
ResponseFormat as ResponseFormat,
SamplingParams as SamplingParams,
- BatchCompletion as BatchCompletion,
SafetyViolation as SafetyViolation,
CompletionMessage as CompletionMessage,
InterleavedContent as InterleavedContent,
ToolParamDefinition as ToolParamDefinition,
ToolResponseMessage as ToolResponseMessage,
- QueryGeneratorConfig as QueryGeneratorConfig,
ChatCompletionResponse as ChatCompletionResponse,
InterleavedContentItem as InterleavedContentItem,
)
@@ -48,7 +45,6 @@
from .tool_def_param import ToolDefParam as ToolDefParam
from .create_response import CreateResponse as CreateResponse
from .response_object import ResponseObject as ResponseObject
-from .token_log_probs import TokenLogProbs as TokenLogProbs
from .file_list_params import FileListParams as FileListParams
from .shield_call_step import ShieldCallStep as ShieldCallStep
from .span_with_status import SpanWithStatus as SpanWithStatus
@@ -61,8 +57,6 @@
from .tool_list_response import ToolListResponse as ToolListResponse
from .agent_create_params import AgentCreateParams as AgentCreateParams
from .agent_list_response import AgentListResponse as AgentListResponse
-from .completion_response import CompletionResponse as CompletionResponse
-from .embeddings_response import EmbeddingsResponse as EmbeddingsResponse
from .list_files_response import ListFilesResponse as ListFilesResponse
from .list_tools_response import ListToolsResponse as ListToolsResponse
from .model_list_response import ModelListResponse as ModelListResponse
@@ -71,7 +65,6 @@
from .tool_execution_step import ToolExecutionStep as ToolExecutionStep
from .tool_response_param import ToolResponseParam as ToolResponseParam
from .delete_file_response import DeleteFileResponse as DeleteFileResponse
-from .eval_candidate_param import EvalCandidateParam as EvalCandidateParam
from .eval_run_eval_params import EvalRunEvalParams as EvalRunEvalParams
from .list_models_response import ListModelsResponse as ListModelsResponse
from .list_routes_response import ListRoutesResponse as ListRoutesResponse
@@ -134,8 +127,6 @@
from .vector_store_create_params import VectorStoreCreateParams as VectorStoreCreateParams
from .vector_store_search_params import VectorStoreSearchParams as VectorStoreSearchParams
from .vector_store_update_params import VectorStoreUpdateParams as VectorStoreUpdateParams
-from .inference_completion_params import InferenceCompletionParams as InferenceCompletionParams
-from .inference_embeddings_params import InferenceEmbeddingsParams as InferenceEmbeddingsParams
from .list_vector_stores_response import ListVectorStoresResponse as ListVectorStoresResponse
from .telemetry_get_span_response import TelemetryGetSpanResponse as TelemetryGetSpanResponse
from .vector_db_register_response import VectorDBRegisterResponse as VectorDBRegisterResponse
@@ -154,26 +145,15 @@
from .list_scoring_functions_response import ListScoringFunctionsResponse as ListScoringFunctionsResponse
from .telemetry_query_traces_response import TelemetryQueryTracesResponse as TelemetryQueryTracesResponse
from .tool_runtime_invoke_tool_params import ToolRuntimeInvokeToolParams as ToolRuntimeInvokeToolParams
-from .inference_chat_completion_params import InferenceChatCompletionParams as InferenceChatCompletionParams
from .list_post_training_jobs_response import ListPostTrainingJobsResponse as ListPostTrainingJobsResponse
from .scoring_function_register_params import ScoringFunctionRegisterParams as ScoringFunctionRegisterParams
from .telemetry_get_span_tree_response import TelemetryGetSpanTreeResponse as TelemetryGetSpanTreeResponse
from .telemetry_query_metrics_response import TelemetryQueryMetricsResponse as TelemetryQueryMetricsResponse
from .tool_runtime_list_tools_response import ToolRuntimeListToolsResponse as ToolRuntimeListToolsResponse
-from .inference_batch_completion_params import InferenceBatchCompletionParams as InferenceBatchCompletionParams
from .synthetic_data_generation_response import SyntheticDataGenerationResponse as SyntheticDataGenerationResponse
-from .chat_completion_response_stream_chunk import (
- ChatCompletionResponseStreamChunk as ChatCompletionResponseStreamChunk,
-)
-from .inference_batch_chat_completion_params import (
- InferenceBatchChatCompletionParams as InferenceBatchChatCompletionParams,
-)
from .telemetry_save_spans_to_dataset_params import (
TelemetrySaveSpansToDatasetParams as TelemetrySaveSpansToDatasetParams,
)
-from .inference_batch_chat_completion_response import (
- InferenceBatchChatCompletionResponse as InferenceBatchChatCompletionResponse,
-)
from .post_training_preference_optimize_params import (
PostTrainingPreferenceOptimizeParams as PostTrainingPreferenceOptimizeParams,
)
diff --git a/src/llama_stack_client/types/agents/__init__.py b/src/llama_stack_client/types/agents/__init__.py
index f4f48353..3a144840 100644
--- a/src/llama_stack_client/types/agents/__init__.py
+++ b/src/llama_stack_client/types/agents/__init__.py
@@ -13,5 +13,4 @@
from .step_retrieve_response import StepRetrieveResponse as StepRetrieveResponse
from .session_create_response import SessionCreateResponse as SessionCreateResponse
from .session_retrieve_params import SessionRetrieveParams as SessionRetrieveParams
-from .turn_response_event_payload import TurnResponseEventPayload as TurnResponseEventPayload
from .agent_turn_response_stream_chunk import AgentTurnResponseStreamChunk as AgentTurnResponseStreamChunk
diff --git a/src/llama_stack_client/types/agents/turn_response_event.py b/src/llama_stack_client/types/agents/turn_response_event.py
index df213246..c52121ab 100644
--- a/src/llama_stack_client/types/agents/turn_response_event.py
+++ b/src/llama_stack_client/types/agents/turn_response_event.py
@@ -1,11 +1,160 @@
# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+from typing import Dict, List, Union, Optional
+from typing_extensions import Literal, Annotated, TypeAlias
+
+from .turn import Turn
+from ..._utils import PropertyInfo
from ..._models import BaseModel
-from .turn_response_event_payload import TurnResponseEventPayload
+from ..inference_step import InferenceStep
+from ..shared.tool_call import ToolCall
+from ..shield_call_step import ShieldCallStep
+from ..tool_execution_step import ToolExecutionStep
+from ..memory_retrieval_step import MemoryRetrievalStep
+
+__all__ = [
+ "TurnResponseEvent",
+ "Payload",
+ "PayloadAgentTurnResponseStepStartPayload",
+ "PayloadAgentTurnResponseStepProgressPayload",
+ "PayloadAgentTurnResponseStepProgressPayloadDelta",
+ "PayloadAgentTurnResponseStepProgressPayloadDeltaTextDelta",
+ "PayloadAgentTurnResponseStepProgressPayloadDeltaImageDelta",
+ "PayloadAgentTurnResponseStepProgressPayloadDeltaToolCallDelta",
+ "PayloadAgentTurnResponseStepProgressPayloadDeltaToolCallDeltaToolCall",
+ "PayloadAgentTurnResponseStepCompletePayload",
+ "PayloadAgentTurnResponseStepCompletePayloadStepDetails",
+ "PayloadAgentTurnResponseTurnStartPayload",
+ "PayloadAgentTurnResponseTurnCompletePayload",
+ "PayloadAgentTurnResponseTurnAwaitingInputPayload",
+]
+
+
+class PayloadAgentTurnResponseStepStartPayload(BaseModel):
+ event_type: Literal["step_start"]
+ """Type of event being reported"""
+
+ step_id: str
+ """Unique identifier for the step within a turn"""
+
+ step_type: Literal["inference", "tool_execution", "shield_call", "memory_retrieval"]
+ """Type of step being executed"""
+
+ metadata: Optional[Dict[str, Union[bool, float, str, List[object], object, None]]] = None
+ """(Optional) Additional metadata for the step"""
+
+
+class PayloadAgentTurnResponseStepProgressPayloadDeltaTextDelta(BaseModel):
+ text: str
+ """The incremental text content"""
+
+ type: Literal["text"]
+ """Discriminator type of the delta. Always "text" """
+
+
+class PayloadAgentTurnResponseStepProgressPayloadDeltaImageDelta(BaseModel):
+ image: str
+ """The incremental image data as bytes"""
+
+ type: Literal["image"]
+ """Discriminator type of the delta. Always "image" """
+
+
+PayloadAgentTurnResponseStepProgressPayloadDeltaToolCallDeltaToolCall: TypeAlias = Union[str, ToolCall]
+
+
+class PayloadAgentTurnResponseStepProgressPayloadDeltaToolCallDelta(BaseModel):
+ parse_status: Literal["started", "in_progress", "failed", "succeeded"]
+ """Current parsing status of the tool call"""
+
+ tool_call: PayloadAgentTurnResponseStepProgressPayloadDeltaToolCallDeltaToolCall
+ """Either an in-progress tool call string or the final parsed tool call"""
+
+ type: Literal["tool_call"]
+ """Discriminator type of the delta. Always "tool_call" """
+
+
+PayloadAgentTurnResponseStepProgressPayloadDelta: TypeAlias = Annotated[
+ Union[
+ PayloadAgentTurnResponseStepProgressPayloadDeltaTextDelta,
+ PayloadAgentTurnResponseStepProgressPayloadDeltaImageDelta,
+ PayloadAgentTurnResponseStepProgressPayloadDeltaToolCallDelta,
+ ],
+ PropertyInfo(discriminator="type"),
+]
+
+
+class PayloadAgentTurnResponseStepProgressPayload(BaseModel):
+ delta: PayloadAgentTurnResponseStepProgressPayloadDelta
+ """Incremental content changes during step execution"""
+
+ event_type: Literal["step_progress"]
+ """Type of event being reported"""
+
+ step_id: str
+ """Unique identifier for the step within a turn"""
+
+ step_type: Literal["inference", "tool_execution", "shield_call", "memory_retrieval"]
+ """Type of step being executed"""
+
+
+PayloadAgentTurnResponseStepCompletePayloadStepDetails: TypeAlias = Annotated[
+ Union[InferenceStep, ToolExecutionStep, ShieldCallStep, MemoryRetrievalStep],
+ PropertyInfo(discriminator="step_type"),
+]
+
+
+class PayloadAgentTurnResponseStepCompletePayload(BaseModel):
+ event_type: Literal["step_complete"]
+ """Type of event being reported"""
+
+ step_details: PayloadAgentTurnResponseStepCompletePayloadStepDetails
+ """Complete details of the executed step"""
+
+ step_id: str
+ """Unique identifier for the step within a turn"""
+
+ step_type: Literal["inference", "tool_execution", "shield_call", "memory_retrieval"]
+ """Type of step being executed"""
+
+
+class PayloadAgentTurnResponseTurnStartPayload(BaseModel):
+ event_type: Literal["turn_start"]
+ """Type of event being reported"""
+
+ turn_id: str
+ """Unique identifier for the turn within a session"""
+
+
+class PayloadAgentTurnResponseTurnCompletePayload(BaseModel):
+ event_type: Literal["turn_complete"]
+ """Type of event being reported"""
+
+ turn: Turn
+ """Complete turn data including all steps and results"""
+
+
+class PayloadAgentTurnResponseTurnAwaitingInputPayload(BaseModel):
+ event_type: Literal["turn_awaiting_input"]
+ """Type of event being reported"""
+
+ turn: Turn
+ """Turn data when waiting for external tool responses"""
+
-__all__ = ["TurnResponseEvent"]
+Payload: TypeAlias = Annotated[
+ Union[
+ PayloadAgentTurnResponseStepStartPayload,
+ PayloadAgentTurnResponseStepProgressPayload,
+ PayloadAgentTurnResponseStepCompletePayload,
+ PayloadAgentTurnResponseTurnStartPayload,
+ PayloadAgentTurnResponseTurnCompletePayload,
+ PayloadAgentTurnResponseTurnAwaitingInputPayload,
+ ],
+ PropertyInfo(discriminator="event_type"),
+]
class TurnResponseEvent(BaseModel):
- payload: TurnResponseEventPayload
+ payload: Payload
"""Event-specific payload containing event data"""
diff --git a/src/llama_stack_client/types/agents/turn_response_event_payload.py b/src/llama_stack_client/types/agents/turn_response_event_payload.py
deleted file mode 100644
index 1844c61e..00000000
--- a/src/llama_stack_client/types/agents/turn_response_event_payload.py
+++ /dev/null
@@ -1,109 +0,0 @@
-# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
-
-from typing import Dict, List, Union, Optional
-from typing_extensions import Literal, Annotated, TypeAlias
-
-from .turn import Turn
-from ..._utils import PropertyInfo
-from ..._models import BaseModel
-from ..inference_step import InferenceStep
-from ..shield_call_step import ShieldCallStep
-from ..tool_execution_step import ToolExecutionStep
-from ..shared.content_delta import ContentDelta
-from ..memory_retrieval_step import MemoryRetrievalStep
-
-__all__ = [
- "TurnResponseEventPayload",
- "AgentTurnResponseStepStartPayload",
- "AgentTurnResponseStepProgressPayload",
- "AgentTurnResponseStepCompletePayload",
- "AgentTurnResponseStepCompletePayloadStepDetails",
- "AgentTurnResponseTurnStartPayload",
- "AgentTurnResponseTurnCompletePayload",
- "AgentTurnResponseTurnAwaitingInputPayload",
-]
-
-
-class AgentTurnResponseStepStartPayload(BaseModel):
- event_type: Literal["step_start"]
- """Type of event being reported"""
-
- step_id: str
- """Unique identifier for the step within a turn"""
-
- step_type: Literal["inference", "tool_execution", "shield_call", "memory_retrieval"]
- """Type of step being executed"""
-
- metadata: Optional[Dict[str, Union[bool, float, str, List[object], object, None]]] = None
- """(Optional) Additional metadata for the step"""
-
-
-class AgentTurnResponseStepProgressPayload(BaseModel):
- delta: ContentDelta
- """Incremental content changes during step execution"""
-
- event_type: Literal["step_progress"]
- """Type of event being reported"""
-
- step_id: str
- """Unique identifier for the step within a turn"""
-
- step_type: Literal["inference", "tool_execution", "shield_call", "memory_retrieval"]
- """Type of step being executed"""
-
-
-AgentTurnResponseStepCompletePayloadStepDetails: TypeAlias = Annotated[
- Union[InferenceStep, ToolExecutionStep, ShieldCallStep, MemoryRetrievalStep],
- PropertyInfo(discriminator="step_type"),
-]
-
-
-class AgentTurnResponseStepCompletePayload(BaseModel):
- event_type: Literal["step_complete"]
- """Type of event being reported"""
-
- step_details: AgentTurnResponseStepCompletePayloadStepDetails
- """Complete details of the executed step"""
-
- step_id: str
- """Unique identifier for the step within a turn"""
-
- step_type: Literal["inference", "tool_execution", "shield_call", "memory_retrieval"]
- """Type of step being executed"""
-
-
-class AgentTurnResponseTurnStartPayload(BaseModel):
- event_type: Literal["turn_start"]
- """Type of event being reported"""
-
- turn_id: str
- """Unique identifier for the turn within a session"""
-
-
-class AgentTurnResponseTurnCompletePayload(BaseModel):
- event_type: Literal["turn_complete"]
- """Type of event being reported"""
-
- turn: Turn
- """Complete turn data including all steps and results"""
-
-
-class AgentTurnResponseTurnAwaitingInputPayload(BaseModel):
- event_type: Literal["turn_awaiting_input"]
- """Type of event being reported"""
-
- turn: Turn
- """Turn data when waiting for external tool responses"""
-
-
-TurnResponseEventPayload: TypeAlias = Annotated[
- Union[
- AgentTurnResponseStepStartPayload,
- AgentTurnResponseStepProgressPayload,
- AgentTurnResponseStepCompletePayload,
- AgentTurnResponseTurnStartPayload,
- AgentTurnResponseTurnCompletePayload,
- AgentTurnResponseTurnAwaitingInputPayload,
- ],
- PropertyInfo(discriminator="event_type"),
-]
diff --git a/src/llama_stack_client/types/benchmark_config_param.py b/src/llama_stack_client/types/benchmark_config_param.py
index 740bf99b..dc968521 100644
--- a/src/llama_stack_client/types/benchmark_config_param.py
+++ b/src/llama_stack_client/types/benchmark_config_param.py
@@ -2,17 +2,42 @@
from __future__ import annotations
-from typing import Dict
-from typing_extensions import Required, TypedDict
+from typing import Dict, Union
+from typing_extensions import Literal, Required, TypeAlias, TypedDict
-from .eval_candidate_param import EvalCandidateParam
from .scoring_fn_params_param import ScoringFnParamsParam
+from .shared_params.agent_config import AgentConfig
+from .shared_params.system_message import SystemMessage
+from .shared_params.sampling_params import SamplingParams
-__all__ = ["BenchmarkConfigParam"]
+__all__ = ["BenchmarkConfigParam", "EvalCandidate", "EvalCandidateModelCandidate", "EvalCandidateAgentCandidate"]
+
+
+class EvalCandidateModelCandidate(TypedDict, total=False):
+ model: Required[str]
+ """The model ID to evaluate."""
+
+ sampling_params: Required[SamplingParams]
+ """The sampling parameters for the model."""
+
+ type: Required[Literal["model"]]
+
+ system_message: SystemMessage
+ """(Optional) The system message providing instructions or context to the model."""
+
+
+class EvalCandidateAgentCandidate(TypedDict, total=False):
+ config: Required[AgentConfig]
+ """The configuration for the agent candidate."""
+
+ type: Required[Literal["agent"]]
+
+
+EvalCandidate: TypeAlias = Union[EvalCandidateModelCandidate, EvalCandidateAgentCandidate]
class BenchmarkConfigParam(TypedDict, total=False):
- eval_candidate: Required[EvalCandidateParam]
+ eval_candidate: Required[EvalCandidate]
"""The candidate to evaluate."""
scoring_params: Required[Dict[str, ScoringFnParamsParam]]
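
With `EvalCandidateParam` inlined as the `EvalCandidate` union above, a benchmark config is still passed as a plain dict. A hedged sketch; the model ID is a placeholder, the sampling strategy mirrors the test payloads later in this diff, and `scoring_params` is left empty for brevity:

```python
# Illustrative only: "my-model-id" is a placeholder model identifier.
benchmark_config = {
    "eval_candidate": {
        "type": "model",
        "model": "my-model-id",
        "sampling_params": {"strategy": {"type": "greedy"}},
    },
    "scoring_params": {},
}
```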
diff --git a/src/llama_stack_client/types/chat_completion_response_stream_chunk.py b/src/llama_stack_client/types/chat_completion_response_stream_chunk.py
deleted file mode 100644
index 1a55f3d1..00000000
--- a/src/llama_stack_client/types/chat_completion_response_stream_chunk.py
+++ /dev/null
@@ -1,36 +0,0 @@
-# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
-
-from typing import List, Optional
-from typing_extensions import Literal
-
-from .._models import BaseModel
-from .shared.metric import Metric
-from .token_log_probs import TokenLogProbs
-from .shared.content_delta import ContentDelta
-
-__all__ = ["ChatCompletionResponseStreamChunk", "Event"]
-
-
-class Event(BaseModel):
- delta: ContentDelta
- """Content generated since last event.
-
- This can be one or more tokens, or a tool call.
- """
-
- event_type: Literal["start", "complete", "progress"]
- """Type of the event"""
-
- logprobs: Optional[List[TokenLogProbs]] = None
- """Optional log probabilities for generated tokens"""
-
- stop_reason: Optional[Literal["end_of_turn", "end_of_message", "out_of_tokens"]] = None
- """Optional reason why generation stopped, if complete"""
-
-
-class ChatCompletionResponseStreamChunk(BaseModel):
- event: Event
- """The event containing the new content"""
-
- metrics: Optional[List[Metric]] = None
- """(Optional) List of metrics associated with the API response"""
diff --git a/src/llama_stack_client/types/completion_response.py b/src/llama_stack_client/types/completion_response.py
deleted file mode 100644
index 9718be8a..00000000
--- a/src/llama_stack_client/types/completion_response.py
+++ /dev/null
@@ -1,24 +0,0 @@
-# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
-
-from typing import List, Optional
-from typing_extensions import Literal
-
-from .._models import BaseModel
-from .shared.metric import Metric
-from .token_log_probs import TokenLogProbs
-
-__all__ = ["CompletionResponse"]
-
-
-class CompletionResponse(BaseModel):
- content: str
- """The generated completion text"""
-
- stop_reason: Literal["end_of_turn", "end_of_message", "out_of_tokens"]
- """Reason why generation stopped"""
-
- logprobs: Optional[List[TokenLogProbs]] = None
- """Optional log probabilities for generated tokens"""
-
- metrics: Optional[List[Metric]] = None
- """(Optional) List of metrics associated with the API response"""
diff --git a/src/llama_stack_client/types/embeddings_response.py b/src/llama_stack_client/types/embeddings_response.py
deleted file mode 100644
index f36c6b97..00000000
--- a/src/llama_stack_client/types/embeddings_response.py
+++ /dev/null
@@ -1,16 +0,0 @@
-# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
-
-from typing import List
-
-from .._models import BaseModel
-
-__all__ = ["EmbeddingsResponse"]
-
-
-class EmbeddingsResponse(BaseModel):
- embeddings: List[List[float]]
- """List of embedding vectors, one per input content.
-
- Each embedding is a list of floats. The dimensionality of the embedding is
- model-specific; you can check model metadata using /models/{model_id}
- """
diff --git a/src/llama_stack_client/types/eval_candidate_param.py b/src/llama_stack_client/types/eval_candidate_param.py
deleted file mode 100644
index be1b21c8..00000000
--- a/src/llama_stack_client/types/eval_candidate_param.py
+++ /dev/null
@@ -1,35 +0,0 @@
-# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
-
-from __future__ import annotations
-
-from typing import Union
-from typing_extensions import Literal, Required, TypeAlias, TypedDict
-
-from .shared_params.agent_config import AgentConfig
-from .shared_params.system_message import SystemMessage
-from .shared_params.sampling_params import SamplingParams
-
-__all__ = ["EvalCandidateParam", "ModelCandidate", "AgentCandidate"]
-
-
-class ModelCandidate(TypedDict, total=False):
- model: Required[str]
- """The model ID to evaluate."""
-
- sampling_params: Required[SamplingParams]
- """The sampling parameters for the model."""
-
- type: Required[Literal["model"]]
-
- system_message: SystemMessage
- """(Optional) The system message providing instructions or context to the model."""
-
-
-class AgentCandidate(TypedDict, total=False):
- config: Required[AgentConfig]
- """The configuration for the agent candidate."""
-
- type: Required[Literal["agent"]]
-
-
-EvalCandidateParam: TypeAlias = Union[ModelCandidate, AgentCandidate]
diff --git a/src/llama_stack_client/types/file_create_params.py b/src/llama_stack_client/types/file_create_params.py
index 8322c0a9..2be39a7a 100644
--- a/src/llama_stack_client/types/file_create_params.py
+++ b/src/llama_stack_client/types/file_create_params.py
@@ -6,7 +6,7 @@
from .._types import FileTypes
-__all__ = ["FileCreateParams"]
+__all__ = ["FileCreateParams", "ExpiresAfter"]
class FileCreateParams(TypedDict, total=False):
@@ -14,3 +14,16 @@ class FileCreateParams(TypedDict, total=False):
purpose: Required[Literal["assistants", "batch"]]
"""Valid purpose values for OpenAI Files API."""
+
+ expires_after: ExpiresAfter
+ """Control expiration of uploaded files. Params:
+
+ - anchor, must be "created_at"
+ - seconds, must be int between 3600 and 2592000 (1 hour to 30 days)
+ """
+
+
+class ExpiresAfter(TypedDict, total=False):
+ anchor: Required[Literal["created_at"]]
+
+ seconds: Required[int]
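
The new `expires_after` parameter matches the test payload added later in this diff. A short upload sketch; the one-day value is just an example inside the allowed 3600–2592000 second range:

```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient()

file = client.files.create(
    file=b"raw file contents",
    purpose="assistants",
    # anchor must be "created_at"; seconds must fall between 1 hour and 30 days.
    expires_after={"anchor": "created_at", "seconds": 86400},
)
```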
diff --git a/src/llama_stack_client/types/inference_batch_chat_completion_params.py b/src/llama_stack_client/types/inference_batch_chat_completion_params.py
deleted file mode 100644
index b5da0f0e..00000000
--- a/src/llama_stack_client/types/inference_batch_chat_completion_params.py
+++ /dev/null
@@ -1,85 +0,0 @@
-# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
-
-from __future__ import annotations
-
-from typing import Dict, Union, Iterable
-from typing_extensions import Literal, Required, TypedDict
-
-from .shared_params.message import Message
-from .shared_params.response_format import ResponseFormat
-from .shared_params.sampling_params import SamplingParams
-from .shared_params.tool_param_definition import ToolParamDefinition
-
-__all__ = ["InferenceBatchChatCompletionParams", "Logprobs", "ToolConfig", "Tool"]
-
-
-class InferenceBatchChatCompletionParams(TypedDict, total=False):
- messages_batch: Required[Iterable[Iterable[Message]]]
- """The messages to generate completions for."""
-
- model_id: Required[str]
- """The identifier of the model to use.
-
- The model must be registered with Llama Stack and available via the /models
- endpoint.
- """
-
- logprobs: Logprobs
- """
- (Optional) If specified, log probabilities for each token position will be
- returned.
- """
-
- response_format: ResponseFormat
- """(Optional) Grammar specification for guided (structured) decoding."""
-
- sampling_params: SamplingParams
- """(Optional) Parameters to control the sampling strategy."""
-
- tool_config: ToolConfig
- """(Optional) Configuration for tool use."""
-
- tools: Iterable[Tool]
- """(Optional) List of tool definitions available to the model."""
-
-
-class Logprobs(TypedDict, total=False):
- top_k: int
- """How many tokens (for each position) to return log probabilities for."""
-
-
-class ToolConfig(TypedDict, total=False):
- system_message_behavior: Literal["append", "replace"]
- """(Optional) Config for how to override the default system prompt.
-
- - `SystemMessageBehavior.append`: Appends the provided system message to the
- default system prompt. - `SystemMessageBehavior.replace`: Replaces the default
- system prompt with the provided system message. The system message can include
- the string '{{function_definitions}}' to indicate where the function
- definitions should be inserted.
- """
-
- tool_choice: Union[Literal["auto", "required", "none"], str]
- """(Optional) Whether tool use is automatic, required, or none.
-
- Can also specify a tool name to use a specific tool. Defaults to
- ToolChoice.auto.
- """
-
- tool_prompt_format: Literal["json", "function_tag", "python_list"]
- """(Optional) Instructs the model how to format tool calls.
-
- By default, Llama Stack will attempt to use a format that is best adapted to the
- model. - `ToolPromptFormat.json`: The tool calls are formatted as a JSON
- object. - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a
- <function=function_name> tag. - `ToolPromptFormat.python_list`: The tool calls
- are output as Python syntax -- a list of function calls.
- """
-
-
-class Tool(TypedDict, total=False):
- tool_name: Required[Union[Literal["brave_search", "wolfram_alpha", "photogen", "code_interpreter"], str]]
-
- description: str
-
- parameters: Dict[str, ToolParamDefinition]
diff --git a/src/llama_stack_client/types/inference_batch_chat_completion_response.py b/src/llama_stack_client/types/inference_batch_chat_completion_response.py
deleted file mode 100644
index ed24908d..00000000
--- a/src/llama_stack_client/types/inference_batch_chat_completion_response.py
+++ /dev/null
@@ -1,13 +0,0 @@
-# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
-
-from typing import List
-
-from .._models import BaseModel
-from .shared.chat_completion_response import ChatCompletionResponse
-
-__all__ = ["InferenceBatchChatCompletionResponse"]
-
-
-class InferenceBatchChatCompletionResponse(BaseModel):
- batch: List[ChatCompletionResponse]
- """List of chat completion responses, one for each conversation in the batch"""
diff --git a/src/llama_stack_client/types/inference_batch_completion_params.py b/src/llama_stack_client/types/inference_batch_completion_params.py
deleted file mode 100644
index b225b883..00000000
--- a/src/llama_stack_client/types/inference_batch_completion_params.py
+++ /dev/null
@@ -1,41 +0,0 @@
-# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
-
-from __future__ import annotations
-
-from typing_extensions import Required, TypedDict
-
-from .._types import SequenceNotStr
-from .shared_params.response_format import ResponseFormat
-from .shared_params.sampling_params import SamplingParams
-from .shared_params.interleaved_content import InterleavedContent
-
-__all__ = ["InferenceBatchCompletionParams", "Logprobs"]
-
-
-class InferenceBatchCompletionParams(TypedDict, total=False):
- content_batch: Required[SequenceNotStr[InterleavedContent]]
- """The content to generate completions for."""
-
- model_id: Required[str]
- """The identifier of the model to use.
-
- The model must be registered with Llama Stack and available via the /models
- endpoint.
- """
-
- logprobs: Logprobs
- """
- (Optional) If specified, log probabilities for each token position will be
- returned.
- """
-
- response_format: ResponseFormat
- """(Optional) Grammar specification for guided (structured) decoding."""
-
- sampling_params: SamplingParams
- """(Optional) Parameters to control the sampling strategy."""
-
-
-class Logprobs(TypedDict, total=False):
- top_k: int
- """How many tokens (for each position) to return log probabilities for."""
diff --git a/src/llama_stack_client/types/inference_chat_completion_params.py b/src/llama_stack_client/types/inference_chat_completion_params.py
deleted file mode 100644
index 746d3dee..00000000
--- a/src/llama_stack_client/types/inference_chat_completion_params.py
+++ /dev/null
@@ -1,134 +0,0 @@
-# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
-
-from __future__ import annotations
-
-from typing import Dict, Union, Iterable
-from typing_extensions import Literal, Required, TypedDict
-
-from .shared_params.message import Message
-from .shared_params.response_format import ResponseFormat
-from .shared_params.sampling_params import SamplingParams
-from .shared_params.tool_param_definition import ToolParamDefinition
-
-__all__ = [
- "InferenceChatCompletionParamsBase",
- "Logprobs",
- "ToolConfig",
- "Tool",
- "InferenceChatCompletionParamsNonStreaming",
- "InferenceChatCompletionParamsStreaming",
-]
-
-
-class InferenceChatCompletionParamsBase(TypedDict, total=False):
- messages: Required[Iterable[Message]]
- """List of messages in the conversation."""
-
- model_id: Required[str]
- """The identifier of the model to use.
-
- The model must be registered with Llama Stack and available via the /models
- endpoint.
- """
-
- logprobs: Logprobs
- """
- (Optional) If specified, log probabilities for each token position will be
- returned.
- """
-
- response_format: ResponseFormat
- """(Optional) Grammar specification for guided (structured) decoding.
-
- There are two options: - `ResponseFormat.json_schema`: The grammar is a JSON
- schema. Most providers support this format. - `ResponseFormat.grammar`: The
- grammar is a BNF grammar. This format is more flexible, but not all providers
- support it.
- """
-
- sampling_params: SamplingParams
- """Parameters to control the sampling strategy."""
-
- tool_choice: Literal["auto", "required", "none"]
- """(Optional) Whether tool use is required or automatic.
-
- Defaults to ToolChoice.auto. .. deprecated:: Use tool_config instead.
- """
-
- tool_config: ToolConfig
- """(Optional) Configuration for tool use."""
-
- tool_prompt_format: Literal["json", "function_tag", "python_list"]
- """(Optional) Instructs the model how to format tool calls.
-
- By default, Llama Stack will attempt to use a format that is best adapted to the
- model. - `ToolPromptFormat.json`: The tool calls are formatted as a JSON
- object. - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a
- <function=function_name> tag. - `ToolPromptFormat.python_list`: The tool calls
- are output as Python syntax -- a list of function calls. .. deprecated:: Use
- tool_config instead.
- """
-
- tools: Iterable[Tool]
- """(Optional) List of tool definitions available to the model."""
-
-
-class Logprobs(TypedDict, total=False):
- top_k: int
- """How many tokens (for each position) to return log probabilities for."""
-
-
-class ToolConfig(TypedDict, total=False):
- system_message_behavior: Literal["append", "replace"]
- """(Optional) Config for how to override the default system prompt.
-
- - `SystemMessageBehavior.append`: Appends the provided system message to the
- default system prompt. - `SystemMessageBehavior.replace`: Replaces the default
- system prompt with the provided system message. The system message can include
- the string '{{function_definitions}}' to indicate where the function
- definitions should be inserted.
- """
-
- tool_choice: Union[Literal["auto", "required", "none"], str]
- """(Optional) Whether tool use is automatic, required, or none.
-
- Can also specify a tool name to use a specific tool. Defaults to
- ToolChoice.auto.
- """
-
- tool_prompt_format: Literal["json", "function_tag", "python_list"]
- """(Optional) Instructs the model how to format tool calls.
-
- By default, Llama Stack will attempt to use a format that is best adapted to the
- model. - `ToolPromptFormat.json`: The tool calls are formatted as a JSON
- object. - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a
- <function=function_name> tag. - `ToolPromptFormat.python_list`: The tool calls
- are output as Python syntax -- a list of function calls.
- """
-
-
-class Tool(TypedDict, total=False):
- tool_name: Required[Union[Literal["brave_search", "wolfram_alpha", "photogen", "code_interpreter"], str]]
-
- description: str
-
- parameters: Dict[str, ToolParamDefinition]
-
-
-class InferenceChatCompletionParamsNonStreaming(InferenceChatCompletionParamsBase, total=False):
- stream: Literal[False]
- """(Optional) If True, generate an SSE event stream of the response.
-
- Defaults to False.
- """
-
-
-class InferenceChatCompletionParamsStreaming(InferenceChatCompletionParamsBase):
- stream: Required[Literal[True]]
- """(Optional) If True, generate an SSE event stream of the response.
-
- Defaults to False.
- """
-
-
-InferenceChatCompletionParams = Union[InferenceChatCompletionParamsNonStreaming, InferenceChatCompletionParamsStreaming]
diff --git a/src/llama_stack_client/types/inference_completion_params.py b/src/llama_stack_client/types/inference_completion_params.py
deleted file mode 100644
index c122f017..00000000
--- a/src/llama_stack_client/types/inference_completion_params.py
+++ /dev/null
@@ -1,65 +0,0 @@
-# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
-
-from __future__ import annotations
-
-from typing import Union
-from typing_extensions import Literal, Required, TypedDict
-
-from .shared_params.response_format import ResponseFormat
-from .shared_params.sampling_params import SamplingParams
-from .shared_params.interleaved_content import InterleavedContent
-
-__all__ = [
- "InferenceCompletionParamsBase",
- "Logprobs",
- "InferenceCompletionParamsNonStreaming",
- "InferenceCompletionParamsStreaming",
-]
-
-
-class InferenceCompletionParamsBase(TypedDict, total=False):
- content: Required[InterleavedContent]
- """The content to generate a completion for."""
-
- model_id: Required[str]
- """The identifier of the model to use.
-
- The model must be registered with Llama Stack and available via the /models
- endpoint.
- """
-
- logprobs: Logprobs
- """
- (Optional) If specified, log probabilities for each token position will be
- returned.
- """
-
- response_format: ResponseFormat
- """(Optional) Grammar specification for guided (structured) decoding."""
-
- sampling_params: SamplingParams
- """(Optional) Parameters to control the sampling strategy."""
-
-
-class Logprobs(TypedDict, total=False):
- top_k: int
- """How many tokens (for each position) to return log probabilities for."""
-
-
-class InferenceCompletionParamsNonStreaming(InferenceCompletionParamsBase, total=False):
- stream: Literal[False]
- """(Optional) If True, generate an SSE event stream of the response.
-
- Defaults to False.
- """
-
-
-class InferenceCompletionParamsStreaming(InferenceCompletionParamsBase):
- stream: Required[Literal[True]]
- """(Optional) If True, generate an SSE event stream of the response.
-
- Defaults to False.
- """
-
-
-InferenceCompletionParams = Union[InferenceCompletionParamsNonStreaming, InferenceCompletionParamsStreaming]
diff --git a/src/llama_stack_client/types/inference_embeddings_params.py b/src/llama_stack_client/types/inference_embeddings_params.py
deleted file mode 100644
index a1be545b..00000000
--- a/src/llama_stack_client/types/inference_embeddings_params.py
+++ /dev/null
@@ -1,46 +0,0 @@
-# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
-
-from __future__ import annotations
-
-from typing import Union, Iterable
-from typing_extensions import Literal, Required, TypedDict
-
-from .._types import SequenceNotStr
-from .shared_params.interleaved_content_item import InterleavedContentItem
-
-__all__ = ["InferenceEmbeddingsParams"]
-
-
-class InferenceEmbeddingsParams(TypedDict, total=False):
- contents: Required[Union[SequenceNotStr[str], Iterable[InterleavedContentItem]]]
- """List of contents to generate embeddings for.
-
- Each content can be a string or an InterleavedContentItem (and hence can be
- multimodal). The behavior depends on the model and provider. Some models may
- only support text.
- """
-
- model_id: Required[str]
- """The identifier of the model to use.
-
- The model must be an embedding model registered with Llama Stack and available
- via the /models endpoint.
- """
-
- output_dimension: int
- """(Optional) Output dimensionality for the embeddings.
-
- Only supported by Matryoshka models.
- """
-
- task_type: Literal["query", "document"]
- """
- (Optional) How is the embedding being used? This is only supported by asymmetric
- embedding models.
- """
-
- text_truncation: Literal["none", "start", "end"]
- """
- (Optional) Config for how to truncate text for embedding when text is longer
- than the model's max sequence length.
- """
diff --git a/src/llama_stack_client/types/models/openai_list_response.py b/src/llama_stack_client/types/models/openai_list_response.py
index f14845d5..5b6c0358 100644
--- a/src/llama_stack_client/types/models/openai_list_response.py
+++ b/src/llama_stack_client/types/models/openai_list_response.py
@@ -1,21 +1,10 @@
# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
from typing import List
-from typing_extensions import Literal, TypeAlias
+from typing_extensions import TypeAlias
-from ..._models import BaseModel
+from ..model import Model
-__all__ = ["OpenAIListResponse", "OpenAIListResponseItem"]
+__all__ = ["OpenAIListResponse"]
-
-class OpenAIListResponseItem(BaseModel):
- id: str
-
- created: int
-
- object: Literal["model"]
-
- owned_by: str
-
-
-OpenAIListResponse: TypeAlias = List[OpenAIListResponseItem]
+OpenAIListResponse: TypeAlias = List[Model]
diff --git a/src/llama_stack_client/types/response_list_response.py b/src/llama_stack_client/types/response_list_response.py
index ae50d44a..ac7ec1b1 100644
--- a/src/llama_stack_client/types/response_list_response.py
+++ b/src/llama_stack_client/types/response_list_response.py
@@ -570,6 +570,3 @@ class ResponseListResponse(BaseModel):
truncation: Optional[str] = None
"""(Optional) Truncation strategy applied to the response"""
-
- user: Optional[str] = None
- """(Optional) User identifier associated with the request"""
diff --git a/src/llama_stack_client/types/response_object.py b/src/llama_stack_client/types/response_object.py
index c0f348a9..b618ddf5 100644
--- a/src/llama_stack_client/types/response_object.py
+++ b/src/llama_stack_client/types/response_object.py
@@ -361,6 +361,3 @@ def output_text(self) -> str:
truncation: Optional[str] = None
"""(Optional) Truncation strategy applied to the response"""
-
- user: Optional[str] = None
- """(Optional) User identifier associated with the request"""
diff --git a/src/llama_stack_client/types/shared/__init__.py b/src/llama_stack_client/types/shared/__init__.py
index fb14d8a6..f346cda7 100644
--- a/src/llama_stack_client/types/shared/__init__.py
+++ b/src/llama_stack_client/types/shared/__init__.py
@@ -9,17 +9,14 @@
from .query_config import QueryConfig as QueryConfig
from .query_result import QueryResult as QueryResult
from .user_message import UserMessage as UserMessage
-from .content_delta import ContentDelta as ContentDelta
from .scoring_result import ScoringResult as ScoringResult
from .system_message import SystemMessage as SystemMessage
from .response_format import ResponseFormat as ResponseFormat
from .sampling_params import SamplingParams as SamplingParams
-from .batch_completion import BatchCompletion as BatchCompletion
from .safety_violation import SafetyViolation as SafetyViolation
from .completion_message import CompletionMessage as CompletionMessage
from .interleaved_content import InterleavedContent as InterleavedContent
from .tool_param_definition import ToolParamDefinition as ToolParamDefinition
from .tool_response_message import ToolResponseMessage as ToolResponseMessage
-from .query_generator_config import QueryGeneratorConfig as QueryGeneratorConfig
from .chat_completion_response import ChatCompletionResponse as ChatCompletionResponse
from .interleaved_content_item import InterleavedContentItem as InterleavedContentItem
diff --git a/src/llama_stack_client/types/shared/batch_completion.py b/src/llama_stack_client/types/shared/batch_completion.py
deleted file mode 100644
index 43a0a735..00000000
--- a/src/llama_stack_client/types/shared/batch_completion.py
+++ /dev/null
@@ -1,13 +0,0 @@
-# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
-
-from typing import List
-
-from ..._models import BaseModel
-from ..completion_response import CompletionResponse
-
-__all__ = ["BatchCompletion"]
-
-
-class BatchCompletion(BaseModel):
- batch: List[CompletionResponse]
- """List of completion responses, one for each input in the batch"""
diff --git a/src/llama_stack_client/types/shared/chat_completion_response.py b/src/llama_stack_client/types/shared/chat_completion_response.py
index 30191439..eb78a109 100644
--- a/src/llama_stack_client/types/shared/chat_completion_response.py
+++ b/src/llama_stack_client/types/shared/chat_completion_response.py
@@ -1,20 +1,24 @@
# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
-from typing import List, Optional
+from typing import Dict, List, Optional
from .metric import Metric
from ..._models import BaseModel
-from ..token_log_probs import TokenLogProbs
from .completion_message import CompletionMessage
-__all__ = ["ChatCompletionResponse"]
+__all__ = ["ChatCompletionResponse", "Logprob"]
+
+
+class Logprob(BaseModel):
+ logprobs_by_token: Dict[str, float]
+ """Dictionary mapping tokens to their log probabilities"""
class ChatCompletionResponse(BaseModel):
completion_message: CompletionMessage
"""The complete response message"""
- logprobs: Optional[List[TokenLogProbs]] = None
+ logprobs: Optional[List[Logprob]] = None
"""Optional log probabilities for generated tokens"""
metrics: Optional[List[Metric]] = None
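
With `TokenLogProbs` removed, per-token log probabilities now come back as the inline `Logprob` model. A small sketch of reading them from a response object; the printing loop is illustrative:

```python
from llama_stack_client.types.shared import ChatCompletionResponse


def dump_logprobs(response: ChatCompletionResponse) -> None:
    # logprobs is optional; each entry maps tokens to their log probabilities.
    for entry in response.logprobs or []:
        for token, logprob in entry.logprobs_by_token.items():
            print(f"{token!r}: {logprob:.4f}")
```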
diff --git a/src/llama_stack_client/types/shared/content_delta.py b/src/llama_stack_client/types/shared/content_delta.py
deleted file mode 100644
index 7ed58d13..00000000
--- a/src/llama_stack_client/types/shared/content_delta.py
+++ /dev/null
@@ -1,43 +0,0 @@
-# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
-
-from typing import Union
-from typing_extensions import Literal, Annotated, TypeAlias
-
-from ..._utils import PropertyInfo
-from ..._models import BaseModel
-from .tool_call import ToolCall
-
-__all__ = ["ContentDelta", "TextDelta", "ImageDelta", "ToolCallDelta", "ToolCallDeltaToolCall"]
-
-
-class TextDelta(BaseModel):
- text: str
- """The incremental text content"""
-
- type: Literal["text"]
- """Discriminator type of the delta. Always "text" """
-
-
-class ImageDelta(BaseModel):
- image: str
- """The incremental image data as bytes"""
-
- type: Literal["image"]
- """Discriminator type of the delta. Always "image" """
-
-
-ToolCallDeltaToolCall: TypeAlias = Union[str, ToolCall]
-
-
-class ToolCallDelta(BaseModel):
- parse_status: Literal["started", "in_progress", "failed", "succeeded"]
- """Current parsing status of the tool call"""
-
- tool_call: ToolCallDeltaToolCall
- """Either an in-progress tool call string or the final parsed tool call"""
-
- type: Literal["tool_call"]
- """Discriminator type of the delta. Always "tool_call" """
-
-
-ContentDelta: TypeAlias = Annotated[Union[TextDelta, ImageDelta, ToolCallDelta], PropertyInfo(discriminator="type")]
diff --git a/src/llama_stack_client/types/shared/query_config.py b/src/llama_stack_client/types/shared/query_config.py
index 389514c7..a4a1f741 100644
--- a/src/llama_stack_client/types/shared/query_config.py
+++ b/src/llama_stack_client/types/shared/query_config.py
@@ -5,9 +5,41 @@
from ..._utils import PropertyInfo
from ..._models import BaseModel
-from .query_generator_config import QueryGeneratorConfig
-__all__ = ["QueryConfig", "Ranker", "RankerRrfRanker", "RankerWeightedRanker"]
+__all__ = [
+ "QueryConfig",
+ "QueryGeneratorConfig",
+ "QueryGeneratorConfigDefaultRagQueryGeneratorConfig",
+ "QueryGeneratorConfigLlmragQueryGeneratorConfig",
+ "Ranker",
+ "RankerRrfRanker",
+ "RankerWeightedRanker",
+]
+
+
+class QueryGeneratorConfigDefaultRagQueryGeneratorConfig(BaseModel):
+ separator: str
+ """String separator used to join query terms"""
+
+ type: Literal["default"]
+ """Type of query generator, always 'default'"""
+
+
+class QueryGeneratorConfigLlmragQueryGeneratorConfig(BaseModel):
+ model: str
+ """Name of the language model to use for query generation"""
+
+ template: str
+ """Template string for formatting the query generation prompt"""
+
+ type: Literal["llm"]
+ """Type of query generator, always 'llm'"""
+
+
+QueryGeneratorConfig: TypeAlias = Annotated[
+ Union[QueryGeneratorConfigDefaultRagQueryGeneratorConfig, QueryGeneratorConfigLlmragQueryGeneratorConfig],
+ PropertyInfo(discriminator="type"),
+]
class RankerRrfRanker(BaseModel):
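
Because the shared `query_generator_config` module is removed, its variants now live alongside `QueryConfig`. A minimal construction sketch under that assumption; the separator value is arbitrary:

```python
from llama_stack_client.types.shared.query_config import (
    QueryGeneratorConfigDefaultRagQueryGeneratorConfig,
)

# Build the "default" generator variant directly; the union discriminates on `type`.
generator = QueryGeneratorConfigDefaultRagQueryGeneratorConfig(
    separator=" ",
    type="default",
)
print(generator.type)  # -> "default"
```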
diff --git a/src/llama_stack_client/types/shared/query_generator_config.py b/src/llama_stack_client/types/shared/query_generator_config.py
deleted file mode 100644
index 624fc190..00000000
--- a/src/llama_stack_client/types/shared/query_generator_config.py
+++ /dev/null
@@ -1,33 +0,0 @@
-# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
-
-from typing import Union
-from typing_extensions import Literal, Annotated, TypeAlias
-
-from ..._utils import PropertyInfo
-from ..._models import BaseModel
-
-__all__ = ["QueryGeneratorConfig", "DefaultRagQueryGeneratorConfig", "LlmragQueryGeneratorConfig"]
-
-
-class DefaultRagQueryGeneratorConfig(BaseModel):
- separator: str
- """String separator used to join query terms"""
-
- type: Literal["default"]
- """Type of query generator, always 'default'"""
-
-
-class LlmragQueryGeneratorConfig(BaseModel):
- model: str
- """Name of the language model to use for query generation"""
-
- template: str
- """Template string for formatting the query generation prompt"""
-
- type: Literal["llm"]
- """Type of query generator, always 'llm'"""
-
-
-QueryGeneratorConfig: TypeAlias = Annotated[
- Union[DefaultRagQueryGeneratorConfig, LlmragQueryGeneratorConfig], PropertyInfo(discriminator="type")
-]
diff --git a/src/llama_stack_client/types/shared/tool_param_definition.py b/src/llama_stack_client/types/shared/tool_param_definition.py
index 1466c1f9..316f1e01 100644
--- a/src/llama_stack_client/types/shared/tool_param_definition.py
+++ b/src/llama_stack_client/types/shared/tool_param_definition.py
@@ -14,4 +14,8 @@ class ToolParamDefinition(BaseModel):
description: Optional[str] = None
+ items: Union[bool, float, str, List[object], object, None] = None
+
required: Optional[bool] = None
+
+ title: Optional[str] = None
diff --git a/src/llama_stack_client/types/shared_params/__init__.py b/src/llama_stack_client/types/shared_params/__init__.py
index 3a0842e8..894d8a8d 100644
--- a/src/llama_stack_client/types/shared_params/__init__.py
+++ b/src/llama_stack_client/types/shared_params/__init__.py
@@ -11,7 +11,5 @@
from .sampling_params import SamplingParams as SamplingParams
from .completion_message import CompletionMessage as CompletionMessage
from .interleaved_content import InterleavedContent as InterleavedContent
-from .tool_param_definition import ToolParamDefinition as ToolParamDefinition
from .tool_response_message import ToolResponseMessage as ToolResponseMessage
-from .query_generator_config import QueryGeneratorConfig as QueryGeneratorConfig
from .interleaved_content_item import InterleavedContentItem as InterleavedContentItem
diff --git a/src/llama_stack_client/types/shared_params/query_config.py b/src/llama_stack_client/types/shared_params/query_config.py
index d008c48c..91a5b596 100644
--- a/src/llama_stack_client/types/shared_params/query_config.py
+++ b/src/llama_stack_client/types/shared_params/query_config.py
@@ -5,9 +5,39 @@
from typing import Union
from typing_extensions import Literal, Required, TypeAlias, TypedDict
-from .query_generator_config import QueryGeneratorConfig
+__all__ = [
+ "QueryConfig",
+ "QueryGeneratorConfig",
+ "QueryGeneratorConfigDefaultRagQueryGeneratorConfig",
+ "QueryGeneratorConfigLlmragQueryGeneratorConfig",
+ "Ranker",
+ "RankerRrfRanker",
+ "RankerWeightedRanker",
+]
-__all__ = ["QueryConfig", "Ranker", "RankerRrfRanker", "RankerWeightedRanker"]
+
+class QueryGeneratorConfigDefaultRagQueryGeneratorConfig(TypedDict, total=False):
+ separator: Required[str]
+ """String separator used to join query terms"""
+
+ type: Required[Literal["default"]]
+ """Type of query generator, always 'default'"""
+
+
+class QueryGeneratorConfigLlmragQueryGeneratorConfig(TypedDict, total=False):
+ model: Required[str]
+ """Name of the language model to use for query generation"""
+
+ template: Required[str]
+ """Template string for formatting the query generation prompt"""
+
+ type: Required[Literal["llm"]]
+ """Type of query generator, always 'llm'"""
+
+
+QueryGeneratorConfig: TypeAlias = Union[
+ QueryGeneratorConfigDefaultRagQueryGeneratorConfig, QueryGeneratorConfigLlmragQueryGeneratorConfig
+]
class RankerRrfRanker(TypedDict, total=False):
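
On the request side the generator config is now a plain TypedDict union defined in this module, so either variant can be written as a dict wherever the params-side `QueryGeneratorConfig` is expected. The model ID and template below are placeholders:

```python
default_generator = {"type": "default", "separator": " "}

llm_generator = {
    "type": "llm",
    "model": "my-model-id",  # placeholder model identifier
    "template": "Rewrite as a search query: {messages}",  # placeholder template
}
```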
diff --git a/src/llama_stack_client/types/shared_params/query_generator_config.py b/src/llama_stack_client/types/shared_params/query_generator_config.py
deleted file mode 100644
index 8c589bf9..00000000
--- a/src/llama_stack_client/types/shared_params/query_generator_config.py
+++ /dev/null
@@ -1,30 +0,0 @@
-# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
-
-from __future__ import annotations
-
-from typing import Union
-from typing_extensions import Literal, Required, TypeAlias, TypedDict
-
-__all__ = ["QueryGeneratorConfig", "DefaultRagQueryGeneratorConfig", "LlmragQueryGeneratorConfig"]
-
-
-class DefaultRagQueryGeneratorConfig(TypedDict, total=False):
- separator: Required[str]
- """String separator used to join query terms"""
-
- type: Required[Literal["default"]]
- """Type of query generator, always 'default'"""
-
-
-class LlmragQueryGeneratorConfig(TypedDict, total=False):
- model: Required[str]
- """Name of the language model to use for query generation"""
-
- template: Required[str]
- """Template string for formatting the query generation prompt"""
-
- type: Required[Literal["llm"]]
- """Type of query generator, always 'llm'"""
-
-
-QueryGeneratorConfig: TypeAlias = Union[DefaultRagQueryGeneratorConfig, LlmragQueryGeneratorConfig]
diff --git a/src/llama_stack_client/types/shared_params/tool_param_definition.py b/src/llama_stack_client/types/shared_params/tool_param_definition.py
deleted file mode 100644
index 2d7805fe..00000000
--- a/src/llama_stack_client/types/shared_params/tool_param_definition.py
+++ /dev/null
@@ -1,18 +0,0 @@
-# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
-
-from __future__ import annotations
-
-from typing import Union, Iterable
-from typing_extensions import Required, TypedDict
-
-__all__ = ["ToolParamDefinition"]
-
-
-class ToolParamDefinition(TypedDict, total=False):
- param_type: Required[str]
-
- default: Union[bool, float, str, Iterable[object], object, None]
-
- description: str
-
- required: bool
diff --git a/src/llama_stack_client/types/token_log_probs.py b/src/llama_stack_client/types/token_log_probs.py
deleted file mode 100644
index b1a0a2b4..00000000
--- a/src/llama_stack_client/types/token_log_probs.py
+++ /dev/null
@@ -1,12 +0,0 @@
-# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
-
-from typing import Dict
-
-from .._models import BaseModel
-
-__all__ = ["TokenLogProbs"]
-
-
-class TokenLogProbs(BaseModel):
- logprobs_by_token: Dict[str, float]
- """Dictionary mapping tokens to their log probabilities"""
diff --git a/src/llama_stack_client/types/tool.py b/src/llama_stack_client/types/tool.py
index c6994268..a7243b64 100644
--- a/src/llama_stack_client/types/tool.py
+++ b/src/llama_stack_client/types/tool.py
@@ -24,6 +24,12 @@ class Parameter(BaseModel):
default: Union[bool, float, str, List[object], object, None] = None
"""(Optional) Default value for the parameter if not provided"""
+ items: Optional[object] = None
+ """Type of the elements when parameter_type is array"""
+
+ title: Optional[str] = None
+ """(Optional) Title of the parameter"""
+
class Tool(BaseModel):
description: str
diff --git a/src/llama_stack_client/types/tool_def.py b/src/llama_stack_client/types/tool_def.py
index c82a9b8a..21949b41 100644
--- a/src/llama_stack_client/types/tool_def.py
+++ b/src/llama_stack_client/types/tool_def.py
@@ -23,6 +23,12 @@ class Parameter(BaseModel):
default: Union[bool, float, str, List[object], object, None] = None
"""(Optional) Default value for the parameter if not provided"""
+ items: Optional[object] = None
+ """Type of the elements when parameter_type is array"""
+
+ title: Optional[str] = None
+ """(Optional) Title of the parameter"""
+
class ToolDef(BaseModel):
name: str
diff --git a/src/llama_stack_client/types/tool_def_param.py b/src/llama_stack_client/types/tool_def_param.py
index 93ad8285..a50437b2 100644
--- a/src/llama_stack_client/types/tool_def_param.py
+++ b/src/llama_stack_client/types/tool_def_param.py
@@ -24,6 +24,12 @@ class Parameter(TypedDict, total=False):
default: Union[bool, float, str, Iterable[object], object, None]
"""(Optional) Default value for the parameter if not provided"""
+ items: object
+ """Type of the elements when parameter_type is array"""
+
+ title: str
+ """(Optional) Title of the parameter"""
+
class ToolDefParam(TypedDict, total=False):
name: Required[str]
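
The new `items` and `title` fields added to `Tool`, `ToolDef`, and `ToolDefParam` parameters above can be supplied from a params dict. A hedged sketch; the keys follow the `Parameter` TypedDict and the test payload later in this diff, while the tool parameter itself is hypothetical:

```python
# Hypothetical array-typed parameter for a custom tool definition.
parameter = {
    "name": "order_ids",
    "parameter_type": "array",
    "description": "Order identifiers to look up",
    "required": True,
    "default": None,
    "items": {"type": "string"},  # element schema when parameter_type is "array"
    "title": "Order IDs",
}
```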
diff --git a/tests/api_resources/models/test_openai.py b/tests/api_resources/models/test_openai.py
index ea64cce2..f94d2bf6 100644
--- a/tests/api_resources/models/test_openai.py
+++ b/tests/api_resources/models/test_openai.py
@@ -9,7 +9,7 @@
from tests.utils import assert_matches_type
from llama_stack_client import LlamaStackClient, AsyncLlamaStackClient
-from llama_stack_client.types.models import OpenAIListResponse
+from llama_stack_client.types import ModelListResponse
base_url = os.environ.get("TEST_API_BASE_URL", "http://127.0.0.1:4010")
@@ -20,7 +20,7 @@ class TestOpenAI:
@parametrize
def test_method_list(self, client: LlamaStackClient) -> None:
openai = client.models.openai.list()
- assert_matches_type(OpenAIListResponse, openai, path=["response"])
+ assert_matches_type(ModelListResponse, openai, path=["response"])
@parametrize
def test_raw_response_list(self, client: LlamaStackClient) -> None:
@@ -29,7 +29,7 @@ def test_raw_response_list(self, client: LlamaStackClient) -> None:
assert response.is_closed is True
assert response.http_request.headers.get("X-Stainless-Lang") == "python"
openai = response.parse()
- assert_matches_type(OpenAIListResponse, openai, path=["response"])
+ assert_matches_type(ModelListResponse, openai, path=["response"])
@parametrize
def test_streaming_response_list(self, client: LlamaStackClient) -> None:
@@ -38,7 +38,7 @@ def test_streaming_response_list(self, client: LlamaStackClient) -> None:
assert response.http_request.headers.get("X-Stainless-Lang") == "python"
openai = response.parse()
- assert_matches_type(OpenAIListResponse, openai, path=["response"])
+ assert_matches_type(ModelListResponse, openai, path=["response"])
assert cast(Any, response.is_closed) is True
@@ -51,7 +51,7 @@ class TestAsyncOpenAI:
@parametrize
async def test_method_list(self, async_client: AsyncLlamaStackClient) -> None:
openai = await async_client.models.openai.list()
- assert_matches_type(OpenAIListResponse, openai, path=["response"])
+ assert_matches_type(ModelListResponse, openai, path=["response"])
@parametrize
async def test_raw_response_list(self, async_client: AsyncLlamaStackClient) -> None:
@@ -60,7 +60,7 @@ async def test_raw_response_list(self, async_client: AsyncLlamaStackClient) -> N
assert response.is_closed is True
assert response.http_request.headers.get("X-Stainless-Lang") == "python"
openai = await response.parse()
- assert_matches_type(OpenAIListResponse, openai, path=["response"])
+ assert_matches_type(ModelListResponse, openai, path=["response"])
@parametrize
async def test_streaming_response_list(self, async_client: AsyncLlamaStackClient) -> None:
@@ -69,6 +69,6 @@ async def test_streaming_response_list(self, async_client: AsyncLlamaStackClient
assert response.http_request.headers.get("X-Stainless-Lang") == "python"
openai = await response.parse()
- assert_matches_type(OpenAIListResponse, openai, path=["response"])
+ assert_matches_type(ModelListResponse, openai, path=["response"])
assert cast(Any, response.is_closed) is True
diff --git a/tests/api_resources/test_agents.py b/tests/api_resources/test_agents.py
index 18b34012..c19bc9bf 100644
--- a/tests/api_resources/test_agents.py
+++ b/tests/api_resources/test_agents.py
@@ -49,6 +49,8 @@ def test_method_create_with_all_params(self, client: LlamaStackClient) -> None:
"parameter_type": "parameter_type",
"required": True,
"default": True,
+ "items": {},
+ "title": "title",
}
],
}
@@ -253,6 +255,8 @@ async def test_method_create_with_all_params(self, async_client: AsyncLlamaStack
"parameter_type": "parameter_type",
"required": True,
"default": True,
+ "items": {},
+ "title": "title",
}
],
}
diff --git a/tests/api_resources/test_files.py b/tests/api_resources/test_files.py
index d9b29ffc..83b763ab 100644
--- a/tests/api_resources/test_files.py
+++ b/tests/api_resources/test_files.py
@@ -26,6 +26,18 @@ def test_method_create(self, client: LlamaStackClient) -> None:
)
assert_matches_type(File, file, path=["response"])
+ @parametrize
+ def test_method_create_with_all_params(self, client: LlamaStackClient) -> None:
+ file = client.files.create(
+ file=b"raw file contents",
+ purpose="assistants",
+ expires_after={
+ "anchor": "created_at",
+ "seconds": 0,
+ },
+ )
+ assert_matches_type(File, file, path=["response"])
+
@parametrize
def test_raw_response_create(self, client: LlamaStackClient) -> None:
response = client.files.with_raw_response.create(
@@ -215,6 +227,18 @@ async def test_method_create(self, async_client: AsyncLlamaStackClient) -> None:
)
assert_matches_type(File, file, path=["response"])
+ @parametrize
+ async def test_method_create_with_all_params(self, async_client: AsyncLlamaStackClient) -> None:
+ file = await async_client.files.create(
+ file=b"raw file contents",
+ purpose="assistants",
+ expires_after={
+ "anchor": "created_at",
+ "seconds": 0,
+ },
+ )
+ assert_matches_type(File, file, path=["response"])
+
@parametrize
async def test_raw_response_create(self, async_client: AsyncLlamaStackClient) -> None:
response = await async_client.files.with_raw_response.create(
diff --git a/tests/api_resources/test_inference.py b/tests/api_resources/test_inference.py
index 474ff7cf..f26802c2 100644
--- a/tests/api_resources/test_inference.py
+++ b/tests/api_resources/test_inference.py
@@ -9,15 +9,7 @@
from tests.utils import assert_matches_type
from llama_stack_client import LlamaStackClient, AsyncLlamaStackClient
-from llama_stack_client.types import (
- CompletionResponse,
- EmbeddingsResponse,
- InferenceRerankResponse,
- InferenceBatchChatCompletionResponse,
-)
-from llama_stack_client.types.shared import BatchCompletion, ChatCompletionResponse
-
-# pyright: reportDeprecated=false
+from llama_stack_client.types import InferenceRerankResponse
base_url = os.environ.get("TEST_API_BASE_URL", "http://127.0.0.1:4010")
@@ -25,539 +17,6 @@
class TestInference:
parametrize = pytest.mark.parametrize("client", [False, True], indirect=True, ids=["loose", "strict"])
- @parametrize
- def test_method_batch_chat_completion(self, client: LlamaStackClient) -> None:
- inference = client.inference.batch_chat_completion(
- messages_batch=[
- [
- {
- "content": "string",
- "role": "user",
- }
- ]
- ],
- model_id="model_id",
- )
- assert_matches_type(InferenceBatchChatCompletionResponse, inference, path=["response"])
-
- @parametrize
- def test_method_batch_chat_completion_with_all_params(self, client: LlamaStackClient) -> None:
- inference = client.inference.batch_chat_completion(
- messages_batch=[
- [
- {
- "content": "string",
- "role": "user",
- "context": "string",
- }
- ]
- ],
- model_id="model_id",
- logprobs={"top_k": 0},
- response_format={
- "json_schema": {"foo": True},
- "type": "json_schema",
- },
- sampling_params={
- "strategy": {"type": "greedy"},
- "max_tokens": 0,
- "repetition_penalty": 0,
- "stop": ["string"],
- },
- tool_config={
- "system_message_behavior": "append",
- "tool_choice": "auto",
- "tool_prompt_format": "json",
- },
- tools=[
- {
- "tool_name": "brave_search",
- "description": "description",
- "parameters": {
- "foo": {
- "param_type": "param_type",
- "default": True,
- "description": "description",
- "required": True,
- }
- },
- }
- ],
- )
- assert_matches_type(InferenceBatchChatCompletionResponse, inference, path=["response"])
-
- @parametrize
- def test_raw_response_batch_chat_completion(self, client: LlamaStackClient) -> None:
- response = client.inference.with_raw_response.batch_chat_completion(
- messages_batch=[
- [
- {
- "content": "string",
- "role": "user",
- }
- ]
- ],
- model_id="model_id",
- )
-
- assert response.is_closed is True
- assert response.http_request.headers.get("X-Stainless-Lang") == "python"
- inference = response.parse()
- assert_matches_type(InferenceBatchChatCompletionResponse, inference, path=["response"])
-
- @parametrize
- def test_streaming_response_batch_chat_completion(self, client: LlamaStackClient) -> None:
- with client.inference.with_streaming_response.batch_chat_completion(
- messages_batch=[
- [
- {
- "content": "string",
- "role": "user",
- }
- ]
- ],
- model_id="model_id",
- ) as response:
- assert not response.is_closed
- assert response.http_request.headers.get("X-Stainless-Lang") == "python"
-
- inference = response.parse()
- assert_matches_type(InferenceBatchChatCompletionResponse, inference, path=["response"])
-
- assert cast(Any, response.is_closed) is True
-
- @parametrize
- def test_method_batch_completion(self, client: LlamaStackClient) -> None:
- inference = client.inference.batch_completion(
- content_batch=["string"],
- model_id="model_id",
- )
- assert_matches_type(BatchCompletion, inference, path=["response"])
-
- @parametrize
- def test_method_batch_completion_with_all_params(self, client: LlamaStackClient) -> None:
- inference = client.inference.batch_completion(
- content_batch=["string"],
- model_id="model_id",
- logprobs={"top_k": 0},
- response_format={
- "json_schema": {"foo": True},
- "type": "json_schema",
- },
- sampling_params={
- "strategy": {"type": "greedy"},
- "max_tokens": 0,
- "repetition_penalty": 0,
- "stop": ["string"],
- },
- )
- assert_matches_type(BatchCompletion, inference, path=["response"])
-
- @parametrize
- def test_raw_response_batch_completion(self, client: LlamaStackClient) -> None:
- response = client.inference.with_raw_response.batch_completion(
- content_batch=["string"],
- model_id="model_id",
- )
-
- assert response.is_closed is True
- assert response.http_request.headers.get("X-Stainless-Lang") == "python"
- inference = response.parse()
- assert_matches_type(BatchCompletion, inference, path=["response"])
-
- @parametrize
- def test_streaming_response_batch_completion(self, client: LlamaStackClient) -> None:
- with client.inference.with_streaming_response.batch_completion(
- content_batch=["string"],
- model_id="model_id",
- ) as response:
- assert not response.is_closed
- assert response.http_request.headers.get("X-Stainless-Lang") == "python"
-
- inference = response.parse()
- assert_matches_type(BatchCompletion, inference, path=["response"])
-
- assert cast(Any, response.is_closed) is True
-
- @parametrize
- def test_method_chat_completion_overload_1(self, client: LlamaStackClient) -> None:
- with pytest.warns(DeprecationWarning):
- inference = client.inference.chat_completion(
- messages=[
- {
- "content": "string",
- "role": "user",
- }
- ],
- model_id="model_id",
- )
-
- assert_matches_type(ChatCompletionResponse, inference, path=["response"])
-
- @parametrize
- def test_method_chat_completion_with_all_params_overload_1(self, client: LlamaStackClient) -> None:
- with pytest.warns(DeprecationWarning):
- inference = client.inference.chat_completion(
- messages=[
- {
- "content": "string",
- "role": "user",
- "context": "string",
- }
- ],
- model_id="model_id",
- logprobs={"top_k": 0},
- response_format={
- "json_schema": {"foo": True},
- "type": "json_schema",
- },
- sampling_params={
- "strategy": {"type": "greedy"},
- "max_tokens": 0,
- "repetition_penalty": 0,
- "stop": ["string"],
- },
- stream=False,
- tool_choice="auto",
- tool_config={
- "system_message_behavior": "append",
- "tool_choice": "auto",
- "tool_prompt_format": "json",
- },
- tool_prompt_format="json",
- tools=[
- {
- "tool_name": "brave_search",
- "description": "description",
- "parameters": {
- "foo": {
- "param_type": "param_type",
- "default": True,
- "description": "description",
- "required": True,
- }
- },
- }
- ],
- )
-
- assert_matches_type(ChatCompletionResponse, inference, path=["response"])
-
- @parametrize
- def test_raw_response_chat_completion_overload_1(self, client: LlamaStackClient) -> None:
- with pytest.warns(DeprecationWarning):
- response = client.inference.with_raw_response.chat_completion(
- messages=[
- {
- "content": "string",
- "role": "user",
- }
- ],
- model_id="model_id",
- )
-
- assert response.is_closed is True
- assert response.http_request.headers.get("X-Stainless-Lang") == "python"
- inference = response.parse()
- assert_matches_type(ChatCompletionResponse, inference, path=["response"])
-
- @parametrize
- def test_streaming_response_chat_completion_overload_1(self, client: LlamaStackClient) -> None:
- with pytest.warns(DeprecationWarning):
- with client.inference.with_streaming_response.chat_completion(
- messages=[
- {
- "content": "string",
- "role": "user",
- }
- ],
- model_id="model_id",
- ) as response:
- assert not response.is_closed
- assert response.http_request.headers.get("X-Stainless-Lang") == "python"
-
- inference = response.parse()
- assert_matches_type(ChatCompletionResponse, inference, path=["response"])
-
- assert cast(Any, response.is_closed) is True
-
- @parametrize
- def test_method_chat_completion_overload_2(self, client: LlamaStackClient) -> None:
- with pytest.warns(DeprecationWarning):
- inference_stream = client.inference.chat_completion(
- messages=[
- {
- "content": "string",
- "role": "user",
- }
- ],
- model_id="model_id",
- stream=True,
- )
-
- inference_stream.response.close()
-
- @parametrize
- def test_method_chat_completion_with_all_params_overload_2(self, client: LlamaStackClient) -> None:
- with pytest.warns(DeprecationWarning):
- inference_stream = client.inference.chat_completion(
- messages=[
- {
- "content": "string",
- "role": "user",
- "context": "string",
- }
- ],
- model_id="model_id",
- stream=True,
- logprobs={"top_k": 0},
- response_format={
- "json_schema": {"foo": True},
- "type": "json_schema",
- },
- sampling_params={
- "strategy": {"type": "greedy"},
- "max_tokens": 0,
- "repetition_penalty": 0,
- "stop": ["string"],
- },
- tool_choice="auto",
- tool_config={
- "system_message_behavior": "append",
- "tool_choice": "auto",
- "tool_prompt_format": "json",
- },
- tool_prompt_format="json",
- tools=[
- {
- "tool_name": "brave_search",
- "description": "description",
- "parameters": {
- "foo": {
- "param_type": "param_type",
- "default": True,
- "description": "description",
- "required": True,
- }
- },
- }
- ],
- )
-
- inference_stream.response.close()
-
- @parametrize
- def test_raw_response_chat_completion_overload_2(self, client: LlamaStackClient) -> None:
- with pytest.warns(DeprecationWarning):
- response = client.inference.with_raw_response.chat_completion(
- messages=[
- {
- "content": "string",
- "role": "user",
- }
- ],
- model_id="model_id",
- stream=True,
- )
-
- assert response.http_request.headers.get("X-Stainless-Lang") == "python"
- stream = response.parse()
- stream.close()
-
- @parametrize
- def test_streaming_response_chat_completion_overload_2(self, client: LlamaStackClient) -> None:
- with pytest.warns(DeprecationWarning):
- with client.inference.with_streaming_response.chat_completion(
- messages=[
- {
- "content": "string",
- "role": "user",
- }
- ],
- model_id="model_id",
- stream=True,
- ) as response:
- assert not response.is_closed
- assert response.http_request.headers.get("X-Stainless-Lang") == "python"
-
- stream = response.parse()
- stream.close()
-
- assert cast(Any, response.is_closed) is True
-
- @parametrize
- def test_method_completion_overload_1(self, client: LlamaStackClient) -> None:
- with pytest.warns(DeprecationWarning):
- inference = client.inference.completion(
- content="string",
- model_id="model_id",
- )
-
- assert_matches_type(CompletionResponse, inference, path=["response"])
-
- @parametrize
- def test_method_completion_with_all_params_overload_1(self, client: LlamaStackClient) -> None:
- with pytest.warns(DeprecationWarning):
- inference = client.inference.completion(
- content="string",
- model_id="model_id",
- logprobs={"top_k": 0},
- response_format={
- "json_schema": {"foo": True},
- "type": "json_schema",
- },
- sampling_params={
- "strategy": {"type": "greedy"},
- "max_tokens": 0,
- "repetition_penalty": 0,
- "stop": ["string"],
- },
- stream=False,
- )
-
- assert_matches_type(CompletionResponse, inference, path=["response"])
-
- @parametrize
- def test_raw_response_completion_overload_1(self, client: LlamaStackClient) -> None:
- with pytest.warns(DeprecationWarning):
- response = client.inference.with_raw_response.completion(
- content="string",
- model_id="model_id",
- )
-
- assert response.is_closed is True
- assert response.http_request.headers.get("X-Stainless-Lang") == "python"
- inference = response.parse()
- assert_matches_type(CompletionResponse, inference, path=["response"])
-
- @parametrize
- def test_streaming_response_completion_overload_1(self, client: LlamaStackClient) -> None:
- with pytest.warns(DeprecationWarning):
- with client.inference.with_streaming_response.completion(
- content="string",
- model_id="model_id",
- ) as response:
- assert not response.is_closed
- assert response.http_request.headers.get("X-Stainless-Lang") == "python"
-
- inference = response.parse()
- assert_matches_type(CompletionResponse, inference, path=["response"])
-
- assert cast(Any, response.is_closed) is True
-
- @parametrize
- def test_method_completion_overload_2(self, client: LlamaStackClient) -> None:
- with pytest.warns(DeprecationWarning):
- inference_stream = client.inference.completion(
- content="string",
- model_id="model_id",
- stream=True,
- )
-
- inference_stream.response.close()
-
- @parametrize
- def test_method_completion_with_all_params_overload_2(self, client: LlamaStackClient) -> None:
- with pytest.warns(DeprecationWarning):
- inference_stream = client.inference.completion(
- content="string",
- model_id="model_id",
- stream=True,
- logprobs={"top_k": 0},
- response_format={
- "json_schema": {"foo": True},
- "type": "json_schema",
- },
- sampling_params={
- "strategy": {"type": "greedy"},
- "max_tokens": 0,
- "repetition_penalty": 0,
- "stop": ["string"],
- },
- )
-
- inference_stream.response.close()
-
- @parametrize
- def test_raw_response_completion_overload_2(self, client: LlamaStackClient) -> None:
- with pytest.warns(DeprecationWarning):
- response = client.inference.with_raw_response.completion(
- content="string",
- model_id="model_id",
- stream=True,
- )
-
- assert response.http_request.headers.get("X-Stainless-Lang") == "python"
- stream = response.parse()
- stream.close()
-
- @parametrize
- def test_streaming_response_completion_overload_2(self, client: LlamaStackClient) -> None:
- with pytest.warns(DeprecationWarning):
- with client.inference.with_streaming_response.completion(
- content="string",
- model_id="model_id",
- stream=True,
- ) as response:
- assert not response.is_closed
- assert response.http_request.headers.get("X-Stainless-Lang") == "python"
-
- stream = response.parse()
- stream.close()
-
- assert cast(Any, response.is_closed) is True
-
- @parametrize
- def test_method_embeddings(self, client: LlamaStackClient) -> None:
- with pytest.warns(DeprecationWarning):
- inference = client.inference.embeddings(
- contents=["string"],
- model_id="model_id",
- )
-
- assert_matches_type(EmbeddingsResponse, inference, path=["response"])
-
- @parametrize
- def test_method_embeddings_with_all_params(self, client: LlamaStackClient) -> None:
- with pytest.warns(DeprecationWarning):
- inference = client.inference.embeddings(
- contents=["string"],
- model_id="model_id",
- output_dimension=0,
- task_type="query",
- text_truncation="none",
- )
-
- assert_matches_type(EmbeddingsResponse, inference, path=["response"])
-
- @parametrize
- def test_raw_response_embeddings(self, client: LlamaStackClient) -> None:
- with pytest.warns(DeprecationWarning):
- response = client.inference.with_raw_response.embeddings(
- contents=["string"],
- model_id="model_id",
- )
-
- assert response.is_closed is True
- assert response.http_request.headers.get("X-Stainless-Lang") == "python"
- inference = response.parse()
- assert_matches_type(EmbeddingsResponse, inference, path=["response"])
-
- @parametrize
- def test_streaming_response_embeddings(self, client: LlamaStackClient) -> None:
- with pytest.warns(DeprecationWarning):
- with client.inference.with_streaming_response.embeddings(
- contents=["string"],
- model_id="model_id",
- ) as response:
- assert not response.is_closed
- assert response.http_request.headers.get("X-Stainless-Lang") == "python"
-
- inference = response.parse()
- assert_matches_type(EmbeddingsResponse, inference, path=["response"])
-
- assert cast(Any, response.is_closed) is True
-
@parametrize
def test_method_rerank(self, client: LlamaStackClient) -> None:
inference = client.inference.rerank(
@@ -611,539 +70,6 @@ class TestAsyncInference:
"async_client", [False, True, {"http_client": "aiohttp"}], indirect=True, ids=["loose", "strict", "aiohttp"]
)
- @parametrize
- async def test_method_batch_chat_completion(self, async_client: AsyncLlamaStackClient) -> None:
- inference = await async_client.inference.batch_chat_completion(
- messages_batch=[
- [
- {
- "content": "string",
- "role": "user",
- }
- ]
- ],
- model_id="model_id",
- )
- assert_matches_type(InferenceBatchChatCompletionResponse, inference, path=["response"])
-
- @parametrize
- async def test_method_batch_chat_completion_with_all_params(self, async_client: AsyncLlamaStackClient) -> None:
- inference = await async_client.inference.batch_chat_completion(
- messages_batch=[
- [
- {
- "content": "string",
- "role": "user",
- "context": "string",
- }
- ]
- ],
- model_id="model_id",
- logprobs={"top_k": 0},
- response_format={
- "json_schema": {"foo": True},
- "type": "json_schema",
- },
- sampling_params={
- "strategy": {"type": "greedy"},
- "max_tokens": 0,
- "repetition_penalty": 0,
- "stop": ["string"],
- },
- tool_config={
- "system_message_behavior": "append",
- "tool_choice": "auto",
- "tool_prompt_format": "json",
- },
- tools=[
- {
- "tool_name": "brave_search",
- "description": "description",
- "parameters": {
- "foo": {
- "param_type": "param_type",
- "default": True,
- "description": "description",
- "required": True,
- }
- },
- }
- ],
- )
- assert_matches_type(InferenceBatchChatCompletionResponse, inference, path=["response"])
-
- @parametrize
- async def test_raw_response_batch_chat_completion(self, async_client: AsyncLlamaStackClient) -> None:
- response = await async_client.inference.with_raw_response.batch_chat_completion(
- messages_batch=[
- [
- {
- "content": "string",
- "role": "user",
- }
- ]
- ],
- model_id="model_id",
- )
-
- assert response.is_closed is True
- assert response.http_request.headers.get("X-Stainless-Lang") == "python"
- inference = await response.parse()
- assert_matches_type(InferenceBatchChatCompletionResponse, inference, path=["response"])
-
- @parametrize
- async def test_streaming_response_batch_chat_completion(self, async_client: AsyncLlamaStackClient) -> None:
- async with async_client.inference.with_streaming_response.batch_chat_completion(
- messages_batch=[
- [
- {
- "content": "string",
- "role": "user",
- }
- ]
- ],
- model_id="model_id",
- ) as response:
- assert not response.is_closed
- assert response.http_request.headers.get("X-Stainless-Lang") == "python"
-
- inference = await response.parse()
- assert_matches_type(InferenceBatchChatCompletionResponse, inference, path=["response"])
-
- assert cast(Any, response.is_closed) is True
-
- @parametrize
- async def test_method_batch_completion(self, async_client: AsyncLlamaStackClient) -> None:
- inference = await async_client.inference.batch_completion(
- content_batch=["string"],
- model_id="model_id",
- )
- assert_matches_type(BatchCompletion, inference, path=["response"])
-
- @parametrize
- async def test_method_batch_completion_with_all_params(self, async_client: AsyncLlamaStackClient) -> None:
- inference = await async_client.inference.batch_completion(
- content_batch=["string"],
- model_id="model_id",
- logprobs={"top_k": 0},
- response_format={
- "json_schema": {"foo": True},
- "type": "json_schema",
- },
- sampling_params={
- "strategy": {"type": "greedy"},
- "max_tokens": 0,
- "repetition_penalty": 0,
- "stop": ["string"],
- },
- )
- assert_matches_type(BatchCompletion, inference, path=["response"])
-
- @parametrize
- async def test_raw_response_batch_completion(self, async_client: AsyncLlamaStackClient) -> None:
- response = await async_client.inference.with_raw_response.batch_completion(
- content_batch=["string"],
- model_id="model_id",
- )
-
- assert response.is_closed is True
- assert response.http_request.headers.get("X-Stainless-Lang") == "python"
- inference = await response.parse()
- assert_matches_type(BatchCompletion, inference, path=["response"])
-
- @parametrize
- async def test_streaming_response_batch_completion(self, async_client: AsyncLlamaStackClient) -> None:
- async with async_client.inference.with_streaming_response.batch_completion(
- content_batch=["string"],
- model_id="model_id",
- ) as response:
- assert not response.is_closed
- assert response.http_request.headers.get("X-Stainless-Lang") == "python"
-
- inference = await response.parse()
- assert_matches_type(BatchCompletion, inference, path=["response"])
-
- assert cast(Any, response.is_closed) is True
-
- @parametrize
- async def test_method_chat_completion_overload_1(self, async_client: AsyncLlamaStackClient) -> None:
- with pytest.warns(DeprecationWarning):
- inference = await async_client.inference.chat_completion(
- messages=[
- {
- "content": "string",
- "role": "user",
- }
- ],
- model_id="model_id",
- )
-
- assert_matches_type(ChatCompletionResponse, inference, path=["response"])
-
- @parametrize
- async def test_method_chat_completion_with_all_params_overload_1(self, async_client: AsyncLlamaStackClient) -> None:
- with pytest.warns(DeprecationWarning):
- inference = await async_client.inference.chat_completion(
- messages=[
- {
- "content": "string",
- "role": "user",
- "context": "string",
- }
- ],
- model_id="model_id",
- logprobs={"top_k": 0},
- response_format={
- "json_schema": {"foo": True},
- "type": "json_schema",
- },
- sampling_params={
- "strategy": {"type": "greedy"},
- "max_tokens": 0,
- "repetition_penalty": 0,
- "stop": ["string"],
- },
- stream=False,
- tool_choice="auto",
- tool_config={
- "system_message_behavior": "append",
- "tool_choice": "auto",
- "tool_prompt_format": "json",
- },
- tool_prompt_format="json",
- tools=[
- {
- "tool_name": "brave_search",
- "description": "description",
- "parameters": {
- "foo": {
- "param_type": "param_type",
- "default": True,
- "description": "description",
- "required": True,
- }
- },
- }
- ],
- )
-
- assert_matches_type(ChatCompletionResponse, inference, path=["response"])
-
- @parametrize
- async def test_raw_response_chat_completion_overload_1(self, async_client: AsyncLlamaStackClient) -> None:
- with pytest.warns(DeprecationWarning):
- response = await async_client.inference.with_raw_response.chat_completion(
- messages=[
- {
- "content": "string",
- "role": "user",
- }
- ],
- model_id="model_id",
- )
-
- assert response.is_closed is True
- assert response.http_request.headers.get("X-Stainless-Lang") == "python"
- inference = await response.parse()
- assert_matches_type(ChatCompletionResponse, inference, path=["response"])
-
- @parametrize
- async def test_streaming_response_chat_completion_overload_1(self, async_client: AsyncLlamaStackClient) -> None:
- with pytest.warns(DeprecationWarning):
- async with async_client.inference.with_streaming_response.chat_completion(
- messages=[
- {
- "content": "string",
- "role": "user",
- }
- ],
- model_id="model_id",
- ) as response:
- assert not response.is_closed
- assert response.http_request.headers.get("X-Stainless-Lang") == "python"
-
- inference = await response.parse()
- assert_matches_type(ChatCompletionResponse, inference, path=["response"])
-
- assert cast(Any, response.is_closed) is True
-
- @parametrize
- async def test_method_chat_completion_overload_2(self, async_client: AsyncLlamaStackClient) -> None:
- with pytest.warns(DeprecationWarning):
- inference_stream = await async_client.inference.chat_completion(
- messages=[
- {
- "content": "string",
- "role": "user",
- }
- ],
- model_id="model_id",
- stream=True,
- )
-
- await inference_stream.response.aclose()
-
- @parametrize
- async def test_method_chat_completion_with_all_params_overload_2(self, async_client: AsyncLlamaStackClient) -> None:
- with pytest.warns(DeprecationWarning):
- inference_stream = await async_client.inference.chat_completion(
- messages=[
- {
- "content": "string",
- "role": "user",
- "context": "string",
- }
- ],
- model_id="model_id",
- stream=True,
- logprobs={"top_k": 0},
- response_format={
- "json_schema": {"foo": True},
- "type": "json_schema",
- },
- sampling_params={
- "strategy": {"type": "greedy"},
- "max_tokens": 0,
- "repetition_penalty": 0,
- "stop": ["string"],
- },
- tool_choice="auto",
- tool_config={
- "system_message_behavior": "append",
- "tool_choice": "auto",
- "tool_prompt_format": "json",
- },
- tool_prompt_format="json",
- tools=[
- {
- "tool_name": "brave_search",
- "description": "description",
- "parameters": {
- "foo": {
- "param_type": "param_type",
- "default": True,
- "description": "description",
- "required": True,
- }
- },
- }
- ],
- )
-
- await inference_stream.response.aclose()
-
- @parametrize
- async def test_raw_response_chat_completion_overload_2(self, async_client: AsyncLlamaStackClient) -> None:
- with pytest.warns(DeprecationWarning):
- response = await async_client.inference.with_raw_response.chat_completion(
- messages=[
- {
- "content": "string",
- "role": "user",
- }
- ],
- model_id="model_id",
- stream=True,
- )
-
- assert response.http_request.headers.get("X-Stainless-Lang") == "python"
- stream = await response.parse()
- await stream.close()
-
- @parametrize
- async def test_streaming_response_chat_completion_overload_2(self, async_client: AsyncLlamaStackClient) -> None:
- with pytest.warns(DeprecationWarning):
- async with async_client.inference.with_streaming_response.chat_completion(
- messages=[
- {
- "content": "string",
- "role": "user",
- }
- ],
- model_id="model_id",
- stream=True,
- ) as response:
- assert not response.is_closed
- assert response.http_request.headers.get("X-Stainless-Lang") == "python"
-
- stream = await response.parse()
- await stream.close()
-
- assert cast(Any, response.is_closed) is True
-
- @parametrize
- async def test_method_completion_overload_1(self, async_client: AsyncLlamaStackClient) -> None:
- with pytest.warns(DeprecationWarning):
- inference = await async_client.inference.completion(
- content="string",
- model_id="model_id",
- )
-
- assert_matches_type(CompletionResponse, inference, path=["response"])
-
- @parametrize
- async def test_method_completion_with_all_params_overload_1(self, async_client: AsyncLlamaStackClient) -> None:
- with pytest.warns(DeprecationWarning):
- inference = await async_client.inference.completion(
- content="string",
- model_id="model_id",
- logprobs={"top_k": 0},
- response_format={
- "json_schema": {"foo": True},
- "type": "json_schema",
- },
- sampling_params={
- "strategy": {"type": "greedy"},
- "max_tokens": 0,
- "repetition_penalty": 0,
- "stop": ["string"],
- },
- stream=False,
- )
-
- assert_matches_type(CompletionResponse, inference, path=["response"])
-
- @parametrize
- async def test_raw_response_completion_overload_1(self, async_client: AsyncLlamaStackClient) -> None:
- with pytest.warns(DeprecationWarning):
- response = await async_client.inference.with_raw_response.completion(
- content="string",
- model_id="model_id",
- )
-
- assert response.is_closed is True
- assert response.http_request.headers.get("X-Stainless-Lang") == "python"
- inference = await response.parse()
- assert_matches_type(CompletionResponse, inference, path=["response"])
-
- @parametrize
- async def test_streaming_response_completion_overload_1(self, async_client: AsyncLlamaStackClient) -> None:
- with pytest.warns(DeprecationWarning):
- async with async_client.inference.with_streaming_response.completion(
- content="string",
- model_id="model_id",
- ) as response:
- assert not response.is_closed
- assert response.http_request.headers.get("X-Stainless-Lang") == "python"
-
- inference = await response.parse()
- assert_matches_type(CompletionResponse, inference, path=["response"])
-
- assert cast(Any, response.is_closed) is True
-
- @parametrize
- async def test_method_completion_overload_2(self, async_client: AsyncLlamaStackClient) -> None:
- with pytest.warns(DeprecationWarning):
- inference_stream = await async_client.inference.completion(
- content="string",
- model_id="model_id",
- stream=True,
- )
-
- await inference_stream.response.aclose()
-
- @parametrize
- async def test_method_completion_with_all_params_overload_2(self, async_client: AsyncLlamaStackClient) -> None:
- with pytest.warns(DeprecationWarning):
- inference_stream = await async_client.inference.completion(
- content="string",
- model_id="model_id",
- stream=True,
- logprobs={"top_k": 0},
- response_format={
- "json_schema": {"foo": True},
- "type": "json_schema",
- },
- sampling_params={
- "strategy": {"type": "greedy"},
- "max_tokens": 0,
- "repetition_penalty": 0,
- "stop": ["string"],
- },
- )
-
- await inference_stream.response.aclose()
-
- @parametrize
- async def test_raw_response_completion_overload_2(self, async_client: AsyncLlamaStackClient) -> None:
- with pytest.warns(DeprecationWarning):
- response = await async_client.inference.with_raw_response.completion(
- content="string",
- model_id="model_id",
- stream=True,
- )
-
- assert response.http_request.headers.get("X-Stainless-Lang") == "python"
- stream = await response.parse()
- await stream.close()
-
- @parametrize
- async def test_streaming_response_completion_overload_2(self, async_client: AsyncLlamaStackClient) -> None:
- with pytest.warns(DeprecationWarning):
- async with async_client.inference.with_streaming_response.completion(
- content="string",
- model_id="model_id",
- stream=True,
- ) as response:
- assert not response.is_closed
- assert response.http_request.headers.get("X-Stainless-Lang") == "python"
-
- stream = await response.parse()
- await stream.close()
-
- assert cast(Any, response.is_closed) is True
-
- @parametrize
- async def test_method_embeddings(self, async_client: AsyncLlamaStackClient) -> None:
- with pytest.warns(DeprecationWarning):
- inference = await async_client.inference.embeddings(
- contents=["string"],
- model_id="model_id",
- )
-
- assert_matches_type(EmbeddingsResponse, inference, path=["response"])
-
- @parametrize
- async def test_method_embeddings_with_all_params(self, async_client: AsyncLlamaStackClient) -> None:
- with pytest.warns(DeprecationWarning):
- inference = await async_client.inference.embeddings(
- contents=["string"],
- model_id="model_id",
- output_dimension=0,
- task_type="query",
- text_truncation="none",
- )
-
- assert_matches_type(EmbeddingsResponse, inference, path=["response"])
-
- @parametrize
- async def test_raw_response_embeddings(self, async_client: AsyncLlamaStackClient) -> None:
- with pytest.warns(DeprecationWarning):
- response = await async_client.inference.with_raw_response.embeddings(
- contents=["string"],
- model_id="model_id",
- )
-
- assert response.is_closed is True
- assert response.http_request.headers.get("X-Stainless-Lang") == "python"
- inference = await response.parse()
- assert_matches_type(EmbeddingsResponse, inference, path=["response"])
-
- @parametrize
- async def test_streaming_response_embeddings(self, async_client: AsyncLlamaStackClient) -> None:
- with pytest.warns(DeprecationWarning):
- async with async_client.inference.with_streaming_response.embeddings(
- contents=["string"],
- model_id="model_id",
- ) as response:
- assert not response.is_closed
- assert response.http_request.headers.get("X-Stainless-Lang") == "python"
-
- inference = await response.parse()
- assert_matches_type(EmbeddingsResponse, inference, path=["response"])
-
- assert cast(Any, response.is_closed) is True
-
@parametrize
async def test_method_rerank(self, async_client: AsyncLlamaStackClient) -> None:
inference = await async_client.inference.rerank(
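Note for reviewers: the deleted blocks above drop the sync and async test coverage for the deprecated `inference.batch_chat_completion`, `inference.batch_completion`, `inference.chat_completion`, `inference.completion`, and `inference.embeddings` methods. A minimal migration sketch for the chat path, mirroring the replacement calls exercised in the `tests/test_client.py` changes below (the model identifier is a placeholder, not a real model name):

```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient()

# Deprecated and removed in this change:
# client.inference.chat_completion(
#     messages=[{"content": "string", "role": "user"}],
#     model_id="model_id",
# )

# OpenAI-compatible replacement exercised by the updated tests:
completion = client.chat.completions.create(
    messages=[{"content": "string", "role": "user"}],
    model="model",  # placeholder model identifier
)
```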
diff --git a/tests/test_client.py b/tests/test_client.py
index a5bce12c..708c7420 100644
--- a/tests/test_client.py
+++ b/tests/test_client.py
@@ -678,17 +678,17 @@ def test_parse_retry_after_header(self, remaining_retries: int, retry_after: str
@mock.patch("llama_stack_client._base_client.BaseClient._calculate_retry_timeout", _low_retry_timeout)
@pytest.mark.respx(base_url=base_url)
def test_retrying_timeout_errors_doesnt_leak(self, respx_mock: MockRouter, client: LlamaStackClient) -> None:
- respx_mock.post("/v1/inference/chat-completion").mock(side_effect=httpx.TimeoutException("Test timeout error"))
+ respx_mock.post("/v1/chat/completions").mock(side_effect=httpx.TimeoutException("Test timeout error"))
with pytest.raises(APITimeoutError):
- client.inference.with_streaming_response.chat_completion(
+ client.chat.completions.with_streaming_response.create(
messages=[
{
"content": "string",
"role": "user",
}
],
- model_id="model_id",
+ model="model",
).__enter__()
assert _get_open_connections(self.client) == 0
@@ -696,17 +696,17 @@ def test_retrying_timeout_errors_doesnt_leak(self, respx_mock: MockRouter, clien
@mock.patch("llama_stack_client._base_client.BaseClient._calculate_retry_timeout", _low_retry_timeout)
@pytest.mark.respx(base_url=base_url)
def test_retrying_status_errors_doesnt_leak(self, respx_mock: MockRouter, client: LlamaStackClient) -> None:
- respx_mock.post("/v1/inference/chat-completion").mock(return_value=httpx.Response(500))
+ respx_mock.post("/v1/chat/completions").mock(return_value=httpx.Response(500))
with pytest.raises(APIStatusError):
- client.inference.with_streaming_response.chat_completion(
+ client.chat.completions.with_streaming_response.create(
messages=[
{
"content": "string",
"role": "user",
}
],
- model_id="model_id",
+ model="model",
).__enter__()
assert _get_open_connections(self.client) == 0
@@ -734,16 +734,16 @@ def retry_handler(_request: httpx.Request) -> httpx.Response:
return httpx.Response(500)
return httpx.Response(200)
- respx_mock.post("/v1/inference/chat-completion").mock(side_effect=retry_handler)
+ respx_mock.post("/v1/chat/completions").mock(side_effect=retry_handler)
- response = client.inference.with_raw_response.chat_completion(
+ response = client.chat.completions.with_raw_response.create(
messages=[
{
"content": "string",
"role": "user",
}
],
- model_id="model_id",
+ model="model",
)
assert response.retries_taken == failures_before_success
@@ -766,16 +766,16 @@ def retry_handler(_request: httpx.Request) -> httpx.Response:
return httpx.Response(500)
return httpx.Response(200)
- respx_mock.post("/v1/inference/chat-completion").mock(side_effect=retry_handler)
+ respx_mock.post("/v1/chat/completions").mock(side_effect=retry_handler)
- response = client.inference.with_raw_response.chat_completion(
+ response = client.chat.completions.with_raw_response.create(
messages=[
{
"content": "string",
"role": "user",
}
],
- model_id="model_id",
+ model="model",
extra_headers={"x-stainless-retry-count": Omit()},
)
@@ -798,16 +798,16 @@ def retry_handler(_request: httpx.Request) -> httpx.Response:
return httpx.Response(500)
return httpx.Response(200)
- respx_mock.post("/v1/inference/chat-completion").mock(side_effect=retry_handler)
+ respx_mock.post("/v1/chat/completions").mock(side_effect=retry_handler)
- response = client.inference.with_raw_response.chat_completion(
+ response = client.chat.completions.with_raw_response.create(
messages=[
{
"content": "string",
"role": "user",
}
],
- model_id="model_id",
+ model="model",
extra_headers={"x-stainless-retry-count": "42"},
)
@@ -1498,17 +1498,17 @@ async def test_parse_retry_after_header(self, remaining_retries: int, retry_afte
async def test_retrying_timeout_errors_doesnt_leak(
self, respx_mock: MockRouter, async_client: AsyncLlamaStackClient
) -> None:
- respx_mock.post("/v1/inference/chat-completion").mock(side_effect=httpx.TimeoutException("Test timeout error"))
+ respx_mock.post("/v1/chat/completions").mock(side_effect=httpx.TimeoutException("Test timeout error"))
with pytest.raises(APITimeoutError):
- await async_client.inference.with_streaming_response.chat_completion(
+ await async_client.chat.completions.with_streaming_response.create(
messages=[
{
"content": "string",
"role": "user",
}
],
- model_id="model_id",
+ model="model",
).__aenter__()
assert _get_open_connections(self.client) == 0
@@ -1518,17 +1518,17 @@ async def test_retrying_timeout_errors_doesnt_leak(
async def test_retrying_status_errors_doesnt_leak(
self, respx_mock: MockRouter, async_client: AsyncLlamaStackClient
) -> None:
- respx_mock.post("/v1/inference/chat-completion").mock(return_value=httpx.Response(500))
+ respx_mock.post("/v1/chat/completions").mock(return_value=httpx.Response(500))
with pytest.raises(APIStatusError):
- await async_client.inference.with_streaming_response.chat_completion(
+ await async_client.chat.completions.with_streaming_response.create(
messages=[
{
"content": "string",
"role": "user",
}
],
- model_id="model_id",
+ model="model",
).__aenter__()
assert _get_open_connections(self.client) == 0
@@ -1557,16 +1557,16 @@ def retry_handler(_request: httpx.Request) -> httpx.Response:
return httpx.Response(500)
return httpx.Response(200)
- respx_mock.post("/v1/inference/chat-completion").mock(side_effect=retry_handler)
+ respx_mock.post("/v1/chat/completions").mock(side_effect=retry_handler)
- response = await client.inference.with_raw_response.chat_completion(
+ response = await client.chat.completions.with_raw_response.create(
messages=[
{
"content": "string",
"role": "user",
}
],
- model_id="model_id",
+ model="model",
)
assert response.retries_taken == failures_before_success
@@ -1590,16 +1590,16 @@ def retry_handler(_request: httpx.Request) -> httpx.Response:
return httpx.Response(500)
return httpx.Response(200)
- respx_mock.post("/v1/inference/chat-completion").mock(side_effect=retry_handler)
+ respx_mock.post("/v1/chat/completions").mock(side_effect=retry_handler)
- response = await client.inference.with_raw_response.chat_completion(
+ response = await client.chat.completions.with_raw_response.create(
messages=[
{
"content": "string",
"role": "user",
}
],
- model_id="model_id",
+ model="model",
extra_headers={"x-stainless-retry-count": Omit()},
)
@@ -1623,16 +1623,16 @@ def retry_handler(_request: httpx.Request) -> httpx.Response:
return httpx.Response(500)
return httpx.Response(200)
- respx_mock.post("/v1/inference/chat-completion").mock(side_effect=retry_handler)
+ respx_mock.post("/v1/chat/completions").mock(side_effect=retry_handler)
- response = await client.inference.with_raw_response.chat_completion(
+ response = await client.chat.completions.with_raw_response.create(
messages=[
{
"content": "string",
"role": "user",
}
],
- model_id="model_id",
+ model="model",
extra_headers={"x-stainless-retry-count": "42"},
)
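The mocked route in these retry tests moves from `/v1/inference/chat-completion` to `/v1/chat/completions`. For context, a standalone sketch of the same mocking pattern outside the repository's fixtures, assuming respx's decorator form, a local base URL, and `max_retries=0` to keep the assertion single-shot (all assumptions, not part of this diff):

```python
import httpx
import pytest
import respx

from llama_stack_client import APIStatusError, LlamaStackClient

BASE_URL = "http://127.0.0.1:4010"  # assumed local test server address


@respx.mock(base_url=BASE_URL)
def test_chat_completions_500_raises(respx_mock: respx.MockRouter) -> None:
    # Mock the renamed OpenAI-compatible route and force a server error.
    respx_mock.post("/v1/chat/completions").mock(return_value=httpx.Response(500))

    client = LlamaStackClient(base_url=BASE_URL, max_retries=0)
    with pytest.raises(APIStatusError):
        client.chat.completions.create(
            messages=[{"content": "string", "role": "user"}],
            model="model",  # placeholder model identifier
        )
```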