v0.1.4 - Sync updates from stainless branch: yanxi0830/dev (#164)

yanxi0830 · web-flow · commit bb3fa1752ab2 · 2025-02-21T13:52:18.000-08:00
# What does this PR do?
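- As the title says: syncs the Stainless-generated client updates from branch `yanxi0830/dev`.
  This adds docstrings for `turn.resume`, new `embeddings` parameters
  (`output_dimension`, `task_type`, `text_truncation`), and `metadata` fields on
  `QueryResult` and `ToolInvocationResult`.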

[//]: # (If resolving an issue, uncomment and update the line below)
[//]: # (Closes #[issue-number])

## Test Plan
```
LLAMA_STACK_CONFIG=fireworks pytest -v tests/client-sdk/agents/test_agents.py --inference-model meta-llama/Llama-3.1-8B-Instruct
```

[//]: # (## Documentation)
[//]: # (- [ ] Added a Changelog entry if the change is significant)
diff --git a/src/llama_stack_client/_client.py b/src/llama_stack_client/_client.py
@@ -135,7 +135,7 @@ def __init__(
         if base_url is None:
             base_url = os.environ.get("LLAMA_STACK_BASE_URL")
         if base_url is None:
-            base_url = "http://any-hosted-llama-stack.com"
+            base_url = f"http://any-hosted-llama-stack.com"
 
         custom_headers = default_headers or {}
         custom_headers["X-LlamaStack-Client-Version"] = __version__
@@ -351,7 +351,7 @@ def __init__(
         if base_url is None:
             base_url = os.environ.get("LLAMA_STACK_BASE_URL")
         if base_url is None:
-            base_url = "http://any-hosted-llama-stack.com"
+            base_url = f"http://any-hosted-llama-stack.com"
 
         custom_headers = default_headers or {}
         custom_headers["X-LlamaStack-Client-Version"] = __version__
diff --git a/src/llama_stack_client/_files.py b/src/llama_stack_client/_files.py
@@ -71,7 +71,7 @@ def _transform_file(file: FileTypes) -> HttpxFileTypes:
     if is_tuple_t(file):
         return (file[0], _read_file_content(file[1]), *file[2:])
 
-    raise TypeError("Expected file types input to be a FileContent type or to be a tuple")
+    raise TypeError(f"Expected file types input to be a FileContent type or to be a tuple")
 
 
 def _read_file_content(file: FileContent) -> HttpxFileContent:
@@ -113,7 +113,7 @@ async def _async_transform_file(file: FileTypes) -> HttpxFileTypes:
     if is_tuple_t(file):
         return (file[0], await _async_read_file_content(file[1]), *file[2:])
 
-    raise TypeError("Expected file types input to be a FileContent type or to be a tuple")
+    raise TypeError(f"Expected file types input to be a FileContent type or to be a tuple")
 
 
 async def _async_read_file_content(file: FileContent) -> HttpxFileContent:
diff --git a/src/llama_stack_client/_response.py b/src/llama_stack_client/_response.py
@@ -229,7 +229,7 @@ def _parse(self, *, to: type[_T] | None = None) -> R | _T:
             # the response class ourselves but that is something that should be supported directly in httpx
             # as it would be easy to incorrectly construct the Response object due to the multitude of arguments.
             if cast_to != httpx.Response:
-                raise ValueError("Subclasses of httpx.Response cannot be passed to `cast_to`")
+                raise ValueError(f"Subclasses of httpx.Response cannot be passed to `cast_to`")
             return cast(R, response)
 
         if (
@@ -245,9 +245,9 @@ def _parse(self, *, to: type[_T] | None = None) -> R | _T:
 
         if (
             cast_to is not object
-            and origin is not list
-            and origin is not dict
-            and origin is not Union
+            and not origin is list
+            and not origin is dict
+            and not origin is Union
             and not issubclass(origin, BaseModel)
         ):
             raise RuntimeError(
diff --git a/src/llama_stack_client/resources/agents/turn.py b/src/llama_stack_client/resources/agents/turn.py
@@ -247,8 +247,18 @@ def resume(
         extra_body: Body | None = None,
         timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
     ) -> Turn:
-        """
+        """Resume an agent turn with executed tool call responses.
+
+        When a Turn has the
+        status `awaiting_input` due to pending input from client side tool calls, this
+        endpoint can be used to submit the outputs from the tool calls once they are
+        ready.
+
         Args:
+          tool_responses: The tool call responses to resume the turn with.
+
+          stream: Whether to stream the response.
+
           extra_headers: Send extra headers
 
           extra_query: Add additional query parameters to the request
@@ -275,8 +285,18 @@ def resume(
         extra_body: Body | None = None,
         timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
     ) -> Stream[AgentTurnResponseStreamChunk]:
-        """
+        """Resume an agent turn with executed tool call responses.
+
+        When a Turn has the
+        status `awaiting_input` due to pending input from client side tool calls, this
+        endpoint can be used to submit the outputs from the tool calls once they are
+        ready.
+
         Args:
+          stream: Whether to stream the response.
+
+          tool_responses: The tool call responses to resume the turn with.
+
           extra_headers: Send extra headers
 
           extra_query: Add additional query parameters to the request
@@ -303,8 +323,18 @@ def resume(
         extra_body: Body | None = None,
         timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
     ) -> Turn | Stream[AgentTurnResponseStreamChunk]:
-        """
+        """Resume an agent turn with executed tool call responses.
+
+        When a Turn has the
+        status `awaiting_input` due to pending input from client side tool calls, this
+        endpoint can be used to submit the outputs from the tool calls once they are
+        ready.
+
         Args:
+          stream: Whether to stream the response.
+
+          tool_responses: The tool call responses to resume the turn with.
+
           extra_headers: Send extra headers
 
           extra_query: Add additional query parameters to the request
@@ -571,8 +601,18 @@ async def resume(
         extra_body: Body | None = None,
         timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
     ) -> Turn:
-        """
+        """Resume an agent turn with executed tool call responses.
+
+        When a Turn has the
+        status `awaiting_input` due to pending input from client side tool calls, this
+        endpoint can be used to submit the outputs from the tool calls once they are
+        ready.
+
         Args:
+          tool_responses: The tool call responses to resume the turn with.
+
+          stream: Whether to stream the response.
+
           extra_headers: Send extra headers
 
           extra_query: Add additional query parameters to the request
@@ -599,8 +639,18 @@ async def resume(
         extra_body: Body | None = None,
         timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
     ) -> AsyncStream[AgentTurnResponseStreamChunk]:
-        """
+        """Resume an agent turn with executed tool call responses.
+
+        When a Turn has the
+        status `awaiting_input` due to pending input from client side tool calls, this
+        endpoint can be used to submit the outputs from the tool calls once they are
+        ready.
+
         Args:
+          stream: Whether to stream the response.
+
+          tool_responses: The tool call responses to resume the turn with.
+
           extra_headers: Send extra headers
 
           extra_query: Add additional query parameters to the request
@@ -627,8 +677,18 @@ async def resume(
         extra_body: Body | None = None,
         timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
     ) -> Turn | AsyncStream[AgentTurnResponseStreamChunk]:
-        """
+        """Resume an agent turn with executed tool call responses.
+
+        When a Turn has the
+        status `awaiting_input` due to pending input from client side tool calls, this
+        endpoint can be used to submit the outputs from the tool calls once they are
+        ready.
+
         Args:
+          stream: Whether to stream the response.
+
+          tool_responses: The tool call responses to resume the turn with.
+
           extra_headers: Send extra headers
 
           extra_query: Add additional query parameters to the request
diff --git a/src/llama_stack_client/resources/inference.py b/src/llama_stack_client/resources/inference.py
@@ -2,7 +2,7 @@
 
 from __future__ import annotations
 
-from typing import List, Iterable
+from typing import List, Union, Iterable
 from typing_extensions import Literal, overload
 
 import httpx
@@ -36,6 +36,7 @@
 from ..types.shared.chat_completion_response import ChatCompletionResponse
 from ..types.shared_params.interleaved_content import InterleavedContent
 from ..types.chat_completion_response_stream_chunk import ChatCompletionResponseStreamChunk
+from ..types.shared_params.interleaved_content_item import InterleavedContentItem
 
 __all__ = ["InferenceResource", "AsyncInferenceResource"]
 
@@ -493,8 +494,11 @@ def completion(
     def embeddings(
         self,
         *,
-        contents: List[InterleavedContent],
+        contents: Union[List[str], Iterable[InterleavedContentItem]],
         model_id: str,
+        output_dimension: int | NotGiven = NOT_GIVEN,
+        task_type: Literal["query", "document"] | NotGiven = NOT_GIVEN,
+        text_truncation: Literal["none", "start", "end"] | NotGiven = NOT_GIVEN,
         # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
         # The extra values given here take precedence over values defined on the client or passed to this method.
         extra_headers: Headers | None = None,
@@ -506,13 +510,22 @@ def embeddings(
         Generate embeddings for content pieces using the specified model.
 
         Args:
-          contents: List of contents to generate embeddings for. Note that content can be
-              multimodal. The behavior depends on the model and provider. Some models may only
-              support text.
+          contents: List of contents to generate embeddings for. Each content can be a string or an
+              InterleavedContentItem (and hence can be multimodal). The behavior depends on
+              the model and provider. Some models may only support text.
 
           model_id: The identifier of the model to use. The model must be an embedding model
               registered with Llama Stack and available via the /models endpoint.
 
+          output_dimension: (Optional) Output dimensionality for the embeddings. Only supported by
+              Matryoshka models.
+
+          task_type: (Optional) How is the embedding being used? This is only supported by asymmetric
+              embedding models.
+
+          text_truncation: (Optional) Config for how to truncate text for embedding when text is longer
+              than the model's max sequence length.
+
           extra_headers: Send extra headers
 
           extra_query: Add additional query parameters to the request
@@ -527,6 +540,9 @@ def embeddings(
                 {
                     "contents": contents,
                     "model_id": model_id,
+                    "output_dimension": output_dimension,
+                    "task_type": task_type,
+                    "text_truncation": text_truncation,
                 },
                 inference_embeddings_params.InferenceEmbeddingsParams,
             ),
@@ -990,8 +1006,11 @@ async def completion(
     async def embeddings(
         self,
         *,
-        contents: List[InterleavedContent],
+        contents: Union[List[str], Iterable[InterleavedContentItem]],
         model_id: str,
+        output_dimension: int | NotGiven = NOT_GIVEN,
+        task_type: Literal["query", "document"] | NotGiven = NOT_GIVEN,
+        text_truncation: Literal["none", "start", "end"] | NotGiven = NOT_GIVEN,
         # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
         # The extra values given here take precedence over values defined on the client or passed to this method.
         extra_headers: Headers | None = None,
@@ -1003,13 +1022,22 @@ async def embeddings(
         Generate embeddings for content pieces using the specified model.
 
         Args:
-          contents: List of contents to generate embeddings for. Note that content can be
-              multimodal. The behavior depends on the model and provider. Some models may only
-              support text.
+          contents: List of contents to generate embeddings for. Each content can be a string or an
+              InterleavedContentItem (and hence can be multimodal). The behavior depends on
+              the model and provider. Some models may only support text.
 
           model_id: The identifier of the model to use. The model must be an embedding model
               registered with Llama Stack and available via the /models endpoint.
 
+          output_dimension: (Optional) Output dimensionality for the embeddings. Only supported by
+              Matryoshka models.
+
+          task_type: (Optional) How is the embedding being used? This is only supported by asymmetric
+              embedding models.
+
+          text_truncation: (Optional) Config for how to truncate text for embedding when text is longer
+              than the model's max sequence length.
+
           extra_headers: Send extra headers
 
           extra_query: Add additional query parameters to the request
@@ -1024,6 +1052,9 @@ async def embeddings(
                 {
                     "contents": contents,
                     "model_id": model_id,
+                    "output_dimension": output_dimension,
+                    "task_type": task_type,
+                    "text_truncation": text_truncation,
                 },
                 inference_embeddings_params.InferenceEmbeddingsParams,
             ),
diff --git a/src/llama_stack_client/types/agents/turn_resume_params.py b/src/llama_stack_client/types/agents/turn_resume_params.py
@@ -16,14 +16,17 @@ class TurnResumeParamsBase(TypedDict, total=False):
     session_id: Required[str]
 
     tool_responses: Required[Iterable[ToolResponseMessage]]
+    """The tool call responses to resume the turn with."""
 
 
 class TurnResumeParamsNonStreaming(TurnResumeParamsBase, total=False):
     stream: Literal[False]
+    """Whether to stream the response."""
 
 
 class TurnResumeParamsStreaming(TurnResumeParamsBase):
     stream: Required[Literal[True]]
+    """Whether to stream the response."""
 
 
 TurnResumeParams = Union[TurnResumeParamsNonStreaming, TurnResumeParamsStreaming]
diff --git a/src/llama_stack_client/types/inference_embeddings_params.py b/src/llama_stack_client/types/inference_embeddings_params.py
@@ -2,20 +2,21 @@
 
 from __future__ import annotations
 
-from typing import List
-from typing_extensions import Required, TypedDict
+from typing import List, Union, Iterable
+from typing_extensions import Literal, Required, TypedDict
 
-from .shared_params.interleaved_content import InterleavedContent
+from .shared_params.interleaved_content_item import InterleavedContentItem
 
 __all__ = ["InferenceEmbeddingsParams"]
 
 
 class InferenceEmbeddingsParams(TypedDict, total=False):
-    contents: Required[List[InterleavedContent]]
+    contents: Required[Union[List[str], Iterable[InterleavedContentItem]]]
     """List of contents to generate embeddings for.
 
-    Note that content can be multimodal. The behavior depends on the model and
-    provider. Some models may only support text.
+    Each content can be a string or an InterleavedContentItem (and hence can be
+    multimodal). The behavior depends on the model and provider. Some models may
+    only support text.
     """
 
     model_id: Required[str]
@@ -24,3 +25,21 @@ class InferenceEmbeddingsParams(TypedDict, total=False):
     The model must be an embedding model registered with Llama Stack and available
     via the /models endpoint.
     """
+
+    output_dimension: int
+    """(Optional) Output dimensionality for the embeddings.
+
+    Only supported by Matryoshka models.
+    """
+
+    task_type: Literal["query", "document"]
+    """
+    (Optional) How is the embedding being used? This is only supported by asymmetric
+    embedding models.
+    """
+
+    text_truncation: Literal["none", "start", "end"]
+    """
+    (Optional) Config for how to truncate text for embedding when text is longer
+    than the model's max sequence length.
+    """
diff --git a/src/llama_stack_client/types/shared/query_result.py b/src/llama_stack_client/types/shared/query_result.py
@@ -1,6 +1,6 @@
 # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
 
-from typing import Optional
+from typing import Dict, List, Union, Optional
 
 from ..._models import BaseModel
 from .interleaved_content import InterleavedContent
@@ -9,5 +9,7 @@
 
 
 class QueryResult(BaseModel):
+    metadata: Dict[str, Union[bool, float, str, List[object], object, None]]
+
     content: Optional[InterleavedContent] = None
     """A image content item"""
diff --git a/src/llama_stack_client/types/tool_invocation_result.py b/src/llama_stack_client/types/tool_invocation_result.py
@@ -1,6 +1,6 @@
 # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
 
-from typing import Optional
+from typing import Dict, List, Union, Optional
 
 from .._models import BaseModel
 from .shared.interleaved_content import InterleavedContent
@@ -15,3 +15,5 @@ class ToolInvocationResult(BaseModel):
     error_code: Optional[int] = None
 
     error_message: Optional[str] = None
+
+    metadata: Optional[Dict[str, Union[bool, float, str, List[object], object, None]]] = None
diff --git a/src/llama_stack_client/types/tool_response.py b/src/llama_stack_client/types/tool_response.py
diff --git a/tests/api_resources/test_inference.py b/tests/api_resources/test_inference.py