Commit 89ec5a7

feat(vector_io): Implement Contextual Retrieval for improved RAG search quality
1 parent 4f8bf45 commit 89ec5a7

5 files changed: +206 additions, -6 deletions

.stats.yml

Lines changed: 2 additions & 2 deletions
@@ -1,4 +1,4 @@
 configured_endpoints: 108
-openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/llamastack%2Fllama-stack-client-373eb8eb3cc02e6f8a9fa33079a5e735886fbf62958ee83e3cdef7bb4c41be37.yml
-openapi_spec_hash: fe1fa50161da4f095d128b0de7787e96
+openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/llamastack%2Fllama-stack-client-621e8b8ef37d5ebf024fe3bf6a59486a90debf01acca2c9bb4e9032e2dff92d3.yml
+openapi_spec_hash: 51f623cd3ea4addf8f939dd4ef8962c8
 config_hash: 6aa61d4143c3e3df785972c0287d1370

src/llama_stack_client/types/vector_store_create_params.py

Lines changed: 50 additions & 1 deletion
@@ -19,6 +19,8 @@
     "ChunkingStrategyVectorStoreChunkingStrategyAuto",
     "ChunkingStrategyVectorStoreChunkingStrategyStatic",
     "ChunkingStrategyVectorStoreChunkingStrategyStaticStatic",
+    "ChunkingStrategyVectorStoreChunkingStrategyContextual",
+    "ChunkingStrategyVectorStoreChunkingStrategyContextualContextual",
 ]

@@ -58,6 +60,53 @@ class ChunkingStrategyVectorStoreChunkingStrategyStatic(TypedDict, total=False):
     type: Literal["static"]


+class ChunkingStrategyVectorStoreChunkingStrategyContextualContextual(TypedDict, total=False):
+    """Configuration for contextual chunking."""
+
+    chunk_overlap_tokens: int
+    """Tokens to overlap between adjacent chunks.
+
+    Must be less than max_chunk_size_tokens.
+    """
+
+    context_prompt: str
+    """Prompt template for contextual retrieval.
+
+    Uses WHOLE_DOCUMENT and CHUNK_CONTENT placeholders wrapped in double curly
+    braces.
+    """
+
+    max_chunk_size_tokens: int
+    """Maximum tokens per chunk. Suggested ~700 to allow room for prepended context."""
+
+    max_concurrency: Optional[int]
+    """Maximum concurrent LLM calls. Falls back to config default if not provided."""
+
+    model_id: Optional[str]
+    """LLM model for generating context.
+
+    Falls back to VectorStoresConfig.contextual_retrieval_params.model if not
+    provided.
+    """
+
+    timeout_seconds: Optional[int]
+    """Timeout per LLM call in seconds. Falls back to config default if not provided."""
+
+
+class ChunkingStrategyVectorStoreChunkingStrategyContextual(TypedDict, total=False):
+    """
+    Contextual chunking strategy that uses an LLM to situate chunks within the document.
+    """
+
+    contextual: Required[ChunkingStrategyVectorStoreChunkingStrategyContextualContextual]
+    """Configuration for contextual chunking."""
+
+    type: Literal["contextual"]
+    """Strategy type identifier."""
+
+
 ChunkingStrategy: TypeAlias = Union[
-    ChunkingStrategyVectorStoreChunkingStrategyAuto, ChunkingStrategyVectorStoreChunkingStrategyStatic
+    ChunkingStrategyVectorStoreChunkingStrategyAuto,
+    ChunkingStrategyVectorStoreChunkingStrategyStatic,
+    ChunkingStrategyVectorStoreChunkingStrategyContextual,
 ]
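The net effect for callers is that "contextual" becomes a third accepted chunking strategy at vector store creation time. A minimal usage sketch, assuming a Llama Stack server on a local URL and that vector_stores.create accepts chunking_strategy as in the OpenAI-compatible surface; the base URL and model id are placeholders:

from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")  # assumed local server

# Contextual chunking config mirroring the TypedDicts added in this commit.
# "my-context-model" is a placeholder; omitted fields fall back to config defaults.
vector_store = client.vector_stores.create(
    name="docs",
    chunking_strategy={
        "type": "contextual",
        "contextual": {
            "max_chunk_size_tokens": 700,  # leaves room for the prepended context
            "chunk_overlap_tokens": 100,   # must be < max_chunk_size_tokens
            "model_id": "my-context-model",
        },
    },
)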

src/llama_stack_client/types/vector_stores/file_batch_create_params.py

Lines changed: 50 additions & 1 deletion
@@ -19,6 +19,8 @@
     "ChunkingStrategyVectorStoreChunkingStrategyAuto",
     "ChunkingStrategyVectorStoreChunkingStrategyStatic",
     "ChunkingStrategyVectorStoreChunkingStrategyStaticStatic",
+    "ChunkingStrategyVectorStoreChunkingStrategyContextual",
+    "ChunkingStrategyVectorStoreChunkingStrategyContextualContextual",
 ]

@@ -54,6 +56,53 @@ class ChunkingStrategyVectorStoreChunkingStrategyStatic(TypedDict, total=False):
     type: Literal["static"]


+class ChunkingStrategyVectorStoreChunkingStrategyContextualContextual(TypedDict, total=False):
+    """Configuration for contextual chunking."""
+
+    chunk_overlap_tokens: int
+    """Tokens to overlap between adjacent chunks.
+
+    Must be less than max_chunk_size_tokens.
+    """
+
+    context_prompt: str
+    """Prompt template for contextual retrieval.
+
+    Uses WHOLE_DOCUMENT and CHUNK_CONTENT placeholders wrapped in double curly
+    braces.
+    """
+
+    max_chunk_size_tokens: int
+    """Maximum tokens per chunk. Suggested ~700 to allow room for prepended context."""
+
+    max_concurrency: Optional[int]
+    """Maximum concurrent LLM calls. Falls back to config default if not provided."""
+
+    model_id: Optional[str]
+    """LLM model for generating context.
+
+    Falls back to VectorStoresConfig.contextual_retrieval_params.model if not
+    provided.
+    """
+
+    timeout_seconds: Optional[int]
+    """Timeout per LLM call in seconds. Falls back to config default if not provided."""
+
+
+class ChunkingStrategyVectorStoreChunkingStrategyContextual(TypedDict, total=False):
+    """
+    Contextual chunking strategy that uses an LLM to situate chunks within the document.
+    """
+
+    contextual: Required[ChunkingStrategyVectorStoreChunkingStrategyContextualContextual]
+    """Configuration for contextual chunking."""
+
+    type: Literal["contextual"]
+    """Strategy type identifier."""
+
+
 ChunkingStrategy: TypeAlias = Union[
-    ChunkingStrategyVectorStoreChunkingStrategyAuto, ChunkingStrategyVectorStoreChunkingStrategyStatic
+    ChunkingStrategyVectorStoreChunkingStrategyAuto,
+    ChunkingStrategyVectorStoreChunkingStrategyStatic,
+    ChunkingStrategyVectorStoreChunkingStrategyContextual,
 ]
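The context_prompt docstring pins down the placeholder contract: WHOLE_DOCUMENT and CHUNK_CONTENT, wrapped in double curly braces. A sketch of what a custom template could look like; the placeholders come from the field docstring, while the surrounding wording is illustrative rather than the server's default prompt:

# Illustrative template only; the server substitutes the two placeholders
# with the full document text and the current chunk respectively.
context_prompt = (
    "<document>{{WHOLE_DOCUMENT}}</document>\n"
    "Here is the chunk we want to situate within the whole document:\n"
    "<chunk>{{CHUNK_CONTENT}}</chunk>\n"
    "Give a short, succinct context that situates this chunk within the "
    "overall document, to improve search retrieval of the chunk."
)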

src/llama_stack_client/types/vector_stores/file_create_params.py

Lines changed: 50 additions & 1 deletion
@@ -17,6 +17,8 @@
     "ChunkingStrategyVectorStoreChunkingStrategyAuto",
     "ChunkingStrategyVectorStoreChunkingStrategyStatic",
     "ChunkingStrategyVectorStoreChunkingStrategyStaticStatic",
+    "ChunkingStrategyVectorStoreChunkingStrategyContextual",
+    "ChunkingStrategyVectorStoreChunkingStrategyContextualContextual",
 ]

@@ -54,6 +56,53 @@ class ChunkingStrategyVectorStoreChunkingStrategyStatic(TypedDict, total=False):
     type: Literal["static"]


+class ChunkingStrategyVectorStoreChunkingStrategyContextualContextual(TypedDict, total=False):
+    """Configuration for contextual chunking."""
+
+    chunk_overlap_tokens: int
+    """Tokens to overlap between adjacent chunks.
+
+    Must be less than max_chunk_size_tokens.
+    """
+
+    context_prompt: str
+    """Prompt template for contextual retrieval.
+
+    Uses WHOLE_DOCUMENT and CHUNK_CONTENT placeholders wrapped in double curly
+    braces.
+    """
+
+    max_chunk_size_tokens: int
+    """Maximum tokens per chunk. Suggested ~700 to allow room for prepended context."""
+
+    max_concurrency: Optional[int]
+    """Maximum concurrent LLM calls. Falls back to config default if not provided."""
+
+    model_id: Optional[str]
+    """LLM model for generating context.
+
+    Falls back to VectorStoresConfig.contextual_retrieval_params.model if not
+    provided.
+    """
+
+    timeout_seconds: Optional[int]
+    """Timeout per LLM call in seconds. Falls back to config default if not provided."""
+
+
+class ChunkingStrategyVectorStoreChunkingStrategyContextual(TypedDict, total=False):
+    """
+    Contextual chunking strategy that uses an LLM to situate chunks within the document.
+    """
+
+    contextual: Required[ChunkingStrategyVectorStoreChunkingStrategyContextualContextual]
+    """Configuration for contextual chunking."""
+
+    type: Literal["contextual"]
+    """Strategy type identifier."""
+
+
 ChunkingStrategy: TypeAlias = Union[
-    ChunkingStrategyVectorStoreChunkingStrategyAuto, ChunkingStrategyVectorStoreChunkingStrategyStatic
+    ChunkingStrategyVectorStoreChunkingStrategyAuto,
+    ChunkingStrategyVectorStoreChunkingStrategyStatic,
+    ChunkingStrategyVectorStoreChunkingStrategyContextual,
 ]
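Because file_create_params.py carries the same union, the strategy can also be set per file when attaching it to a store, overriding the store-level default. A sketch assuming vector_stores.files.create mirrors the params file; the file id and tuning values are placeholders:

# Assumes a previously uploaded file id; the per-file strategy takes
# precedence over the vector store's default chunking strategy.
client.vector_stores.files.create(
    vector_store_id=vector_store.id,
    file_id="file-abc123",  # placeholder id
    chunking_strategy={
        "type": "contextual",
        "contextual": {
            "max_concurrency": 4,    # cap on parallel LLM calls
            "timeout_seconds": 60,   # per-LLM-call timeout
        },
    },
)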

src/llama_stack_client/types/vector_stores/vector_store_file.py

Lines changed: 54 additions & 1 deletion
@@ -9,6 +9,8 @@
 from typing import Dict, Union, Optional
 from typing_extensions import Literal, Annotated, TypeAlias

+from pydantic import Field as FieldInfo
+
 from ..._utils import PropertyInfo
 from ..._models import BaseModel

@@ -18,6 +20,8 @@
     "ChunkingStrategyVectorStoreChunkingStrategyAuto",
     "ChunkingStrategyVectorStoreChunkingStrategyStatic",
     "ChunkingStrategyVectorStoreChunkingStrategyStaticStatic",
+    "ChunkingStrategyVectorStoreChunkingStrategyContextual",
+    "ChunkingStrategyVectorStoreChunkingStrategyContextualContextual",
     "LastError",
 ]

@@ -45,8 +49,57 @@ class ChunkingStrategyVectorStoreChunkingStrategyStatic(BaseModel):
     type: Optional[Literal["static"]] = None


+class ChunkingStrategyVectorStoreChunkingStrategyContextualContextual(BaseModel):
+    """Configuration for contextual chunking."""
+
+    chunk_overlap_tokens: Optional[int] = None
+    """Tokens to overlap between adjacent chunks.
+
+    Must be less than max_chunk_size_tokens.
+    """
+
+    context_prompt: Optional[str] = None
+    """Prompt template for contextual retrieval.
+
+    Uses WHOLE_DOCUMENT and CHUNK_CONTENT placeholders wrapped in double curly
+    braces.
+    """
+
+    max_chunk_size_tokens: Optional[int] = None
+    """Maximum tokens per chunk. Suggested ~700 to allow room for prepended context."""
+
+    max_concurrency: Optional[int] = None
+    """Maximum concurrent LLM calls. Falls back to config default if not provided."""
+
+    api_model_id: Optional[str] = FieldInfo(alias="model_id", default=None)
+    """LLM model for generating context.
+
+    Falls back to VectorStoresConfig.contextual_retrieval_params.model if not
+    provided.
+    """
+
+    timeout_seconds: Optional[int] = None
+    """Timeout per LLM call in seconds. Falls back to config default if not provided."""
+
+
+class ChunkingStrategyVectorStoreChunkingStrategyContextual(BaseModel):
+    """
+    Contextual chunking strategy that uses an LLM to situate chunks within the document.
+    """
+
+    contextual: ChunkingStrategyVectorStoreChunkingStrategyContextualContextual
+    """Configuration for contextual chunking."""
+
+    type: Optional[Literal["contextual"]] = None
+    """Strategy type identifier."""
+
+
 ChunkingStrategy: TypeAlias = Annotated[
-    Union[ChunkingStrategyVectorStoreChunkingStrategyAuto, ChunkingStrategyVectorStoreChunkingStrategyStatic],
+    Union[
+        ChunkingStrategyVectorStoreChunkingStrategyAuto,
+        ChunkingStrategyVectorStoreChunkingStrategyStatic,
+        ChunkingStrategyVectorStoreChunkingStrategyContextual,
+    ],
     PropertyInfo(discriminator="type"),
 ]
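One asymmetry worth noting in the response model: the wire field model_id is exposed in Python as api_model_id via a pydantic alias (generated SDKs commonly rename such fields to stay clear of pydantic's reserved model_ attribute namespace). A sketch of parsing a payload through the new model, assuming pydantic v2's model_validate; the id is a placeholder:

from llama_stack_client.types.vector_stores.vector_store_file import (
    ChunkingStrategyVectorStoreChunkingStrategyContextualContextual,
)

# The wire payload uses "model_id"; the Python attribute is "api_model_id".
parsed = ChunkingStrategyVectorStoreChunkingStrategyContextualContextual.model_validate(
    {"model_id": "my-context-model"}  # placeholder id
)
print(parsed.api_model_id)  # -> "my-context-model"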
