From 95c00a703787774f003cd48e7fbe9151377a8179 Mon Sep 17 00:00:00 2001 From: Matthew Farrellee Date: Sat, 14 Jun 2025 11:31:45 -0400 Subject: [PATCH 1/3] change README and `llama-stack-client inference chat-completion` to use chat.completions.create --- README.md | 50 ++++++++++++------- .../lib/cli/inference/inference.py | 8 +-- .../lib/inference/event_logger.py | 20 ++++++++ 3 files changed, 55 insertions(+), 23 deletions(-) diff --git a/README.md b/README.md index f1200e4d..2f2694b0 100644 --- a/README.md +++ b/README.md @@ -29,20 +29,14 @@ The full API of this library can be found in [api.md](api.md). You may find basi ```python from llama_stack_client import LlamaStackClient -from llama_stack_client.types import UserMessage client = LlamaStackClient( base_url=f"http://{host}:{port}", ) -response = client.inference.chat_completion( - messages=[ - UserMessage( - content="hello world, write me a 2 sentence poem about the moon", - role="user", - ), - ], - model_id="meta-llama/Llama-3.2-3B-Instruct", +response = client.chat.completions.create( + messages=[{"role": "user", "content": "hello world, write me a 2 sentence poem about the moon"}], + model="meta-llama/Llama-3.2-3B-Instruct", stream=False, ) print(response) @@ -54,16 +48,34 @@ llama-stack-client inference chat-completion --message "hello, what model are yo ``` ```python -ChatCompletionResponse( - completion_message=CompletionMessage( - content="Hello! I'm an AI model, and I'm based on a large language model architecture. My knowledge cutoff is December 2023, which means I was trained on a dataset that was current up to that point in time.\n\nI don't have a specific model name, but I'm similar to other -conversational AI models like LLaMA, Bard, or ChatGPT. My primary function is to understand and respond to human language, generating human-like text based on the input I receive.\n\nI'm designed to be helpful and informative, and I can assist with a wide range of topics and tasks, -from answering questions and providing information to generating text and completing tasks. How can I help you today?", - role='assistant', - stop_reason='end_of_turn', - tool_calls=[] - ), - logprobs=None +OpenAIChatCompletion( + id='AmivnS0iMv-mmEE4_A0DK1T', + choices=[ + OpenAIChatCompletionChoice( + finish_reason='stop', + index=0, + message=OpenAIChatCompletionChoiceMessageOpenAIAssistantMessageParam( + role='assistant', + content="Hello! I am an AI designed by Meta AI, and my model is a type of recurrent neural network (RNN) called a transformer. 
My specific architecture is based on the BERT (Bidirectional Encoder Representations from Transformers) model, which is a pre-trained language model that has been fine-tuned for a variety of natural language processing tasks.\n\nHere are some key details about my model:\n\n* **Model type:** Transformer-based language model\n* **Architecture:** BERT (Bidirectional Encoder Representations from Transformers)\n* **Training data:** A massive corpus of text data, including but not limited to:\n\t+ Web pages\n\t+ Books\n\t+ Articles\n\t+ Forums\n\t+ Social media platforms\n* **Parameters:** My model has approximately 1.5 billion parameters, which allows me to understand and generate human-like language.\n* **Capabilities:** I can perform a wide range of tasks, including but not limited to:\n\t+ Answering questions\n\t+ Generating text\n\t+ Translating languages\n\t+ Summarizing content\n\t+ Offering suggestions and ideas\n\nI'm constantly learning and improving, so please bear with me if I make any mistakes or don't quite understand what you're asking. How can I assist you today?", + name=None, + tool_calls=None, + function_call=None + ), + logprobs=OpenAIChatCompletionChoiceLogprobs(content=None, refusal=None) + ) + ], + created=1749825661, + model='Llama-3.3-70B-Instruct', + object='chat.completion', + system_fingerprint=None, + usage={ + 'completion_tokens': 258, + 'prompt_tokens': 16, + 'total_tokens': 274, + 'completion_tokens_details': None, + 'prompt_tokens_details': None + }, + service_tier=None ) ``` diff --git a/src/llama_stack_client/lib/cli/inference/inference.py b/src/llama_stack_client/lib/cli/inference/inference.py index 772e9311..0cc16396 100644 --- a/src/llama_stack_client/lib/cli/inference/inference.py +++ b/src/llama_stack_client/lib/cli/inference/inference.py @@ -46,8 +46,8 @@ def chat_completion(ctx, message: str, stream: bool, session: bool, model_id: Op messages = [] if message: messages.append({"role": "user", "content": message}) - response = client.inference.chat_completion( - model_id=model_id, + response = client.chat.completions.create( + model=model_id, messages=messages, stream=stream, ) @@ -69,8 +69,8 @@ def chat_session(client, model_id: Optional[str], messages: List[Dict[str, str]] console.print("Exiting") break messages.append({"role": "user", "content": message}) - response = client.inference.chat_completion( - model_id=model_id, + response = client.chat.completions.create( + model=model_id, messages=messages, stream=True, ) diff --git a/src/llama_stack_client/lib/inference/event_logger.py b/src/llama_stack_client/lib/inference/event_logger.py index f74f1bd4..88f20ee7 100644 --- a/src/llama_stack_client/lib/inference/event_logger.py +++ b/src/llama_stack_client/lib/inference/event_logger.py @@ -5,6 +5,7 @@ # the root directory of this source tree. 
from termcolor import cprint +from llama_stack_client.types import ChatCompletionChunk class InferenceStreamPrintableEvent: def __init__( @@ -43,9 +44,28 @@ def yield_printable_events(self, chunk): elif event.event_type == "complete": yield InferenceStreamPrintableEvent("") +class ChatCompletionsStreamLogEventPrinter: + def yield_printable_events(self, chunk): + choice = chunk.choices[0] + delta = choice.delta + if delta: + if delta.role: + yield InferenceStreamPrintableEvent(f"{delta.role}> ", color="cyan", end="") + if delta.content: + yield InferenceStreamPrintableEvent(delta.content, color="yellow", end="") + if choice.finish_reason: + if choice.finish_reason == "length": + yield InferenceStreamPrintableEvent("", color="red", end="") + yield InferenceStreamPrintableEvent() + class EventLogger: def log(self, event_generator): printer = InferenceStreamLogEventPrinter() + # Check if the event generator is of type Stream[ChatCompletionChunk] + if hasattr(event_generator, "_cast_to"): + if event_generator._cast_to == ChatCompletionChunk: + # If it is, use the ChatCompletionsStreamLogEventPrinter + printer = ChatCompletionsStreamLogEventPrinter() for chunk in event_generator: yield from printer.yield_printable_events(chunk) From 95ad03464f0735f47e13a24c51b5a20d08c9555f Mon Sep 17 00:00:00 2001 From: Matthew Farrellee Date: Tue, 24 Jun 2025 19:28:20 -0400 Subject: [PATCH 2/3] ruff format --- src/llama_stack_client/lib/inference/event_logger.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/llama_stack_client/lib/inference/event_logger.py b/src/llama_stack_client/lib/inference/event_logger.py index 88f20ee7..61207425 100644 --- a/src/llama_stack_client/lib/inference/event_logger.py +++ b/src/llama_stack_client/lib/inference/event_logger.py @@ -7,6 +7,7 @@ from llama_stack_client.types import ChatCompletionChunk + class InferenceStreamPrintableEvent: def __init__( self, @@ -44,6 +45,7 @@ def yield_printable_events(self, chunk): elif event.event_type == "complete": yield InferenceStreamPrintableEvent("") + class ChatCompletionsStreamLogEventPrinter: def yield_printable_events(self, chunk): choice = chunk.choices[0] From 9208bf8915d555c30347023ab03c288a538137d0 Mon Sep 17 00:00:00 2001 From: Matthew Farrellee Date: Wed, 25 Jun 2025 07:40:53 -0400 Subject: [PATCH 3/3] merge the ChatCompletionsStreamLogEventPrinter class into the existing InferenceStreamPrintableEvent class, add type hints --- .../lib/inference/event_logger.py | 29 ++++++++++++------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/src/llama_stack_client/lib/inference/event_logger.py b/src/llama_stack_client/lib/inference/event_logger.py index 61207425..14b46372 100644 --- a/src/llama_stack_client/lib/inference/event_logger.py +++ b/src/llama_stack_client/lib/inference/event_logger.py @@ -3,9 +3,9 @@ # # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
+from typing import Generator from termcolor import cprint - -from llama_stack_client.types import ChatCompletionChunk +from llama_stack_client.types import ChatCompletionResponseStreamChunk, ChatCompletionChunk class InferenceStreamPrintableEvent: @@ -27,7 +27,19 @@ class InferenceStreamLogEventPrinter: def __init__(self): self.is_thinking = False - def yield_printable_events(self, chunk): + def yield_printable_events( + self, chunk: ChatCompletionResponseStreamChunk | ChatCompletionChunk + ) -> Generator[InferenceStreamPrintableEvent, None, None]: + # Check if the chunk has event attribute (ChatCompletionResponseStreamChunk) + if hasattr(chunk, "event"): + yield from self._handle_inference_stream_chunk(chunk) + # Check if the chunk has choices attribute (ChatCompletionChunk) + elif hasattr(chunk, "choices") and len(chunk.choices) > 0: + yield from self._handle_chat_completion_chunk(chunk) + + def _handle_inference_stream_chunk( + self, chunk: ChatCompletionResponseStreamChunk + ) -> Generator[InferenceStreamPrintableEvent, None, None]: event = chunk.event if event.event_type == "start": yield InferenceStreamPrintableEvent("Assistant> ", color="cyan", end="") @@ -45,9 +57,9 @@ def yield_printable_events(self, chunk): elif event.event_type == "complete": yield InferenceStreamPrintableEvent("") - -class ChatCompletionsStreamLogEventPrinter: - def yield_printable_events(self, chunk): + def _handle_chat_completion_chunk( + self, chunk: ChatCompletionChunk + ) -> Generator[InferenceStreamPrintableEvent, None, None]: choice = chunk.choices[0] delta = choice.delta if delta: @@ -64,10 +76,5 @@ def yield_printable_events(self, chunk): class EventLogger: def log(self, event_generator): printer = InferenceStreamLogEventPrinter() - # Check if the event generator is of type Stream[ChatCompletionChunk] - if hasattr(event_generator, "_cast_to"): - if event_generator._cast_to == ChatCompletionChunk: - # If it is, use the ChatCompletionsStreamLogEventPrinter - printer = ChatCompletionsStreamLogEventPrinter() for chunk in event_generator: yield from printer.yield_printable_events(chunk)
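
Taken together, the three patches leave a single `InferenceStreamLogEventPrinter` that dispatches on the chunk shape, so `EventLogger.log()` works unchanged for both the legacy inference stream and the new OpenAI-style stream. Below is a minimal usage sketch of the new streaming path; the server URL and model id are placeholders, and the `print()` helper on the yielded events is assumed from the existing printable-event class (its body is not shown in this diff).

```python
from llama_stack_client import LlamaStackClient
from llama_stack_client.lib.inference.event_logger import EventLogger

# Placeholder endpoint and model id -- substitute your own server and model.
client = LlamaStackClient(base_url="http://localhost:8321")

# stream=True yields OpenAI-style ChatCompletionChunk objects, which the
# merged printer now recognizes via their `choices` attribute.
response = client.chat.completions.create(
    model="meta-llama/Llama-3.2-3B-Instruct",
    messages=[{"role": "user", "content": "hello world, write me a 2 sentence poem about the moon"}],
    stream=True,
)

# EventLogger.log() routes each chunk to the appropriate handler and yields
# printable events; print() is assumed from the existing event class.
for event in EventLogger().log(response):
    event.print()
```

Dispatching on `hasattr(chunk, "event")` versus `hasattr(chunk, "choices")` keeps the format check inside the printer itself, which is what lets the third patch drop the earlier `_cast_to` inspection of the stream object in `EventLogger.log()`.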