From 95c00a703787774f003cd48e7fbe9151377a8179 Mon Sep 17 00:00:00 2001 From: Matthew Farrellee Date: Sat, 14 Jun 2025 11:31:45 -0400 Subject: [PATCH 1/3] change README and `llama-stack-client inference chat-completion` to use chat.completions.create --- README.md | 50 ++++++++++++------- .../lib/cli/inference/inference.py | 8 +-- .../lib/inference/event_logger.py | 20 ++++++++ 3 files changed, 55 insertions(+), 23 deletions(-) diff --git a/README.md b/README.md index f1200e4d..2f2694b0 100644 --- a/README.md +++ b/README.md @@ -29,20 +29,14 @@ The full API of this library can be found in [api.md](api.md). You may find basi ```python from llama_stack_client import LlamaStackClient -from llama_stack_client.types import UserMessage client = LlamaStackClient( base_url=f"http://{host}:{port}", ) -response = client.inference.chat_completion( - messages=[ - UserMessage( - content="hello world, write me a 2 sentence poem about the moon", - role="user", - ), - ], - model_id="meta-llama/Llama-3.2-3B-Instruct", +response = client.chat.completions.create( + messages=[{"role": "user", "content": "hello world, write me a 2 sentence poem about the moon"}], + model="meta-llama/Llama-3.2-3B-Instruct", stream=False, ) print(response) @@ -54,16 +48,34 @@ llama-stack-client inference chat-completion --message "hello, what model are yo ``` ```python -ChatCompletionResponse( - completion_message=CompletionMessage( - content="Hello! I'm an AI model, and I'm based on a large language model architecture. My knowledge cutoff is December 2023, which means I was trained on a dataset that was current up to that point in time.\n\nI don't have a specific model name, but I'm similar to other -conversational AI models like LLaMA, Bard, or ChatGPT. My primary function is to understand and respond to human language, generating human-like text based on the input I receive.\n\nI'm designed to be helpful and informative, and I can assist with a wide range of topics and tasks, -from answering questions and providing information to generating text and completing tasks. How can I help you today?", - role='assistant', - stop_reason='end_of_turn', - tool_calls=[] - ), - logprobs=None +OpenAIChatCompletion( + id='AmivnS0iMv-mmEE4_A0DK1T', + choices=[ + OpenAIChatCompletionChoice( + finish_reason='stop', + index=0, + message=OpenAIChatCompletionChoiceMessageOpenAIAssistantMessageParam( + role='assistant', + content="Hello! I am an AI designed by Meta AI, and my model is a type of recurrent neural network (RNN) called a transformer. 
My specific architecture is based on the BERT (Bidirectional Encoder Representations from Transformers) model, which is a pre-trained language model that has been fine-tuned for a variety of natural language processing tasks.\n\nHere are some key details about my model:\n\n* **Model type:** Transformer-based language model\n* **Architecture:** BERT (Bidirectional Encoder Representations from Transformers)\n* **Training data:** A massive corpus of text data, including but not limited to:\n\t+ Web pages\n\t+ Books\n\t+ Articles\n\t+ Forums\n\t+ Social media platforms\n* **Parameters:** My model has approximately 1.5 billion parameters, which allows me to understand and generate human-like language.\n* **Capabilities:** I can perform a wide range of tasks, including but not limited to:\n\t+ Answering questions\n\t+ Generating text\n\t+ Translating languages\n\t+ Summarizing content\n\t+ Offering suggestions and ideas\n\nI'm constantly learning and improving, so please bear with me if I make any mistakes or don't quite understand what you're asking. How can I assist you today?", + name=None, + tool_calls=None, + function_call=None + ), + logprobs=OpenAIChatCompletionChoiceLogprobs(content=None, refusal=None) + ) + ], + created=1749825661, + model='Llama-3.3-70B-Instruct', + object='chat.completion', + system_fingerprint=None, + usage={ + 'completion_tokens': 258, + 'prompt_tokens': 16, + 'total_tokens': 274, + 'completion_tokens_details': None, + 'prompt_tokens_details': None + }, + service_tier=None ) ``` diff --git a/src/llama_stack_client/lib/cli/inference/inference.py b/src/llama_stack_client/lib/cli/inference/inference.py index 772e9311..0cc16396 100644 --- a/src/llama_stack_client/lib/cli/inference/inference.py +++ b/src/llama_stack_client/lib/cli/inference/inference.py @@ -46,8 +46,8 @@ def chat_completion(ctx, message: str, stream: bool, session: bool, model_id: Op messages = [] if message: messages.append({"role": "user", "content": message}) - response = client.inference.chat_completion( - model_id=model_id, + response = client.chat.completions.create( + model=model_id, messages=messages, stream=stream, ) @@ -69,8 +69,8 @@ def chat_session(client, model_id: Optional[str], messages: List[Dict[str, str]] console.print("Exiting") break messages.append({"role": "user", "content": message}) - response = client.inference.chat_completion( - model_id=model_id, + response = client.chat.completions.create( + model=model_id, messages=messages, stream=True, ) diff --git a/src/llama_stack_client/lib/inference/event_logger.py b/src/llama_stack_client/lib/inference/event_logger.py index f74f1bd4..88f20ee7 100644 --- a/src/llama_stack_client/lib/inference/event_logger.py +++ b/src/llama_stack_client/lib/inference/event_logger.py @@ -5,6 +5,7 @@ # the root directory of this source tree. 
from termcolor import cprint +from llama_stack_client.types import ChatCompletionChunk class InferenceStreamPrintableEvent: def __init__( @@ -43,9 +44,28 @@ def yield_printable_events(self, chunk): elif event.event_type == "complete": yield InferenceStreamPrintableEvent("") +class ChatCompletionsStreamLogEventPrinter: + def yield_printable_events(self, chunk): + choice = chunk.choices[0] + delta = choice.delta + if delta: + if delta.role: + yield InferenceStreamPrintableEvent(f"{delta.role}> ", color="cyan", end="") + if delta.content: + yield InferenceStreamPrintableEvent(delta.content, color="yellow", end="") + if choice.finish_reason: + if choice.finish_reason == "length": + yield InferenceStreamPrintableEvent("", color="red", end="") + yield InferenceStreamPrintableEvent() + class EventLogger: def log(self, event_generator): printer = InferenceStreamLogEventPrinter() + # Check if the event generator is of type Stream[ChatCompletionChunk] + if hasattr(event_generator, "_cast_to"): + if event_generator._cast_to == ChatCompletionChunk: + # If it is, use the ChatCompletionsStreamLogEventPrinter + printer = ChatCompletionsStreamLogEventPrinter() for chunk in event_generator: yield from printer.yield_printable_events(chunk) From 95ad03464f0735f47e13a24c51b5a20d08c9555f Mon Sep 17 00:00:00 2001 From: Matthew Farrellee Date: Tue, 24 Jun 2025 19:28:20 -0400 Subject: [PATCH 2/3] ruff format --- src/llama_stack_client/lib/inference/event_logger.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/llama_stack_client/lib/inference/event_logger.py b/src/llama_stack_client/lib/inference/event_logger.py index 88f20ee7..61207425 100644 --- a/src/llama_stack_client/lib/inference/event_logger.py +++ b/src/llama_stack_client/lib/inference/event_logger.py @@ -7,6 +7,7 @@ from llama_stack_client.types import ChatCompletionChunk + class InferenceStreamPrintableEvent: def __init__( self, @@ -44,6 +45,7 @@ def yield_printable_events(self, chunk): elif event.event_type == "complete": yield InferenceStreamPrintableEvent("") + class ChatCompletionsStreamLogEventPrinter: def yield_printable_events(self, chunk): choice = chunk.choices[0] From 9208bf8915d555c30347023ab03c288a538137d0 Mon Sep 17 00:00:00 2001 From: Matthew Farrellee Date: Wed, 25 Jun 2025 07:40:53 -0400 Subject: [PATCH 3/3] merge the ChatCompletionsStreamLogEventPrinter class into the existing InferenceStreamPrintableEvent class, add type hints --- .../lib/inference/event_logger.py | 29 ++++++++++++------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/src/llama_stack_client/lib/inference/event_logger.py b/src/llama_stack_client/lib/inference/event_logger.py index 61207425..14b46372 100644 --- a/src/llama_stack_client/lib/inference/event_logger.py +++ b/src/llama_stack_client/lib/inference/event_logger.py @@ -3,9 +3,9 @@ # # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
+from typing import Generator from termcolor import cprint - -from llama_stack_client.types import ChatCompletionChunk +from llama_stack_client.types import ChatCompletionResponseStreamChunk, ChatCompletionChunk class InferenceStreamPrintableEvent: @@ -27,7 +27,19 @@ class InferenceStreamLogEventPrinter: def __init__(self): self.is_thinking = False - def yield_printable_events(self, chunk): + def yield_printable_events( + self, chunk: ChatCompletionResponseStreamChunk | ChatCompletionChunk + ) -> Generator[InferenceStreamPrintableEvent, None, None]: + # Check if the chunk has event attribute (ChatCompletionResponseStreamChunk) + if hasattr(chunk, "event"): + yield from self._handle_inference_stream_chunk(chunk) + # Check if the chunk has choices attribute (ChatCompletionChunk) + elif hasattr(chunk, "choices") and len(chunk.choices) > 0: + yield from self._handle_chat_completion_chunk(chunk) + + def _handle_inference_stream_chunk( + self, chunk: ChatCompletionResponseStreamChunk + ) -> Generator[InferenceStreamPrintableEvent, None, None]: event = chunk.event if event.event_type == "start": yield InferenceStreamPrintableEvent("Assistant> ", color="cyan", end="") @@ -45,9 +57,9 @@ def yield_printable_events(self, chunk): elif event.event_type == "complete": yield InferenceStreamPrintableEvent("") - -class ChatCompletionsStreamLogEventPrinter: - def yield_printable_events(self, chunk): + def _handle_chat_completion_chunk( + self, chunk: ChatCompletionChunk + ) -> Generator[InferenceStreamPrintableEvent, None, None]: choice = chunk.choices[0] delta = choice.delta if delta: @@ -64,10 +76,5 @@ def yield_printable_events(self, chunk): class EventLogger: def log(self, event_generator): printer = InferenceStreamLogEventPrinter() - # Check if the event generator is of type Stream[ChatCompletionChunk] - if hasattr(event_generator, "_cast_to"): - if event_generator._cast_to == ChatCompletionChunk: - # If it is, use the ChatCompletionsStreamLogEventPrinter - printer = ChatCompletionsStreamLogEventPrinter() for chunk in event_generator: yield from printer.yield_printable_events(chunk)
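
Taken together, the three patches leave a single `InferenceStreamLogEventPrinter` that dispatches on the chunk shape, so `EventLogger.log()` works unchanged for both the legacy inference stream and the new OpenAI-style stream. Below is a minimal usage sketch of the new streaming path; the server URL and model id are placeholders, and the `print()` helper on the yielded events is assumed from the existing printable-event class (its body is not shown in this diff).

```python
from llama_stack_client import LlamaStackClient
from llama_stack_client.lib.inference.event_logger import EventLogger

# Placeholder endpoint and model id -- substitute your own server and model.
client = LlamaStackClient(base_url="http://localhost:8321")

# stream=True yields OpenAI-style ChatCompletionChunk objects, which the
# merged printer now recognizes via their `choices` attribute.
response = client.chat.completions.create(
    model="meta-llama/Llama-3.2-3B-Instruct",
    messages=[{"role": "user", "content": "hello world, write me a 2 sentence poem about the moon"}],
    stream=True,
)

# EventLogger.log() routes each chunk to the appropriate handler and yields
# printable events; print() is assumed from the existing event class.
for event in EventLogger().log(response):
    event.print()
```

Dispatching on `hasattr(chunk, "event")` versus `hasattr(chunk, "choices")` keeps the format check inside the printer itself, which is what lets the third patch drop the earlier `_cast_to` inspection of the stream object in `EventLogger.log()`.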