
Commit 29f75d0

change README and llama-stack-client inference chat-completion to use chat.completions.create (#240)
# What does this PR do?

Updates the README to use `chat.completions.create` instead of `inference.chat_completion`.

## Test Plan

Run the examples in the README.
1 parent 52c0b5d commit 29f75d0

File tree

3 files changed (+65, -24 lines):

- README.md
- src/llama_stack_client/lib/cli/inference/inference.py
- src/llama_stack_client/lib/inference/event_logger.py


README.md

Lines changed: 31 additions & 19 deletions
````diff
@@ -29,20 +29,14 @@ The full API of this library can be found in [api.md](api.md). You may find basi
 
 ```python
 from llama_stack_client import LlamaStackClient
-from llama_stack_client.types import UserMessage
 
 client = LlamaStackClient(
     base_url=f"http://{host}:{port}",
 )
 
-response = client.inference.chat_completion(
-    messages=[
-        UserMessage(
-            content="hello world, write me a 2 sentence poem about the moon",
-            role="user",
-        ),
-    ],
-    model_id="meta-llama/Llama-3.2-3B-Instruct",
+response = client.chat.completions.create(
+    messages=[{"role": "user", "content": "hello world, write me a 2 sentence poem about the moon"}],
+    model="meta-llama/Llama-3.2-3B-Instruct",
     stream=False,
 )
 print(response)
@@ -54,16 +48,34 @@ llama-stack-client inference chat-completion --message "hello, what model are yo
 ```
 
 ```python
-ChatCompletionResponse(
-    completion_message=CompletionMessage(
-        content="Hello! I'm an AI model, and I'm based on a large language model architecture. My knowledge cutoff is December 2023, which means I was trained on a dataset that was current up to that point in time.\n\nI don't have a specific model name, but I'm similar to other
-conversational AI models like LLaMA, Bard, or ChatGPT. My primary function is to understand and respond to human language, generating human-like text based on the input I receive.\n\nI'm designed to be helpful and informative, and I can assist with a wide range of topics and tasks,
-from answering questions and providing information to generating text and completing tasks. How can I help you today?",
-        role='assistant',
-        stop_reason='end_of_turn',
-        tool_calls=[]
-    ),
-    logprobs=None
+OpenAIChatCompletion(
+    id='AmivnS0iMv-mmEE4_A0DK1T',
+    choices=[
+        OpenAIChatCompletionChoice(
+            finish_reason='stop',
+            index=0,
+            message=OpenAIChatCompletionChoiceMessageOpenAIAssistantMessageParam(
+                role='assistant',
+                content="Hello! I am an AI designed by Meta AI, and my model is a type of recurrent neural network (RNN) called a transformer. My specific architecture is based on the BERT (Bidirectional Encoder Representations from Transformers) model, which is a pre-trained language model that has been fine-tuned for a variety of natural language processing tasks.\n\nHere are some key details about my model:\n\n* **Model type:** Transformer-based language model\n* **Architecture:** BERT (Bidirectional Encoder Representations from Transformers)\n* **Training data:** A massive corpus of text data, including but not limited to:\n\t+ Web pages\n\t+ Books\n\t+ Articles\n\t+ Forums\n\t+ Social media platforms\n* **Parameters:** My model has approximately 1.5 billion parameters, which allows me to understand and generate human-like language.\n* **Capabilities:** I can perform a wide range of tasks, including but not limited to:\n\t+ Answering questions\n\t+ Generating text\n\t+ Translating languages\n\t+ Summarizing content\n\t+ Offering suggestions and ideas\n\nI'm constantly learning and improving, so please bear with me if I make any mistakes or don't quite understand what you're asking. How can I assist you today?",
+                name=None,
+                tool_calls=None,
+                function_call=None
+            ),
+            logprobs=OpenAIChatCompletionChoiceLogprobs(content=None, refusal=None)
+        )
+    ],
+    created=1749825661,
+    model='Llama-3.3-70B-Instruct',
+    object='chat.completion',
+    system_fingerprint=None,
+    usage={
+        'completion_tokens': 258,
+        'prompt_tokens': 16,
+        'total_tokens': 274,
+        'completion_tokens_details': None,
+        'prompt_tokens_details': None
+    },
+    service_tier=None
 )
 ```
````

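The updated README example runs with `stream=False`. For comparison, a minimal streaming sketch against the same endpoint; this is an illustration only, assuming a running Llama Stack server, and the base URL and model name below are placeholders:

```python
from llama_stack_client import LlamaStackClient

# Placeholder server address; substitute your own deployment.
client = LlamaStackClient(base_url="http://localhost:8321")

stream = client.chat.completions.create(
    messages=[{"role": "user", "content": "hello world, write me a 2 sentence poem about the moon"}],
    model="meta-llama/Llama-3.2-3B-Instruct",  # placeholder model id
    stream=True,
)

# With stream=True the client yields OpenAI-style chunks whose text
# arrives incrementally on choices[0].delta.content.
for chunk in stream:
    if chunk.choices and chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end="", flush=True)
print()
```

Each chunk carries a delta rather than a complete message, which is exactly what the `event_logger.py` change below is extended to handle.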
src/llama_stack_client/lib/cli/inference/inference.py

Lines changed: 4 additions & 4 deletions
```diff
@@ -46,8 +46,8 @@ def chat_completion(ctx, message: str, stream: bool, session: bool, model_id: Op
     messages = []
     if message:
         messages.append({"role": "user", "content": message})
-    response = client.inference.chat_completion(
-        model_id=model_id,
+    response = client.chat.completions.create(
+        model=model_id,
         messages=messages,
         stream=stream,
     )
@@ -69,8 +69,8 @@ def chat_session(client, model_id: Optional[str], messages: List[Dict[str, str]]
             console.print("Exiting")
             break
         messages.append({"role": "user", "content": message})
-        response = client.inference.chat_completion(
-            model_id=model_id,
+        response = client.chat.completions.create(
+            model=model_id,
             messages=messages,
             stream=True,
         )
```
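Outside the CLI, the same streaming response can be rendered with the repo's `EventLogger`. A hedged sketch, assuming a reachable server (placeholder base URL and model) and the usual pattern of iterating the logger and printing each event:

```python
from llama_stack_client import LlamaStackClient
from llama_stack_client.lib.inference.event_logger import EventLogger

# Placeholder server and model; substitute your own deployment.
client = LlamaStackClient(base_url="http://localhost:8321")

response = client.chat.completions.create(
    model="meta-llama/Llama-3.2-3B-Instruct",
    messages=[{"role": "user", "content": "hello, what model are you?"}],
    stream=True,
)

# EventLogger.log yields printable events; each renders itself via termcolor.
for event in EventLogger().log(response):
    event.print()
```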

src/llama_stack_client/lib/inference/event_logger.py

Lines changed: 30 additions & 1 deletion
```diff
@@ -3,7 +3,9 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
+from typing import Generator
 from termcolor import cprint
+from llama_stack_client.types import ChatCompletionResponseStreamChunk, ChatCompletionChunk
 
 
 class InferenceStreamPrintableEvent:
@@ -25,7 +27,19 @@ class InferenceStreamLogEventPrinter:
     def __init__(self):
         self.is_thinking = False
 
-    def yield_printable_events(self, chunk):
+    def yield_printable_events(
+        self, chunk: ChatCompletionResponseStreamChunk | ChatCompletionChunk
+    ) -> Generator[InferenceStreamPrintableEvent, None, None]:
+        # Check if the chunk has event attribute (ChatCompletionResponseStreamChunk)
+        if hasattr(chunk, "event"):
+            yield from self._handle_inference_stream_chunk(chunk)
+        # Check if the chunk has choices attribute (ChatCompletionChunk)
+        elif hasattr(chunk, "choices") and len(chunk.choices) > 0:
+            yield from self._handle_chat_completion_chunk(chunk)
+
+    def _handle_inference_stream_chunk(
+        self, chunk: ChatCompletionResponseStreamChunk
+    ) -> Generator[InferenceStreamPrintableEvent, None, None]:
         event = chunk.event
         if event.event_type == "start":
             yield InferenceStreamPrintableEvent("Assistant> ", color="cyan", end="")
@@ -43,6 +57,21 @@ def yield_printable_events(self, chunk):
         elif event.event_type == "complete":
             yield InferenceStreamPrintableEvent("")
 
+    def _handle_chat_completion_chunk(
+        self, chunk: ChatCompletionChunk
+    ) -> Generator[InferenceStreamPrintableEvent, None, None]:
+        choice = chunk.choices[0]
+        delta = choice.delta
+        if delta:
+            if delta.role:
+                yield InferenceStreamPrintableEvent(f"{delta.role}> ", color="cyan", end="")
+            if delta.content:
+                yield InferenceStreamPrintableEvent(delta.content, color="yellow", end="")
+        if choice.finish_reason:
+            if choice.finish_reason == "length":
+                yield InferenceStreamPrintableEvent("<truncated>", color="red", end="")
+            yield InferenceStreamPrintableEvent()
+
 
 class EventLogger:
     def log(self, event_generator):
```
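Because `yield_printable_events` dispatches on attribute shape (`hasattr(chunk, "event")` vs `hasattr(chunk, "choices")`) rather than on concrete types, the new path can be exercised with simple stand-ins. A minimal sketch using `types.SimpleNamespace` in place of a real `ChatCompletionChunk` (the annotations are not enforced at runtime, and it assumes the printable events expose a `print()` method, as `EventLogger` relies on):

```python
from types import SimpleNamespace

from llama_stack_client.lib.inference.event_logger import InferenceStreamLogEventPrinter

# Duck-typed stand-in for an OpenAI-style ChatCompletionChunk:
# one choice carrying a role/content delta and a finish reason.
chunk = SimpleNamespace(
    choices=[
        SimpleNamespace(
            delta=SimpleNamespace(role="assistant", content="hello"),
            finish_reason="stop",
        )
    ]
)

printer = InferenceStreamLogEventPrinter()
for event in printer.yield_printable_events(chunk):
    event.print()  # renders "assistant> hello" followed by a newline
```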
