
Commit 29f75d0

change README and llama-stack-client inference chat-completion to use chat.completions.create (#240)
# What does this PR do?

Updates the README to use `chat.completions.create` instead of `inference.chat_completion`.

## Test Plan

Run the examples in the README.
1 parent 52c0b5d commit 29f75d0

File tree

3 files changed (+65, -24 lines):

- README.md
- src/llama_stack_client/lib/cli/inference/inference.py
- src/llama_stack_client/lib/inference/event_logger.py


README.md

Lines changed: 31 additions & 19 deletions
````diff
@@ -29,20 +29,14 @@ The full API of this library can be found in [api.md](api.md). You may find basi
 
 ```python
 from llama_stack_client import LlamaStackClient
-from llama_stack_client.types import UserMessage
 
 client = LlamaStackClient(
     base_url=f"http://{host}:{port}",
 )
 
-response = client.inference.chat_completion(
-    messages=[
-        UserMessage(
-            content="hello world, write me a 2 sentence poem about the moon",
-            role="user",
-        ),
-    ],
-    model_id="meta-llama/Llama-3.2-3B-Instruct",
+response = client.chat.completions.create(
+    messages=[{"role": "user", "content": "hello world, write me a 2 sentence poem about the moon"}],
+    model="meta-llama/Llama-3.2-3B-Instruct",
     stream=False,
 )
 print(response)
@@ -54,16 +48,34 @@ llama-stack-client inference chat-completion --message "hello, what model are yo
 ```
 
 ```python
-ChatCompletionResponse(
-    completion_message=CompletionMessage(
-        content="Hello! I'm an AI model, and I'm based on a large language model architecture. My knowledge cutoff is December 2023, which means I was trained on a dataset that was current up to that point in time.\n\nI don't have a specific model name, but I'm similar to other
-conversational AI models like LLaMA, Bard, or ChatGPT. My primary function is to understand and respond to human language, generating human-like text based on the input I receive.\n\nI'm designed to be helpful and informative, and I can assist with a wide range of topics and tasks,
-from answering questions and providing information to generating text and completing tasks. How can I help you today?",
-        role='assistant',
-        stop_reason='end_of_turn',
-        tool_calls=[]
-    ),
-    logprobs=None
+OpenAIChatCompletion(
+    id='AmivnS0iMv-mmEE4_A0DK1T',
+    choices=[
+        OpenAIChatCompletionChoice(
+            finish_reason='stop',
+            index=0,
+            message=OpenAIChatCompletionChoiceMessageOpenAIAssistantMessageParam(
+                role='assistant',
+                content="Hello! I am an AI designed by Meta AI, and my model is a type of recurrent neural network (RNN) called a transformer. My specific architecture is based on the BERT (Bidirectional Encoder Representations from Transformers) model, which is a pre-trained language model that has been fine-tuned for a variety of natural language processing tasks.\n\nHere are some key details about my model:\n\n* **Model type:** Transformer-based language model\n* **Architecture:** BERT (Bidirectional Encoder Representations from Transformers)\n* **Training data:** A massive corpus of text data, including but not limited to:\n\t+ Web pages\n\t+ Books\n\t+ Articles\n\t+ Forums\n\t+ Social media platforms\n* **Parameters:** My model has approximately 1.5 billion parameters, which allows me to understand and generate human-like language.\n* **Capabilities:** I can perform a wide range of tasks, including but not limited to:\n\t+ Answering questions\n\t+ Generating text\n\t+ Translating languages\n\t+ Summarizing content\n\t+ Offering suggestions and ideas\n\nI'm constantly learning and improving, so please bear with me if I make any mistakes or don't quite understand what you're asking. How can I assist you today?",
+                name=None,
+                tool_calls=None,
+                function_call=None
+            ),
+            logprobs=OpenAIChatCompletionChoiceLogprobs(content=None, refusal=None)
+        )
+    ],
+    created=1749825661,
+    model='Llama-3.3-70B-Instruct',
+    object='chat.completion',
+    system_fingerprint=None,
+    usage={
+        'completion_tokens': 258,
+        'prompt_tokens': 16,
+        'total_tokens': 274,
+        'completion_tokens_details': None,
+        'prompt_tokens_details': None
+    },
+    service_tier=None
 )
 ```
````

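The updated README example runs with `stream=False`. For comparison, a minimal streaming sketch against the same endpoint; this is an illustration only, assuming a running Llama Stack server, and the base URL and model name below are placeholders:

```python
from llama_stack_client import LlamaStackClient

# Placeholder server address; substitute your own deployment.
client = LlamaStackClient(base_url="http://localhost:8321")

stream = client.chat.completions.create(
    messages=[{"role": "user", "content": "hello world, write me a 2 sentence poem about the moon"}],
    model="meta-llama/Llama-3.2-3B-Instruct",  # placeholder model id
    stream=True,
)

# With stream=True the client yields OpenAI-style chunks whose text
# arrives incrementally on choices[0].delta.content.
for chunk in stream:
    if chunk.choices and chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end="", flush=True)
print()
```

Each chunk carries a delta rather than a complete message, which is exactly what the `event_logger.py` change below is extended to handle.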
src/llama_stack_client/lib/cli/inference/inference.py

Lines changed: 4 additions & 4 deletions
```diff
@@ -46,8 +46,8 @@ def chat_completion(ctx, message: str, stream: bool, session: bool, model_id: Op
     messages = []
     if message:
         messages.append({"role": "user", "content": message})
-    response = client.inference.chat_completion(
-        model_id=model_id,
+    response = client.chat.completions.create(
+        model=model_id,
         messages=messages,
         stream=stream,
     )
@@ -69,8 +69,8 @@ def chat_session(client, model_id: Optional[str], messages: List[Dict[str, str]]
             console.print("Exiting")
             break
         messages.append({"role": "user", "content": message})
-        response = client.inference.chat_completion(
-            model_id=model_id,
+        response = client.chat.completions.create(
+            model=model_id,
             messages=messages,
             stream=True,
         )
```
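Outside the CLI, the same streaming response can be rendered with the repo's `EventLogger`. A hedged sketch, assuming a reachable server (placeholder base URL and model) and the usual pattern of iterating the logger and printing each event:

```python
from llama_stack_client import LlamaStackClient
from llama_stack_client.lib.inference.event_logger import EventLogger

# Placeholder server and model; substitute your own deployment.
client = LlamaStackClient(base_url="http://localhost:8321")

response = client.chat.completions.create(
    model="meta-llama/Llama-3.2-3B-Instruct",
    messages=[{"role": "user", "content": "hello, what model are you?"}],
    stream=True,
)

# EventLogger.log yields printable events; each renders itself via termcolor.
for event in EventLogger().log(response):
    event.print()
```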

src/llama_stack_client/lib/inference/event_logger.py

Lines changed: 30 additions & 1 deletion
```diff
@@ -3,7 +3,9 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
+from typing import Generator
 from termcolor import cprint
+from llama_stack_client.types import ChatCompletionResponseStreamChunk, ChatCompletionChunk
 
 
 class InferenceStreamPrintableEvent:
@@ -25,7 +27,19 @@ class InferenceStreamLogEventPrinter:
     def __init__(self):
         self.is_thinking = False
 
-    def yield_printable_events(self, chunk):
+    def yield_printable_events(
+        self, chunk: ChatCompletionResponseStreamChunk | ChatCompletionChunk
+    ) -> Generator[InferenceStreamPrintableEvent, None, None]:
+        # Check if the chunk has event attribute (ChatCompletionResponseStreamChunk)
+        if hasattr(chunk, "event"):
+            yield from self._handle_inference_stream_chunk(chunk)
+        # Check if the chunk has choices attribute (ChatCompletionChunk)
+        elif hasattr(chunk, "choices") and len(chunk.choices) > 0:
+            yield from self._handle_chat_completion_chunk(chunk)
+
+    def _handle_inference_stream_chunk(
+        self, chunk: ChatCompletionResponseStreamChunk
+    ) -> Generator[InferenceStreamPrintableEvent, None, None]:
         event = chunk.event
         if event.event_type == "start":
             yield InferenceStreamPrintableEvent("Assistant> ", color="cyan", end="")
@@ -43,6 +57,21 @@ def yield_printable_events(self, chunk):
         elif event.event_type == "complete":
             yield InferenceStreamPrintableEvent("")
 
+    def _handle_chat_completion_chunk(
+        self, chunk: ChatCompletionChunk
+    ) -> Generator[InferenceStreamPrintableEvent, None, None]:
+        choice = chunk.choices[0]
+        delta = choice.delta
+        if delta:
+            if delta.role:
+                yield InferenceStreamPrintableEvent(f"{delta.role}> ", color="cyan", end="")
+            if delta.content:
+                yield InferenceStreamPrintableEvent(delta.content, color="yellow", end="")
+        if choice.finish_reason:
+            if choice.finish_reason == "length":
+                yield InferenceStreamPrintableEvent("<truncated>", color="red", end="")
+            yield InferenceStreamPrintableEvent()
+
 
 class EventLogger:
     def log(self, event_generator):
```
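Because `yield_printable_events` dispatches on attribute shape (`hasattr(chunk, "event")` vs `hasattr(chunk, "choices")`) rather than on concrete types, the new path can be exercised with simple stand-ins. A minimal sketch using `types.SimpleNamespace` in place of a real `ChatCompletionChunk` (the annotations are not enforced at runtime, and it assumes the printable events expose a `print()` method, as `EventLogger` relies on):

```python
from types import SimpleNamespace

from llama_stack_client.lib.inference.event_logger import InferenceStreamLogEventPrinter

# Duck-typed stand-in for an OpenAI-style ChatCompletionChunk:
# one choice carrying a role/content delta and a finish reason.
chunk = SimpleNamespace(
    choices=[
        SimpleNamespace(
            delta=SimpleNamespace(role="assistant", content="hello"),
            finish_reason="stop",
        )
    ]
)

printer = InferenceStreamLogEventPrinter()
for event in printer.yield_printable_events(chunk):
    event.print()  # renders "assistant> hello" followed by a newline
```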
