README.md (31 additions, 19 deletions)

@@ -29,20 +29,14 @@ The full API of this library can be found in [api.md](api.md). You may find basi
 
 ```python
 from llama_stack_client import LlamaStackClient
-from llama_stack_client.types import UserMessage
 
 client = LlamaStackClient(
     base_url=f"http://{host}:{port}",
 )
 
-response = client.inference.chat_completion(
-    messages=[
-        UserMessage(
-            content="hello world, write me a 2 sentence poem about the moon",
-            role="user",
-        ),
-    ],
-    model_id="meta-llama/Llama-3.2-3B-Instruct",
+response = client.chat.completions.create(
+    messages=[{"role": "user", "content": "hello world, write me a 2 sentence poem about the moon"}],
+    model="meta-llama/Llama-3.2-3B-Instruct",
     stream=False,
 )
 print(response)
@@ -54,16 +48,34 @@ llama-stack-client inference chat-completion --message "hello, what model are yo
 ```
 
 ```python
-ChatCompletionResponse(
-    completion_message=CompletionMessage(
-        content="Hello! I'm an AI model, and I'm based on a large language model architecture. My knowledge cutoff is December 2023, which means I was trained on a dataset that was current up to that point in time.\n\nI don't have a specific model name, but I'm similar to other
-conversational AI models like LLaMA, Bard, or ChatGPT. My primary function is to understand and respond to human language, generating human-like text based on the input I receive.\n\nI'm designed to be helpful and informative, and I can assist with a wide range of topics and tasks,
-from answering questions and providing information to generating text and completing tasks. How can I help you today?",
-        role='assistant',
-        stop_reason='end_of_turn',
-        tool_calls=[]
-    ),
-    logprobs=None
+OpenAIChatCompletion(
+    id='AmivnS0iMv-mmEE4_A0DK1T',
+    choices=[
+        OpenAIChatCompletionChoice(
+            finish_reason='stop',
+            index=0,
+            message=OpenAIChatCompletionChoiceMessageOpenAIAssistantMessageParam(
+                role='assistant',
+                content="Hello! I am an AI designed by Meta AI, and my model is a type of recurrent neural network (RNN) called a transformer. My specific architecture is based on the BERT (Bidirectional Encoder Representations from Transformers) model, which is a pre-trained language model that has been fine-tuned for a variety of natural language processing tasks.\n\nHere are some key details about my model:\n\n* **Model type:** Transformer-based language model\n* **Architecture:** BERT (Bidirectional Encoder Representations from Transformers)\n* **Training data:** A massive corpus of text data, including but not limited to:\n\t+ Web pages\n\t+ Books\n\t+ Articles\n\t+ Forums\n\t+ Social media platforms\n* **Parameters:** My model has approximately 1.5 billion parameters, which allows me to understand and generate human-like language.\n* **Capabilities:** I can perform a wide range of tasks, including but not limited to:\n\t+ Answering questions\n\t+ Generating text\n\t+ Translating languages\n\t+ Summarizing content\n\t+ Offering suggestions and ideas\n\nI'm constantly learning and improving, so please bear with me if I make any mistakes or don't quite understand what you're asking. How can I assist you today?",
+                name=None,
+                tool_calls=None,
+                function_call=None
+            ),
+            logprobs=OpenAIChatCompletionChoiceLogprobs(content=None, refusal=None)
+        )
+    ],
+    created=1749825661,
+    model='Llama-3.3-70B-Instruct',
+    object='chat.completion',
+    system_fingerprint=None,
+    usage={
+        'completion_tokens': 258,
+        'prompt_tokens': 16,
+        'total_tokens': 274,
+        'completion_tokens_details': None,
+        'prompt_tokens_details': None
+    },
+    service_tier=None
 )
 ```

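The README quickstart above now targets the OpenAI-compatible `chat.completions.create` endpoint. For anyone migrating their own scripts, a streaming variant might look like the sketch below. This is illustrative, not part of the PR: the server address and model id are placeholders, and the chunk handling assumes the OpenAI-style `choices[0].delta` shape that this PR's `event_logger.py` changes also rely on.

```python
from llama_stack_client import LlamaStackClient

# Placeholder address: point base_url at your own Llama Stack server.
client = LlamaStackClient(base_url="http://localhost:8321")

response = client.chat.completions.create(
    messages=[{"role": "user", "content": "hello world, write me a 2 sentence poem about the moon"}],
    model="meta-llama/Llama-3.2-3B-Instruct",  # placeholder model id
    stream=True,  # chunks arrive incrementally instead of one final object
)

# Each chunk carries an OpenAI-style choices[0].delta with optional
# role/content fragments, matching what event_logger.py handles below.
for chunk in response:
    if chunk.choices and chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end="", flush=True)
print()
```
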
src/llama_stack_client/lib/cli/inference/inference.py (4 additions, 4 deletions)

@@ -46,8 +46,8 @@ def chat_completion(ctx, message: str, stream: bool, session: bool, model_id: Op
     messages = []
     if message:
         messages.append({"role": "user", "content": message})
-    response = client.inference.chat_completion(
-        model_id=model_id,
+    response = client.chat.completions.create(
+        model=model_id,
         messages=messages,
         stream=stream,
     )
@@ -69,8 +69,8 @@ def chat_session(client, model_id: Optional[str], messages: List[Dict[str, str]]
             console.print("Exiting")
             break
         messages.append({"role": "user", "content": message})
-        response = client.inference.chat_completion(
-            model_id=model_id,
+        response = client.chat.completions.create(
+            model=model_id,
             messages=messages,
             stream=True,
         )
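
For the non-streaming CLI path above (when `stream` is false), the assistant text can be pulled out of the returned object. A minimal sketch, not part of the PR; the field access mirrors the `OpenAIChatCompletion` repr shown in the README diff, and the address and model id are placeholders:

```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")  # placeholder address

response = client.chat.completions.create(
    model="meta-llama/Llama-3.2-3B-Instruct",  # placeholder model id
    messages=[{"role": "user", "content": "hello, what model are you?"}],
    stream=False,
)
# choices[0].message.content holds the assistant reply, per the
# OpenAIChatCompletion structure printed in the README diff above.
print(response.choices[0].message.content)
```
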
src/llama_stack_client/lib/inference/event_logger.py (30 additions, 1 deletion)

@@ -3,7 +3,9 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
+from typing import Generator
 from termcolor import cprint
+from llama_stack_client.types import ChatCompletionResponseStreamChunk, ChatCompletionChunk
 
 
 class InferenceStreamPrintableEvent:
@@ -25,7 +27,19 @@ class InferenceStreamLogEventPrinter:
     def __init__(self):
         self.is_thinking = False
 
-    def yield_printable_events(self, chunk):
+    def yield_printable_events(
+        self, chunk: ChatCompletionResponseStreamChunk | ChatCompletionChunk
+    ) -> Generator[InferenceStreamPrintableEvent, None, None]:
+        # Check if the chunk has event attribute (ChatCompletionResponseStreamChunk)
+        if hasattr(chunk, "event"):
+            yield from self._handle_inference_stream_chunk(chunk)
+        # Check if the chunk has choices attribute (ChatCompletionChunk)
+        elif hasattr(chunk, "choices") and len(chunk.choices) > 0:
+            yield from self._handle_chat_completion_chunk(chunk)
+
+    def _handle_inference_stream_chunk(
+        self, chunk: ChatCompletionResponseStreamChunk
+    ) -> Generator[InferenceStreamPrintableEvent, None, None]:
         event = chunk.event
         if event.event_type == "start":
             yield InferenceStreamPrintableEvent("Assistant> ", color="cyan", end="")
@@ -43,6 +57,21 @@
         elif event.event_type == "complete":
             yield InferenceStreamPrintableEvent("")
+
+    def _handle_chat_completion_chunk(
+        self, chunk: ChatCompletionChunk
+    ) -> Generator[InferenceStreamPrintableEvent, None, None]:
+        choice = chunk.choices[0]
+        delta = choice.delta
+        if delta:
+            if delta.role:
+                yield InferenceStreamPrintableEvent(f"{delta.role}> ", color="cyan", end="")
+            if delta.content:
+                yield InferenceStreamPrintableEvent(delta.content, color="yellow", end="")
+        if choice.finish_reason:
+            if choice.finish_reason == "length":
+                yield InferenceStreamPrintableEvent("<truncated>", color="red", end="")
+            yield InferenceStreamPrintableEvent()
 
 
 class EventLogger:
     def log(self, event_generator):
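
For context, a rough sketch of how the updated printer gets consumed. The only names taken directly from this diff are the module path, `EventLogger`, and the two chunk types; the server address, model id, and the `print()` helper on the yielded events are assumptions:

```python
from llama_stack_client import LlamaStackClient
from llama_stack_client.lib.inference.event_logger import EventLogger

client = LlamaStackClient(base_url="http://localhost:8321")  # placeholder address

response = client.chat.completions.create(
    model="meta-llama/Llama-3.2-3B-Instruct",  # placeholder model id
    messages=[{"role": "user", "content": "hello world"}],
    stream=True,
)

# EventLogger.log() feeds each chunk through yield_printable_events(),
# which now dispatches on shape: chunks with an `event` attribute take
# the legacy path, chunks with `choices` take the new OpenAI-style path.
for printable in EventLogger().log(response):
    printable.print()  # assumed helper on InferenceStreamPrintableEvent
```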