
Commit 1a00e45

Update settings fields and defaults
1 parent 8675397 commit 1a00e45

File tree

1 file changed: 55 additions, 39 deletions


llama_cpp/server/app.py

Lines changed: 55 additions & 39 deletions
@@ -13,18 +13,41 @@


 class Settings(BaseSettings):
-    model: str
-    n_ctx: int = 2048
-    n_batch: int = 512
-    n_threads: int = max((os.cpu_count() or 2) // 2, 1)
-    f16_kv: bool = True
-    use_mlock: bool = False # This causes a silent failure on platforms that don't support mlock (e.g. Windows) took forever to figure out...
-    use_mmap: bool = True
-    embedding: bool = True
-    last_n_tokens_size: int = 64
-    logits_all: bool = False
-    cache: bool = False # WARNING: This is an experimental feature
-    vocab_only: bool = False
+    model: str = Field(
+        description="The path to the model to use for generating completions."
+    )
+    n_ctx: int = Field(default=2048, ge=1, description="The context size.")
+    n_batch: int = Field(
+        default=512, ge=1, description="The batch size to use per eval."
+    )
+    n_threads: int = Field(
+        default=max((os.cpu_count() or 2) // 2, 1),
+        ge=1,
+        description="The number of threads to use.",
+    )
+    f16_kv: bool = Field(default=True, description="Whether to use f16 key/value.")
+    use_mlock: bool = Field(
+        default=bool(llama_cpp.llama_mlock_supported().value),
+        description="Use mlock.",
+    )
+    use_mmap: bool = Field(
+        default=bool(llama_cpp.llama_mmap_supported().value),
+        description="Use mmap.",
+    )
+    embedding: bool = Field(default=True, description="Whether to use embeddings.")
+    last_n_tokens_size: int = Field(
+        default=64,
+        ge=0,
+        description="Last n tokens to keep for repeat penalty calculation.",
+    )
+    logits_all: bool = Field(default=True, description="Whether to return logits.")
+    cache: bool = Field(
+        default=False,
+        description="Use a cache to reduce processing times for evaluated prompts.",
+    )
+    vocab_only: bool = Field(
+        default=False, description="Whether to only return the vocabulary."
+    )


 router = APIRouter()
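
The rewritten Settings class keeps the same runtime behaviour but documents and validates every field, and because it subclasses pydantic's BaseSettings each field can also be supplied through environment variables or constructor keyword arguments. A minimal sketch follows; the model path, the env var spelling, and the assumption that no env_prefix is configured are illustrative, not taken from this diff.

# Values can come from the environment (BaseSettings matches field names
# case-insensitively), e.g.
#   MODEL=./models/ggml-model.bin N_CTX=4096 USE_MLOCK=false python -m llama_cpp.server
# or be passed directly; keyword arguments override env vars and defaults.
from llama_cpp.server.app import Settings

settings = Settings(model="./models/ggml-model.bin", n_ctx=4096, n_threads=8)
print(settings.dict())  # pydantic v1-style dump of the resolved configuration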
@@ -74,79 +97,75 @@ def get_llama():
     with llama_lock:
         yield llama

-model_field = Field(
-    description="The model to use for generating completions."
-)
+
+model_field = Field(description="The model to use for generating completions.")


 max_tokens_field = Field(
-    default=16,
-    ge=1,
-    le=2048,
-    description="The maximum number of tokens to generate."
+    default=16, ge=1, le=2048, description="The maximum number of tokens to generate."
 )

 temperature_field = Field(
     default=0.8,
     ge=0.0,
     le=2.0,
-    description="Adjust the randomness of the generated text.\n\n" +
-    "Temperature is a hyperparameter that controls the randomness of the generated text. It affects the probability distribution of the model's output tokens. A higher temperature (e.g., 1.5) makes the output more random and creative, while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. The default value is 0.8, which provides a balance between randomness and determinism. At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run."
+    description="Adjust the randomness of the generated text.\n\n"
+    + "Temperature is a hyperparameter that controls the randomness of the generated text. It affects the probability distribution of the model's output tokens. A higher temperature (e.g., 1.5) makes the output more random and creative, while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. The default value is 0.8, which provides a balance between randomness and determinism. At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run.",
 )

 top_p_field = Field(
     default=0.95,
     ge=0.0,
     le=1.0,
-    description="Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P.\n\n" +
-    "Top-p sampling, also known as nucleus sampling, is another text generation method that selects the next token from a subset of tokens that together have a cumulative probability of at least p. This method provides a balance between diversity and quality by considering both the probabilities of tokens and the number of tokens to sample from. A higher value for top_p (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text."
+    description="Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P.\n\n"
+    + "Top-p sampling, also known as nucleus sampling, is another text generation method that selects the next token from a subset of tokens that together have a cumulative probability of at least p. This method provides a balance between diversity and quality by considering both the probabilities of tokens and the number of tokens to sample from. A higher value for top_p (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text.",
 )

 stop_field = Field(
     default=None,
-    description="A list of tokens at which to stop generation. If None, no stop tokens are used."
+    description="A list of tokens at which to stop generation. If None, no stop tokens are used.",
 )

 stream_field = Field(
     default=False,
-    description="Whether to stream the results as they are generated. Useful for chatbots."
+    description="Whether to stream the results as they are generated. Useful for chatbots.",
 )

 top_k_field = Field(
     default=40,
     ge=0,
-    description="Limit the next token selection to the K most probable tokens.\n\n" +
-    "Top-k sampling is a text generation method that selects the next token only from the top k most likely tokens predicted by the model. It helps reduce the risk of generating low-probability or nonsensical tokens, but it may also limit the diversity of the output. A higher value for top_k (e.g., 100) will consider more tokens and lead to more diverse text, while a lower value (e.g., 10) will focus on the most probable tokens and generate more conservative text."
+    description="Limit the next token selection to the K most probable tokens.\n\n"
+    + "Top-k sampling is a text generation method that selects the next token only from the top k most likely tokens predicted by the model. It helps reduce the risk of generating low-probability or nonsensical tokens, but it may also limit the diversity of the output. A higher value for top_k (e.g., 100) will consider more tokens and lead to more diverse text, while a lower value (e.g., 10) will focus on the most probable tokens and generate more conservative text.",
 )

 repeat_penalty_field = Field(
     default=1.0,
     ge=0.0,
-    description="A penalty applied to each token that is already generated. This helps prevent the model from repeating itself.\n\n" +
-    "Repeat penalty is a hyperparameter used to penalize the repetition of token sequences during text generation. It helps prevent the model from generating repetitive or monotonous text. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient."
+    description="A penalty applied to each token that is already generated. This helps prevent the model from repeating itself.\n\n"
+    + "Repeat penalty is a hyperparameter used to penalize the repetition of token sequences during text generation. It helps prevent the model from generating repetitive or monotonous text. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient.",
 )

+
 class CreateCompletionRequest(BaseModel):
     prompt: Optional[str] = Field(
-        default="",
-        description="The prompt to generate completions for."
+        default="", description="The prompt to generate completions for."
     )
     suffix: Optional[str] = Field(
         default=None,
-        description="A suffix to append to the generated text. If None, no suffix is appended. Useful for chatbots."
+        description="A suffix to append to the generated text. If None, no suffix is appended. Useful for chatbots.",
     )
     max_tokens: int = max_tokens_field
     temperature: float = temperature_field
     top_p: float = top_p_field
     echo: bool = Field(
         default=False,
-        description="Whether to echo the prompt in the generated text. Useful for chatbots."
+        description="Whether to echo the prompt in the generated text. Useful for chatbots.",
     )
     stop: Optional[List[str]] = stop_field
     stream: bool = stream_field
     logprobs: Optional[int] = Field(
         default=None,
         ge=0,
-        description="The number of logprobs to generate. If None, no logprobs are generated."
+        description="The number of logprobs to generate. If None, no logprobs are generated.",
     )

     # ignored or currently unsupported
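
The field descriptions above double as user-facing documentation for the sampling parameters. Below is a hedged sketch of a client request exercising the CreateCompletionRequest fields visible in this hunk; the /v1/completions route, host, and port are assumptions for illustration, not taken from this diff.

import requests

payload = {
    "prompt": "Q: Name the planets in the solar system. A:",
    "max_tokens": 64,      # validated by max_tokens_field (ge=1, le=2048)
    "temperature": 0.8,    # temperature_field default
    "top_p": 0.95,         # nucleus sampling threshold
    "stop": ["\n"],        # stop sequences, see stop_field
    "stream": False,       # stream_field
}
resp = requests.post("http://localhost:8000/v1/completions", json=payload)
print(resp.json())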
@@ -204,9 +223,7 @@ def create_completion(

 class CreateEmbeddingRequest(BaseModel):
     model: Optional[str] = model_field
-    input: str = Field(
-        description="The input to embed."
-    )
+    input: str = Field(description="The input to embed.")
     user: Optional[str]

     class Config:
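
For the embedding schema, only input is required once model falls back to the shared model_field default. A small usage sketch; the /v1/embeddings route and the OpenAI-style response shape are assumptions for illustration.

import requests

resp = requests.post(
    "http://localhost:8000/v1/embeddings",
    json={"input": "The quick brown fox jumps over the lazy dog."},
)
print(len(resp.json()["data"][0]["embedding"]))  # embedding dimensionality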
@@ -239,8 +256,7 @@ class ChatCompletionRequestMessage(BaseModel):

 class CreateChatCompletionRequest(BaseModel):
     messages: List[ChatCompletionRequestMessage] = Field(
-        default=[],
-        description="A list of messages to generate completions for."
+        default=[], description="A list of messages to generate completions for."
     )
     max_tokens: int = max_tokens_field
     temperature: float = temperature_field
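
The chat schema reuses the same shared field objects, so defaults and validation stay consistent with the completion endpoint. A sketch of a request built from this model; the role/content message shape and the /v1/chat/completions route follow the OpenAI convention and are assumptions here.

import requests

payload = {
    "messages": [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What is the capital of France?"},
    ],
    "max_tokens": 32,       # max_tokens_field
    "temperature": 0.7,     # temperature_field allows 0.0-2.0
}
resp = requests.post("http://localhost:8000/v1/chat/completions", json=payload)
print(resp.json()["choices"][0]["message"]["content"])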
