
Commit 6ce3d85

server: (webui) add --webui-config (#18028)
* server/webui: add server-side WebUI config support

  Add CLI arguments --webui-config (inline JSON) and --webui-config-file
  (file path) to configure WebUI default settings from the server side.

  Backend changes:
  - Parse the JSON once in server_context::load_model() for performance
  - Cache the parsed config in the webui_settings member (zero overhead on /props)
  - Add proper error handling in router mode with try/catch
  - Expose webui_settings in the /props endpoint for both router and child modes

  Frontend changes:
  - Add 14 configurable WebUI settings via parameter sync
  - Add tests for webui settings extraction
  - Fix subpath support with base path in API calls

  Addresses feedback from @ngxson and @ggerganov

* server: address review feedback from ngxson

* server: regenerate README with llama-gen-docs
1 parent e85e9d7 commit 6ce3d85
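
Before the per-file diffs, a hedged end-to-end sketch of the feature (the flag name and the `webui_settings` field in `/props` come from this commit; the host, model path, and setting values are illustrative assumptions):

```ts
// Server side (illustrative invocation, shown as a comment):
//   llama-server -m model.gguf \
//     --webui-config '{"pdfAsImage": true, "showToolCalls": true}'

// Client side: any consumer of /props can read back the configured defaults.
// `webui_settings` is `{}` when no --webui-config was provided.
const props = await (await fetch('http://localhost:8080/props')).json();
console.log(props.webui_settings); // => { pdfAsImage: true, showToolCalls: true }
```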

15 files changed (+163, -27 lines)

common/arg.cpp

Lines changed: 14 additions & 0 deletions
```diff
@@ -2610,6 +2610,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.api_prefix = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_API_PREFIX"));
+    add_opt(common_arg(
+        {"--webui-config"}, "JSON",
+        "JSON that provides default WebUI settings (overrides WebUI defaults)",
+        [](common_params & params, const std::string & value) {
+            params.webui_config_json = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_CONFIG"));
+    add_opt(common_arg(
+        {"--webui-config-file"}, "PATH",
+        "JSON file that provides default WebUI settings (overrides WebUI defaults)",
+        [](common_params & params, const std::string & value) {
+            params.webui_config_json = read_file(value);
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_CONFIG_FILE"));
     add_opt(common_arg(
         {"--webui"},
         {"--no-webui"},
```

common/common.h

Lines changed: 4 additions & 1 deletion
```diff
@@ -484,8 +484,11 @@ struct common_params {
 
     std::map<std::string, std::string> default_template_kwargs;
 
+    // webui configs
+    bool webui = true;
+    std::string webui_config_json;
+
     // "advanced" endpoints are disabled by default for better security
-    bool webui = true;
     bool endpoint_slots = true;
     bool endpoint_props = false; // only control POST requests, not GET
     bool endpoint_metrics = false;
```

tools/server/README.md

Lines changed: 12 additions & 7 deletions
```diff
@@ -46,7 +46,7 @@ For the ful list of features, please refer to [server's changelog](https://githu
 | `--cpu-strict-batch <0\|1>` | use strict CPU placement (default: same as --cpu-strict) |
 | `--prio-batch N` | set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: 0)<br/> |
 | `--poll-batch <0\|1>` | use polling to wait for work (default: same as --poll) |
-| `-c, --ctx-size N` | size of the prompt context (default: 4096, 0 = loaded from model)<br/>(env: LLAMA_ARG_CTX_SIZE) |
+| `-c, --ctx-size N` | size of the prompt context (default: 0, 0 = loaded from model)<br/>(env: LLAMA_ARG_CTX_SIZE) |
 | `-n, --predict, --n-predict N` | number of tokens to predict (default: -1, -1 = infinity)<br/>(env: LLAMA_ARG_N_PREDICT) |
 | `-b, --batch-size N` | logical maximum batch size (default: 2048)<br/>(env: LLAMA_ARG_BATCH) |
 | `-ub, --ubatch-size N` | physical maximum batch size (default: 512)<br/>(env: LLAMA_ARG_UBATCH) |
@@ -82,13 +82,16 @@ For the ful list of features, please refer to [server's changelog](https://githu
 | `-sm, --split-mode {none,layer,row}` | how to split the model across multiple GPUs, one of:<br/>- none: use one GPU only<br/>- layer (default): split layers and KV across GPUs<br/>- row: split rows across GPUs<br/>(env: LLAMA_ARG_SPLIT_MODE) |
 | `-ts, --tensor-split N0,N1,N2,...` | fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1<br/>(env: LLAMA_ARG_TENSOR_SPLIT) |
 | `-mg, --main-gpu INDEX` | the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: 0)<br/>(env: LLAMA_ARG_MAIN_GPU) |
+| `-fit, --fit [on\|off]` | whether to adjust unset arguments to fit in device memory ('on' or 'off', default: 'on')<br/>(env: LLAMA_ARG_FIT) |
+| `-fitt, --fit-target MiB` | target margin per device for --fit option, default: 1024<br/>(env: LLAMA_ARG_FIT_TARGET) |
+| `-fitc, --fit-ctx N` | minimum ctx size that can be set by --fit option, default: 4096<br/>(env: LLAMA_ARG_FIT_CTX) |
 | `--check-tensors` | check model tensor data for invalid values (default: false) |
-| `--override-kv KEY=TYPE:VALUE` | advanced option to override model metadata by key. may be specified multiple times.<br/>types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false |
+| `--override-kv KEY=TYPE:VALUE,...` | advanced option to override model metadata by key. to specify multiple overrides, either use comma-separated or repeat this argument.<br/>types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false,tokenizer.ggml.add_eos_token=bool:false |
 | `--op-offload, --no-op-offload` | whether to offload host tensor operations to device (default: true) |
-| `--lora FNAME` | path to LoRA adapter (can be repeated to use multiple adapters) |
-| `--lora-scaled FNAME SCALE` | path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters) |
-| `--control-vector FNAME` | add a control vector<br/>note: this argument can be repeated to add multiple control vectors |
-| `--control-vector-scaled FNAME SCALE` | add a control vector with user defined scaling SCALE<br/>note: this argument can be repeated to add multiple scaled control vectors |
+| `--lora FNAME` | path to LoRA adapter (use comma-separated values to load multiple adapters) |
+| `--lora-scaled FNAME:SCALE,...` | path to LoRA adapter with user defined scaling (format: FNAME:SCALE,...)<br/>note: use comma-separated values |
+| `--control-vector FNAME` | add a control vector<br/>note: use comma-separated values to add multiple control vectors |
+| `--control-vector-scaled FNAME:SCALE,...` | add a control vector with user defined scaling SCALE<br/>note: use comma-separated values (format: FNAME:SCALE,...) |
 | `--control-vector-layer-range START END` | layer range to apply the control vector(s) to, start and end inclusive |
 | `-m, --model FNAME` | model path to load<br/>(env: LLAMA_ARG_MODEL) |
 | `-mu, --model-url MODEL_URL` | model download url (default: unused)<br/>(env: LLAMA_ARG_MODEL_URL) |
@@ -120,7 +123,7 @@ For the ful list of features, please refer to [server's changelog](https://githu
 | `--sampling-seq, --sampler-seq SEQUENCE` | simplified sequence for samplers that will be used (default: edskypmxt) |
 | `--ignore-eos` | ignore end of stream token and continue generating (implies --logit-bias EOS-inf) |
 | `--temp N` | temperature (default: 0.8) |
-| `--top-k N` | top-k sampling (default: 40, 0 = disabled) |
+| `--top-k N` | top-k sampling (default: 40, 0 = disabled)<br/>(env: LLAMA_ARG_TOP_K) |
 | `--top-p N` | top-p sampling (default: 0.9, 1.0 = disabled) |
 | `--min-p N` | min-p sampling (default: 0.1, 0.0 = disabled) |
 | `--top-nsigma N` | top-n-sigma sampling (default: -1.0, -1.0 = disabled) |
@@ -177,6 +180,8 @@ For the ful list of features, please refer to [server's changelog](https://githu
 | `--port PORT` | port to listen (default: 8080)<br/>(env: LLAMA_ARG_PORT) |
 | `--path PATH` | path to serve static files from (default: )<br/>(env: LLAMA_ARG_STATIC_PATH) |
 | `--api-prefix PREFIX` | prefix path the server serves from, without the trailing slash (default: )<br/>(env: LLAMA_ARG_API_PREFIX) |
+| `--webui-config JSON` | JSON that provides default WebUI settings (overrides WebUI defaults)<br/>(env: LLAMA_ARG_WEBUI_CONFIG) |
+| `--webui-config-file PATH` | JSON file that provides default WebUI settings (overrides WebUI defaults)<br/>(env: LLAMA_ARG_WEBUI_CONFIG_FILE) |
 | `--webui, --no-webui` | whether to enable the Web UI (default: enabled)<br/>(env: LLAMA_ARG_WEBUI) |
 | `--embedding, --embeddings` | restrict to only support embedding use case; use only with dedicated embedding models (default: disabled)<br/>(env: LLAMA_ARG_EMBEDDINGS) |
 | `--reranking, --rerank` | enable reranking endpoint on server (default: disabled)<br/>(env: LLAMA_ARG_RERANKING) |
```

tools/server/server-context.cpp

Lines changed: 13 additions & 1 deletion
```diff
@@ -544,6 +544,8 @@ struct server_context_impl {
 
     server_metrics metrics;
 
+    json webui_settings = json::object();
+
     // Necessary similarity of prompt for slot selection
     float slot_prompt_similarity = 0.0f;
 
@@ -575,6 +577,16 @@ struct server_context_impl {
 
     params_base = params;
 
+    webui_settings = json::object();
+    if (!params_base.webui_config_json.empty()) {
+        try {
+            webui_settings = json::parse(params_base.webui_config_json);
+        } catch (const std::exception & e) {
+            SRV_ERR("%s: failed to parse webui config: %s\n", __func__, e.what());
+            return false;
+        }
+    }
+
     llama_init = common_init_from_params(params_base);
 
     model = llama_init->model();
@@ -3103,7 +3115,6 @@ void server_routes::init_routes() {
         };
     }
 
-    // this endpoint is publicly available, please only return what is safe to be exposed
     json data = {
         { "default_generation_settings", default_generation_settings_for_props },
         { "total_slots", ctx_server.params_base.n_parallel },
@@ -3117,6 +3128,7 @@ void server_routes::init_routes() {
         { "endpoint_props", params.endpoint_props },
        { "endpoint_metrics", params.endpoint_metrics },
        { "webui", params.webui },
+        { "webui_settings", ctx_server.webui_settings },
         { "chat_template", common_chat_templates_source(ctx_server.chat_templates.get()) },
         { "bos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_bos(ctx_server.vocab), /* special= */ true)},
         { "eos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_eos(ctx_server.vocab), /* special= */ true)},
```

tools/server/server-models.cpp

Lines changed: 1 addition & 0 deletions
```diff
@@ -818,6 +818,7 @@ void server_models_routes::init_routes() {
             {"params", json{}},
             {"n_ctx", 0},
         }},
+        {"webui_settings", webui_settings},
     });
     return res;
 }
```

tools/server/server-models.h

Lines changed: 10 additions & 0 deletions
```diff
@@ -2,6 +2,7 @@
 
 #include "common.h"
 #include "preset.h"
+#include "server-common.h"
 #include "server-http.h"
 
 #include <mutex>
@@ -149,9 +150,18 @@ struct server_models {
 
 struct server_models_routes {
     common_params params;
+    json webui_settings = json::object();
     server_models models;
     server_models_routes(const common_params & params, int argc, char ** argv, char ** envp)
         : params(params), models(params, argc, argv, envp) {
+        if (!this->params.webui_config_json.empty()) {
+            try {
+                webui_settings = json::parse(this->params.webui_config_json);
+            } catch (const std::exception & e) {
+                LOG_ERR("%s: failed to parse webui config: %s\n", __func__, e.what());
+                throw;
+            }
+        }
         init_routes();
     }
 
```

tools/server/server.cpp

Lines changed: 7 additions & 1 deletion
```diff
@@ -8,6 +8,7 @@
 #include "log.h"
 
 #include <atomic>
+#include <exception>
 #include <signal.h>
 #include <thread> // for std::thread::hardware_concurrency
 
@@ -124,7 +125,12 @@ int main(int argc, char ** argv, char ** envp) {
     std::optional<server_models_routes> models_routes{};
     if (is_router_server) {
         // setup server instances manager
-        models_routes.emplace(params, argc, argv, envp);
+        try {
+            models_routes.emplace(params, argc, argv, envp);
+        } catch (const std::exception & e) {
+            LOG_ERR("%s: failed to initialize router models: %s\n", __func__, e.what());
+            return 1;
+        }
 
         // proxy handlers
         // note: routes.get_health stays the same
```

tools/server/webui/src/lib/components/app/server/ServerErrorSplash.svelte

Lines changed: 2 additions & 1 deletion
```diff
@@ -1,4 +1,5 @@
 <script lang="ts">
+	import { base } from '$app/paths';
 	import { AlertTriangle, RefreshCw, Key, CheckCircle, XCircle } from '@lucide/svelte';
 	import { goto } from '$app/navigation';
 	import { Button } from '$lib/components/ui/button';
@@ -64,7 +65,7 @@
 		settingsStore.updateConfig('apiKey', apiKeyInput.trim());
 
 		// Test the API key by making a real request to the server
-		const response = await fetch('./props', {
+		const response = await fetch(`${base}/props`, {
 			headers: {
 				'Content-Type': 'application/json',
 				Authorization: `Bearer ${apiKeyInput.trim()}`
```
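
The `./props` form resolves against the current page URL, so the request target shifts with the route depth; SvelteKit's `base` keeps it anchored to the configured base path. A short sketch of the difference (host and subpath are illustrative assumptions):

```ts
// Relative fetches move with the route depth:
new URL('./props', 'https://host/llama/ui/chat').href;     // https://host/llama/ui/props
new URL('./props', 'https://host/llama/ui/chat/123').href; // https://host/llama/ui/chat/props (wrong)

// With `base` assumed to be '/llama/ui', fetch(`${base}/props`) always
// targets https://host/llama/ui/props regardless of the current route.
```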

tools/server/webui/src/lib/services/parameter-sync.spec.ts

Lines changed: 14 additions & 0 deletions
```diff
@@ -130,5 +130,19 @@ describe('ParameterSyncService', () => {
 		expect(result.max_tokens).toBe(-1);
 		expect(result.temperature).toBe(0.7);
 	});
+
+	it('should merge webui settings from props when provided', () => {
+		const result = ParameterSyncService.extractServerDefaults(null, {
+			pasteLongTextToFileLen: 0,
+			pdfAsImage: true,
+			renderUserContentAsMarkdown: false,
+			theme: 'dark'
+		});
+
+		expect(result.pasteLongTextToFileLen).toBe(0);
+		expect(result.pdfAsImage).toBe(true);
+		expect(result.renderUserContentAsMarkdown).toBe(false);
+		expect(result.theme).toBeUndefined();
+	});
 	});
 });
```

tools/server/webui/src/lib/services/parameter-sync.ts

Lines changed: 75 additions & 13 deletions
```diff
@@ -55,7 +55,55 @@ export const SYNCABLE_PARAMETERS: SyncableParameter[] = [
 	{ key: 'dry_allowed_length', serverKey: 'dry_allowed_length', type: 'number', canSync: true },
 	{ key: 'dry_penalty_last_n', serverKey: 'dry_penalty_last_n', type: 'number', canSync: true },
 	{ key: 'max_tokens', serverKey: 'max_tokens', type: 'number', canSync: true },
-	{ key: 'samplers', serverKey: 'samplers', type: 'string', canSync: true }
+	{ key: 'samplers', serverKey: 'samplers', type: 'string', canSync: true },
+	{
+		key: 'pasteLongTextToFileLen',
+		serverKey: 'pasteLongTextToFileLen',
+		type: 'number',
+		canSync: true
+	},
+	{ key: 'pdfAsImage', serverKey: 'pdfAsImage', type: 'boolean', canSync: true },
+	{
+		key: 'showThoughtInProgress',
+		serverKey: 'showThoughtInProgress',
+		type: 'boolean',
+		canSync: true
+	},
+	{ key: 'showToolCalls', serverKey: 'showToolCalls', type: 'boolean', canSync: true },
+	{
+		key: 'disableReasoningFormat',
+		serverKey: 'disableReasoningFormat',
+		type: 'boolean',
+		canSync: true
+	},
+	{ key: 'keepStatsVisible', serverKey: 'keepStatsVisible', type: 'boolean', canSync: true },
+	{ key: 'showMessageStats', serverKey: 'showMessageStats', type: 'boolean', canSync: true },
+	{
+		key: 'askForTitleConfirmation',
+		serverKey: 'askForTitleConfirmation',
+		type: 'boolean',
+		canSync: true
+	},
+	{ key: 'disableAutoScroll', serverKey: 'disableAutoScroll', type: 'boolean', canSync: true },
+	{
+		key: 'renderUserContentAsMarkdown',
+		serverKey: 'renderUserContentAsMarkdown',
+		type: 'boolean',
+		canSync: true
+	},
+	{ key: 'autoMicOnEmpty', serverKey: 'autoMicOnEmpty', type: 'boolean', canSync: true },
+	{
+		key: 'pyInterpreterEnabled',
+		serverKey: 'pyInterpreterEnabled',
+		type: 'boolean',
+		canSync: true
+	},
+	{
+		key: 'enableContinueGeneration',
+		serverKey: 'enableContinueGeneration',
+		type: 'boolean',
+		canSync: true
+	}
 ];
 
 export class ParameterSyncService {
@@ -74,25 +122,39 @@
 	 * Extract server default parameters that can be synced
 	 */
 	static extractServerDefaults(
-		serverParams: ApiLlamaCppServerProps['default_generation_settings']['params'] | null
+		serverParams: ApiLlamaCppServerProps['default_generation_settings']['params'] | null,
+		webuiSettings?: Record<string, string | number | boolean>
 	): ParameterRecord {
-		if (!serverParams) return {};
-
 		const extracted: ParameterRecord = {};
 
-		for (const param of SYNCABLE_PARAMETERS) {
-			if (param.canSync && param.serverKey in serverParams) {
-				const value = (serverParams as unknown as Record<string, ParameterValue>)[param.serverKey];
-				if (value !== undefined) {
-					// Apply precision rounding to avoid JavaScript floating-point issues
-					extracted[param.key] = this.roundFloatingPoint(value);
+		if (serverParams) {
+			for (const param of SYNCABLE_PARAMETERS) {
+				if (param.canSync && param.serverKey in serverParams) {
+					const value = (serverParams as unknown as Record<string, ParameterValue>)[
+						param.serverKey
+					];
+					if (value !== undefined) {
+						// Apply precision rounding to avoid JavaScript floating-point issues
+						extracted[param.key] = this.roundFloatingPoint(value);
+					}
 				}
 			}
+
+			// Handle samplers array conversion to string
+			if (serverParams.samplers && Array.isArray(serverParams.samplers)) {
+				extracted.samplers = serverParams.samplers.join(';');
+			}
 		}
 
-		// Handle samplers array conversion to string
-		if (serverParams.samplers && Array.isArray(serverParams.samplers)) {
-			extracted.samplers = serverParams.samplers.join(';');
+		if (webuiSettings) {
+			for (const param of SYNCABLE_PARAMETERS) {
+				if (param.canSync && param.serverKey in webuiSettings) {
+					const value = webuiSettings[param.serverKey];
+					if (value !== undefined) {
+						extracted[param.key] = this.roundFloatingPoint(value);
+					}
+				}
+			}
 		}
 
 		return extracted;
```
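
A hedged sketch of the resulting call site (the `props` variable and its origin are assumptions; the two-argument signature matches the spec above). Generation defaults are extracted first and `webui_settings` is merged afterwards, so server-configured WebUI settings win for overlapping keys:

```ts
// Hypothetical caller; `props` is assumed to be a parsed /props response.
const defaults = ParameterSyncService.extractServerDefaults(
	props.default_generation_settings?.params ?? null,
	props.webui_settings // merged last, so these take precedence for overlapping keys
);
// Keys not listed in SYNCABLE_PARAMETERS are ignored; e.g. `theme` stays
// undefined, as the spec above asserts.
```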
