From 4e475a8b7435d88939b9f4a8272f467f1997928a Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <son@huggingface.co>
Date: Thu, 18 Dec 2025 13:46:52 +0100
Subject: [PATCH 1/8] presets: refactor, allow cascade presets from different
 sources

---
 common/arg.cpp                 |   4 +-
 common/preset.cpp              | 193 +++++++++++++++++++-
 common/preset.h                |  47 ++++-
 tools/server/server-models.cpp | 316 +++++++++------------------------
 tools/server/server-models.h   |  23 +--
 5 files changed, 325 insertions(+), 258 deletions(-)
diff --git a/common/arg.cpp b/common/arg.cpp
index b6d16168ebc..4a0ec656bdd 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -873,7 +873,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         sampler_type_chars += common_sampler_type_to_chr(sampler);
         sampler_type_names += common_sampler_type_to_str(sampler) + ";";
     }
-    sampler_type_names.pop_back();
+    if (!sampler_type_names.empty()) {
+        sampler_type_names.pop_back(); // remove last semicolon
+    }
 
 
     /**
diff --git a/common/preset.cpp b/common/preset.cpp
index 60746aad581..9a21fe75e64 100644
--- a/common/preset.cpp
+++ b/common/preset.cpp
@@ -2,6 +2,7 @@
 #include "preset.h"
 #include "peg-parser.h"
 #include "log.h"
+#include "download.h"
 
 #include <fstream>
 #include <sstream>
@@ -15,9 +16,13 @@ static std::string rm_leading_dashes(const std::string & str) {
     return str.substr(pos);
 }
 
-std::vector<std::string> common_preset::to_args() const {
+std::vector<std::string> common_preset::to_args(const std::string & bin_path) const {
     std::vector<std::string> args;
 
+    if (!bin_path.empty()) {
+        args.push_back(bin_path);
+    }
+
     for (const auto & [opt, value] : options) {
         args.push_back(opt.args.back()); // use the last arg as the main arg
         if (opt.value_hint == nullptr && opt.value_hint_2 == nullptr) {
@@ -63,6 +68,52 @@ std::string common_preset::to_ini() const {
     return ss.str();
 }
 
+void common_preset::set_option(const common_preset_context & ctx, const std::string & env, const std::string & value) {
+    // fast path: the option is already part of this preset, only its value changes
+    for (auto & entry : options) {
+        if (entry.first.env != nullptr && env == entry.first.env) {
+            entry.second = value;
+            return;
+        }
+    }
+    // not set yet: resolve the option definition via the context's key map, then insert it
+    const auto found = ctx.key_to_opt.find(env);
+    if (found == ctx.key_to_opt.end()) {
+        throw std::runtime_error(string_format(
+            "%s: option with env '%s' not found in ctx_params", __func__, env.c_str()
+        ));
+    }
+    options[found->second] = value;
+}
+
+void common_preset::unset_option(const std::string & env) {
+    // erase the first option whose env name matches; nothing happens when there is none
+    for (auto it = options.begin(); it != options.end(); ++it) {
+        const common_arg & opt = it->first;
+        if (opt.env == nullptr || env != opt.env) {
+            continue;
+        }
+        options.erase(it);
+        return;
+    }
+}
+
+bool common_preset::get_option(const std::string & env, std::string & value) const {
+    for (const auto & entry : options) { // keyed by common_arg, so scan for the env name
+        if (entry.first.env != nullptr && env == entry.first.env) {
+            value = entry.second; // `value` is only written on a match
+            return true;
+        }
+    }
+    return false;
+}
+
+void common_preset::merge(const common_preset & other) {
+    for (const auto & entry : other.options) {
+        options.insert_or_assign(entry.first, entry.second); // values from `other` win; `name` is untouched
+    }
+}
+
 static std::map<std::string, std::map<std::string, std::string>> parse_ini_from_file(const std::string & path) {
     std::map<std::string, std::map<std::string, std::string>> parsed;
 
@@ -172,9 +223,12 @@ static std::string parse_bool_arg(const common_arg & arg, const std::string & ke
     return value;
 }
 
-common_presets common_presets_load(const std::string & path, common_params_context & ctx_params) {
+common_preset_context::common_preset_context(common_params & default_params, llama_example ex)
+    : ctx_params(common_params_parser_init(default_params, ex)), // must come first: key_to_opt is derived from it
+      key_to_opt(get_map_key_opt(ctx_params)) {}
+
+common_presets common_preset_context::load_from_ini(const std::string & path, common_preset & global) const {
     common_presets out;
-    auto key_to_opt = get_map_key_opt(ctx_params);
     auto ini_data = parse_ini_from_file(path);
 
     for (auto section : ini_data) {
@@ -188,7 +242,7 @@ common_presets common_presets_load(const std::string & path, common_params_conte
         for (const auto & [key, value] : section.second) {
             LOG_DBG("option: %s = %s\n", key.c_str(), value.c_str());
             if (key_to_opt.find(key) != key_to_opt.end()) {
-                auto & opt = key_to_opt[key];
+                const auto & opt = key_to_opt.at(key);
                 if (is_bool_arg(opt)) {
                     preset.options[opt] = parse_bool_arg(opt, key, value);
                 } else {
@@ -199,8 +253,139 @@ common_presets common_presets_load(const std::string & path, common_params_conte
                 // TODO: maybe warn about unknown key?
             }
         }
+
+        if (preset.name == "*") {
+            // handle global preset
+            global = preset;
+        } else {
+            out[preset.name] = preset;
+        }
+    }
+
+    return out;
+}
+
+common_presets common_preset_context::load_from_cache() const {
+    common_presets out;
+    // one preset per cached model; its name and LLAMA_ARG_HF_REPO are both model.to_string()
+    for (const auto & model : common_list_cached_models()) {
+        const std::string repo = model.to_string();
+        common_preset preset;
+        preset.name = repo;
+        preset.set_option(*this, "LLAMA_ARG_HF_REPO", repo);
+        out[repo] = preset;
+    }
+
+    return out;
+}
+
+struct local_model {
+    std::string name;        // sub-directory name, or file name without ".gguf" for a top-level file
+    std::string path;        // model file; for a sub-directory, the "-00001-of-" shard when one exists
+    std::string path_mmproj; // "mmproj" .gguf found in the same sub-directory; may be empty
+};
+
+common_presets common_preset_context::load_from_models_dir(const std::string & models_dir) const {
+    if (!std::filesystem::exists(models_dir) || !std::filesystem::is_directory(models_dir)) {
+        throw std::runtime_error(string_format("error: '%s' does not exist or is not a directory\n", models_dir.c_str()));
+    }
+
+    std::vector<local_model> models;
+    auto scan_subdir = [&models](const std::string & subdir_path, const std::string & name) {
+        auto files = fs_list(subdir_path, false);
+        common_file_info model_file;
+        common_file_info first_shard_file;
+        common_file_info mmproj_file;
+        for (const auto & file : files) {
+            if (string_ends_with(file.name, ".gguf")) {
+                if (file.name.find("mmproj") != std::string::npos) {
+                    mmproj_file = file;
+                } else if (file.name.find("-00001-of-") != std::string::npos) {
+                    first_shard_file = file;
+                } else {
+                    model_file = file;
+                }
+            }
+        }
+        // the first shard of a split model wins over any other (non-mmproj) .gguf file
+        local_model model{
+            /* name        */ name,
+            /* path        */ first_shard_file.path.empty() ? model_file.path : first_shard_file.path,
+            /* path_mmproj */ mmproj_file.path // can be empty
+        };
+        if (!model.path.empty()) {
+            models.push_back(model);
+        }
+    };
+
+    auto files = fs_list(models_dir, true);
+    for (const auto & file : files) {
+        if (file.is_dir) {
+            scan_subdir(file.path, file.name);
+        } else if (string_ends_with(file.name, ".gguf")) {
+            // single file model: the name is the file name minus its ".gguf" suffix only
+            std::string name = file.name;
+            name.erase(name.size() - std::string(".gguf").size()); // suffix presence is guaranteed by the check above
+            local_model model{
+                /* name        */ name,
+                /* path        */ file.path,
+                /* path_mmproj */ ""
+            };
+            models.push_back(model);
+        }
+    }
+
+    // convert local models to presets
+    common_presets out;
+    common_preset base;
+    base.set_option(*this, "LLAMA_ARG_MODEL",  "");
+    base.set_option(*this, "LLAMA_ARG_MMPROJ", "");
+    for (const auto & model : models) {
+        common_preset preset = base; // copy
+        preset.name = model.name;
+        preset.set_option(*this, "LLAMA_ARG_MODEL", model.path);
+        if (!model.path_mmproj.empty()) {
+            preset.set_option(*this, "LLAMA_ARG_MMPROJ", model.path_mmproj);
+        }
         out[preset.name] = preset;
     }
 
     return out;
 }
+
+common_preset common_preset_context::load_from_args(int argc, char ** argv) const {
+    common_preset preset;
+    preset.name = COMMON_PRESET_DEFAULT_NAME;
+
+    // fill preset.options straight from argv; a parse failure is surfaced as an exception
+    if (!common_params_to_map(argc, argv, ctx_params.ex, preset.options)) {
+        throw std::runtime_error("failed to parse CLI arguments into preset");
+    }
+
+    return preset;
+}
+
+common_presets common_preset_context::cascade(const common_presets & base, const common_presets & added) const {
+    common_presets out = base; // copy
+    for (const auto & [name, preset_added] : added) {
+        const auto it = out.find(name);
+        if (it == out.end()) {
+            // not present in base: take the added preset as-is
+            out[name] = preset_added;
+        } else {
+            // present on both sides: options from `added` override those from base
+            it->second.merge(preset_added);
+        }
+    }
+    return out;
+}
+
+common_presets common_preset_context::cascade(const common_preset & base, const common_presets & presets) const {
+    common_presets out;
+    for (const auto & [name, preset] : presets) {
+        common_preset & merged = (out[name] = base); // start from a copy of the base preset
+        merged.name = name;                          // keep the entry's own name, not the base's (merge() only copies options)
+        merged.merge(preset);                        // preset-specific options override the base ones
+    }
+    return out;
+}
diff --git a/common/preset.h b/common/preset.h
index dceb849eb81..d7e06a8f872 100644
--- a/common/preset.h
+++ b/common/preset.h
@@ -13,20 +13,61 @@
 
 constexpr const char * COMMON_PRESET_DEFAULT_NAME = "default";
 
+struct common_preset_context;
+
 struct common_preset {
     std::string name;
-    // TODO: support repeated args in the future
+
+    // options are stored as common_arg to string mapping, representing CLI arg and its value
     std::map<common_arg, std::string> options;
 
     // convert preset to CLI argument list
-    std::vector<std::string> to_args() const;
+    std::vector<std::string> to_args(const std::string & bin_path = "") const;
 
     // convert preset to INI format string
     std::string to_ini() const;
 
     // TODO: maybe implement to_env() if needed
+
+    // modify preset options where argument is identified by its env variable
+    void set_option(const common_preset_context & ctx, const std::string & env, const std::string & value);
+
+    // unset option by its env variable
+    void unset_option(const std::string & env);
+
+    // get option value by its env variable, return false if not found
+    bool get_option(const std::string & env, std::string & value) const;
+
+    // merge another preset into this one, overwriting existing options
+    void merge(const common_preset & other);
 };
 
 // interface for multiple presets in one file
 using common_presets = std::map<std::string, common_preset>;
-common_presets common_presets_load(const std::string & path, common_params_context & ctx_params);
+
+// context for loading and editing presets
+struct common_preset_context {
+    common_params_context ctx_params;             // declared first: key_to_opt is built from it in the constructor
+    std::map<std::string, common_arg> key_to_opt; // option lookup; queried with INI keys and with env names
+    common_preset_context(common_params & default_params, llama_example ex);
+
+    // load presets from INI file; the "*" section is written to `global` instead of being returned
+    common_presets load_from_ini(const std::string & path, common_preset & global) const;
+
+    // generate one preset per cached model (each sets LLAMA_ARG_HF_REPO)
+    common_presets load_from_cache() const;
+
+    // generate presets from local models directory (throws std::runtime_error if it is not a directory)
+    // for the directory structure, see "Using multiple models" in server/README.md
+    common_presets load_from_models_dir(const std::string & models_dir) const;
+
+    // generate one preset, named COMMON_PRESET_DEFAULT_NAME, from CLI arguments (throws on parse failure)
+    common_preset load_from_args(int argc, char ** argv) const;
+
+    // cascade multiple presets if exist on both: base < added (options from `added` win)
+    // if preset does not exist in base, it will be added without modification
+    common_presets cascade(const common_presets & base, const common_presets & added) const;
+
+    // apply presets over a base preset (same idea as CSS cascading)
+    common_presets cascade(const common_preset & base, const common_presets & presets) const;
+};
diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp
index c1f86e54933..1b70a395074 100644
--- a/tools/server/server-models.cpp
+++ b/tools/server/server-models.cpp
@@ -82,154 +82,29 @@ static std::filesystem::path get_server_exec_path() {
 #endif
 }
 
-struct local_model {
-    std::string name;
-    std::string path;
-    std::string path_mmproj;
-};
-
-static std::vector<local_model> list_local_models(const std::string & dir) {
-    if (!std::filesystem::exists(dir) || !std::filesystem::is_directory(dir)) {
-        throw std::runtime_error(string_format("error: '%s' does not exist or is not a directory\n", dir.c_str()));
-    }
-
-    std::vector<local_model> models;
-    auto scan_subdir = [&models](const std::string & subdir_path, const std::string & name) {
-        auto files = fs_list(subdir_path, false);
-        common_file_info model_file;
-        common_file_info first_shard_file;
-        common_file_info mmproj_file;
-        for (const auto & file : files) {
-            if (string_ends_with(file.name, ".gguf")) {
-                if (file.name.find("mmproj") != std::string::npos) {
-                    mmproj_file = file;
-                } else if (file.name.find("-00001-of-") != std::string::npos) {
-                    first_shard_file = file;
-                } else {
-                    model_file = file;
-                }
-            }
-        }
-        // single file model
-        local_model model{
-            /* name        */ name,
-            /* path        */ first_shard_file.path.empty() ? model_file.path : first_shard_file.path,
-            /* path_mmproj */ mmproj_file.path // can be empty
-        };
-        if (!model.path.empty()) {
-            models.push_back(model);
-        }
-    };
-
-    auto files = fs_list(dir, true);
-    for (const auto & file : files) {
-        if (file.is_dir) {
-            scan_subdir(file.path, file.name);
-        } else if (string_ends_with(file.name, ".gguf")) {
-            // single file model
-            std::string name = file.name;
-            string_replace_all(name, ".gguf", "");
-            local_model model{
-                /* name        */ name,
-                /* path        */ file.path,
-                /* path_mmproj */ ""
-            };
-            models.push_back(model);
-        }
-    }
-    return models;
-}
-
-//
-// server_presets
-//
-
-
-server_presets::server_presets(int argc, char ** argv, common_params & base_params, const std::string & presets_path)
-        : ctx_params(common_params_parser_init(base_params, LLAMA_EXAMPLE_SERVER)) {
-    if (!presets_path.empty()) {
-        presets = common_presets_load(presets_path, ctx_params);
-        SRV_INF("Loaded %zu presets from %s\n", presets.size(), presets_path.c_str());
-    }
-
-    // populate reserved args (will be appended by the router)
-    for (auto & opt : ctx_params.options) {
-        if (opt.env == nullptr) {
-            continue;
-        }
-        std::string env = opt.env;
-        if (env == "LLAMA_ARG_PORT" ||
-            env == "LLAMA_ARG_HOST" ||
-            env == "LLAMA_ARG_ALIAS" ||
-            env == "LLAMA_ARG_API_KEY" ||
-            env == "LLAMA_ARG_MODELS_DIR" ||
-            env == "LLAMA_ARG_MODELS_MAX" ||
-            env == "LLAMA_ARG_MODELS_PRESET" ||
-            env == "LLAMA_ARG_MODEL" ||
-            env == "LLAMA_ARG_MMPROJ" ||
-            env == "LLAMA_ARG_HF_REPO" ||
-            env == "LLAMA_ARG_NO_MODELS_AUTOLOAD" ||
-            env == "LLAMA_ARG_SSL_KEY_FILE" ||
-            env == "LLAMA_ARG_SSL_CERT_FILE") {
-            control_args[env] = opt;
-        }
-    }
-
-    // read base args from router's argv
-    common_params_to_map(argc, argv, LLAMA_EXAMPLE_SERVER, base_args);
-
-    // remove any router-controlled args from base_args
-    for (const auto & cargs : control_args) {
-        auto it = base_args.find(cargs.second);
-        if (it != base_args.end()) {
-            base_args.erase(it);
-        }
-    }
-}
-
-common_preset server_presets::get_preset(const std::string & name) {
-    auto it = presets.find(name);
-    if (it != presets.end()) {
-        return it->second;
-    }
-    return common_preset();
-}
-
-void server_presets::render_args(server_model_meta & meta) {
-    common_preset preset = meta.preset; // copy
-    // merging 3 kinds of args:
-    // 1. model-specific args (from preset)
-    // force removing control args if any
-    for (auto & cargs : control_args) {
-        if (preset.options.find(cargs.second) != preset.options.end()) {
-            SRV_WRN("Preset '%s' contains reserved arg '%s', removing it\n", preset.name.c_str(), cargs.second.args[0]);
-            preset.options.erase(cargs.second);
-        }
-    }
-    // 2. base args (from router)
-    // inherit from base args
-    for (const auto & [arg, value] : base_args) {
-        preset.options[arg] = value;
-    }
-    // 3. control args (from router)
-    // set control values
-    preset.options[control_args["LLAMA_ARG_HOST"]] = CHILD_ADDR;
-    preset.options[control_args["LLAMA_ARG_PORT"]] = std::to_string(meta.port);
-    preset.options[control_args["LLAMA_ARG_ALIAS"]] = meta.name;
-    if (meta.in_cache) {
-        preset.options[control_args["LLAMA_ARG_HF_REPO"]] = meta.name;
-    } else {
-        preset.options[control_args["LLAMA_ARG_MODEL"]] = meta.path;
-        if (!meta.path_mmproj.empty()) {
-            preset.options[control_args["LLAMA_ARG_MMPROJ"]] = meta.path_mmproj;
-        }
-    }
-    // disable SSL for child processes (HTTPS already handled by router)
-    preset.options[control_args["LLAMA_ARG_SSL_KEY_FILE"]] = "";
-    preset.options[control_args["LLAMA_ARG_SSL_CERT_FILE"]] = "";
-    meta.args = preset.to_args();
-    // add back the binary path at the front
-    meta.args.insert(meta.args.begin(), get_server_exec_path().string());
+static void unset_reserved_args(common_preset & preset, bool unset_model_args) {
+    // options reserved for the router; always stripped from the preset
+    for (const char * env : {
+            "LLAMA_ARG_SSL_KEY_FILE", "LLAMA_ARG_SSL_CERT_FILE", "LLAMA_ARG_API_KEY",
+            "LLAMA_ARG_MODELS_DIR",   "LLAMA_ARG_MODELS_MAX",    "LLAMA_ARG_MODELS_PRESET",
+            "LLAMA_ARG_NO_MODELS_AUTOLOAD" }) {
+        preset.unset_option(env);
+    }
+    if (unset_model_args) {
+        for (const char * env : { "LLAMA_ARG_MODEL", "LLAMA_ARG_MMPROJ", "LLAMA_ARG_HF_REPO" }) {
+            preset.unset_option(env);
+        }
+    }
+}
+
+void server_model_meta::update_args(common_preset_context & ctx_preset, std::string bin_path) {
+    // strip router-only options, but keep the model args (model / mmproj / hf-repo)
+    unset_reserved_args(preset, /* unset_model_args */ false);
+    // per-instance values always come from this meta, overriding whatever the preset had
+    preset.set_option(ctx_preset, "LLAMA_ARG_ALIAS", name);
+    preset.set_option(ctx_preset, "LLAMA_ARG_PORT",  std::to_string(port));
+    preset.set_option(ctx_preset, "LLAMA_ARG_HOST",  CHILD_ADDR);
+    args = preset.to_args(bin_path); // a non-empty bin_path becomes args[0]
+}
 
 //
@@ -240,20 +115,20 @@ server_models::server_models(
         const common_params & params,
         int argc,
         char ** argv,
-        char ** envp) : base_params(params), presets(argc, argv, base_params, params.models_preset) {
-    for (int i = 0; i < argc; i++) {
-        base_args.push_back(std::string(argv[i]));
-    }
+        char ** envp)
+            : ctx_preset(base_params, LLAMA_EXAMPLE_SERVER),
+              base_params(params),
+              base_preset(ctx_preset.load_from_args(argc, argv)) {
     for (char ** env = envp; *env != nullptr; env++) {
         base_env.push_back(std::string(*env));
     }
-    GGML_ASSERT(!base_args.empty());
     // set binary path
     try {
-        base_args[0] = get_server_exec_path().string();
+        bin_path = get_server_exec_path().string();
     } catch (const std::exception & e) {
+        bin_path = argv[0];
         LOG_WRN("failed to get server executable path: %s\n", e.what());
-        LOG_WRN("using original argv[0] as fallback: %s\n", base_args[0].c_str());
+        LOG_WRN("using original argv[0] as fallback: %s\n", argv[0]);
     }
     load_models();
 }
@@ -262,7 +137,7 @@ void server_models::add_model(server_model_meta && meta) {
     if (mapping.find(meta.name) != mapping.end()) {
         throw std::runtime_error(string_format("model '%s' appears multiple times", meta.name.c_str()));
     }
-    presets.render_args(meta); // populate meta.args
+    meta.update_args(ctx_preset, bin_path); // render args
     std::string name = meta.name;
     mapping[name] = instance_t{
         /* subproc */ std::make_shared<subprocess_s>(),
@@ -271,86 +146,59 @@ void server_models::add_model(server_model_meta && meta) {
     };
 }
 
-static std::vector<local_model> list_custom_path_models(server_presets & presets) {
-    // detect any custom-path models in presets
-    std::vector<local_model> custom_models;
-    for (auto & [model_name, preset] : presets.presets) {
-        local_model model;
-        model.name = model_name;
-        std::vector<common_arg> to_erase;
-        for (auto & [arg, value] : preset.options) {
-            std::string env(arg.env ? arg.env : "");
-            if (env == "LLAMA_ARG_MODEL") {
-                model.path = value;
-                to_erase.push_back(arg);
-            }
-            if (env == "LLAMA_ARG_MMPROJ") {
-                model.path_mmproj = value;
-                to_erase.push_back(arg);
-            }
-        }
-        for (auto & arg : to_erase) {
-            preset.options.erase(arg);
-        }
-        if (!model.name.empty() && !model.path.empty()) {
-            custom_models.push_back(model);
-        }
-    }
-    return custom_models;
-}
-
 // TODO: allow refreshing cached model list
 void server_models::load_models() {
     // loading models from 3 sources:
     // 1. cached models
-    auto cached_models = common_list_cached_models();
-    for (const auto & model : cached_models) {
-        server_model_meta meta{
-            /* preset      */ presets.get_preset(model.to_string()),
-            /* name        */ model.to_string(),
-            /* path        */ model.manifest_path,
-            /* path_mmproj */ "", // auto-detected when loading
-            /* in_cache    */ true,
-            /* port        */ 0,
-            /* status      */ SERVER_MODEL_STATUS_UNLOADED,
-            /* last_used   */ 0,
-            /* args        */ std::vector<std::string>(),
-            /* exit_code   */ 0
-        };
-        add_model(std::move(meta));
-    }
-    // 2. local models specificed via --models-dir
+    common_presets cached_models = ctx_preset.load_from_cache();
+    SRV_INF("Loaded %zu cached model presets\n", cached_models.size());
+    // 2. local models from --models-dir
+    common_presets local_models;
     if (!base_params.models_dir.empty()) {
-        auto local_models = list_local_models(base_params.models_dir);
-        for (const auto & model : local_models) {
-            if (mapping.find(model.name) != mapping.end()) {
-                // already exists in cached models, skip
-                continue;
-            }
-            server_model_meta meta{
-                /* preset      */ presets.get_preset(model.name),
-                /* name        */ model.name,
-                /* path        */ model.path,
-                /* path_mmproj */ model.path_mmproj,
-                /* in_cache    */ false,
-                /* port        */ 0,
-                /* status      */ SERVER_MODEL_STATUS_UNLOADED,
-                /* last_used   */ 0,
-                /* args        */ std::vector<std::string>(),
-                /* exit_code   */ 0
-            };
-            add_model(std::move(meta));
+        local_models = ctx_preset.load_from_models_dir(base_params.models_dir);
+        SRV_INF("Loaded %zu local model presets from %s\n", local_models.size(), base_params.models_dir.c_str());
+    }
+    // 3. custom-path models from presets
+    common_preset global = {};
+    common_presets custom_models = ctx_preset.load_from_ini(base_params.models_preset, global);
+    SRV_INF("Loaded %zu custom model presets from %s\n", custom_models.size(), base_params.models_preset.c_str());
+
+    // cascade, apply global preset first
+    cached_models = ctx_preset.cascade(global, cached_models);
+    local_models  = ctx_preset.cascade(global, local_models);
+    custom_models = ctx_preset.cascade(global, custom_models);
+
+    // note: if a model exists in both cached and local, local takes precedence
+    common_presets final_presets;
+    for (const auto & [name, preset] : cached_models) {
+        final_presets[name] = preset;
+    }
+    for (const auto & [name, preset] : local_models) {
+        final_presets[name] = preset;
+    }
+
+    // process custom presets from INI
+    for (const auto & [name, custom] : custom_models) {
+        if (final_presets.find(name) != final_presets.end()) {
+            // apply custom config if exists
+            common_preset & target = final_presets[name];
+            target.merge(custom);
+        } else {
+            // otherwise add directly
+            final_presets[name] = custom;
         }
     }
-    // 3. custom-path models specified in presets
-    auto custom_models = list_custom_path_models(presets);
-    for (const auto & model : custom_models) {
+
+    // server base preset from CLI args take highest precedence
+    for (auto & [name, preset] : final_presets) {
+        preset.merge(base_preset);
+    }
+
+    // convert presets to server_model_meta and add to mapping
+    for (const auto & preset : final_presets) {
         server_model_meta meta{
-            /* preset      */ presets.get_preset(model.name),
-            /* name        */ model.name,
-            /* path        */ model.path,
-            /* path_mmproj */ model.path_mmproj,
-            /* in_cache    */ false,
+            /* preset      */ preset.second,
+            /* name        */ preset.first,
             /* port        */ 0,
             /* status      */ SERVER_MODEL_STATUS_UNLOADED,
             /* last_used   */ 0,
@@ -359,10 +207,16 @@ void server_models::load_models() {
         };
         add_model(std::move(meta));
     }
+
     // log available models
+    std::unordered_set<std::string> custom_names;
+    for (const auto & [name, preset] : custom_models) {
+        custom_names.insert(name);
+    }
     SRV_INF("Available models (%zu) (*: custom preset)\n", mapping.size());
     for (const auto & [name, inst] : mapping) {
-        SRV_INF("  %c %s\n", inst.meta.preset.name.empty() ? ' ' : '*', name.c_str());
+        bool has_custom = custom_names.find(name) != custom_names.end();
+        SRV_INF("  %c %s\n", has_custom ? '*' : ' ', name.c_str());
     }
 }
 
@@ -526,7 +380,7 @@ void server_models::load(const std::string & name) {
     {
         SRV_INF("spawning server instance with name=%s on port %d\n", inst.meta.name.c_str(), inst.meta.port);
 
-        presets.render_args(inst.meta); // update meta.args
+        inst.meta.update_args(ctx_preset, bin_path); // render args
 
         std::vector<std::string> child_args = inst.meta.args; // copy
         std::vector<std::string> child_env  = base_env; // copy
@@ -888,8 +742,6 @@ void server_models_routes::init_routes() {
                 {"object",   "model"},    // for OAI-compat
                 {"owned_by", "llamacpp"}, // for OAI-compat
                 {"created",  t},          // for OAI-compat
-                {"in_cache", meta.in_cache},
-                {"path",     meta.path},
                 {"status",   status},
                 // TODO: add other fields, may require reading GGUF metadata
             });
diff --git a/tools/server/server-models.h b/tools/server/server-models.h
index cbc4c432460..56fb398e311 100644
--- a/tools/server/server-models.h
+++ b/tools/server/server-models.h
@@ -51,9 +51,6 @@ static std::string server_model_status_to_string(server_model_status status) {
 struct server_model_meta {
     common_preset preset;
     std::string name;
-    std::string path;
-    std::string path_mmproj; // only available if in_cache=false
-    bool in_cache = false; // if true, use -hf; use -m otherwise
     int port = 0;
     server_model_status status = SERVER_MODEL_STATUS_UNLOADED;
     int64_t last_used = 0; // for LRU unloading
@@ -67,19 +64,8 @@ struct server_model_meta {
     bool is_failed() const {
         return status == SERVER_MODEL_STATUS_UNLOADED && exit_code != 0;
     }
-};
 
-// the server_presets struct holds the presets read from presets.ini
-// as well as base args from the router server
-struct server_presets {
-    common_presets presets;
-    common_params_context ctx_params;
-    std::map<common_arg, std::string> base_args;
-    std::map<std::string, common_arg> control_args; // args reserved for server control
-
-    server_presets(int argc, char ** argv, common_params & base_params, const std::string & models_dir);
-    common_preset get_preset(const std::string & name);
-    void render_args(server_model_meta & meta);
+    void update_args(common_preset_context & ctx_presets, std::string bin_path);
 };
 
 struct subprocess_s;
@@ -97,11 +83,12 @@ struct server_models {
     std::condition_variable cv;
     std::map<std::string, instance_t> mapping;
 
+    common_preset_context ctx_preset;
+
     common_params base_params;
-    std::vector<std::string> base_args;
+    std::string bin_path;
     std::vector<std::string> base_env;
-
-    server_presets presets;
+    common_preset base_preset; // base preset from llama-server CLI args
 
     void update_meta(const std::string & name, const server_model_meta & meta);
 

From 5abab16d75e1aa861a186e1eccd01db6a66b7eb7 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <son@huggingface.co>
Date: Thu, 18 Dec 2025 13:56:25 +0100
Subject: [PATCH 2/8] update docs

---
 tools/server/README.md         | 15 +++++++++++++--
 tools/server/server-models.cpp |  2 ++
 2 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/tools/server/README.md b/tools/server/README.md
index fd5a59e848d..88937b577e7 100644
--- a/tools/server/README.md
+++ b/tools/server/README.md
@@ -1443,6 +1443,12 @@ Example:
 ```ini
 version = 1
 
+; (Optional) This section provides global settings shared across all presets.
+; If the same key is defined in a specific preset, it will override the value in this global section.
+[*]
+c = 8192
+n-gpu-layers = 8
+
 ; If the key corresponds to an existing model on the server,
 ; this will be used as the default config for that model
 [ggml-org/MY-MODEL-GGUF:Q8_0]
@@ -1462,12 +1468,17 @@ model-draft = ./my-models/draft.gguf
 model-draft = /Users/abc/my-models/draft.gguf
 
 ; If the key does NOT correspond to an existing model,
-; you need to specify at least the model path
+; you need to specify at least the model path or HF repo
 [custom_model]
 model = /Users/abc/my-awesome-model-Q4_K_M.gguf
 ```
 
-Note: some arguments are controlled by router (e.g., host, port, API key, HF repo, model alias). They will be removed or overwritten upload loading.
+Note: some arguments are controlled by router (e.g., host, port, API key, HF repo, model alias). They will be removed or overwritten upon loading.
+
+The precedence rule for preset options is as follows:
+1. **Command-line arguments** passed to `llama-server` (highest priority)
+2. **Model-specific options** defined in the preset file (e.g. `[ggml-org/MY-MODEL...]`)
+3. **Global options** defined in the preset file (`[*]`)
 
 ### Routing requests
 
diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp
index 1b70a395074..db85d514bf5 100644
--- a/tools/server/server-models.cpp
+++ b/tools/server/server-models.cpp
@@ -122,6 +122,8 @@ server_models::server_models(
     for (char ** env = envp; *env != nullptr; env++) {
         base_env.push_back(std::string(*env));
     }
+    // clean up base preset: drop router-reserved args and per-model args (model, mmproj, ...)
+    unset_reserved_args(base_preset, true);
     // set binary path
     try {
         bin_path = get_server_exec_path().string();

From 0d04bba5e81e0a262f08b342dd9b9c112e6bdb7d Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <son@huggingface.co>
Date: Thu, 18 Dec 2025 14:03:26 +0100
Subject: [PATCH 3/8] fix neg arg handling

---
 common/arg.cpp | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/common/arg.cpp b/common/arg.cpp
index 4a0ec656bdd..5f16a3b3b4a 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -772,6 +772,11 @@ bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map<com
         }
         auto opt = *arg_to_options[arg];
         std::string val;
+        if (opt.value_hint == nullptr && opt.value_hint_2 == nullptr) {
+            // bool arg (need to reverse the meaning for negative args)
+            bool is_neg = std::find(opt.args_neg.begin(), opt.args_neg.end(), arg) != opt.args_neg.end();
+            val = is_neg ? "0" : "1";
+        }
         if (opt.value_hint != nullptr) {
             // arg with single value
             check_arg(i);

From 60ec94edf58a6b97cb189eb7a19c784220205496 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <son@huggingface.co>
Date: Thu, 18 Dec 2025 14:06:04 +0100
Subject: [PATCH 4/8] fix empty mmproj

---
 common/preset.cpp              | 5 +----
 tools/server/server-models.cpp | 1 +
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/common/preset.cpp b/common/preset.cpp
index 9a21fe75e64..0b2c7196967 100644
--- a/common/preset.cpp
+++ b/common/preset.cpp
@@ -337,11 +337,8 @@ common_presets common_preset_context::load_from_models_dir(const std::string & m
 
     // convert local models to presets
     common_presets out;
-    common_preset base;
-    base.set_option(*this, "LLAMA_ARG_MODEL",  "");
-    base.set_option(*this, "LLAMA_ARG_MMPROJ", "");
     for (const auto & model : models) {
-        common_preset preset = base; // copy
+        common_preset preset;
         preset.name = model.name;
         preset.set_option(*this, "LLAMA_ARG_MODEL", model.path);
         if (!model.path_mmproj.empty()) {
diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp
index db85d514bf5..0cf0e22866e 100644
--- a/tools/server/server-models.cpp
+++ b/tools/server/server-models.cpp
@@ -103,6 +103,7 @@ void server_model_meta::update_args(common_preset_context & ctx_preset, std::str
     preset.set_option(ctx_preset, "LLAMA_ARG_HOST",  CHILD_ADDR);
     preset.set_option(ctx_preset, "LLAMA_ARG_PORT",  std::to_string(port));
     preset.set_option(ctx_preset, "LLAMA_ARG_ALIAS", name);
+    // TODO: maybe validate preset before rendering ?
     // render args
     args = preset.to_args(bin_path);
 }

From 4004c4770f71854bb661d9fce29abb71c9d51668 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <son@huggingface.co>
Date: Thu, 18 Dec 2025 14:12:38 +0100
Subject: [PATCH 5/8] also filter out server-controlled args before to_ini()

---
 common/preset.cpp              | 1 +
 tools/server/server-models.cpp | 7 ++++++-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/common/preset.cpp b/common/preset.cpp
index 0b2c7196967..fd12176fe91 100644
--- a/common/preset.cpp
+++ b/common/preset.cpp
@@ -381,6 +381,7 @@ common_presets common_preset_context::cascade(const common_preset & base, const
     common_presets out;
     for (const auto & [name, preset] : presets) {
         common_preset tmp = base; // copy
+        tmp.name = name;
         tmp.merge(preset);
         out[name] = std::move(tmp);
     }
diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp
index 0cf0e22866e..e86b3bc2cc0 100644
--- a/tools/server/server-models.cpp
+++ b/tools/server/server-models.cpp
@@ -734,7 +734,12 @@ void server_models_routes::init_routes() {
                 {"args",   meta.args},
             };
             if (!meta.preset.name.empty()) {
-                status["preset"] = meta.preset.to_ini();
+                common_preset preset_copy = meta.preset;
+                unset_reserved_args(preset_copy, false);
+                preset_copy.unset_option("LLAMA_ARG_HOST");
+                preset_copy.unset_option("LLAMA_ARG_PORT");
+                preset_copy.unset_option("LLAMA_ARG_ALIAS");
+                status["preset"] = preset_copy.to_ini();
             }
             if (meta.is_failed()) {
                 status["exit_code"] = meta.exit_code;

From ac6f8ca1cd50eb4e90ba04f0f8c2327c06a32fe9 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <son@huggingface.co>
Date: Thu, 18 Dec 2025 17:33:32 +0100
Subject: [PATCH 6/8] skip loading custom_models if not specified

---
 tools/server/server-models.cpp | 33 +++++++++++++++++++--------------
 1 file changed, 19 insertions(+), 14 deletions(-)

diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp
index e86b3bc2cc0..c2bbdf33e51 100644
--- a/tools/server/server-models.cpp
+++ b/tools/server/server-models.cpp
@@ -163,13 +163,16 @@ void server_models::load_models() {
     }
     // 3. custom-path models from presets
     common_preset global = {};
-    common_presets custom_models = ctx_preset.load_from_ini(base_params.models_preset, global);
-    SRV_INF("Loaded %zu custom model presets from %s\n", custom_models.size(), base_params.models_preset.c_str());
+    common_presets custom_presets = {};
+    if (!base_params.models_preset.empty()) {
+        custom_presets = ctx_preset.load_from_ini(base_params.models_preset, global);
+        SRV_INF("Loaded %zu custom model presets from %s\n", custom_presets.size(), base_params.models_preset.c_str());
+    }
 
     // cascade, apply global preset first
-    cached_models = ctx_preset.cascade(global, cached_models);
-    local_models  = ctx_preset.cascade(global, local_models);
-    custom_models = ctx_preset.cascade(global, custom_models);
+    cached_models  = ctx_preset.cascade(global, cached_models);
+    local_models   = ctx_preset.cascade(global, local_models);
+    custom_presets = ctx_preset.cascade(global, custom_presets);
 
     // note: if a model exists in both cached and local, local takes precedence
     common_presets final_presets;
@@ -181,7 +184,7 @@ void server_models::load_models() {
     }
 
     // process custom presets from INI
-    for (const auto & [name, custom] : custom_models) {
+    for (const auto & [name, custom] : custom_presets) {
         if (final_presets.find(name) != final_presets.end()) {
             // apply custom config if exists
             common_preset & target = final_presets[name];
@@ -212,14 +215,16 @@ void server_models::load_models() {
     }
 
     // log available models
-    std::unordered_set<std::string> custom_names;
-    for (const auto & [name, preset] : custom_models) {
-        custom_names.insert(name);
-    }
-    SRV_INF("Available models (%zu) (*: custom preset)\n", mapping.size());
-    for (const auto & [name, inst] : mapping) {
-        bool has_custom = custom_names.find(name) != custom_names.end();
-        SRV_INF("  %c %s\n", has_custom ? '*' : ' ', name.c_str());
+    {
+        std::unordered_set<std::string> custom_names;
+        for (const auto & [name, preset] : custom_presets) {
+            custom_names.insert(name);
+        }
+        SRV_INF("Available models (%zu) (*: custom preset)\n", mapping.size());
+        for (const auto & [name, inst] : mapping) {
+            bool has_custom = custom_names.find(name) != custom_names.end();
+            SRV_INF("  %c %s\n", has_custom ? '*' : ' ', name.c_str());
+        }
     }
 }
 

From 11f8109b69180846db1bd24c4c6410c015d4f0d6 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <son@huggingface.co>
Date: Thu, 18 Dec 2025 22:26:32 +0100
Subject: [PATCH 7/8] fix unset_reserved_args

---
 tools/server/server-models.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp
index c2bbdf33e51..30434be96c2 100644
--- a/tools/server/server-models.cpp
+++ b/tools/server/server-models.cpp
@@ -85,11 +85,11 @@ static std::filesystem::path get_server_exec_path() {
 static void unset_reserved_args(common_preset & preset, bool unset_model_args) {
     preset.unset_option("LLAMA_ARG_SSL_KEY_FILE");
     preset.unset_option("LLAMA_ARG_SSL_CERT_FILE");
-    preset.unset_option("LLAMA_ARG_API_KEY");
+    preset.unset_option("LLAMA_API_KEY");
     preset.unset_option("LLAMA_ARG_MODELS_DIR");
     preset.unset_option("LLAMA_ARG_MODELS_MAX");
     preset.unset_option("LLAMA_ARG_MODELS_PRESET");
-    preset.unset_option("LLAMA_ARG_NO_MODELS_AUTOLOAD");
+    preset.unset_option("LLAMA_ARG_MODELS_AUTOLOAD");
     if (unset_model_args) {
         preset.unset_option("LLAMA_ARG_MODEL");
         preset.unset_option("LLAMA_ARG_MMPROJ");

From 95b79963068d64953d3dd2afad5ad0942ddc4e85 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <son@huggingface.co>
Date: Thu, 18 Dec 2025 22:48:38 +0100
Subject: [PATCH 8/8] fix crash on windows

---
 common/preset.cpp              | 2 +-
 common/preset.h                | 3 ++-
 tools/server/server-models.cpp | 2 +-
 3 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/common/preset.cpp b/common/preset.cpp
index fd12176fe91..1aa9864d0aa 100644
--- a/common/preset.cpp
+++ b/common/preset.cpp
@@ -223,7 +223,7 @@ static std::string parse_bool_arg(const common_arg & arg, const std::string & ke
     return value;
 }
 
-common_preset_context::common_preset_context(common_params & default_params, llama_example ex)
+common_preset_context::common_preset_context(llama_example ex)
     : ctx_params(common_params_parser_init(default_params, ex)),
       key_to_opt(get_map_key_opt(ctx_params)) {}
 
diff --git a/common/preset.h b/common/preset.h
index d7e06a8f872..3a84d1be29c 100644
--- a/common/preset.h
+++ b/common/preset.h
@@ -47,9 +47,10 @@ using common_presets = std::map<std::string, common_preset>;
 
 // context for loading and editing presets
 struct common_preset_context {
+    common_params default_params; // storage passed to common_params_parser_init(); keep declared before ctx_params (values unused for now)
     common_params_context ctx_params;
     std::map<std::string, common_arg> key_to_opt;
-    common_preset_context(common_params & default_params, llama_example ex);
+    common_preset_context(llama_example ex);
 
     // load presets from INI file
     common_presets load_from_ini(const std::string & path, common_preset & global) const;
diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp
index 30434be96c2..db7ab667f93 100644
--- a/tools/server/server-models.cpp
+++ b/tools/server/server-models.cpp
@@ -117,7 +117,7 @@ server_models::server_models(
         int argc,
         char ** argv,
         char ** envp)
-            : ctx_preset(base_params, LLAMA_EXAMPLE_SERVER),
+            : ctx_preset(LLAMA_EXAMPLE_SERVER),
               base_params(params),
               base_preset(ctx_preset.load_from_args(argc, argv)) {
     for (char ** env = envp; *env != nullptr; env++) {