diff --git a/common/arg.cpp b/common/arg.cpp
index c27587b8b98..6d20dbfbd7d 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -1139,7 +1139,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_env("LLAMA_ARG_CTX_CHECKPOINTS").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
     add_opt(common_arg(
-        {"--cache-ram", "-cram"}, "N",
+        {"-cram", "--cache-ram"}, "N",
         string_format("set the maximum cache size in MiB (default: %d, -1 - no limit, 0 - disable)"
             "[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)", params.cache_ram_mib),
         [](common_params & params, int value) {
@@ -1147,7 +1147,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_env("LLAMA_ARG_CACHE_RAM").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
     add_opt(common_arg(
-        {"--kv-unified", "-kvu"},
+        {"-kvu", "--kv-unified"},
         "use single unified KV buffer shared across all sequences (default: enabled if number of slots is auto)",
         [](common_params & params) {
             params.kv_unified = true;
@@ -1415,7 +1415,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_sparam());
     add_opt(common_arg(
-        {"--sampling-seq", "--sampler-seq"}, "SEQUENCE",
+        {"--sampler-seq", "--sampling-seq"}, "SEQUENCE",
         string_format("simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str()),
         [](common_params & params, const std::string & value) {
             params.sampling.samplers = common_sampler_types_from_chars(value);
@@ -2073,26 +2073,26 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ));
     add_opt(common_arg(
-        {"--override-tensor", "-ot"}, "<tensor name pattern>=<buffer type>,...",
+        {"-ot", "--override-tensor"}, "<tensor name pattern>=<buffer type>,...",
         "override tensor buffer type", [](common_params & params, const std::string & value) {
             parse_tensor_buffer_overrides(value, params.tensor_buft_overrides);
         }
     ));
     add_opt(common_arg(
-        {"--override-tensor-draft", "-otd"}, "<tensor name pattern>=<buffer type>,...",
+        {"-otd", "--override-tensor-draft"}, "<tensor name pattern>=<buffer type>,...",
         "override tensor buffer type for draft model", [](common_params & params, const std::string & value) {
             parse_tensor_buffer_overrides(value, params.speculative.tensor_buft_overrides);
         }
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
     add_opt(common_arg(
-        {"--cpu-moe", "-cmoe"},
+        {"-cmoe", "--cpu-moe"},
         "keep all Mixture of Experts (MoE) weights in the CPU",
         [](common_params & params) {
             params.tensor_buft_overrides.push_back(llm_ffn_exps_cpu_override());
         }
     ).set_env("LLAMA_ARG_CPU_MOE"));
     add_opt(common_arg(
-        {"--n-cpu-moe", "-ncmoe"}, "N",
+        {"-ncmoe", "--n-cpu-moe"}, "N",
         "keep the Mixture of Experts (MoE) weights of the first N layers in the CPU",
         [](common_params & params, int value) {
             if (value < 0) {
@@ -2107,14 +2107,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_env("LLAMA_ARG_N_CPU_MOE"));
     add_opt(common_arg(
-        {"--cpu-moe-draft", "-cmoed"},
+        {"-cmoed", "--cpu-moe-draft"},
         "keep all Mixture of Experts (MoE) weights in the CPU for the draft model",
         [](common_params & params) {
             params.speculative.tensor_buft_overrides.push_back(llm_ffn_exps_cpu_override());
         }
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_CPU_MOE_DRAFT"));
     add_opt(common_arg(
-        {"--n-cpu-moe-draft", "-ncmoed"}, "N",
+        {"-ncmoed", "--n-cpu-moe-draft"}, "N",
         "keep the Mixture of Experts (MoE) weights of the first N layers in the CPU for the draft model",
         [](common_params & params, int value) {
             if (value < 0) {
@@ -2642,7 +2642,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_EMBEDDINGS"));
     add_opt(common_arg(
-        {"--reranking", "--rerank"},
+        {"--rerank", "--reranking"},
         string_format("enable reranking endpoint on server (default: %s)", "disabled"),
         [](common_params & params) {
             params.embedding = true;
@@ -3113,7 +3113,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
     add_opt(common_arg(
-        {"--draft-max", "--draft", "--draft-n"}, "N",
+        {"--draft", "--draft-n", "--draft-max"}, "N",
         string_format("number of tokens to draft for speculative decoding (default: %d)", params.speculative.n_max),
         [](common_params & params, int value) {
             params.speculative.n_max = value;
diff --git a/tests/test-arg-parser.cpp b/tests/test-arg-parser.cpp
index 468d325e22c..74573c34e9d 100644
--- a/tests/test-arg-parser.cpp
+++ b/tests/test-arg-parser.cpp
@@ -37,6 +37,30 @@ int main(void) {
                         exit(1);
                     }
                 }
+
+                // ensure shorter argument precedes longer argument
+                if (opt.args.size() > 1) {
+                    const std::string first(opt.args.front());
+                    const std::string last(opt.args.back());
+
+                    if (first.length() > last.length()) {
+                        fprintf(stderr, "test-arg-parser: shorter argument should come before longer one: %s, %s\n",
+                            first.c_str(), last.c_str());
+                        assert(false);
+                    }
+                }
+
+                // same check for negated arguments
+                if (opt.args_neg.size() > 1) {
+                    const std::string first(opt.args_neg.front());
+                    const std::string last(opt.args_neg.back());
+
+                    if (first.length() > last.length()) {
+                        fprintf(stderr, "test-arg-parser: shorter negated argument should come before longer one: %s, %s\n",
+                            first.c_str(), last.c_str());
+                        assert(false);
+                    }
+                }
             }
         } catch (std::exception & e) {
             printf("%s\n", e.what());
diff --git a/tools/server/README.md b/tools/server/README.md
index fd5a59e848d..17e295921d3 100644
--- a/tools/server/README.md
+++ b/tools/server/README.md
@@ -75,9 +75,9 @@ For the ful list of features, please refer to [server's changelog](https://githu
 | `--numa TYPE` | attempt optimizations that help on some NUMA systems<br/>- distribute: spread execution evenly over all nodes<br/>- isolate: only spawn threads on CPUs on the node that execution started on<br/>- numactl: use the CPU map provided by numactl<br/>if run without this previously, it is recommended to drop the system page cache before using this<br/>see https://github.com/ggml-org/llama.cpp/issues/1437<br/>(env: LLAMA_ARG_NUMA) |
 | `-dev, --device <dev1,dev2,..>` | comma-separated list of devices to use for offloading (none = don't offload)<br/>use --list-devices to see a list of available devices<br/>(env: LLAMA_ARG_DEVICE) |
 | `--list-devices` | print list of available devices and exit |
-| `--override-tensor, -ot <tensor name pattern>=<buffer type>,...` | override tensor buffer type |
-| `--cpu-moe, -cmoe` | keep all Mixture of Experts (MoE) weights in the CPU<br/>(env: LLAMA_ARG_CPU_MOE) |
-| `--n-cpu-moe, -ncmoe N` | keep the Mixture of Experts (MoE) weights of the first N layers in the CPU<br/>(env: LLAMA_ARG_N_CPU_MOE) |
+| `-ot, --override-tensor <tensor name pattern>=<buffer type>,...` | override tensor buffer type |
+| `-cmoe, --cpu-moe` | keep all Mixture of Experts (MoE) weights in the CPU<br/>(env: LLAMA_ARG_CPU_MOE) |
+| `-ncmoe, --n-cpu-moe N` | keep the Mixture of Experts (MoE) weights of the first N layers in the CPU<br/>(env: LLAMA_ARG_N_CPU_MOE) |
 | `-ngl, --gpu-layers, --n-gpu-layers N` | max. number of layers to store in VRAM (default: -1)<br/>(env: LLAMA_ARG_N_GPU_LAYERS) |
 | `-sm, --split-mode {none,layer,row}` | how to split the model across multiple GPUs, one of:<br/>- none: use one GPU only<br/>- layer (default): split layers and KV across GPUs<br/>- row: split rows across GPUs<br/>(env: LLAMA_ARG_SPLIT_MODE) |
 | `-ts, --tensor-split N0,N1,N2,...` | fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1<br/>(env: LLAMA_ARG_TENSOR_SPLIT) |
@@ -120,7 +120,7 @@ For the ful list of features, please refer to [server's changelog](https://githu
 | -------- | ----------- |
 | `--samplers SAMPLERS` | samplers that will be used for generation in the order, separated by ';'<br/>(default: penalties;dry;top_n_sigma;top_k;typ_p;top_p;min_p;xtc;temperature) |
 | `-s, --seed SEED` | RNG seed (default: -1, use random seed for -1) |
-| `--sampling-seq, --sampler-seq SEQUENCE` | simplified sequence for samplers that will be used (default: edskypmxt) |
+| `--sampler-seq, --sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: edskypmxt) |
 | `--ignore-eos` | ignore end of stream token and continue generating (implies --logit-bias EOS-inf) |
 | `--temp N` | temperature (default: 0.8) |
 | `--top-k N` | top-k sampling (default: 40, 0 = disabled)<br/>(env: LLAMA_ARG_TOP_K) |
@@ -156,8 +156,8 @@ For the ful list of features, please refer to [server's changelog](https://githu
 | Argument | Explanation |
 | -------- | ----------- |
 | `--ctx-checkpoints, --swa-checkpoints N` | max number of context checkpoints to create per slot (default: 8)[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)<br/>(env: LLAMA_ARG_CTX_CHECKPOINTS) |
-| `--cache-ram, -cram N` | set the maximum cache size in MiB (default: 8192, -1 - no limit, 0 - disable)[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)<br/>(env: LLAMA_ARG_CACHE_RAM) |
-| `--kv-unified, -kvu` | use single unified KV buffer shared across all sequences (default: enabled if number of slots is auto)<br/>(env: LLAMA_ARG_KV_UNIFIED) |
+| `-cram, --cache-ram N` | set the maximum cache size in MiB (default: 8192, -1 - no limit, 0 - disable)[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)<br/>(env: LLAMA_ARG_CACHE_RAM) |
+| `-kvu, --kv-unified` | use single unified KV buffer shared across all sequences (default: enabled if number of slots is auto)<br/>(env: LLAMA_ARG_KV_UNIFIED) |
 | `--context-shift, --no-context-shift` | whether to use context shift on infinite text generation (default: disabled)<br/>(env: LLAMA_ARG_CONTEXT_SHIFT) |
 | `-r, --reverse-prompt PROMPT` | halt generation at PROMPT, return control in interactive mode<br/> |
 | `-sp, --special` | special tokens output enabled (default: false) |
@@ -172,9 +172,9 @@ For the ful list of features, please refer to [server's changelog](https://githu
 | `--mmproj-offload, --no-mmproj-offload` | whether to enable GPU offloading for multimodal projector (default: enabled)<br/>(env: LLAMA_ARG_MMPROJ_OFFLOAD) |
 | `--image-min-tokens N` | minimum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)<br/>(env: LLAMA_ARG_IMAGE_MIN_TOKENS) |
 | `--image-max-tokens N` | maximum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)<br/>(env: LLAMA_ARG_IMAGE_MAX_TOKENS) |
-| `--override-tensor-draft, -otd <tensor name pattern>=<buffer type>,...` | override tensor buffer type for draft model |
-| `--cpu-moe-draft, -cmoed` | keep all Mixture of Experts (MoE) weights in the CPU for the draft model<br/>(env: LLAMA_ARG_CPU_MOE_DRAFT) |
-| `--n-cpu-moe-draft, -ncmoed N` | keep the Mixture of Experts (MoE) weights of the first N layers in the CPU for the draft model<br/>(env: LLAMA_ARG_N_CPU_MOE_DRAFT) |
+| `-otd, --override-tensor-draft <tensor name pattern>=<buffer type>,...` | override tensor buffer type for draft model |
+| `-cmoed, --cpu-moe-draft` | keep all Mixture of Experts (MoE) weights in the CPU for the draft model<br/>(env: LLAMA_ARG_CPU_MOE_DRAFT) |
+| `-ncmoed, --n-cpu-moe-draft N` | keep the Mixture of Experts (MoE) weights of the first N layers in the CPU for the draft model<br/>(env: LLAMA_ARG_N_CPU_MOE_DRAFT) |
 | `-a, --alias STRING` | set alias for model name (to be used by REST API)<br/>(env: LLAMA_ARG_ALIAS) |
 | `--host HOST` | ip address to listen, or bind to an UNIX socket if the address ends with .sock (default: 127.0.0.1)<br/>(env: LLAMA_ARG_HOST) |
 | `--port PORT` | port to listen (default: 8080)<br/>(env: LLAMA_ARG_PORT) |
@@ -184,7 +184,7 @@ For the ful list of features, please refer to [server's changelog](https://githu
 | `--webui-config-file PATH` | JSON file that provides default WebUI settings (overrides WebUI defaults)<br/>(env: LLAMA_ARG_WEBUI_CONFIG_FILE) |
 | `--webui, --no-webui` | whether to enable the Web UI (default: enabled)<br/>(env: LLAMA_ARG_WEBUI) |
 | `--embedding, --embeddings` | restrict to only support embedding use case; use only with dedicated embedding models (default: disabled)<br/>(env: LLAMA_ARG_EMBEDDINGS) |
-| `--reranking, --rerank` | enable reranking endpoint on server (default: disabled)<br/>(env: LLAMA_ARG_RERANKING) |
+| `--rerank, --reranking` | enable reranking endpoint on server (default: disabled)<br/>(env: LLAMA_ARG_RERANKING) |
 | `--api-key KEY` | API key to use for authentication (default: none)<br/>(env: LLAMA_API_KEY) |
 | `--api-key-file FNAME` | path to file containing API keys (default: none) |
 | `--ssl-key-file FNAME` | path to file a PEM-encoded SSL private key<br/>(env: LLAMA_ARG_SSL_KEY_FILE) |
@@ -212,7 +212,7 @@ For the ful list of features, please refer to [server's changelog](https://githu
 | `--lora-init-without-apply` | load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) |
 | `-td, --threads-draft N` | number of threads to use during generation (default: same as --threads) |
 | `-tbd, --threads-batch-draft N` | number of threads to use during batch and prompt processing (default: same as --threads-draft) |
-| `--draft-max, --draft, --draft-n N` | number of tokens to draft for speculative decoding (default: 16)<br/>(env: LLAMA_ARG_DRAFT_MAX) |
+| `--draft, --draft-n, --draft-max N` | number of tokens to draft for speculative decoding (default: 16)<br/>(env: LLAMA_ARG_DRAFT_MAX) |
 | `--draft-min, --draft-n-min N` | minimum number of draft tokens to use for speculative decoding (default: 0)<br/>(env: LLAMA_ARG_DRAFT_MIN) |
 | `--draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.8)<br/>(env: LLAMA_ARG_DRAFT_P_MIN) |
 | `-cd, --ctx-size-draft N` | size of the prompt context for the draft model (default: 0, 0 = loaded from model)<br/>(env: LLAMA_ARG_CTX_SIZE_DRAFT) |