llama: automatically set parameters not set by the user in such a way that maximizes GPU utilization (#16653)
* llama: automatically fit args to free memory
llama-fit-params tool
* fix CI
* hints for bug reports, ensure no reallocation
* fix segfault with Vulkan
* add llama-fit-params to CI
* fix CI
* fix CI
* fix CI
* minor adjustments
* fix assignment of 1 dense layer
* fix logger not being reset on model load failure
* remove --n-gpu-layer hint on model load failure
* fix llama-fit-params verbosity
* fix edge case
* fix typo [no ci]
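For context before the CI diff below: the PR adds a standalone llama-fit-params tool, and the CI changes simply run it once per test model. A minimal usage sketch, assuming a local GGUF model path and log file name of my own choosing (the exact report the tool prints is not shown in this PR):

# Run the new llama-fit-params tool on a model; it inspects free device memory
# and reports the values it would pick for parameters the user left unset
# (e.g. how many layers to place on the GPU). The paths below are placeholders.
(time ./bin/llama-fit-params --model ./models/ggml-model-f16.gguf) 2>&1 | tee fit-params.log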
+    (time ./bin/llama-fit-params --model ${model_f16} 2>&1 | tee -a $OUT/${ci}-fp-f16.log)
+
     (time ./bin/llama-completion -no-cnv --model ${model_f16}  -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
     (time ./bin/llama-completion -no-cnv --model ${model_bf16} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-bf16.log
     (time ./bin/llama-completion -no-cnv --model ${model_q8_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
@@ -523,6 +525,8 @@ function gg_run_embd_bge_small {
+    (time ./bin/llama-fit-params --model ${model_f16} 2>&1 | tee -a $OUT/${ci}-fp-f16.log)
+
     (time ./bin/llama-embedding --model ${model_f16} -p "I believe the meaning of life is" -ngl 99 -c 0 --no-op-offload) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
     (time ./bin/llama-embedding --model ${model_q8_0} -p "I believe the meaning of life is" -ngl 99 -c 0 --no-op-offload) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
@@ -563,6 +567,8 @@ function gg_run_rerank_tiny {
     model_f16="${path_models}/ggml-model-f16.gguf"

+    (time ./bin/llama-fit-params --model ${model_f16} 2>&1 | tee -a $OUT/${ci}-fp-f16.log)
+
     # for this model, the SEP token is "</s>"
     (time ./bin/llama-embedding --model ${model_f16} -p "what is panda?\thi\nwhat is panda?\tit's a bear\nwhat is panda?\tThe giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." -ngl 99 -c 0 --pooling rank --embd-normalize -1 --no-op-offload --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log