
Commit b1f3a6e

llama: automatically set parameters not set by the user in such a way that maximizes GPU utilization (#16653)
* llama: automatically fit args to free memory llama-fit-params tool
* fix CI
* hints for bug reports, ensure no reallocation
* fix segfault with Vulkan
* add llama-fit-params to CI
* fix CI
* fix CI
* fix CI
* minor adjustments
* fix assignment of 1 dense layer
* fix logger not being reset on model load failure
* remove --n-gpu-layer hint on model load failure
* fix llama-fit-params verbosity
* fix edge case
* fix typo [no ci]
1 parent 4aced7a commit b1f3a6e

26 files changed: +1075 lines added, -63 lines removed

.github/ISSUE_TEMPLATE/011-bug-results.yml

Lines changed: 6 additions & 3 deletions
@@ -11,7 +11,7 @@ body:
         (i.e. the generated text) are incorrect or llama.cpp crashes during model evaluation.
         If you encountered the issue while using an external UI (e.g. ollama),
         please reproduce your issue using one of the examples/binaries in this repository.
-        The `llama-cli` binary can be used for simple and reproducible model inference.
+        The `llama-completion` binary can be used for simple and reproducible model inference.
   - type: textarea
     id: version
     attributes:
@@ -74,9 +74,12 @@ body:
         Please give us a summary of the problem and tell us how to reproduce it.
         If you can narrow down the bug to specific hardware, compile flags, or command line arguments,
         that information would be very much appreciated by us.
+
+        If possible, please try to reproduce the issue using `llama-completion` with `-fit off`.
+        If you can only reproduce the issue with `-fit on`, please provide logs both with and without `--verbose`.
       placeholder: >
-        e.g. when I run llama-cli with -ngl 99 I get garbled outputs.
-        When I use -ngl 0 it works correctly.
+        e.g. when I run llama-completion with `-fa on` I get garbled outputs for very long prompts.
+        With short prompts or `-fa off` it works correctly.
         Here are the exact commands that I used: ...
     validations:
       required: true

ci/run.sh

Lines changed: 6 additions & 0 deletions
@@ -398,6 +398,8 @@ function gg_run_qwen3_0_6b {
     ./bin/llama-quantize ${model_bf16} ${model_q5_k} q5_k $(nproc)
     ./bin/llama-quantize ${model_bf16} ${model_q6_k} q6_k $(nproc)
 
+    (time ./bin/llama-fit-params --model ${model_f16} 2>&1 | tee -a $OUT/${ci}-fp-f16.log)
+
     (time ./bin/llama-completion -no-cnv --model ${model_f16} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
     (time ./bin/llama-completion -no-cnv --model ${model_bf16} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-bf16.log
     (time ./bin/llama-completion -no-cnv --model ${model_q8_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
@@ -523,6 +525,8 @@ function gg_run_embd_bge_small {
 
     ./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
 
+    (time ./bin/llama-fit-params --model ${model_f16} 2>&1 | tee -a $OUT/${ci}-fp-f16.log)
+
     (time ./bin/llama-embedding --model ${model_f16} -p "I believe the meaning of life is" -ngl 99 -c 0 --no-op-offload) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
     (time ./bin/llama-embedding --model ${model_q8_0} -p "I believe the meaning of life is" -ngl 99 -c 0 --no-op-offload) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
 
@@ -563,6 +567,8 @@ function gg_run_rerank_tiny {
 
     model_f16="${path_models}/ggml-model-f16.gguf"
 
+    (time ./bin/llama-fit-params --model ${model_f16} 2>&1 | tee -a $OUT/${ci}-fp-f16.log)
+
     # for this model, the SEP token is "</s>"
     (time ./bin/llama-embedding --model ${model_f16} -p "what is panda?\thi\nwhat is panda?\tit's a bear\nwhat is panda?\tThe giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." -ngl 99 -c 0 --pooling rank --embd-normalize -1 --no-op-offload --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log

common/arg.cpp

Lines changed: 32 additions & 1 deletion
@@ -20,6 +20,7 @@
 #include <nlohmann/json.hpp>
 
 #include <algorithm>
+#include <cinttypes>
 #include <climits>
 #include <cstdarg>
 #include <fstream>
@@ -529,7 +530,9 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
         params.kv_overrides.back().key[0] = 0;
     }
 
-    if (!params.tensor_buft_overrides.empty()) {
+    // pad tensor_buft_overrides for llama_params_fit:
+    const size_t ntbo = llama_max_tensor_buft_overrides();
+    while (params.tensor_buft_overrides.size() < ntbo) {
         params.tensor_buft_overrides.push_back({nullptr, nullptr});
     }
 
@@ -2153,6 +2156,34 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            }
        }
    ).set_env("LLAMA_ARG_MAIN_GPU"));
+    add_opt(common_arg(
+        {"-fit", "--fit"}, "[on|off]",
+        string_format("whether to adjust unset arguments to fit in device memory ('on' or 'off', default: '%s')", params.fit_params ? "on" : "off"),
+        [](common_params & params, const std::string & value) {
+            if (is_truthy(value)) {
+                params.fit_params = true;
+            } else if (is_falsey(value)) {
+                params.fit_params = false;
+            } else {
+                throw std::runtime_error(
+                    string_format("error: unknown value for --fit: '%s'\n", value.c_str()));
+            }
+        }
+    ).set_env("LLAMA_ARG_FIT"));
+    add_opt(common_arg(
+        {"-fitt", "--fit-target"}, "MiB",
+        string_format("target margin per device for --fit option, default: %zu", params.fit_params_target/(1024*1024)),
+        [](common_params & params, int value) {
+            params.fit_params_target = value * size_t(1024*1024);
+        }
+    ).set_env("LLAMA_ARG_FIT_TARGET"));
+    add_opt(common_arg(
+        {"-fitc", "--fit-ctx"}, "N",
+        string_format("minimum ctx size that can be set by --fit option, default: %" PRIu32, params.fit_params_min_ctx),
+        [](common_params & params, int value) {
+            params.fit_params_min_ctx = value;
+        }
+    ).set_env("LLAMA_ARG_FIT_CTX"));
    add_opt(common_arg(
        {"--check-tensors"},
        string_format("check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false"),

common/common.cpp

Lines changed: 12 additions & 9 deletions
@@ -1088,7 +1088,15 @@ struct common_init_result::impl {
 
 common_init_result::common_init_result(common_params & params) :
     pimpl(new impl{}) {
-    const auto mparams = common_model_params_to_llama(params);
+    auto mparams = common_model_params_to_llama(params);
+    auto cparams = common_context_params_to_llama(params);
+
+    if (params.fit_params) {
+        LOG_INF("%s: fitting params to device memory, to report bugs during this step use -fit off (or --verbose if you can't)\n", __func__);
+        llama_params_fit(params.model.path.c_str(), &mparams, &cparams,
+            params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target, params.fit_params_min_ctx,
+            params.verbosity >= 4 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR);
+    }
 
     llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
     if (model == NULL) {
@@ -1103,8 +1111,6 @@ common_init_result::common_init_result(common_params & params) :
     // TODO: fix naming
     common_init_sampler_from_model(model, params.sampling);
 
-    auto cparams = common_context_params_to_llama(params);
-
     if (params.sampling.ignore_eos && llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
         LOG_WRN("%s: warning: vocab does not have an EOS token, ignoring --ignore-eos\n", __func__);
         params.sampling.ignore_eos = false;
@@ -1143,8 +1149,7 @@ common_init_result::common_init_result(common_params & params) :
 
     llama_context * lctx = llama_init_from_model(model, cparams);
     if (lctx == NULL) {
-        LOG_ERR("%s: failed to create context with model '%s', try reducing --n-gpu-layers if you're running out of VRAM\n",
-            __func__, params.model.path.c_str());
+        LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str());
         return;
     }
 
@@ -1176,15 +1181,13 @@ common_init_result_ptr common_init_from_params(common_params & params) {
 
     llama_model * model = res->model();
     if (model == NULL) {
-        LOG_ERR("%s: failed to load model '%s', try reducing --n-gpu-layers if you're running out of VRAM\n",
-            __func__, params.model.path.c_str());
+        LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.path.c_str());
         return res;
     }
 
     llama_context * lctx = res->context();
     if (lctx == NULL) {
-        LOG_ERR("%s: failed to create context with model '%s', try reducing --n-gpu-layers if you're running out of VRAM\n",
-            __func__, params.model.path.c_str());
+        LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str());
         return res;
     }
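
For reference, a minimal sketch (not part of this commit) of the same load path in standalone code. It assumes only what the hunks above show: the call shape of llama_params_fit, the llama_max_tensor_buft_overrides() padding from common/arg.cpp, and the 1 GiB / 4096-token defaults from common/common.h; the helper name load_model_fitted is illustrative.

    #include <vector>
    #include "llama.h"

    // Sketch: fit unset model/context parameters to free device memory, then load the model.
    static llama_model * load_model_fitted(const char * path) {
        llama_model_params   mparams = llama_model_default_params();
        llama_context_params cparams = llama_context_default_params();

        // llama_params_fit expects the override list padded to its maximum size (see common/arg.cpp above)
        std::vector<llama_model_tensor_buft_override> tbo(llama_max_tensor_buft_overrides(), {nullptr, nullptr});
        float tensor_split[128] = {0};

        llama_params_fit(path, &mparams, &cparams, tensor_split, tbo.data(),
                         /*target  =*/ 1024u*1024u*1024u, // 1 GiB margin per device (common.h default)
                         /*min_ctx =*/ 4096,              // smallest context the fitting step may pick (common.h default)
                         GGML_LOG_LEVEL_ERROR);

        llama_model * model = llama_model_load_from_file(path, mparams);
        // cparams is then passed to llama_init_from_model(), as in the hunk above
        return model;
    }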

common/common.h

Lines changed: 9 additions & 5 deletions
@@ -99,6 +99,7 @@ enum llama_example {
     LLAMA_EXAMPLE_TTS,
     LLAMA_EXAMPLE_DIFFUSION,
     LLAMA_EXAMPLE_FINETUNE,
+    LLAMA_EXAMPLE_FIT_PARAMS,
 
     LLAMA_EXAMPLE_COUNT,
 };
@@ -306,8 +307,8 @@ struct lr_opt {
 struct ggml_opt_optimizer_params common_opt_lr_pars(void * userdata);
 
 struct common_params {
-    int32_t n_predict = -1;   // new tokens to predict
-    int32_t n_ctx     = 4096; // context size
+    int32_t n_predict = -1;   // max. number of new tokens to predict, -1 == no limit
+    int32_t n_ctx     = 0;    // context size, 0 == context the model was trained with
     int32_t n_batch   = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_ubatch  = 512;  // physical batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_keep    = 0;    // number of tokens to keep from initial prompt
@@ -328,9 +329,12 @@ struct common_params {
     // offload params
     std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
 
-    int32_t n_gpu_layers = -1;       // number of layers to store in VRAM (-1 - use default)
-    int32_t main_gpu = 0;            // the GPU that is used for scratch and small tensors
-    float   tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
+    int32_t n_gpu_layers        = -1;   // number of layers to store in VRAM (-1 - use default)
+    int32_t main_gpu            = 0;    // the GPU that is used for scratch and small tensors
+    float   tensor_split[128]   = {0};  // how split tensors should be distributed across GPUs
+    bool    fit_params          = true; // whether to fit unset model/context parameters to free device memory
+    size_t  fit_params_target   = 1024 * 1024*1024; // margin per device in bytes for fitting parameters to free memory
+    int32_t fit_params_min_ctx  = 4096; // minimum context size to set when trying to reduce memory use
 
     enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
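
The three new fields are what -fit, --fit-target and --fit-ctx from common/arg.cpp write into. As an illustration only (the values are examples; field names and units come from the hunk above), code that fills a common_params directly could tune them before calling common_init_from_params():

    #include "common.h"

    // Illustrative values only: tighten the fitting behaviour before common_init_from_params().
    static void configure_fit(common_params & params) {
        params.fit_params         = true;                  // adjust unset options to fit free device memory
        params.fit_params_target  = 2048ull * 1024 * 1024; // keep a 2 GiB margin per device (stored in bytes)
        params.fit_params_min_ctx = 8192;                  // never shrink the context below 8192 tokens
    }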

ggml/include/ggml-alloc.h

Lines changed: 9 additions & 0 deletions
@@ -53,7 +53,14 @@ GGML_API void ggml_gallocr_free(ggml_gallocr_t galloc);
 // call with a worst-case graph to avoid buffer reallocations
 // not strictly required for single buffer usage: ggml_gallocr_alloc_graph will reallocate the buffers automatically if needed
 // returns false if the buffer allocation failed
+// ggml_gallocr_reserve_n_size writes the buffer sizes per galloc buffer that would be allocated by ggml_gallocr_reserve_n to sizes
 GGML_API bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
+GGML_API void ggml_gallocr_reserve_n_size(
+    ggml_gallocr_t galloc,
+    struct ggml_cgraph * graph,
+    const int * node_buffer_ids,
+    const int * leaf_buffer_ids,
+    size_t * sizes);
 GGML_API bool ggml_gallocr_reserve_n(
     ggml_gallocr_t galloc,
     struct ggml_cgraph * graph,
@@ -68,6 +75,8 @@ GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_i
 
 // Utils
 // Create a buffer and allocate all the tensors in a ggml_context
+// ggml_backend_alloc_ctx_tensors_from_buft_size returns the size of the buffer that would be allocated by ggml_backend_alloc_ctx_tensors_from_buft
+GGML_API size_t ggml_backend_alloc_ctx_tensors_from_buft_size(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
 GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
 GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend);
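
A minimal usage sketch (not part of this commit) for the new size-query entry point; measure_compute_buffer is a hypothetical helper, and it assumes NULL buffer ids are accepted, as they are by ggml_gallocr_reserve():

    #include "ggml-alloc.h"
    #include "ggml-backend.h"

    // Sketch: measure the compute buffer a graph would need on one buffer type, without allocating it.
    static size_t measure_compute_buffer(ggml_backend_buffer_type_t buft, struct ggml_cgraph * graph) {
        ggml_gallocr_t galloc = ggml_gallocr_new(buft); // one buffer type -> one entry in sizes
        size_t size = 0;
        ggml_gallocr_reserve_n_size(galloc, graph, /*node_buffer_ids =*/ nullptr, /*leaf_buffer_ids =*/ nullptr, &size);
        ggml_gallocr_free(galloc);
        return size;
    }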

ggml/include/ggml-backend.h

Lines changed: 1 addition & 0 deletions
@@ -307,6 +307,7 @@ extern "C" {
     GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
 
     // Initialize backend buffers from a measure graph
+    GGML_API void ggml_backend_sched_reserve_size(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph, size_t * sizes);
     GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph); // returns success
 
     GGML_API int ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched);
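
A similar hedged sketch for the scheduler variant; the helper name is hypothetical, and it assumes sizes receives one entry per backend in the scheduler, mirroring the ggml_gallocr version above:

    #include <vector>
    #include "ggml-backend.h"

    // Sketch: query the per-backend buffer sizes a worst-case graph would require, without reserving them.
    static std::vector<size_t> measure_sched_buffers(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
        std::vector<size_t> sizes(ggml_backend_sched_get_n_backends(sched), 0); // assumed: one entry per backend
        ggml_backend_sched_reserve_size(sched, measure_graph, sizes.data());
        return sizes;
    }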

ggml/include/ggml.h

Lines changed: 2 additions & 1 deletion
@@ -2615,7 +2615,8 @@ extern "C" {
 
     // Set callback for all future logging events.
     // If this is not called, or NULL is supplied, everything is output on stderr.
-    GGML_API void ggml_log_set(ggml_log_callback log_callback, void * user_data);
+    GGML_API void ggml_log_get(ggml_log_callback * log_callback, void ** user_data);
+    GGML_API void ggml_log_set(ggml_log_callback   log_callback, void *  user_data);
 
     GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
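
The new ggml_log_get makes it possible to save the installed log callback and restore it later, which is the kind of save/restore the commit message's "fix logger not being reset on model load failure" suggests. A small hedged sketch:

    #include "ggml.h"

    // Sketch: temporarily silence ggml logging, then restore whatever callback was installed before.
    static void with_quiet_logging(void (*body)(void)) {
        ggml_log_callback prev_cb = nullptr;
        void *            prev_ud = nullptr;
        ggml_log_get(&prev_cb, &prev_ud);  // save the currently installed callback
        ggml_log_set([](enum ggml_log_level, const char *, void *) {}, nullptr); // drop all log output
        body();
        ggml_log_set(prev_cb, prev_ud);    // restore the previous callback (NULL falls back to stderr)
    }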

ggml/src/ggml-alloc.c

Lines changed: 56 additions & 12 deletions
@@ -594,7 +594,9 @@ static bool ggml_gallocr_is_own(ggml_gallocr_t galloc, struct ggml_tensor * t) {
 }
 
 static bool ggml_gallocr_is_allocated(ggml_gallocr_t galloc, struct ggml_tensor * t) {
-    return t->data != NULL || ggml_gallocr_hash_get(galloc, t)->allocated;
+    return t->data != NULL                 // tensor data already set externally
+        || t->buffer                       // tensor on external buffer (but not yet allocated)
+        || ggml_gallocr_is_own(galloc, t); // tensor will be allocated by galloc
 }
 
 // free the extra space at the end if the new tensor is smaller
@@ -823,7 +825,8 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
     }
 }
 
-bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
+static bool ggml_gallocr_reserve_n_impl(
+        ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids, bool no_alloc) {
     size_t min_hash_size = graph->n_nodes + graph->n_leafs;
     // add 25% margin to avoid hash collisions
     min_hash_size += min_hash_size / 4;
@@ -928,23 +931,41 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
                 size_t cur_size = galloc->buffers[i] ? ggml_vbuffer_size(galloc->buffers[i]) : 0;
                 if (cur_size > 0) {
                     GGML_LOG_DEBUG("%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n",
-                        __func__, ggml_backend_buft_name(galloc->bufts[i]),
-                        cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
+                        __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
                 }
             }
 #endif
             ggml_vbuffer_free(galloc->buffers[i]);
-            galloc->buffers[i] = ggml_vbuffer_alloc(galloc->bufts[i], galloc->buf_tallocs[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
-            if (galloc->buffers[i] == NULL) {
-                GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
-                return false;
+            if (no_alloc) {
+                galloc->buffers[i] = NULL;
+            } else {
+                galloc->buffers[i] = ggml_vbuffer_alloc(galloc->bufts[i], galloc->buf_tallocs[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
+                if (galloc->buffers[i] == NULL) {
+                    GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
+                    return false;
+                }
             }
         }
     }
 
     return true;
 }
 
+void ggml_gallocr_reserve_n_size(
+        ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids, size_t * sizes) {
+    GGML_ASSERT(ggml_gallocr_reserve_n_impl(galloc, graph, node_buffer_ids, leaf_buffer_ids, /*no_alloc =*/ true));
+    for (int i = 0; i < galloc->n_buffers; i++) {
+        sizes[i] = 0;
+        for (int c = 0; c < galloc->buf_tallocs[i]->n_chunks; c++) {
+            sizes[i] += galloc->buf_tallocs[i]->chunks[c]->max_size;
+        }
+    }
+}
+
+bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
+    return ggml_gallocr_reserve_n_impl(galloc, graph, node_buffer_ids, leaf_buffer_ids, /*no_alloc =*/ false);
+}
+
 bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
     return ggml_gallocr_reserve_n(galloc, graph, NULL, NULL);
 }
@@ -1147,14 +1168,16 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
     return true;
 }
 
-ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
+static ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft_impl(
+        struct ggml_context * ctx, ggml_backend_buffer_type_t buft, size_t * nbytes_total, bool no_alloc) {
     GGML_ASSERT(ggml_get_no_alloc(ctx) == true);
 
     size_t alignment = ggml_backend_buft_get_alignment(buft);
     size_t max_size = ggml_backend_buft_get_max_size(buft);
 
     ggml_backend_buffer_t * buffers = NULL;
     size_t n_buffers = 0;
+    *nbytes_total = 0;
 
     size_t cur_buf_size = 0;
     struct ggml_tensor * first = ggml_get_first_tensor(ctx);
@@ -1166,10 +1189,11 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
 
         if (cur_buf_size > 0 && (cur_buf_size + this_size) > max_size) {
             // allocate tensors in the current buffer
-            if (!alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, &n_buffers)) {
+            if (!no_alloc && !alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, &n_buffers)) {
                 return NULL;
             }
             first = t;
+            *nbytes_total += cur_buf_size;
             cur_buf_size = this_size;
         } else {
             cur_buf_size += this_size;
@@ -1178,15 +1202,21 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
 
     // allocate remaining tensors
     if (cur_buf_size > 0) {
-        if (!alloc_tensor_range(ctx, first, NULL, buft, cur_buf_size, &buffers, &n_buffers)) {
+        *nbytes_total += cur_buf_size;
+        if (!no_alloc && !alloc_tensor_range(ctx, first, NULL, buft, cur_buf_size, &buffers, &n_buffers)) {
             return NULL;
         }
     }
 
+    if (no_alloc) {
+        return NULL;
+    }
+
     if (n_buffers == 0) {
 #ifndef NDEBUG
         GGML_LOG_DEBUG("%s: all tensors in the context are already allocated\n", __func__);
 #endif
+        GGML_ASSERT(!buffers);
         return NULL;
     }
@@ -1196,10 +1226,24 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
     } else {
         buffer = ggml_backend_multi_buffer_alloc_buffer(buffers, n_buffers);
     }
-    free(buffers);
+    if (buffers) {
+        free(buffers); // can be NULL if context is empty or no_alloc
+    }
     return buffer;
 }
 
+size_t ggml_backend_alloc_ctx_tensors_from_buft_size(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
+    size_t nbytes_total = 0;
+    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft_impl(ctx, buft, &nbytes_total, /*no_alloc=*/ true);
+    GGML_ASSERT(!buf);
+    return nbytes_total;
+}
+
+ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
+    size_t nbytes_total = 0;
+    return ggml_backend_alloc_ctx_tensors_from_buft_impl(ctx, buft, &nbytes_total, /*no_alloc =*/ false);
+}
+
 ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend) {
     return ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_get_default_buffer_type(backend));
 }
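
To tie the no_alloc plumbing together, a short sketch (alloc_if_it_fits is a hypothetical helper, not part of this commit) of how the size query above can gate the real allocation:

    #include "ggml.h"
    #include "ggml-alloc.h"
    #include "ggml-backend.h"

    // Sketch: only allocate the tensors of a context on `buft` if they fit a given byte budget.
    static ggml_backend_buffer_t alloc_if_it_fits(struct ggml_context * ctx, ggml_backend_buffer_type_t buft, size_t budget) {
        const size_t needed = ggml_backend_alloc_ctx_tensors_from_buft_size(ctx, buft);
        if (needed > budget) {
            return nullptr; // caller can fall back to another buffer type (e.g. host memory)
        }
        return ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
    }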
