Commit c98ce3b (parent e4db276)

Sync llama.cpp API 20251215

1 file changed: llama_cpp/llama_cpp.py (75 additions, 2 deletions)
@@ -712,6 +712,7 @@ class llama_model_tensor_buft_override(ctypes.Structure):
#     bool check_tensors; // validate model tensor data
#     bool use_extra_bufts; // use extra buffer types (used for weight repacking)
#     bool no_host; // bypass host buffer allowing extra buffers to be used
+#     bool no_alloc; // only load metadata and simulate memory allocations
# };
class llama_model_params(ctypes.Structure):
    """Parameters for llama_model
@@ -731,7 +732,8 @@ class llama_model_params(ctypes.Structure):
        use_mlock (bool): force system to keep model in RAM
        check_tensors (bool): validate model tensor data
        use_extra_bufts (bool): use extra buffer types (used for weight repacking)
-        no_host (bool): bypass host buffer allowing extra buffers to be used"""
+        no_host (bool): bypass host buffer allowing extra buffers to be used
+        no_alloc (bool): only load metadata and simulate memory allocations"""

    if TYPE_CHECKING:
        devices: CtypesArray[ctypes.c_void_p] # NOTE: unused
@@ -749,6 +751,7 @@ class llama_model_params(ctypes.Structure):
        check_tensors: bool
        use_extra_bufts: bool
        no_host: bool
+        no_alloc: bool

    _fields_ = [
        ("devices", ctypes.c_void_p), # NOTE: unnused
@@ -766,8 +769,10 @@ class llama_model_params(ctypes.Structure):
        ("check_tensors", ctypes.c_bool),
        ("use_extra_bufts", ctypes.c_bool),
        ("no_host", ctypes.c_bool),
+        ("no_alloc", ctypes.c_bool),
    ]

+llama_model_params_p = ctypes.POINTER(llama_model_params)

# // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
# // https://github.com/ggml-org/llama.cpp/pull/7544
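
The new no_alloc flag can be set on the default model params before loading. Below is a minimal sketch of a metadata-only "dry run" load; the model path is a placeholder, and it assumes the loader and cleanup bindings already exposed by this module (llama_backend_init, llama_model_default_params, llama_model_load_from_file, llama_model_free, llama_backend_free):

import llama_cpp

MODEL_PATH = b"/models/example-7b-q4_k_m.gguf"  # placeholder path, illustration only

llama_cpp.llama_backend_init()

mparams = llama_cpp.llama_model_default_params()
mparams.no_alloc = True  # only load metadata and simulate memory allocations

# With no_alloc set, the load should not commit real weight buffers, which makes
# it usable for estimating memory requirements before a full load.
model = llama_cpp.llama_model_load_from_file(MODEL_PATH, mparams)
if model:
    llama_cpp.llama_model_free(model)

llama_cpp.llama_backend_free()
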
@@ -918,6 +923,7 @@ class llama_context_params(ctypes.Structure):
        ("kv_unified", ctypes.c_bool),
    ]

+llama_context_params_p = ctypes.POINTER(llama_context_params)

# // Signature for logging events
# // Note that text includes the new line character at the end for most events.
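
The new llama_model_params_p and llama_context_params_p aliases are ordinary ctypes.POINTER types; they are what the llama_params_fit binding added below declares for its mparams and cparams arguments. A small sketch, assuming this commit is applied:

import ctypes
import llama_cpp

cparams = llama_cpp.llama_context_default_params()
cparams_p = ctypes.pointer(cparams)  # what a llama_context_params_p argument expects
assert isinstance(cparams_p, llama_cpp.llama_context_params_p)
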
@@ -1306,6 +1312,51 @@ def llama_free(ctx: llama_context_p, /):
    ...


+# // fits mparams and cparams to free device memory (assumes system memory is unlimited)
+# // returns true if the parameters could be successfully modified to fit device memory
+# // this function is NOT thread safe because it modifies the global llama logger state
+# LLAMA_API bool llama_params_fit(
+#     const char * path_model,
+#     struct llama_model_params * mparams,
+#     struct llama_context_params * cparams,
+#     float * tensor_split, // writable buffer for tensor split, needs at least llama_max_devices elements
+#     struct llama_model_tensor_buft_override * tensor_buft_overrides, // writable buffer for overrides, needs at least llama_max_tensor_buft_overrides elements
+#     size_t margin, // margin of memory to leave per device in bytes
+#     uint32_t n_ctx_min, // minimum context size to set when trying to reduce memory use
+#     enum ggml_log_level log_level); // minimum log level to print during fitting, lower levels go to debug log
+@ctypes_function(
+    "llama_params_fit",
+    [
+        ctypes.c_char_p,
+        llama_model_params_p,
+        llama_context_params_p,
+        ctypes.POINTER(ctypes.c_float),
+        ctypes.POINTER(llama_model_tensor_buft_override),
+        ctypes.c_size_t,
+        ctypes.c_uint32,
+        ctypes.c_int,
+    ],
+    ctypes.c_bool,
+)
+def llama_params_fit(
+    path_model: ctypes.c_char_p,
+    mparams: llama_model_params_p,
+    cparams: llama_context_params_p,
+    tensor_split: ctypes.pointer(ctypes.c_float),
+    tensor_buft_overrides: ctypes.pointer(llama_model_tensor_buft_override),
+    margin: ctypes.c_size_t,
+    n_ctx_min: ctypes.c_uint32,
+    log_level: int,
+    /,
+) -> bool:
+    """
+    fits mparams and cparams to free device memory (assumes system memory is unlimited)
+    returns true if the parameters could be successfully modified to fit device memory
+    this function is NOT thread safe because it modifies the global llama logger state
+    """
+    ...
+
+
# LLAMA_API int64_t llama_time_us(void);
@ctypes_function(
    "llama_time_us",
@@ -1328,6 +1379,12 @@ def llama_max_parallel_sequences() -> int:
    ...


+# LLAMA_API size_t llama_max_tensor_buft_overrides(void);
+@ctypes_function("llama_max_tensor_buft_overrides", [], ctypes.c_size_t)
+def llama_max_tensor_buft_overrides() -> int:
+    ...
+
+
# LLAMA_API bool llama_supports_mmap (void);
@ctypes_function("llama_supports_mmap", [], ctypes.c_bool)
def llama_supports_mmap() -> bool:
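
Like llama_max_devices, the new llama_max_tensor_buft_overrides reports a fixed capacity that callers use to size the writable overrides buffer passed to llama_params_fit (see the sketch above). A trivial sketch:

import llama_cpp

print("tensor_split slots:", llama_cpp.llama_max_devices())
print("tensor buft override slots:", llama_cpp.llama_max_tensor_buft_overrides())
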
@@ -4217,6 +4274,23 @@ def llama_print_system_info() -> bytes:

# // Set callback for all future logging events.
# // If this is not called, or NULL is supplied, everything is output on stderr.
+# // The logger state is global so these functions are NOT thread safe.
+# LLAMA_API void llama_log_get(ggml_log_callback * log_callback, void ** user_data);
+@ctypes_function(
+    "llama_log_get",
+    [ctypes.POINTER(ggml_log_callback), ctypes.POINTER(ctypes.c_void_p)],
+    None,
+)
+def llama_log_get(
+    log_callback: Optional[ctypes.pointer(ggml_log_callback)],
+    user_data: ctypes.pointer(ctypes.c_void_p),
+    /,
+):
+    """Get callback for all future logging events.
+    If this is not called, or NULL is supplied, everything is output on stderr."""
+    ...
+
+
# LLAMA_API void llama_log_set(ggml_log_callback log_callback, void * user_data);
@ctypes_function(
    "llama_log_set",
@@ -4229,7 +4303,6 @@ def llama_log_set(
    /,
):
    """Set callback for all future logging events.
-
    If this is not called, or NULL is supplied, everything is output on stderr."""
    ...

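Because the logger state is global and llama_params_fit modifies it, a natural pattern is to save the current callback with the new llama_log_get and restore it later with llama_log_set. A sketch, assuming ggml_log_callback is the CFUNCTYPE type this module already defines (as the binding above implies):

import ctypes
import llama_cpp

# Storage the getter fills in: the current callback and its user_data.
prev_cb = llama_cpp.ggml_log_callback()
prev_ud = ctypes.c_void_p()
llama_cpp.llama_log_get(ctypes.byref(prev_cb), ctypes.byref(prev_ud))

@llama_cpp.ggml_log_callback
def quiet_logger(level, text, user_data):
    pass  # drop all output; module-level name keeps the callback referenced

llama_cpp.llama_log_set(quiet_logger, ctypes.c_void_p(0))
# ... noisy work such as model loading or llama_params_fit ...
llama_cpp.llama_log_set(prev_cb, prev_ud)  # restore the original logger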