@@ -712,6 +712,7 @@ class llama_model_tensor_buft_override(ctypes.Structure):
 # bool check_tensors; // validate model tensor data
 # bool use_extra_bufts; // use extra buffer types (used for weight repacking)
 # bool no_host; // bypass host buffer allowing extra buffers to be used
+# bool no_alloc; // only load metadata and simulate memory allocations
 # };
 class llama_model_params(ctypes.Structure):
     """Parameters for llama_model
@@ -731,7 +732,8 @@ class llama_model_params(ctypes.Structure):
         use_mlock (bool): force system to keep model in RAM
         check_tensors (bool): validate model tensor data
         use_extra_bufts (bool): use extra buffer types (used for weight repacking)
-        no_host (bool): bypass host buffer allowing extra buffers to be used"""
+        no_host (bool): bypass host buffer allowing extra buffers to be used
+        no_alloc (bool): only load metadata and simulate memory allocations"""
 
     if TYPE_CHECKING:
         devices: CtypesArray[ctypes.c_void_p]  # NOTE: unused
@@ -749,6 +751,7 @@ class llama_model_params(ctypes.Structure):
         check_tensors: bool
         use_extra_bufts: bool
         no_host: bool
+        no_alloc: bool
 
     _fields_ = [
         ("devices", ctypes.c_void_p),  # NOTE: unused
@@ -766,8 +769,10 @@ class llama_model_params(ctypes.Structure):
         ("check_tensors", ctypes.c_bool),
         ("use_extra_bufts", ctypes.c_bool),
         ("no_host", ctypes.c_bool),
+        ("no_alloc", ctypes.c_bool),
     ]
 
+llama_model_params_p = ctypes.POINTER(llama_model_params)
 
 # // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
 # // https://github.com/ggml-org/llama.cpp/pull/7544
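A hedged sketch of what the new no_alloc flag enables, for illustration only: the model path is a placeholder, and this assumes the loaded libllama is new enough to honor the field. Setting it requests a dry run where only metadata is read and allocations are simulated.

import llama_cpp

mparams = llama_cpp.llama_model_default_params()
mparams.no_alloc = True  # dry run: load metadata only, simulate memory allocations

model = llama_cpp.llama_model_load_from_file(b"models/model.gguf", mparams)  # placeholder path
if model:
    llama_cpp.llama_model_free(model)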
@@ -918,6 +923,7 @@ class llama_context_params(ctypes.Structure):
         ("kv_unified", ctypes.c_bool),
     ]
 
+llama_context_params_p = ctypes.POINTER(llama_context_params)
 
 # // Signature for logging events
 # // Note that text includes the new line character at the end for most events.
@@ -1306,6 +1312,51 @@ def llama_free(ctx: llama_context_p, /):
     ...
 
 
+# // fits mparams and cparams to free device memory (assumes system memory is unlimited)
+# // returns true if the parameters could be successfully modified to fit device memory
+# // this function is NOT thread safe because it modifies the global llama logger state
+# LLAMA_API bool llama_params_fit(
+#     const char * path_model,
+#     struct llama_model_params * mparams,
+#     struct llama_context_params * cparams,
+#     float * tensor_split, // writable buffer for tensor split, needs at least llama_max_devices elements
+#     struct llama_model_tensor_buft_override * tensor_buft_overrides, // writable buffer for overrides, needs at least llama_max_tensor_buft_overrides elements
+#     size_t margin, // margin of memory to leave per device in bytes
+#     uint32_t n_ctx_min, // minimum context size to set when trying to reduce memory use
+#     enum ggml_log_level log_level); // minimum log level to print during fitting, lower levels go to debug log
+@ctypes_function(
+    "llama_params_fit",
+    [
+        ctypes.c_char_p,
+        llama_model_params_p,
+        llama_context_params_p,
+        ctypes.POINTER(ctypes.c_float),
+        ctypes.POINTER(llama_model_tensor_buft_override),
+        ctypes.c_size_t,
+        ctypes.c_uint32,
+        ctypes.c_int,
+    ],
+    ctypes.c_bool,
+)
+def llama_params_fit(
+    path_model: ctypes.c_char_p,
+    mparams: llama_model_params_p,
+    cparams: llama_context_params_p,
+    tensor_split: ctypes.pointer(ctypes.c_float),
+    tensor_buft_overrides: ctypes.pointer(llama_model_tensor_buft_override),
+    margin: ctypes.c_size_t,
+    n_ctx_min: ctypes.c_uint32,
+    log_level: int,
+    /,
+) -> bool:
+    """
+    Fits mparams and cparams to free device memory (assumes system memory is unlimited).
+    Returns True if the parameters could be successfully modified to fit device memory.
+    This function is NOT thread safe because it modifies the global llama logger state.
+    """
+    ...
+
+
 # LLAMA_API int64_t llama_time_us(void);
 @ctypes_function(
     "llama_time_us",
@@ -1328,6 +1379,12 @@ def llama_max_parallel_sequences() -> int:
     ...
 
 
+# LLAMA_API size_t llama_max_tensor_buft_overrides(void);
+@ctypes_function("llama_max_tensor_buft_overrides", [], ctypes.c_size_t)
+def llama_max_tensor_buft_overrides() -> int:
+    ...
+
+
 # LLAMA_API bool llama_supports_mmap (void);
 @ctypes_function("llama_supports_mmap", [], ctypes.c_bool)
 def llama_supports_mmap() -> bool:
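A hedged usage sketch for the new llama_params_fit binding, assuming this module is importable as llama_cpp and the loaded libllama exports the symbol. The path, margin, minimum context, and log level are placeholder values; the scratch buffers are sized as the header comments require.

import ctypes
import llama_cpp

mparams = llama_cpp.llama_model_default_params()
cparams = llama_cpp.llama_context_default_params()

# Writable scratch buffers: at least llama_max_devices() floats for the
# tensor split and llama_max_tensor_buft_overrides() override entries.
tensor_split = (ctypes.c_float * llama_cpp.llama_max_devices())()
overrides = (llama_cpp.llama_model_tensor_buft_override
             * llama_cpp.llama_max_tensor_buft_overrides())()

ok = llama_cpp.llama_params_fit(
    b"models/model.gguf",    # placeholder path
    ctypes.byref(mparams),
    ctypes.byref(cparams),
    tensor_split,
    overrides,
    512 * 1024 * 1024,       # leave ~512 MiB free per device
    4096,                    # do not shrink the context below 4096
    2,                       # GGML_LOG_LEVEL_INFO in ggml.h
)
if ok:
    # mparams and cparams were modified in place to fit free device memory.
    print(mparams.n_gpu_layers, cparams.n_ctx)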
@@ -4217,6 +4274,23 @@ def llama_print_system_info() -> bytes:
 
 # // Set callback for all future logging events.
 # // If this is not called, or NULL is supplied, everything is output on stderr.
+# // The logger state is global so these functions are NOT thread safe.
+# LLAMA_API void llama_log_get(ggml_log_callback * log_callback, void ** user_data);
+@ctypes_function(
+    "llama_log_get",
+    [ctypes.POINTER(ggml_log_callback), ctypes.POINTER(ctypes.c_void_p)],
+    None,
+)
+def llama_log_get(
+    log_callback: Optional[ctypes.pointer(ggml_log_callback)],
+    user_data: ctypes.pointer(ctypes.c_void_p),
+    /,
+):
+    """Get the currently installed logging callback and its user_data.
+    The logger state is global, so this function is NOT thread safe."""
+    ...
+
+
 # LLAMA_API void llama_log_set(ggml_log_callback log_callback, void * user_data);
 @ctypes_function(
     "llama_log_set",
@@ -4229,7 +4303,6 @@ def llama_log_set(
     /,
 ):
     """Set callback for all future logging events.
-
     If this is not called, or NULL is supplied, everything is output on stderr."""
     ...
 
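Because llama_log_get and llama_log_set share global logger state, a natural pattern is to snapshot the current logger before changing it and restore it afterwards. A minimal sketch, assuming ggml_log_callback is the CFUNCTYPE logging signature this module defines and that llama_log_set declares its arguments as c_void_p:

import ctypes
import llama_cpp

saved_cb = llama_cpp.ggml_log_callback()  # NULL function pointer to be filled in
saved_ud = ctypes.c_void_p()
llama_cpp.llama_log_get(ctypes.byref(saved_cb), ctypes.byref(saved_ud))

# ... install a temporary callback with llama_log_set and do noisy work ...

# Restore the previous logger; the cast matches the c_void_p argtype.
llama_cpp.llama_log_set(ctypes.cast(saved_cb, ctypes.c_void_p), saved_ud)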