Skip to content

Commit 8213c19

Browse files
committed
perf: optimize LlamaModel.metadata reading performance
- Increase initial buffer size to 16KB to eliminate re-allocations for large chat templates. - Cache ctypes function references to reduce loop overhead. - Repeated model loading can result in a cumulative speed improvement of 1-3%. Signed-off-by: JamePeng <jame_peng@sina.com>
1 parent 8322481 commit 8213c19

File tree

1 file changed

+24
-18
lines changed

1 file changed

+24
-18
lines changed

llama_cpp/_internals.py

Lines changed: 24 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -225,32 +225,38 @@ def detokenize(self, tokens: List[int], special: bool = False) -> bytes:
225225
# Extra
226226
def metadata(self) -> Dict[str, str]:
    """Return all GGUF metadata key/value pairs of the loaded model.

    Iterates the model's metadata via the llama.cpp C API and decodes each
    key and value as UTF-8.

    Returns:
        Dict[str, str]: mapping of metadata key -> value. Entries whose key
        or value cannot be retrieved (negative return from the C API) are
        skipped rather than populated with stale buffer contents.
    """
    metadata: Dict[str, str] = {}
    # Pre-allocate a 16KB buffer. This is large enough to hold almost all
    # metadata values (including large chat templates ~15KB) in a single
    # pass, avoiding resize-and-retry in the common case.
    buffer_size = 16384
    buffer = ctypes.create_string_buffer(buffer_size)

    # Hoist attribute lookups out of the loop (measured hot path: repeated
    # model loads benefit from LOAD_FAST locals instead of module lookups).
    get_key_by_index = llama_cpp.llama_model_meta_key_by_index
    get_val_by_index = llama_cpp.llama_model_meta_val_str_by_index
    metadata_count = llama_cpp.llama_model_meta_count(self.model)

    for i in range(metadata_count):
        # 1. Get key. The C API has snprintf semantics: it returns the
        # string length *excluding* the NUL terminator, so the result is
        # truncated whenever nbytes >= buffer_size (a buffer of exactly
        # nbytes bytes has no room for the terminator).
        nbytes = get_key_by_index(self.model, i, buffer, buffer_size)
        if nbytes < 0:
            # API error for this index — skip rather than decode stale data.
            continue
        if nbytes >= buffer_size:
            # Grow with headroom (+1 for NUL, extra slack to reduce the
            # chance of resizing again for the value) and retry.
            buffer_size = nbytes + 1024
            buffer = ctypes.create_string_buffer(buffer_size)
            nbytes = get_key_by_index(self.model, i, buffer, buffer_size)
        key = buffer.value.decode("utf-8")

        # 2. Get value — same snprintf-style contract as above.
        nbytes = get_val_by_index(self.model, i, buffer, buffer_size)
        if nbytes < 0:
            continue
        if nbytes >= buffer_size:
            buffer_size = nbytes + 1024
            buffer = ctypes.create_string_buffer(buffer_size)
            nbytes = get_val_by_index(self.model, i, buffer, buffer_size)
        value = buffer.value.decode("utf-8")

        metadata[key] = value
    return metadata
256262

0 commit comments

Comments
 (0)