From 3074b500a5fffba5c9b84a7c14e4dc2248b3653c Mon Sep 17 00:00:00 2001 From: JTischbein Date: Sat, 13 Dec 2025 20:10:21 +0100 Subject: [PATCH 01/10] Uncached model read --- common/arg.cpp | 7 ++ common/common.h | 2 +- src/llama-mmap.cpp | 133 ++++++++++++++++++++++++++++++++++++- src/llama-mmap.h | 3 + src/llama-model-loader.cpp | 117 +++++++++++++++++++++++++++++++- 5 files changed, 255 insertions(+), 7 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index bb2a6840baa..88c65abdd92 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1984,6 +1984,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.use_mmap = value; } ).set_env("LLAMA_ARG_MMAP")); + add_opt(common_arg( + {"--mmap"}, + "memory-map model", + [](common_params & params) { + params.use_mmap = true; + } + ).set_env("LLAMA_ARG_MMAP")); add_opt(common_arg( {"--numa"}, "TYPE", "attempt optimizations that help on some NUMA systems\n" diff --git a/common/common.h b/common/common.h index 2fd83f0cf9c..3cd531f6867 100644 --- a/common/common.h +++ b/common/common.h @@ -413,7 +413,7 @@ struct common_params { bool kv_unified = false; // enable unified KV cache bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix - bool use_mmap = true; // use mmap for faster loads + bool use_mmap = false; // use uncached reads for faster loads bool use_mlock = false; // use mlock to keep model in memory bool verbose_prompt = false; // print prompt tokens before generation bool display_prompt = true; // print prompt before generation diff --git a/src/llama-mmap.cpp b/src/llama-mmap.cpp index 0641c2d22f6..232dcdb9e49 100644 --- a/src/llama-mmap.cpp +++ b/src/llama-mmap.cpp @@ -13,9 +13,10 @@ #ifdef __has_include #if __has_include() #include + #include + #include #if defined(_POSIX_MAPPED_FILES) #include - #include #endif #if defined(_POSIX_MEMLOCK_RANGE) #include @@ -158,6 +159,129 @@ struct llama_file::impl { std::fclose(fp); } } +#elif defined(__linux__) + impl(const char * fname, const char * mode) : impl(fname, mode, false) {} + + impl(const char * fname, const char * mode, bool uncached_read) { + if (uncached_read) { + fd = open(fname, O_RDONLY | O_DIRECT); + if (fd == -1) { + throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno))); + } + + struct stat file_stats{}; + fstat(fd, &file_stats); + + size = file_stats.st_size; + + off_t ret = lseek(fd, 0, SEEK_SET); + if (ret == -1) { + throw std::runtime_error(format("seek error: %s", strerror(errno))); + } + } else { + fp = ggml_fopen(fname, mode); + if (fp == NULL) { + throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno))); + } + seek(0, SEEK_END); + size = tell(); + seek(0, SEEK_SET); + } + } + + size_t tell() const { + if (fd == -1) { + long ret = std::ftell(fp); + if (ret == -1) { + throw std::runtime_error(format("ftell error: %s", strerror(errno))); + } + + return (size_t) ret; + } + + off_t pos = lseek(fd, 0, SEEK_CUR); + if (pos == -1) { + throw std::runtime_error(format("lseek error: %s", strerror(errno))); + } + return (size_t) pos; + } + + void seek(size_t offset, int whence) const { + off_t ret = 0; + if (fd == -1) { + ret = std::fseek(fp, (long) offset, whence); + } else { + ret = lseek(fd, offset, whence); + } + if (ret == -1) { + throw std::runtime_error(format("seek error: %s", strerror(errno))); + } + } + + void read_raw(void * ptr, size_t len) const { + if (len == 0) { + return; + } + if (fd == -1) { + errno = 0; + std::size_t ret = std::fread(ptr, len, 1, 
fp); + if (ferror(fp)) { + throw std::runtime_error(format("read error: %s", strerror(errno))); + } + if (ret != 1) { + throw std::runtime_error("unexpectedly reached end of file"); + } + } else { + bool successful = false; + while (!successful) { + off_t ret = read(fd, ptr, len); + + if (ret == -1) { + if (errno == EINTR) { + continue; // Interrupted by signal, retry + } + throw std::runtime_error(format("read error: %s", strerror(errno))); + } + if (ret == 0) { + throw std::runtime_error("unexpectedly reached end of file"); + } + + successful = true; + } + } + } + + uint32_t read_u32() const { + uint32_t ret; + read_raw(&ret, sizeof(ret)); + return ret; + } + + void write_raw(const void * ptr, size_t len) const { + if (len == 0) { + return; + } + errno = 0; + size_t ret = std::fwrite(ptr, len, 1, fp); + if (ret != 1) { + throw std::runtime_error(format("write error: %s", strerror(errno))); + } + } + + void write_u32(uint32_t val) const { + write_raw(&val, sizeof(val)); + } + + ~impl() { + if (fp) { + std::fclose(fp); + } else if (fd != -1) { + close(fd); + } + } + + int fd = -1; + #else impl(const char * fname, const char * mode) { fp = ggml_fopen(fname, mode); @@ -237,11 +361,14 @@ struct llama_file::impl { } #endif - FILE * fp; - size_t size; + FILE * fp{}; + size_t size{}; }; llama_file::llama_file(const char * fname, const char * mode) : pimpl(std::make_unique(fname, mode)) {} +#if defined(__linux__) +llama_file::llama_file(const char * fname, const char * mode, bool uncached_read) : pimpl(std::make_unique(fname, mode, uncached_read)) {} +#endif llama_file::~llama_file() = default; size_t llama_file::tell() const { return pimpl->tell(); } diff --git a/src/llama-mmap.h b/src/llama-mmap.h index 4e5aec3f440..985404d0f52 100644 --- a/src/llama-mmap.h +++ b/src/llama-mmap.h @@ -14,6 +14,9 @@ using llama_mlocks = std::vector>; struct llama_file { llama_file(const char * fname, const char * mode); +#if defined(__linux__) + llama_file(const char * fname, const char * mode, bool uncached_read); +#endif ~llama_file(); size_t tell() const; diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index aa3a65f87a5..03b855e2a90 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -502,8 +502,12 @@ llama_model_loader::llama_model_loader( get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false); llm_kv = LLM_KV(llm_arch_from_string(arch_name)); - + +#if defined(__linux__) + files.emplace_back(new llama_file(fname.c_str(), "rb", !use_mmap)); +#else files.emplace_back(new llama_file(fname.c_str(), "rb")); +#endif contexts.emplace_back(ctx); // Save tensors data offset of the main file. @@ -571,7 +575,11 @@ llama_model_loader::llama_model_loader( } } +#if defined(__linux__) + files.emplace_back(new llama_file(fname_split, "rb", !use_mmap)); +#else files.emplace_back(new llama_file(fname_split, "rb")); +#endif contexts.emplace_back(ctx); // Save tensors data offset info of the shard. @@ -933,7 +941,14 @@ bool llama_model_loader::load_all_data( // 4 staging buffers for async uploads, each sized 1MB seems to be a good default for single NVMe drives. // NVMe raid configurations might require more / larger buffers. 
constexpr size_t n_buffers = 4; +#if defined(__linux__) + constexpr size_t alignment = 4 * 1024; // 4 KiB for Direct I/O + // Buffer size: balance between memory usage and I/O efficiency + // 64MB works well for NVMe drives + constexpr size_t buffer_size = 64 * 1024 * 1024; // 64 MiB +#else constexpr size_t buffer_size = 1 * 1024 * 1024; // 1MB +#endif std::vector host_buffers; std::vector events; @@ -982,7 +997,11 @@ bool llama_model_loader::load_all_data( // If the backend is supported, create pinned memory buffers and events for synchronisation. for (size_t idx = 0; idx < n_buffers; ++idx) { +#if defined(__linux__) + auto * buf = ggml_backend_buft_alloc_buffer(host_buft, buffer_size + 2 * alignment); +#else auto * buf = ggml_backend_buft_alloc_buffer(host_buft, buffer_size); +#endif if (!buf) { LLAMA_LOG_DEBUG("%s: failed to allocate host buffer for async uploads for device %s\n", func, ggml_backend_dev_name(dev)); @@ -1019,6 +1038,35 @@ bool llama_model_loader::load_all_data( ggml_backend_name(upload_backend)); } +#if defined(__linux__) + auto read_aligned_chunk = [](const llama_file * file, + size_t offset, + void * dest, + size_t size, + size_t alignment) { + off_t aligned_offset = offset & ~(alignment - 1); + off_t offset_from_alignment = offset - aligned_offset; + size_t bytes_to_read = (offset_from_alignment + size + alignment - 1) & ~(alignment - 1); + + void * raw_buffer = nullptr; + int ret = posix_memalign(&raw_buffer, alignment, bytes_to_read); + if (ret != 0) { + throw std::runtime_error(format("posix_memalign failed with error %d", ret)); + } + + struct aligned_buffer_deleter { + void operator()(void * p) const { free(p); } + }; + std::unique_ptr buffer(raw_buffer); + + file->seek(aligned_offset, SEEK_SET); + file->read_raw(buffer.get(), bytes_to_read); + + uintptr_t actual_data = reinterpret_cast(buffer.get()) + offset_from_alignment; + memcpy(dest, reinterpret_cast(actual_data), size); + }; +#endif + for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) { const auto * weight = get_weight(ggml_get_name(cur)); if (weight == nullptr) { @@ -1064,9 +1112,18 @@ bool llama_model_loader::load_all_data( } } else { const auto & file = files.at(weight->idx); +#if defined(__linux__) + auto offset = (off_t) weight->offs; + off_t aligned_offset = offset & ~(alignment - 1); + off_t offset_from_alignment = offset - aligned_offset; +#endif if (ggml_backend_buffer_is_host(cur->buffer)) { +#if defined(__linux__) + read_aligned_chunk(file.get(), weight->offs, cur->data, n_size, alignment); +#else file->seek(weight->offs, SEEK_SET); file->read_raw(cur->data, n_size); +#endif if (check_tensors) { validation_result.emplace_back(std::async(std::launch::async, [cur, n_size] { return std::make_pair(cur, ggml_validate_row_data(cur->type, cur->data, n_size)); @@ -1075,6 +1132,55 @@ bool llama_model_loader::load_all_data( } else { // If upload_backend is valid load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU. 
if (upload_backend) { +#if defined(__linux__) + // Calculate aligned read boundaries + size_t read_start = aligned_offset; + size_t read_end = (offset + n_size + alignment - 1) & ~(alignment - 1); + + size_t bytes_read = 0; + size_t data_read = 0; // Actual tensor data copied (excluding padding) + + file->seek(aligned_offset, SEEK_SET); + + while (bytes_read < read_end - read_start) { + size_t read_size = std::min(buffer_size, read_end - read_start - bytes_read); + + // Align the destination pointer within the pinned buffer + uintptr_t ptr_dest_aligned = (reinterpret_cast(host_ptrs[buffer_idx]) + alignment - 1) & ~(alignment - 1); + + // Wait for previous upload to complete before reusing buffer + ggml_backend_event_synchronize(events[buffer_idx]); + + // Read aligned chunk from file + file->read_raw(reinterpret_cast(ptr_dest_aligned), read_size); + + // Calculate actual data portion (excluding alignment padding) + uintptr_t ptr_data = ptr_dest_aligned; + size_t data_to_copy = read_size; + + // Skip alignment padding at start of first chunk + if (bytes_read == 0) { + ptr_data += offset_from_alignment; + data_to_copy -= offset_from_alignment; + } + + // Trim alignment padding at end of last chunk + if (aligned_offset + bytes_read + read_size > offset + n_size) { + data_to_copy -= (read_end - (offset + n_size)); + } + + // Async upload actual data to GPU + ggml_backend_tensor_set_async(upload_backend, cur, + reinterpret_cast(ptr_data), data_read, data_to_copy); + ggml_backend_event_record(events[buffer_idx], upload_backend); + + data_read += data_to_copy; + bytes_read += read_size; + + ++buffer_idx; + buffer_idx %= n_buffers; + } +#else file->seek(weight->offs, SEEK_SET); size_t bytes_read = 0; @@ -1091,11 +1197,16 @@ bool llama_model_loader::load_all_data( ++buffer_idx; buffer_idx %= n_buffers; } +#endif } else { read_buf.resize(n_size); +#if defined(__linux__) + read_aligned_chunk(file.get(), weight->offs, read_buf.data(), n_size, alignment); +#else file->seek(weight->offs, SEEK_SET); - file->read_raw(read_buf.data(), n_size); - ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size); + file->read_raw(read_buf.data(), n_size); +#endif + ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size); if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) { throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur))); } From 26cc75ffccb111c2f53e74a52e6d5ab850cd7513 Mon Sep 17 00:00:00 2001 From: JTischbein Date: Sun, 14 Dec 2025 09:41:27 +0100 Subject: [PATCH 02/10] Removing additional --mmap arg --- common/arg.cpp | 7 ------- 1 file changed, 7 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 88c65abdd92..bb2a6840baa 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1984,13 +1984,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.use_mmap = value; } ).set_env("LLAMA_ARG_MMAP")); - add_opt(common_arg( - {"--mmap"}, - "memory-map model", - [](common_params & params) { - params.use_mmap = true; - } - ).set_env("LLAMA_ARG_MMAP")); add_opt(common_arg( {"--numa"}, "TYPE", "attempt optimizations that help on some NUMA systems\n" From ceccfb9ee644f1591e4fa5f1cc39a3ca9010cdd6 Mon Sep 17 00:00:00 2001 From: JTischbein Date: Sun, 14 Dec 2025 19:36:31 +0100 Subject: [PATCH 03/10] Removing trailing whitespaces --- src/llama-mmap.cpp | 2 +- src/llama-model-loader.cpp | 18 +++++++++--------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/llama-mmap.cpp b/src/llama-mmap.cpp index 
232dcdb9e49..7d23d6973ef 100644 --- a/src/llama-mmap.cpp +++ b/src/llama-mmap.cpp @@ -160,7 +160,7 @@ struct llama_file::impl { } } #elif defined(__linux__) - impl(const char * fname, const char * mode) : impl(fname, mode, false) {} + impl(const char * fname, const char * mode) : impl(fname, mode, false) {} impl(const char * fname, const char * mode, bool uncached_read) { if (uncached_read) { diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index 03b855e2a90..9faf85a050e 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -502,7 +502,7 @@ llama_model_loader::llama_model_loader( get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false); llm_kv = LLM_KV(llm_arch_from_string(arch_name)); - + #if defined(__linux__) files.emplace_back(new llama_file(fname.c_str(), "rb", !use_mmap)); #else @@ -1047,21 +1047,21 @@ bool llama_model_loader::load_all_data( off_t aligned_offset = offset & ~(alignment - 1); off_t offset_from_alignment = offset - aligned_offset; size_t bytes_to_read = (offset_from_alignment + size + alignment - 1) & ~(alignment - 1); - + void * raw_buffer = nullptr; int ret = posix_memalign(&raw_buffer, alignment, bytes_to_read); if (ret != 0) { throw std::runtime_error(format("posix_memalign failed with error %d", ret)); } - + struct aligned_buffer_deleter { void operator()(void * p) const { free(p); } }; std::unique_ptr buffer(raw_buffer); - + file->seek(aligned_offset, SEEK_SET); file->read_raw(buffer.get(), bytes_to_read); - + uintptr_t actual_data = reinterpret_cast(buffer.get()) + offset_from_alignment; memcpy(dest, reinterpret_cast(actual_data), size); }; @@ -1150,7 +1150,7 @@ bool llama_model_loader::load_all_data( // Wait for previous upload to complete before reusing buffer ggml_backend_event_synchronize(events[buffer_idx]); - + // Read aligned chunk from file file->read_raw(reinterpret_cast(ptr_dest_aligned), read_size); @@ -1163,7 +1163,7 @@ bool llama_model_loader::load_all_data( ptr_data += offset_from_alignment; data_to_copy -= offset_from_alignment; } - + // Trim alignment padding at end of last chunk if (aligned_offset + bytes_read + read_size > offset + n_size) { data_to_copy -= (read_end - (offset + n_size)); @@ -1204,9 +1204,9 @@ bool llama_model_loader::load_all_data( read_aligned_chunk(file.get(), weight->offs, read_buf.data(), n_size, alignment); #else file->seek(weight->offs, SEEK_SET); - file->read_raw(read_buf.data(), n_size); + file->read_raw(read_buf.data(), n_size); #endif - ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size); + ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size); if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) { throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur))); } From d2acc3a8a89f84fdfc0bfc6dc26be24e3e3d7c27 Mon Sep 17 00:00:00 2001 From: JTischbein Date: Mon, 15 Dec 2025 13:44:20 +0100 Subject: [PATCH 04/10] Adding fallback when O_DIRECT is not supported --- src/llama-mmap.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/llama-mmap.cpp b/src/llama-mmap.cpp index 7d23d6973ef..9d03c41d967 100644 --- a/src/llama-mmap.cpp +++ b/src/llama-mmap.cpp @@ -165,6 +165,10 @@ struct llama_file::impl { impl(const char * fname, const char * mode, bool uncached_read) { if (uncached_read) { fd = open(fname, O_RDONLY | O_DIRECT); + if (fd == -1 && (errno == EINVAL || errno == EOPNOTSUPP)) { + fd = open(fname, O_RDONLY); // retry without O_DIRECT + } + if (fd == -1) { throw std::runtime_error(format("failed to open 
%s: %s", fname, strerror(errno))); } From f6d79fe1b1dddbcc8678f918394f5430b37b339e Mon Sep 17 00:00:00 2001 From: JTischbein Date: Tue, 16 Dec 2025 13:58:43 +0100 Subject: [PATCH 05/10] Remove branching in llama-model-loader.cpp and reduce code duplications in llama-mmap.cpp --- src/llama-mmap.cpp | 174 +++++++++++++--------------------- src/llama-mmap.h | 7 +- src/llama-model-loader.cpp | 188 +++++++++++++++---------------------- 3 files changed, 143 insertions(+), 226 deletions(-) diff --git a/src/llama-mmap.cpp b/src/llama-mmap.cpp index 9d03c41d967..044d9aa902a 100644 --- a/src/llama-mmap.cpp +++ b/src/llama-mmap.cpp @@ -75,7 +75,7 @@ struct llama_file::impl { return ret; } - impl(const char * fname, const char * mode) { + impl(const char * fname, const char * mode, const bool use_direct_io = false) { fp = ggml_fopen(fname, mode); if (fp == NULL) { throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno))); @@ -154,43 +154,50 @@ struct llama_file::impl { write_raw(&val, sizeof(val)); } + bool has_direct_io() const { + return false; + } + + void read_aligned_chunk(size_t offset, void * dest, size_t size, size_t alignment) const { + throw std::runtime_error("DirectIO is not implemented on Windows."); + } + ~impl() { if (fp) { std::fclose(fp); } } -#elif defined(__linux__) - impl(const char * fname, const char * mode) : impl(fname, mode, false) {} - - impl(const char * fname, const char * mode, bool uncached_read) { - if (uncached_read) { +#else + impl(const char * fname, const char * mode, const bool use_direct_io = false) { +#ifdef __linux__ + // Try unbuffered I/O for read only + if (use_direct_io && std::strcmp(mode, "rb") == 0) { fd = open(fname, O_RDONLY | O_DIRECT); - if (fd == -1 && (errno == EINVAL || errno == EOPNOTSUPP)) { - fd = open(fname, O_RDONLY); // retry without O_DIRECT - } - - if (fd == -1) { - throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno))); - } - struct stat file_stats{}; - fstat(fd, &file_stats); + if (fd != -1) { + struct stat file_stats{}; + fstat(fd, &file_stats); - size = file_stats.st_size; + size = file_stats.st_size; - off_t ret = lseek(fd, 0, SEEK_SET); - if (ret == -1) { - throw std::runtime_error(format("seek error: %s", strerror(errno))); - } - } else { - fp = ggml_fopen(fname, mode); - if (fp == NULL) { - throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno))); + off_t ret = lseek(fd, 0, SEEK_SET); + if (ret == -1) { + throw std::runtime_error(format("seek error: %s", strerror(errno))); + } + return; } - seek(0, SEEK_END); - size = tell(); - seek(0, SEEK_SET); + + LLAMA_LOG_WARN("Failed to open model %s with error: %s. 
Falling back to buffered I/O", + fname, strerror(errno)); + } +#endif + fp = ggml_fopen(fname, mode); + if (fp == NULL) { + throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno))); } + seek(0, SEEK_END); + size = tell(); + seek(0, SEEK_SET); } size_t tell() const { @@ -226,8 +233,8 @@ struct llama_file::impl { if (len == 0) { return; } + errno = 0; if (fd == -1) { - errno = 0; std::size_t ret = std::fread(ptr, len, 1, fp); if (ferror(fp)) { throw std::runtime_error(format("read error: %s", strerror(errno))); @@ -255,86 +262,27 @@ struct llama_file::impl { } } - uint32_t read_u32() const { - uint32_t ret; - read_raw(&ret, sizeof(ret)); - return ret; - } - - void write_raw(const void * ptr, size_t len) const { - if (len == 0) { - return; - } - errno = 0; - size_t ret = std::fwrite(ptr, len, 1, fp); - if (ret != 1) { - throw std::runtime_error(format("write error: %s", strerror(errno))); - } - } - - void write_u32(uint32_t val) const { - write_raw(&val, sizeof(val)); - } + void read_aligned_chunk(size_t offset, void * dest, size_t size, size_t alignment) const { + off_t aligned_offset = offset & ~(alignment - 1); + off_t offset_from_alignment = offset - aligned_offset; + size_t bytes_to_read = (offset_from_alignment + size + alignment - 1) & ~(alignment - 1); - ~impl() { - if (fp) { - std::fclose(fp); - } else if (fd != -1) { - close(fd); - } - } - - int fd = -1; - -#else - impl(const char * fname, const char * mode) { - fp = ggml_fopen(fname, mode); - if (fp == NULL) { - throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno))); - } - seek(0, SEEK_END); - size = tell(); - seek(0, SEEK_SET); - } - - size_t tell() const { -// TODO: this ifdef is never true? -#ifdef _WIN32 - __int64 ret = _ftelli64(fp); -#else - long ret = std::ftell(fp); -#endif - if (ret == -1) { - throw std::runtime_error(format("ftell error: %s", strerror(errno))); + void * raw_buffer = nullptr; + int ret = posix_memalign(&raw_buffer, alignment, bytes_to_read); + if (ret != 0) { + throw std::runtime_error(format("posix_memalign failed with error %d", ret)); } - return (size_t) ret; - } + struct aligned_buffer_deleter { + void operator()(void * p) const { free(p); } + }; + std::unique_ptr buffer(raw_buffer); - void seek(size_t offset, int whence) const { -// TODO: this ifdef is never true? 
-#ifdef _WIN32 - int ret = _fseeki64(fp, (__int64) offset, whence); -#else - int ret = std::fseek(fp, (long) offset, whence); -#endif - if (ret != 0) { - throw std::runtime_error(format("seek error: %s", strerror(errno))); - } - } + seek(aligned_offset, SEEK_SET); + read_raw(buffer.get(), bytes_to_read); - void read_raw(void * ptr, size_t len) const { - if (len == 0) { - return; - } - errno = 0; - std::size_t ret = std::fread(ptr, len, 1, fp); - if (ferror(fp)) { - throw std::runtime_error(format("read error: %s", strerror(errno))); - } - if (ret != 1) { - throw std::runtime_error("unexpectedly reached end of file"); - } + uintptr_t actual_data = reinterpret_cast(buffer.get()) + offset_from_alignment; + memcpy(dest, reinterpret_cast(actual_data), size); } uint32_t read_u32() const { @@ -358,26 +306,33 @@ struct llama_file::impl { write_raw(&val, sizeof(val)); } + bool has_direct_io() const { + return fd != -1; + } + ~impl() { - if (fp) { + if (fd != -1) { + close(fd); + } else { std::fclose(fp); } } + int fd = -1; #endif FILE * fp{}; size_t size{}; }; -llama_file::llama_file(const char * fname, const char * mode) : pimpl(std::make_unique(fname, mode)) {} -#if defined(__linux__) -llama_file::llama_file(const char * fname, const char * mode, bool uncached_read) : pimpl(std::make_unique(fname, mode, uncached_read)) {} -#endif +llama_file::llama_file(const char * fname, const char * mode, const bool use_direct_io) : + pimpl(std::make_unique(fname, mode, use_direct_io)) {} llama_file::~llama_file() = default; size_t llama_file::tell() const { return pimpl->tell(); } size_t llama_file::size() const { return pimpl->size; } +bool llama_file::has_direct_io() const { return pimpl->has_direct_io(); } + int llama_file::file_id() const { #ifdef _WIN32 return _fileno(pimpl->fp); @@ -392,6 +347,9 @@ int llama_file::file_id() const { void llama_file::seek(size_t offset, int whence) const { pimpl->seek(offset, whence); } void llama_file::read_raw(void * ptr, size_t len) const { pimpl->read_raw(ptr, len); } +void llama_file::read_aligned_chunk(size_t offset, void * dest, size_t size, size_t alignment) const + { pimpl->read_aligned_chunk(offset, dest, size, alignment); } + uint32_t llama_file::read_u32() const { return pimpl->read_u32(); } diff --git a/src/llama-mmap.h b/src/llama-mmap.h index 985404d0f52..5a9361e4c37 100644 --- a/src/llama-mmap.h +++ b/src/llama-mmap.h @@ -13,10 +13,7 @@ using llama_mmaps = std::vector>; using llama_mlocks = std::vector>; struct llama_file { - llama_file(const char * fname, const char * mode); -#if defined(__linux__) - llama_file(const char * fname, const char * mode, bool uncached_read); -#endif + llama_file(const char * fname, const char * mode, bool use_direct_io = false); ~llama_file(); size_t tell() const; @@ -27,11 +24,13 @@ struct llama_file { void seek(size_t offset, int whence) const; void read_raw(void * ptr, size_t len) const; + void read_aligned_chunk(size_t offset, void * dest, size_t size, size_t alignment) const; uint32_t read_u32() const; void write_raw(const void * ptr, size_t len) const; void write_u32(uint32_t val) const; + bool has_direct_io() const; private: struct impl; std::unique_ptr pimpl; diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index 9faf85a050e..6e47d992bb6 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -503,11 +503,7 @@ llama_model_loader::llama_model_loader( get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false); llm_kv = LLM_KV(llm_arch_from_string(arch_name)); -#if 
defined(__linux__) files.emplace_back(new llama_file(fname.c_str(), "rb", !use_mmap)); -#else - files.emplace_back(new llama_file(fname.c_str(), "rb")); -#endif contexts.emplace_back(ctx); // Save tensors data offset of the main file. @@ -575,11 +571,7 @@ llama_model_loader::llama_model_loader( } } -#if defined(__linux__) - files.emplace_back(new llama_file(fname_split, "rb", !use_mmap)); -#else files.emplace_back(new llama_file(fname_split, "rb")); -#endif contexts.emplace_back(ctx); // Save tensors data offset info of the shard. @@ -941,14 +933,17 @@ bool llama_model_loader::load_all_data( // 4 staging buffers for async uploads, each sized 1MB seems to be a good default for single NVMe drives. // NVMe raid configurations might require more / larger buffers. constexpr size_t n_buffers = 4; -#if defined(__linux__) - constexpr size_t alignment = 4 * 1024; // 4 KiB for Direct I/O + + + bool direct_io = false; + for (const auto& file : files) { + direct_io |= file->has_direct_io(); + } + + constexpr size_t alignment = 4 * 1024; // 4 KB for Direct I/O // Buffer size: balance between memory usage and I/O efficiency // 64MB works well for NVMe drives - constexpr size_t buffer_size = 64 * 1024 * 1024; // 64 MiB -#else - constexpr size_t buffer_size = 1 * 1024 * 1024; // 1MB -#endif + const size_t buffer_size = direct_io ? 64 * 1024 * 1024 + 2 * alignment : 1 * 1024 * 1024; std::vector host_buffers; std::vector events; @@ -997,11 +992,8 @@ bool llama_model_loader::load_all_data( // If the backend is supported, create pinned memory buffers and events for synchronisation. for (size_t idx = 0; idx < n_buffers; ++idx) { -#if defined(__linux__) - auto * buf = ggml_backend_buft_alloc_buffer(host_buft, buffer_size + 2 * alignment); -#else auto * buf = ggml_backend_buft_alloc_buffer(host_buft, buffer_size); -#endif + if (!buf) { LLAMA_LOG_DEBUG("%s: failed to allocate host buffer for async uploads for device %s\n", func, ggml_backend_dev_name(dev)); @@ -1038,35 +1030,6 @@ bool llama_model_loader::load_all_data( ggml_backend_name(upload_backend)); } -#if defined(__linux__) - auto read_aligned_chunk = [](const llama_file * file, - size_t offset, - void * dest, - size_t size, - size_t alignment) { - off_t aligned_offset = offset & ~(alignment - 1); - off_t offset_from_alignment = offset - aligned_offset; - size_t bytes_to_read = (offset_from_alignment + size + alignment - 1) & ~(alignment - 1); - - void * raw_buffer = nullptr; - int ret = posix_memalign(&raw_buffer, alignment, bytes_to_read); - if (ret != 0) { - throw std::runtime_error(format("posix_memalign failed with error %d", ret)); - } - - struct aligned_buffer_deleter { - void operator()(void * p) const { free(p); } - }; - std::unique_ptr buffer(raw_buffer); - - file->seek(aligned_offset, SEEK_SET); - file->read_raw(buffer.get(), bytes_to_read); - - uintptr_t actual_data = reinterpret_cast(buffer.get()) + offset_from_alignment; - memcpy(dest, reinterpret_cast(actual_data), size); - }; -#endif - for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) { const auto * weight = get_weight(ggml_get_name(cur)); if (weight == nullptr) { @@ -1112,100 +1075,97 @@ bool llama_model_loader::load_all_data( } } else { const auto & file = files.at(weight->idx); -#if defined(__linux__) - auto offset = (off_t) weight->offs; - off_t aligned_offset = offset & ~(alignment - 1); - off_t offset_from_alignment = offset - aligned_offset; -#endif + if (ggml_backend_buffer_is_host(cur->buffer)) { -#if defined(__linux__) - 
read_aligned_chunk(file.get(), weight->offs, cur->data, n_size, alignment); -#else - file->seek(weight->offs, SEEK_SET); - file->read_raw(cur->data, n_size); -#endif + if (file->has_direct_io()) { + file->read_aligned_chunk(weight->offs, cur->data, n_size, alignment); + } else { + file->seek(weight->offs, SEEK_SET); + file->read_raw(cur->data, n_size); + } if (check_tensors) { validation_result.emplace_back(std::async(std::launch::async, [cur, n_size] { return std::make_pair(cur, ggml_validate_row_data(cur->type, cur->data, n_size)); })); } } else { + file->seek(weight->offs, SEEK_SET); // If upload_backend is valid load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU. if (upload_backend) { -#if defined(__linux__) - // Calculate aligned read boundaries - size_t read_start = aligned_offset; - size_t read_end = (offset + n_size + alignment - 1) & ~(alignment - 1); + if (file->has_direct_io()) { + auto offset = (off_t) weight->offs; + off_t aligned_offset = offset & ~(alignment - 1); + off_t offset_from_alignment = offset - aligned_offset; - size_t bytes_read = 0; - size_t data_read = 0; // Actual tensor data copied (excluding padding) + // Calculate aligned read boundaries + size_t read_start = aligned_offset; + size_t read_end = (offset + n_size + alignment - 1) & ~(alignment - 1); - file->seek(aligned_offset, SEEK_SET); + size_t bytes_read = 0; + size_t data_read = 0; // Actual tensor data copied (excluding padding) - while (bytes_read < read_end - read_start) { - size_t read_size = std::min(buffer_size, read_end - read_start - bytes_read); + while (bytes_read < read_end - read_start) { + size_t read_size = std::min(buffer_size, read_end - read_start - bytes_read); - // Align the destination pointer within the pinned buffer - uintptr_t ptr_dest_aligned = (reinterpret_cast(host_ptrs[buffer_idx]) + alignment - 1) & ~(alignment - 1); + // Align the destination pointer within the pinned buffer + uintptr_t ptr_dest_aligned = (reinterpret_cast(host_ptrs[buffer_idx]) + alignment - 1) & ~(alignment - 1); - // Wait for previous upload to complete before reusing buffer - ggml_backend_event_synchronize(events[buffer_idx]); + // Wait for previous upload to complete before reusing buffer + ggml_backend_event_synchronize(events[buffer_idx]); - // Read aligned chunk from file - file->read_raw(reinterpret_cast(ptr_dest_aligned), read_size); + // Read aligned chunk from file + file->read_raw(reinterpret_cast(ptr_dest_aligned), read_size); - // Calculate actual data portion (excluding alignment padding) - uintptr_t ptr_data = ptr_dest_aligned; - size_t data_to_copy = read_size; + // Calculate actual data portion (excluding alignment padding) + uintptr_t ptr_data = ptr_dest_aligned; + size_t data_to_copy = read_size; - // Skip alignment padding at start of first chunk - if (bytes_read == 0) { - ptr_data += offset_from_alignment; - data_to_copy -= offset_from_alignment; - } + // Skip alignment padding at start of first chunk + if (bytes_read == 0) { + ptr_data += offset_from_alignment; + data_to_copy -= offset_from_alignment; + } - // Trim alignment padding at end of last chunk - if (aligned_offset + bytes_read + read_size > offset + n_size) { - data_to_copy -= (read_end - (offset + n_size)); - } + // Trim alignment padding at end of last chunk + if (aligned_offset + bytes_read + read_size > offset + n_size) { + data_to_copy -= (read_end - (offset + n_size)); + } - // Async upload actual data to GPU - ggml_backend_tensor_set_async(upload_backend, cur, - 
reinterpret_cast(ptr_data), data_read, data_to_copy); - ggml_backend_event_record(events[buffer_idx], upload_backend); + // Async upload actual data to GPU + ggml_backend_tensor_set_async(upload_backend, cur, + reinterpret_cast(ptr_data), data_read, data_to_copy); + ggml_backend_event_record(events[buffer_idx], upload_backend); - data_read += data_to_copy; - bytes_read += read_size; + data_read += data_to_copy; + bytes_read += read_size; - ++buffer_idx; - buffer_idx %= n_buffers; - } -#else - file->seek(weight->offs, SEEK_SET); - - size_t bytes_read = 0; + ++buffer_idx; + buffer_idx %= n_buffers; + } + } else { + size_t bytes_read = 0; - while (bytes_read < n_size) { - size_t read_iteration = std::min(buffer_size, n_size - bytes_read); + while (bytes_read < n_size) { + size_t read_iteration = std::min(buffer_size, n_size - bytes_read); - ggml_backend_event_synchronize(events[buffer_idx]); - file->read_raw(host_ptrs[buffer_idx], read_iteration); - ggml_backend_tensor_set_async(upload_backend, cur, host_ptrs[buffer_idx], bytes_read, read_iteration); - ggml_backend_event_record(events[buffer_idx], upload_backend); + ggml_backend_event_synchronize(events[buffer_idx]); + file->read_raw(host_ptrs[buffer_idx], read_iteration); + ggml_backend_tensor_set_async(upload_backend, cur, host_ptrs[buffer_idx], bytes_read, read_iteration); + ggml_backend_event_record(events[buffer_idx], upload_backend); - bytes_read += read_iteration; - ++buffer_idx; - buffer_idx %= n_buffers; + bytes_read += read_iteration; + ++buffer_idx; + buffer_idx %= n_buffers; + } } -#endif } else { read_buf.resize(n_size); -#if defined(__linux__) - read_aligned_chunk(file.get(), weight->offs, read_buf.data(), n_size, alignment); -#else - file->seek(weight->offs, SEEK_SET); - file->read_raw(read_buf.data(), n_size); -#endif + if (file->has_direct_io()) { + file->read_aligned_chunk(weight->offs, read_buf.data(), n_size, alignment); + } else { + file->seek(weight->offs, SEEK_SET); + file->read_raw(read_buf.data(), n_size); + } ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size); if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) { throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur))); From 0879d22196c66825c438e3136cc0c9a60188c7c6 Mon Sep 17 00:00:00 2001 From: JTischbein Date: Tue, 16 Dec 2025 20:00:46 +0100 Subject: [PATCH 06/10] Adding maybe unused keyword for Mac and Windows. 
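
The use_direct_io parameter is only consumed on Linux, so marking it [[maybe_unused]] keeps the Windows and macOS builds of these constructors free of unused-parameter warnings.
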
--- src/llama-mmap.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/llama-mmap.cpp b/src/llama-mmap.cpp index 044d9aa902a..d6904714a9d 100644 --- a/src/llama-mmap.cpp +++ b/src/llama-mmap.cpp @@ -75,7 +75,7 @@ struct llama_file::impl { return ret; } - impl(const char * fname, const char * mode, const bool use_direct_io = false) { + impl(const char * fname, const char * mode, [[maybe_unused]] const bool use_direct_io = false) { fp = ggml_fopen(fname, mode); if (fp == NULL) { throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno))); @@ -168,7 +168,7 @@ struct llama_file::impl { } } #else - impl(const char * fname, const char * mode, const bool use_direct_io = false) { + impl(const char * fname, const char * mode, [[maybe_unused]] const bool use_direct_io = false) { #ifdef __linux__ // Try unbuffered I/O for read only if (use_direct_io && std::strcmp(mode, "rb") == 0) { From fff1157a6c0c4d16da79815b9e4f54028c81307e Mon Sep 17 00:00:00 2001 From: JTischbein Date: Wed, 17 Dec 2025 08:20:33 +0100 Subject: [PATCH 07/10] File seek aligned --- src/llama-model-loader.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index 6e47d992bb6..1aa0dae0f59 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -1089,13 +1089,13 @@ bool llama_model_loader::load_all_data( })); } } else { - file->seek(weight->offs, SEEK_SET); // If upload_backend is valid load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU. if (upload_backend) { if (file->has_direct_io()) { auto offset = (off_t) weight->offs; off_t aligned_offset = offset & ~(alignment - 1); off_t offset_from_alignment = offset - aligned_offset; + file->seek(aligned_offset, SEEK_SET); // Calculate aligned read boundaries size_t read_start = aligned_offset; From d73ff6a9c59e80fc5d7637f16efe72537650e80d Mon Sep 17 00:00:00 2001 From: JTischbein Date: Wed, 17 Dec 2025 13:21:09 +0100 Subject: [PATCH 08/10] Removing all branches for direct_io in llama-model-loader.cpp --- src/llama-mmap.cpp | 34 ++++++----- src/llama-mmap.h | 6 +- src/llama-model-loader.cpp | 113 ++++++++++++++----------------------- 3 files changed, 66 insertions(+), 87 deletions(-) diff --git a/src/llama-mmap.cpp b/src/llama-mmap.cpp index d6904714a9d..23b648a2e3b 100644 --- a/src/llama-mmap.cpp +++ b/src/llama-mmap.cpp @@ -154,11 +154,7 @@ struct llama_file::impl { write_raw(&val, sizeof(val)); } - bool has_direct_io() const { - return false; - } - - void read_aligned_chunk(size_t offset, void * dest, size_t size, size_t alignment) const { + void read_aligned_chunk(size_t offset, void * dest, size_t size) const { throw std::runtime_error("DirectIO is not implemented on Windows."); } @@ -179,6 +175,7 @@ struct llama_file::impl { fstat(fd, &file_stats); size = file_stats.st_size; + alignment = file_stats.st_blksize; off_t ret = lseek(fd, 0, SEEK_SET); if (ret == -1) { @@ -262,7 +259,7 @@ struct llama_file::impl { } } - void read_aligned_chunk(size_t offset, void * dest, size_t size, size_t alignment) const { + void read_aligned_chunk(size_t offset, void * dest, size_t size) const { off_t aligned_offset = offset & ~(alignment - 1); off_t offset_from_alignment = offset - aligned_offset; size_t bytes_to_read = (offset_from_alignment + size + alignment - 1) & ~(alignment - 1); @@ -306,10 +303,6 @@ struct llama_file::impl { write_raw(&val, sizeof(val)); } - bool has_direct_io() const { - return fd != -1; - } - 
~impl() { if (fd != -1) { close(fd); @@ -320,6 +313,21 @@ struct llama_file::impl { int fd = -1; #endif + void read_raw_at(void * ptr, size_t len, size_t offset) const { + if (alignment != 1) { + read_aligned_chunk(offset, ptr, len); + } else { + seek(offset, SEEK_SET); + read_raw(ptr, len); + } + } + + size_t read_alignment() const { + return alignment; + } + + size_t alignment = 1; + FILE * fp{}; size_t size{}; }; @@ -331,7 +339,7 @@ llama_file::~llama_file() = default; size_t llama_file::tell() const { return pimpl->tell(); } size_t llama_file::size() const { return pimpl->size; } -bool llama_file::has_direct_io() const { return pimpl->has_direct_io(); } +size_t llama_file::read_alignment() const { return pimpl->read_alignment(); } int llama_file::file_id() const { #ifdef _WIN32 @@ -347,9 +355,7 @@ int llama_file::file_id() const { void llama_file::seek(size_t offset, int whence) const { pimpl->seek(offset, whence); } void llama_file::read_raw(void * ptr, size_t len) const { pimpl->read_raw(ptr, len); } -void llama_file::read_aligned_chunk(size_t offset, void * dest, size_t size, size_t alignment) const - { pimpl->read_aligned_chunk(offset, dest, size, alignment); } - +void llama_file::read_raw_at(void * ptr, size_t len, size_t offset) const { pimpl->read_raw_at(ptr, len, offset); } uint32_t llama_file::read_u32() const { return pimpl->read_u32(); } diff --git a/src/llama-mmap.h b/src/llama-mmap.h index 5a9361e4c37..729aac164b8 100644 --- a/src/llama-mmap.h +++ b/src/llama-mmap.h @@ -3,6 +3,7 @@ #include #include #include +#include struct llama_file; struct llama_mmap; @@ -24,13 +25,14 @@ struct llama_file { void seek(size_t offset, int whence) const; void read_raw(void * ptr, size_t len) const; - void read_aligned_chunk(size_t offset, void * dest, size_t size, size_t alignment) const; + void read_raw_at(void * ptr, size_t len, size_t offset) const; + void read_aligned_chunk(size_t offset, void * dest, size_t size) const; uint32_t read_u32() const; void write_raw(const void * ptr, size_t len) const; void write_u32(uint32_t val) const; - bool has_direct_io() const; + size_t read_alignment() const; private: struct impl; std::unique_ptr pimpl; diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index 1aa0dae0f59..1c5b1153ba1 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -934,16 +934,14 @@ bool llama_model_loader::load_all_data( // NVMe raid configurations might require more / larger buffers. constexpr size_t n_buffers = 4; - - bool direct_io = false; - for (const auto& file : files) { - direct_io |= file->has_direct_io(); + size_t alignment = 1; + for (const auto & file : files) { + alignment = std::max(file->read_alignment(), alignment); } - constexpr size_t alignment = 4 * 1024; // 4 KB for Direct I/O // Buffer size: balance between memory usage and I/O efficiency // 64MB works well for NVMe drives - const size_t buffer_size = direct_io ? 64 * 1024 * 1024 + 2 * alignment : 1 * 1024 * 1024; + const size_t buffer_size = alignment != 1 ? 
64 * 1024 * 1024 + 2 * alignment : 1 * 1024 * 1024; std::vector host_buffers; std::vector events; @@ -1077,12 +1075,7 @@ bool llama_model_loader::load_all_data( const auto & file = files.at(weight->idx); if (ggml_backend_buffer_is_host(cur->buffer)) { - if (file->has_direct_io()) { - file->read_aligned_chunk(weight->offs, cur->data, n_size, alignment); - } else { - file->seek(weight->offs, SEEK_SET); - file->read_raw(cur->data, n_size); - } + file->read_raw_at(cur->data, n_size, weight->offs); if (check_tensors) { validation_result.emplace_back(std::async(std::launch::async, [cur, n_size] { return std::make_pair(cur, ggml_validate_row_data(cur->type, cur->data, n_size)); @@ -1091,81 +1084,59 @@ bool llama_model_loader::load_all_data( } else { // If upload_backend is valid load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU. if (upload_backend) { - if (file->has_direct_io()) { - auto offset = (off_t) weight->offs; - off_t aligned_offset = offset & ~(alignment - 1); - off_t offset_from_alignment = offset - aligned_offset; - file->seek(aligned_offset, SEEK_SET); - - // Calculate aligned read boundaries - size_t read_start = aligned_offset; - size_t read_end = (offset + n_size + alignment - 1) & ~(alignment - 1); - - size_t bytes_read = 0; - size_t data_read = 0; // Actual tensor data copied (excluding padding) + auto offset = (off_t) weight->offs; + off_t aligned_offset = offset & ~(alignment - 1); + off_t offset_from_alignment = offset - aligned_offset; + file->seek(aligned_offset, SEEK_SET); - while (bytes_read < read_end - read_start) { - size_t read_size = std::min(buffer_size, read_end - read_start - bytes_read); + // Calculate aligned read boundaries + size_t read_start = aligned_offset; + size_t read_end = (offset + n_size + alignment - 1) & ~(alignment - 1); - // Align the destination pointer within the pinned buffer - uintptr_t ptr_dest_aligned = (reinterpret_cast(host_ptrs[buffer_idx]) + alignment - 1) & ~(alignment - 1); + size_t bytes_read = 0; + size_t data_read = 0; // Actual tensor data copied (excluding padding) - // Wait for previous upload to complete before reusing buffer - ggml_backend_event_synchronize(events[buffer_idx]); + while (bytes_read < read_end - read_start) { + size_t read_size = std::min(buffer_size, read_end - read_start - bytes_read); - // Read aligned chunk from file - file->read_raw(reinterpret_cast(ptr_dest_aligned), read_size); + // Align the destination pointer within the pinned buffer + uintptr_t ptr_dest_aligned = (reinterpret_cast(host_ptrs[buffer_idx]) + alignment - 1) & ~(alignment - 1); - // Calculate actual data portion (excluding alignment padding) - uintptr_t ptr_data = ptr_dest_aligned; - size_t data_to_copy = read_size; + // Wait for previous upload to complete before reusing buffer + ggml_backend_event_synchronize(events[buffer_idx]); - // Skip alignment padding at start of first chunk - if (bytes_read == 0) { - ptr_data += offset_from_alignment; - data_to_copy -= offset_from_alignment; - } + // Read aligned chunk from file + file->read_raw(reinterpret_cast(ptr_dest_aligned), read_size); - // Trim alignment padding at end of last chunk - if (aligned_offset + bytes_read + read_size > offset + n_size) { - data_to_copy -= (read_end - (offset + n_size)); - } + // Calculate actual data portion (excluding alignment padding) + uintptr_t ptr_data = ptr_dest_aligned; + size_t data_to_copy = read_size; - // Async upload actual data to GPU - ggml_backend_tensor_set_async(upload_backend, cur, - 
reinterpret_cast(ptr_data), data_read, data_to_copy); - ggml_backend_event_record(events[buffer_idx], upload_backend); - - data_read += data_to_copy; - bytes_read += read_size; + // Skip alignment padding at start of first chunk + if (bytes_read == 0) { + ptr_data += offset_from_alignment; + data_to_copy -= offset_from_alignment; + } - ++buffer_idx; - buffer_idx %= n_buffers; + // Trim alignment padding at end of last chunk + if (aligned_offset + bytes_read + read_size > offset + n_size) { + data_to_copy -= (read_end - (offset + n_size)); } - } else { - size_t bytes_read = 0; - while (bytes_read < n_size) { - size_t read_iteration = std::min(buffer_size, n_size - bytes_read); + // Async upload actual data to GPU + ggml_backend_tensor_set_async(upload_backend, cur, + reinterpret_cast(ptr_data), data_read, data_to_copy); + ggml_backend_event_record(events[buffer_idx], upload_backend); - ggml_backend_event_synchronize(events[buffer_idx]); - file->read_raw(host_ptrs[buffer_idx], read_iteration); - ggml_backend_tensor_set_async(upload_backend, cur, host_ptrs[buffer_idx], bytes_read, read_iteration); - ggml_backend_event_record(events[buffer_idx], upload_backend); + data_read += data_to_copy; + bytes_read += read_size; - bytes_read += read_iteration; - ++buffer_idx; - buffer_idx %= n_buffers; - } + ++buffer_idx; + buffer_idx %= n_buffers; } } else { read_buf.resize(n_size); - if (file->has_direct_io()) { - file->read_aligned_chunk(weight->offs, read_buf.data(), n_size, alignment); - } else { - file->seek(weight->offs, SEEK_SET); - file->read_raw(read_buf.data(), n_size); - } + file->read_raw_at(read_buf.data(), n_size, weight->offs); ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size); if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) { throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur))); From 99fde7260d46a229dd1567c3ac3d40b2b74902b8 Mon Sep 17 00:00:00 2001 From: JTischbein Date: Wed, 17 Dec 2025 13:32:25 +0100 Subject: [PATCH 09/10] Always use alignment from llama_file --- src/llama-model-loader.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index 1c5b1153ba1..64eddf221a2 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -1085,6 +1085,7 @@ bool llama_model_loader::load_all_data( // If upload_backend is valid load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU. 
if (upload_backend) { auto offset = (off_t) weight->offs; + alignment = file->read_alignment(); off_t aligned_offset = offset & ~(alignment - 1); off_t offset_from_alignment = offset - aligned_offset; file->seek(aligned_offset, SEEK_SET); From 921d7c98089a42438ebd391fb0a76620bd161f67 Mon Sep 17 00:00:00 2001 From: JTischbein Date: Wed, 17 Dec 2025 15:49:33 +0100 Subject: [PATCH 10/10] use_mmap=true --- common/common.h | 2 +- src/llama-model-loader.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/common/common.h b/common/common.h index 3cd531f6867..2fd83f0cf9c 100644 --- a/common/common.h +++ b/common/common.h @@ -413,7 +413,7 @@ struct common_params { bool kv_unified = false; // enable unified KV cache bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix - bool use_mmap = false; // use uncached reads for faster loads + bool use_mmap = true; // use mmap for faster loads bool use_mlock = false; // use mlock to keep model in memory bool verbose_prompt = false; // print prompt tokens before generation bool display_prompt = true; // print prompt before generation diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index 64eddf221a2..c50ca831e01 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -571,7 +571,7 @@ llama_model_loader::llama_model_loader( } } - files.emplace_back(new llama_file(fname_split, "rb")); + files.emplace_back(new llama_file(fname_split, "rb", !use_mmap)); contexts.emplace_back(ctx); // Save tensors data offset info of the shard.
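
For reference, the aligned-read pattern that read_aligned_chunk and the pinned-buffer upload loop implement works as follows: O_DIRECT requires the file offset, the transfer size, and the destination buffer address to all be multiples of the filesystem block size, so the requested window [offset, offset + size) is rounded outward to block boundaries, read into a block-aligned staging buffer, and only the requested bytes are copied out past the head padding. Below is a minimal standalone sketch of that pattern; it is not part of the patch, and read_direct_chunk, free_deleter, and the command-line handling in main are placeholder names chosen for illustration.

// Standalone sketch of the O_DIRECT aligned-read pattern used by read_aligned_chunk
// in llama-mmap.cpp. Illustrative only; not part of the patch.
#ifndef _GNU_SOURCE
#define _GNU_SOURCE            // O_DIRECT is a GNU extension
#endif

#include <algorithm>
#include <cerrno>
#include <cstdlib>
#include <cstring>
#include <memory>
#include <stdexcept>
#include <string>
#include <vector>

#include <fcntl.h>
#include <sys/stat.h>
#include <unistd.h>

struct free_deleter { void operator()(void * p) const { std::free(p); } };

// Read `size` bytes starting at `offset` into `dest`, honoring the O_DIRECT
// constraint that file offset, transfer size, and buffer address are all
// multiples of `alignment`.
static void read_direct_chunk(int fd, size_t offset, void * dest, size_t size, size_t alignment) {
    // Round the requested window [offset, offset + size) outward to block boundaries.
    size_t aligned_offset = offset & ~(alignment - 1);
    size_t head_padding   = offset - aligned_offset;
    size_t bytes_to_read  = (head_padding + size + alignment - 1) & ~(alignment - 1);

    void * raw = nullptr;
    if (posix_memalign(&raw, alignment, bytes_to_read) != 0) {
        throw std::runtime_error("posix_memalign failed");
    }
    std::unique_ptr<void, free_deleter> staging(raw);

    size_t done = 0;
    while (done < bytes_to_read) {
        ssize_t n = pread(fd, static_cast<char *>(staging.get()) + done,
                          bytes_to_read - done, (off_t) (aligned_offset + done));
        if (n < 0) {
            if (errno == EINTR) { continue; }   // interrupted by signal, retry
            throw std::runtime_error(std::string("pread failed: ") + std::strerror(errno));
        }
        if (n == 0) { break; }                  // EOF: the last block may be short
        done += (size_t) n;
    }
    if (done < head_padding + size) {
        throw std::runtime_error("unexpectedly reached end of file");
    }

    // Copy only the bytes the caller asked for, skipping the head padding.
    std::memcpy(dest, static_cast<char *>(staging.get()) + head_padding, size);
}

int main(int argc, char ** argv) {
    if (argc < 2) { return 1; }

    int fd = open(argv[1], O_RDONLY | O_DIRECT);
    if (fd == -1) {
        fd = open(argv[1], O_RDONLY);           // fall back when the filesystem rejects O_DIRECT
    }
    if (fd == -1) { return 1; }

    struct stat st{};
    fstat(fd, &st);
    size_t alignment = (size_t) st.st_blksize;  // same alignment source as patch 08

    if ((size_t) st.st_size > 3) {
        // Deliberately unaligned request: up to 100 bytes starting at offset 3.
        std::vector<char> buf(std::min<size_t>(100, (size_t) st.st_size - 3));
        read_direct_chunk(fd, 3, buf.data(), buf.size(), alignment);
    }
    close(fd);
    return 0;
}

The same rounding explains the staging buffer size in the loader: the extra 2 * alignment bytes on top of the 64 MiB chunk leave room both for aligning the pinned destination pointer up to the next block boundary and for the read window being rounded up past the end of the tensor data.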