From 8a7a4e3f2fb80a7168f3a8bf75c08891fb5384b3 Mon Sep 17 00:00:00 2001
From: JTischbein
Date: Thu, 18 Dec 2025 12:44:57 +0100
Subject: [PATCH 1/4] Adding --direct-io flag for model loading

---
 common/arg.cpp                       |  8 ++++++++
 common/common.cpp                    |  1 +
 common/common.h                      |  3 ++-
 examples/diffusion/diffusion-cli.cpp |  1 +
 include/llama.h                      |  1 +
 src/llama-mmap.cpp                   |  9 +++++++++
 src/llama-mmap.h                     |  1 +
 src/llama-model-loader.cpp           | 10 ++++++++--
 src/llama-model-loader.h             |  1 +
 src/llama-model.cpp                  |  1 +
 src/llama-quant.cpp                  |  2 +-
 src/llama.cpp                        |  2 +-
 12 files changed, 35 insertions(+), 5 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index b6d16168ebc..b0f710e343d 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -2027,6 +2027,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.use_mmap = value;
         }
     ).set_env("LLAMA_ARG_MMAP"));
+    add_opt(common_arg(
+        {"-dio", "--direct-io"},
+        {"-ndio", "--no-direct-io"},
+        string_format("use direct I/O if available; takes precedence over --mmap (default: %s)", params.use_direct_io ? "enabled" : "disabled"),
+        [](common_params & params, bool value) {
+            params.use_direct_io = value;
+        }
+    ).set_env("LLAMA_ARG_DIO"));
     add_opt(common_arg(
         {"--numa"}, "TYPE",
         "attempt optimizations that help on some NUMA systems\n"
diff --git a/common/common.cpp b/common/common.cpp
index d4e8c7405eb..74afd93f57a 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1347,6 +1347,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
     mparams.split_mode      = params.split_mode;
     mparams.tensor_split    = params.tensor_split;
     mparams.use_mmap        = params.use_mmap;
+    mparams.use_direct_io   = params.use_direct_io;
     mparams.use_mlock       = params.use_mlock;
     mparams.check_tensors   = params.check_tensors;
     mparams.use_extra_bufts = !params.no_extra_bufts;
diff --git a/common/common.h b/common/common.h
index 3e314f4c802..d1f091a28ad 100644
--- a/common/common.h
+++ b/common/common.h
@@ -420,7 +420,8 @@ struct common_params {
     bool kv_unified        = false; // enable unified KV cache
 
     bool input_prefix_bos  = false; // prefix BOS to user inputs, preceding input_prefix
-    bool use_mmap          = true;  // use mmap for faster loads
+    bool use_mmap          = true;  // enable mmap to use filesystem cache
+    bool use_direct_io     = true;  // read from disk without buffering for faster model loading
     bool use_mlock         = false; // use mlock to keep model in memory
     bool verbose_prompt    = false; // print prompt tokens before generation
     bool display_prompt    = true;  // print prompt before generation
diff --git a/examples/diffusion/diffusion-cli.cpp b/examples/diffusion/diffusion-cli.cpp
index 273942a165e..d50f754092d 100644
--- a/examples/diffusion/diffusion-cli.cpp
+++ b/examples/diffusion/diffusion-cli.cpp
@@ -553,6 +553,7 @@ int main(int argc, char ** argv) {
     model_params.n_gpu_layers  = params.n_gpu_layers;
     model_params.devices       = params.devices.data();
     model_params.use_mmap      = params.use_mmap;
+    model_params.use_direct_io = params.use_direct_io;
     model_params.use_mlock     = params.use_mlock;
     model_params.check_tensors = params.check_tensors;
 
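
The hunks above only thread the new option through the CLI and the common parameter structs; a program that embeds llama.cpp sets the corresponding field on llama_model_params directly. A minimal sketch of that usage — the only assumption beyond the existing public API is the use_direct_io field added to llama.h below:

    #include "llama.h"

    int main(int argc, char ** argv) {
        if (argc < 2) {
            return 1;
        }
        llama_model_params mparams = llama_model_default_params();
        mparams.use_mmap      = true;  // still honoured when direct I/O is unavailable
        mparams.use_direct_io = true;  // takes precedence over use_mmap when supported
        llama_model * model = llama_model_load_from_file(argv[1], mparams);
        if (model == nullptr) {
            return 1;
        }
        llama_model_free(model);
        return 0;
    }
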
diff --git a/include/llama.h b/include/llama.h
index f8629300991..a88ddb98e63 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -309,6 +309,7 @@ extern "C" {
         // Keep the booleans together to avoid misalignment during copy-by-value.
         bool vocab_only;      // only load the vocabulary, no weights
         bool use_mmap;        // use mmap if possible
+        bool use_direct_io;   // use direct I/O, takes precedence over use_mmap
         bool use_mlock;       // force system to keep model in RAM
         bool check_tensors;   // validate model tensor data
         bool use_extra_bufts; // use extra buffer types (used for weight repacking)
diff --git a/src/llama-mmap.cpp b/src/llama-mmap.cpp
index 23b648a2e3b..d6637de4677 100644
--- a/src/llama-mmap.cpp
+++ b/src/llama-mmap.cpp
@@ -158,6 +158,10 @@ struct llama_file::impl {
         throw std::runtime_error("DirectIO is not implemented on Windows.");
     }
 
+    bool has_direct_io() const {
+        return true;
+    }
+
     ~impl() {
         if (fp) {
             std::fclose(fp);
@@ -303,6 +307,10 @@ struct llama_file::impl {
         write_raw(&val, sizeof(val));
     }
 
+    bool has_direct_io() const {
+        return alignment != 1;
+    }
+
     ~impl() {
         if (fd != -1) {
             close(fd);
@@ -340,6 +348,7 @@ size_t llama_file::tell() const { return pimpl->tell(); }
 size_t llama_file::size() const { return pimpl->size; }
 
 size_t llama_file::read_alignment() const { return pimpl->read_alignment(); }
+bool llama_file::has_direct_io() const { return pimpl->has_direct_io(); }
 
 int llama_file::file_id() const {
 #ifdef _WIN32
diff --git a/src/llama-mmap.h b/src/llama-mmap.h
index 729aac164b8..cf4ca4f1a76 100644
--- a/src/llama-mmap.h
+++ b/src/llama-mmap.h
@@ -33,6 +33,7 @@ struct llama_file {
     void write_u32(uint32_t val) const;
     size_t read_alignment() const;
+    bool has_direct_io() const;
 
 private:
     struct impl;
     std::unique_ptr<impl> pimpl;
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
index 1da89515f7b..1355eea9516 100644
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
@@ -472,6 +472,7 @@ llama_model_loader::llama_model_loader(
     const std::string & fname,
     std::vector<std::string> & splits,
     bool use_mmap,
+    bool use_direct_io,
     bool check_tensors,
     bool no_alloc,
     const llama_model_kv_override * param_overrides_p,
@@ -504,9 +505,14 @@ llama_model_loader::llama_model_loader(
     get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
     llm_kv = LLM_KV(llm_arch_from_string(arch_name));
 
-    files.emplace_back(new llama_file(fname.c_str(), "rb", !use_mmap));
+    files.emplace_back(new llama_file(fname.c_str(), "rb", use_direct_io));
     contexts.emplace_back(ctx);
 
+    // Disable mmap if direct I/O is enabled and available
+    if (use_direct_io && files.at(0)->has_direct_io()) {
+        use_mmap = false;
+    }
+
     // Save tensors data offset of the main file.
     // For subsidiary files, `meta` tensor data offset must not be used,
     // so we build a unified tensors index for weights.
@@ -572,7 +578,7 @@ llama_model_loader::llama_model_loader(
             }
         }
 
-        files.emplace_back(new llama_file(fname_split, "rb", !use_mmap));
+        files.emplace_back(new llama_file(fname_split, "rb", use_direct_io));
         contexts.emplace_back(ctx);
 
         // Save tensors data offset info of the shard.
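
For context on the has_direct_io() checks above: direct I/O (O_DIRECT on Linux) bypasses the page cache, the open can fail on unsupported filesystems, and every read must then use aligned offsets, lengths and buffer addresses — which is why the loader only trusts direct I/O when the file reports it, and turns mmap off in that case. A generic, Linux-flavoured sketch of that detection (illustrative only, not the llama_file implementation):

    // Illustrative only: try to open with O_DIRECT, otherwise report that the
    // caller should fall back to buffered I/O (mmap or stdio).
    #include <fcntl.h>
    #include <unistd.h>

    struct direct_fd {
        int fd = -1;

        explicit direct_fd(const char * path) {
    #ifdef O_DIRECT
            fd = open(path, O_RDONLY | O_DIRECT); // may fail: unsupported filesystem, etc.
    #else
            (void) path;                          // no O_DIRECT on this platform (e.g. Windows, macOS)
    #endif
        }

        bool has_direct_io() const { return fd != -1; }

        ~direct_fd() {
            if (fd != -1) {
                close(fd);
            }
        }
    };
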
diff --git a/src/llama-model-loader.h b/src/llama-model-loader.h
index 0380c92fde0..de06b528312 100644
--- a/src/llama-model-loader.h
+++ b/src/llama-model-loader.h
@@ -97,6 +97,7 @@ struct llama_model_loader {
         const std::string & fname,
         std::vector<std::string> & splits, // optional, only need if the split does not follow naming scheme
         bool use_mmap,
+        bool use_direct_io,
         bool check_tensors,
         bool no_alloc,
         const llama_model_kv_override * param_overrides_p,
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index d2270e8f2da..cf0c3947535 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -7671,6 +7671,7 @@ llama_model_params llama_model_default_params() {
         /*.kv_overrides    =*/ nullptr,
         /*.vocab_only      =*/ false,
         /*.use_mmap        =*/ true,
+        /*.use_direct_io   =*/ true,
         /*.use_mlock       =*/ false,
         /*.check_tensors   =*/ false,
         /*.use_extra_bufts =*/ true,
diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index bc4b05c3b50..048d65a75c2 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -596,7 +596,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     }
 
     std::vector<std::string> splits = {};
-    llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
+    llama_model_loader ml(fname_inp, splits, use_mmap, /*use_direct_io*/ true, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
     ml.init_mappings(false); // no prefetching
 
     llama_model model(llama_model_default_params());
diff --git a/src/llama.cpp b/src/llama.cpp
index 1e18637e361..721e6e8d1c6 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -771,7 +771,7 @@ static int llama_model_load(const std::string & fname, std::vector
     model.t_start_us = tm.t_start_us;
 
     try {
-        llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides);
+        llama_model_loader ml(fname, splits, params.use_mmap, params.use_direct_io, params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides);
 
         ml.print_info();
 
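
The follow-up patches below rework how reads are funnelled through read_aligned_chunk(), which widens every request to the alignment reported by read_alignment() and then copies out only the requested slice. A small worked example of that arithmetic, with an assumed 4096-byte alignment and made-up offsets:

    #include <cstddef>
    #include <cstdio>

    int main() {
        const size_t alignment = 4096;   // e.g. from llama_file::read_alignment()
        const size_t offset    = 10000;  // requested file offset
        const size_t size      = 5000;   // requested byte count

        const size_t aligned_offset        = offset & ~(alignment - 1);  // 8192 (round offset down)
        const size_t offset_from_alignment = offset - aligned_offset;    // 1808 (padding to skip)
        const size_t bytes_to_read         = (offset_from_alignment + size + alignment - 1)
                                             & ~(alignment - 1);         // 8192 (two full 4 KiB blocks)

        // A direct read fetches bytes_to_read at aligned_offset, then copies `size`
        // bytes starting at offset_from_alignment into the caller's buffer.
        std::printf("%zu %zu %zu\n", aligned_offset, offset_from_alignment, bytes_to_read);
        return 0;
    }
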
From 0e2b356d1f01de0519410f4fb250eb06d25b976b Mon Sep 17 00:00:00 2001
From: jtischbein
Date: Thu, 18 Dec 2025 19:15:36 +0000
Subject: [PATCH 2/4] Fixing read_raw() calls

---
 src/llama-mmap.cpp         | 11 ++++++-----
 src/llama-model-loader.cpp |  3 +--
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/llama-mmap.cpp b/src/llama-mmap.cpp
index d6637de4677..ebd49e5cb31 100644
--- a/src/llama-mmap.cpp
+++ b/src/llama-mmap.cpp
@@ -202,7 +202,7 @@ struct llama_file::impl {
     }
 
     size_t tell() const {
-        if (fd == -1) {
+        if (!has_direct_io()) {
             long ret = std::ftell(fp);
             if (ret == -1) {
                 throw std::runtime_error(format("ftell error: %s", strerror(errno)));
@@ -220,7 +220,7 @@ struct llama_file::impl {
 
     void seek(size_t offset, int whence) const {
         off_t ret = 0;
-        if (fd == -1) {
+        if (!has_direct_io()) {
             ret = std::fseek(fp, (long) offset, whence);
         } else {
             ret = lseek(fd, offset, whence);
@@ -245,6 +245,7 @@ struct llama_file::impl {
             }
         } else {
             bool successful = false;
+            GGML_ASSERT(len % alignment == 0 && (uintptr_t) ptr % alignment == 0);
 
             while (!successful) {
                 off_t ret = read(fd, ptr, len);
@@ -288,7 +289,7 @@ struct llama_file::impl {
 
     uint32_t read_u32() const {
         uint32_t ret;
-        read_raw(&ret, sizeof(ret));
+        read_raw_at(&ret, sizeof(ret), tell());
         return ret;
     }
 
@@ -308,7 +309,7 @@ struct llama_file::impl {
     }
 
     bool has_direct_io() const {
-        return alignment != 1;
+        return fd != -1;
     }
 
     ~impl() {
@@ -322,7 +323,7 @@ struct llama_file::impl {
     int fd = -1;
 #endif
 
     void read_raw_at(void * ptr, size_t len, size_t offset) const {
-        if (alignment != 1) {
+        if (has_direct_io()) {
             read_aligned_chunk(offset, ptr, len);
         } else {
             seek(offset, SEEK_SET);
             read_raw(ptr, len);
         }
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
index 1355eea9516..2db2115a081 100644
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
@@ -918,8 +918,7 @@ void llama_model_loader::load_data_for(struct ggml_tensor * cur) const {
         GGML_ASSERT(cur->data != nullptr);
         GGML_ASSERT(w.idx < files.size());
         const auto & file = files.at(w.idx);
-        file->seek(w.offs, SEEK_SET);
-        file->read_raw(cur->data, ggml_nbytes(cur));
+        file->read_raw_at(cur->data, ggml_nbytes(cur), w.offs);
     }
 
     if (check_tensors && !ggml_validate_row_data(cur->type, cur->data, ggml_nbytes(cur))) {

From 7533f72095a03701d7c6f02bc9a2debc58e5ca54 Mon Sep 17 00:00:00 2001
From: jtischbein
Date: Thu, 18 Dec 2025 20:05:22 +0000
Subject: [PATCH 3/4] Fixing Windows read_raw_at

---
 src/llama-mmap.cpp | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/src/llama-mmap.cpp b/src/llama-mmap.cpp
index ebd49e5cb31..f1c8cf1320b 100644
--- a/src/llama-mmap.cpp
+++ b/src/llama-mmap.cpp
@@ -154,6 +154,11 @@ struct llama_file::impl {
         write_raw(&val, sizeof(val));
     }
 
+    void read_raw_at(void * ptr, size_t len, size_t offset) const {
+        seek(offset, SEEK_SET);
+        read_raw(ptr, len);
+    }
+
     void read_aligned_chunk(size_t offset, void * dest, size_t size) const {
         throw std::runtime_error("DirectIO is not implemented on Windows.");
     }
@@ -264,6 +269,15 @@ struct llama_file::impl {
         }
     }
 
+    void read_raw_at(void * ptr, size_t len, size_t offset) const {
+        if (has_direct_io()) {
+            read_aligned_chunk(offset, ptr, len);
+        } else {
+            seek(offset, SEEK_SET);
+            read_raw(ptr, len);
+        }
+    }
+
     void read_aligned_chunk(size_t offset, void * dest, size_t size) const {
         off_t aligned_offset = offset & ~(alignment - 1);
         off_t offset_from_alignment = offset - aligned_offset;
         size_t bytes_to_read = (offset_from_alignment + size + alignment - 1) & ~(alignment - 1);
@@ -322,15 +336,6 @@ struct llama_file::impl {
     int fd = -1;
 #endif
 
-    void read_raw_at(void * ptr, size_t len, size_t offset) const {
-        if (has_direct_io()) {
-            read_aligned_chunk(offset, ptr, len);
-        } else {
-            seek(offset, SEEK_SET);
-            read_raw(ptr, len);
-        }
-    }
-
     size_t read_alignment() const {
         return alignment;
     }
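
Since the second patch, the POSIX direct-read path asserts that both the length and the destination address are multiples of the read alignment. A caller that wants to read straight into its own memory therefore needs an allocation along these lines (a sketch with assumed names, not code from this series):

    // Sketch: allocate a destination buffer that satisfies the direct-read
    // assertion (aligned address, length rounded up to a multiple of alignment).
    #include <cstddef>
    #include <cstdlib>

    void * alloc_for_direct_read(size_t alignment, size_t n_bytes, size_t & padded) {
        padded = (n_bytes + alignment - 1) & ~(alignment - 1); // length becomes a multiple of alignment
        // C++17 std::aligned_alloc (POSIX/glibc); Windows would use _aligned_malloc instead.
        return std::aligned_alloc(alignment, padded);          // address is a multiple of alignment
    }
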
From 8c150148cc8f8ad3380cf8793da13dab170b592a Mon Sep 17 00:00:00 2001
From: JTischbein
Date: Fri, 19 Dec 2025 11:51:21 +0100
Subject: [PATCH 4/4] Changing type off_t to size_t for Windows and renaming functions

---
 src/llama-mmap.cpp         | 45 ++++++++++++++++++++++++------------------
 src/llama-mmap.h           |  4 ++--
 src/llama-model-loader.cpp | 21 ++++++++++++++------
 src/llama-model-loader.h   |  1 +
 src/llama-model.cpp        |  3 ++-
 5 files changed, 43 insertions(+), 31 deletions(-)

diff --git a/src/llama-mmap.cpp b/src/llama-mmap.cpp
index f1c8cf1320b..95db8ba10bd 100644
--- a/src/llama-mmap.cpp
+++ b/src/llama-mmap.cpp
@@ -154,12 +154,11 @@ struct llama_file::impl {
         write_raw(&val, sizeof(val));
     }
 
-    void read_raw_at(void * ptr, size_t len, size_t offset) const {
-        seek(offset, SEEK_SET);
-        read_raw(ptr, len);
+    void read_aligned_chunk(size_t offset, void * dest, size_t size) const {
+        throw std::runtime_error("DirectIO is not implemented on Windows.");
     }
 
-    void read_aligned_chunk(size_t offset, void * dest, size_t size) const {
+    void read_raw_unsafe(void * ptr, size_t len) const {
         throw std::runtime_error("DirectIO is not implemented on Windows.");
     }
 
@@ -207,7 +206,7 @@ struct llama_file::impl {
     }
 
     size_t tell() const {
-        if (!has_direct_io()) {
+        if (fd == -1) {
             long ret = std::ftell(fp);
             if (ret == -1) {
                 throw std::runtime_error(format("ftell error: %s", strerror(errno)));
@@ -225,7 +224,7 @@ struct llama_file::impl {
 
     void seek(size_t offset, int whence) const {
         off_t ret = 0;
-        if (!has_direct_io()) {
+        if (fd == -1) {
             ret = std::fseek(fp, (long) offset, whence);
         } else {
             ret = lseek(fd, offset, whence);
@@ -235,7 +234,7 @@ struct llama_file::impl {
         }
     }
 
-    void read_raw(void * ptr, size_t len) const {
+    void read_raw_unsafe(void * ptr, size_t len) const {
         if (len == 0) {
             return;
         }
@@ -269,16 +268,8 @@ struct llama_file::impl {
         }
     }
 
-    void read_raw_at(void * ptr, size_t len, size_t offset) const {
-        if (has_direct_io()) {
-            read_aligned_chunk(offset, ptr, len);
-        } else {
-            seek(offset, SEEK_SET);
-            read_raw(ptr, len);
-        }
-    }
-
-    void read_aligned_chunk(size_t offset, void * dest, size_t size) const {
+    void read_aligned_chunk(void * dest, size_t size) const {
+        size_t offset = tell();
         off_t aligned_offset = offset & ~(alignment - 1);
         off_t offset_from_alignment = offset - aligned_offset;
         size_t bytes_to_read = (offset_from_alignment + size + alignment - 1) & ~(alignment - 1);
@@ -295,15 +286,23 @@ struct llama_file::impl {
         std::unique_ptr buffer(raw_buffer);
 
         seek(aligned_offset, SEEK_SET);
-        read_raw(buffer.get(), bytes_to_read);
+        read_raw_unsafe(buffer.get(), bytes_to_read);
 
         uintptr_t actual_data = reinterpret_cast<uintptr_t>(buffer.get()) + offset_from_alignment;
         memcpy(dest, reinterpret_cast<void *>(actual_data), size);
     }
 
+    void read_raw(void * ptr, size_t len) const {
+        if (has_direct_io()) {
+            read_aligned_chunk(ptr, len);
+        } else {
+            read_raw_unsafe(ptr, len);
+        }
+    }
+
     uint32_t read_u32() const {
         uint32_t ret;
-        read_raw_at(&ret, sizeof(ret), tell());
+        read_raw(&ret, sizeof(ret));
         return ret;
     }
 
@@ -323,7 +322,7 @@ struct llama_file::impl {
     }
 
     bool has_direct_io() const {
-        return fd != -1;
+        return fd != -1 && alignment > 1;
     }
 
     ~impl() {
@@ -370,7 +369,11 @@ int llama_file::file_id() const {
 void llama_file::seek(size_t offset, int whence) const { pimpl->seek(offset, whence); }
 
 void llama_file::read_raw(void * ptr, size_t len) const { pimpl->read_raw(ptr, len); }
-void llama_file::read_raw_at(void * ptr, size_t len, size_t offset) const { pimpl->read_raw_at(ptr, len, offset); }
+#ifdef _WIN32
+void llama_file::read_raw_unsafe(void * ptr, size_t len) const { pimpl->read_raw(ptr, len); }
+#else
+void llama_file::read_raw_unsafe(void * ptr, size_t len) const { pimpl->read_raw_unsafe(ptr, len); }
+#endif
 
 uint32_t llama_file::read_u32() const { return pimpl->read_u32(); }
 
diff --git a/src/llama-mmap.h b/src/llama-mmap.h
index cf4ca4f1a76..dd6831db7df 100644
--- a/src/llama-mmap.h
+++ b/src/llama-mmap.h
@@ -25,8 +25,8 @@ struct llama_file {
     void seek(size_t offset, int whence) const;
 
     void read_raw(void * ptr, size_t len) const;
-    void read_raw_at(void * ptr, size_t len, size_t offset) const;
-    void read_aligned_chunk(size_t offset, void * dest, size_t size) const;
+    void read_raw_unsafe(void * ptr, size_t len) const;
+    void read_aligned_chunk(void * dest, size_t size) const;
     uint32_t read_u32() const;
 
     void write_raw(const void * ptr, size_t len) const;
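
After the renames above, read_raw() is the general-purpose entry point — under direct I/O it routes through read_aligned_chunk() at the current file position — while read_raw_unsafe() issues the raw descriptor read and therefore expects aligned arguments. An illustrative caller, assuming access to the internal llama-mmap.h header and a hypothetical dump_prefix() helper:

    #include <cstdio>        // SEEK_SET
    #include <cstdlib>       // std::aligned_alloc, std::free
    #include <vector>
    #include "llama-mmap.h"  // internal header, included here only for illustration

    void dump_prefix(const char * path) {
        llama_file f(path, "rb", /*use_direct_io=*/true);

        std::vector<char> header(1000);
        f.seek(0, SEEK_SET);
        f.read_raw(header.data(), header.size());       // any size: falls back to read_aligned_chunk()

        if (f.has_direct_io()) {
            const size_t align = f.read_alignment();
            void * block = std::aligned_alloc(align, 16 * align);
            f.seek(0, SEEK_SET);
            f.read_raw_unsafe(block, 16 * align);        // aligned length, aligned address
            std::free(block);
        }
    }
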
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
index 2db2115a081..4c6d76a2626 100644
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
@@ -508,9 +508,12 @@ llama_model_loader::llama_model_loader(
     files.emplace_back(new llama_file(fname.c_str(), "rb", use_direct_io));
     contexts.emplace_back(ctx);
 
+    use_direct_io = use_direct_io && files.back()->has_direct_io();
+
     // Disable mmap if direct I/O is enabled and available
-    if (use_direct_io && files.at(0)->has_direct_io()) {
+    if (use_direct_io && use_mmap) {
         use_mmap = false;
+        LLAMA_LOG_WARN("%s: direct I/O is enabled, disabling mmap\n", __func__);
     }
 
     // Save tensors data offset of the main file.
@@ -722,6 +725,7 @@ llama_model_loader::llama_model_loader(
     }
 
     this->use_mmap = use_mmap;
+    this->use_direct_io = use_direct_io;
     this->check_tensors = check_tensors;
     this->no_alloc = no_alloc;
 }
@@ -918,7 +922,8 @@ void llama_model_loader::load_data_for(struct ggml_tensor * cur) const {
         GGML_ASSERT(cur->data != nullptr);
         GGML_ASSERT(w.idx < files.size());
         const auto & file = files.at(w.idx);
-        file->read_raw_at(cur->data, ggml_nbytes(cur), w.offs);
+        file->seek(w.offs, SEEK_SET);
+        file->read_raw(cur->data, ggml_nbytes(cur));
     }
 
     if (check_tensors && !ggml_validate_row_data(cur->type, cur->data, ggml_nbytes(cur))) {
@@ -1082,7 +1087,8 @@ bool llama_model_loader::load_all_data(
             const auto & file = files.at(weight->idx);
 
             if (ggml_backend_buffer_is_host(cur->buffer)) {
-                file->read_raw_at(cur->data, n_size, weight->offs);
+                file->seek(weight->offs, SEEK_SET);
+                file->read_raw(cur->data, n_size);
                 if (check_tensors) {
                     validation_result.emplace_back(std::async(std::launch::async, [cur, n_size] {
                         return std::make_pair(cur, ggml_validate_row_data(cur->type, cur->data, n_size));
@@ -1091,10 +1097,10 @@ bool llama_model_loader::load_all_data(
             } else {
                 // If upload_backend is valid load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU.
                 if (upload_backend) {
-                    auto offset = (off_t) weight->offs;
+                    size_t offset = weight->offs;
                     alignment = file->read_alignment();
-                    off_t aligned_offset = offset & ~(alignment - 1);
-                    off_t offset_from_alignment = offset - aligned_offset;
+                    size_t aligned_offset = offset & ~(alignment - 1);
+                    size_t offset_from_alignment = offset - aligned_offset;
                     file->seek(aligned_offset, SEEK_SET);
 
                     // Calculate aligned read boundaries
@@ -1144,7 +1150,8 @@ bool llama_model_loader::load_all_data(
                     }
                 } else {
                     read_buf.resize(n_size);
-                    file->read_raw_at(read_buf.data(), n_size, weight->offs);
+                    file->seek(weight->offs, SEEK_SET);
+                    file->read_raw(read_buf.data(), n_size);
                     ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
                     if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
                         throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
diff --git a/src/llama-model-loader.h b/src/llama-model-loader.h
index de06b528312..6f15115ce74 100644
--- a/src/llama-model-loader.h
+++ b/src/llama-model-loader.h
@@ -70,6 +70,7 @@ struct llama_model_loader {
     size_t n_bytes = 0;
 
     bool use_mmap = false;
+    bool use_direct_io = false;
     bool check_tensors;
     bool no_alloc;
 
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index cf0c3947535..12ed04d028c 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -2337,7 +2337,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 
     const bool use_mmap_buffer = true;
 
-    LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s)\n", __func__, ml.use_mmap ? "true" : "false");
+    LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s, direct_io = %s)\n",
+        __func__, ml.use_mmap ? "true" : "false", ml.use_direct_io ? "true" : "false");
 
     // build a list of buffer types for the CPU and GPU devices
     pimpl->cpu_buft_list = make_cpu_buft_list(devices, params.use_extra_bufts, params.no_host);
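
Taken together, the behaviour after this series is: direct I/O is used only when it was requested and the file actually supports it, and it then takes precedence over mmap (the loader logs the warning above when it drops mmap). A compact restatement of that precedence, mirroring the loader logic rather than copied from it:

    struct io_mode {
        bool use_mmap;
        bool use_direct_io;
    };

    static io_mode resolve_io_mode(bool want_mmap, bool want_direct_io, bool direct_io_supported) {
        io_mode m;
        m.use_direct_io = want_direct_io && direct_io_supported; // -dio only sticks if the file supports it
        m.use_mmap      = want_mmap && !m.use_direct_io;         // direct I/O takes precedence over --mmap
        return m;
    }

Either side can be toggled per run with --direct-io / --no-direct-io (env: LLAMA_ARG_DIO) and --mmap / --no-mmap; with both left at their defaults, a file that does not support direct I/O simply falls back to mmap loading.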