From 3074b500a5fffba5c9b84a7c14e4dc2248b3653c Mon Sep 17 00:00:00 2001 From: JTischbein Date: Sat, 13 Dec 2025 20:10:21 +0100 Subject: [PATCH 01/10] Uncached model read --- common/arg.cpp | 7 ++ common/common.h | 2 +- src/llama-mmap.cpp | 133 ++++++++++++++++++++++++++++++++++++- src/llama-mmap.h | 3 + src/llama-model-loader.cpp | 117 +++++++++++++++++++++++++++++++- 5 files changed, 255 insertions(+), 7 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index bb2a6840baa..88c65abdd92 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1984,6 +1984,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.use_mmap = value; } ).set_env("LLAMA_ARG_MMAP")); + add_opt(common_arg( + {"--mmap"}, + "memory-map model", + [](common_params & params) { + params.use_mmap = true; + } + ).set_env("LLAMA_ARG_MMAP")); add_opt(common_arg( {"--numa"}, "TYPE", "attempt optimizations that help on some NUMA systems\n" diff --git a/common/common.h b/common/common.h index 2fd83f0cf9c..3cd531f6867 100644 --- a/common/common.h +++ b/common/common.h @@ -413,7 +413,7 @@ struct common_params { bool kv_unified = false; // enable unified KV cache bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix - bool use_mmap = true; // use mmap for faster loads + bool use_mmap = false; // use uncached reads for faster loads bool use_mlock = false; // use mlock to keep model in memory bool verbose_prompt = false; // print prompt tokens before generation bool display_prompt = true; // print prompt before generation diff --git a/src/llama-mmap.cpp b/src/llama-mmap.cpp index 0641c2d22f6..232dcdb9e49 100644 --- a/src/llama-mmap.cpp +++ b/src/llama-mmap.cpp @@ -13,9 +13,10 @@ #ifdef __has_include #if __has_include() #include + #include + #include #if defined(_POSIX_MAPPED_FILES) #include - #include #endif #if defined(_POSIX_MEMLOCK_RANGE) #include @@ -158,6 +159,129 @@ struct llama_file::impl { std::fclose(fp); } } +#elif defined(__linux__) + impl(const char * fname, const char * mode) : impl(fname, mode, false) {} + + impl(const char * fname, const char * mode, bool uncached_read) { + if (uncached_read) { + fd = open(fname, O_RDONLY | O_DIRECT); + if (fd == -1) { + throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno))); + } + + struct stat file_stats{}; + fstat(fd, &file_stats); + + size = file_stats.st_size; + + off_t ret = lseek(fd, 0, SEEK_SET); + if (ret == -1) { + throw std::runtime_error(format("seek error: %s", strerror(errno))); + } + } else { + fp = ggml_fopen(fname, mode); + if (fp == NULL) { + throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno))); + } + seek(0, SEEK_END); + size = tell(); + seek(0, SEEK_SET); + } + } + + size_t tell() const { + if (fd == -1) { + long ret = std::ftell(fp); + if (ret == -1) { + throw std::runtime_error(format("ftell error: %s", strerror(errno))); + } + + return (size_t) ret; + } + + off_t pos = lseek(fd, 0, SEEK_CUR); + if (pos == -1) { + throw std::runtime_error(format("lseek error: %s", strerror(errno))); + } + return (size_t) pos; + } + + void seek(size_t offset, int whence) const { + off_t ret = 0; + if (fd == -1) { + ret = std::fseek(fp, (long) offset, whence); + } else { + ret = lseek(fd, offset, whence); + } + if (ret == -1) { + throw std::runtime_error(format("seek error: %s", strerror(errno))); + } + } + + void read_raw(void * ptr, size_t len) const { + if (len == 0) { + return; + } + if (fd == -1) { + errno = 0; + std::size_t ret = std::fread(ptr, len, 1, 
fp); + if (ferror(fp)) { + throw std::runtime_error(format("read error: %s", strerror(errno))); + } + if (ret != 1) { + throw std::runtime_error("unexpectedly reached end of file"); + } + } else { + bool successful = false; + while (!successful) { + off_t ret = read(fd, ptr, len); + + if (ret == -1) { + if (errno == EINTR) { + continue; // Interrupted by signal, retry + } + throw std::runtime_error(format("read error: %s", strerror(errno))); + } + if (ret == 0) { + throw std::runtime_error("unexpectedly reached end of file"); + } + + successful = true; + } + } + } + + uint32_t read_u32() const { + uint32_t ret; + read_raw(&ret, sizeof(ret)); + return ret; + } + + void write_raw(const void * ptr, size_t len) const { + if (len == 0) { + return; + } + errno = 0; + size_t ret = std::fwrite(ptr, len, 1, fp); + if (ret != 1) { + throw std::runtime_error(format("write error: %s", strerror(errno))); + } + } + + void write_u32(uint32_t val) const { + write_raw(&val, sizeof(val)); + } + + ~impl() { + if (fp) { + std::fclose(fp); + } else if (fd != -1) { + close(fd); + } + } + + int fd = -1; + #else impl(const char * fname, const char * mode) { fp = ggml_fopen(fname, mode); @@ -237,11 +361,14 @@ struct llama_file::impl { } #endif - FILE * fp; - size_t size; + FILE * fp{}; + size_t size{}; }; llama_file::llama_file(const char * fname, const char * mode) : pimpl(std::make_unique(fname, mode)) {} +#if defined(__linux__) +llama_file::llama_file(const char * fname, const char * mode, bool uncached_read) : pimpl(std::make_unique(fname, mode, uncached_read)) {} +#endif llama_file::~llama_file() = default; size_t llama_file::tell() const { return pimpl->tell(); } diff --git a/src/llama-mmap.h b/src/llama-mmap.h index 4e5aec3f440..985404d0f52 100644 --- a/src/llama-mmap.h +++ b/src/llama-mmap.h @@ -14,6 +14,9 @@ using llama_mlocks = std::vector>; struct llama_file { llama_file(const char * fname, const char * mode); +#if defined(__linux__) + llama_file(const char * fname, const char * mode, bool uncached_read); +#endif ~llama_file(); size_t tell() const; diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index aa3a65f87a5..03b855e2a90 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -502,8 +502,12 @@ llama_model_loader::llama_model_loader( get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false); llm_kv = LLM_KV(llm_arch_from_string(arch_name)); - + +#if defined(__linux__) + files.emplace_back(new llama_file(fname.c_str(), "rb", !use_mmap)); +#else files.emplace_back(new llama_file(fname.c_str(), "rb")); +#endif contexts.emplace_back(ctx); // Save tensors data offset of the main file. @@ -571,7 +575,11 @@ llama_model_loader::llama_model_loader( } } +#if defined(__linux__) + files.emplace_back(new llama_file(fname_split, "rb", !use_mmap)); +#else files.emplace_back(new llama_file(fname_split, "rb")); +#endif contexts.emplace_back(ctx); // Save tensors data offset info of the shard. @@ -933,7 +941,14 @@ bool llama_model_loader::load_all_data( // 4 staging buffers for async uploads, each sized 1MB seems to be a good default for single NVMe drives. // NVMe raid configurations might require more / larger buffers. 
constexpr size_t n_buffers = 4; +#if defined(__linux__) + constexpr size_t alignment = 4 * 1024; // 4 KiB for Direct I/O + // Buffer size: balance between memory usage and I/O efficiency + // 64MB works well for NVMe drives + constexpr size_t buffer_size = 64 * 1024 * 1024; // 64 MiB +#else constexpr size_t buffer_size = 1 * 1024 * 1024; // 1MB +#endif std::vector host_buffers; std::vector events; @@ -982,7 +997,11 @@ bool llama_model_loader::load_all_data( // If the backend is supported, create pinned memory buffers and events for synchronisation. for (size_t idx = 0; idx < n_buffers; ++idx) { +#if defined(__linux__) + auto * buf = ggml_backend_buft_alloc_buffer(host_buft, buffer_size + 2 * alignment); +#else auto * buf = ggml_backend_buft_alloc_buffer(host_buft, buffer_size); +#endif if (!buf) { LLAMA_LOG_DEBUG("%s: failed to allocate host buffer for async uploads for device %s\n", func, ggml_backend_dev_name(dev)); @@ -1019,6 +1038,35 @@ bool llama_model_loader::load_all_data( ggml_backend_name(upload_backend)); } +#if defined(__linux__) + auto read_aligned_chunk = [](const llama_file * file, + size_t offset, + void * dest, + size_t size, + size_t alignment) { + off_t aligned_offset = offset & ~(alignment - 1); + off_t offset_from_alignment = offset - aligned_offset; + size_t bytes_to_read = (offset_from_alignment + size + alignment - 1) & ~(alignment - 1); + + void * raw_buffer = nullptr; + int ret = posix_memalign(&raw_buffer, alignment, bytes_to_read); + if (ret != 0) { + throw std::runtime_error(format("posix_memalign failed with error %d", ret)); + } + + struct aligned_buffer_deleter { + void operator()(void * p) const { free(p); } + }; + std::unique_ptr buffer(raw_buffer); + + file->seek(aligned_offset, SEEK_SET); + file->read_raw(buffer.get(), bytes_to_read); + + uintptr_t actual_data = reinterpret_cast(buffer.get()) + offset_from_alignment; + memcpy(dest, reinterpret_cast(actual_data), size); + }; +#endif + for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) { const auto * weight = get_weight(ggml_get_name(cur)); if (weight == nullptr) { @@ -1064,9 +1112,18 @@ bool llama_model_loader::load_all_data( } } else { const auto & file = files.at(weight->idx); +#if defined(__linux__) + auto offset = (off_t) weight->offs; + off_t aligned_offset = offset & ~(alignment - 1); + off_t offset_from_alignment = offset - aligned_offset; +#endif if (ggml_backend_buffer_is_host(cur->buffer)) { +#if defined(__linux__) + read_aligned_chunk(file.get(), weight->offs, cur->data, n_size, alignment); +#else file->seek(weight->offs, SEEK_SET); file->read_raw(cur->data, n_size); +#endif if (check_tensors) { validation_result.emplace_back(std::async(std::launch::async, [cur, n_size] { return std::make_pair(cur, ggml_validate_row_data(cur->type, cur->data, n_size)); @@ -1075,6 +1132,55 @@ bool llama_model_loader::load_all_data( } else { // If upload_backend is valid load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU. 
if (upload_backend) { +#if defined(__linux__) + // Calculate aligned read boundaries + size_t read_start = aligned_offset; + size_t read_end = (offset + n_size + alignment - 1) & ~(alignment - 1); + + size_t bytes_read = 0; + size_t data_read = 0; // Actual tensor data copied (excluding padding) + + file->seek(aligned_offset, SEEK_SET); + + while (bytes_read < read_end - read_start) { + size_t read_size = std::min(buffer_size, read_end - read_start - bytes_read); + + // Align the destination pointer within the pinned buffer + uintptr_t ptr_dest_aligned = (reinterpret_cast(host_ptrs[buffer_idx]) + alignment - 1) & ~(alignment - 1); + + // Wait for previous upload to complete before reusing buffer + ggml_backend_event_synchronize(events[buffer_idx]); + + // Read aligned chunk from file + file->read_raw(reinterpret_cast(ptr_dest_aligned), read_size); + + // Calculate actual data portion (excluding alignment padding) + uintptr_t ptr_data = ptr_dest_aligned; + size_t data_to_copy = read_size; + + // Skip alignment padding at start of first chunk + if (bytes_read == 0) { + ptr_data += offset_from_alignment; + data_to_copy -= offset_from_alignment; + } + + // Trim alignment padding at end of last chunk + if (aligned_offset + bytes_read + read_size > offset + n_size) { + data_to_copy -= (read_end - (offset + n_size)); + } + + // Async upload actual data to GPU + ggml_backend_tensor_set_async(upload_backend, cur, + reinterpret_cast(ptr_data), data_read, data_to_copy); + ggml_backend_event_record(events[buffer_idx], upload_backend); + + data_read += data_to_copy; + bytes_read += read_size; + + ++buffer_idx; + buffer_idx %= n_buffers; + } +#else file->seek(weight->offs, SEEK_SET); size_t bytes_read = 0; @@ -1091,11 +1197,16 @@ bool llama_model_loader::load_all_data( ++buffer_idx; buffer_idx %= n_buffers; } +#endif } else { read_buf.resize(n_size); +#if defined(__linux__) + read_aligned_chunk(file.get(), weight->offs, read_buf.data(), n_size, alignment); +#else file->seek(weight->offs, SEEK_SET); - file->read_raw(read_buf.data(), n_size); - ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size); + file->read_raw(read_buf.data(), n_size); +#endif + ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size); if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) { throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur))); } From 26cc75ffccb111c2f53e74a52e6d5ab850cd7513 Mon Sep 17 00:00:00 2001 From: JTischbein Date: Sun, 14 Dec 2025 09:41:27 +0100 Subject: [PATCH 02/10] Removing additional --mmap arg --- common/arg.cpp | 7 ------- 1 file changed, 7 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 88c65abdd92..bb2a6840baa 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1984,13 +1984,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.use_mmap = value; } ).set_env("LLAMA_ARG_MMAP")); - add_opt(common_arg( - {"--mmap"}, - "memory-map model", - [](common_params & params) { - params.use_mmap = true; - } - ).set_env("LLAMA_ARG_MMAP")); add_opt(common_arg( {"--numa"}, "TYPE", "attempt optimizations that help on some NUMA systems\n" From ceccfb9ee644f1591e4fa5f1cc39a3ca9010cdd6 Mon Sep 17 00:00:00 2001 From: JTischbein Date: Sun, 14 Dec 2025 19:36:31 +0100 Subject: [PATCH 03/10] Removing trailing whitespaces --- src/llama-mmap.cpp | 2 +- src/llama-model-loader.cpp | 18 +++++++++--------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/llama-mmap.cpp b/src/llama-mmap.cpp index 
232dcdb9e49..7d23d6973ef 100644 --- a/src/llama-mmap.cpp +++ b/src/llama-mmap.cpp @@ -160,7 +160,7 @@ struct llama_file::impl { } } #elif defined(__linux__) - impl(const char * fname, const char * mode) : impl(fname, mode, false) {} + impl(const char * fname, const char * mode) : impl(fname, mode, false) {} impl(const char * fname, const char * mode, bool uncached_read) { if (uncached_read) { diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index 03b855e2a90..9faf85a050e 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -502,7 +502,7 @@ llama_model_loader::llama_model_loader( get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false); llm_kv = LLM_KV(llm_arch_from_string(arch_name)); - + #if defined(__linux__) files.emplace_back(new llama_file(fname.c_str(), "rb", !use_mmap)); #else @@ -1047,21 +1047,21 @@ bool llama_model_loader::load_all_data( off_t aligned_offset = offset & ~(alignment - 1); off_t offset_from_alignment = offset - aligned_offset; size_t bytes_to_read = (offset_from_alignment + size + alignment - 1) & ~(alignment - 1); - + void * raw_buffer = nullptr; int ret = posix_memalign(&raw_buffer, alignment, bytes_to_read); if (ret != 0) { throw std::runtime_error(format("posix_memalign failed with error %d", ret)); } - + struct aligned_buffer_deleter { void operator()(void * p) const { free(p); } }; std::unique_ptr buffer(raw_buffer); - + file->seek(aligned_offset, SEEK_SET); file->read_raw(buffer.get(), bytes_to_read); - + uintptr_t actual_data = reinterpret_cast(buffer.get()) + offset_from_alignment; memcpy(dest, reinterpret_cast(actual_data), size); }; @@ -1150,7 +1150,7 @@ bool llama_model_loader::load_all_data( // Wait for previous upload to complete before reusing buffer ggml_backend_event_synchronize(events[buffer_idx]); - + // Read aligned chunk from file file->read_raw(reinterpret_cast(ptr_dest_aligned), read_size); @@ -1163,7 +1163,7 @@ bool llama_model_loader::load_all_data( ptr_data += offset_from_alignment; data_to_copy -= offset_from_alignment; } - + // Trim alignment padding at end of last chunk if (aligned_offset + bytes_read + read_size > offset + n_size) { data_to_copy -= (read_end - (offset + n_size)); @@ -1204,9 +1204,9 @@ bool llama_model_loader::load_all_data( read_aligned_chunk(file.get(), weight->offs, read_buf.data(), n_size, alignment); #else file->seek(weight->offs, SEEK_SET); - file->read_raw(read_buf.data(), n_size); + file->read_raw(read_buf.data(), n_size); #endif - ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size); + ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size); if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) { throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur))); } From d2acc3a8a89f84fdfc0bfc6dc26be24e3e3d7c27 Mon Sep 17 00:00:00 2001 From: JTischbein Date: Mon, 15 Dec 2025 13:44:20 +0100 Subject: [PATCH 04/10] Adding fallback when O_DIRECT is not supported --- src/llama-mmap.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/llama-mmap.cpp b/src/llama-mmap.cpp index 7d23d6973ef..9d03c41d967 100644 --- a/src/llama-mmap.cpp +++ b/src/llama-mmap.cpp @@ -165,6 +165,10 @@ struct llama_file::impl { impl(const char * fname, const char * mode, bool uncached_read) { if (uncached_read) { fd = open(fname, O_RDONLY | O_DIRECT); + if (fd == -1 && (errno == EINVAL || errno == EOPNOTSUPP)) { + fd = open(fname, O_RDONLY); // retry without O_DIRECT + } + if (fd == -1) { throw std::runtime_error(format("failed to open 
%s: %s", fname, strerror(errno))); } From f6d79fe1b1dddbcc8678f918394f5430b37b339e Mon Sep 17 00:00:00 2001 From: JTischbein Date: Tue, 16 Dec 2025 13:58:43 +0100 Subject: [PATCH 05/10] Remove branching in llama-model-loader.cpp and reduce code duplications in llama-mmap.cpp --- src/llama-mmap.cpp | 174 +++++++++++++--------------------- src/llama-mmap.h | 7 +- src/llama-model-loader.cpp | 188 +++++++++++++++---------------------- 3 files changed, 143 insertions(+), 226 deletions(-) diff --git a/src/llama-mmap.cpp b/src/llama-mmap.cpp index 9d03c41d967..044d9aa902a 100644 --- a/src/llama-mmap.cpp +++ b/src/llama-mmap.cpp @@ -75,7 +75,7 @@ struct llama_file::impl { return ret; } - impl(const char * fname, const char * mode) { + impl(const char * fname, const char * mode, const bool use_direct_io = false) { fp = ggml_fopen(fname, mode); if (fp == NULL) { throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno))); @@ -154,43 +154,50 @@ struct llama_file::impl { write_raw(&val, sizeof(val)); } + bool has_direct_io() const { + return false; + } + + void read_aligned_chunk(size_t offset, void * dest, size_t size, size_t alignment) const { + throw std::runtime_error("DirectIO is not implemented on Windows."); + } + ~impl() { if (fp) { std::fclose(fp); } } -#elif defined(__linux__) - impl(const char * fname, const char * mode) : impl(fname, mode, false) {} - - impl(const char * fname, const char * mode, bool uncached_read) { - if (uncached_read) { +#else + impl(const char * fname, const char * mode, const bool use_direct_io = false) { +#ifdef __linux__ + // Try unbuffered I/O for read only + if (use_direct_io && std::strcmp(mode, "rb") == 0) { fd = open(fname, O_RDONLY | O_DIRECT); - if (fd == -1 && (errno == EINVAL || errno == EOPNOTSUPP)) { - fd = open(fname, O_RDONLY); // retry without O_DIRECT - } - - if (fd == -1) { - throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno))); - } - struct stat file_stats{}; - fstat(fd, &file_stats); + if (fd != -1) { + struct stat file_stats{}; + fstat(fd, &file_stats); - size = file_stats.st_size; + size = file_stats.st_size; - off_t ret = lseek(fd, 0, SEEK_SET); - if (ret == -1) { - throw std::runtime_error(format("seek error: %s", strerror(errno))); - } - } else { - fp = ggml_fopen(fname, mode); - if (fp == NULL) { - throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno))); + off_t ret = lseek(fd, 0, SEEK_SET); + if (ret == -1) { + throw std::runtime_error(format("seek error: %s", strerror(errno))); + } + return; } - seek(0, SEEK_END); - size = tell(); - seek(0, SEEK_SET); + + LLAMA_LOG_WARN("Failed to open model %s with error: %s. 
Falling back to buffered I/O", + fname, strerror(errno)); + } +#endif + fp = ggml_fopen(fname, mode); + if (fp == NULL) { + throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno))); } + seek(0, SEEK_END); + size = tell(); + seek(0, SEEK_SET); } size_t tell() const { @@ -226,8 +233,8 @@ struct llama_file::impl { if (len == 0) { return; } + errno = 0; if (fd == -1) { - errno = 0; std::size_t ret = std::fread(ptr, len, 1, fp); if (ferror(fp)) { throw std::runtime_error(format("read error: %s", strerror(errno))); @@ -255,86 +262,27 @@ struct llama_file::impl { } } - uint32_t read_u32() const { - uint32_t ret; - read_raw(&ret, sizeof(ret)); - return ret; - } - - void write_raw(const void * ptr, size_t len) const { - if (len == 0) { - return; - } - errno = 0; - size_t ret = std::fwrite(ptr, len, 1, fp); - if (ret != 1) { - throw std::runtime_error(format("write error: %s", strerror(errno))); - } - } - - void write_u32(uint32_t val) const { - write_raw(&val, sizeof(val)); - } + void read_aligned_chunk(size_t offset, void * dest, size_t size, size_t alignment) const { + off_t aligned_offset = offset & ~(alignment - 1); + off_t offset_from_alignment = offset - aligned_offset; + size_t bytes_to_read = (offset_from_alignment + size + alignment - 1) & ~(alignment - 1); - ~impl() { - if (fp) { - std::fclose(fp); - } else if (fd != -1) { - close(fd); - } - } - - int fd = -1; - -#else - impl(const char * fname, const char * mode) { - fp = ggml_fopen(fname, mode); - if (fp == NULL) { - throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno))); - } - seek(0, SEEK_END); - size = tell(); - seek(0, SEEK_SET); - } - - size_t tell() const { -// TODO: this ifdef is never true? -#ifdef _WIN32 - __int64 ret = _ftelli64(fp); -#else - long ret = std::ftell(fp); -#endif - if (ret == -1) { - throw std::runtime_error(format("ftell error: %s", strerror(errno))); + void * raw_buffer = nullptr; + int ret = posix_memalign(&raw_buffer, alignment, bytes_to_read); + if (ret != 0) { + throw std::runtime_error(format("posix_memalign failed with error %d", ret)); } - return (size_t) ret; - } + struct aligned_buffer_deleter { + void operator()(void * p) const { free(p); } + }; + std::unique_ptr buffer(raw_buffer); - void seek(size_t offset, int whence) const { -// TODO: this ifdef is never true? 
-#ifdef _WIN32 - int ret = _fseeki64(fp, (__int64) offset, whence); -#else - int ret = std::fseek(fp, (long) offset, whence); -#endif - if (ret != 0) { - throw std::runtime_error(format("seek error: %s", strerror(errno))); - } - } + seek(aligned_offset, SEEK_SET); + read_raw(buffer.get(), bytes_to_read); - void read_raw(void * ptr, size_t len) const { - if (len == 0) { - return; - } - errno = 0; - std::size_t ret = std::fread(ptr, len, 1, fp); - if (ferror(fp)) { - throw std::runtime_error(format("read error: %s", strerror(errno))); - } - if (ret != 1) { - throw std::runtime_error("unexpectedly reached end of file"); - } + uintptr_t actual_data = reinterpret_cast(buffer.get()) + offset_from_alignment; + memcpy(dest, reinterpret_cast(actual_data), size); } uint32_t read_u32() const { @@ -358,26 +306,33 @@ struct llama_file::impl { write_raw(&val, sizeof(val)); } + bool has_direct_io() const { + return fd != -1; + } + ~impl() { - if (fp) { + if (fd != -1) { + close(fd); + } else { std::fclose(fp); } } + int fd = -1; #endif FILE * fp{}; size_t size{}; }; -llama_file::llama_file(const char * fname, const char * mode) : pimpl(std::make_unique(fname, mode)) {} -#if defined(__linux__) -llama_file::llama_file(const char * fname, const char * mode, bool uncached_read) : pimpl(std::make_unique(fname, mode, uncached_read)) {} -#endif +llama_file::llama_file(const char * fname, const char * mode, const bool use_direct_io) : + pimpl(std::make_unique(fname, mode, use_direct_io)) {} llama_file::~llama_file() = default; size_t llama_file::tell() const { return pimpl->tell(); } size_t llama_file::size() const { return pimpl->size; } +bool llama_file::has_direct_io() const { return pimpl->has_direct_io(); } + int llama_file::file_id() const { #ifdef _WIN32 return _fileno(pimpl->fp); @@ -392,6 +347,9 @@ int llama_file::file_id() const { void llama_file::seek(size_t offset, int whence) const { pimpl->seek(offset, whence); } void llama_file::read_raw(void * ptr, size_t len) const { pimpl->read_raw(ptr, len); } +void llama_file::read_aligned_chunk(size_t offset, void * dest, size_t size, size_t alignment) const + { pimpl->read_aligned_chunk(offset, dest, size, alignment); } + uint32_t llama_file::read_u32() const { return pimpl->read_u32(); } diff --git a/src/llama-mmap.h b/src/llama-mmap.h index 985404d0f52..5a9361e4c37 100644 --- a/src/llama-mmap.h +++ b/src/llama-mmap.h @@ -13,10 +13,7 @@ using llama_mmaps = std::vector>; using llama_mlocks = std::vector>; struct llama_file { - llama_file(const char * fname, const char * mode); -#if defined(__linux__) - llama_file(const char * fname, const char * mode, bool uncached_read); -#endif + llama_file(const char * fname, const char * mode, bool use_direct_io = false); ~llama_file(); size_t tell() const; @@ -27,11 +24,13 @@ struct llama_file { void seek(size_t offset, int whence) const; void read_raw(void * ptr, size_t len) const; + void read_aligned_chunk(size_t offset, void * dest, size_t size, size_t alignment) const; uint32_t read_u32() const; void write_raw(const void * ptr, size_t len) const; void write_u32(uint32_t val) const; + bool has_direct_io() const; private: struct impl; std::unique_ptr pimpl; diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index 9faf85a050e..6e47d992bb6 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -503,11 +503,7 @@ llama_model_loader::llama_model_loader( get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false); llm_kv = LLM_KV(llm_arch_from_string(arch_name)); -#if 
defined(__linux__) files.emplace_back(new llama_file(fname.c_str(), "rb", !use_mmap)); -#else - files.emplace_back(new llama_file(fname.c_str(), "rb")); -#endif contexts.emplace_back(ctx); // Save tensors data offset of the main file. @@ -575,11 +571,7 @@ llama_model_loader::llama_model_loader( } } -#if defined(__linux__) - files.emplace_back(new llama_file(fname_split, "rb", !use_mmap)); -#else files.emplace_back(new llama_file(fname_split, "rb")); -#endif contexts.emplace_back(ctx); // Save tensors data offset info of the shard. @@ -941,14 +933,17 @@ bool llama_model_loader::load_all_data( // 4 staging buffers for async uploads, each sized 1MB seems to be a good default for single NVMe drives. // NVMe raid configurations might require more / larger buffers. constexpr size_t n_buffers = 4; -#if defined(__linux__) - constexpr size_t alignment = 4 * 1024; // 4 KiB for Direct I/O + + + bool direct_io = false; + for (const auto& file : files) { + direct_io |= file->has_direct_io(); + } + + constexpr size_t alignment = 4 * 1024; // 4 KB for Direct I/O // Buffer size: balance between memory usage and I/O efficiency // 64MB works well for NVMe drives - constexpr size_t buffer_size = 64 * 1024 * 1024; // 64 MiB -#else - constexpr size_t buffer_size = 1 * 1024 * 1024; // 1MB -#endif + const size_t buffer_size = direct_io ? 64 * 1024 * 1024 + 2 * alignment : 1 * 1024 * 1024; std::vector host_buffers; std::vector events; @@ -997,11 +992,8 @@ bool llama_model_loader::load_all_data( // If the backend is supported, create pinned memory buffers and events for synchronisation. for (size_t idx = 0; idx < n_buffers; ++idx) { -#if defined(__linux__) - auto * buf = ggml_backend_buft_alloc_buffer(host_buft, buffer_size + 2 * alignment); -#else auto * buf = ggml_backend_buft_alloc_buffer(host_buft, buffer_size); -#endif + if (!buf) { LLAMA_LOG_DEBUG("%s: failed to allocate host buffer for async uploads for device %s\n", func, ggml_backend_dev_name(dev)); @@ -1038,35 +1030,6 @@ bool llama_model_loader::load_all_data( ggml_backend_name(upload_backend)); } -#if defined(__linux__) - auto read_aligned_chunk = [](const llama_file * file, - size_t offset, - void * dest, - size_t size, - size_t alignment) { - off_t aligned_offset = offset & ~(alignment - 1); - off_t offset_from_alignment = offset - aligned_offset; - size_t bytes_to_read = (offset_from_alignment + size + alignment - 1) & ~(alignment - 1); - - void * raw_buffer = nullptr; - int ret = posix_memalign(&raw_buffer, alignment, bytes_to_read); - if (ret != 0) { - throw std::runtime_error(format("posix_memalign failed with error %d", ret)); - } - - struct aligned_buffer_deleter { - void operator()(void * p) const { free(p); } - }; - std::unique_ptr buffer(raw_buffer); - - file->seek(aligned_offset, SEEK_SET); - file->read_raw(buffer.get(), bytes_to_read); - - uintptr_t actual_data = reinterpret_cast(buffer.get()) + offset_from_alignment; - memcpy(dest, reinterpret_cast(actual_data), size); - }; -#endif - for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) { const auto * weight = get_weight(ggml_get_name(cur)); if (weight == nullptr) { @@ -1112,100 +1075,97 @@ bool llama_model_loader::load_all_data( } } else { const auto & file = files.at(weight->idx); -#if defined(__linux__) - auto offset = (off_t) weight->offs; - off_t aligned_offset = offset & ~(alignment - 1); - off_t offset_from_alignment = offset - aligned_offset; -#endif + if (ggml_backend_buffer_is_host(cur->buffer)) { -#if defined(__linux__) - 
read_aligned_chunk(file.get(), weight->offs, cur->data, n_size, alignment); -#else - file->seek(weight->offs, SEEK_SET); - file->read_raw(cur->data, n_size); -#endif + if (file->has_direct_io()) { + file->read_aligned_chunk(weight->offs, cur->data, n_size, alignment); + } else { + file->seek(weight->offs, SEEK_SET); + file->read_raw(cur->data, n_size); + } if (check_tensors) { validation_result.emplace_back(std::async(std::launch::async, [cur, n_size] { return std::make_pair(cur, ggml_validate_row_data(cur->type, cur->data, n_size)); })); } } else { + file->seek(weight->offs, SEEK_SET); // If upload_backend is valid load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU. if (upload_backend) { -#if defined(__linux__) - // Calculate aligned read boundaries - size_t read_start = aligned_offset; - size_t read_end = (offset + n_size + alignment - 1) & ~(alignment - 1); + if (file->has_direct_io()) { + auto offset = (off_t) weight->offs; + off_t aligned_offset = offset & ~(alignment - 1); + off_t offset_from_alignment = offset - aligned_offset; - size_t bytes_read = 0; - size_t data_read = 0; // Actual tensor data copied (excluding padding) + // Calculate aligned read boundaries + size_t read_start = aligned_offset; + size_t read_end = (offset + n_size + alignment - 1) & ~(alignment - 1); - file->seek(aligned_offset, SEEK_SET); + size_t bytes_read = 0; + size_t data_read = 0; // Actual tensor data copied (excluding padding) - while (bytes_read < read_end - read_start) { - size_t read_size = std::min(buffer_size, read_end - read_start - bytes_read); + while (bytes_read < read_end - read_start) { + size_t read_size = std::min(buffer_size, read_end - read_start - bytes_read); - // Align the destination pointer within the pinned buffer - uintptr_t ptr_dest_aligned = (reinterpret_cast(host_ptrs[buffer_idx]) + alignment - 1) & ~(alignment - 1); + // Align the destination pointer within the pinned buffer + uintptr_t ptr_dest_aligned = (reinterpret_cast(host_ptrs[buffer_idx]) + alignment - 1) & ~(alignment - 1); - // Wait for previous upload to complete before reusing buffer - ggml_backend_event_synchronize(events[buffer_idx]); + // Wait for previous upload to complete before reusing buffer + ggml_backend_event_synchronize(events[buffer_idx]); - // Read aligned chunk from file - file->read_raw(reinterpret_cast(ptr_dest_aligned), read_size); + // Read aligned chunk from file + file->read_raw(reinterpret_cast(ptr_dest_aligned), read_size); - // Calculate actual data portion (excluding alignment padding) - uintptr_t ptr_data = ptr_dest_aligned; - size_t data_to_copy = read_size; + // Calculate actual data portion (excluding alignment padding) + uintptr_t ptr_data = ptr_dest_aligned; + size_t data_to_copy = read_size; - // Skip alignment padding at start of first chunk - if (bytes_read == 0) { - ptr_data += offset_from_alignment; - data_to_copy -= offset_from_alignment; - } + // Skip alignment padding at start of first chunk + if (bytes_read == 0) { + ptr_data += offset_from_alignment; + data_to_copy -= offset_from_alignment; + } - // Trim alignment padding at end of last chunk - if (aligned_offset + bytes_read + read_size > offset + n_size) { - data_to_copy -= (read_end - (offset + n_size)); - } + // Trim alignment padding at end of last chunk + if (aligned_offset + bytes_read + read_size > offset + n_size) { + data_to_copy -= (read_end - (offset + n_size)); + } - // Async upload actual data to GPU - ggml_backend_tensor_set_async(upload_backend, cur, - 
reinterpret_cast(ptr_data), data_read, data_to_copy); - ggml_backend_event_record(events[buffer_idx], upload_backend); + // Async upload actual data to GPU + ggml_backend_tensor_set_async(upload_backend, cur, + reinterpret_cast(ptr_data), data_read, data_to_copy); + ggml_backend_event_record(events[buffer_idx], upload_backend); - data_read += data_to_copy; - bytes_read += read_size; + data_read += data_to_copy; + bytes_read += read_size; - ++buffer_idx; - buffer_idx %= n_buffers; - } -#else - file->seek(weight->offs, SEEK_SET); - - size_t bytes_read = 0; + ++buffer_idx; + buffer_idx %= n_buffers; + } + } else { + size_t bytes_read = 0; - while (bytes_read < n_size) { - size_t read_iteration = std::min(buffer_size, n_size - bytes_read); + while (bytes_read < n_size) { + size_t read_iteration = std::min(buffer_size, n_size - bytes_read); - ggml_backend_event_synchronize(events[buffer_idx]); - file->read_raw(host_ptrs[buffer_idx], read_iteration); - ggml_backend_tensor_set_async(upload_backend, cur, host_ptrs[buffer_idx], bytes_read, read_iteration); - ggml_backend_event_record(events[buffer_idx], upload_backend); + ggml_backend_event_synchronize(events[buffer_idx]); + file->read_raw(host_ptrs[buffer_idx], read_iteration); + ggml_backend_tensor_set_async(upload_backend, cur, host_ptrs[buffer_idx], bytes_read, read_iteration); + ggml_backend_event_record(events[buffer_idx], upload_backend); - bytes_read += read_iteration; - ++buffer_idx; - buffer_idx %= n_buffers; + bytes_read += read_iteration; + ++buffer_idx; + buffer_idx %= n_buffers; + } } -#endif } else { read_buf.resize(n_size); -#if defined(__linux__) - read_aligned_chunk(file.get(), weight->offs, read_buf.data(), n_size, alignment); -#else - file->seek(weight->offs, SEEK_SET); - file->read_raw(read_buf.data(), n_size); -#endif + if (file->has_direct_io()) { + file->read_aligned_chunk(weight->offs, read_buf.data(), n_size, alignment); + } else { + file->seek(weight->offs, SEEK_SET); + file->read_raw(read_buf.data(), n_size); + } ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size); if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) { throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur))); From 0879d22196c66825c438e3136cc0c9a60188c7c6 Mon Sep 17 00:00:00 2001 From: JTischbein Date: Tue, 16 Dec 2025 20:00:46 +0100 Subject: [PATCH 06/10] Adding maybe unused keyword for Mac and Windows. 
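
The use_direct_io parameter is only consumed on Linux, so marking it [[maybe_unused]] keeps the Windows and macOS builds of these constructors free of unused-parameter warnings.
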
--- src/llama-mmap.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/llama-mmap.cpp b/src/llama-mmap.cpp index 044d9aa902a..d6904714a9d 100644 --- a/src/llama-mmap.cpp +++ b/src/llama-mmap.cpp @@ -75,7 +75,7 @@ struct llama_file::impl { return ret; } - impl(const char * fname, const char * mode, const bool use_direct_io = false) { + impl(const char * fname, const char * mode, [[maybe_unused]] const bool use_direct_io = false) { fp = ggml_fopen(fname, mode); if (fp == NULL) { throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno))); @@ -168,7 +168,7 @@ struct llama_file::impl { } } #else - impl(const char * fname, const char * mode, const bool use_direct_io = false) { + impl(const char * fname, const char * mode, [[maybe_unused]] const bool use_direct_io = false) { #ifdef __linux__ // Try unbuffered I/O for read only if (use_direct_io && std::strcmp(mode, "rb") == 0) { From fff1157a6c0c4d16da79815b9e4f54028c81307e Mon Sep 17 00:00:00 2001 From: JTischbein Date: Wed, 17 Dec 2025 08:20:33 +0100 Subject: [PATCH 07/10] File seek aligned --- src/llama-model-loader.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index 6e47d992bb6..1aa0dae0f59 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -1089,13 +1089,13 @@ bool llama_model_loader::load_all_data( })); } } else { - file->seek(weight->offs, SEEK_SET); // If upload_backend is valid load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU. if (upload_backend) { if (file->has_direct_io()) { auto offset = (off_t) weight->offs; off_t aligned_offset = offset & ~(alignment - 1); off_t offset_from_alignment = offset - aligned_offset; + file->seek(aligned_offset, SEEK_SET); // Calculate aligned read boundaries size_t read_start = aligned_offset; From d73ff6a9c59e80fc5d7637f16efe72537650e80d Mon Sep 17 00:00:00 2001 From: JTischbein Date: Wed, 17 Dec 2025 13:21:09 +0100 Subject: [PATCH 08/10] Removing all branches for direct_io in llama-model-loader.cpp --- src/llama-mmap.cpp | 34 ++++++----- src/llama-mmap.h | 6 +- src/llama-model-loader.cpp | 113 ++++++++++++++----------------------- 3 files changed, 66 insertions(+), 87 deletions(-) diff --git a/src/llama-mmap.cpp b/src/llama-mmap.cpp index d6904714a9d..23b648a2e3b 100644 --- a/src/llama-mmap.cpp +++ b/src/llama-mmap.cpp @@ -154,11 +154,7 @@ struct llama_file::impl { write_raw(&val, sizeof(val)); } - bool has_direct_io() const { - return false; - } - - void read_aligned_chunk(size_t offset, void * dest, size_t size, size_t alignment) const { + void read_aligned_chunk(size_t offset, void * dest, size_t size) const { throw std::runtime_error("DirectIO is not implemented on Windows."); } @@ -179,6 +175,7 @@ struct llama_file::impl { fstat(fd, &file_stats); size = file_stats.st_size; + alignment = file_stats.st_blksize; off_t ret = lseek(fd, 0, SEEK_SET); if (ret == -1) { @@ -262,7 +259,7 @@ struct llama_file::impl { } } - void read_aligned_chunk(size_t offset, void * dest, size_t size, size_t alignment) const { + void read_aligned_chunk(size_t offset, void * dest, size_t size) const { off_t aligned_offset = offset & ~(alignment - 1); off_t offset_from_alignment = offset - aligned_offset; size_t bytes_to_read = (offset_from_alignment + size + alignment - 1) & ~(alignment - 1); @@ -306,10 +303,6 @@ struct llama_file::impl { write_raw(&val, sizeof(val)); } - bool has_direct_io() const { - return fd != -1; - } - 
~impl() { if (fd != -1) { close(fd); @@ -320,6 +313,21 @@ struct llama_file::impl { int fd = -1; #endif + void read_raw_at(void * ptr, size_t len, size_t offset) const { + if (alignment != 1) { + read_aligned_chunk(offset, ptr, len); + } else { + seek(offset, SEEK_SET); + read_raw(ptr, len); + } + } + + size_t read_alignment() const { + return alignment; + } + + size_t alignment = 1; + FILE * fp{}; size_t size{}; }; @@ -331,7 +339,7 @@ llama_file::~llama_file() = default; size_t llama_file::tell() const { return pimpl->tell(); } size_t llama_file::size() const { return pimpl->size; } -bool llama_file::has_direct_io() const { return pimpl->has_direct_io(); } +size_t llama_file::read_alignment() const { return pimpl->read_alignment(); } int llama_file::file_id() const { #ifdef _WIN32 @@ -347,9 +355,7 @@ int llama_file::file_id() const { void llama_file::seek(size_t offset, int whence) const { pimpl->seek(offset, whence); } void llama_file::read_raw(void * ptr, size_t len) const { pimpl->read_raw(ptr, len); } -void llama_file::read_aligned_chunk(size_t offset, void * dest, size_t size, size_t alignment) const - { pimpl->read_aligned_chunk(offset, dest, size, alignment); } - +void llama_file::read_raw_at(void * ptr, size_t len, size_t offset) const { pimpl->read_raw_at(ptr, len, offset); } uint32_t llama_file::read_u32() const { return pimpl->read_u32(); } diff --git a/src/llama-mmap.h b/src/llama-mmap.h index 5a9361e4c37..729aac164b8 100644 --- a/src/llama-mmap.h +++ b/src/llama-mmap.h @@ -3,6 +3,7 @@ #include #include #include +#include struct llama_file; struct llama_mmap; @@ -24,13 +25,14 @@ struct llama_file { void seek(size_t offset, int whence) const; void read_raw(void * ptr, size_t len) const; - void read_aligned_chunk(size_t offset, void * dest, size_t size, size_t alignment) const; + void read_raw_at(void * ptr, size_t len, size_t offset) const; + void read_aligned_chunk(size_t offset, void * dest, size_t size) const; uint32_t read_u32() const; void write_raw(const void * ptr, size_t len) const; void write_u32(uint32_t val) const; - bool has_direct_io() const; + size_t read_alignment() const; private: struct impl; std::unique_ptr pimpl; diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index 1aa0dae0f59..1c5b1153ba1 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -934,16 +934,14 @@ bool llama_model_loader::load_all_data( // NVMe raid configurations might require more / larger buffers. constexpr size_t n_buffers = 4; - - bool direct_io = false; - for (const auto& file : files) { - direct_io |= file->has_direct_io(); + size_t alignment = 1; + for (const auto & file : files) { + alignment = std::max(file->read_alignment(), alignment); } - constexpr size_t alignment = 4 * 1024; // 4 KB for Direct I/O // Buffer size: balance between memory usage and I/O efficiency // 64MB works well for NVMe drives - const size_t buffer_size = direct_io ? 64 * 1024 * 1024 + 2 * alignment : 1 * 1024 * 1024; + const size_t buffer_size = alignment != 1 ? 
64 * 1024 * 1024 + 2 * alignment : 1 * 1024 * 1024; std::vector host_buffers; std::vector events; @@ -1077,12 +1075,7 @@ bool llama_model_loader::load_all_data( const auto & file = files.at(weight->idx); if (ggml_backend_buffer_is_host(cur->buffer)) { - if (file->has_direct_io()) { - file->read_aligned_chunk(weight->offs, cur->data, n_size, alignment); - } else { - file->seek(weight->offs, SEEK_SET); - file->read_raw(cur->data, n_size); - } + file->read_raw_at(cur->data, n_size, weight->offs); if (check_tensors) { validation_result.emplace_back(std::async(std::launch::async, [cur, n_size] { return std::make_pair(cur, ggml_validate_row_data(cur->type, cur->data, n_size)); @@ -1091,81 +1084,59 @@ bool llama_model_loader::load_all_data( } else { // If upload_backend is valid load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU. if (upload_backend) { - if (file->has_direct_io()) { - auto offset = (off_t) weight->offs; - off_t aligned_offset = offset & ~(alignment - 1); - off_t offset_from_alignment = offset - aligned_offset; - file->seek(aligned_offset, SEEK_SET); - - // Calculate aligned read boundaries - size_t read_start = aligned_offset; - size_t read_end = (offset + n_size + alignment - 1) & ~(alignment - 1); - - size_t bytes_read = 0; - size_t data_read = 0; // Actual tensor data copied (excluding padding) + auto offset = (off_t) weight->offs; + off_t aligned_offset = offset & ~(alignment - 1); + off_t offset_from_alignment = offset - aligned_offset; + file->seek(aligned_offset, SEEK_SET); - while (bytes_read < read_end - read_start) { - size_t read_size = std::min(buffer_size, read_end - read_start - bytes_read); + // Calculate aligned read boundaries + size_t read_start = aligned_offset; + size_t read_end = (offset + n_size + alignment - 1) & ~(alignment - 1); - // Align the destination pointer within the pinned buffer - uintptr_t ptr_dest_aligned = (reinterpret_cast(host_ptrs[buffer_idx]) + alignment - 1) & ~(alignment - 1); + size_t bytes_read = 0; + size_t data_read = 0; // Actual tensor data copied (excluding padding) - // Wait for previous upload to complete before reusing buffer - ggml_backend_event_synchronize(events[buffer_idx]); + while (bytes_read < read_end - read_start) { + size_t read_size = std::min(buffer_size, read_end - read_start - bytes_read); - // Read aligned chunk from file - file->read_raw(reinterpret_cast(ptr_dest_aligned), read_size); + // Align the destination pointer within the pinned buffer + uintptr_t ptr_dest_aligned = (reinterpret_cast(host_ptrs[buffer_idx]) + alignment - 1) & ~(alignment - 1); - // Calculate actual data portion (excluding alignment padding) - uintptr_t ptr_data = ptr_dest_aligned; - size_t data_to_copy = read_size; + // Wait for previous upload to complete before reusing buffer + ggml_backend_event_synchronize(events[buffer_idx]); - // Skip alignment padding at start of first chunk - if (bytes_read == 0) { - ptr_data += offset_from_alignment; - data_to_copy -= offset_from_alignment; - } + // Read aligned chunk from file + file->read_raw(reinterpret_cast(ptr_dest_aligned), read_size); - // Trim alignment padding at end of last chunk - if (aligned_offset + bytes_read + read_size > offset + n_size) { - data_to_copy -= (read_end - (offset + n_size)); - } + // Calculate actual data portion (excluding alignment padding) + uintptr_t ptr_data = ptr_dest_aligned; + size_t data_to_copy = read_size; - // Async upload actual data to GPU - ggml_backend_tensor_set_async(upload_backend, cur, - 
reinterpret_cast(ptr_data), data_read, data_to_copy); - ggml_backend_event_record(events[buffer_idx], upload_backend); - - data_read += data_to_copy; - bytes_read += read_size; + // Skip alignment padding at start of first chunk + if (bytes_read == 0) { + ptr_data += offset_from_alignment; + data_to_copy -= offset_from_alignment; + } - ++buffer_idx; - buffer_idx %= n_buffers; + // Trim alignment padding at end of last chunk + if (aligned_offset + bytes_read + read_size > offset + n_size) { + data_to_copy -= (read_end - (offset + n_size)); } - } else { - size_t bytes_read = 0; - while (bytes_read < n_size) { - size_t read_iteration = std::min(buffer_size, n_size - bytes_read); + // Async upload actual data to GPU + ggml_backend_tensor_set_async(upload_backend, cur, + reinterpret_cast(ptr_data), data_read, data_to_copy); + ggml_backend_event_record(events[buffer_idx], upload_backend); - ggml_backend_event_synchronize(events[buffer_idx]); - file->read_raw(host_ptrs[buffer_idx], read_iteration); - ggml_backend_tensor_set_async(upload_backend, cur, host_ptrs[buffer_idx], bytes_read, read_iteration); - ggml_backend_event_record(events[buffer_idx], upload_backend); + data_read += data_to_copy; + bytes_read += read_size; - bytes_read += read_iteration; - ++buffer_idx; - buffer_idx %= n_buffers; - } + ++buffer_idx; + buffer_idx %= n_buffers; } } else { read_buf.resize(n_size); - if (file->has_direct_io()) { - file->read_aligned_chunk(weight->offs, read_buf.data(), n_size, alignment); - } else { - file->seek(weight->offs, SEEK_SET); - file->read_raw(read_buf.data(), n_size); - } + file->read_raw_at(read_buf.data(), n_size, weight->offs); ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size); if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) { throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur))); From 99fde7260d46a229dd1567c3ac3d40b2b74902b8 Mon Sep 17 00:00:00 2001 From: JTischbein Date: Wed, 17 Dec 2025 13:32:25 +0100 Subject: [PATCH 09/10] Always use alignment from llama_file --- src/llama-model-loader.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index 1c5b1153ba1..64eddf221a2 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -1085,6 +1085,7 @@ bool llama_model_loader::load_all_data( // If upload_backend is valid load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU. 
if (upload_backend) { auto offset = (off_t) weight->offs; + alignment = file->read_alignment(); off_t aligned_offset = offset & ~(alignment - 1); off_t offset_from_alignment = offset - aligned_offset; file->seek(aligned_offset, SEEK_SET); From 921d7c98089a42438ebd391fb0a76620bd161f67 Mon Sep 17 00:00:00 2001 From: JTischbein Date: Wed, 17 Dec 2025 15:49:33 +0100 Subject: [PATCH 10/10] use_mmap=true --- common/common.h | 2 +- src/llama-model-loader.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/common/common.h b/common/common.h index 3cd531f6867..2fd83f0cf9c 100644 --- a/common/common.h +++ b/common/common.h @@ -413,7 +413,7 @@ struct common_params { bool kv_unified = false; // enable unified KV cache bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix - bool use_mmap = false; // use uncached reads for faster loads + bool use_mmap = true; // use mmap for faster loads bool use_mlock = false; // use mlock to keep model in memory bool verbose_prompt = false; // print prompt tokens before generation bool display_prompt = true; // print prompt before generation diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index 64eddf221a2..c50ca831e01 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -571,7 +571,7 @@ llama_model_loader::llama_model_loader( } } - files.emplace_back(new llama_file(fname_split, "rb")); + files.emplace_back(new llama_file(fname_split, "rb", !use_mmap)); contexts.emplace_back(ctx); // Save tensors data offset info of the shard.
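
For reference, the aligned-read pattern that read_aligned_chunk and the pinned-buffer upload loop implement works as follows: O_DIRECT requires the file offset, the transfer size, and the destination buffer address to all be multiples of the filesystem block size, so the requested window [offset, offset + size) is rounded outward to block boundaries, read into a block-aligned staging buffer, and only the requested bytes are copied out past the head padding. Below is a minimal standalone sketch of that pattern; it is not part of the patch, and read_direct_chunk, free_deleter, and the command-line handling in main are placeholder names chosen for illustration.

// Standalone sketch of the O_DIRECT aligned-read pattern used by read_aligned_chunk
// in llama-mmap.cpp. Illustrative only; not part of the patch.
#ifndef _GNU_SOURCE
#define _GNU_SOURCE            // O_DIRECT is a GNU extension
#endif

#include <algorithm>
#include <cerrno>
#include <cstdlib>
#include <cstring>
#include <memory>
#include <stdexcept>
#include <string>
#include <vector>

#include <fcntl.h>
#include <sys/stat.h>
#include <unistd.h>

struct free_deleter { void operator()(void * p) const { std::free(p); } };

// Read `size` bytes starting at `offset` into `dest`, honoring the O_DIRECT
// constraint that file offset, transfer size, and buffer address are all
// multiples of `alignment`.
static void read_direct_chunk(int fd, size_t offset, void * dest, size_t size, size_t alignment) {
    // Round the requested window [offset, offset + size) outward to block boundaries.
    size_t aligned_offset = offset & ~(alignment - 1);
    size_t head_padding   = offset - aligned_offset;
    size_t bytes_to_read  = (head_padding + size + alignment - 1) & ~(alignment - 1);

    void * raw = nullptr;
    if (posix_memalign(&raw, alignment, bytes_to_read) != 0) {
        throw std::runtime_error("posix_memalign failed");
    }
    std::unique_ptr<void, free_deleter> staging(raw);

    size_t done = 0;
    while (done < bytes_to_read) {
        ssize_t n = pread(fd, static_cast<char *>(staging.get()) + done,
                          bytes_to_read - done, (off_t) (aligned_offset + done));
        if (n < 0) {
            if (errno == EINTR) { continue; }   // interrupted by signal, retry
            throw std::runtime_error(std::string("pread failed: ") + std::strerror(errno));
        }
        if (n == 0) { break; }                  // EOF: the last block may be short
        done += (size_t) n;
    }
    if (done < head_padding + size) {
        throw std::runtime_error("unexpectedly reached end of file");
    }

    // Copy only the bytes the caller asked for, skipping the head padding.
    std::memcpy(dest, static_cast<char *>(staging.get()) + head_padding, size);
}

int main(int argc, char ** argv) {
    if (argc < 2) { return 1; }

    int fd = open(argv[1], O_RDONLY | O_DIRECT);
    if (fd == -1) {
        fd = open(argv[1], O_RDONLY);           // fall back when the filesystem rejects O_DIRECT
    }
    if (fd == -1) { return 1; }

    struct stat st{};
    fstat(fd, &st);
    size_t alignment = (size_t) st.st_blksize;  // same alignment source as patch 08

    if ((size_t) st.st_size > 3) {
        // Deliberately unaligned request: up to 100 bytes starting at offset 3.
        std::vector<char> buf(std::min<size_t>(100, (size_t) st.st_size - 3));
        read_direct_chunk(fd, 3, buf.data(), buf.size(), alignment);
    }
    close(fd);
    return 0;
}

The same rounding explains the staging buffer size in the loader: the extra 2 * alignment bytes on top of the 64 MiB chunk leave room both for aligning the pinned destination pointer up to the next block boundary and for the read window being rounded up past the end of the tensor data.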