2 changes: 1 addition & 1 deletion common/common.h
@@ -413,7 +413,7 @@ struct common_params {
bool kv_unified = false; // enable unified KV cache

bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
bool use_mmap = true; // use mmap for faster loads
Member

Changing this to false by default results in a huge slowdown on macOS with default arguments:

time ./bin/llama-completion -m ../models/gpt-oss-120b/ggml-model-mxfp4.gguf -p "hello" -n 1 -no-cnv

# master
real	0m4.648s

# PR
real	0m17.957s

Not sure what the best way to handle this is. If we keep it true, then Linux users would not get the benefit of Direct IO. If we switch to false, Mac users will take the hit.

Contributor Author

Would it be OK to set mmap depending on the platform?
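
For illustration, a platform-dependent default might look something like the sketch below (a stand-in struct, not the real common_params; the compile-time guard and values are assumptions for discussion, not code from this PR):

```cpp
// Hypothetical sketch: pick the use_mmap default per platform at compile time.
struct load_defaults {
#if defined(__linux__)
    bool use_mmap = false;  // Linux: uncached/direct reads tend to win on cold loads
#else
    bool use_mmap = true;   // macOS/Windows: mmap remains the faster default
#endif
};
```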

Member

We don't have such a precedent atm for any of the parameters in common, so I would say it's not ideal.

Contributor Author

On an M4 Pro with GPT-OSS-20B, a cold load takes 4.168s with --no-mmap and 6.3s with --mmap. A warm load, however, takes 2.1s with --mmap (--no-mmap is still ~4.1s).

Measured using time ./llama-cli -m /Users/jtischbein/Documents/models/openai_gpt-oss-20b-MXFP4.gguf --no-mmap -p "bla" -n 0 --single-turn, with the filesystem cache cleared using purge.

So the cold load is still faster with --no-mmap, but unfortunately the gain is not as large as on Linux.

Member

We can do the following:

  • Add new CLI argument --direct-io, -dio
  • Description: "Use DirectIO if available. Takes precedence over --mmap"
  • Keep use_mmap == true and use_direct_io == true
  • On Mac, the internal implementation will determine that DIO is not available, so it will fall back to mmap

Might want to do it in a separate PR, as it would require changes to the libllama API. This PR should keep use_mmap == true by default.
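
For reference, a rough sketch of how the proposed flags and fallback could fit together (the stand-in struct and the direct_io_supported() helper are hypothetical, not part of this PR or the libllama API):

```cpp
#include <cstdio>

// Stand-in for the relevant common_params fields under the proposal above.
struct load_params {
    bool use_mmap      = true;  // stays true by default
    bool use_direct_io = true;  // new --direct-io / -dio flag; takes precedence over --mmap
};

// Hypothetical probe: true only where O_DIRECT (or an equivalent) is available.
static bool direct_io_supported() {
#if defined(__linux__)
    return true;
#else
    return false;  // e.g. macOS: no O_DIRECT, so fall back to mmap
#endif
}

int main() {
    load_params params;
    if (params.use_direct_io && direct_io_supported()) {
        std::puts("loading with direct I/O");
    } else if (params.use_mmap) {
        std::puts("loading with mmap");
    } else {
        std::puts("loading with buffered reads");
    }
    return 0;
}
```

The key point is the precedence: direct I/O is tried first, and platforms without O_DIRECT silently fall back to mmap.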

Contributor Author

Sounds good

bool use_mmap = false; // use uncached reads for faster loads
bool use_mlock = false; // use mlock to keep model in memory
bool verbose_prompt = false; // print prompt tokens before generation
bool display_prompt = true; // print prompt before generation
151 changes: 123 additions & 28 deletions src/llama-mmap.cpp
@@ -13,9 +13,10 @@
#ifdef __has_include
#if __has_include(<unistd.h>)
#include <unistd.h>
#include <fcntl.h>
#include <sys/stat.h>
#if defined(_POSIX_MAPPED_FILES)
#include <sys/mman.h>
#include <fcntl.h>
#endif
#if defined(_POSIX_MEMLOCK_RANGE)
#include <sys/resource.h>
@@ -74,7 +75,7 @@ struct llama_file::impl {
return ret;
}

impl(const char * fname, const char * mode) {
impl(const char * fname, const char * mode, [[maybe_unused]] const bool use_direct_io = false) {
fp = ggml_fopen(fname, mode);
if (fp == NULL) {
throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
@@ -153,13 +154,40 @@ struct llama_file::impl {
write_raw(&val, sizeof(val));
}

void read_aligned_chunk(size_t offset, void * dest, size_t size) const {
throw std::runtime_error("DirectIO is not implemented on Windows.");
}

~impl() {
if (fp) {
std::fclose(fp);
}
}
#else
impl(const char * fname, const char * mode) {
impl(const char * fname, const char * mode, [[maybe_unused]] const bool use_direct_io = false) {
#ifdef __linux__
// Try unbuffered I/O for read only
if (use_direct_io && std::strcmp(mode, "rb") == 0) {
fd = open(fname, O_RDONLY | O_DIRECT);

if (fd != -1) {
struct stat file_stats{};
fstat(fd, &file_stats);

size = file_stats.st_size;
alignment = file_stats.st_blksize;

off_t ret = lseek(fd, 0, SEEK_SET);
if (ret == -1) {
throw std::runtime_error(format("seek error: %s", strerror(errno)));
}
return;
}

LLAMA_LOG_WARN("Failed to open model %s with error: %s. Falling back to buffered I/O",
fname, strerror(errno));
}
#endif
fp = ggml_fopen(fname, mode);
if (fp == NULL) {
throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
@@ -170,27 +198,30 @@
}

size_t tell() const {
// TODO: this ifdef is never true?
#ifdef _WIN32
__int64 ret = _ftelli64(fp);
#else
long ret = std::ftell(fp);
#endif
if (ret == -1) {
throw std::runtime_error(format("ftell error: %s", strerror(errno)));
if (fd == -1) {
long ret = std::ftell(fp);
if (ret == -1) {
throw std::runtime_error(format("ftell error: %s", strerror(errno)));
}

return (size_t) ret;
}

return (size_t) ret;
off_t pos = lseek(fd, 0, SEEK_CUR);
if (pos == -1) {
throw std::runtime_error(format("lseek error: %s", strerror(errno)));
}
return (size_t) pos;
}

void seek(size_t offset, int whence) const {
// TODO: this ifdef is never true?
#ifdef _WIN32
int ret = _fseeki64(fp, (__int64) offset, whence);
#else
int ret = std::fseek(fp, (long) offset, whence);
#endif
if (ret != 0) {
off_t ret = 0;
if (fd == -1) {
ret = std::fseek(fp, (long) offset, whence);
} else {
ret = lseek(fd, offset, whence);
}
if (ret == -1) {
throw std::runtime_error(format("seek error: %s", strerror(errno)));
}
}
@@ -200,13 +231,55 @@
return;
}
errno = 0;
std::size_t ret = std::fread(ptr, len, 1, fp);
if (ferror(fp)) {
throw std::runtime_error(format("read error: %s", strerror(errno)));
if (fd == -1) {
std::size_t ret = std::fread(ptr, len, 1, fp);
if (ferror(fp)) {
throw std::runtime_error(format("read error: %s", strerror(errno)));
}
if (ret != 1) {
throw std::runtime_error("unexpectedly reached end of file");
}
} else {
bool successful = false;
while (!successful) {
off_t ret = read(fd, ptr, len);

if (ret == -1) {
if (errno == EINTR) {
continue; // Interrupted by signal, retry
}
throw std::runtime_error(format("read error: %s", strerror(errno)));
}
if (ret == 0) {
throw std::runtime_error("unexpectedly reached end of file");
}

successful = true;
}
}
if (ret != 1) {
throw std::runtime_error("unexpectedly reached end of file");
}

void read_aligned_chunk(size_t offset, void * dest, size_t size) const {
off_t aligned_offset = offset & ~(alignment - 1);
off_t offset_from_alignment = offset - aligned_offset;
size_t bytes_to_read = (offset_from_alignment + size + alignment - 1) & ~(alignment - 1);

void * raw_buffer = nullptr;
int ret = posix_memalign(&raw_buffer, alignment, bytes_to_read);
if (ret != 0) {
throw std::runtime_error(format("posix_memalign failed with error %d", ret));
}

struct aligned_buffer_deleter {
void operator()(void * p) const { free(p); }
};
std::unique_ptr<void, aligned_buffer_deleter> buffer(raw_buffer);

seek(aligned_offset, SEEK_SET);
read_raw(buffer.get(), bytes_to_read);

uintptr_t actual_data = reinterpret_cast<uintptr_t>(buffer.get()) + offset_from_alignment;
memcpy(dest, reinterpret_cast<void *>(actual_data), size);
}

uint32_t read_u32() const {
@@ -231,22 +304,43 @@
}

~impl() {
if (fp) {
if (fd != -1) {
close(fd);
} else {
std::fclose(fp);
}
}
int fd = -1;
#endif

FILE * fp;
size_t size;
void read_raw_at(void * ptr, size_t len, size_t offset) const {
if (alignment != 1) {
read_aligned_chunk(offset, ptr, len);
} else {
seek(offset, SEEK_SET);
read_raw(ptr, len);
}
}

size_t read_alignment() const {
return alignment;
}

size_t alignment = 1;

FILE * fp{};
size_t size{};
};

llama_file::llama_file(const char * fname, const char * mode) : pimpl(std::make_unique<impl>(fname, mode)) {}
llama_file::llama_file(const char * fname, const char * mode, const bool use_direct_io) :
pimpl(std::make_unique<impl>(fname, mode, use_direct_io)) {}
llama_file::~llama_file() = default;

size_t llama_file::tell() const { return pimpl->tell(); }
size_t llama_file::size() const { return pimpl->size; }

size_t llama_file::read_alignment() const { return pimpl->read_alignment(); }

int llama_file::file_id() const {
#ifdef _WIN32
return _fileno(pimpl->fp);
@@ -261,6 +355,7 @@ int llama_file::file_id() const {

void llama_file::seek(size_t offset, int whence) const { pimpl->seek(offset, whence); }
void llama_file::read_raw(void * ptr, size_t len) const { pimpl->read_raw(ptr, len); }
void llama_file::read_raw_at(void * ptr, size_t len, size_t offset) const { pimpl->read_raw_at(ptr, len, offset); }

uint32_t llama_file::read_u32() const { return pimpl->read_u32(); }

6 changes: 5 additions & 1 deletion src/llama-mmap.h
@@ -3,6 +3,7 @@
#include <cstdint>
#include <memory>
#include <vector>
#include <cstdio>

struct llama_file;
struct llama_mmap;
Expand All @@ -13,7 +14,7 @@ using llama_mmaps = std::vector<std::unique_ptr<llama_mmap>>;
using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>;

struct llama_file {
llama_file(const char * fname, const char * mode);
llama_file(const char * fname, const char * mode, bool use_direct_io = false);
~llama_file();

size_t tell() const;
@@ -24,11 +25,14 @@ struct llama_file {
void seek(size_t offset, int whence) const;

void read_raw(void * ptr, size_t len) const;
void read_raw_at(void * ptr, size_t len, size_t offset) const;
void read_aligned_chunk(size_t offset, void * dest, size_t size) const;
uint32_t read_u32() const;

void write_raw(const void * ptr, size_t len) const;
void write_u32(uint32_t val) const;

size_t read_alignment() const;
private:
struct impl;
std::unique_ptr<impl> pimpl;
67 changes: 55 additions & 12 deletions src/llama-model-loader.cpp
@@ -503,7 +503,7 @@ llama_model_loader::llama_model_loader(
get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
llm_kv = LLM_KV(llm_arch_from_string(arch_name));

files.emplace_back(new llama_file(fname.c_str(), "rb"));
files.emplace_back(new llama_file(fname.c_str(), "rb", !use_mmap));
contexts.emplace_back(ctx);

// Save tensors data offset of the main file.
@@ -933,7 +933,15 @@ bool llama_model_loader::load_all_data(
// 4 staging buffers for async uploads, each sized 1MB seems to be a good default for single NVMe drives.
// NVMe raid configurations might require more / larger buffers.
constexpr size_t n_buffers = 4;
constexpr size_t buffer_size = 1 * 1024 * 1024; // 1MB

size_t alignment = 1;
for (const auto & file : files) {
alignment = std::max(file->read_alignment(), alignment);
}

// Buffer size: balance between memory usage and I/O efficiency
// 64MB works well for NVMe drives
const size_t buffer_size = alignment != 1 ? 64 * 1024 * 1024 + 2 * alignment : 1 * 1024 * 1024;

std::vector<ggml_backend_buffer_t> host_buffers;
std::vector<ggml_backend_event_t> events;
@@ -983,6 +991,7 @@
// If the backend is supported, create pinned memory buffers and events for synchronisation.
for (size_t idx = 0; idx < n_buffers; ++idx) {
auto * buf = ggml_backend_buft_alloc_buffer(host_buft, buffer_size);

if (!buf) {
LLAMA_LOG_DEBUG("%s: failed to allocate host buffer for async uploads for device %s\n", func,
ggml_backend_dev_name(dev));
@@ -1064,9 +1073,9 @@
}
} else {
const auto & file = files.at(weight->idx);

if (ggml_backend_buffer_is_host(cur->buffer)) {
file->seek(weight->offs, SEEK_SET);
file->read_raw(cur->data, n_size);
file->read_raw_at(cur->data, n_size, weight->offs);
if (check_tensors) {
validation_result.emplace_back(std::async(std::launch::async, [cur, n_size] {
return std::make_pair(cur, ggml_validate_row_data(cur->type, cur->data, n_size));
@@ -1075,26 +1084,60 @@
} else {
// If upload_backend is valid load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU.
if (upload_backend) {
file->seek(weight->offs, SEEK_SET);
auto offset = (off_t) weight->offs;
alignment = file->read_alignment();
off_t aligned_offset = offset & ~(alignment - 1);
off_t offset_from_alignment = offset - aligned_offset;
file->seek(aligned_offset, SEEK_SET);

// Calculate aligned read boundaries
size_t read_start = aligned_offset;
size_t read_end = (offset + n_size + alignment - 1) & ~(alignment - 1);

size_t bytes_read = 0;
size_t data_read = 0; // Actual tensor data copied (excluding padding)

while (bytes_read < read_end - read_start) {
size_t read_size = std::min<size_t>(buffer_size, read_end - read_start - bytes_read);

while (bytes_read < n_size) {
size_t read_iteration = std::min<size_t>(buffer_size, n_size - bytes_read);
// Align the destination pointer within the pinned buffer
uintptr_t ptr_dest_aligned = (reinterpret_cast<uintptr_t>(host_ptrs[buffer_idx]) + alignment - 1) & ~(alignment - 1);

// Wait for previous upload to complete before reusing buffer
ggml_backend_event_synchronize(events[buffer_idx]);
file->read_raw(host_ptrs[buffer_idx], read_iteration);
ggml_backend_tensor_set_async(upload_backend, cur, host_ptrs[buffer_idx], bytes_read, read_iteration);

// Read aligned chunk from file
file->read_raw(reinterpret_cast<void *>(ptr_dest_aligned), read_size);

// Calculate actual data portion (excluding alignment padding)
uintptr_t ptr_data = ptr_dest_aligned;
size_t data_to_copy = read_size;

// Skip alignment padding at start of first chunk
if (bytes_read == 0) {
ptr_data += offset_from_alignment;
data_to_copy -= offset_from_alignment;
}

// Trim alignment padding at end of last chunk
if (aligned_offset + bytes_read + read_size > offset + n_size) {
data_to_copy -= (read_end - (offset + n_size));
}

// Async upload actual data to GPU
ggml_backend_tensor_set_async(upload_backend, cur,
reinterpret_cast<void *>(ptr_data), data_read, data_to_copy);
ggml_backend_event_record(events[buffer_idx], upload_backend);

bytes_read += read_iteration;
data_read += data_to_copy;
bytes_read += read_size;

++buffer_idx;
buffer_idx %= n_buffers;
}
} else {
read_buf.resize(n_size);
file->seek(weight->offs, SEEK_SET);
file->read_raw(read_buf.data(), n_size);
file->read_raw_at(read_buf.data(), n_size, weight->offs);
ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));