2 changes: 1 addition & 1 deletion common/common.h
@@ -413,7 +413,7 @@ struct common_params {
bool kv_unified = false; // enable unified KV cache

bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
bool use_mmap = true; // use mmap for faster loads
Member

Changing this to false by default results in a huge slowdown on macOS with the default arguments:

time ./bin/llama-completion -m ../models/gpt-oss-120b/ggml-model-mxfp4.gguf -p "hello" -n 1 -no-cnv

# master
real	0m4.648s

# PR
real	0m17.957s

Not sure what the best way to handle this is. If we keep it true, Linux users won't get the benefit of Direct I/O; if we switch to false, Mac users will take the hit.

Contributor Author

Would it be OK to set the mmap default depending on the platform?

Member

We don't have such a precedent at the moment for any of the parameters in common, so I would say it's not ideal.

Contributor Author

On an M4 Pro with GPT-OSS-20B I get, for a cold load, 4.168s with --no-mmap and 6.3s with --mmap. A warm load with --mmap, however, takes 2.1s (--no-mmap stays at ~4.1s).

Measured using time ./llama-cli -m /Users/jtischbein/Documents/models/openai_gpt-oss-20b-MXFP4.gguf --no-mmap -p "bla" -n 0 --single-turn, with the filesystem cache cleared using purge.

So the cold load is still faster with --no-mmap, but unfortunately not as fast as on Linux.

Member

We can do the following:

  • Add a new CLI argument --direct-io, -dio
  • Description: "Use Direct I/O if available. Takes precedence over --mmap"
  • Keep use_mmap == true and use_direct_io == true
  • On Mac, the internal implementation will determine that Direct I/O is not available, so it will fall back to mmap

Might want to do it in a separate PR as it would require changes to the libllama API. This PR should keep use_mmap == true by default.
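
For illustration, a minimal sketch of how that precedence could be wired up; the names here (common_params_io, use_direct_io, should_use_direct_io) are hypothetical and not part of the current API:

```cpp
// Hypothetical sketch only: assumes a new use_direct_io flag alongside use_mmap.
struct common_params_io {
    bool use_mmap      = true;  // stays true by default, as on master
    bool use_direct_io = true;  // proposed --direct-io / -dio flag
};

// Direct I/O takes precedence over mmap, but only where the platform supports it.
static bool should_use_direct_io(const common_params_io & p) {
#if defined(__linux__)
    // O_DIRECT can still be refused by the filesystem at open() time,
    // in which case the loader would fall back to a plain buffered read.
    return p.use_direct_io;
#else
    (void) p;
    return false; // e.g. macOS has no O_DIRECT, so the mmap path is kept
#endif
}
```

With a split like this, macOS keeps the current mmap default and Linux opts into Direct I/O without changing the meaning of --no-mmap.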

Contributor Author

Sounds good

bool use_mmap = false; // use uncached reads for faster loads
bool use_mlock = false; // use mlock to keep model in memory
bool verbose_prompt = false; // print prompt tokens before generation
bool display_prompt = true; // print prompt before generation
137 changes: 134 additions & 3 deletions src/llama-mmap.cpp
@@ -13,9 +13,10 @@
#ifdef __has_include
#if __has_include(<unistd.h>)
#include <unistd.h>
#include <fcntl.h>
#include <sys/stat.h>
#if defined(_POSIX_MAPPED_FILES)
#include <sys/mman.h>
#include <fcntl.h>
#endif
#if defined(_POSIX_MEMLOCK_RANGE)
#include <sys/resource.h>
@@ -158,6 +159,133 @@ struct llama_file::impl {
std::fclose(fp);
}
}
#elif defined(__linux__)
impl(const char * fname, const char * mode) : impl(fname, mode, false) {}

impl(const char * fname, const char * mode, bool uncached_read) {
if (uncached_read) {
fd = open(fname, O_RDONLY | O_DIRECT);
if (fd == -1 && (errno == EINVAL || errno == EOPNOTSUPP)) {
fd = open(fname, O_RDONLY); // retry without O_DIRECT
}

if (fd == -1) {
throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
}

struct stat file_stats{};
if (fstat(fd, &file_stats) == -1) {
throw std::runtime_error(format("fstat error: %s", strerror(errno)));
}

size = file_stats.st_size;

off_t ret = lseek(fd, 0, SEEK_SET);
if (ret == -1) {
throw std::runtime_error(format("seek error: %s", strerror(errno)));
}
} else {
fp = ggml_fopen(fname, mode);
if (fp == NULL) {
throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
}
seek(0, SEEK_END);
size = tell();
seek(0, SEEK_SET);
}
}

size_t tell() const {
if (fd == -1) {
long ret = std::ftell(fp);
if (ret == -1) {
throw std::runtime_error(format("ftell error: %s", strerror(errno)));
}

return (size_t) ret;
}

off_t pos = lseek(fd, 0, SEEK_CUR);
if (pos == -1) {
throw std::runtime_error(format("lseek error: %s", strerror(errno)));
}
return (size_t) pos;
}

void seek(size_t offset, int whence) const {
off_t ret = 0;
if (fd == -1) {
ret = std::fseek(fp, (long) offset, whence);
} else {
ret = lseek(fd, offset, whence);
}
if (ret == -1) {
throw std::runtime_error(format("seek error: %s", strerror(errno)));
}
}

void read_raw(void * ptr, size_t len) const {
if (len == 0) {
return;
}
if (fd == -1) {
errno = 0;
std::size_t ret = std::fread(ptr, len, 1, fp);
if (ferror(fp)) {
throw std::runtime_error(format("read error: %s", strerror(errno)));
}
if (ret != 1) {
throw std::runtime_error("unexpectedly reached end of file");
}
} else {
// O_DIRECT reads may return fewer bytes than requested (partial reads, or
// EOF when an alignment-padded request extends past the end of the file),
// so keep reading until the requested length has been consumed.
size_t total = 0;
while (total < len) {
ssize_t ret = read(fd, (char *) ptr + total, len - total);

if (ret == -1) {
if (errno == EINTR) {
continue; // interrupted by signal, retry
}
throw std::runtime_error(format("read error: %s", strerror(errno)));
}
if (ret == 0) {
if (total == 0) {
throw std::runtime_error("unexpectedly reached end of file");
}
break; // EOF on a padded read; callers only use the valid prefix
}

total += (size_t) ret;
}
}
}

uint32_t read_u32() const {
uint32_t ret;
read_raw(&ret, sizeof(ret));
return ret;
}

void write_raw(const void * ptr, size_t len) const {
if (len == 0) {
return;
}
// writes only go through the buffered stdio path; fd-based (O_DIRECT) files are opened read-only
errno = 0;
size_t ret = std::fwrite(ptr, len, 1, fp);
if (ret != 1) {
throw std::runtime_error(format("write error: %s", strerror(errno)));
}
}

void write_u32(uint32_t val) const {
write_raw(&val, sizeof(val));
}

~impl() {
if (fp) {
std::fclose(fp);
} else if (fd != -1) {
close(fd);
}
}

int fd = -1;

#else
impl(const char * fname, const char * mode) {
fp = ggml_fopen(fname, mode);
@@ -237,11 +365,14 @@ struct llama_file::impl {
}
#endif

FILE * fp;
size_t size;
FILE * fp{};
size_t size{};
};

llama_file::llama_file(const char * fname, const char * mode) : pimpl(std::make_unique<impl>(fname, mode)) {}
#if defined(__linux__)
llama_file::llama_file(const char * fname, const char * mode, bool uncached_read) : pimpl(std::make_unique<impl>(fname, mode, uncached_read)) {}
#endif
llama_file::~llama_file() = default;

size_t llama_file::tell() const { return pimpl->tell(); }
3 changes: 3 additions & 0 deletions src/llama-mmap.h
@@ -14,6 +14,9 @@ using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>;

struct llama_file {
llama_file(const char * fname, const char * mode);
#if defined(__linux__)
llama_file(const char * fname, const char * mode, bool uncached_read);
#endif
~llama_file();

size_t tell() const;
111 changes: 111 additions & 0 deletions src/llama-model-loader.cpp
@@ -503,7 +503,11 @@ llama_model_loader::llama_model_loader(
get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
llm_kv = LLM_KV(llm_arch_from_string(arch_name));

#if defined(__linux__)
files.emplace_back(new llama_file(fname.c_str(), "rb", !use_mmap));
#else
files.emplace_back(new llama_file(fname.c_str(), "rb"));
#endif
contexts.emplace_back(ctx);

// Save tensors data offset of the main file.
@@ -571,7 +575,11 @@ llama_model_loader::llama_model_loader(
}
}

#if defined(__linux__)
files.emplace_back(new llama_file(fname_split, "rb", !use_mmap));
#else
files.emplace_back(new llama_file(fname_split, "rb"));
#endif
contexts.emplace_back(ctx);

// Save tensors data offset info of the shard.
@@ -933,7 +941,14 @@ bool llama_model_loader::load_all_data(
// 4 staging buffers for async uploads, each sized 1MB seems to be a good default for single NVMe drives.
// NVMe raid configurations might require more / larger buffers.
constexpr size_t n_buffers = 4;
#if defined(__linux__)
constexpr size_t alignment = 4 * 1024; // 4 KiB for Direct I/O
// Buffer size: balance between memory usage and I/O efficiency
// 64MB works well for NVMe drives
constexpr size_t buffer_size = 64 * 1024 * 1024; // 64 MiB
#else
constexpr size_t buffer_size = 1 * 1024 * 1024; // 1MB
#endif

std::vector<ggml_backend_buffer_t> host_buffers;
std::vector<ggml_backend_event_t> events;
@@ -982,7 +997,11 @@ bool llama_model_loader::load_all_data(

// If the backend is supported, create pinned memory buffers and events for synchronisation.
for (size_t idx = 0; idx < n_buffers; ++idx) {
#if defined(__linux__)
auto * buf = ggml_backend_buft_alloc_buffer(host_buft, buffer_size + 2 * alignment);
#else
auto * buf = ggml_backend_buft_alloc_buffer(host_buft, buffer_size);
#endif
if (!buf) {
LLAMA_LOG_DEBUG("%s: failed to allocate host buffer for async uploads for device %s\n", func,
ggml_backend_dev_name(dev));
@@ -1019,6 +1038,35 @@ bool llama_model_loader::load_all_data(
ggml_backend_name(upload_backend));
}

#if defined(__linux__)
// Read [offset, offset + size) from a file opened with O_DIRECT by widening the
// request to alignment boundaries and copying out only the requested bytes.
auto read_aligned_chunk = [](const llama_file * file,
size_t offset,
void * dest,
size_t size,
size_t alignment) {
off_t aligned_offset = offset & ~(alignment - 1);
off_t offset_from_alignment = offset - aligned_offset;
size_t bytes_to_read = (offset_from_alignment + size + alignment - 1) & ~(alignment - 1);

void * raw_buffer = nullptr;
int ret = posix_memalign(&raw_buffer, alignment, bytes_to_read);
if (ret != 0) {
throw std::runtime_error(format("posix_memalign failed with error %d", ret));
}

struct aligned_buffer_deleter {
void operator()(void * p) const { free(p); }
};
std::unique_ptr<void, aligned_buffer_deleter> buffer(raw_buffer);

file->seek(aligned_offset, SEEK_SET);
file->read_raw(buffer.get(), bytes_to_read);

uintptr_t actual_data = reinterpret_cast<uintptr_t>(buffer.get()) + offset_from_alignment;
memcpy(dest, reinterpret_cast<void *>(actual_data), size);
};
#endif

for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
const auto * weight = get_weight(ggml_get_name(cur));
if (weight == nullptr) {
@@ -1064,9 +1112,18 @@ bool llama_model_loader::load_all_data(
}
} else {
const auto & file = files.at(weight->idx);
#if defined(__linux__)
auto offset = (off_t) weight->offs;
off_t aligned_offset = offset & ~(alignment - 1);
off_t offset_from_alignment = offset - aligned_offset;
#endif
if (ggml_backend_buffer_is_host(cur->buffer)) {
#if defined(__linux__)
read_aligned_chunk(file.get(), weight->offs, cur->data, n_size, alignment);
#else
file->seek(weight->offs, SEEK_SET);
file->read_raw(cur->data, n_size);
#endif
if (check_tensors) {
validation_result.emplace_back(std::async(std::launch::async, [cur, n_size] {
return std::make_pair(cur, ggml_validate_row_data(cur->type, cur->data, n_size));
@@ -1075,6 +1132,55 @@
} else {
// If upload_backend is valid load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU.
if (upload_backend) {
#if defined(__linux__)
// Calculate aligned read boundaries
size_t read_start = aligned_offset;
size_t read_end = (offset + n_size + alignment - 1) & ~(alignment - 1);

size_t bytes_read = 0;
size_t data_read = 0; // Actual tensor data copied (excluding padding)

file->seek(aligned_offset, SEEK_SET);

while (bytes_read < read_end - read_start) {
size_t read_size = std::min<size_t>(buffer_size, read_end - read_start - bytes_read);

// Align the destination pointer within the pinned buffer
uintptr_t ptr_dest_aligned = (reinterpret_cast<uintptr_t>(host_ptrs[buffer_idx]) + alignment - 1) & ~(alignment - 1);

// Wait for previous upload to complete before reusing buffer
ggml_backend_event_synchronize(events[buffer_idx]);

// Read aligned chunk from file
file->read_raw(reinterpret_cast<void *>(ptr_dest_aligned), read_size);

// Calculate actual data portion (excluding alignment padding)
uintptr_t ptr_data = ptr_dest_aligned;
size_t data_to_copy = read_size;

// Skip alignment padding at start of first chunk
if (bytes_read == 0) {
ptr_data += offset_from_alignment;
data_to_copy -= offset_from_alignment;
}

// Trim alignment padding at end of last chunk
if (aligned_offset + bytes_read + read_size > offset + n_size) {
data_to_copy -= (read_end - (offset + n_size));
}

// Async upload actual data to GPU
ggml_backend_tensor_set_async(upload_backend, cur,
reinterpret_cast<void *>(ptr_data), data_read, data_to_copy);
ggml_backend_event_record(events[buffer_idx], upload_backend);

data_read += data_to_copy;
bytes_read += read_size;

++buffer_idx;
buffer_idx %= n_buffers;
}
#else
file->seek(weight->offs, SEEK_SET);

size_t bytes_read = 0;
Expand All @@ -1091,10 +1197,15 @@ bool llama_model_loader::load_all_data(
++buffer_idx;
buffer_idx %= n_buffers;
}
#endif
} else {
read_buf.resize(n_size);
#if defined(__linux__)
read_aligned_chunk(file.get(), weight->offs, read_buf.data(), n_size, alignment);
#else
file->seek(weight->offs, SEEK_SET);
file->read_raw(read_buf.data(), n_size);
#endif
ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));