8 changes: 8 additions & 0 deletions common/arg.cpp
@@ -2027,6 +2027,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.use_mmap = value;
}
).set_env("LLAMA_ARG_MMAP"));
add_opt(common_arg(
{"-dio", "--direct-io"},
{"-ndio", "--no-direct-io"},
string_format("Use DirectIO if available. Takes precedence over --mmap (default: %s)", params.use_direct_io ? "enabled" : "disabled"),
[](common_params & params, bool value) {
params.use_direct_io = value;
}
).set_env("LLAMA_ARG_DIO"));
add_opt(common_arg(
{"--numa"}, "TYPE",
"attempt optimizations that help on some NUMA systems\n"
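Note: the new option follows the same paired-flag pattern as --mmap/--no-mmap, so -dio/--direct-io set params.use_direct_io to true, -ndio/--no-direct-io set it to false, and LLAMA_ARG_DIO exposes the setting through the environment. A minimal sketch of consuming the parsed flag, assuming the usual common_params_parse() entry point and the LLAMA_EXAMPLE_COMMON id from common/arg.h and common/common.h (the sketch itself is not part of this patch):

#include "arg.h"
#include "common.h"

int main(int argc, char ** argv) {
    common_params params; // use_direct_io defaults to true (see common/common.h below)
    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
        return 1;
    }
    // After parsing, -ndio / --no-direct-io leaves params.use_direct_io == false,
    // and common_model_params_to_llama() forwards the value to the model loader.
    return 0;
}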
1 change: 1 addition & 0 deletions common/common.cpp
@@ -1347,6 +1347,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
mparams.split_mode = params.split_mode;
mparams.tensor_split = params.tensor_split;
mparams.use_mmap = params.use_mmap;
mparams.use_direct_io = params.use_direct_io;
mparams.use_mlock = params.use_mlock;
mparams.check_tensors = params.check_tensors;
mparams.use_extra_bufts = !params.no_extra_bufts;
3 changes: 2 additions & 1 deletion common/common.h
@@ -420,7 +420,8 @@ struct common_params {
bool kv_unified = false; // enable unified KV cache

bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
bool use_mmap = true; // use mmap for faster loads
bool use_mmap = true; // enable mmap to use filesystem cache
bool use_direct_io = true; // read from disk without buffering for faster model loading
bool use_mlock = false; // use mlock to keep model in memory
bool verbose_prompt = false; // print prompt tokens before generation
bool display_prompt = true; // print prompt before generation
1 change: 1 addition & 0 deletions examples/diffusion/diffusion-cli.cpp
@@ -553,6 +553,7 @@ int main(int argc, char ** argv) {
model_params.n_gpu_layers = params.n_gpu_layers;
model_params.devices = params.devices.data();
model_params.use_mmap = params.use_mmap;
model_params.use_direct_io = params.use_direct_io;
model_params.use_mlock = params.use_mlock;
model_params.check_tensors = params.check_tensors;

1 change: 1 addition & 0 deletions include/llama.h
@@ -309,6 +309,7 @@ extern "C" {
// Keep the booleans together to avoid misalignment during copy-by-value.
bool vocab_only; // only load the vocabulary, no weights
bool use_mmap; // use mmap if possible
bool use_direct_io; // use direct I/O if available; takes precedence over use_mmap
bool use_mlock; // force system to keep model in RAM
bool check_tensors; // validate model tensor data
bool use_extra_bufts; // use extra buffer types (used for weight repacking)
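Note: since the field defaults to true (see llama_model_default_params() further down), existing API users pick up direct I/O automatically wherever it is supported; turning it off is an explicit opt-out. A minimal sketch, assuming the existing llama_model_load_from_file()/llama_model_free() entry points and a placeholder model path (not part of this patch):

#include "llama.h"

int main() {
    llama_model_params mparams = llama_model_default_params();
    // use_direct_io is true by default and takes precedence over use_mmap.
    mparams.use_direct_io = false; // opt out: fall back to mmap-based loading

    llama_model * model = llama_model_load_from_file("model.gguf", mparams);
    if (model == NULL) {
        return 1;
    }
    llama_model_free(model);
    return 0;
}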
44 changes: 31 additions & 13 deletions src/llama-mmap.cpp
@@ -158,6 +158,14 @@ struct llama_file::impl {
throw std::runtime_error("DirectIO is not implemented on Windows.");
}

void read_raw_unsafe(void * ptr, size_t len) const {
throw std::runtime_error("DirectIO is not implemented on Windows.");
}

bool has_direct_io() const {
return true;
}

~impl() {
if (fp) {
std::fclose(fp);
@@ -226,7 +234,7 @@
}
}

void read_raw(void * ptr, size_t len) const {
void read_raw_unsafe(void * ptr, size_t len) const {
if (len == 0) {
return;
}
@@ -241,6 +249,7 @@
}
} else {
bool successful = false;
GGML_ASSERT(len % alignment == 0 && (uintptr_t) ptr % alignment == 0);
while (!successful) {
off_t ret = read(fd, ptr, len);

@@ -259,7 +268,8 @@
}
}

void read_aligned_chunk(size_t offset, void * dest, size_t size) const {
void read_aligned_chunk(void * dest, size_t size) const {
size_t offset = tell();
off_t aligned_offset = offset & ~(alignment - 1);
off_t offset_from_alignment = offset - aligned_offset;
size_t bytes_to_read = (offset_from_alignment + size + alignment - 1) & ~(alignment - 1);
@@ -276,12 +286,20 @@
std::unique_ptr<void, aligned_buffer_deleter> buffer(raw_buffer);

seek(aligned_offset, SEEK_SET);
read_raw(buffer.get(), bytes_to_read);
read_raw_unsafe(buffer.get(), bytes_to_read);

uintptr_t actual_data = reinterpret_cast<uintptr_t>(buffer.get()) + offset_from_alignment;
memcpy(dest, reinterpret_cast<void *>(actual_data), size);
}

void read_raw(void * ptr, size_t len) const {
if (has_direct_io()) {
read_aligned_chunk(ptr, len);
} else {
read_raw_unsafe(ptr, len);
}
}

uint32_t read_u32() const {
uint32_t ret;
read_raw(&ret, sizeof(ret));
@@ -303,6 +321,10 @@
write_raw(&val, sizeof(val));
}

bool has_direct_io() const {
return fd != -1 && alignment > 1;
}

~impl() {
if (fd != -1) {
close(fd);
@@ -313,15 +335,6 @@
int fd = -1;
#endif

void read_raw_at(void * ptr, size_t len, size_t offset) const {
if (alignment != 1) {
read_aligned_chunk(offset, ptr, len);
} else {
seek(offset, SEEK_SET);
read_raw(ptr, len);
}
}

size_t read_alignment() const {
return alignment;
}
@@ -340,6 +353,7 @@ size_t llama_file::tell() const { return pimpl->tell(); }
size_t llama_file::size() const { return pimpl->size; }

size_t llama_file::read_alignment() const { return pimpl->read_alignment(); }
bool llama_file::has_direct_io() const { return pimpl->has_direct_io(); }

int llama_file::file_id() const {
#ifdef _WIN32
@@ -355,7 +369,11 @@ int llama_file::file_id() const {

void llama_file::seek(size_t offset, int whence) const { pimpl->seek(offset, whence); }
void llama_file::read_raw(void * ptr, size_t len) const { pimpl->read_raw(ptr, len); }
void llama_file::read_raw_at(void * ptr, size_t len, size_t offset) const { pimpl->read_raw_at(ptr, len, offset); }
#ifdef _WIN32
void llama_file::read_raw_unsafe(void * ptr, size_t len) const { pimpl->read_raw(ptr, len); }
#else
void llama_file::read_raw_unsafe(void * ptr, size_t len) const { pimpl->read_raw_unsafe(ptr, len); }
#endif

uint32_t llama_file::read_u32() const { return pimpl->read_u32(); }

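Note: read_aligned_chunk() exists because O_DIRECT reads must start at an aligned file offset, target an aligned buffer, and cover a whole number of blocks, so the requested range is rounded outward, read into a scratch buffer, and the wanted slice is memcpy'd to the destination. A standalone sketch of that rounding arithmetic with illustrative numbers (the values are examples, not part of the patch):

#include <cstddef>
#include <cstdio>

int main() {
    const size_t alignment = 4096;    // e.g. the logical block size required for O_DIRECT
    const size_t offset    = 1052772; // requested file offset, not block-aligned
    const size_t size      = 10000;   // bytes the caller actually wants

    const size_t aligned_offset        = offset & ~(alignment - 1); // round down -> 1052672
    const size_t offset_from_alignment = offset - aligned_offset;   // -> 100
    const size_t bytes_to_read         =
        (offset_from_alignment + size + alignment - 1) & ~(alignment - 1); // round up -> 12288

    // The file would be read at aligned_offset into an alignment-aligned buffer of
    // bytes_to_read bytes; the caller's data starts at buffer + offset_from_alignment.
    printf("aligned_offset=%zu skip=%zu read=%zu\n", aligned_offset, offset_from_alignment, bytes_to_read);
    return 0;
}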
5 changes: 3 additions & 2 deletions src/llama-mmap.h
@@ -25,14 +25,15 @@ struct llama_file {
void seek(size_t offset, int whence) const;

void read_raw(void * ptr, size_t len) const;
void read_raw_at(void * ptr, size_t len, size_t offset) const;
void read_aligned_chunk(size_t offset, void * dest, size_t size) const;
void read_raw_unsafe(void * ptr, size_t len) const;
void read_aligned_chunk(void * dest, size_t size) const;
uint32_t read_u32() const;

void write_raw(const void * ptr, size_t len) const;
void write_u32(uint32_t val) const;

size_t read_alignment() const;
bool has_direct_io() const;
private:
struct impl;
std::unique_ptr<impl> pimpl;
26 changes: 19 additions & 7 deletions src/llama-model-loader.cpp
@@ -472,6 +472,7 @@ llama_model_loader::llama_model_loader(
const std::string & fname,
std::vector<std::string> & splits,
bool use_mmap,
bool use_direct_io,
bool check_tensors,
bool no_alloc,
const llama_model_kv_override * param_overrides_p,
@@ -504,9 +505,17 @@ llama_model_loader::llama_model_loader(
get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
llm_kv = LLM_KV(llm_arch_from_string(arch_name));

files.emplace_back(new llama_file(fname.c_str(), "rb", !use_mmap));
files.emplace_back(new llama_file(fname.c_str(), "rb", use_direct_io));
contexts.emplace_back(ctx);

use_direct_io = use_direct_io && files.back()->has_direct_io();

// Disable mmap when direct I/O is enabled and available
if (use_direct_io && use_mmap) {
use_mmap = false;
LLAMA_LOG_WARN("%s: direct I/O is enabled, disabling mmap\n", __func__);
}

// Save tensors data offset of the main file.
// For subsidiary files, `meta` tensor data offset must not be used,
// so we build a unified tensors index for weights.
@@ -572,7 +581,7 @@ llama_model_loader::llama_model_loader(
}
}

files.emplace_back(new llama_file(fname_split, "rb", !use_mmap));
files.emplace_back(new llama_file(fname_split, "rb", use_direct_io));
contexts.emplace_back(ctx);

// Save tensors data offset info of the shard.
@@ -716,6 +725,7 @@ llama_model_loader::llama_model_loader(
}

this->use_mmap = use_mmap;
this->use_direct_io = use_direct_io;
this->check_tensors = check_tensors;
this->no_alloc = no_alloc;
}
@@ -1077,7 +1087,8 @@ bool llama_model_loader::load_all_data(
const auto & file = files.at(weight->idx);

if (ggml_backend_buffer_is_host(cur->buffer)) {
file->read_raw_at(cur->data, n_size, weight->offs);
file->seek(weight->offs, SEEK_SET);
file->read_raw(cur->data, n_size);
if (check_tensors) {
validation_result.emplace_back(std::async(std::launch::async, [cur, n_size] {
return std::make_pair(cur, ggml_validate_row_data(cur->type, cur->data, n_size));
@@ -1086,10 +1097,10 @@ bool llama_model_loader::load_all_data(
} else {
// If upload_backend is valid load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU.
if (upload_backend) {
auto offset = (off_t) weight->offs;
size_t offset = weight->offs;
alignment = file->read_alignment();
off_t aligned_offset = offset & ~(alignment - 1);
off_t offset_from_alignment = offset - aligned_offset;
size_t aligned_offset = offset & ~(alignment - 1);
size_t offset_from_alignment = offset - aligned_offset;
file->seek(aligned_offset, SEEK_SET);

// Calculate aligned read boundaries
@@ -1139,7 +1150,8 @@
}
} else {
read_buf.resize(n_size);
file->read_raw_at(read_buf.data(), n_size, weight->offs);
file->seek(weight->offs, SEEK_SET);
file->read_raw(read_buf.data(), n_size);
ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
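Note: the net effect of the constructor change is that direct I/O, when requested and actually supported by the opened file, wins over mmap; otherwise the requested mmap setting is kept. A hypothetical helper (not in the patch) summarizing that precedence:

#include <cstdio>

struct load_mode { bool use_mmap; bool use_direct_io; };

// Mirrors the constructor logic above: direct I/O needs file support and,
// when active, forces mmap off.
static load_mode resolve_load_mode(bool want_mmap, bool want_direct_io, bool file_has_direct_io) {
    const bool dio = want_direct_io && file_has_direct_io;
    return { /*use_mmap =*/ want_mmap && !dio, /*use_direct_io =*/ dio };
}

int main() {
    const load_mode m = resolve_load_mode(true, true, true);
    printf("use_mmap=%d use_direct_io=%d\n", m.use_mmap, m.use_direct_io); // use_mmap=0 use_direct_io=1
    return 0;
}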
2 changes: 2 additions & 0 deletions src/llama-model-loader.h
@@ -70,6 +70,7 @@ struct llama_model_loader {
size_t n_bytes = 0;

bool use_mmap = false;
bool use_direct_io = false;
bool check_tensors;
bool no_alloc;

@@ -97,6 +98,7 @@ struct llama_model_loader {
const std::string & fname,
std::vector<std::string> & splits, // optional, only need if the split does not follow naming scheme
bool use_mmap,
bool use_direct_io,
bool check_tensors,
bool no_alloc,
const llama_model_kv_override * param_overrides_p,
4 changes: 3 additions & 1 deletion src/llama-model.cpp
@@ -2337,7 +2337,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {

const bool use_mmap_buffer = true;

LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s)\n", __func__, ml.use_mmap ? "true" : "false");
LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s, direct_io = %s)\n",
__func__, ml.use_mmap ? "true" : "false", ml.use_direct_io ? "true" : "false");

// build a list of buffer types for the CPU and GPU devices
pimpl->cpu_buft_list = make_cpu_buft_list(devices, params.use_extra_bufts, params.no_host);
@@ -7671,6 +7672,7 @@ llama_model_params llama_model_default_params() {
/*.kv_overrides =*/ nullptr,
/*.vocab_only =*/ false,
/*.use_mmap =*/ true,
/*.use_direct_io =*/ true,
/*.use_mlock =*/ false,
/*.check_tensors =*/ false,
/*.use_extra_bufts =*/ true,
2 changes: 1 addition & 1 deletion src/llama-quant.cpp
@@ -596,7 +596,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
}

std::vector<std::string> splits = {};
llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
llama_model_loader ml(fname_inp, splits, use_mmap, /*use_direct_io*/ true, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
ml.init_mappings(false); // no prefetching

llama_model model(llama_model_default_params());
2 changes: 1 addition & 1 deletion src/llama.cpp
@@ -771,7 +771,7 @@ static int llama_model_load(const std::string & fname, std::vector<std::string>
model.t_start_us = tm.t_start_us;

try {
llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides);
llama_model_loader ml(fname, splits, params.use_mmap, params.use_direct_io, params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides);

ml.print_info();
