Skip to content

Commit 4d4f4ca

Browse files
authored
llama : Async DirectIO model loading on Linux (#18012)
* Uncached model read
* Removing additional --mmap arg
* Removing trailing whitespaces
* Adding fallback when O_DIRECT is not supported
* Remove branching in llama-model-loader.cpp and reduce code duplication in llama-mmap.cpp
* Adding maybe_unused keyword for Mac and Windows
* File seek aligned
* Removing all branches for direct_io in llama-model-loader.cpp
* Always use alignment from llama_file
* use_mmap=true
1 parent 0a0bba0 commit 4d4f4ca

File tree

3 files changed

+184
-42
lines changed

3 files changed

+184
-42
lines changed

src/llama-mmap.cpp

Lines changed: 123 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,10 @@
1313
#ifdef __has_include
1414
#if __has_include(<unistd.h>)
1515
#include <unistd.h>
16+
#include <fcntl.h>
17+
#include <sys/stat.h>
1618
#if defined(_POSIX_MAPPED_FILES)
1719
#include <sys/mman.h>
18-
#include <fcntl.h>
1920
#endif
2021
#if defined(_POSIX_MEMLOCK_RANGE)
2122
#include <sys/resource.h>
@@ -74,7 +75,7 @@ struct llama_file::impl {
7475
return ret;
7576
}
7677

77-
impl(const char * fname, const char * mode) {
78+
impl(const char * fname, const char * mode, [[maybe_unused]] const bool use_direct_io = false) {
7879
fp = ggml_fopen(fname, mode);
7980
if (fp == NULL) {
8081
throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
@@ -153,13 +154,40 @@ struct llama_file::impl {
153154
write_raw(&val, sizeof(val));
154155
}
155156

157+
void read_aligned_chunk(size_t offset, void * dest, size_t size) const {
158+
throw std::runtime_error("DirectIO is not implemented on Windows.");
159+
}
160+
156161
~impl() {
157162
if (fp) {
158163
std::fclose(fp);
159164
}
160165
}
161166
#else
162-
impl(const char * fname, const char * mode) {
167+
impl(const char * fname, const char * mode, [[maybe_unused]] const bool use_direct_io = false) {
168+
#ifdef __linux__
169+
// Try unbuffered I/O for read only
170+
if (use_direct_io && std::strcmp(mode, "rb") == 0) {
171+
fd = open(fname, O_RDONLY | O_DIRECT);
172+
173+
if (fd != -1) {
174+
struct stat file_stats{};
175+
fstat(fd, &file_stats);
176+
177+
size = file_stats.st_size;
178+
alignment = file_stats.st_blksize;
179+
180+
off_t ret = lseek(fd, 0, SEEK_SET);
181+
if (ret == -1) {
182+
throw std::runtime_error(format("seek error: %s", strerror(errno)));
183+
}
184+
return;
185+
}
186+
187+
LLAMA_LOG_WARN("Failed to open model %s with error: %s. Falling back to buffered I/O",
188+
fname, strerror(errno));
189+
}
190+
#endif
163191
fp = ggml_fopen(fname, mode);
164192
if (fp == NULL) {
165193
throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
@@ -170,27 +198,30 @@ struct llama_file::impl {
170198
}
171199

172200
size_t tell() const {
173-
// TODO: this ifdef is never true?
174-
#ifdef _WIN32
175-
__int64 ret = _ftelli64(fp);
176-
#else
177-
long ret = std::ftell(fp);
178-
#endif
179-
if (ret == -1) {
180-
throw std::runtime_error(format("ftell error: %s", strerror(errno)));
201+
if (fd == -1) {
202+
long ret = std::ftell(fp);
203+
if (ret == -1) {
204+
throw std::runtime_error(format("ftell error: %s", strerror(errno)));
205+
}
206+
207+
return (size_t) ret;
181208
}
182209

183-
return (size_t) ret;
210+
off_t pos = lseek(fd, 0, SEEK_CUR);
211+
if (pos == -1) {
212+
throw std::runtime_error(format("lseek error: %s", strerror(errno)));
213+
}
214+
return (size_t) pos;
184215
}
185216

186217
void seek(size_t offset, int whence) const {
187-
// TODO: this ifdef is never true?
188-
#ifdef _WIN32
189-
int ret = _fseeki64(fp, (__int64) offset, whence);
190-
#else
191-
int ret = std::fseek(fp, (long) offset, whence);
192-
#endif
193-
if (ret != 0) {
218+
off_t ret = 0;
219+
if (fd == -1) {
220+
ret = std::fseek(fp, (long) offset, whence);
221+
} else {
222+
ret = lseek(fd, offset, whence);
223+
}
224+
if (ret == -1) {
194225
throw std::runtime_error(format("seek error: %s", strerror(errno)));
195226
}
196227
}
@@ -200,13 +231,55 @@ struct llama_file::impl {
200231
return;
201232
}
202233
errno = 0;
203-
std::size_t ret = std::fread(ptr, len, 1, fp);
204-
if (ferror(fp)) {
205-
throw std::runtime_error(format("read error: %s", strerror(errno)));
234+
if (fd == -1) {
235+
std::size_t ret = std::fread(ptr, len, 1, fp);
236+
if (ferror(fp)) {
237+
throw std::runtime_error(format("read error: %s", strerror(errno)));
238+
}
239+
if (ret != 1) {
240+
throw std::runtime_error("unexpectedly reached end of file");
241+
}
242+
} else {
243+
bool successful = false;
244+
while (!successful) {
245+
off_t ret = read(fd, ptr, len);
246+
247+
if (ret == -1) {
248+
if (errno == EINTR) {
249+
continue; // Interrupted by signal, retry
250+
}
251+
throw std::runtime_error(format("read error: %s", strerror(errno)));
252+
}
253+
if (ret == 0) {
254+
throw std::runtime_error("unexpectedly reached end of file");
255+
}
256+
257+
successful = true;
258+
}
206259
}
207-
if (ret != 1) {
208-
throw std::runtime_error("unexpectedly reached end of file");
260+
}
261+
262+
void read_aligned_chunk(size_t offset, void * dest, size_t size) const {
263+
off_t aligned_offset = offset & ~(alignment - 1);
264+
off_t offset_from_alignment = offset - aligned_offset;
265+
size_t bytes_to_read = (offset_from_alignment + size + alignment - 1) & ~(alignment - 1);
266+
267+
void * raw_buffer = nullptr;
268+
int ret = posix_memalign(&raw_buffer, alignment, bytes_to_read);
269+
if (ret != 0) {
270+
throw std::runtime_error(format("posix_memalign failed with error %d", ret));
209271
}
272+
273+
struct aligned_buffer_deleter {
274+
void operator()(void * p) const { free(p); }
275+
};
276+
std::unique_ptr<void, aligned_buffer_deleter> buffer(raw_buffer);
277+
278+
seek(aligned_offset, SEEK_SET);
279+
read_raw(buffer.get(), bytes_to_read);
280+
281+
uintptr_t actual_data = reinterpret_cast<uintptr_t>(buffer.get()) + offset_from_alignment;
282+
memcpy(dest, reinterpret_cast<void *>(actual_data), size);
210283
}
211284

212285
uint32_t read_u32() const {
@@ -231,22 +304,43 @@ struct llama_file::impl {
231304
}
232305

233306
~impl() {
234-
if (fp) {
307+
if (fd != -1) {
308+
close(fd);
309+
} else {
235310
std::fclose(fp);
236311
}
237312
}
313+
int fd = -1;
238314
#endif
239315

240-
FILE * fp;
241-
size_t size;
316+
void read_raw_at(void * ptr, size_t len, size_t offset) const {
317+
if (alignment != 1) {
318+
read_aligned_chunk(offset, ptr, len);
319+
} else {
320+
seek(offset, SEEK_SET);
321+
read_raw(ptr, len);
322+
}
323+
}
324+
325+
size_t read_alignment() const {
326+
return alignment;
327+
}
328+
329+
size_t alignment = 1;
330+
331+
FILE * fp{};
332+
size_t size{};
242333
};
243334

244-
llama_file::llama_file(const char * fname, const char * mode) : pimpl(std::make_unique<impl>(fname, mode)) {}
335+
llama_file::llama_file(const char * fname, const char * mode, const bool use_direct_io) :
336+
pimpl(std::make_unique<impl>(fname, mode, use_direct_io)) {}
245337
llama_file::~llama_file() = default;
246338

247339
size_t llama_file::tell() const { return pimpl->tell(); }
248340
size_t llama_file::size() const { return pimpl->size; }
249341

342+
size_t llama_file::read_alignment() const { return pimpl->read_alignment(); }
343+
250344
int llama_file::file_id() const {
251345
#ifdef _WIN32
252346
return _fileno(pimpl->fp);
@@ -261,6 +355,7 @@ int llama_file::file_id() const {
261355

262356
void llama_file::seek(size_t offset, int whence) const { pimpl->seek(offset, whence); }
263357
void llama_file::read_raw(void * ptr, size_t len) const { pimpl->read_raw(ptr, len); }
358+
void llama_file::read_raw_at(void * ptr, size_t len, size_t offset) const { pimpl->read_raw_at(ptr, len, offset); }
264359

265360
uint32_t llama_file::read_u32() const { return pimpl->read_u32(); }
266361

src/llama-mmap.h

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
#include <cstdint>
44
#include <memory>
55
#include <vector>
6+
#include <cstdio>
67

78
struct llama_file;
89
struct llama_mmap;
@@ -13,7 +14,7 @@ using llama_mmaps = std::vector<std::unique_ptr<llama_mmap>>;
1314
using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>;
1415

1516
struct llama_file {
16-
llama_file(const char * fname, const char * mode);
17+
llama_file(const char * fname, const char * mode, bool use_direct_io = false);
1718
~llama_file();
1819

1920
size_t tell() const;
@@ -24,11 +25,14 @@ struct llama_file {
2425
void seek(size_t offset, int whence) const;
2526

2627
void read_raw(void * ptr, size_t len) const;
28+
void read_raw_at(void * ptr, size_t len, size_t offset) const;
29+
void read_aligned_chunk(size_t offset, void * dest, size_t size) const;
2730
uint32_t read_u32() const;
2831

2932
void write_raw(const void * ptr, size_t len) const;
3033
void write_u32(uint32_t val) const;
3134

35+
size_t read_alignment() const;
3236
private:
3337
struct impl;
3438
std::unique_ptr<impl> pimpl;

src/llama-model-loader.cpp

Lines changed: 56 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -504,7 +504,7 @@ llama_model_loader::llama_model_loader(
504504
get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
505505
llm_kv = LLM_KV(llm_arch_from_string(arch_name));
506506

507-
files.emplace_back(new llama_file(fname.c_str(), "rb"));
507+
files.emplace_back(new llama_file(fname.c_str(), "rb", !use_mmap));
508508
contexts.emplace_back(ctx);
509509

510510
// Save tensors data offset of the main file.
@@ -572,7 +572,7 @@ llama_model_loader::llama_model_loader(
572572
}
573573
}
574574

575-
files.emplace_back(new llama_file(fname_split, "rb"));
575+
files.emplace_back(new llama_file(fname_split, "rb", !use_mmap));
576576
contexts.emplace_back(ctx);
577577

578578
// Save tensors data offset info of the shard.
@@ -935,7 +935,15 @@ bool llama_model_loader::load_all_data(
935935
// 4 staging buffers for async uploads, each sized 1MB seems to be a good default for single NVMe drives.
936936
// NVMe raid configurations might require more / larger buffers.
937937
constexpr size_t n_buffers = 4;
938-
constexpr size_t buffer_size = 1 * 1024 * 1024; // 1MB
938+
939+
size_t alignment = 1;
940+
for (const auto & file : files) {
941+
alignment = std::max(file->read_alignment(), alignment);
942+
}
943+
944+
// Buffer size: balance between memory usage and I/O efficiency
945+
// 64MB works well for NVMe drives
946+
const size_t buffer_size = alignment != 1 ? 64 * 1024 * 1024 + 2 * alignment : 1 * 1024 * 1024;
939947

940948
std::vector<ggml_backend_buffer_t> host_buffers;
941949
std::vector<ggml_backend_event_t> events;
@@ -985,6 +993,7 @@ bool llama_model_loader::load_all_data(
985993
// If the backend is supported, create pinned memory buffers and events for synchronisation.
986994
for (size_t idx = 0; idx < n_buffers; ++idx) {
987995
auto * buf = ggml_backend_buft_alloc_buffer(host_buft, buffer_size);
996+
988997
if (!buf) {
989998
LLAMA_LOG_DEBUG("%s: failed to allocate host buffer for async uploads for device %s\n", func,
990999
ggml_backend_dev_name(dev));
@@ -1066,9 +1075,9 @@ bool llama_model_loader::load_all_data(
10661075
}
10671076
} else {
10681077
const auto & file = files.at(weight->idx);
1078+
10691079
if (ggml_backend_buffer_is_host(cur->buffer)) {
1070-
file->seek(weight->offs, SEEK_SET);
1071-
file->read_raw(cur->data, n_size);
1080+
file->read_raw_at(cur->data, n_size, weight->offs);
10721081
if (check_tensors) {
10731082
validation_result.emplace_back(std::async(std::launch::async, [cur, n_size] {
10741083
return std::make_pair(cur, ggml_validate_row_data(cur->type, cur->data, n_size));
@@ -1077,26 +1086,60 @@ bool llama_model_loader::load_all_data(
10771086
} else {
10781087
// If upload_backend is valid load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU.
10791088
if (upload_backend) {
1080-
file->seek(weight->offs, SEEK_SET);
1089+
auto offset = (off_t) weight->offs;
1090+
alignment = file->read_alignment();
1091+
off_t aligned_offset = offset & ~(alignment - 1);
1092+
off_t offset_from_alignment = offset - aligned_offset;
1093+
file->seek(aligned_offset, SEEK_SET);
1094+
1095+
// Calculate aligned read boundaries
1096+
size_t read_start = aligned_offset;
1097+
size_t read_end = (offset + n_size + alignment - 1) & ~(alignment - 1);
10811098

10821099
size_t bytes_read = 0;
1100+
size_t data_read = 0; // Actual tensor data copied (excluding padding)
1101+
1102+
while (bytes_read < read_end - read_start) {
1103+
size_t read_size = std::min<size_t>(buffer_size, read_end - read_start - bytes_read);
10831104

1084-
while (bytes_read < n_size) {
1085-
size_t read_iteration = std::min<size_t>(buffer_size, n_size - bytes_read);
1105+
// Align the destination pointer within the pinned buffer
1106+
uintptr_t ptr_dest_aligned = (reinterpret_cast<uintptr_t>(host_ptrs[buffer_idx]) + alignment - 1) & ~(alignment - 1);
10861107

1108+
// Wait for previous upload to complete before reusing buffer
10871109
ggml_backend_event_synchronize(events[buffer_idx]);
1088-
file->read_raw(host_ptrs[buffer_idx], read_iteration);
1089-
ggml_backend_tensor_set_async(upload_backend, cur, host_ptrs[buffer_idx], bytes_read, read_iteration);
1110+
1111+
// Read aligned chunk from file
1112+
file->read_raw(reinterpret_cast<void *>(ptr_dest_aligned), read_size);
1113+
1114+
// Calculate actual data portion (excluding alignment padding)
1115+
uintptr_t ptr_data = ptr_dest_aligned;
1116+
size_t data_to_copy = read_size;
1117+
1118+
// Skip alignment padding at start of first chunk
1119+
if (bytes_read == 0) {
1120+
ptr_data += offset_from_alignment;
1121+
data_to_copy -= offset_from_alignment;
1122+
}
1123+
1124+
// Trim alignment padding at end of last chunk
1125+
if (aligned_offset + bytes_read + read_size > offset + n_size) {
1126+
data_to_copy -= (read_end - (offset + n_size));
1127+
}
1128+
1129+
// Async upload actual data to GPU
1130+
ggml_backend_tensor_set_async(upload_backend, cur,
1131+
reinterpret_cast<void *>(ptr_data), data_read, data_to_copy);
10901132
ggml_backend_event_record(events[buffer_idx], upload_backend);
10911133

1092-
bytes_read += read_iteration;
1134+
data_read += data_to_copy;
1135+
bytes_read += read_size;
1136+
10931137
++buffer_idx;
10941138
buffer_idx %= n_buffers;
10951139
}
10961140
} else {
10971141
read_buf.resize(n_size);
1098-
file->seek(weight->offs, SEEK_SET);
1099-
file->read_raw(read_buf.data(), n_size);
1142+
file->read_raw_at(read_buf.data(), n_size, weight->offs);
11001143
ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
11011144
if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
11021145
throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));

0 commit comments

Comments (0)