Skip to content

Commit a45fc5e

Browse files
committed
Revert "llama : Async DirectIO model loading on Linux (ggml-org#18012)"
This reverts commit 4d4f4ca.
1 parent 2e57e5e commit a45fc5e

File tree

3 files changed

+42
-184
lines changed

3 files changed

+42
-184
lines changed

src/llama-mmap.cpp

Lines changed: 28 additions & 123 deletions
Original file line numberDiff line numberDiff line change
@@ -13,10 +13,9 @@
1313
#ifdef __has_include
1414
#if __has_include(<unistd.h>)
1515
#include <unistd.h>
16-
#include <fcntl.h>
17-
#include <sys/stat.h>
1816
#if defined(_POSIX_MAPPED_FILES)
1917
#include <sys/mman.h>
18+
#include <fcntl.h>
2019
#endif
2120
#if defined(_POSIX_MEMLOCK_RANGE)
2221
#include <sys/resource.h>
@@ -75,7 +74,7 @@ struct llama_file::impl {
7574
return ret;
7675
}
7776

78-
impl(const char * fname, const char * mode, [[maybe_unused]] const bool use_direct_io = false) {
77+
impl(const char * fname, const char * mode) {
7978
fp = ggml_fopen(fname, mode);
8079
if (fp == NULL) {
8180
throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
@@ -154,40 +153,13 @@ struct llama_file::impl {
154153
write_raw(&val, sizeof(val));
155154
}
156155

157-
void read_aligned_chunk(size_t offset, void * dest, size_t size) const {
158-
throw std::runtime_error("DirectIO is not implemented on Windows.");
159-
}
160-
161156
~impl() {
162157
if (fp) {
163158
std::fclose(fp);
164159
}
165160
}
166161
#else
167-
impl(const char * fname, const char * mode, [[maybe_unused]] const bool use_direct_io = false) {
168-
#ifdef __linux__
169-
// Try unbuffered I/O for read only
170-
if (use_direct_io && std::strcmp(mode, "rb") == 0) {
171-
fd = open(fname, O_RDONLY | O_DIRECT);
172-
173-
if (fd != -1) {
174-
struct stat file_stats{};
175-
fstat(fd, &file_stats);
176-
177-
size = file_stats.st_size;
178-
alignment = file_stats.st_blksize;
179-
180-
off_t ret = lseek(fd, 0, SEEK_SET);
181-
if (ret == -1) {
182-
throw std::runtime_error(format("seek error: %s", strerror(errno)));
183-
}
184-
return;
185-
}
186-
187-
LLAMA_LOG_WARN("Failed to open model %s with error: %s. Falling back to buffered I/O",
188-
fname, strerror(errno));
189-
}
190-
#endif
162+
impl(const char * fname, const char * mode) {
191163
fp = ggml_fopen(fname, mode);
192164
if (fp == NULL) {
193165
throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
@@ -198,30 +170,27 @@ struct llama_file::impl {
198170
}
199171

200172
size_t tell() const {
201-
if (fd == -1) {
202-
long ret = std::ftell(fp);
203-
if (ret == -1) {
204-
throw std::runtime_error(format("ftell error: %s", strerror(errno)));
205-
}
206-
207-
return (size_t) ret;
173+
// TODO: this ifdef is never true?
174+
#ifdef _WIN32
175+
__int64 ret = _ftelli64(fp);
176+
#else
177+
long ret = std::ftell(fp);
178+
#endif
179+
if (ret == -1) {
180+
throw std::runtime_error(format("ftell error: %s", strerror(errno)));
208181
}
209182

210-
off_t pos = lseek(fd, 0, SEEK_CUR);
211-
if (pos == -1) {
212-
throw std::runtime_error(format("lseek error: %s", strerror(errno)));
213-
}
214-
return (size_t) pos;
183+
return (size_t) ret;
215184
}
216185

217186
void seek(size_t offset, int whence) const {
218-
off_t ret = 0;
219-
if (fd == -1) {
220-
ret = std::fseek(fp, (long) offset, whence);
221-
} else {
222-
ret = lseek(fd, offset, whence);
223-
}
224-
if (ret == -1) {
187+
// TODO: this ifdef is never true?
188+
#ifdef _WIN32
189+
int ret = _fseeki64(fp, (__int64) offset, whence);
190+
#else
191+
int ret = std::fseek(fp, (long) offset, whence);
192+
#endif
193+
if (ret != 0) {
225194
throw std::runtime_error(format("seek error: %s", strerror(errno)));
226195
}
227196
}
@@ -231,55 +200,13 @@ struct llama_file::impl {
231200
return;
232201
}
233202
errno = 0;
234-
if (fd == -1) {
235-
std::size_t ret = std::fread(ptr, len, 1, fp);
236-
if (ferror(fp)) {
237-
throw std::runtime_error(format("read error: %s", strerror(errno)));
238-
}
239-
if (ret != 1) {
240-
throw std::runtime_error("unexpectedly reached end of file");
241-
}
242-
} else {
243-
bool successful = false;
244-
while (!successful) {
245-
off_t ret = read(fd, ptr, len);
246-
247-
if (ret == -1) {
248-
if (errno == EINTR) {
249-
continue; // Interrupted by signal, retry
250-
}
251-
throw std::runtime_error(format("read error: %s", strerror(errno)));
252-
}
253-
if (ret == 0) {
254-
throw std::runtime_error("unexpectedly reached end of file");
255-
}
256-
257-
successful = true;
258-
}
203+
std::size_t ret = std::fread(ptr, len, 1, fp);
204+
if (ferror(fp)) {
205+
throw std::runtime_error(format("read error: %s", strerror(errno)));
259206
}
260-
}
261-
262-
void read_aligned_chunk(size_t offset, void * dest, size_t size) const {
263-
off_t aligned_offset = offset & ~(alignment - 1);
264-
off_t offset_from_alignment = offset - aligned_offset;
265-
size_t bytes_to_read = (offset_from_alignment + size + alignment - 1) & ~(alignment - 1);
266-
267-
void * raw_buffer = nullptr;
268-
int ret = posix_memalign(&raw_buffer, alignment, bytes_to_read);
269-
if (ret != 0) {
270-
throw std::runtime_error(format("posix_memalign failed with error %d", ret));
207+
if (ret != 1) {
208+
throw std::runtime_error("unexpectedly reached end of file");
271209
}
272-
273-
struct aligned_buffer_deleter {
274-
void operator()(void * p) const { free(p); }
275-
};
276-
std::unique_ptr<void, aligned_buffer_deleter> buffer(raw_buffer);
277-
278-
seek(aligned_offset, SEEK_SET);
279-
read_raw(buffer.get(), bytes_to_read);
280-
281-
uintptr_t actual_data = reinterpret_cast<uintptr_t>(buffer.get()) + offset_from_alignment;
282-
memcpy(dest, reinterpret_cast<void *>(actual_data), size);
283210
}
284211

285212
uint32_t read_u32() const {
@@ -304,43 +231,22 @@ struct llama_file::impl {
304231
}
305232

306233
~impl() {
307-
if (fd != -1) {
308-
close(fd);
309-
} else {
234+
if (fp) {
310235
std::fclose(fp);
311236
}
312237
}
313-
int fd = -1;
314238
#endif
315239

316-
void read_raw_at(void * ptr, size_t len, size_t offset) const {
317-
if (alignment != 1) {
318-
read_aligned_chunk(offset, ptr, len);
319-
} else {
320-
seek(offset, SEEK_SET);
321-
read_raw(ptr, len);
322-
}
323-
}
324-
325-
size_t read_alignment() const {
326-
return alignment;
327-
}
328-
329-
size_t alignment = 1;
330-
331-
FILE * fp{};
332-
size_t size{};
240+
FILE * fp;
241+
size_t size;
333242
};
334243

335-
llama_file::llama_file(const char * fname, const char * mode, const bool use_direct_io) :
336-
pimpl(std::make_unique<impl>(fname, mode, use_direct_io)) {}
244+
llama_file::llama_file(const char * fname, const char * mode) : pimpl(std::make_unique<impl>(fname, mode)) {}
337245
llama_file::~llama_file() = default;
338246

339247
size_t llama_file::tell() const { return pimpl->tell(); }
340248
size_t llama_file::size() const { return pimpl->size; }
341249

342-
size_t llama_file::read_alignment() const { return pimpl->read_alignment(); }
343-
344250
int llama_file::file_id() const {
345251
#ifdef _WIN32
346252
return _fileno(pimpl->fp);
@@ -355,7 +261,6 @@ int llama_file::file_id() const {
355261

356262
void llama_file::seek(size_t offset, int whence) const { pimpl->seek(offset, whence); }
357263
void llama_file::read_raw(void * ptr, size_t len) const { pimpl->read_raw(ptr, len); }
358-
void llama_file::read_raw_at(void * ptr, size_t len, size_t offset) const { pimpl->read_raw_at(ptr, len, offset); }
359264

360265
uint32_t llama_file::read_u32() const { return pimpl->read_u32(); }
361266

src/llama-mmap.h

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
#include <cstdint>
44
#include <memory>
55
#include <vector>
6-
#include <cstdio>
76

87
struct llama_file;
98
struct llama_mmap;
@@ -14,7 +13,7 @@ using llama_mmaps = std::vector<std::unique_ptr<llama_mmap>>;
1413
using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>;
1514

1615
struct llama_file {
17-
llama_file(const char * fname, const char * mode, bool use_direct_io = false);
16+
llama_file(const char * fname, const char * mode);
1817
~llama_file();
1918

2019
size_t tell() const;
@@ -25,14 +24,11 @@ struct llama_file {
2524
void seek(size_t offset, int whence) const;
2625

2726
void read_raw(void * ptr, size_t len) const;
28-
void read_raw_at(void * ptr, size_t len, size_t offset) const;
29-
void read_aligned_chunk(size_t offset, void * dest, size_t size) const;
3027
uint32_t read_u32() const;
3128

3229
void write_raw(const void * ptr, size_t len) const;
3330
void write_u32(uint32_t val) const;
3431

35-
size_t read_alignment() const;
3632
private:
3733
struct impl;
3834
std::unique_ptr<impl> pimpl;

src/llama-model-loader.cpp

Lines changed: 13 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -508,7 +508,7 @@ llama_model_loader::llama_model_loader(
508508
get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
509509
llm_kv = LLM_KV(llm_arch_from_string(arch_name));
510510

511-
files.emplace_back(new llama_file(fname.c_str(), "rb", !use_mmap));
511+
files.emplace_back(new llama_file(fname.c_str(), "rb"));
512512
contexts.emplace_back(ctx);
513513

514514
// Save tensors data offset of the main file.
@@ -576,7 +576,7 @@ llama_model_loader::llama_model_loader(
576576
}
577577
}
578578

579-
files.emplace_back(new llama_file(fname_split, "rb", !use_mmap));
579+
files.emplace_back(new llama_file(fname_split, "rb"));
580580
contexts.emplace_back(ctx);
581581

582582
// Save tensors data offset info of the shard.
@@ -958,15 +958,7 @@ bool llama_model_loader::load_all_data(
958958
// 4 staging buffers for async uploads, each sized 1MB seems to be a good default for single NVMe drives.
959959
// NVMe raid configurations might require more / larger buffers.
960960
constexpr size_t n_buffers = 4;
961-
962-
size_t alignment = 1;
963-
for (const auto & file : files) {
964-
alignment = std::max(file->read_alignment(), alignment);
965-
}
966-
967-
// Buffer size: balance between memory usage and I/O efficiency
968-
// 64MB works well for NVMe drives
969-
const size_t buffer_size = alignment != 1 ? 64 * 1024 * 1024 + 2 * alignment : 1 * 1024 * 1024;
961+
constexpr size_t buffer_size = 1 * 1024 * 1024; // 1MB
970962

971963
std::vector<ggml_backend_buffer_t> host_buffers;
972964
std::vector<ggml_backend_event_t> events;
@@ -1016,7 +1008,6 @@ bool llama_model_loader::load_all_data(
10161008
// If the backend is supported, create pinned memory buffers and events for synchronisation.
10171009
for (size_t idx = 0; idx < n_buffers; ++idx) {
10181010
auto * buf = ggml_backend_buft_alloc_buffer(host_buft, buffer_size);
1019-
10201011
if (!buf) {
10211012
LLAMA_LOG_DEBUG("%s: failed to allocate host buffer for async uploads for device %s\n", func,
10221013
ggml_backend_dev_name(dev));
@@ -1098,9 +1089,9 @@ bool llama_model_loader::load_all_data(
10981089
}
10991090
} else {
11001091
const auto & file = files.at(weight->idx);
1101-
11021092
if (ggml_backend_buffer_is_host(cur->buffer)) {
1103-
file->read_raw_at(cur->data, n_size, weight->offs);
1093+
file->seek(weight->offs, SEEK_SET);
1094+
file->read_raw(cur->data, n_size);
11041095
if (check_tensors) {
11051096
validation_result.emplace_back(std::async(std::launch::async, [cur, n_size] {
11061097
return std::make_pair(cur, ggml_validate_row_data(cur->type, cur->data, n_size));
@@ -1109,60 +1100,26 @@ bool llama_model_loader::load_all_data(
11091100
} else {
11101101
// If upload_backend is valid load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU.
11111102
if (upload_backend) {
1112-
auto offset = (off_t) weight->offs;
1113-
alignment = file->read_alignment();
1114-
off_t aligned_offset = offset & ~(alignment - 1);
1115-
off_t offset_from_alignment = offset - aligned_offset;
1116-
file->seek(aligned_offset, SEEK_SET);
1117-
1118-
// Calculate aligned read boundaries
1119-
size_t read_start = aligned_offset;
1120-
size_t read_end = (offset + n_size + alignment - 1) & ~(alignment - 1);
1103+
file->seek(weight->offs, SEEK_SET);
11211104

11221105
size_t bytes_read = 0;
1123-
size_t data_read = 0; // Actual tensor data copied (excluding padding)
1124-
1125-
while (bytes_read < read_end - read_start) {
1126-
size_t read_size = std::min<size_t>(buffer_size, read_end - read_start - bytes_read);
11271106

1128-
// Align the destination pointer within the pinned buffer
1129-
uintptr_t ptr_dest_aligned = (reinterpret_cast<uintptr_t>(host_ptrs[buffer_idx]) + alignment - 1) & ~(alignment - 1);
1107+
while (bytes_read < n_size) {
1108+
size_t read_iteration = std::min<size_t>(buffer_size, n_size - bytes_read);
11301109

1131-
// Wait for previous upload to complete before reusing buffer
11321110
ggml_backend_event_synchronize(events[buffer_idx]);
1133-
1134-
// Read aligned chunk from file
1135-
file->read_raw(reinterpret_cast<void *>(ptr_dest_aligned), read_size);
1136-
1137-
// Calculate actual data portion (excluding alignment padding)
1138-
uintptr_t ptr_data = ptr_dest_aligned;
1139-
size_t data_to_copy = read_size;
1140-
1141-
// Skip alignment padding at start of first chunk
1142-
if (bytes_read == 0) {
1143-
ptr_data += offset_from_alignment;
1144-
data_to_copy -= offset_from_alignment;
1145-
}
1146-
1147-
// Trim alignment padding at end of last chunk
1148-
if (aligned_offset + bytes_read + read_size > offset + n_size) {
1149-
data_to_copy -= (read_end - (offset + n_size));
1150-
}
1151-
1152-
// Async upload actual data to GPU
1153-
ggml_backend_tensor_set_async(upload_backend, cur,
1154-
reinterpret_cast<void *>(ptr_data), data_read, data_to_copy);
1111+
file->read_raw(host_ptrs[buffer_idx], read_iteration);
1112+
ggml_backend_tensor_set_async(upload_backend, cur, host_ptrs[buffer_idx], bytes_read, read_iteration);
11551113
ggml_backend_event_record(events[buffer_idx], upload_backend);
11561114

1157-
data_read += data_to_copy;
1158-
bytes_read += read_size;
1159-
1115+
bytes_read += read_iteration;
11601116
++buffer_idx;
11611117
buffer_idx %= n_buffers;
11621118
}
11631119
} else {
11641120
read_buf.resize(n_size);
1165-
file->read_raw_at(read_buf.data(), n_size, weight->offs);
1121+
file->seek(weight->offs, SEEK_SET);
1122+
file->read_raw(read_buf.data(), n_size);
11661123
ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
11671124
if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
11681125
throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));

0 commit comments

Comments (0)