Skip to content

Commit 73da9fb

Browse files

Commit message: delay allocation until model loading time
1 parent: 89cc0ab — commit: 73da9fb

File tree

3 files changed

+23
-16
lines changed

3 files changed

+23
-16
lines changed

model.cpp

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1340,7 +1340,7 @@ std::string ModelLoader::load_umt5_tokenizer_json() {
13401340
return json_str;
13411341
}
13421342

1343-
bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_threads_p, bool use_mmap) {
1343+
bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_threads_p, bool use_mmap, alloc_cb_t alloc_cb) {
13441344
int64_t process_time_ms = 0;
13451345
std::atomic<int64_t> read_time_ms(0);
13461346
std::atomic<int64_t> memcpy_time_ms(0);
@@ -1368,6 +1368,10 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread
13681368
const int64_t t_start = ggml_time_ms();
13691369
int last_n_threads = 1;
13701370

1371+
if (alloc_cb) {
1372+
alloc_cb();
1373+
}
1374+
13711375
for (size_t file_index = 0; file_index < file_paths_.size(); file_index++) {
13721376
std::string file_path = file_paths_[file_index];
13731377
LOG_DEBUG("loading tensors from %s", file_path.c_str());
@@ -1598,7 +1602,8 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread
15981602
bool ModelLoader::load_tensors(std::map<std::string, struct ggml_tensor*>& tensors,
15991603
std::set<std::string> ignore_tensors,
16001604
int n_threads,
1601-
bool use_mmap) {
1605+
bool use_mmap,
1606+
alloc_cb_t alloc_cb) {
16021607
std::set<std::string> tensor_names_in_file;
16031608
std::mutex tensor_names_mutex;
16041609
auto on_new_tensor_cb = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) -> bool {
@@ -1641,7 +1646,7 @@ bool ModelLoader::load_tensors(std::map<std::string, struct ggml_tensor*>& tenso
16411646
return true;
16421647
};
16431648

1644-
bool success = load_tensors(on_new_tensor_cb, n_threads, use_mmap);
1649+
bool success = load_tensors(on_new_tensor_cb, n_threads, use_mmap, alloc_cb);
16451650
if (!success) {
16461651
LOG_ERROR("load tensors from file failed");
16471652
return false;

model.h

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -274,6 +274,7 @@ struct TensorStorage {
274274
};
275275

276276
typedef std::function<bool(const TensorStorage&, ggml_tensor**)> on_new_tensor_cb_t;
277+
typedef std::function<void()> alloc_cb_t;
277278

278279
typedef OrderedMap<std::string, TensorStorage> String2TensorStorage;
279280

@@ -310,11 +311,12 @@ class ModelLoader {
310311
std::map<ggml_type, uint32_t> get_vae_wtype_stat();
311312
String2TensorStorage& get_tensor_storage_map() { return tensor_storage_map; }
312313
void set_wtype_override(ggml_type wtype, std::string tensor_type_rules = "");
313-
bool load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_threads = 0, bool use_mmap = false);
314+
bool load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_threads = 0, bool use_mmap = false, alloc_cb_t alloc_cb = nullptr);
314315
bool load_tensors(std::map<std::string, struct ggml_tensor*>& tensors,
315316
std::set<std::string> ignore_tensors = {},
316317
int n_threads = 0,
317-
bool use_mmap = false);
318+
bool use_mmap = false,
319+
alloc_cb_t alloc_cb = nullptr);
318320

319321
std::vector<std::string> get_tensor_names() const {
320322
std::vector<std::string> names;

stable-diffusion.cpp

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -478,7 +478,6 @@ class StableDiffusionGGML {
478478
clip_vision = std::make_shared<FrozenCLIPVisionEmbedder>(backend,
479479
offload_params_to_cpu,
480480
tensor_storage_map);
481-
clip_vision->alloc_params_buffer();
482481
clip_vision->get_param_tensors(tensors);
483482
}
484483
} else if (sd_version_is_qwen_image(version)) {
@@ -541,18 +540,15 @@ class StableDiffusionGGML {
541540
diffusion_model->set_flash_attn_enabled(true);
542541
}
543542

544-
cond_stage_model->alloc_params_buffer();
545543
cond_stage_model->get_param_tensors(tensors);
546544

547-
diffusion_model->alloc_params_buffer();
548545
diffusion_model->get_param_tensors(tensors);
549546

550547
if (sd_version_is_unet_edit(version)) {
551548
vae_decode_only = false;
552549
}
553550

554551
if (high_noise_diffusion_model) {
555-
high_noise_diffusion_model->alloc_params_buffer();
556552
high_noise_diffusion_model->get_param_tensors(tensors);
557553
}
558554

@@ -570,7 +566,6 @@ class StableDiffusionGGML {
570566
"first_stage_model",
571567
vae_decode_only,
572568
version);
573-
first_stage_model->alloc_params_buffer();
574569
first_stage_model->get_param_tensors(tensors, "first_stage_model");
575570
} else if (version == VERSION_CHROMA_RADIANCE) {
576571
first_stage_model = std::make_shared<FakeVAE>(vae_backend,
@@ -596,7 +591,6 @@ class StableDiffusionGGML {
596591
vae_conv_2d_scale);
597592
first_stage_model->set_conv2d_scale(vae_conv_2d_scale);
598593
}
599-
first_stage_model->alloc_params_buffer();
600594
first_stage_model->get_param_tensors(tensors, "first_stage_model");
601595
}
602596
if (use_tiny_autoencoder) {
@@ -666,10 +660,6 @@ class StableDiffusionGGML {
666660
}
667661
}
668662
if (stacked_id) {
669-
if (!pmid_model->alloc_params_buffer()) {
670-
LOG_ERROR(" pmid model params buffer allocation failed");
671-
return false;
672-
}
673663
pmid_model->get_param_tensors(tensors, "pmid");
674664
}
675665
}
@@ -710,7 +700,17 @@ class StableDiffusionGGML {
710700
if (version == VERSION_SVD) {
711701
ignore_tensors.insert("conditioner.embedders.3");
712702
}
713-
bool success = model_loader.load_tensors(tensors, ignore_tensors, n_threads, sd_ctx_params->use_mmap);
703+
704+
auto alloc_cb = [&]() -> void {
705+
if (clip_vision) clip_vision->alloc_params_buffer();
706+
if (cond_stage_model) cond_stage_model->alloc_params_buffer();
707+
if (diffusion_model) diffusion_model->alloc_params_buffer();
708+
if (high_noise_diffusion_model) high_noise_diffusion_model->alloc_params_buffer();
709+
if (first_stage_model) first_stage_model->alloc_params_buffer();
710+
if (pmid_model) pmid_model->alloc_params_buffer();
711+
};
712+
713+
bool success = model_loader.load_tensors(tensors, ignore_tensors, n_threads, sd_ctx_params->use_mmap, alloc_cb);
714714
if (!success) {
715715
LOG_ERROR("load tensors from model loader failed");
716716
ggml_free(ctx);

0 commit comments

Comments (0)