From a27ff2e6204a4a85a2caada839fcb13aa56dcff0 Mon Sep 17 00:00:00 2001 From: Wagner Bruna Date: Sun, 15 Jun 2025 10:13:02 -0300 Subject: [PATCH 1/7] refactor tile number calculation --- ggml_extend.hpp | 74 ++++++++++++++++++++++--------------------------- 1 file changed, 33 insertions(+), 41 deletions(-) diff --git a/ggml_extend.hpp b/ggml_extend.hpp index 55bca1544..b3147c9f6 100644 --- a/ggml_extend.hpp +++ b/ggml_extend.hpp @@ -607,6 +607,33 @@ __STATIC_INLINE__ void ggml_tensor_scale_output(struct ggml_tensor* src) { typedef std::function on_tile_process; +__STATIC_INLINE__ void +sd_tiling_calc_tiles(int &num_tiles_dim, float& tile_overlap_factor_dim, int small_dim, int tile_size, const float tile_overlap_factor) { + + int tile_overlap = (tile_size * tile_overlap_factor); + int non_tile_overlap = tile_size - tile_overlap; + + num_tiles_dim = (small_dim - tile_overlap) / non_tile_overlap; + int overshoot_dim = ((num_tiles_dim + 1) * non_tile_overlap + tile_overlap) % small_dim; + + if ((overshoot_dim != non_tile_overlap) && (overshoot_dim <= num_tiles_dim * (tile_size / 2 - tile_overlap))) { + // if tiles don't fit perfectly using the desired overlap + // and there is enough room to squeeze an extra tile without overlap becoming >0.5 + num_tiles_dim++; + } + + tile_overlap_factor_dim = (float)(tile_size * num_tiles_dim - small_dim) / (float)(tile_size * (num_tiles_dim - 1)); + if (num_tiles_dim <= 2) { + if (small_dim <= tile_size) { + num_tiles_dim = 1; + tile_overlap_factor_dim = 0; + } else { + num_tiles_dim = 2; + tile_overlap_factor_dim = (2 * tile_size - small_dim) / (float)tile_size; + } + } +} + // Tiling __STATIC_INLINE__ void sd_tiling(ggml_tensor* input, ggml_tensor* output, const int scale, const int tile_size, const float tile_overlap_factor, on_tile_process on_processing) { output = ggml_set_f32(output, 0); @@ -629,48 +656,13 @@ __STATIC_INLINE__ void sd_tiling(ggml_tensor* input, ggml_tensor* output, const small_height = input_height; } - int tile_overlap = (tile_size * tile_overlap_factor); - int non_tile_overlap = tile_size - tile_overlap; - - int num_tiles_x = (small_width - tile_overlap) / non_tile_overlap; - int overshoot_x = ((num_tiles_x + 1) * non_tile_overlap + tile_overlap) % small_width; + int num_tiles_x; + float tile_overlap_factor_x; + sd_tiling_calc_tiles(num_tiles_x, tile_overlap_factor_x, small_width, tile_size, tile_overlap_factor); - if ((overshoot_x != non_tile_overlap) && (overshoot_x <= num_tiles_x * (tile_size / 2 - tile_overlap))) { - // if tiles don't fit perfectly using the desired overlap - // and there is enough room to squeeze an extra tile without overlap becoming >0.5 - num_tiles_x++; - } - - float tile_overlap_factor_x = (float)(tile_size * num_tiles_x - small_width) / (float)(tile_size * (num_tiles_x - 1)); - if (num_tiles_x <= 2) { - if (small_width <= tile_size) { - num_tiles_x = 1; - tile_overlap_factor_x = 0; - } else { - num_tiles_x = 2; - tile_overlap_factor_x = (2 * tile_size - small_width) / (float)tile_size; - } - } - - int num_tiles_y = (small_height - tile_overlap) / non_tile_overlap; - int overshoot_y = ((num_tiles_y + 1) * non_tile_overlap + tile_overlap) % small_height; - - if ((overshoot_y != non_tile_overlap) && (overshoot_y <= num_tiles_y * (tile_size / 2 - tile_overlap))) { - // if tiles don't fit perfectly using the desired overlap - // and there is enough room to squeeze an extra tile without overlap becoming >0.5 - num_tiles_y++; - } - - float tile_overlap_factor_y = (float)(tile_size * num_tiles_y - small_height) / (float)(tile_size * (num_tiles_y - 1)); - if (num_tiles_y <= 2) { - if (small_height <= tile_size) { - num_tiles_y = 1; - tile_overlap_factor_y = 0; - } else { - num_tiles_y = 2; - tile_overlap_factor_y = (2 * tile_size - small_height) / (float)tile_size; - } - } + int num_tiles_y; + float tile_overlap_factor_y; + sd_tiling_calc_tiles(num_tiles_y, tile_overlap_factor_y, small_height, tile_size, tile_overlap_factor); LOG_DEBUG("num tiles : %d, %d ", num_tiles_x, num_tiles_y); LOG_DEBUG("optimal overlap : %f, %f (targeting %f)", tile_overlap_factor_x, tile_overlap_factor_y, tile_overlap_factor); From ff6127c4cb1201b5d986501b74a76c208d9e4a4c Mon Sep 17 00:00:00 2001 From: Wagner Bruna Date: Sun, 15 Jun 2025 10:23:17 -0300 Subject: [PATCH 2/7] support non-square tiles --- ggml_extend.hpp | 26 ++++++++++++------- stable-diffusion.cpp | 61 +++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 74 insertions(+), 13 deletions(-) diff --git a/ggml_extend.hpp b/ggml_extend.hpp index b3147c9f6..d63954975 100644 --- a/ggml_extend.hpp +++ b/ggml_extend.hpp @@ -635,7 +635,10 @@ sd_tiling_calc_tiles(int &num_tiles_dim, float& tile_overlap_factor_dim, int sma } // Tiling -__STATIC_INLINE__ void sd_tiling(ggml_tensor* input, ggml_tensor* output, const int scale, const int tile_size, const float tile_overlap_factor, on_tile_process on_processing) { +__STATIC_INLINE__ void sd_tiling_non_square(ggml_tensor* input, ggml_tensor* output, const int scale, + const int p_tile_size_x, const int p_tile_size_y, + const float tile_overlap_factor, on_tile_process on_processing) { + output = ggml_set_f32(output, 0); int input_width = (int)input->ne[0]; @@ -658,25 +661,25 @@ __STATIC_INLINE__ void sd_tiling(ggml_tensor* input, ggml_tensor* output, const int num_tiles_x; float tile_overlap_factor_x; - sd_tiling_calc_tiles(num_tiles_x, tile_overlap_factor_x, small_width, tile_size, tile_overlap_factor); + sd_tiling_calc_tiles(num_tiles_x, tile_overlap_factor_x, small_width, p_tile_size_x, tile_overlap_factor); int num_tiles_y; float tile_overlap_factor_y; - sd_tiling_calc_tiles(num_tiles_y, tile_overlap_factor_y, small_height, tile_size, tile_overlap_factor); + sd_tiling_calc_tiles(num_tiles_y, tile_overlap_factor_y, small_height, p_tile_size_y, tile_overlap_factor); LOG_DEBUG("num tiles : %d, %d ", num_tiles_x, num_tiles_y); LOG_DEBUG("optimal overlap : %f, %f (targeting %f)", tile_overlap_factor_x, tile_overlap_factor_y, tile_overlap_factor); GGML_ASSERT(input_width % 2 == 0 && input_height % 2 == 0 && output_width % 2 == 0 && output_height % 2 == 0); // should be multiple of 2 - int tile_overlap_x = (int32_t)(tile_size * tile_overlap_factor_x); - int non_tile_overlap_x = tile_size - tile_overlap_x; + int tile_overlap_x = (int32_t)(p_tile_size_x * tile_overlap_factor_x); + int non_tile_overlap_x = p_tile_size_x - tile_overlap_x; - int tile_overlap_y = (int32_t)(tile_size * tile_overlap_factor_y); - int non_tile_overlap_y = tile_size - tile_overlap_y; + int tile_overlap_y = (int32_t)(p_tile_size_y * tile_overlap_factor_y); + int non_tile_overlap_y = p_tile_size_y - tile_overlap_y; - int tile_size_x = tile_size < small_width ? tile_size : small_width; - int tile_size_y = tile_size < small_height ? tile_size : small_height; + int tile_size_x = p_tile_size_x < small_width ? p_tile_size_x : small_width; + int tile_size_y = p_tile_size_y < small_height ? p_tile_size_y : small_height; int input_tile_size_x = tile_size_x; int input_tile_size_y = tile_size_y; @@ -765,6 +768,11 @@ __STATIC_INLINE__ void sd_tiling(ggml_tensor* input, ggml_tensor* output, const ggml_free(tiles_ctx); } +__STATIC_INLINE__ void sd_tiling(ggml_tensor* input, ggml_tensor* output, const int scale, + const int tile_size, const float tile_overlap_factor, on_tile_process on_processing) { + sd_tiling_non_square(input, output, scale, tile_size, tile_size, tile_overlap_factor, on_processing); +} + __STATIC_INLINE__ struct ggml_tensor* ggml_group_norm_32(struct ggml_context* ctx, struct ggml_tensor* a) { const float eps = 1e-6f; // default eps parameter diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 1d1971485..83b801260 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -1427,13 +1427,62 @@ class StableDiffusionGGML { x->ne[3]); // channels int64_t t0 = ggml_time_ms(); - int tile_size = 32; + int tile_size_x = 32; + int tile_size_y = 32; // TODO: arg instead of env? const char* SD_TILE_SIZE = getenv("SD_TILE_SIZE"); if (SD_TILE_SIZE != nullptr) { + // format is AxB, or just A (equivalent to AxA) + // A and B can be integers (tile size) or floating point + // floating point <= 1 means fraction of the latent dimension + // floating point > 1 means number of tiles in that dimension + // a single number gets applied to both dimensions std::string sd_tile_size_str = SD_TILE_SIZE; + size_t x_pos = sd_tile_size_str.find('x'); try { - tile_size = std::stoi(sd_tile_size_str); + int tmp_x = tile_size_x, tmp_y = tile_size_y; + if (x_pos != std::string::npos) { + std::string tile_x_str = sd_tile_size_str.substr(0, x_pos); + std::string tile_y_str = sd_tile_size_str.substr(x_pos + 1); + if (tile_x_str.find('.') != std::string::npos) { + float tile_factor = std::stof(tile_x_str); + if (tile_factor > 0.0) { + if (tile_factor > 1.0) + tile_factor = 1.0 / tile_factor; + tmp_x = (W / (decode ? 1 : 8)) * tile_factor; + } + } + else { + tmp_x = std::stoi(tile_x_str); + } + if (tile_y_str.find('.') != std::string::npos) { + float tile_factor = std::stof(tile_y_str); + if (tile_factor > 0.0) { + if (tile_factor > 1.0) + tile_factor = 1.0 / tile_factor; + tmp_y = (H / (decode ? 1 : 8)) * tile_factor; + } + } + else { + tmp_y = std::stoi(tile_y_str); + } + } + else { + if (sd_tile_size_str.find('.') != std::string::npos) { + float tile_factor = std::stof(sd_tile_size_str); + if (tile_factor > 0) { + if (tile_factor > 1.0) + tile_factor = 1.0 / tile_factor; + tmp_x = (W / (decode ? 1 : 8)) * tile_factor; + tmp_y = (H / (decode ? 1 : 8)) * tile_factor; + } + } + else { + tmp_x = tmp_y = std::stoi(sd_tile_size_str); + } + } + tile_size_x = tmp_x; + tile_size_y = tmp_y; } catch (const std::invalid_argument&) { LOG_WARN("Invalid"); } catch (const std::out_of_range&) { @@ -1443,7 +1492,8 @@ class StableDiffusionGGML { if(!decode){ // TODO: also use and arg for this one? // to keep the compute buffer size consistent - tile_size*=1.30539; + tile_size_x*=1.30539; + tile_size_y*=1.30539; } if (!use_tiny_autoencoder) { if (decode) { @@ -1452,11 +1502,14 @@ class StableDiffusionGGML { ggml_tensor_scale_input(x); } if (vae_tiling) { + if (SD_TILE_SIZE != nullptr) { + LOG_INFO("VAE Tile size: %dx%d", tile_size_x, tile_size_y); + } // split latent in 32x32 tiles and compute in several steps auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) { first_stage_model->compute(n_threads, in, decode, &out); }; - sd_tiling(x, result, 8, tile_size, 0.5f, on_tiling); + sd_tiling_non_square(x, result, 8, tile_size_x, tile_size_y, 0.5f, on_tiling); } else { first_stage_model->compute(n_threads, x, decode, &result); } From 680140db54f48cb0f428ae35641dcfd25b99164d Mon Sep 17 00:00:00 2001 From: Wagner Bruna Date: Sun, 15 Jun 2025 10:24:02 -0300 Subject: [PATCH 3/7] add env var to change tile overlap --- stable-diffusion.cpp | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 83b801260..ecb5b61e7 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -1489,6 +1489,18 @@ class StableDiffusionGGML { LOG_WARN("OOR"); } } + float tile_overlap = 0.5f; + const char* SD_TILE_OVERLAP = getenv("SD_TILE_OVERLAP"); + if (SD_TILE_OVERLAP != nullptr) { + std::string sd_tile_overlap_str = SD_TILE_OVERLAP; + try { + tile_overlap = std::stof(sd_tile_overlap_str); + } catch (const std::invalid_argument&) { + LOG_WARN("Invalid"); + } catch (const std::out_of_range&) { + LOG_WARN("OOR"); + } + } if(!decode){ // TODO: also use and arg for this one? // to keep the compute buffer size consistent @@ -1505,11 +1517,14 @@ class StableDiffusionGGML { if (SD_TILE_SIZE != nullptr) { LOG_INFO("VAE Tile size: %dx%d", tile_size_x, tile_size_y); } + if (SD_TILE_OVERLAP != nullptr) { + LOG_INFO("VAE Tile overlap: %.2f", tile_overlap); + } // split latent in 32x32 tiles and compute in several steps auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) { first_stage_model->compute(n_threads, in, decode, &out); }; - sd_tiling_non_square(x, result, 8, tile_size_x, tile_size_y, 0.5f, on_tiling); + sd_tiling_non_square(x, result, 8, tile_size_x, tile_size_y, tile_overlap, on_tiling); } else { first_stage_model->compute(n_threads, x, decode, &result); } From fe84190af330fc2fe852c1fbe151f9bc62ed5ade Mon Sep 17 00:00:00 2001 From: Wagner Bruna Date: Wed, 18 Jun 2025 09:25:10 -0300 Subject: [PATCH 4/7] add safeguards and better error messages for SD_TILE_OVERLAP --- stable-diffusion.cpp | 41 ++++++++++++++++++++++++++--------------- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index ecb5b61e7..229d2bc36 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -1427,9 +1427,31 @@ class StableDiffusionGGML { x->ne[3]); // channels int64_t t0 = ggml_time_ms(); + // TODO: args instead of env for tile size / overlap? + + float tile_overlap = 0.5f; + const char* SD_TILE_OVERLAP = getenv("SD_TILE_OVERLAP"); + if (SD_TILE_OVERLAP != nullptr) { + std::string sd_tile_overlap_str = SD_TILE_OVERLAP; + try { + tile_overlap = std::stof(sd_tile_overlap_str); + if (tile_overlap < 0.0) { + LOG_WARN("SD_TILE_OVERLAP too low, setting it to 0.0"); + tile_overlap = 0.0; + } + else if (tile_overlap > 0.95) { + LOG_WARN("SD_TILE_OVERLAP too high, setting it to 0.95"); + tile_overlap = 0.95; + } + } catch (const std::invalid_argument&) { + LOG_WARN("SD_TILE_OVERLAP is invalid, keeping the default"); + } catch (const std::out_of_range&) { + LOG_WARN("SD_TILE_OVERLAP is out of range, keeping the default"); + } + } + int tile_size_x = 32; int tile_size_y = 32; - // TODO: arg instead of env? const char* SD_TILE_SIZE = getenv("SD_TILE_SIZE"); if (SD_TILE_SIZE != nullptr) { // format is AxB, or just A (equivalent to AxA) @@ -1484,23 +1506,12 @@ class StableDiffusionGGML { tile_size_x = tmp_x; tile_size_y = tmp_y; } catch (const std::invalid_argument&) { - LOG_WARN("Invalid"); - } catch (const std::out_of_range&) { - LOG_WARN("OOR"); - } - } - float tile_overlap = 0.5f; - const char* SD_TILE_OVERLAP = getenv("SD_TILE_OVERLAP"); - if (SD_TILE_OVERLAP != nullptr) { - std::string sd_tile_overlap_str = SD_TILE_OVERLAP; - try { - tile_overlap = std::stof(sd_tile_overlap_str); - } catch (const std::invalid_argument&) { - LOG_WARN("Invalid"); + LOG_WARN("SD_TILE_SIZE is invalid, keeping the default"); } catch (const std::out_of_range&) { - LOG_WARN("OOR"); + LOG_WARN("SD_TILE_SIZE is out of range, keeping the default"); } } + if(!decode){ // TODO: also use and arg for this one? // to keep the compute buffer size consistent From 4798be9b49fedd09ae250cc07146962220bcb5df Mon Sep 17 00:00:00 2001 From: Wagner Bruna Date: Wed, 18 Jun 2025 13:16:43 -0300 Subject: [PATCH 5/7] add safeguards and include overlapping factor for SD_TILE_SIZE --- stable-diffusion.cpp | 43 +++++++++++++++++++------------------------ 1 file changed, 19 insertions(+), 24 deletions(-) diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 229d2bc36..894cd9e66 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -1456,9 +1456,18 @@ class StableDiffusionGGML { if (SD_TILE_SIZE != nullptr) { // format is AxB, or just A (equivalent to AxA) // A and B can be integers (tile size) or floating point - // floating point <= 1 means fraction of the latent dimension - // floating point > 1 means number of tiles in that dimension - // a single number gets applied to both dimensions + // floating point <= 1 means simple fraction of the latent dimension + // floating point > 1 means number of tiles across that dimension + // a single number gets applied to both + auto get_tile_factor = [tile_overlap](const std::string& factor_str) { + float factor = std::stof(factor_str); + if (factor > 1.0) + factor = 1 / (factor - factor * tile_overlap + tile_overlap); + return factor; + }; + const int latent_x = W / (decode ? 1 : 8); + const int latent_y = H / (decode ? 1 : 8); + const int min_tile_dimension = 4; std::string sd_tile_size_str = SD_TILE_SIZE; size_t x_pos = sd_tile_size_str.find('x'); try { @@ -1467,23 +1476,13 @@ class StableDiffusionGGML { std::string tile_x_str = sd_tile_size_str.substr(0, x_pos); std::string tile_y_str = sd_tile_size_str.substr(x_pos + 1); if (tile_x_str.find('.') != std::string::npos) { - float tile_factor = std::stof(tile_x_str); - if (tile_factor > 0.0) { - if (tile_factor > 1.0) - tile_factor = 1.0 / tile_factor; - tmp_x = (W / (decode ? 1 : 8)) * tile_factor; - } + tmp_x = latent_x * get_tile_factor(tile_x_str); } else { tmp_x = std::stoi(tile_x_str); } if (tile_y_str.find('.') != std::string::npos) { - float tile_factor = std::stof(tile_y_str); - if (tile_factor > 0.0) { - if (tile_factor > 1.0) - tile_factor = 1.0 / tile_factor; - tmp_y = (H / (decode ? 1 : 8)) * tile_factor; - } + tmp_y = latent_y * get_tile_factor(tile_y_str); } else { tmp_y = std::stoi(tile_y_str); @@ -1491,20 +1490,16 @@ class StableDiffusionGGML { } else { if (sd_tile_size_str.find('.') != std::string::npos) { - float tile_factor = std::stof(sd_tile_size_str); - if (tile_factor > 0) { - if (tile_factor > 1.0) - tile_factor = 1.0 / tile_factor; - tmp_x = (W / (decode ? 1 : 8)) * tile_factor; - tmp_y = (H / (decode ? 1 : 8)) * tile_factor; - } + float tile_factor = get_tile_factor(sd_tile_size_str); + tmp_x = latent_x * tile_factor; + tmp_y = latent_y * tile_factor; } else { tmp_x = tmp_y = std::stoi(sd_tile_size_str); } } - tile_size_x = tmp_x; - tile_size_y = tmp_y; + tile_size_x = std::max(std::min(tmp_x, latent_x), min_tile_dimension); + tile_size_y = std::max(std::min(tmp_y, latent_y), min_tile_dimension); } catch (const std::invalid_argument&) { LOG_WARN("SD_TILE_SIZE is invalid, keeping the default"); } catch (const std::out_of_range&) { From fef286ea7537fb34f8aa8bc213e03aaa3895ffa1 Mon Sep 17 00:00:00 2001 From: Wagner Bruna Date: Wed, 18 Jun 2025 13:41:45 -0300 Subject: [PATCH 6/7] avoid rounding issues when specifying SD_TILE_SIZE as a factor --- stable-diffusion.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 894cd9e66..eae17b7d1 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -1476,13 +1476,13 @@ class StableDiffusionGGML { std::string tile_x_str = sd_tile_size_str.substr(0, x_pos); std::string tile_y_str = sd_tile_size_str.substr(x_pos + 1); if (tile_x_str.find('.') != std::string::npos) { - tmp_x = latent_x * get_tile_factor(tile_x_str); + tmp_x = std::round(latent_x * get_tile_factor(tile_x_str)); } else { tmp_x = std::stoi(tile_x_str); } if (tile_y_str.find('.') != std::string::npos) { - tmp_y = latent_y * get_tile_factor(tile_y_str); + tmp_y = std::round(latent_y * get_tile_factor(tile_y_str)); } else { tmp_y = std::stoi(tile_y_str); @@ -1491,8 +1491,8 @@ class StableDiffusionGGML { else { if (sd_tile_size_str.find('.') != std::string::npos) { float tile_factor = get_tile_factor(sd_tile_size_str); - tmp_x = latent_x * tile_factor; - tmp_y = latent_y * tile_factor; + tmp_x = std::round(latent_x * tile_factor); + tmp_y = std::round(latent_y * tile_factor); } else { tmp_x = tmp_y = std::stoi(sd_tile_size_str); From d64a9c86751e9a6299a7fb481398e378666b730e Mon Sep 17 00:00:00 2001 From: Wagner Bruna Date: Wed, 18 Jun 2025 14:24:51 -0300 Subject: [PATCH 7/7] lower SD_TILE_OVERLAP limit --- stable-diffusion.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index eae17b7d1..29bda43c4 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -1439,9 +1439,9 @@ class StableDiffusionGGML { LOG_WARN("SD_TILE_OVERLAP too low, setting it to 0.0"); tile_overlap = 0.0; } - else if (tile_overlap > 0.95) { - LOG_WARN("SD_TILE_OVERLAP too high, setting it to 0.95"); - tile_overlap = 0.95; + else if (tile_overlap > 0.5) { + LOG_WARN("SD_TILE_OVERLAP too high, setting it to 0.5"); + tile_overlap = 0.5; } } catch (const std::invalid_argument&) { LOG_WARN("SD_TILE_OVERLAP is invalid, keeping the default");