Commit f88bb9e

Merge branch 'master' into type_rules_generation

2 parents: 9ac48e8 + 742a733

38 files changed: +3443 -1617 lines

.github/workflows/build.yml

Lines changed: 102 additions & 18 deletions

@@ -65,7 +65,7 @@ jobs:
 
       - name: Get commit hash
         id: commit
-        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/main' ) || github.event.inputs.create_release == 'true' }}
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
         uses: pr-mpt/actions-commit-hash@v2
 
       - name: Fetch system info

@@ -118,7 +118,7 @@ jobs:
 
       - name: Get commit hash
         id: commit
-        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/main' ) || github.event.inputs.create_release == 'true' }}
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
         uses: pr-mpt/actions-commit-hash@v2
 
       - name: Fetch system info

@@ -164,8 +164,6 @@ jobs:
             defines: "-DGGML_NATIVE=OFF -DGGML_AVX512=ON -DGGML_AVX=ON -DGGML_AVX2=ON -DSD_BUILD_SHARED_LIBS=ON"
           - build: "cuda12"
             defines: "-DSD_CUDA=ON -DSD_BUILD_SHARED_LIBS=ON -DCMAKE_CUDA_ARCHITECTURES=90;89;86;80;75"
-          # - build: "rocm5.5"
-          #   defines: '-G Ninja -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DSD_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS="gfx1100;gfx1102;gfx1030" -DSD_BUILD_SHARED_LIBS=ON'
           - build: 'vulkan'
             defines: "-DSD_VULKAN=ON -DSD_BUILD_SHARED_LIBS=ON"
     steps:

@@ -184,22 +182,9 @@ jobs:
           method: "network"
           sub-packages: '["nvcc", "cudart", "cublas", "cublas_dev", "thrust", "visual_studio_integration"]'
 
-      - name: Install rocm-toolkit
-        id: rocm-toolkit
-        if: ${{ matrix.build == 'rocm5.5' }}
-        uses: Cyberhan123/rocm-toolkit@v0.1.0
-        with:
-          rocm: "5.5.0"
-
-      - name: Install Ninja
-        id: install-ninja
-        if: ${{ matrix.build == 'rocm5.5' }}
-        uses: urkle/action-get-ninja@v1
-        with:
-          version: 1.11.1
       - name: Install Vulkan SDK
         id: get_vulkan
-        if: ${{ matrix.build == 'vulkan' }} https://sdk.lunarg.com/sdk/download/1.4.328.1/windows/vulkansdk-windows-X64-1.4.328.1.exe
+        if: ${{ matrix.build == 'vulkan' }}
         run: |
           curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/vulkansdk-windows-X64-${env:VULKAN_VERSION}.exe"
           & "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install

@@ -277,6 +262,104 @@ jobs:
         path: |
           sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip
 
+  windows-latest-cmake-hip:
+    runs-on: windows-2022
+
+    env:
+      HIPSDK_INSTALLER_VERSION: "25.Q3"
+      GPU_TARGETS: "gfx1151;gfx1200;gfx1201;gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032"
+
+    steps:
+      - uses: actions/checkout@v3
+        with:
+          submodules: recursive
+
+      - name: Cache ROCm Installation
+        id: cache-rocm
+        uses: actions/cache@v4
+        with:
+          path: C:\Program Files\AMD\ROCm
+          key: rocm-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ runner.os }}
+
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.16
+        with:
+          key: windows-latest-cmake-hip-${{ env.HIPSDK_INSTALLER_VERSION }}-x64
+          evict-old-files: 1d
+
+      - name: Install ROCm
+        if: steps.cache-rocm.outputs.cache-hit != 'true'
+        run: |
+          $ErrorActionPreference = "Stop"
+          write-host "Downloading AMD HIP SDK Installer"
+          Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
+          write-host "Installing AMD HIP SDK"
+          $proc = Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -PassThru
+          $completed = $proc.WaitForExit(600000)
+          if (-not $completed) {
+            Write-Error "ROCm installation timed out after 10 minutes. Killing the process"
+            $proc.Kill()
+            exit 1
+          }
+          if ($proc.ExitCode -ne 0) {
+            Write-Error "ROCm installation failed with exit code $($proc.ExitCode)"
+            exit 1
+          }
+          write-host "Completed AMD HIP SDK installation"
+
+      - name: Verify ROCm
+        run: |
+          # Find and test ROCm installation
+          $clangPath = Get-ChildItem 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | Select-Object -First 1
+          if (-not $clangPath) {
+            Write-Error "ROCm installation not found"
+            exit 1
+          }
+          & $clangPath.FullName --version
+          # Set HIP_PATH environment variable for later steps
+          echo "HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)" >> $env:GITHUB_ENV
+
+      - name: Build
+        run: |
+          mkdir build
+          cd build
+          $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
+          cmake .. `
+            -G "Unix Makefiles" `
+            -DSD_HIPBLAS=ON `
+            -DSD_BUILD_SHARED_LIBS=ON `
+            -DGGML_NATIVE=OFF `
+            -DCMAKE_C_COMPILER=clang `
+            -DCMAKE_CXX_COMPILER=clang++ `
+            -DCMAKE_BUILD_TYPE=Release `
+            -DGPU_TARGETS="${{ env.GPU_TARGETS }}"
+          cmake --build . --config Release --parallel ${env:NUMBER_OF_PROCESSORS}
+
+      - name: Get commit hash
+        id: commit
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        uses: pr-mpt/actions-commit-hash@v2
+
+      - name: Pack artifacts
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        run: |
+          md "build\bin\rocblas\library\"
+          md "build\bin\hipblaslt\library"
+          cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\"
+          cp "${env:HIP_PATH}\bin\hipblaslt.dll" "build\bin\"
+          cp "${env:HIP_PATH}\bin\rocblas.dll" "build\bin\"
+          cp "${env:HIP_PATH}\bin\rocblas\library\*" "build\bin\rocblas\library\"
+          cp "${env:HIP_PATH}\bin\hipblaslt\library\*" "build\bin\hipblaslt\library\"
+          7z a sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-x64.zip .\build\bin\*
+
+      - name: Upload artifacts
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        uses: actions/upload-artifact@v4
+        with:
+          name: sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-x64.zip
+          path: |
+            sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-x64.zip
+
   release:
     if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}

@@ -286,6 +369,7 @@ jobs:
       - ubuntu-latest-cmake
       - macOS-latest-cmake
       - windows-latest-cmake
+      - windows-latest-cmake-hip
 
     steps:
       - name: Clone

.gitignore

Lines changed: 1 addition & 0 deletions

@@ -12,3 +12,4 @@ test/
 output*.png
 models*
 *.log
+preview.png

README.md

Lines changed: 3 additions & 1 deletion

@@ -81,7 +81,9 @@ API and command-line option may change frequently.***
 - [`DPM++ 2M v2`](https://github.com/AUTOMATIC1111/stable-diffusion-webui/discussions/8457)
 - `DPM++ 2S a`
 - [`LCM`](https://github.com/AUTOMATIC1111/stable-diffusion-webui/issues/13952)
-- Cross-platform reproducibility (`--rng cuda`, consistent with the `stable-diffusion-webui GPU RNG`)
+- Cross-platform reproducibility
+    - `--rng cuda`, default, consistent with the `stable-diffusion-webui GPU RNG`
+    - `--rng cpu`, consistent with the `comfyui RNG`
 - Embedds generation parameters into png output as webui-compatible text string
 
 ## Quick Start

clip.hpp

Lines changed: 1 addition & 1 deletion

@@ -936,7 +936,7 @@ struct CLIPTextModelRunner : public GGMLRunner {
                                      size_t max_token_idx = 0,
                                      bool return_pooled = false,
                                      int clip_skip = -1) {
-        struct ggml_cgraph* gf = ggml_new_graph(compute_ctx);
+        struct ggml_cgraph* gf = new_graph_custom(2048);
 
         input_ids = to_backend(input_ids);

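Note on the change above: `new_graph_custom` is a GGMLRunner helper that this diff does not show; the sketch below shows the plain ggml API it presumably wraps. `ggml_new_graph()` sizes a graph's node pool at the compiled-in `GGML_DEFAULT_GRAPH_SIZE`, while `ggml_new_graph_custom()` takes the node capacity explicitly, which is what lets the runner pin the CLIP graph at 2048 nodes. A minimal sketch, assuming the helper forwards to this call:

    #include "ggml.h"

    static struct ggml_cgraph* make_inference_graph(struct ggml_context* ctx, size_t n_nodes) {
        // grads = false: inference-only graph, no gradient storage is reserved
        return ggml_new_graph_custom(ctx, n_nodes, false);
    }
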
common.hpp

Lines changed: 24 additions & 18 deletions

@@ -182,31 +182,21 @@ class GEGLU : public UnaryBlock {
     int64_t dim_in;
     int64_t dim_out;
 
-    void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, std::string prefix = "") override {
-        enum ggml_type wtype = get_type(prefix + "proj.weight", tensor_storage_map, GGML_TYPE_F32);
-        enum ggml_type bias_wtype = GGML_TYPE_F32;
-        params["proj.weight"] = ggml_new_tensor_2d(ctx, wtype, dim_in, dim_out * 2);
-        params["proj.bias"] = ggml_new_tensor_1d(ctx, bias_wtype, dim_out * 2);
-    }
-
 public:
     GEGLU(int64_t dim_in, int64_t dim_out)
-        : dim_in(dim_in), dim_out(dim_out) {}
+        : dim_in(dim_in), dim_out(dim_out) {
+        blocks["proj"] = std::shared_ptr<GGMLBlock>(new Linear(dim_in, dim_out * 2));
+    }
 
     struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override {
         // x: [ne3, ne2, ne1, dim_in]
         // return: [ne3, ne2, ne1, dim_out]
-        struct ggml_tensor* w = params["proj.weight"];
-        struct ggml_tensor* b = params["proj.bias"];
-
-        auto x_w = ggml_view_2d(ctx->ggml_ctx, w, w->ne[0], w->ne[1] / 2, w->nb[1], 0);  // [dim_out, dim_in]
-        auto x_b = ggml_view_1d(ctx->ggml_ctx, b, b->ne[0] / 2, 0);  // [dim_out, dim_in]
-        auto gate_w = ggml_view_2d(ctx->ggml_ctx, w, w->ne[0], w->ne[1] / 2, w->nb[1], w->nb[1] * w->ne[1] / 2);  // [dim_out, ]
-        auto gate_b = ggml_view_1d(ctx->ggml_ctx, b, b->ne[0] / 2, b->nb[0] * b->ne[0] / 2);  // [dim_out, ]
+        auto proj = std::dynamic_pointer_cast<Linear>(blocks["proj"]);
 
-        auto x_in = x;
-        x = ggml_ext_linear(ctx->ggml_ctx, x_in, x_w, x_b);  // [ne3, ne2, ne1, dim_out]
-        auto gate = ggml_ext_linear(ctx->ggml_ctx, x_in, gate_w, gate_b);  // [ne3, ne2, ne1, dim_out]
+        x = proj->forward(ctx, x);  // [ne3, ne2, ne1, dim_out*2]
+        auto x_vec = ggml_ext_chunk(ctx->ggml_ctx, x, 2, 0);
+        x = x_vec[0];  // [ne3, ne2, ne1, dim_out]
+        auto gate = x_vec[1];  // [ne3, ne2, ne1, dim_out]
 
         gate = ggml_gelu_inplace(ctx->ggml_ctx, gate);

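The refactor above leaves the GEGLU math unchanged: one fused projection emits 2*dim_out channels per position, split into a value half and a gate half, and the gate goes through GELU. A minimal scalar sketch of that computation, independent of the repo's ggml helpers (the tanh-form GELU is an assumption about what `ggml_gelu_inplace` computes):

    #include <cmath>
    #include <vector>

    // proj_out: the fused projection output for one position, 2*dim_out values.
    // Returns value_half * GELU(gate_half), i.e. GEGLU.
    std::vector<float> geglu(const std::vector<float>& proj_out, size_t dim_out) {
        std::vector<float> y(dim_out);
        for (size_t i = 0; i < dim_out; i++) {
            float v    = proj_out[i];            // x_vec[0] in the diff
            float gate = proj_out[dim_out + i];  // x_vec[1] in the diff
            // tanh approximation of GELU
            float g = 0.5f * gate * (1.0f + std::tanh(0.7978845608f * (gate + 0.044715f * gate * gate * gate)));
            y[i] = v * g;
        }
        return y;
    }

Routing the projection through a regular Linear block instead of hand-built weight views also means the proj weights presumably go through the same init/load path as every other block, which is what makes the removed init_params override unnecessary.
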
@@ -410,6 +400,22 @@ class SpatialTransformer : public GGMLBlock {
     int64_t context_dim = 768;  // hidden_size, 1024 for VERSION_SD2
     bool use_linear = false;
 
+    void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") {
+        auto iter = tensor_storage_map.find(prefix + "proj_out.weight");
+        if (iter != tensor_storage_map.end()) {
+            int64_t inner_dim = n_head * d_head;
+            if (iter->second.n_dims == 4 && use_linear) {
+                use_linear = false;
+                blocks["proj_in"] = std::make_shared<Conv2d>(in_channels, inner_dim, std::pair{1, 1});
+                blocks["proj_out"] = std::make_shared<Conv2d>(inner_dim, in_channels, std::pair{1, 1});
+            } else if (iter->second.n_dims == 2 && !use_linear) {
+                use_linear = true;
+                blocks["proj_in"] = std::make_shared<Linear>(in_channels, inner_dim);
+                blocks["proj_out"] = std::make_shared<Linear>(inner_dim, in_channels);
+            }
+        }
+    }
+
 public:
     SpatialTransformer(int64_t in_channels,
                        int64_t n_head,

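A 1x1 Conv2d and a Linear layer compute the same per-pixel projection; checkpoints differ only in whether proj_in/proj_out weights were saved 4-dimensional ([out, in, 1, 1]) or 2-dimensional ([out, in]). The new init_params override inspects the stored proj_out.weight and rebuilds the blocks to match, so tensor shapes line up at load time. The dispatch rule in isolation (a sketch with hypothetical names, rule taken from the diff):

    enum class ProjKind { Conv1x1, Linear };

    // n_dims of the checkpoint's proj_out.weight decides the block layout.
    inline ProjKind proj_kind_from_checkpoint(int n_dims, ProjKind configured) {
        if (n_dims == 4) return ProjKind::Conv1x1;  // saved by a 1x1 conv
        if (n_dims == 2) return ProjKind::Linear;   // saved by a linear layer
        return configured;  // anything else: keep the configured default
    }
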
conditioner.hpp

Lines changed: 78 additions & 4 deletions

@@ -34,6 +34,7 @@ struct Conditioner {
     virtual void free_params_buffer() = 0;
     virtual void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) = 0;
     virtual size_t get_params_buffer_size() = 0;
+    virtual void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) {}
     virtual std::tuple<SDCondition, std::vector<bool>> get_learned_condition_with_trigger(ggml_context* work_ctx,
                                                                                           int n_threads,
                                                                                           const ConditionerParams& conditioner_params) {

@@ -108,10 +109,17 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
         return buffer_size;
     }
 
+    void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
+        text_model->set_weight_adapter(adapter);
+        if (sd_version_is_sdxl(version)) {
+            text_model2->set_weight_adapter(adapter);
+        }
+    }
+
     bool load_embedding(std::string embd_name, std::string embd_path, std::vector<int32_t>& bpe_tokens) {
         // the order matters
         ModelLoader model_loader;
-        if (!model_loader.init_from_file(embd_path)) {
+        if (!model_loader.init_from_file_and_convert_name(embd_path)) {
             LOG_ERROR("embedding '%s' failed", embd_name.c_str());
             return false;
         }
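
All of the `set_weight_adapter` additions in this file follow the same shape: the base `Conditioner` declares the hook with a no-op body rather than `= 0`, so only embedders that own adapter-capable submodules override it, and each override null-checks optional submodules (such as `clip_g` or `t5` below) before forwarding. A standalone sketch of the design point (`WeightAdapter` is opaque here; the diff does not define it):

    #include <memory>

    struct WeightAdapter;  // defined elsewhere in the repo; opaque in this sketch

    struct Conditioner {
        virtual ~Conditioner() = default;
        // Default no-op keeps every existing subclass compiling; the hook is opt-in.
        virtual void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) {}
    };
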
@@ -270,13 +278,30 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
             const std::string& curr_text = item.first;
             float curr_weight = item.second;
             // printf(" %s: %f \n", curr_text.c_str(), curr_weight);
+            int32_t clean_index = 0;
+            if (curr_text == "BREAK" && curr_weight == -1.0f) {
+                // Pad token array up to chunk size at this point.
+                // TODO: This is a hardcoded chunk_len, like in stable-diffusion.cpp, make it a parameter for the future?
+                // Also, this is 75 instead of 77 to leave room for BOS and EOS tokens.
+                int padding_size = 75 - (tokens_acc % 75);
+                for (int j = 0; j < padding_size; j++) {
+                    clean_input_ids.push_back(tokenizer.EOS_TOKEN_ID);
+                    clean_index++;
+                }
+
+                // After padding, continue to the next iteration to process the following text as a new segment
+                tokens.insert(tokens.end(), clean_input_ids.begin(), clean_input_ids.end());
+                weights.insert(weights.end(), padding_size, curr_weight);
+                continue;
+            }
+
+            // Regular token, process normally
             std::vector<int> curr_tokens = tokenizer.encode(curr_text, on_new_token_cb);
-            int32_t clean_index = 0;
             for (uint32_t i = 0; i < curr_tokens.size(); i++) {
                 int token_id = curr_tokens[i];
-                if (token_id == image_token)
+                if (token_id == image_token) {
                     class_token_index.push_back(clean_index - 1);
-                else {
+                } else {
                     clean_input_ids.push_back(token_id);
                     clean_index++;
                 }

@@ -379,6 +404,22 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
         for (const auto& item : parsed_attention) {
             const std::string& curr_text = item.first;
             float curr_weight = item.second;
+
+            if (curr_text == "BREAK" && curr_weight == -1.0f) {
+                // Pad token array up to chunk size at this point.
+                // TODO: This is a hardcoded chunk_len, like in stable-diffusion.cpp, make it a parameter for the future?
+                // Also, this is 75 instead of 77 to leave room for BOS and EOS tokens.
+                size_t current_size = tokens.size();
+                size_t padding_size = (75 - (current_size % 75)) % 75;  // Ensure no negative padding
+
+                if (padding_size > 0) {
+                    LOG_DEBUG("BREAK token encountered, padding current chunk by %zu tokens.", padding_size);
+                    tokens.insert(tokens.end(), padding_size, tokenizer.EOS_TOKEN_ID);
+                    weights.insert(weights.end(), padding_size, 1.0f);
+                }
+                continue;  // Skip to the next item after handling BREAK
+            }
+
             std::vector<int> curr_tokens = tokenizer.encode(curr_text, on_new_token_cb);
             tokens.insert(tokens.end(), curr_tokens.begin(), curr_tokens.end());
             weights.insert(weights.end(), curr_tokens.size(), curr_weight);
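
The two BREAK branches use slightly different padding formulas, and they diverge exactly when the running token count is already a multiple of 75: the first, `75 - (tokens_acc % 75)` (where `tokens_acc` is the running count kept earlier in that function, outside this hunk), pads a full extra chunk, while the second's trailing `% 75` turns the aligned case into a no-op. A small standalone check of the arithmetic:

    #include <cassert>
    #include <cstddef>

    int main() {
        // Variant A (trigger-token path): always pads up to the next boundary,
        // a full 75 tokens when already aligned.
        auto pad_a = [](int n) { return 75 - (n % 75); };
        assert(pad_a(5) == 70);
        assert(pad_a(75) == 75);

        // Variant B: the extra "% 75" makes aligned input a no-op.
        auto pad_b = [](size_t n) { return (75 - (n % 75)) % 75; };
        assert(pad_b(5) == 70);
        assert(pad_b(75) == 0);
        return 0;
    }
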
@@ -764,6 +805,18 @@ struct SD3CLIPEmbedder : public Conditioner {
         return buffer_size;
     }
 
+    void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
+        if (clip_l) {
+            clip_l->set_weight_adapter(adapter);
+        }
+        if (clip_g) {
+            clip_g->set_weight_adapter(adapter);
+        }
+        if (t5) {
+            t5->set_weight_adapter(adapter);
+        }
+    }
+
     std::vector<std::pair<std::vector<int>, std::vector<float>>> tokenize(std::string text,
                                                                           size_t max_length = 0,
                                                                           bool padding = false) {

@@ -1160,6 +1213,15 @@ struct FluxCLIPEmbedder : public Conditioner {
         return buffer_size;
     }
 
+    void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) {
+        if (clip_l) {
+            clip_l->set_weight_adapter(adapter);
+        }
+        if (t5) {
+            t5->set_weight_adapter(adapter);
+        }
+    }
+
     std::vector<std::pair<std::vector<int>, std::vector<float>>> tokenize(std::string text,
                                                                           size_t max_length = 0,
                                                                           bool padding = false) {

@@ -1400,6 +1462,12 @@ struct T5CLIPEmbedder : public Conditioner {
         return buffer_size;
     }
 
+    void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
+        if (t5) {
+            t5->set_weight_adapter(adapter);
+        }
+    }
+
     std::tuple<std::vector<int>, std::vector<float>, std::vector<float>> tokenize(std::string text,
                                                                                   size_t max_length = 0,
                                                                                   bool padding = false) {

@@ -1589,6 +1657,12 @@ struct Qwen2_5_VLCLIPEmbedder : public Conditioner {
         return buffer_size;
     }
 
+    void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
+        if (qwenvl) {
+            qwenvl->set_weight_adapter(adapter);
+        }
+    }
+
     std::tuple<std::vector<int>, std::vector<float>> tokenize(std::string text,
                                                               size_t max_length = 0,
                                                               size_t system_prompt_length = 0,
