@@ -66,17 +66,21 @@ struct SDParams {
6666 // models
6767 std::string model_path;
6868 std::string clip_l_path;
69+ std::string clip_g_path;
6970 std::string t5xxl_path;
7071 std::string diffusion_model_path;
7172 std::string vae_path;
72- // std::string taesd_path;
73+ std::string taesd_path;
74+ std::string esrgan_path;
75+ std::string controlnet_path;
7376 std::string embeddings_path;
7477 std::string stacked_id_embeddings_path;
75- std::string lora_model_dir;
76-
78+ std::string input_id_images_path;
7779 sd_type_t wtype = SD_TYPE_COUNT;
80+ std::string lora_model_dir;
7881 std::string output_path = " output.png" ;
7982 std::string input_path;
83+ std::string control_image_path;
8084
8185 std::string prompt;
8286 std::string negative_prompt;
@@ -93,17 +97,22 @@ struct SDParams {
9397 schedule_t schedule = DEFAULT;
9498 int sample_steps = 20 ;
9599 float strength = 0 .75f ;
100+ float control_strength = 0 .9f ;
96101 rng_type_t rng_type = CUDA_RNG;
97102 int64_t seed = 42 ;
98103 bool verbose = false ;
99104 bool vae_tiling = false ;
105+ bool control_net_cpu = false ;
100106 bool normalize_input = false ;
101107 bool clip_on_cpu = false ;
102108 bool vae_on_cpu = false ;
109+ bool diffusion_flash_attn = false ;
103110 bool color = false ;
104111
105- // Photomaker params
106- std::string input_id_images_path;
112+ std::vector<int > skip_layers = {7 , 8 , 9 };
113+ float slg_scale = 0 .;
114+ float skip_layer_start = 0.01 ;
115+ float skip_layer_end = 0.2 ;
107116
108117 // server things
109118 int port = 8080 ;
@@ -113,24 +122,34 @@ struct SDParams {
113122void print_params (SDParams params) {
114123 printf (" Option: \n " );
115124 printf (" n_threads: %d\n " , params.n_threads );
125+ printf (" mode: server\n " );
116126 printf (" model_path: %s\n " , params.model_path .c_str ());
117127 printf (" wtype: %s\n " , params.wtype < SD_TYPE_COUNT ? sd_type_name (params.wtype ) : " unspecified" );
118128 printf (" clip_l_path: %s\n " , params.clip_l_path .c_str ());
129+ printf (" clip_g_path: %s\n " , params.clip_g_path .c_str ());
119130 printf (" t5xxl_path: %s\n " , params.t5xxl_path .c_str ());
120131 printf (" diffusion_model_path: %s\n " , params.diffusion_model_path .c_str ());
121132 printf (" vae_path: %s\n " , params.vae_path .c_str ());
122- // printf(" taesd_path: %s\n", params.taesd_path.c_str());
133+ printf (" taesd_path: %s\n " , params.taesd_path .c_str ());
134+ printf (" controlnet_path: %s\n " , params.controlnet_path .c_str ());
123135 printf (" embeddings_path: %s\n " , params.embeddings_path .c_str ());
124136 printf (" stacked_id_embeddings_path: %s\n " , params.stacked_id_embeddings_path .c_str ());
137+ printf (" input_id_images_path: %s\n " , params.input_id_images_path .c_str ());
125138 printf (" style ratio: %.2f\n " , params.style_ratio );
126- printf (" normzalize input image : %s\n " , params.normalize_input ? " true" : " false" );
139+ printf (" normalize input image : %s\n " , params.normalize_input ? " true" : " false" );
127140 printf (" output_path: %s\n " , params.output_path .c_str ());
141+ printf (" init_img: %s\n " , params.input_path .c_str ());
142+ printf (" control_image: %s\n " , params.control_image_path .c_str ());
128143 printf (" clip on cpu: %s\n " , params.clip_on_cpu ? " true" : " false" );
144+ printf (" controlnet cpu: %s\n " , params.control_net_cpu ? " true" : " false" );
129145 printf (" vae decoder on cpu:%s\n " , params.vae_on_cpu ? " true" : " false" );
146+ printf (" diffusion flash attention:%s\n " , params.diffusion_flash_attn ? " true" : " false" );
147+ printf (" strength(control): %.2f\n " , params.control_strength );
130148 printf (" prompt: %s\n " , params.prompt .c_str ());
131149 printf (" negative_prompt: %s\n " , params.negative_prompt .c_str ());
132150 printf (" min_cfg: %.2f\n " , params.min_cfg );
133151 printf (" cfg_scale: %.2f\n " , params.cfg_scale );
152+ printf (" slg_scale: %.2f\n " , params.slg_scale );
134153 printf (" guidance: %.2f\n " , params.guidance );
135154 printf (" clip_skip: %d\n " , params.clip_skip );
136155 printf (" width: %d\n " , params.width );
@@ -150,40 +169,59 @@ void print_usage(int argc, const char* argv[]) {
150169 printf (" \n " );
151170 printf (" arguments:\n " );
152171 printf (" -h, --help show this help message and exit\n " );
153- printf (" -M, --mode [MODEL] run mode (txt2img or img2img or convert, default: txt2img)\n " );
154- printf (" -t, --threads N number of threads to use during computation (default: -1).\n " );
172+ printf (" -t, --threads N number of threads to use during computation (default: -1)\n " );
155173 printf (" If threads <= 0, then threads will be set to the number of CPU physical cores\n " );
156174 printf (" -m, --model [MODEL] path to full model\n " );
157175 printf (" --diffusion-model path to the standalone diffusion model\n " );
158176 printf (" --clip_l path to the clip-l text encoder\n " );
159- printf (" --t5xxl path to the the t5xxl text encoder.\n " );
177+ printf (" --clip_g path to the clip-g text encoder\n " );
178+ printf (" --t5xxl path to the the t5xxl text encoder\n " );
160179 printf (" --vae [VAE] path to vae\n " );
161- printf (" --embd-dir [EMBEDDING_PATH] path to embeddings.\n " );
180+ printf (" --taesd [TAESD_PATH] path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)\n " );
181+ printf (" --control-net [CONTROL_PATH] path to control net model\n " );
182+ printf (" --embd-dir [EMBEDDING_PATH] path to embeddings\n " );
183+ printf (" --stacked-id-embd-dir [DIR] path to PHOTOMAKER stacked id embeddings\n " );
184+ printf (" --input-id-images-dir [DIR] path to PHOTOMAKER input id images dir\n " );
185+ printf (" --normalize-input normalize PHOTOMAKER input id images\n " );
186+ // printf(" --upscale-model [ESRGAN_PATH] path to esrgan model. Upscale images after generate, just RealESRGAN_x4plus_anime_6B supported by now\n");
187+ // printf(" --upscale-repeats Run the ESRGAN upscaler this many times (default 1)\n");
162188 printf (" --type [TYPE] weight type (f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_k, q3_k, q4_k)\n " );
163- printf (" If not specified, the default is the type of the weight file. \n " );
189+ printf (" If not specified, the default is the type of the weight file\n " );
164190 printf (" --lora-model-dir [DIR] lora model directory\n " );
191+ printf (" --control-image [IMAGE] path to image condition, control net\n " );
165192 printf (" -o, --output OUTPUT path to write result image to (default: ./output.png)\n " );
166193 printf (" -p, --prompt [PROMPT] the prompt to render\n " );
167194 printf (" -n, --negative-prompt PROMPT the negative prompt (default: \"\" )\n " );
168195 printf (" --cfg-scale SCALE unconditional guidance scale: (default: 7.0)\n " );
196+ printf (" --slg-scale SCALE skip layer guidance (SLG) scale, only for DiT models: (default: 0)\n " );
197+ printf (" 0 means disabled, a value of 2.5 is nice for sd3.5 medium\n " );
198+ printf (" --skip_layers LAYERS Layers to skip for SLG steps: (default: [7,8,9])\n " );
199+ printf (" --skip_layer_start START SLG enabling point: (default: 0.01)\n " );
200+ printf (" --skip_layer_end END SLG disabling point: (default: 0.2)\n " );
201+ printf (" SLG will be enabled at step int([STEPS]*[START]) and disabled at int([STEPS]*[END])\n " );
169202 printf (" --strength STRENGTH strength for noising/unnoising (default: 0.75)\n " );
170203 printf (" --style-ratio STYLE-RATIO strength for keeping input identity (default: 20%%)\n " );
171204 printf (" --control-strength STRENGTH strength to apply Control Net (default: 0.9)\n " );
172205 printf (" 1.0 corresponds to full destruction of information in init image\n " );
173206 printf (" -H, --height H image height, in pixel space (default: 512)\n " );
174207 printf (" -W, --width W image width, in pixel space (default: 512)\n " );
175- printf (" --sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, lcm}\n " );
208+ printf (" --sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm}\n " );
176209 printf (" sampling method (default: \" euler_a\" )\n " );
177210 printf (" --steps STEPS number of sample steps (default: 20)\n " );
178211 printf (" --rng {std_default, cuda} RNG (default: cuda)\n " );
179212 printf (" -s SEED, --seed SEED RNG seed (default: 42, use random seed for < 0)\n " );
180- printf (" -b, --batch-count COUNT number of images to generate. \n " );
181- printf (" --schedule {discrete, karras, ays} Denoiser sigma schedule (default: discrete)\n " );
213+ printf (" -b, --batch-count COUNT number of images to generate\n " );
214+ printf (" --schedule {discrete, karras, exponential, ays, gits } Denoiser sigma schedule (default: discrete)\n " );
182215 printf (" --clip-skip N ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1)\n " );
183216 printf (" <= 0 represents unspecified, will be 1 for SD1.x, 2 for SD2.x\n " );
184217 printf (" --vae-tiling process vae in tiles to reduce memory usage\n " );
185218 printf (" --vae-on-cpu keep vae in cpu (for low vram)\n " );
186- printf (" --clip-on-cpu keep clip in cpu (for low vram).\n " );
219+ printf (" --clip-on-cpu keep clip in cpu (for low vram)\n " );
220+ printf (" --diffusion-fa use flash attention in the diffusion model (for low vram)\n " );
221+ printf (" Might lower quality, since it implies converting k and v to f16.\n " );
222+ printf (" This might crash if it is not supported by the backend.\n " );
223+ printf (" --control-net-cpu keep controlnet in cpu (for low vram)\n " );
224+ printf (" --canny apply canny preprocessor (edge detection)\n " );
187225 printf (" --color Colors the logging tags according to level\n " );
188226 printf (" -v, --verbose print extra info\n " );
189227 printf (" --port port used for server (default: 8080)\n " );
@@ -214,6 +252,12 @@ void parse_args(int argc, const char** argv, SDParams& params) {
214252 break ;
215253 }
216254 params.clip_l_path = argv[i];
255+ } else if (arg == " --clip_g" ) {
256+ if (++i >= argc) {
257+ invalid_arg = true ;
258+ break ;
259+ }
260+ params.clip_g_path = argv[i];
217261 } else if (arg == " --t5xxl" ) {
218262 if (++i >= argc) {
219263 invalid_arg = true ;
@@ -232,7 +276,42 @@ void parse_args(int argc, const char** argv, SDParams& params) {
232276 break ;
233277 }
234278 params.vae_path = argv[i];
235- // TODO Tiny AE
279+ } else if (arg == " --taesd" ) {
280+ if (++i >= argc) {
281+ invalid_arg = true ;
282+ break ;
283+ }
284+ params.taesd_path = argv[i];
285+ } else if (arg == " --control-net" ) {
286+ if (++i >= argc) {
287+ invalid_arg = true ;
288+ break ;
289+ }
290+ params.controlnet_path = argv[i];
291+ } else if (arg == " --upscale-model" ) {
292+ if (++i >= argc) {
293+ invalid_arg = true ;
294+ break ;
295+ }
296+ params.esrgan_path = argv[i];
297+ } else if (arg == " --embd-dir" ) {
298+ if (++i >= argc) {
299+ invalid_arg = true ;
300+ break ;
301+ }
302+ params.embeddings_path = argv[i];
303+ } else if (arg == " --stacked-id-embd-dir" ) {
304+ if (++i >= argc) {
305+ invalid_arg = true ;
306+ break ;
307+ }
308+ params.stacked_id_embeddings_path = argv[i];
309+ } else if (arg == " --input-id-images-dir" ) {
310+ if (++i >= argc) {
311+ invalid_arg = true ;
312+ break ;
313+ }
314+ params.input_id_images_path = argv[i];
236315 } else if (arg == " --type" ) {
237316 if (++i >= argc) {
238317 invalid_arg = true ;
@@ -270,6 +349,18 @@ void parse_args(int argc, const char** argv, SDParams& params) {
270349 break ;
271350 }
272351 params.lora_model_dir = argv[i];
352+ } else if (arg == " -i" || arg == " --init-img" ) {
353+ if (++i >= argc) {
354+ invalid_arg = true ;
355+ break ;
356+ }
357+ params.input_path = argv[i];
358+ } else if (arg == " --control-image" ) {
359+ if (++i >= argc) {
360+ invalid_arg = true ;
361+ break ;
362+ }
363+ params.control_image_path = argv[i];
273364 } else if (arg == " -o" || arg == " --output" ) {
274365 if (++i >= argc) {
275366 invalid_arg = true ;
@@ -312,6 +403,12 @@ void parse_args(int argc, const char** argv, SDParams& params) {
312403 break ;
313404 }
314405 params.style_ratio = std::stof (argv[i]);
406+ } else if (arg == " --control-strength" ) {
407+ if (++i >= argc) {
408+ invalid_arg = true ;
409+ break ;
410+ }
411+ params.control_strength = std::stof (argv[i]);
315412 } else if (arg == " -H" || arg == " --height" ) {
316413 if (++i >= argc) {
317414 invalid_arg = true ;
@@ -338,12 +435,16 @@ void parse_args(int argc, const char** argv, SDParams& params) {
338435 params.clip_skip = std::stoi (argv[i]);
339436 } else if (arg == " --vae-tiling" ) {
340437 params.vae_tiling = true ;
438+ } else if (arg == " --control-net-cpu" ) {
439+ params.control_net_cpu = true ;
341440 } else if (arg == " --normalize-input" ) {
342441 params.normalize_input = true ;
343442 } else if (arg == " --clip-on-cpu" ) {
344443 params.clip_on_cpu = true ; // will slow down get_learned_condiotion but necessary for low MEM GPUs
345444 } else if (arg == " --vae-on-cpu" ) {
346445 params.vae_on_cpu = true ; // will slow down latent decoding but necessary for low MEM GPUs
446+ } else if (arg == " --diffusion-fa" ) {
447+ params.diffusion_flash_attn = true ; // can reduce MEM significantly
347448 } else if (arg == " -b" || arg == " --batch-count" ) {
348449 if (++i >= argc) {
349450 invalid_arg = true ;
@@ -411,6 +512,61 @@ void parse_args(int argc, const char** argv, SDParams& params) {
411512 params.verbose = true ;
412513 } else if (arg == " --color" ) {
413514 params.color = true ;
515+ } else if (arg == " --slg-scale" ) {
516+ if (++i >= argc) {
517+ invalid_arg = true ;
518+ break ;
519+ }
520+ params.slg_scale = std::stof (argv[i]);
521+ } else if (arg == " --skip-layers" ) {
522+ if (++i >= argc) {
523+ invalid_arg = true ;
524+ break ;
525+ }
526+ if (argv[i][0 ] != ' [' ) {
527+ invalid_arg = true ;
528+ break ;
529+ }
530+ std::string layers_str = argv[i];
531+ while (layers_str.back () != ' ]' ) {
532+ if (++i >= argc) {
533+ invalid_arg = true ;
534+ break ;
535+ }
536+ layers_str += " " + std::string (argv[i]);
537+ }
538+ layers_str = layers_str.substr (1 , layers_str.size () - 2 );
539+
540+ std::regex regex (" [, ]+" );
541+ std::sregex_token_iterator iter (layers_str.begin (), layers_str.end (), regex, -1 );
542+ std::sregex_token_iterator end;
543+ std::vector<std::string> tokens (iter, end);
544+ std::vector<int > layers;
545+ for (const auto & token : tokens) {
546+ try {
547+ layers.push_back (std::stoi (token));
548+ } catch (const std::invalid_argument& e) {
549+ invalid_arg = true ;
550+ break ;
551+ }
552+ }
553+ params.skip_layers = layers;
554+
555+ if (invalid_arg) {
556+ break ;
557+ }
558+ } else if (arg == " --skip-layer-start" ) {
559+ if (++i >= argc) {
560+ invalid_arg = true ;
561+ break ;
562+ }
563+ params.skip_layer_start = std::stof (argv[i]);
564+ } else if (arg == " --skip-layer-end" ) {
565+ if (++i >= argc) {
566+ invalid_arg = true ;
567+ break ;
568+ }
569+ params.skip_layer_end = std::stof (argv[i]);
414570 } else if (arg == " --port" ) {
415571 if (++i >= argc) {
416572 invalid_arg = true ;
@@ -716,11 +872,12 @@ int main(int argc, const char* argv[]) {
716872
717873 sd_ctx_t * sd_ctx = new_sd_ctx (params.model_path .c_str (),
718874 params.clip_l_path .c_str (),
875+ params.clip_g_path .c_str (),
719876 params.t5xxl_path .c_str (),
720877 params.diffusion_model_path .c_str (),
721878 params.vae_path .c_str (),
722- " " ,
723- " " ,
879+ params. taesd_path . c_str () ,
880+ params. controlnet_path . c_str () ,
724881 params.lora_model_dir .c_str (),
725882 params.embeddings_path .c_str (),
726883 params.stacked_id_embeddings_path .c_str (),
@@ -732,8 +889,9 @@ int main(int argc, const char* argv[]) {
732889 params.rng_type ,
733890 params.schedule ,
734891 params.clip_on_cpu ,
735- true ,
736- params.vae_on_cpu );
892+ params.control_net_cpu ,
893+ params.vae_on_cpu ,
894+ params.diffusion_flash_attn );
737895
738896 if (sd_ctx == NULL ) {
739897 printf (" new_sd_ctx_t failed\n " );
@@ -787,7 +945,12 @@ int main(int argc, const char* argv[]) {
787945 1 ,
788946 params.style_ratio ,
789947 params.normalize_input ,
790- params.input_id_images_path .c_str ());
948+ params.input_id_images_path .c_str (),
949+ params.skip_layers .data (),
950+ params.skip_layers .size (),
951+ params.slg_scale ,
952+ params.skip_layer_start ,
953+ params.skip_layer_end );
791954
792955 if (results == NULL ) {
793956 printf (" generate failed\n " );
0 commit comments