Commit c587a43

stduhpf and leejet authored

feat: support incrementing ref image index (omni-kontext) (#755)

* kontext: support ref images indices
* lora: support x_embedder
* update help message
* Support for negative indices
* support for OmniControl (offsets at index 0)
* c++11 compat
* add --increase-ref-index option
* simplify the logic and fix some issues
* update README.md
* remove unused variable

---------

Co-authored-by: leejet <leejet714@gmail.com>

1 parent f8fe4e7, commit c587a43 (Copy full SHA for c587a43)

File tree

8 files changed

+48

-12

lines changed

README.md
diffusion_model.hpp
examples/cli
- main.cpp
flux.hpp
lora.hpp
rope.hpp
stable-diffusion.cpp
stable-diffusion.h

8 files changed

+48

-12

lines changed

`‎README.md‎`

Lines changed: 1 addition & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -319,6 +319,7 @@ arguments:`
`319`	`319`	`-i, --end-img [IMAGE] path to the end image, required by flf2v`
`320`	`320`	`--control-image [IMAGE] path to image condition, control net`
`321`	`321`	`-r, --ref-image [PATH] reference image for Flux Kontext models (can be used multiple times)`
	`322`	`+ --increase-ref-index automatically increase the indices of references images based on the order they are listed (starting with 1).`
`322`	`323`	`-o, --output OUTPUT path to write result image to (default: ./output.png)`
`323`	`324`	`-p, --prompt [PROMPT] the prompt to render`
`324`	`325`	`-n, --negative-prompt PROMPT the negative prompt (default: "")`

`‎diffusion_model.hpp‎`

Lines changed: 6 additions & 1 deletion

Original file line number	Diff line number	Diff line change
`@@ -16,6 +16,7 @@ struct DiffusionModel {`
`16`	`16`	`struct ggml_tensor* y,`
`17`	`17`	`struct ggml_tensor* guidance,`
`18`	`18`	`std::vector<ggml_tensor*> ref_latents = {},`
	`19`	`+ bool increase_ref_index = false,`
`19`	`20`	`int num_video_frames = -1,`
`20`	`21`	`std::vector<struct ggml_tensor*> controls = {},`
`21`	`22`	`float control_strength = 0.f,`
`@@ -77,6 +78,7 @@ struct UNetModel : public DiffusionModel {`
`77`	`78`	`struct ggml_tensor* y,`
`78`	`79`	`struct ggml_tensor* guidance,`
`79`	`80`	`std::vector<ggml_tensor*> ref_latents = {},`
	`81`	`+ bool increase_ref_index = false,`
`80`	`82`	`int num_video_frames = -1,`
`81`	`83`	`std::vector<struct ggml_tensor*> controls = {},`
`82`	`84`	`float control_strength = 0.f,`
`@@ -133,6 +135,7 @@ struct MMDiTModel : public DiffusionModel {`
`133`	`135`	`struct ggml_tensor* y,`
`134`	`136`	`struct ggml_tensor* guidance,`
`135`	`137`	`std::vector<ggml_tensor*> ref_latents = {},`
	`138`	`+ bool increase_ref_index = false,`
`136`	`139`	`int num_video_frames = -1,`
`137`	`140`	`std::vector<struct ggml_tensor*> controls = {},`
`138`	`141`	`float control_strength = 0.f,`
`@@ -191,13 +194,14 @@ struct FluxModel : public DiffusionModel {`
`191`	`194`	`struct ggml_tensor* y,`
`192`	`195`	`struct ggml_tensor* guidance,`
`193`	`196`	`std::vector<ggml_tensor*> ref_latents = {},`
	`197`	`+ bool increase_ref_index = false,`
`194`	`198`	`int num_video_frames = -1,`
`195`	`199`	`std::vector<struct ggml_tensor*> controls = {},`
`196`	`200`	`float control_strength = 0.f,`
`197`	`201`	`struct ggml_tensor** output = NULL,`
`198`	`202`	`struct ggml_context* output_ctx = NULL,`
`199`	`203`	`std::vector<int> skip_layers = std::vector<int>()) {`
`200`		`- return flux.compute(n_threads, x, timesteps, context, c_concat, y, guidance, ref_latents, output, output_ctx, skip_layers);`
	`204`	`+ return flux.compute(n_threads, x, timesteps, context, c_concat, y, guidance, ref_latents, increase_ref_index, output, output_ctx, skip_layers);`
`201`	`205`	`}`
`202`	`206`	`};`
`203`	`207`
`@@ -250,6 +254,7 @@ struct WanModel : public DiffusionModel {`
`250`	`254`	`struct ggml_tensor* y,`
`251`	`255`	`struct ggml_tensor* guidance,`
`252`	`256`	`std::vector<ggml_tensor*> ref_latents = {},`
	`257`	`+ bool increase_ref_index = false,`
`253`	`258`	`int num_video_frames = -1,`
`254`	`259`	`std::vector<struct ggml_tensor*> controls = {},`
`255`	`260`	`float control_strength = 0.f,`

`‎examples/cli/main.cpp‎`

Lines changed: 5 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -74,6 +74,7 @@ struct SDParams {`
`74`	`74`	`std::string mask_image_path;`
`75`	`75`	`std::string control_image_path;`
`76`	`76`	`std::vector<std::string> ref_image_paths;`
	`77`	`+ bool increase_ref_index = false;`
`77`	`78`
`78`	`79`	`std::string prompt;`
`79`	`80`	`std::string negative_prompt;`
`@@ -156,6 +157,7 @@ void print_params(SDParams params) {`
`156`	`157`	`for (auto& path : params.ref_image_paths) {`
`157`	`158`	`printf(" %s\n", path.c_str());`
`158`	`159`	`};`
	`160`	`+ printf(" increase_ref_index: %s\n", params.increase_ref_index ? "true" : "false");`
`159`	`161`	`printf(" offload_params_to_cpu: %s\n", params.offload_params_to_cpu ? "true" : "false");`
`160`	`162`	`printf(" clip_on_cpu: %s\n", params.clip_on_cpu ? "true" : "false");`
`161`	`163`	`printf(" control_net_cpu: %s\n", params.control_net_cpu ? "true" : "false");`
`@@ -222,6 +224,7 @@ void print_usage(int argc, const char* argv[]) {`
`222`	`224`	`printf(" -i, --end-img [IMAGE] path to the end image, required by flf2v\n");`
`223`	`225`	`printf(" --control-image [IMAGE] path to image condition, control net\n");`
`224`	`226`	`printf(" -r, --ref-image [PATH] reference image for Flux Kontext models (can be used multiple times) \n");`
	`227`	`+ printf(" --increase-ref-index automatically increase the indices of references images based on the order they are listed (starting with 1).\n");`
`225`	`228`	`printf(" -o, --output OUTPUT path to write result image to (default: ./output.png)\n");`
`226`	`229`	`printf(" -p, --prompt [PROMPT] the prompt to render\n");`
`227`	`230`	`printf(" -n, --negative-prompt PROMPT the negative prompt (default: \"\")\n");`
`@@ -536,6 +539,7 @@ void parse_args(int argc, const char** argv, SDParams& params) {`
`536`	`539`	`{"", "--color", "", true, &params.color},`
`537`	`540`	`{"", "--chroma-disable-dit-mask", "", false, &params.chroma_use_dit_mask},`
`538`	`541`	`{"", "--chroma-enable-t5-mask", "", true, &params.chroma_use_t5_mask},`
	`542`	`+ {"", "--increase-ref-index", "", true, &params.increase_ref_index},`
`539`	`543`	`};`
`540`	`544`
`541`	`545`	`auto on_mode_arg = [&](int argc, const char** argv, int index) {`
`@@ -1207,6 +1211,7 @@ int main(int argc, const char* argv[]) {`
`1207`	`1211`	`init_image,`
`1208`	`1212`	`ref_images.data(),`
`1209`	`1213`	`(int)ref_images.size(),`
	`1214`	`+ params.increase_ref_index,`
`1210`	`1215`	`mask_image,`
`1211`	`1216`	`params.width,`
`1212`	`1217`	`params.height,`

`‎flux.hpp‎`

Lines changed: 5 additions & 2 deletions

Original file line number	Diff line number	Diff line change
`@@ -960,6 +960,7 @@ namespace Flux {`
`960`	`960`	`struct ggml_tensor* y,`
`961`	`961`	`struct ggml_tensor* guidance,`
`962`	`962`	`std::vector<ggml_tensor*> ref_latents = {},`
	`963`	`+ bool increase_ref_index = false,`
`963`	`964`	`std::vector<int> skip_layers = {}) {`
`964`	`965`	`GGML_ASSERT(x->ne[3] == 1);`
`965`	`966`	`struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, FLUX_GRAPH_SIZE, false);`
`@@ -999,6 +1000,7 @@ namespace Flux {`
`999`	`1000`	`x->ne[3],`
`1000`	`1001`	`context->ne[1],`
`1001`	`1002`	`ref_latents,`
	`1003`	`+ increase_ref_index,`
`1002`	`1004`	`flux_params.theta,`
`1003`	`1005`	`flux_params.axes_dim);`
`1004`	`1006`	`int pos_len = pe_vec.size() / flux_params.axes_dim_sum / 2;`
`@@ -1035,6 +1037,7 @@ namespace Flux {`
`1035`	`1037`	`struct ggml_tensor* y,`
`1036`	`1038`	`struct ggml_tensor* guidance,`
`1037`	`1039`	`std::vector<ggml_tensor*> ref_latents = {},`
	`1040`	`+ bool increase_ref_index = false,`
`1038`	`1041`	`struct ggml_tensor** output = NULL,`
`1039`	`1042`	`struct ggml_context* output_ctx = NULL,`
`1040`	`1043`	`std::vector<int> skip_layers = std::vector<int>()) {`
`@@ -1044,7 +1047,7 @@ namespace Flux {`
`1044`	`1047`	`// y: [N, adm_in_channels] or [1, adm_in_channels]`
`1045`	`1048`	`// guidance: [N, ]`
`1046`	`1049`	`auto get_graph = [&]() -> struct ggml_cgraph* {`
`1047`		`- return build_graph(x, timesteps, context, c_concat, y, guidance, ref_latents, skip_layers);`
	`1050`	`+ return build_graph(x, timesteps, context, c_concat, y, guidance, ref_latents, increase_ref_index, skip_layers);`
`1048`	`1051`	`};`
`1049`	`1052`
`1050`	`1053`	`GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);`
`@@ -1084,7 +1087,7 @@ namespace Flux {`
`1084`	`1087`	`struct ggml_tensor* out = NULL;`
`1085`	`1088`
`1086`	`1089`	`int t0 = ggml_time_ms();`
`1087`		`- compute(8, x, timesteps, context, NULL, y, guidance, {}, &out, work_ctx);`
	`1090`	`+ compute(8, x, timesteps, context, NULL, y, guidance, {}, false, &out, work_ctx);`
`1088`	`1091`	`int t1 = ggml_time_ms();`
`1089`	`1092`
`1090`	`1093`	`print_ggml_tensor(out);`

`‎lora.hpp‎`

Lines changed: 1 addition & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -58,6 +58,7 @@ struct LoraModel : public GGMLRunner {`
`58`	`58`	`{"x_block.attn.proj", "attn.to_out.0"},`
`59`	`59`	`{"x_block.attn2.proj", "attn2.to_out.0"},`
`60`	`60`	`// flux`
	`61`	`+ {"img_in", "x_embedder"},`
`61`	`62`	`// singlestream`
`62`	`63`	`{"linear2", "proj_out"},`
`63`	`64`	`{"modulation.lin", "norm.linear"},`

`‎rope.hpp‎`

Lines changed: 16 additions & 7 deletions

Original file line number	Diff line number	Diff line change
`@@ -156,25 +156,33 @@ struct Rope {`
`156`	`156`	`int patch_size,`
`157`	`157`	`int bs,`
`158`	`158`	`int context_len,`
`159`		`- std::vector<ggml_tensor*> ref_latents) {`
	`159`	`+ std::vector<ggml_tensor*> ref_latents,`
	`160`	`+ bool increase_ref_index) {`
`160`	`161`	`auto txt_ids = gen_txt_ids(bs, context_len);`
`161`	`162`	`auto img_ids = gen_img_ids(h, w, patch_size, bs);`
`162`	`163`
`163`	`164`	`auto ids = concat_ids(txt_ids, img_ids, bs);`
`164`	`165`	`uint64_t curr_h_offset = 0;`
`165`	`166`	`uint64_t curr_w_offset = 0;`
	`167`	`+ int index = 1;`
`166`	`168`	`for (ggml_tensor* ref : ref_latents) {`
`167`	`169`	`uint64_t h_offset = 0;`
`168`	`170`	`uint64_t w_offset = 0;`
`169`		`- if (ref->ne[1] + curr_h_offset > ref->ne[0] + curr_w_offset) {`
`170`		`- w_offset = curr_w_offset;`
`171`		`- } else {`
`172`		`- h_offset = curr_h_offset;`
	`171`	`+ if (!increase_ref_index) {`
	`172`	`+ if (ref->ne[1] + curr_h_offset > ref->ne[0] + curr_w_offset) {`
	`173`	`+ w_offset = curr_w_offset;`
	`174`	`+ } else {`
	`175`	`+ h_offset = curr_h_offset;`
	`176`	`+ }`
`173`	`177`	`}`
`174`	`178`
`175`		`- auto ref_ids = gen_img_ids(ref->ne[1], ref->ne[0], patch_size, bs, 1, h_offset, w_offset);`
	`179`	`+ auto ref_ids = gen_img_ids(ref->ne[1], ref->ne[0], patch_size, bs, index, h_offset, w_offset);`
`176`	`180`	`ids = concat_ids(ids, ref_ids, bs);`
`177`	`181`
	`182`	`+ if (increase_ref_index) {`
	`183`	`+ index++;`
	`184`	`+ }`
	`185`	`+`
`178`	`186`	`curr_h_offset = std::max(curr_h_offset, ref->ne[1] + h_offset);`
`179`	`187`	`curr_w_offset = std::max(curr_w_offset, ref->ne[0] + w_offset);`
`180`	`188`	`}`
`@@ -188,9 +196,10 @@ struct Rope {`
`188`	`196`	`int bs,`
`189`	`197`	`int context_len,`
`190`	`198`	`std::vector<ggml_tensor*> ref_latents,`
	`199`	`+ bool increase_ref_index,`
`191`	`200`	`int theta,`
`192`	`201`	`const std::vector<int>& axes_dim) {`
`193`		`- std::vector<std::vector<float>> ids = gen_flux_ids(h, w, patch_size, bs, context_len, ref_latents);`
	`202`	`+ std::vector<std::vector<float>> ids = gen_flux_ids(h, w, patch_size, bs, context_len, ref_latents, increase_ref_index);`
`194`	`203`	`return embed_nd(ids, bs, theta, axes_dim);`
`195`	`204`	`}`
`196`	`205`

`‎stable-diffusion.cpp‎`

Lines changed: 13 additions & 2 deletions

Original file line number	Diff line number	Diff line change
`@@ -775,7 +775,7 @@ class StableDiffusionGGML {`
`775`	`775`
`776`	`776`	`int64_t t0 = ggml_time_ms();`
`777`	`777`	`struct ggml_tensor* out = ggml_dup_tensor(work_ctx, x_t);`
`778`		`- diffusion_model->compute(n_threads, x_t, timesteps, c, concat, NULL, NULL, {}, -1, {}, 0.f, &out);`
	`778`	`+ diffusion_model->compute(n_threads, x_t, timesteps, c, concat, NULL, NULL, {}, false, -1, {}, 0.f, &out);`
`779`	`779`	`diffusion_model->free_compute_buffer();`
`780`	`780`
`781`	`781`	`double result = 0.f;`
`@@ -1032,6 +1032,7 @@ class StableDiffusionGGML {`
`1032`	`1032`	`int start_merge_step,`
`1033`	`1033`	`SDCondition id_cond,`
`1034`	`1034`	`std::vector<ggml_tensor*> ref_latents = {},`
	`1035`	`+ bool increase_ref_index = false,`
`1035`	`1036`	`ggml_tensor* denoise_mask = nullptr) {`
`1036`	`1037`	`std::vector<int> skip_layers(guidance.slg.layers, guidance.slg.layers + guidance.slg.layer_count);`
`1037`	`1038`
`@@ -1126,6 +1127,7 @@ class StableDiffusionGGML {`
`1126`	`1127`	`cond.c_vector,`
`1127`	`1128`	`guidance_tensor,`
`1128`	`1129`	`ref_latents,`
	`1130`	`+ increase_ref_index,`
`1129`	`1131`	`-1,`
`1130`	`1132`	`controls,`
`1131`	`1133`	`control_strength,`
`@@ -1139,6 +1141,7 @@ class StableDiffusionGGML {`
`1139`	`1141`	`id_cond.c_vector,`
`1140`	`1142`	`guidance_tensor,`
`1141`	`1143`	`ref_latents,`
	`1144`	`+ increase_ref_index,`
`1142`	`1145`	`-1,`
`1143`	`1146`	`controls,`
`1144`	`1147`	`control_strength,`
`@@ -1160,6 +1163,7 @@ class StableDiffusionGGML {`
`1160`	`1163`	`uncond.c_vector,`
`1161`	`1164`	`guidance_tensor,`
`1162`	`1165`	`ref_latents,`
	`1166`	`+ increase_ref_index,`
`1163`	`1167`	`-1,`
`1164`	`1168`	`controls,`
`1165`	`1169`	`control_strength,`
`@@ -1177,6 +1181,7 @@ class StableDiffusionGGML {`
`1177`	`1181`	`img_cond.c_vector,`
`1178`	`1182`	`guidance_tensor,`
`1179`	`1183`	`ref_latents,`
	`1184`	`+ increase_ref_index,`
`1180`	`1185`	`-1,`
`1181`	`1186`	`controls,`
`1182`	`1187`	`control_strength,`
`@@ -1198,6 +1203,7 @@ class StableDiffusionGGML {`
`1198`	`1203`	`cond.c_vector,`
`1199`	`1204`	`guidance_tensor,`
`1200`	`1205`	`ref_latents,`
	`1206`	`+ increase_ref_index,`
`1201`	`1207`	`-1,`
`1202`	`1208`	`controls,`
`1203`	`1209`	`control_strength,`
`@@ -1710,6 +1716,7 @@ char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_params) {`
`1710`	`1716`	`"\n"`
`1711`	`1717`	`"batch_count: %d\n"`
`1712`	`1718`	`"ref_images_count: %d\n"`
	`1719`	`+ "increase_ref_index: %s\n"`
`1713`	`1720`	`"control_strength: %.2f\n"`
`1714`	`1721`	`"style_strength: %.2f\n"`
`1715`	`1722`	`"normalize_input: %s\n"`
`@@ -1724,6 +1731,7 @@ char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_params) {`
`1724`	`1731`	`sd_img_gen_params->seed,`
`1725`	`1732`	`sd_img_gen_params->batch_count,`
`1726`	`1733`	`sd_img_gen_params->ref_images_count,`
	`1734`	`+ BOOL_STR(sd_img_gen_params->increase_ref_index),`
`1727`	`1735`	`sd_img_gen_params->control_strength,`
`1728`	`1736`	`sd_img_gen_params->style_strength,`
`1729`	`1737`	`BOOL_STR(sd_img_gen_params->normalize_input),`
`@@ -1797,6 +1805,7 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx,`
`1797`	`1805`	`bool normalize_input,`
`1798`	`1806`	`std::string input_id_images_path,`
`1799`	`1807`	`std::vector<ggml_tensor*> ref_latents,`
	`1808`	`+ bool increase_ref_index,`
`1800`	`1809`	`ggml_tensor* concat_latent = NULL,`
`1801`	`1810`	`ggml_tensor* denoise_mask = NULL) {`
`1802`	`1811`	`if (seed < 0) {`
`@@ -2054,6 +2063,7 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx,`
`2054`	`2063`	`start_merge_step,`
`2055`	`2064`	`id_cond,`
`2056`	`2065`	`ref_latents,`
	`2066`	`+ increase_ref_index,`
`2057`	`2067`	`denoise_mask);`
`2058`	`2068`	`// print_ggml_tensor(x_0);`
`2059`	`2069`	`int64_t sampling_end = ggml_time_ms();`
`@@ -2304,7 +2314,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g`
`2304`	`2314`	`LOG_INFO("EDIT mode");`
`2305`	`2315`	`}`
`2306`	`2316`
`2307`		`- std::vector<struct ggml_tensor*> ref_latents;`
	`2317`	`+ std::vector<ggml_tensor*> ref_latents;`
`2308`	`2318`	`for (int i = 0; i < sd_img_gen_params->ref_images_count; i++) {`
`2309`	`2319`	`ggml_tensor* img = ggml_new_tensor_4d(work_ctx,`
`2310`	`2320`	`GGML_TYPE_F32,`
`@@ -2359,6 +2369,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g`
`2359`	`2369`	`sd_img_gen_params->normalize_input,`
`2360`	`2370`	`sd_img_gen_params->input_id_images_path,`
`2361`	`2371`	`ref_latents,`
	`2372`	`+ sd_img_gen_params->increase_ref_index,`
`2362`	`2373`	`concat_latent,`
`2363`	`2374`	`denoise_mask);`
`2364`	`2375`

`‎stable-diffusion.h‎`

Lines changed: 1 addition & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -182,6 +182,7 @@ typedef struct {`
`182`	`182`	`sd_image_t init_image;`
`183`	`183`	`sd_image_t* ref_images;`
`184`	`184`	`int ref_images_count;`
	`185`	`+ bool increase_ref_index;`
`185`	`186`	`sd_image_t mask_image;`
`186`	`187`	`int width;`
`187`	`188`	`int height;`

0 commit comments

Comments

(0)

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit c587a43

File tree

8 files changed

8 files changed

`‎README.md‎`

`‎diffusion_model.hpp‎`

`‎examples/cli/main.cpp‎`

`‎flux.hpp‎`

`‎lora.hpp‎`

`‎rope.hpp‎`

`‎stable-diffusion.cpp‎`

`‎stable-diffusion.h‎`

0 commit comments