@@ -95,7 +95,7 @@ class StableDiffusionGGML {
     std::shared_ptr<DiffusionModel> diffusion_model;
     std::shared_ptr<AutoEncoderKL> first_stage_model;
     std::shared_ptr<TinyAutoEncoder> tae_first_stage;
-    std::shared_ptr<ControlNet> control_net;
+    std::shared_ptr<ControlNet> control_net = NULL;
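+    // may stay NULL: with Flex.2 the control image can be folded into the
+    // concat latent instead of going through a ControlNet (see generate_image_internal)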
     std::shared_ptr<PhotoMakerIDEncoder> pmid_model;
     std::shared_ptr<LoraModel> pmid_lora;
     std::shared_ptr<PhotoMakerIDEmbed> pmid_id_embeds;
@@ -297,6 +297,11 @@ class StableDiffusionGGML {
             // TODO: shift_factor
         }
 
+        if (version == VERSION_FLEX_2) {
+            // Might need vae encode for control cond
+            vae_decode_only = false;
+        }
+
         bool clip_on_cpu = sd_ctx_params->keep_clip_on_cpu;
 
         if (version == VERSION_SVD) {
@@ -933,7 +938,7 @@ class StableDiffusionGGML {
 
             std::vector<struct ggml_tensor*> controls;
 
-            if (control_hint != NULL) {
+            if (control_hint != NULL && control_net != NULL) {
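+                // a hint without a loaded ControlNet is now possible (e.g. Flex.2,
+                // which consumes the control image via the concat latent), so guard the call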
                 control_net->compute(n_threads, noised_input, control_hint, timesteps, cond.c_crossattn, cond.c_vector);
                 controls = control_net->controls;
                 // print_ggml_tensor(controls[12]);
@@ -972,7 +977,7 @@ class StableDiffusionGGML {
             float* negative_data = NULL;
             if (has_unconditioned) {
                 // uncond
-                if (control_hint != NULL) {
+                if (control_hint != NULL && control_net != NULL) {
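+                    // same ControlNet guard as the conditional pass above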
                     control_net->compute(n_threads, noised_input, control_hint, timesteps, uncond.c_crossattn, uncond.c_vector);
                     controls = control_net->controls;
                 }
@@ -1721,6 +1726,8 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx,
         int64_t mask_channels = 1;
         if (sd_ctx->sd->version == VERSION_FLUX_FILL) {
             mask_channels = 8 * 8;  // flatten the whole mask
+        } else if (sd_ctx->sd->version == VERSION_FLEX_2) {
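+            // Flex.2 concat layout: image latent, then 1 mask channel,
+            // then a control-latent block as wide as the image latent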
+            mask_channels = 1 + init_latent->ne[2];
         }
         auto empty_latent = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, init_latent->ne[0], init_latent->ne[1], mask_channels + init_latent->ne[2], 1);
         // no mask, set the whole image as masked
@@ -1734,6 +1741,11 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx,
                     for (int64_t c = init_latent->ne[2]; c < empty_latent->ne[2]; c++) {
                         ggml_tensor_set_f32(empty_latent, 1, x, y, c);
                     }
+                } else if (sd_ctx->sd->version == VERSION_FLEX_2) {
+                    for (int64_t c = 0; c < empty_latent->ne[2]; c++) {
+                        // 0x16,1x1,0x16: zero the latent channels, set the single mask
+                        // channel (index init_latent->ne[2]) to 1, zero the control channels
+                        ggml_tensor_set_f32(empty_latent, c == init_latent->ne[2], x, y, c);
+                    }
                 } else {
                     ggml_tensor_set_f32(empty_latent, 1, x, y, 0);
                     for (int64_t c = 1; c < empty_latent->ne[2]; c++) {
@@ -1742,12 +1754,42 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx,
                 }
             }
         }
-        if (concat_latent == NULL) {
+        if (sd_ctx->sd->version == VERSION_FLEX_2 && image_hint != NULL && sd_ctx->sd->control_net == NULL) {
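+            // no ControlNet loaded: inject the VAE-encoded control image
+            // directly into the trailing channels of the concat latent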
+            bool no_inpaint = concat_latent == NULL;
+            if (no_inpaint) {
+                concat_latent = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, init_latent->ne[0], init_latent->ne[1], mask_channels + init_latent->ne[2], 1);
+            }
+            // fill in the control image here
+            struct ggml_tensor* control_latents = NULL;
+            if (!sd_ctx->sd->use_tiny_autoencoder) {
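+                // the full VAE encoder returns distribution moments; sample them to get latents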
+                struct ggml_tensor* control_moments = sd_ctx->sd->encode_first_stage(work_ctx, image_hint);
+                control_latents = sd_ctx->sd->get_first_stage_encoding(work_ctx, control_moments);
+            } else {
+                control_latents = sd_ctx->sd->encode_first_stage(work_ctx, image_hint);
+            }
+            for (int64_t x = 0; x < concat_latent->ne[0]; x++) {
+                for (int64_t y = 0; y < concat_latent->ne[1]; y++) {
+                    if (no_inpaint) {
+                        for (int64_t c = 0; c < concat_latent->ne[2] - control_latents->ne[2]; c++) {
+                            // 0x16,1x1,0x16: zeros for the latent channels, 1 for the mask channel
+                            ggml_tensor_set_f32(concat_latent, c == init_latent->ne[2], x, y, c);
+                        }
+                    }
+                    for (int64_t c = 0; c < control_latents->ne[2]; c++) {
+                        float v = ggml_tensor_get_f32(control_latents, x, y, c);
+                        ggml_tensor_set_f32(concat_latent, v, x, y, concat_latent->ne[2] - control_latents->ne[2] + c);
+                    }
+                }
+            }
+            // disable the ControlNet path: the hint has been consumed above
+            image_hint = NULL;
+        } else if (concat_latent == NULL) {
             concat_latent = empty_latent;
         }
         cond.c_concat = concat_latent;
         uncond.c_concat = empty_latent;
         denoise_mask = NULL;
     } else if (sd_version_is_unet_edit(sd_ctx->sd->version)) {
         auto empty_latent = ggml_dup_tensor(work_ctx, init_latent);
         ggml_set_f32(empty_latent, 0);
@@ -1935,10 +1977,19 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g
         sd_mask_to_tensor(sd_img_gen_params->mask_image.data, mask_img);
         sd_image_to_tensor(sd_img_gen_params->init_image.data, init_img);
 
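+        // encode the init image up front: the inpaint branch below now needs
+        // init_latent to size the Flex.2 mask/control channels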
+        if (!sd_ctx->sd->use_tiny_autoencoder) {
+            ggml_tensor* moments = sd_ctx->sd->encode_first_stage(work_ctx, init_img);
+            init_latent = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments);
+        } else {
+            init_latent = sd_ctx->sd->encode_first_stage(work_ctx, init_img);
+        }
+
         if (sd_version_is_inpaint(sd_ctx->sd->version)) {
             int64_t mask_channels = 1;
             if (sd_ctx->sd->version == VERSION_FLUX_FILL) {
                 mask_channels = 8 * 8;  // flatten the whole mask
+            } else if (sd_ctx->sd->version == VERSION_FLEX_2) {
+                mask_channels = 1 + init_latent->ne[2];
             }
             ggml_tensor* masked_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1);
             sd_apply_mask(init_img, mask_img, masked_img);
@@ -1973,6 +2024,32 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g
                             ggml_tensor_set_f32(concat_latent, m, ix, iy, masked_latent->ne[2] + x * 8 + y);
                         }
                     }
+                } else if (sd_ctx->sd->version == VERSION_FLEX_2) {
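+                    // channels: [masked-image latent | mask | zeroed control block]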
+                    float m = ggml_tensor_get_f32(mask_img, mx, my);
+                    // masked image
+                    for (int k = 0; k < masked_latent->ne[2]; k++) {
+                        float v = ggml_tensor_get_f32(masked_latent, ix, iy, k);
+                        ggml_tensor_set_f32(concat_latent, v, ix, iy, k);
+                    }
+                    // downsampled mask
+                    ggml_tensor_set_f32(concat_latent, m, ix, iy, masked_latent->ne[2]);
+                    // control (todo: support this)
+                    for (int k = 0; k < masked_latent->ne[2]; k++) {
+                        ggml_tensor_set_f32(concat_latent, 0, ix, iy, masked_latent->ne[2] + 1 + k);
+                    }
                 } else {
                     float m = ggml_tensor_get_f32(mask_img, mx, my);
                     ggml_tensor_set_f32(concat_latent, m, ix, iy, 0);
@@ -1998,12 +2075,6 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g
             }
         }
 
-        if (!sd_ctx->sd->use_tiny_autoencoder) {
-            ggml_tensor* moments = sd_ctx->sd->encode_first_stage(work_ctx, init_img);
-            init_latent = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments);
-        } else {
-            init_latent = sd_ctx->sd->encode_first_stage(work_ctx, init_img);
-        }
     } else {
         LOG_INFO("TXT2IMG");
         if (sd_version_is_inpaint(sd_ctx->sd->version)) {