
Commit f8fe4e7

fix: add flash attn support check (#803)

1 parent 1c07fb6 commit f8fe4e7

File tree: 11 files changed, +191 −120 lines
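The pattern repeated across these files is simple: a `ggml_backend_t backend` handle is threaded through every `forward()` on the path down to the attention helper, so that `ggml_nn_attention_ext` can check at graph-build time whether the device can actually execute the fused flash-attention op, instead of blindly trusting the caller's `flash_attn` flag. Below is a minimal sketch of such a capability probe; `ggml_flash_attn_ext` and `ggml_backend_supports_op` are stock ggml APIs, while the helper name and layout assumptions are illustrative, not the exact code this commit adds (presumably to `ggml_extend.hpp`, which is outside this excerpt):

```cpp
#include <math.h>

#include "ggml.h"
#include "ggml-backend.h"

// Build the fused GGML_OP_FLASH_ATTN_EXT node, then ask the backend whether
// it can execute it. q/k/v are assumed to already be in the
// [d_head, seq_len, n_head, N] layout that ggml_flash_attn_ext expects.
static bool backend_supports_flash_attn(ggml_backend_t backend,
                                        struct ggml_context* ctx,
                                        struct ggml_tensor* q,
                                        struct ggml_tensor* k,
                                        struct ggml_tensor* v) {
    const float scale = 1.0f / sqrtf((float)q->ne[0]);  // 1/sqrt(d_head)
    struct ggml_tensor* fa = ggml_flash_attn_ext(ctx, q, k, v, NULL, scale, 0.0f, 0.0f);
    return ggml_backend_supports_op(backend, fa);
}
```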

clip.hpp

17 additions, 9 deletions

```diff
@@ -488,14 +488,14 @@ struct CLIPLayer : public GGMLBlock {
         blocks["mlp"] = std::shared_ptr<GGMLBlock>(new CLIPMLP(d_model, intermediate_size));
     }
 
-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x, bool mask = true) {
+    struct ggml_tensor* forward(struct ggml_context* ctx, ggml_backend_t backend, struct ggml_tensor* x, bool mask = true) {
         // x: [N, n_token, d_model]
         auto self_attn   = std::dynamic_pointer_cast<MultiheadAttention>(blocks["self_attn"]);
         auto layer_norm1 = std::dynamic_pointer_cast<LayerNorm>(blocks["layer_norm1"]);
         auto layer_norm2 = std::dynamic_pointer_cast<LayerNorm>(blocks["layer_norm2"]);
         auto mlp         = std::dynamic_pointer_cast<CLIPMLP>(blocks["mlp"]);
 
-        x = ggml_add(ctx, x, self_attn->forward(ctx, layer_norm1->forward(ctx, x), mask));
+        x = ggml_add(ctx, x, self_attn->forward(ctx, backend, layer_norm1->forward(ctx, x), mask));
         x = ggml_add(ctx, x, mlp->forward(ctx, layer_norm2->forward(ctx, x)));
         return x;
     }
@@ -517,7 +517,11 @@ struct CLIPEncoder : public GGMLBlock {
         }
     }
 
-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x, int clip_skip = -1, bool mask = true) {
+    struct ggml_tensor* forward(struct ggml_context* ctx,
+                                ggml_backend_t backend,
+                                struct ggml_tensor* x,
+                                int clip_skip = -1,
+                                bool mask = true) {
         // x: [N, n_token, d_model]
         int layer_idx = n_layer - 1;
         // LOG_DEBUG("clip_skip %d", clip_skip);
@@ -532,7 +536,7 @@ struct CLIPEncoder : public GGMLBlock {
             }
             std::string name = "layers." + std::to_string(i);
             auto layer       = std::dynamic_pointer_cast<CLIPLayer>(blocks[name]);
-            x                = layer->forward(ctx, x, mask);  // [N, n_token, d_model]
+            x                = layer->forward(ctx, backend, x, mask);  // [N, n_token, d_model]
             // LOG_DEBUG("layer %d", i);
         }
         return x;
@@ -712,6 +716,7 @@ class CLIPTextModel : public GGMLBlock {
     }
 
     struct ggml_tensor* forward(struct ggml_context* ctx,
+                                ggml_backend_t backend,
                                 struct ggml_tensor* input_ids,
                                 struct ggml_tensor* tkn_embeddings,
                                 size_t max_token_idx = 0,
@@ -722,7 +727,7 @@ class CLIPTextModel : public GGMLBlock {
         auto final_layer_norm = std::dynamic_pointer_cast<LayerNorm>(blocks["final_layer_norm"]);
 
         auto x = embeddings->forward(ctx, input_ids, tkn_embeddings);  // [N, n_token, hidden_size]
-        x      = encoder->forward(ctx, x, return_pooled ? -1 : clip_skip, true);
+        x      = encoder->forward(ctx, backend, x, return_pooled ? -1 : clip_skip, true);
         if (return_pooled || with_final_ln) {
             x = final_layer_norm->forward(ctx, x);
         }
@@ -775,6 +780,7 @@ class CLIPVisionModel : public GGMLBlock {
     }
 
     struct ggml_tensor* forward(struct ggml_context* ctx,
+                                ggml_backend_t backend,
                                 struct ggml_tensor* pixel_values,
                                 bool return_pooled = true,
                                 int clip_skip = -1) {
@@ -786,7 +792,7 @@ class CLIPVisionModel : public GGMLBlock {
 
         auto x = embeddings->forward(ctx, pixel_values);  // [N, num_positions, embed_dim]
         x      = pre_layernorm->forward(ctx, x);
-        x      = encoder->forward(ctx, x, clip_skip, false);
+        x      = encoder->forward(ctx, backend, x, clip_skip, false);
         // print_ggml_tensor(x, true, "ClipVisionModel x: ");
         auto last_hidden_state = x;
         x                      = post_layernorm->forward(ctx, x);  // [N, n_token, hidden_size]
@@ -855,6 +861,7 @@ class CLIPVisionModelProjection : public GGMLBlock {
     }
 
     struct ggml_tensor* forward(struct ggml_context* ctx,
+                                ggml_backend_t backend,
                                 struct ggml_tensor* pixel_values,
                                 bool return_pooled = true,
                                 int clip_skip = -1) {
@@ -863,7 +870,7 @@ class CLIPVisionModelProjection : public GGMLBlock {
         auto vision_model      = std::dynamic_pointer_cast<CLIPVisionModel>(blocks["vision_model"]);
         auto visual_projection = std::dynamic_pointer_cast<CLIPProjection>(blocks["visual_projection"]);
 
-        auto x = vision_model->forward(ctx, pixel_values, return_pooled, clip_skip);  // [N, hidden_size] or [N, n_token, hidden_size]
+        auto x = vision_model->forward(ctx, backend, pixel_values, return_pooled, clip_skip);  // [N, hidden_size] or [N, n_token, hidden_size]
 
         if (return_pooled) {
             x = visual_projection->forward(ctx, x);  // [N, projection_dim]
@@ -900,6 +907,7 @@ struct CLIPTextModelRunner : public GGMLRunner {
     }
 
    struct ggml_tensor* forward(struct ggml_context* ctx,
+                                ggml_backend_t backend,
                                 struct ggml_tensor* input_ids,
                                 struct ggml_tensor* embeddings,
                                 size_t max_token_idx = 0,
@@ -911,7 +919,7 @@ struct CLIPTextModelRunner : public GGMLRunner {
             input_ids = ggml_reshape_2d(ctx, input_ids, model.n_token, input_ids->ne[0] / model.n_token);
         }
 
-        return model.forward(ctx, input_ids, embeddings, max_token_idx, return_pooled);
+        return model.forward(ctx, backend, input_ids, embeddings, max_token_idx, return_pooled);
     }
 
     struct ggml_cgraph* build_graph(struct ggml_tensor* input_ids,
@@ -937,7 +945,7 @@ struct CLIPTextModelRunner : public GGMLRunner {
             embeddings = ggml_concat(compute_ctx, token_embed_weight, custom_embeddings, 1);
         }
 
-        struct ggml_tensor* hidden_states = forward(compute_ctx, input_ids, embeddings, max_token_idx, return_pooled);
+        struct ggml_tensor* hidden_states = forward(compute_ctx, runtime_backend, input_ids, embeddings, max_token_idx, return_pooled);
 
         ggml_build_forward_expand(gf, hidden_states);
```
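A detail worth noting in the last hunk above: only `GGMLRunner` subclasses own a device handle (`runtime_backend`), so `build_graph` is where the concrete backend enters the call chain; every `GGMLBlock::forward` in between merely forwards it. A toy illustration of the idiom, with invented names (`ToyLayer`, `ToyRunner`) rather than anything from the repository:

```cpp
#include "ggml.h"
#include "ggml-backend.h"

struct ToyLayer {
    // blocks receive the backend as a parameter...
    struct ggml_tensor* forward(struct ggml_context* ctx,
                                ggml_backend_t backend,
                                struct ggml_tensor* x) {
        (void)backend;  // a real layer would pass it on to ggml_nn_attention_ext
        return x;
    }
};

struct ToyRunner {
    ggml_backend_t runtime_backend;  // ...while runners own it
    ToyLayer layer;

    struct ggml_tensor* build(struct ggml_context* ctx, struct ggml_tensor* x) {
        // inject the handle once, at graph-build time
        return layer.forward(ctx, runtime_backend, x);
    }
};
```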
common.hpp

16 additions, 7 deletions

```diff
@@ -270,7 +270,10 @@ class CrossAttention : public GGMLBlock {
         // to_out_1 is nn.Dropout(), skip for inference
     }
 
-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x, struct ggml_tensor* context) {
+    struct ggml_tensor* forward(struct ggml_context* ctx,
+                                ggml_backend_t backend,
+                                struct ggml_tensor* x,
+                                struct ggml_tensor* context) {
         // x: [N, n_token, query_dim]
         // context: [N, n_context, context_dim]
         // return: [N, n_token, query_dim]
@@ -288,7 +291,7 @@ class CrossAttention : public GGMLBlock {
         auto k = to_k->forward(ctx, context);  // [N, n_context, inner_dim]
         auto v = to_v->forward(ctx, context);  // [N, n_context, inner_dim]
 
-        x = ggml_nn_attention_ext(ctx, q, k, v, n_head, NULL, false, false, flash_attn);  // [N, n_token, inner_dim]
+        x = ggml_nn_attention_ext(ctx, backend, q, k, v, n_head, NULL, false, false, flash_attn);  // [N, n_token, inner_dim]
 
         x = to_out_0->forward(ctx, x);  // [N, n_token, query_dim]
         return x;
@@ -327,7 +330,10 @@ class BasicTransformerBlock : public GGMLBlock {
         }
     }
 
-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x, struct ggml_tensor* context) {
+    struct ggml_tensor* forward(struct ggml_context* ctx,
+                                ggml_backend_t backend,
+                                struct ggml_tensor* x,
+                                struct ggml_tensor* context) {
         // x: [N, n_token, query_dim]
         // context: [N, n_context, context_dim]
         // return: [N, n_token, query_dim]
@@ -352,11 +358,11 @@ class BasicTransformerBlock : public GGMLBlock {
 
         auto r = x;
         x      = norm1->forward(ctx, x);
-        x      = attn1->forward(ctx, x, x);  // self-attention
+        x      = attn1->forward(ctx, backend, x, x);  // self-attention
         x      = ggml_add(ctx, x, r);
         r      = x;
         x      = norm2->forward(ctx, x);
-        x      = attn2->forward(ctx, x, context);  // cross-attention
+        x      = attn2->forward(ctx, backend, x, context);  // cross-attention
         x      = ggml_add(ctx, x, r);
         r      = x;
         x      = norm3->forward(ctx, x);
@@ -401,7 +407,10 @@ class SpatialTransformer : public GGMLBlock {
         blocks["proj_out"] = std::shared_ptr<GGMLBlock>(new Conv2d(inner_dim, in_channels, {1, 1}));
     }
 
-    virtual struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x, struct ggml_tensor* context) {
+    virtual struct ggml_tensor* forward(struct ggml_context* ctx,
+                                        ggml_backend_t backend,
+                                        struct ggml_tensor* x,
+                                        struct ggml_tensor* context) {
         // x: [N, in_channels, h, w]
         // context: [N, max_position(aka n_token), hidden_size(aka context_dim)]
         auto norm = std::dynamic_pointer_cast<GroupNorm32>(blocks["norm"]);
@@ -424,7 +433,7 @@ class SpatialTransformer : public GGMLBlock {
             std::string name       = "transformer_blocks." + std::to_string(i);
             auto transformer_block = std::dynamic_pointer_cast<BasicTransformerBlock>(blocks[name]);
 
-            x = transformer_block->forward(ctx, x, context);
+            x = transformer_block->forward(ctx, backend, x, context);
         }
 
         x = ggml_cont(ctx, ggml_permute(ctx, x, 1, 0, 2, 3));  // [N, inner_dim, h * w]
```
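The `ggml_nn_attention_ext` call above is where the threaded handle pays off. Its implementation is not part of this excerpt, but the following hedged sketch shows the kind of dispatch the extra parameter enables: try the fused op, and degrade to naive attention when the backend rejects it. The function itself and the tensor layouts are assumptions; `ggml_mul_mat`, `ggml_soft_max_ext`, `ggml_flash_attn_ext`, and `ggml_backend_supports_op` are real ggml APIs:

```cpp
#include <math.h>

#include "ggml.h"
#include "ggml-backend.h"

// Assumed layouts: q [d_head, L_q, n_head, N], k/v [d_head, L_k, n_head, N].
static struct ggml_tensor* attention_with_fallback(struct ggml_context* ctx,
                                                   ggml_backend_t backend,
                                                   struct ggml_tensor* q,
                                                   struct ggml_tensor* k,
                                                   struct ggml_tensor* v,
                                                   bool flash_attn) {
    const float scale = 1.0f / sqrtf((float)q->ne[0]);  // 1/sqrt(d_head)

    if (flash_attn) {
        struct ggml_tensor* fa = ggml_flash_attn_ext(ctx, q, k, v, NULL, scale, 0.0f, 0.0f);
        if (ggml_backend_supports_op(backend, fa)) {
            return fa;  // fused path, [d_head, n_head, L_q, N]
        }
        // the backend cannot run the fused op: fall through to the naive path
    }

    struct ggml_tensor* kq = ggml_mul_mat(ctx, k, q);                            // [L_k, L_q, n_head, N]
    kq = ggml_soft_max_ext(ctx, kq, NULL, scale, 0.0f);                          // scaled softmax over keys
    struct ggml_tensor* vt  = ggml_cont(ctx, ggml_permute(ctx, v, 1, 0, 2, 3));  // [L_k, d_head, n_head, N]
    struct ggml_tensor* kqv = ggml_mul_mat(ctx, vt, kq);                         // [d_head, L_q, n_head, N]
    return ggml_cont(ctx, ggml_permute(ctx, kqv, 0, 2, 1, 3));                   // match the fused layout
}
```

The decision is thus made against the device that will actually run the graph, so a build with `flash_attn` enabled degrades gracefully on backends without flash-attention support.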

conditioner.hpp

1 addition, 1 deletion

```diff
@@ -639,7 +639,7 @@ struct FrozenCLIPVisionEmbedder : public GGMLRunner {
 
         pixel_values = to_backend(pixel_values);
 
-        struct ggml_tensor* hidden_states = vision_model.forward(compute_ctx, pixel_values, return_pooled, clip_skip);
+        struct ggml_tensor* hidden_states = vision_model.forward(compute_ctx, runtime_backend, pixel_values, return_pooled, clip_skip);
 
         ggml_build_forward_expand(gf, hidden_states);
 
```
control.hpp

8 additions, 5 deletions

```diff
@@ -174,10 +174,11 @@ class ControlNetBlock : public GGMLBlock {
 
     struct ggml_tensor* attention_layer_forward(std::string name,
                                                 struct ggml_context* ctx,
+                                                ggml_backend_t backend,
                                                 struct ggml_tensor* x,
                                                 struct ggml_tensor* context) {
         auto block = std::dynamic_pointer_cast<SpatialTransformer>(blocks[name]);
-        return block->forward(ctx, x, context);
+        return block->forward(ctx, backend, x, context);
     }
 
     struct ggml_tensor* input_hint_block_forward(struct ggml_context* ctx,
@@ -199,6 +200,7 @@ class ControlNetBlock : public GGMLBlock {
     }
 
     std::vector<struct ggml_tensor*> forward(struct ggml_context* ctx,
+                                             ggml_backend_t backend,
                                              struct ggml_tensor* x,
                                              struct ggml_tensor* hint,
                                              struct ggml_tensor* guided_hint,
@@ -272,7 +274,7 @@ class ControlNetBlock : public GGMLBlock {
             h = resblock_forward(name, ctx, h, emb);  // [N, mult*model_channels, h, w]
             if (std::find(attention_resolutions.begin(), attention_resolutions.end(), ds) != attention_resolutions.end()) {
                 std::string name = "input_blocks." + std::to_string(input_block_idx) + ".1";
-                h                = attention_layer_forward(name, ctx, h, context);  // [N, mult*model_channels, h, w]
+                h                = attention_layer_forward(name, ctx, backend, h, context);  // [N, mult*model_channels, h, w]
             }
 
             auto zero_conv = std::dynamic_pointer_cast<Conv2d>(blocks["zero_convs." + std::to_string(input_block_idx) + ".0"]);
@@ -296,9 +298,9 @@ class ControlNetBlock : public GGMLBlock {
         // [N, 4*model_channels, h/8, w/8]
 
         // middle_block
-        h = resblock_forward("middle_block.0", ctx, h, emb);              // [N, 4*model_channels, h/8, w/8]
-        h = attention_layer_forward("middle_block.1", ctx, h, context);   // [N, 4*model_channels, h/8, w/8]
-        h = resblock_forward("middle_block.2", ctx, h, emb);              // [N, 4*model_channels, h/8, w/8]
+        h = resblock_forward("middle_block.0", ctx, h, emb);                       // [N, 4*model_channels, h/8, w/8]
+        h = attention_layer_forward("middle_block.1", ctx, backend, h, context);   // [N, 4*model_channels, h/8, w/8]
+        h = resblock_forward("middle_block.2", ctx, h, emb);                       // [N, 4*model_channels, h/8, w/8]
 
         // out
         outs.push_back(middle_block_out->forward(ctx, h));
@@ -403,6 +405,7 @@ struct ControlNet : public GGMLRunner {
         timesteps = to_backend(timesteps);
 
         auto outs = control_net.forward(compute_ctx,
+                                        runtime_backend,
                                         x,
                                         hint,
                                         guided_hint_cached ? guided_hint : NULL,
```
0 commit comments