Commit d23facd

Merge pull request #2388 from laclouis5/fix-mqa-v2
Fix MQA V2
2 parents: 2d734d9 + 2d5277e

File tree: 2 files changed, +23 -9 lines

tests/test_layers.py

Lines changed: 18 additions & 4 deletions
@@ -2,7 +2,7 @@
 import torch
 import torch.nn as nn
 
-from timm.layers import create_act_layer, set_layer_config, get_act_layer, get_act_fn, Attention2d
+from timm.layers import create_act_layer, set_layer_config, get_act_layer, get_act_fn, Attention2d, MultiQueryAttentionV2
 
 import importlib
 import os
@@ -121,6 +121,23 @@ def test_get_act_fn_none():
     assert get_act_fn('') is None
 
 
+@pytest.mark.parametrize("dim", [128])
+@pytest.mark.parametrize("dim_out", [128, 256])
+@pytest.mark.parametrize("use_m", [True, False])
+def test_mqa_v2(dim, dim_out, use_m):
+    mqa = MultiQueryAttentionV2(dim, dim_out)
+
+    x = torch.randn(1, dim, 32, 48)
+    if use_m:
+        m = torch.randn(1, dim, 16, 24)
+    else:
+        m = None
+
+    y = mqa(x, m=m)
+
+    assert (y.shape) == (1, dim_out, 32, 48)
+
+
 @pytest.mark.parametrize("bias", [True, False])
 @pytest.mark.parametrize("expand_first", [True, False])
 @pytest.mark.parametrize("head_first", [True, False])
@@ -141,6 +158,3 @@ def test_attn2d(bias, expand_first, head_first, attn_mask):
     o2 = attn(x, mask)
 
     assert torch.allclose(o1, o2, atol=1e-5), f"{torch.abs(o1 - o2).max()}"
-
-
-
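
For context, here is a minimal usage sketch mirroring the new test above. The constructor call MultiQueryAttentionV2(dim, dim_out) and the m= keyword argument come directly from the test; the batch and spatial sizes are illustrative only:

import torch
from timm.layers import MultiQueryAttentionV2

mqa = MultiQueryAttentionV2(128, 256)  # dim=128 in, dim_out=256 out

# Self-attention path: when m is omitted, the layer falls back to m = x.
x = torch.randn(1, 128, 32, 48)
y = mqa(x)
assert y.shape == (1, 256, 32, 48)

# Cross-attention path: m may have a different spatial size than x;
# the output spatial size follows x.
m = torch.randn(1, 128, 16, 24)
y = mqa(x, m=m)
assert y.shape == (1, 256, 32, 48)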

timm/layers/attention2d.py

Lines changed: 5 additions & 5 deletions
@@ -59,24 +59,24 @@ def _reshape_input(self, t):
 
     def forward(self, x, m: Optional[torch.Tensor] = None):
         """Run layer computation."""
-        s = x.shape
-        m = m or x
+        b, _, h, w = x.shape
+        m = m if m is not None else x
 
         reshaped_x = self._reshape_input(x)
         reshaped_m = self._reshape_input(m)
 
         q = torch.einsum('bnd,hkd->bnhk', reshaped_x, self.query_proj)
         k = torch.einsum('bmd,dk->bmk', reshaped_m, self.key_proj)
 
-        attn = torch.einsum('bnhk,bmk->bnhm', q, k)
+        attn = torch.einsum('bnhk,bmk->bnhm', q, k) * self.scale
         attn = attn.softmax(dim=-1)
         attn = self.attn_drop(attn)
 
         v = torch.einsum('bmd,dv->bmv', reshaped_m, self.value_proj)
         o = torch.einsum('bnhm,bmv->bnhv', attn, v)
-        result = torch.einsum('bnhv,dhv->bnd', o, self.out_proj)
+        result = torch.einsum('bnhv,dhv->bdn', o, self.out_proj)
         result = self.proj_drop(result)
-        return result.reshape(s)
+        return result.reshape(b, -1, h, w)
 
 
 class MultiQueryAttention2d(nn.Module):
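
Taken together, the patch fixes three problems in MultiQueryAttentionV2.forward: m = m or x evaluated the truth value of a tensor whenever m was actually passed, which raises a RuntimeError for tensors with more than one element; the attention logits were never multiplied by self.scale; and the final einsum produced a (B, N, C) layout that reshape(s) then reinterpreted as (B, C, H, W), scrambling the output and implicitly assuming dim_out == dim. The sketch below replays the corrected einsum chain in isolation; it is an illustration, not the layer itself: the head and key/value sizes and the scale value are assumptions, the projection tensors are random stand-ins with shapes inferred from the einsum subscripts, and _reshape_input is taken to flatten NCHW to (B, N, C) as the 'bnd' subscripts imply.

import torch

# Assumed sizes for illustration; not the layer's actual defaults.
B, dim, dim_out, H, W = 1, 128, 256, 32, 48
num_heads, key_dim, value_dim = 8, 64, 64
scale = key_dim ** -0.5  # conventional scaling; assumed value of self.scale

# Random stand-ins for the layer's projection parameters; shapes are
# inferred from the einsum subscripts in the diff above.
query_proj = torch.randn(num_heads, key_dim, dim)      # 'hkd'
key_proj = torch.randn(dim, key_dim)                   # 'dk'
value_proj = torch.randn(dim, value_dim)               # 'dv'
out_proj = torch.randn(dim_out, num_heads, value_dim)  # 'dhv'

x = torch.randn(B, dim, H, W)
m = x  # self-attention case (the "m is None" branch)

# Flatten NCHW to (B, N, C) with N = H*W, mirroring _reshape_input.
rx = x.reshape(B, dim, -1).transpose(1, 2)
rm = m.reshape(B, dim, -1).transpose(1, 2)

q = torch.einsum('bnd,hkd->bnhk', rx, query_proj)
k = torch.einsum('bmd,dk->bmk', rm, key_proj)
attn = (torch.einsum('bnhk,bmk->bnhm', q, k) * scale).softmax(dim=-1)
v = torch.einsum('bmd,dv->bmv', rm, value_proj)
o = torch.einsum('bnhm,bmv->bnhv', attn, v)

# Fixed output path: 'bdn' keeps channels ahead of the flattened
# positions, so the reshape restores (B, dim_out, H, W) correctly.
out = torch.einsum('bnhv,dhv->bdn', o, out_proj).reshape(B, -1, H, W)
assert out.shape == (B, dim_out, H, W)

With the old 'bnhv,dhv->bnd' subscripts, the reshape would fail outright whenever dim_out != dim and silently mix channels with positions when they matched, which is what the shape assertion in the new test_mqa_v2 guards against.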
