Commit 250f5cb

Authored by lauri9, sayakpaul, and github-actions[bot]

Add AITER attention backend (#12549)

* add aiter attention backend
* Apply style fixes

Co-authored-by: Sayak Paul <spsayakpaul@gmail.com>
Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>

1 parent dc6bd15 · commit 250f5cb

File tree

5 files changed (+98, -0 lines)

docs/source/en/optimization/attention_backends.md

Lines changed: 2 additions & 0 deletions

@@ -21,6 +21,7 @@ Refer to the table below for an overview of the available attention families and
 | attention family | main feature |
 |---|---|
 | FlashAttention | minimizes memory reads/writes through tiling and recomputation |
+| AI Tensor Engine for ROCm | FlashAttention implementation optimized for AMD ROCm accelerators |
 | SageAttention | quantizes attention to int8 |
 | PyTorch native | built-in PyTorch implementation using [scaled_dot_product_attention](./fp16#scaled-dot-product-attention) |
 | xFormers | memory-efficient attention with support for various attention kernels |

@@ -139,6 +140,7 @@ Refer to the table below for a complete list of available attention backends and
 | `_native_xla` | [PyTorch native](https://docs.pytorch.org/docs/stable/generated/torch.nn.attention.SDPBackend.html#torch.nn.attention.SDPBackend) | XLA-optimized attention |
 | `flash` | [FlashAttention](https://github.com/Dao-AILab/flash-attention) | FlashAttention-2 |
 | `flash_varlen` | [FlashAttention](https://github.com/Dao-AILab/flash-attention) | Variable length FlashAttention |
+| `aiter` | [AI Tensor Engine for ROCm](https://github.com/ROCm/aiter) | FlashAttention for AMD ROCm |
 | `_flash_3` | [FlashAttention](https://github.com/Dao-AILab/flash-attention) | FlashAttention-3 |
 | `_flash_varlen_3` | [FlashAttention](https://github.com/Dao-AILab/flash-attention) | Variable length FlashAttention-3 |
 | `_flash_3_hub` | [FlashAttention](https://github.com/Dao-AILab/flash-attention) | FlashAttention-3 from kernels |
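
Note: as a usage illustration of the new `aiter` table entry above, the sketch below opts a model into the backend. It assumes `set_attention_backend` is the model-level helper diffusers exposes for backend selection (as described elsewhere on this documentation page), an AMD ROCm GPU with aiter >= 0.1.5 installed, and an illustrative Flux checkpoint; none of these specifics come from this commit.

# Hedged usage sketch: route a model's attention through the new "aiter" backend.
# ROCm devices are exposed through the "cuda" device string in PyTorch.
import torch
from diffusers import FluxPipeline

pipe = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16)
pipe.to("cuda")

# Backend name taken from the table above; helper assumed per the surrounding docs.
pipe.transformer.set_attention_backend("aiter")

image = pipe("a photo of an astronaut riding a horse on the moon").images[0]
image.save("flux_aiter.png")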

src/diffusers/models/attention_dispatch.py

Lines changed: 60 additions & 0 deletions

@@ -27,6 +27,8 @@

 from ..utils import (
     get_logger,
+    is_aiter_available,
+    is_aiter_version,
     is_flash_attn_3_available,
     is_flash_attn_available,
     is_flash_attn_version,

@@ -47,13 +49,15 @@
     from ._modeling_parallel import ParallelConfig

 _REQUIRED_FLASH_VERSION = "2.6.3"
+_REQUIRED_AITER_VERSION = "0.1.5"
 _REQUIRED_SAGE_VERSION = "2.1.1"
 _REQUIRED_FLEX_VERSION = "2.5.0"
 _REQUIRED_XLA_VERSION = "2.2"
 _REQUIRED_XFORMERS_VERSION = "0.0.29"

 _CAN_USE_FLASH_ATTN = is_flash_attn_available() and is_flash_attn_version(">=", _REQUIRED_FLASH_VERSION)
 _CAN_USE_FLASH_ATTN_3 = is_flash_attn_3_available()
+_CAN_USE_AITER_ATTN = is_aiter_available() and is_aiter_version(">=", _REQUIRED_AITER_VERSION)
 _CAN_USE_SAGE_ATTN = is_sageattention_available() and is_sageattention_version(">=", _REQUIRED_SAGE_VERSION)
 _CAN_USE_FLEX_ATTN = is_torch_version(">=", _REQUIRED_FLEX_VERSION)
 _CAN_USE_NPU_ATTN = is_torch_npu_available()

@@ -78,6 +82,12 @@
     flash_attn_3_func = None
     flash_attn_3_varlen_func = None

+
+if _CAN_USE_AITER_ATTN:
+    from aiter import flash_attn_func as aiter_flash_attn_func
+else:
+    aiter_flash_attn_func = None
+
 if DIFFUSERS_ENABLE_HUB_KERNELS:
     if not is_kernels_available():
         raise ImportError(

@@ -178,6 +188,9 @@ class AttentionBackendName(str, Enum):
     _FLASH_3_HUB = "_flash_3_hub"
     # _FLASH_VARLEN_3_HUB = "_flash_varlen_3_hub" # not supported yet.

+    # `aiter`
+    AITER = "aiter"
+
     # PyTorch native
     FLEX = "flex"
     NATIVE = "native"

@@ -414,6 +427,12 @@ def _check_attention_backend_requirements(backend: AttentionBackendName) -> None
                f"Flash Attention 3 Hub backend '{backend.value}' is not usable because the `kernels` package isn't available. Please install it with `pip install kernels`."
            )

+    elif backend == AttentionBackendName.AITER:
+        if not _CAN_USE_AITER_ATTN:
+            raise RuntimeError(
+                f"Aiter Attention backend '{backend.value}' is not usable because of missing package or the version is too old. Please install `aiter>={_REQUIRED_AITER_VERSION}`."
+            )
+
    elif backend in [
        AttentionBackendName.SAGE,
        AttentionBackendName.SAGE_VARLEN,

@@ -1397,6 +1416,47 @@ def _flash_varlen_attention_3(
     return (out, lse) if return_lse else out


+@_AttentionBackendRegistry.register(
+    AttentionBackendName.AITER,
+    constraints=[_check_device_cuda, _check_qkv_dtype_bf16_or_fp16, _check_shape],
+)
+def _aiter_flash_attention(
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    dropout_p: float = 0.0,
+    is_causal: bool = False,
+    scale: Optional[float] = None,
+    return_lse: bool = False,
+    _parallel_config: Optional["ParallelConfig"] = None,
+) -> torch.Tensor:
+    if not return_lse and torch.is_grad_enabled():
+        # aiter requires return_lse=True by assertion when gradients are enabled.
+        out, lse, *_ = aiter_flash_attn_func(
+            q=query,
+            k=key,
+            v=value,
+            dropout_p=dropout_p,
+            softmax_scale=scale,
+            causal=is_causal,
+            return_lse=True,
+        )
+    else:
+        out = aiter_flash_attn_func(
+            q=query,
+            k=key,
+            v=value,
+            dropout_p=dropout_p,
+            softmax_scale=scale,
+            causal=is_causal,
+            return_lse=return_lse,
+        )
+        if return_lse:
+            out, lse, *_ = out
+
+    return (out, lse) if return_lse else out
+
+
 @_AttentionBackendRegistry.register(
     AttentionBackendName.FLEX,
     constraints=[_check_attn_mask_or_causal, _check_device, _check_shape],
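
Note: the new `_aiter_flash_attention` wrapper always requests the log-sum-exp when autograd is active, because `aiter.flash_attn_func` asserts `return_lse=True` in that case. The standalone sketch below mirrors that calling convention; the keyword names come from the diff, while the (batch, seq_len, heads, head_dim) layout and tensor sizes are illustrative assumptions, and a ROCm device with aiter >= 0.1.5 is required.

# Hedged sketch of the aiter calling convention the backend above wraps.
import torch
from aiter import flash_attn_func as aiter_flash_attn_func

q = torch.randn(2, 1024, 8, 64, device="cuda", dtype=torch.bfloat16, requires_grad=True)
k = torch.randn_like(q)
v = torch.randn_like(q)

if torch.is_grad_enabled():
    # aiter asserts return_lse=True when gradients are enabled, so request the
    # log-sum-exp and drop it if the caller did not ask for it.
    out, lse, *_ = aiter_flash_attn_func(
        q=q, k=k, v=v, dropout_p=0.0, softmax_scale=None, causal=False, return_lse=True
    )
else:
    out = aiter_flash_attn_func(
        q=q, k=k, v=v, dropout_p=0.0, softmax_scale=None, causal=False, return_lse=False
    )

print(out.shape)  # output keeps the layout of the query tensor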

src/diffusers/utils/__init__.py

Lines changed: 2 additions & 0 deletions

@@ -64,6 +64,8 @@
     get_objects_from_module,
     is_accelerate_available,
     is_accelerate_version,
+    is_aiter_available,
+    is_aiter_version,
     is_better_profanity_available,
     is_bitsandbytes_available,
     is_bitsandbytes_version,

src/diffusers/utils/import_utils.py

Lines changed: 21 additions & 0 deletions

@@ -226,6 +226,7 @@ def _is_package_available(pkg_name: str, get_dist_name: bool = False) -> Tuple[b
 _sageattention_available, _sageattention_version = _is_package_available("sageattention")
 _flash_attn_available, _flash_attn_version = _is_package_available("flash_attn")
 _flash_attn_3_available, _flash_attn_3_version = _is_package_available("flash_attn_3")
+_aiter_available, _aiter_version = _is_package_available("aiter")
 _kornia_available, _kornia_version = _is_package_available("kornia")
 _nvidia_modelopt_available, _nvidia_modelopt_version = _is_package_available("modelopt", get_dist_name=True)

@@ -406,6 +407,10 @@ def is_flash_attn_3_available():
     return _flash_attn_3_available


+def is_aiter_available():
+    return _aiter_available
+
+
 def is_kornia_available():
     return _kornia_available

@@ -911,6 +916,22 @@ def is_flash_attn_version(operation: str, version: str):
     return compare_versions(parse(_flash_attn_version), operation, version)


+@cache
+def is_aiter_version(operation: str, version: str):
+    """
+    Compares the current aiter version to a given reference with an operation.
+
+    Args:
+        operation (`str`):
+            A string representation of an operator, such as `">"` or `"<="`
+        version (`str`):
+            A version string
+    """
+    if not _aiter_available:
+        return False
+    return compare_versions(parse(_aiter_version), operation, version)
+
+
 def get_objects_from_module(module):
     """
     Returns a dict of object names and values in a module, while skipping private/internal objects
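
Note: the two helpers added here follow the same availability/version gate used for the other optional attention packages. A minimal sketch of how a caller combines them, mirroring the attention_dispatch.py hunk further up (no API beyond this diff is assumed):

# Minimal sketch of the guard pattern the new helpers enable.
from diffusers.utils import is_aiter_available, is_aiter_version

_REQUIRED_AITER_VERSION = "0.1.5"

_CAN_USE_AITER_ATTN = is_aiter_available() and is_aiter_version(">=", _REQUIRED_AITER_VERSION)

if _CAN_USE_AITER_ATTN:
    from aiter import flash_attn_func as aiter_flash_attn_func
else:
    # Keep the symbol defined so dispatch code can raise a clear error later.
    aiter_flash_attn_func = None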

tests/others/test_attention_backends.py

Lines changed: 13 additions & 0 deletions

@@ -14,6 +14,10 @@

 Tests were conducted on an H100 with PyTorch 2.8.0 (CUDA 12.9). Slices for the compilation tests in
 "native" variants were obtained with a torch nightly version (2.10.0.dev20250924+cu128).
+
+Tests for aiter backend were conducted and slices for the aiter backend tests collected on a MI355X
+with torch 2025-09-25 nightly version (ad2f7315ca66b42497047bb7951f696b50f1e81b) and
+aiter 0.1.5.post4.dev20+ga25e55e79.
 """

 import os

@@ -44,6 +48,10 @@
         "_native_cudnn",
         torch.tensor([0.0781, 0.0840, 0.0879, 0.0957, 0.0898, 0.0957, 0.0957, 0.0977, 0.2168, 0.2246, 0.2324, 0.2500, 0.2539, 0.2480, 0.2441, 0.2695], dtype=torch.bfloat16),
     ),
+    (
+        "aiter",
+        torch.tensor([0.0781, 0.0820, 0.0879, 0.0957, 0.0898, 0.0938, 0.0957, 0.0957, 0.2285, 0.2363, 0.2461, 0.2637, 0.2695, 0.2617, 0.2617, 0.2891], dtype=torch.bfloat16),
+    )
 ]

 COMPILE_CASES = [

@@ -63,6 +71,11 @@
         torch.tensor([0.0410, 0.0410, 0.0430, 0.0508, 0.0488, 0.0586, 0.0605, 0.0586, 0.2344, 0.2461, 0.2578, 0.2773, 0.2871, 0.2832, 0.2793, 0.3086], dtype=torch.bfloat16),
         True,
     ),
+    (
+        "aiter",
+        torch.tensor([0.0391, 0.0391, 0.0430, 0.0488, 0.0469, 0.0566, 0.0586, 0.0566, 0.2402, 0.2539, 0.2637, 0.2812, 0.2930, 0.2910, 0.2891, 0.3164], dtype=torch.bfloat16),
+        True,
+    )
 ]
 # fmt: on
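
Note: each new case pairs a backend name with a 16-element bf16 slice of expected output recorded on MI355X hardware. A hedged sketch of how such a case might be consumed is below; the `run_forward` helper and the first-8/last-8 slicing convention are hypothetical stand-ins for the harness in this test file, not part of the diff.

# Hedged sketch of checking a (backend_name, expected_slice) case.
import torch


def check_forward_case(model, backend_name: str, expected_slice: torch.Tensor, run_forward) -> None:
    model.set_attention_backend(backend_name)
    output = run_forward(model)  # hypothetical helper returning the denoiser output tensor
    flat = output.flatten()
    actual_slice = torch.cat([flat[:8], flat[-8:]]).to(torch.bfloat16)
    # Slices were recorded per backend (H100 / MI355X), so allow small numerical drift.
    torch.testing.assert_close(actual_slice, expected_slice, atol=1e-3, rtol=1e-3)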

0 commit comments