From 71f24e36dee3e7407ebba250f052ca0008f19e50 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: 2025年10月13日 10:22:01 +0530 Subject: [PATCH 1/5] up --- .../models/autoencoders/autoencoder_dc.py | 25 +------------ .../models/autoencoders/autoencoder_kl.py | 33 +---------------- .../autoencoders/autoencoder_kl_allegro.py | 32 +--------------- .../autoencoders/autoencoder_kl_cogvideox.py | 25 +------------ .../autoencoders/autoencoder_kl_cosmos.py | 25 +------------ .../autoencoder_kl_hunyuan_video.py | 26 +------------ .../models/autoencoders/autoencoder_kl_ltx.py | 25 +------------ .../autoencoders/autoencoder_kl_magvit.py | 25 +------------ .../autoencoders/autoencoder_kl_mochi.py | 25 +------------ .../autoencoders/autoencoder_kl_qwenimage.py | 25 +------------ .../models/autoencoders/autoencoder_kl_wan.py | 25 +------------ .../autoencoders/autoencoder_oobleck.py | 17 +-------- .../models/autoencoders/autoencoder_tiny.py | 33 +---------------- .../autoencoders/consistency_decoder_vae.py | 37 +------------------ src/diffusers/models/autoencoders/vae.py | 35 ++++++++++++++++++ 15 files changed, 63 insertions(+), 350 deletions(-) diff --git a/src/diffusers/models/autoencoders/autoencoder_dc.py b/src/diffusers/models/autoencoders/autoencoder_dc.py index 783f22e97daf..724ec3bb760c 100644 --- a/src/diffusers/models/autoencoders/autoencoder_dc.py +++ b/src/diffusers/models/autoencoders/autoencoder_dc.py @@ -27,7 +27,7 @@ from ..modeling_utils import ModelMixin from ..normalization import RMSNorm, get_normalization from ..transformers.sana_transformer import GLUMBConv -from .vae import DecoderOutput, EncoderOutput +from .vae import AutoencoderMixin, DecoderOutput, EncoderOutput class ResBlock(nn.Module): @@ -378,7 +378,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: return hidden_states -class AutoencoderDC(ModelMixin, ConfigMixin, FromOriginalModelMixin): +class AutoencoderDC(ModelMixin, AutoencoderMixin, ConfigMixin, FromOriginalModelMixin): r""" An Autoencoder model introduced in [DCAE](https://huggingface.co/papers/2410.10733) and used in [SANA](https://huggingface.co/papers/2410.10629). @@ -536,27 +536,6 @@ def enable_tiling( self.tile_latent_min_height = self.tile_sample_min_height // self.spatial_compression_ratio self.tile_latent_min_width = self.tile_sample_min_width // self.spatial_compression_ratio - def disable_tiling(self) -> None: - r""" - Disable tiled AE decoding. If `enable_tiling` was previously enabled, this method will go back to computing - decoding in one step. - """ - self.use_tiling = False - - def enable_slicing(self) -> None: - r""" - Enable sliced AE decoding. When this option is enabled, the AE will split the input tensor in slices to compute - decoding in several steps. This is useful to save some memory and allow larger batch sizes. - """ - self.use_slicing = True - - def disable_slicing(self) -> None: - r""" - Disable sliced AE decoding. If `enable_slicing` was previously enabled, this method will go back to computing - decoding in one step. - """ - self.use_slicing = False - def _encode(self, x: torch.Tensor) -> torch.Tensor: batch_size, num_channels, height, width = x.shape diff --git a/src/diffusers/models/autoencoders/autoencoder_kl.py b/src/diffusers/models/autoencoders/autoencoder_kl.py index d823c2fb8b04..1a72aa3cfeb3 100644 --- a/src/diffusers/models/autoencoders/autoencoder_kl.py +++ b/src/diffusers/models/autoencoders/autoencoder_kl.py @@ -32,10 +32,10 @@ ) from ..modeling_outputs import AutoencoderKLOutput from ..modeling_utils import ModelMixin -from .vae import Decoder, DecoderOutput, DiagonalGaussianDistribution, Encoder +from .vae import AutoencoderMixin, Decoder, DecoderOutput, DiagonalGaussianDistribution, Encoder -class AutoencoderKL(ModelMixin, ConfigMixin, FromOriginalModelMixin, PeftAdapterMixin): +class AutoencoderKL(ModelMixin, AutoencoderMixin, ConfigMixin, FromOriginalModelMixin, PeftAdapterMixin): r""" A VAE model with KL loss for encoding images into latents and decoding latent representations into images. @@ -138,35 +138,6 @@ def __init__( self.tile_latent_min_size = int(sample_size / (2 ** (len(self.config.block_out_channels) - 1))) self.tile_overlap_factor = 0.25 - def enable_tiling(self, use_tiling: bool = True): - r""" - Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to - compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow - processing larger images. - """ - self.use_tiling = use_tiling - - def disable_tiling(self): - r""" - Disable tiled VAE decoding. If `enable_tiling` was previously enabled, this method will go back to computing - decoding in one step. - """ - self.enable_tiling(False) - - def enable_slicing(self): - r""" - Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to - compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. - """ - self.use_slicing = True - - def disable_slicing(self): - r""" - Disable sliced VAE decoding. If `enable_slicing` was previously enabled, this method will go back to computing - decoding in one step. - """ - self.use_slicing = False - @property # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors def attn_processors(self) -> Dict[str, AttentionProcessor]: diff --git a/src/diffusers/models/autoencoders/autoencoder_kl_allegro.py b/src/diffusers/models/autoencoders/autoencoder_kl_allegro.py index c24b8f42aca4..6756586460d3 100644 --- a/src/diffusers/models/autoencoders/autoencoder_kl_allegro.py +++ b/src/diffusers/models/autoencoders/autoencoder_kl_allegro.py @@ -28,6 +28,7 @@ from ..modeling_utils import ModelMixin from ..resnet import ResnetBlock2D from ..upsampling import Upsample2D +from .vae import AutoencoderMixin class AllegroTemporalConvLayer(nn.Module): @@ -673,7 +674,7 @@ def forward(self, sample: torch.Tensor) -> torch.Tensor: return sample -class AutoencoderKLAllegro(ModelMixin, ConfigMixin): +class AutoencoderKLAllegro(ModelMixin, AutoencoderMixin, ConfigMixin): r""" A VAE model with KL loss for encoding videos into latents and decoding latent representations into videos. Used in [Allegro](https://github.com/rhymes-ai/Allegro). @@ -795,35 +796,6 @@ def __init__( sample_size - self.tile_overlap_w, ) - def enable_tiling(self) -> None: - r""" - Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to - compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow - processing larger images. - """ - self.use_tiling = True - - def disable_tiling(self) -> None: - r""" - Disable tiled VAE decoding. If `enable_tiling` was previously enabled, this method will go back to computing - decoding in one step. - """ - self.use_tiling = False - - def enable_slicing(self) -> None: - r""" - Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to - compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. - """ - self.use_slicing = True - - def disable_slicing(self) -> None: - r""" - Disable sliced VAE decoding. If `enable_slicing` was previously enabled, this method will go back to computing - decoding in one step. - """ - self.use_slicing = False - def _encode(self, x: torch.Tensor) -> torch.Tensor: # TODO(aryan) # if self.use_tiling and (width> self.tile_sample_min_width or height> self.tile_sample_min_height): diff --git a/src/diffusers/models/autoencoders/autoencoder_kl_cogvideox.py b/src/diffusers/models/autoencoders/autoencoder_kl_cogvideox.py index e0e9436e8984..5096b725d0bb 100644 --- a/src/diffusers/models/autoencoders/autoencoder_kl_cogvideox.py +++ b/src/diffusers/models/autoencoders/autoencoder_kl_cogvideox.py @@ -29,7 +29,7 @@ from ..modeling_outputs import AutoencoderKLOutput from ..modeling_utils import ModelMixin from ..upsampling import CogVideoXUpsample3D -from .vae import DecoderOutput, DiagonalGaussianDistribution +from .vae import AutoencoderMixin, DecoderOutput, DiagonalGaussianDistribution logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -955,7 +955,7 @@ def forward( return hidden_states, new_conv_cache -class AutoencoderKLCogVideoX(ModelMixin, ConfigMixin, FromOriginalModelMixin): +class AutoencoderKLCogVideoX(ModelMixin, AutoencoderMixin, ConfigMixin, FromOriginalModelMixin): r""" A VAE model with KL loss for encoding images into latents and decoding latent representations into images. Used in [CogVideoX](https://github.com/THUDM/CogVideo). @@ -1124,27 +1124,6 @@ def enable_tiling( self.tile_overlap_factor_height = tile_overlap_factor_height or self.tile_overlap_factor_height self.tile_overlap_factor_width = tile_overlap_factor_width or self.tile_overlap_factor_width - def disable_tiling(self) -> None: - r""" - Disable tiled VAE decoding. If `enable_tiling` was previously enabled, this method will go back to computing - decoding in one step. - """ - self.use_tiling = False - - def enable_slicing(self) -> None: - r""" - Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to - compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. - """ - self.use_slicing = True - - def disable_slicing(self) -> None: - r""" - Disable sliced VAE decoding. If `enable_slicing` was previously enabled, this method will go back to computing - decoding in one step. - """ - self.use_slicing = False - def _encode(self, x: torch.Tensor) -> torch.Tensor: batch_size, num_channels, num_frames, height, width = x.shape diff --git a/src/diffusers/models/autoencoders/autoencoder_kl_cosmos.py b/src/diffusers/models/autoencoders/autoencoder_kl_cosmos.py index 500e316ebcf0..b17522d1c424 100644 --- a/src/diffusers/models/autoencoders/autoencoder_kl_cosmos.py +++ b/src/diffusers/models/autoencoders/autoencoder_kl_cosmos.py @@ -24,7 +24,7 @@ from ...utils.accelerate_utils import apply_forward_hook from ..modeling_outputs import AutoencoderKLOutput from ..modeling_utils import ModelMixin -from .vae import DecoderOutput, IdentityDistribution +from .vae import AutoencoderMixin, DecoderOutput, IdentityDistribution logger = get_logger(__name__) @@ -875,7 +875,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: return hidden_states -class AutoencoderKLCosmos(ModelMixin, ConfigMixin): +class AutoencoderKLCosmos(ModelMixin, AutoencoderMixin, ConfigMixin): r""" Autoencoder used in [Cosmos](https://huggingface.co/papers/2501.03575). @@ -1031,27 +1031,6 @@ def enable_tiling( self.tile_sample_stride_width = tile_sample_stride_width or self.tile_sample_stride_width self.tile_sample_stride_num_frames = tile_sample_stride_num_frames or self.tile_sample_stride_num_frames - def disable_tiling(self) -> None: - r""" - Disable tiled VAE decoding. If `enable_tiling` was previously enabled, this method will go back to computing - decoding in one step. - """ - self.use_tiling = False - - def enable_slicing(self) -> None: - r""" - Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to - compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. - """ - self.use_slicing = True - - def disable_slicing(self) -> None: - r""" - Disable sliced VAE decoding. If `enable_slicing` was previously enabled, this method will go back to computing - decoding in one step. - """ - self.use_slicing = False - def _encode(self, x: torch.Tensor) -> torch.Tensor: x = self.encoder(x) enc = self.quant_conv(x) diff --git a/src/diffusers/models/autoencoders/autoencoder_kl_hunyuan_video.py b/src/diffusers/models/autoencoders/autoencoder_kl_hunyuan_video.py index 7b0f9889a52f..88b9bb507ff6 100644 --- a/src/diffusers/models/autoencoders/autoencoder_kl_hunyuan_video.py +++ b/src/diffusers/models/autoencoders/autoencoder_kl_hunyuan_video.py @@ -18,7 +18,6 @@ import torch import torch.nn as nn import torch.nn.functional as F -import torch.utils.checkpoint from ...configuration_utils import ConfigMixin, register_to_config from ...utils import logging @@ -27,7 +26,7 @@ from ..attention_processor import Attention from ..modeling_outputs import AutoencoderKLOutput from ..modeling_utils import ModelMixin -from .vae import DecoderOutput, DiagonalGaussianDistribution +from .vae import AutoencoderMixin, DecoderOutput, DiagonalGaussianDistribution logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -625,7 +624,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: return hidden_states -class AutoencoderKLHunyuanVideo(ModelMixin, ConfigMixin): +class AutoencoderKLHunyuanVideo(ModelMixin, AutoencoderMixin, ConfigMixin): r""" A VAE model with KL loss for encoding videos into latents and decoding latent representations into videos. Introduced in [HunyuanVideo](https://huggingface.co/papers/2412.03603). @@ -764,27 +763,6 @@ def enable_tiling( self.tile_sample_stride_width = tile_sample_stride_width or self.tile_sample_stride_width self.tile_sample_stride_num_frames = tile_sample_stride_num_frames or self.tile_sample_stride_num_frames - def disable_tiling(self) -> None: - r""" - Disable tiled VAE decoding. If `enable_tiling` was previously enabled, this method will go back to computing - decoding in one step. - """ - self.use_tiling = False - - def enable_slicing(self) -> None: - r""" - Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to - compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. - """ - self.use_slicing = True - - def disable_slicing(self) -> None: - r""" - Disable sliced VAE decoding. If `enable_slicing` was previously enabled, this method will go back to computing - decoding in one step. - """ - self.use_slicing = False - def _encode(self, x: torch.Tensor) -> torch.Tensor: batch_size, num_channels, num_frames, height, width = x.shape diff --git a/src/diffusers/models/autoencoders/autoencoder_kl_ltx.py b/src/diffusers/models/autoencoders/autoencoder_kl_ltx.py index 51c600a4e915..47f2081b7e45 100644 --- a/src/diffusers/models/autoencoders/autoencoder_kl_ltx.py +++ b/src/diffusers/models/autoencoders/autoencoder_kl_ltx.py @@ -26,7 +26,7 @@ from ..modeling_outputs import AutoencoderKLOutput from ..modeling_utils import ModelMixin from ..normalization import RMSNorm -from .vae import DecoderOutput, DiagonalGaussianDistribution +from .vae import AutoencoderMixin, DecoderOutput, DiagonalGaussianDistribution class LTXVideoCausalConv3d(nn.Module): @@ -1034,7 +1034,7 @@ def forward(self, hidden_states: torch.Tensor, temb: Optional[torch.Tensor] = No return hidden_states -class AutoencoderKLLTXVideo(ModelMixin, ConfigMixin, FromOriginalModelMixin): +class AutoencoderKLLTXVideo(ModelMixin, AutoencoderMixin, ConfigMixin, FromOriginalModelMixin): r""" A VAE model with KL loss for encoding images into latents and decoding latent representations into images. Used in [LTX](https://huggingface.co/Lightricks/LTX-Video). @@ -1219,27 +1219,6 @@ def enable_tiling( self.tile_sample_stride_width = tile_sample_stride_width or self.tile_sample_stride_width self.tile_sample_stride_num_frames = tile_sample_stride_num_frames or self.tile_sample_stride_num_frames - def disable_tiling(self) -> None: - r""" - Disable tiled VAE decoding. If `enable_tiling` was previously enabled, this method will go back to computing - decoding in one step. - """ - self.use_tiling = False - - def enable_slicing(self) -> None: - r""" - Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to - compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. - """ - self.use_slicing = True - - def disable_slicing(self) -> None: - r""" - Disable sliced VAE decoding. If `enable_slicing` was previously enabled, this method will go back to computing - decoding in one step. - """ - self.use_slicing = False - def _encode(self, x: torch.Tensor) -> torch.Tensor: batch_size, num_channels, num_frames, height, width = x.shape diff --git a/src/diffusers/models/autoencoders/autoencoder_kl_magvit.py b/src/diffusers/models/autoencoders/autoencoder_kl_magvit.py index 43294a901f02..97ca9d669264 100644 --- a/src/diffusers/models/autoencoders/autoencoder_kl_magvit.py +++ b/src/diffusers/models/autoencoders/autoencoder_kl_magvit.py @@ -26,7 +26,7 @@ from ..activations import get_activation from ..modeling_outputs import AutoencoderKLOutput from ..modeling_utils import ModelMixin -from .vae import DecoderOutput, DiagonalGaussianDistribution +from .vae import AutoencoderMixin, DecoderOutput, DiagonalGaussianDistribution logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -663,7 +663,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: return hidden_states -class AutoencoderKLMagvit(ModelMixin, ConfigMixin): +class AutoencoderKLMagvit(ModelMixin, AutoencoderMixin, ConfigMixin): r""" A VAE model with KL loss for encoding images into latents and decoding latent representations into images. This model is used in [EasyAnimate](https://huggingface.co/papers/2405.18991). @@ -805,27 +805,6 @@ def enable_tiling( self.tile_sample_stride_width = tile_sample_stride_width or self.tile_sample_stride_width self.tile_sample_stride_num_frames = tile_sample_stride_num_frames or self.tile_sample_stride_num_frames - def disable_tiling(self) -> None: - r""" - Disable tiled VAE decoding. If `enable_tiling` was previously enabled, this method will go back to computing - decoding in one step. - """ - self.use_tiling = False - - def enable_slicing(self) -> None: - r""" - Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to - compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. - """ - self.use_slicing = True - - def disable_slicing(self) -> None: - r""" - Disable sliced VAE decoding. If `enable_slicing` was previously enabled, this method will go back to computing - decoding in one step. - """ - self.use_slicing = False - @apply_forward_hook def _encode( self, x: torch.Tensor, return_dict: bool = True diff --git a/src/diffusers/models/autoencoders/autoencoder_kl_mochi.py b/src/diffusers/models/autoencoders/autoencoder_kl_mochi.py index 404d2f6d860a..3ded9a0a5491 100644 --- a/src/diffusers/models/autoencoders/autoencoder_kl_mochi.py +++ b/src/diffusers/models/autoencoders/autoencoder_kl_mochi.py @@ -27,7 +27,7 @@ from ..modeling_outputs import AutoencoderKLOutput from ..modeling_utils import ModelMixin from .autoencoder_kl_cogvideox import CogVideoXCausalConv3d -from .vae import DecoderOutput, DiagonalGaussianDistribution +from .vae import AutoencoderMixin, DecoderOutput, DiagonalGaussianDistribution logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -657,7 +657,7 @@ def forward( return hidden_states, new_conv_cache -class AutoencoderKLMochi(ModelMixin, ConfigMixin): +class AutoencoderKLMochi(ModelMixin, AutoencoderMixin, ConfigMixin): r""" A VAE model with KL loss for encoding images into latents and decoding latent representations into images. Used in [Mochi 1 preview](https://github.com/genmoai/models). @@ -818,27 +818,6 @@ def enable_tiling( self.tile_sample_stride_height = tile_sample_stride_height or self.tile_sample_stride_height self.tile_sample_stride_width = tile_sample_stride_width or self.tile_sample_stride_width - def disable_tiling(self) -> None: - r""" - Disable tiled VAE decoding. If `enable_tiling` was previously enabled, this method will go back to computing - decoding in one step. - """ - self.use_tiling = False - - def enable_slicing(self) -> None: - r""" - Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to - compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. - """ - self.use_slicing = True - - def disable_slicing(self) -> None: - r""" - Disable sliced VAE decoding. If `enable_slicing` was previously enabled, this method will go back to computing - decoding in one step. - """ - self.use_slicing = False - def _enable_framewise_encoding(self): r""" Enables the framewise VAE encoding implementation with past latent padding. By default, Diffusers uses the diff --git a/src/diffusers/models/autoencoders/autoencoder_kl_qwenimage.py b/src/diffusers/models/autoencoders/autoencoder_kl_qwenimage.py index 87ac40659212..844530d1f141 100644 --- a/src/diffusers/models/autoencoders/autoencoder_kl_qwenimage.py +++ b/src/diffusers/models/autoencoders/autoencoder_kl_qwenimage.py @@ -32,7 +32,7 @@ from ..activations import get_activation from ..modeling_outputs import AutoencoderKLOutput from ..modeling_utils import ModelMixin -from .vae import DecoderOutput, DiagonalGaussianDistribution +from .vae import AutoencoderMixin, DecoderOutput, DiagonalGaussianDistribution logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -664,7 +664,7 @@ def forward(self, x, feat_cache=None, feat_idx=[0]): return x -class AutoencoderKLQwenImage(ModelMixin, ConfigMixin, FromOriginalModelMixin): +class AutoencoderKLQwenImage(ModelMixin, AutoencoderMixin, ConfigMixin, FromOriginalModelMixin): r""" A VAE model with KL loss for encoding videos into latents and decoding latent representations into videos. @@ -764,27 +764,6 @@ def enable_tiling( self.tile_sample_stride_height = tile_sample_stride_height or self.tile_sample_stride_height self.tile_sample_stride_width = tile_sample_stride_width or self.tile_sample_stride_width - def disable_tiling(self) -> None: - r""" - Disable tiled VAE decoding. If `enable_tiling` was previously enabled, this method will go back to computing - decoding in one step. - """ - self.use_tiling = False - - def enable_slicing(self) -> None: - r""" - Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to - compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. - """ - self.use_slicing = True - - def disable_slicing(self) -> None: - r""" - Disable sliced VAE decoding. If `enable_slicing` was previously enabled, this method will go back to computing - decoding in one step. - """ - self.use_slicing = False - def clear_cache(self): def _count_conv3d(model): count = 0 diff --git a/src/diffusers/models/autoencoders/autoencoder_kl_wan.py b/src/diffusers/models/autoencoders/autoencoder_kl_wan.py index e6e58c1cce85..cc3fd664da1a 100644 --- a/src/diffusers/models/autoencoders/autoencoder_kl_wan.py +++ b/src/diffusers/models/autoencoders/autoencoder_kl_wan.py @@ -26,7 +26,7 @@ from ..activations import get_activation from ..modeling_outputs import AutoencoderKLOutput from ..modeling_utils import ModelMixin -from .vae import DecoderOutput, DiagonalGaussianDistribution +from .vae import AutoencoderMixin, DecoderOutput, DiagonalGaussianDistribution logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -952,7 +952,7 @@ def unpatchify(x, patch_size): return x -class AutoencoderKLWan(ModelMixin, ConfigMixin, FromOriginalModelMixin): +class AutoencoderKLWan(ModelMixin, AutoencoderMixin, ConfigMixin, FromOriginalModelMixin): r""" A VAE model with KL loss for encoding videos into latents and decoding latent representations into videos. Introduced in [Wan 2.1]. @@ -1111,27 +1111,6 @@ def enable_tiling( self.tile_sample_stride_height = tile_sample_stride_height or self.tile_sample_stride_height self.tile_sample_stride_width = tile_sample_stride_width or self.tile_sample_stride_width - def disable_tiling(self) -> None: - r""" - Disable tiled VAE decoding. If `enable_tiling` was previously enabled, this method will go back to computing - decoding in one step. - """ - self.use_tiling = False - - def enable_slicing(self) -> None: - r""" - Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to - compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. - """ - self.use_slicing = True - - def disable_slicing(self) -> None: - r""" - Disable sliced VAE decoding. If `enable_slicing` was previously enabled, this method will go back to computing - decoding in one step. - """ - self.use_slicing = False - def clear_cache(self): # Use cached conv counts for decoder and encoder to avoid re-iterating modules each call self._conv_num = self._cached_conv_counts["decoder"] diff --git a/src/diffusers/models/autoencoders/autoencoder_oobleck.py b/src/diffusers/models/autoencoders/autoencoder_oobleck.py index a10b616b4e25..d83264559209 100644 --- a/src/diffusers/models/autoencoders/autoencoder_oobleck.py +++ b/src/diffusers/models/autoencoders/autoencoder_oobleck.py @@ -25,6 +25,7 @@ from ...utils.accelerate_utils import apply_forward_hook from ...utils.torch_utils import randn_tensor from ..modeling_utils import ModelMixin +from .vae import AutoencoderMixin class Snake1d(nn.Module): @@ -291,7 +292,7 @@ def forward(self, hidden_state): return hidden_state -class AutoencoderOobleck(ModelMixin, ConfigMixin): +class AutoencoderOobleck(ModelMixin, AutoencoderMixin, ConfigMixin): r""" An autoencoder for encoding waveforms into latents and decoding latent representations into waveforms. First introduced in Stable Audio. @@ -356,20 +357,6 @@ def __init__( self.use_slicing = False - def enable_slicing(self): - r""" - Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to - compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. - """ - self.use_slicing = True - - def disable_slicing(self): - r""" - Disable sliced VAE decoding. If `enable_slicing` was previously enabled, this method will go back to computing - decoding in one step. - """ - self.use_slicing = False - @apply_forward_hook def encode( self, x: torch.Tensor, return_dict: bool = True diff --git a/src/diffusers/models/autoencoders/autoencoder_tiny.py b/src/diffusers/models/autoencoders/autoencoder_tiny.py index 3e2b28606e29..b9ac713d7392 100644 --- a/src/diffusers/models/autoencoders/autoencoder_tiny.py +++ b/src/diffusers/models/autoencoders/autoencoder_tiny.py @@ -22,7 +22,7 @@ from ...utils import BaseOutput from ...utils.accelerate_utils import apply_forward_hook from ..modeling_utils import ModelMixin -from .vae import DecoderOutput, DecoderTiny, EncoderTiny +from .vae import AutoencoderMixin, DecoderOutput, DecoderTiny, EncoderTiny @dataclass @@ -38,7 +38,7 @@ class AutoencoderTinyOutput(BaseOutput): latents: torch.Tensor -class AutoencoderTiny(ModelMixin, ConfigMixin): +class AutoencoderTiny(ModelMixin, AutoencoderMixin, ConfigMixin): r""" A tiny distilled VAE model for encoding images into latents and decoding latent representations into images. @@ -162,35 +162,6 @@ def unscale_latents(self, x: torch.Tensor) -> torch.Tensor: """[0, 1] -> raw latents""" return x.sub(self.latent_shift).mul(2 * self.latent_magnitude) - def enable_slicing(self) -> None: - r""" - Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to - compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. - """ - self.use_slicing = True - - def disable_slicing(self) -> None: - r""" - Disable sliced VAE decoding. If `enable_slicing` was previously enabled, this method will go back to computing - decoding in one step. - """ - self.use_slicing = False - - def enable_tiling(self, use_tiling: bool = True) -> None: - r""" - Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to - compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow - processing larger images. - """ - self.use_tiling = use_tiling - - def disable_tiling(self) -> None: - r""" - Disable tiled VAE decoding. If `enable_tiling` was previously enabled, this method will go back to computing - decoding in one step. - """ - self.enable_tiling(False) - def _tiled_encode(self, x: torch.Tensor) -> torch.Tensor: r"""Encode a batch of images using a tiled encoder. diff --git a/src/diffusers/models/autoencoders/consistency_decoder_vae.py b/src/diffusers/models/autoencoders/consistency_decoder_vae.py index b3017a878092..0a6258fed37f 100644 --- a/src/diffusers/models/autoencoders/consistency_decoder_vae.py +++ b/src/diffusers/models/autoencoders/consistency_decoder_vae.py @@ -32,7 +32,7 @@ ) from ..modeling_utils import ModelMixin from ..unets.unet_2d import UNet2DModel -from .vae import DecoderOutput, DiagonalGaussianDistribution, Encoder +from .vae import AutoencoderMixin, DecoderOutput, DiagonalGaussianDistribution, Encoder @dataclass @@ -49,7 +49,7 @@ class ConsistencyDecoderVAEOutput(BaseOutput): latent_dist: "DiagonalGaussianDistribution" -class ConsistencyDecoderVAE(ModelMixin, ConfigMixin): +class ConsistencyDecoderVAE(ModelMixin, AutoencoderMixin, ConfigMixin): r""" The consistency decoder used with DALL-E 3. @@ -167,39 +167,6 @@ def __init__( self.tile_latent_min_size = int(sample_size / (2 ** (len(self.config.block_out_channels) - 1))) self.tile_overlap_factor = 0.25 - # Copied from diffusers.models.autoencoders.autoencoder_kl.AutoencoderKL.enable_tiling - def enable_tiling(self, use_tiling: bool = True): - r""" - Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to - compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow - processing larger images. - """ - self.use_tiling = use_tiling - - # Copied from diffusers.models.autoencoders.autoencoder_kl.AutoencoderKL.disable_tiling - def disable_tiling(self): - r""" - Disable tiled VAE decoding. If `enable_tiling` was previously enabled, this method will go back to computing - decoding in one step. - """ - self.enable_tiling(False) - - # Copied from diffusers.models.autoencoders.autoencoder_kl.AutoencoderKL.enable_slicing - def enable_slicing(self): - r""" - Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to - compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. - """ - self.use_slicing = True - - # Copied from diffusers.models.autoencoders.autoencoder_kl.AutoencoderKL.disable_slicing - def disable_slicing(self): - r""" - Disable sliced VAE decoding. If `enable_slicing` was previously enabled, this method will go back to computing - decoding in one step. - """ - self.use_slicing = False - @property # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors def attn_processors(self) -> Dict[str, AttentionProcessor]: diff --git a/src/diffusers/models/autoencoders/vae.py b/src/diffusers/models/autoencoders/vae.py index 1d74d4f472d7..c8f29aeadfbc 100644 --- a/src/diffusers/models/autoencoders/vae.py +++ b/src/diffusers/models/autoencoders/vae.py @@ -894,3 +894,38 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: # scale image from [0, 1] to [-1, 1] to match diffusers convention return x.mul(2).sub(1) + + +class AutoencoderMixin: + def enable_tiling(self): + r""" + Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to + compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow + processing larger images. + """ + if not hasattr(self, "use_tiling"): + raise NotImplementedError(f"Tiling doesn't seem to be implemented for {self.__class__.__name__}.") + self.use_tiling = True + + def disable_tiling(self): + r""" + Disable tiled VAE decoding. If `enable_tiling` was previously enabled, this method will go back to computing + decoding in one step. + """ + self.use_tiling = False + + def enable_slicing(self): + r""" + Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to + compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. + """ + if not hasattr(self, "use_slicing"): + raise NotImplementedError(f"Tiling doesn't seem to be implemented for {self.__class__.__name__}.") + self.use_slicing = True + + def disable_slicing(self): + r""" + Disable sliced VAE decoding. If `enable_slicing` was previously enabled, this method will go back to computing + decoding in one step. + """ + self.use_slicing = False From 4c9db168740db03128efcebe22e9e52453c09585 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: 2025年10月13日 10:43:58 +0530 Subject: [PATCH 2/5] correct wording. --- src/diffusers/models/autoencoders/vae.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/models/autoencoders/vae.py b/src/diffusers/models/autoencoders/vae.py index c8f29aeadfbc..7b171961254a 100644 --- a/src/diffusers/models/autoencoders/vae.py +++ b/src/diffusers/models/autoencoders/vae.py @@ -920,7 +920,7 @@ def enable_slicing(self): compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. """ if not hasattr(self, "use_slicing"): - raise NotImplementedError(f"Tiling doesn't seem to be implemented for {self.__class__.__name__}.") + raise NotImplementedError(f"Slicing doesn't seem to be implemented for {self.__class__.__name__}.") self.use_slicing = True def disable_slicing(self): From 18054507b989d21c0386f8a95ad081f60c539750 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: 2025年10月19日 09:31:36 -1000 Subject: [PATCH 3/5] up --- tests/models/autoencoders/testing_utils.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/models/autoencoders/testing_utils.py b/tests/models/autoencoders/testing_utils.py index cf1f10a4a545..8ae362ac2e94 100644 --- a/tests/models/autoencoders/testing_utils.py +++ b/tests/models/autoencoders/testing_utils.py @@ -57,6 +57,9 @@ def test_enable_disable_tiling(self): torch.manual_seed(0) model = self.model_class(**init_dict).to(torch_device) + if not hasattr(model, "use_tiling"): + pytest.skip(f"Skipping test as {self.model_class.__name__} doesn't support tiling.") + inputs_dict.update({"return_dict": False}) _ = inputs_dict.pop("generator", None) accepts_generator = self._accepts_generator(model) @@ -102,6 +105,8 @@ def test_enable_disable_slicing(self): torch.manual_seed(0) model = self.model_class(**init_dict).to(torch_device) + if not hasattr(model, "use_slicing"): + pytest.skip(f"Skipping test as {self.model_class.__name__} doesn't support tiling.") inputs_dict.update({"return_dict": False}) _ = inputs_dict.pop("generator", None) From 231b316fc04e4124c59d3cca65f6117dfa09a45a Mon Sep 17 00:00:00 2001 From: sayakpaul Date: 2025年10月20日 20:19:07 -1000 Subject: [PATCH 4/5] up --- src/diffusers/models/autoencoders/autoencoder_asym_kl.py | 4 ++-- .../models/autoencoders/autoencoder_kl_temporal_decoder.py | 4 ++-- src/diffusers/models/autoencoders/vq_model.py | 3 ++- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/diffusers/models/autoencoders/autoencoder_asym_kl.py b/src/diffusers/models/autoencoders/autoencoder_asym_kl.py index 54b1fc677188..fc812b22fba3 100644 --- a/src/diffusers/models/autoencoders/autoencoder_asym_kl.py +++ b/src/diffusers/models/autoencoders/autoencoder_asym_kl.py @@ -20,10 +20,10 @@ from ...utils.accelerate_utils import apply_forward_hook from ..modeling_outputs import AutoencoderKLOutput from ..modeling_utils import ModelMixin -from .vae import DecoderOutput, DiagonalGaussianDistribution, Encoder, MaskConditionDecoder +from .vae import AutoencoderMixin, DecoderOutput, DiagonalGaussianDistribution, Encoder, MaskConditionDecoder -class AsymmetricAutoencoderKL(ModelMixin, ConfigMixin): +class AsymmetricAutoencoderKL(ModelMixin, AutoencoderMixin, ConfigMixin): r""" Designing a Better Asymmetric VQGAN for StableDiffusion https://huggingface.co/papers/2306.04632 . A VAE model with KL loss for encoding images into latents and decoding latent representations into images. diff --git a/src/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py b/src/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py index cf46e52564bf..ab76254d19e2 100644 --- a/src/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +++ b/src/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py @@ -23,7 +23,7 @@ from ..modeling_outputs import AutoencoderKLOutput from ..modeling_utils import ModelMixin from ..unets.unet_3d_blocks import MidBlockTemporalDecoder, UpBlockTemporalDecoder -from .vae import DecoderOutput, DiagonalGaussianDistribution, Encoder +from .vae import AutoencoderMixin, DecoderOutput, DiagonalGaussianDistribution, Encoder class TemporalDecoder(nn.Module): @@ -135,7 +135,7 @@ def forward( return sample -class AutoencoderKLTemporalDecoder(ModelMixin, ConfigMixin): +class AutoencoderKLTemporalDecoder(ModelMixin, AutoencoderMixin, ConfigMixin): r""" A VAE model with KL loss for encoding images into latents and decoding latent representations into images. diff --git a/src/diffusers/models/autoencoders/vq_model.py b/src/diffusers/models/autoencoders/vq_model.py index c1094e62f7ec..82436473dfc6 100644 --- a/src/diffusers/models/autoencoders/vq_model.py +++ b/src/diffusers/models/autoencoders/vq_model.py @@ -22,6 +22,7 @@ from ...utils.accelerate_utils import apply_forward_hook from ..autoencoders.vae import Decoder, DecoderOutput, Encoder, VectorQuantizer from ..modeling_utils import ModelMixin +from .vae import AutoencoderMixin @dataclass @@ -37,7 +38,7 @@ class VQEncoderOutput(BaseOutput): latents: torch.Tensor -class VQModel(ModelMixin, ConfigMixin): +class VQModel(ModelMixin, AutoencoderMixin, ConfigMixin): r""" A VQ-VAE model for decoding latent representations. From 99bc6649d01933da579dead78bc062ab2c2a1218 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: 2025年10月21日 14:49:50 -1000 Subject: [PATCH 5/5] up --- src/diffusers/models/autoencoders/autoencoder_asym_kl.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/diffusers/models/autoencoders/autoencoder_asym_kl.py b/src/diffusers/models/autoencoders/autoencoder_asym_kl.py index fc812b22fba3..fa49fcfe79f8 100644 --- a/src/diffusers/models/autoencoders/autoencoder_asym_kl.py +++ b/src/diffusers/models/autoencoders/autoencoder_asym_kl.py @@ -107,9 +107,6 @@ def __init__( self.quant_conv = nn.Conv2d(2 * latent_channels, 2 * latent_channels, 1) self.post_quant_conv = nn.Conv2d(latent_channels, latent_channels, 1) - self.use_slicing = False - self.use_tiling = False - self.register_to_config(block_out_channels=up_block_out_channels) self.register_to_config(force_upcast=False)

AltStyle によって変換されたページ (->オリジナル) /