# Copyright 2022 Cerebras Systems.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
import torch.nn as nn
from modelzoo.common.pytorch.layers.AttentionHelper import get_attention_module
from modelzoo.vision.pytorch.dit.layers.vae.ResNetBlock2D import ResnetBlock2D


class UNetMidBlock2D(nn.Module):
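    """UNet mid-block used in the DiT VAE layers: one ResnetBlock2D followed
    by ``num_layers`` pairs of (GroupNorm + self-attention, ResnetBlock2D).

    Args:
        in_channels: Number of input (and output) channels.
        temb_channels: Channel count of the time embedding passed to the
            ResNet blocks.
        dropout: Dropout probability inside the ResNet blocks.
        num_layers: Number of attention/resnet pairs after the first resnet.
        resnet_eps: Epsilon for the ResNet and GroupNorm layers.
        resnet_time_scale_shift: ``time_embedding_norm`` mode for ResnetBlock2D.
        resnet_act_fn: Non-linearity used inside the ResNet blocks.
        resnet_groups: Number of GroupNorm groups; falls back to
            ``min(in_channels // 4, 32)`` when ``None``.
        resnet_pre_norm: Whether the ResNet blocks use pre-normalization.
        add_attention: If ``False``, the attention layers are omitted.
        attn_num_head_channels: Channels per attention head; the head count is
            ``in_channels // attn_num_head_channels`` (a single head if ``None``).
        output_scale_factor: Scale factor applied to residual connections.
        attention_type: Attention implementation resolved by
            ``get_attention_module``.
        extra_attn_params: Extra parameters forwarded to
            ``get_attention_module``.
    """
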
    def __init__(
self,
in_channels: int,
temb_channels: int,
dropout: float = 0.0,
num_layers: int = 1,
resnet_eps: float = 1e-6,
resnet_time_scale_shift: str = "default",
resnet_act_fn: str = "swish",
resnet_groups: int = 32,
resnet_pre_norm: bool = True,
add_attention: bool = True,
attn_num_head_channels=1,
output_scale_factor=1.0,
attention_type="aiayn_attention",
extra_attn_params=None,
):
super().__init__()
resnet_groups = (
resnet_groups
if resnet_groups is not None
else min(in_channels // 4, 32)
)
self.add_attention = add_attention
extra_attn_params = (
{} if extra_attn_params is None else extra_attn_params
)
AttentionModule = get_attention_module(
attention_type, extra_attn_params
)
self.output_scale_factor = output_scale_factor
# there is always at least one resnet
resnets = [
ResnetBlock2D(
in_channels=in_channels,
out_channels=in_channels,
temb_channels=temb_channels,
eps=resnet_eps,
groups=resnet_groups,
dropout=dropout,
time_embedding_norm=resnet_time_scale_shift,
non_linearity=resnet_act_fn,
output_scale_factor=self.output_scale_factor,
pre_norm=resnet_pre_norm,
)
]
attentions = []
norms = []
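        # build one optional (GroupNorm, attention) pair plus one resnet per layer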
for _ in range(num_layers):
if self.add_attention:
group_norm = nn.GroupNorm(
num_channels=in_channels,
num_groups=resnet_groups,
eps=resnet_eps,
affine=True,
)
if attn_num_head_channels is not None:
num_heads = in_channels // attn_num_head_channels
else:
num_heads = 1
attention_layer = AttentionModule(
embed_dim=in_channels,
num_heads=num_heads,
inner_dim=None,
dropout=0.0,
batch_first=True,
attention_type="scaled_dot_product",
softmax_dtype_fp32=True,
use_projection_bias=True,
use_ffn_bias=True,
)
norms.append(group_norm)
attentions.append(attention_layer)
            else:
                # keep norms/attentions aligned with resnets[1:] so the zip()
                # in forward() still visits every resnet when attention is
                # disabled
                norms.append(None)
                attentions.append(None)
resnets.append(
ResnetBlock2D(
in_channels=in_channels,
out_channels=in_channels,
temb_channels=temb_channels,
eps=resnet_eps,
groups=resnet_groups,
dropout=dropout,
time_embedding_norm=resnet_time_scale_shift,
non_linearity=resnet_act_fn,
output_scale_factor=self.output_scale_factor,
pre_norm=resnet_pre_norm,
)
)
self.norms = nn.ModuleList(norms)
self.attentions = nn.ModuleList(attentions)
self.resnets = nn.ModuleList(resnets)

    def forward(self, hidden_states, temb=None):
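        """Apply the first resnet, then for each layer optionally apply
        GroupNorm + self-attention over the flattened spatial positions (with
        a residual connection divided by ``output_scale_factor``) followed by
        the next resnet.

        Args:
            hidden_states: Feature map of shape ``(batch, channel, height, width)``.
            temb: Optional time-embedding tensor forwarded to the resnets.

        Returns:
            Tensor of the same shape as ``hidden_states``.
        """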
hidden_states = self.resnets[0](hidden_states, temb)
for norm, attn, resnet in zip(
self.norms, self.attentions, self.resnets[1:]
):
if attn is not None:
residual = hidden_states
batch, channel, height, width = hidden_states.shape
# norm
hidden_states = norm(hidden_states)
# attn
hidden_states = hidden_states.view(
batch, channel, height * width
).transpose(1, 2)
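                # dense (all-ones) float attention mask over the flattened
                # spatial sequence; no positions are masked out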
attn_mask = torch.ones(
hidden_states.shape[1],
hidden_states.shape[1],
dtype=hidden_states.dtype,
device=hidden_states.device,
)
hidden_states = attn(
hidden_states,
hidden_states,
hidden_states,
attn_mask=attn_mask,
)
hidden_states = hidden_states.transpose(-1, -2).reshape(
batch, channel, height, width
)
# residual connection
hidden_states = (
hidden_states + residual
) / self.output_scale_factor
hidden_states = resnet(hidden_states, temb)
return hidden_states
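

if __name__ == "__main__":
    # Minimal usage sketch (illustrative only): build a small mid-block and run
    # a random feature map through it. The sizes below are arbitrary example
    # values, not ones prescribed by the model zoo, and this assumes the
    # modelzoo ResnetBlock2D and attention module accept these shapes.
    block = UNetMidBlock2D(
        in_channels=64,
        temb_channels=128,
        num_layers=1,
        resnet_groups=32,
        attn_num_head_channels=32,  # -> 64 // 32 = 2 attention heads
    )
    x = torch.randn(2, 64, 16, 16)  # (batch, channel, height, width)
    temb = torch.randn(2, 128)  # time embedding consumed by the resnet blocks
    out = block(x, temb)
    assert out.shape == x.shape  # the mid-block preserves the feature-map shape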