# Source code for cerebras.modelzoo.trainer.callbacks.precision

# Copyright 2022 Cerebras Systems.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Contains the MixedPrecision callback class for mixed precision training."""

from abc import ABC, abstractmethod
from contextlib import nullcontext
from typing import Literal, Optional, Union
from warnings import warn

import torch

import cerebras.pytorch as cstorch
from cerebras.modelzoo.trainer.callbacks import Callback


class Precision(Callback, ABC):
    """Abstract callback interface for precision handling.

    Concrete subclasses provide the autocast context for the forward pass
    together with their own backward pass, gradient clipping and
    optimization step, so that different precision types can be supported.
    """

    @abstractmethod
    def autocast_context_manager(self):
        """Return the context manager under which the forward pass is
        autocast.
        """

    @abstractmethod
    def backward(self, loss: torch.Tensor):
        """Run the backward pass for the given loss.

        Args:
            loss: Loss tensor.
        """

    @abstractmethod
    def clip_gradients(self, optimizer: cstorch.optim.Optimizer):
        """Clip the gradients ahead of the optimization step.

        Args:
            optimizer: The optimizer to step.
        """

    @abstractmethod
    def optimizer_step(self, optimizer: cstorch.optim.Optimizer):
        """Run the optimization step.

        Args:
            optimizer: The optimizer to step.
        """


class MixedPrecision(Precision):
    """Callback class that handles mixed precision training.

    During ``setup`` it selects the half precision dtype, optionally creates
    a gradient scaler and validates the gradient clipping settings. It then
    implements the backward pass, gradient clipping and optimizer step on top
    of that configuration.
    """

    def __init__(
        self,
        enabled: bool = True,
        fp16_type: Literal["float16", "bfloat16", "cbfloat16"] = "float16",
        precision_opt_level: Optional[int] = None,
        loss_scaling_factor: Union[float, Literal["dynamic"]] = 1.0,
        initial_loss_scale: Optional[float] = None,
        steps_per_increase: int = 2000,
        min_loss_scale: Optional[float] = None,
        max_loss_scale: Optional[float] = None,
        max_gradient_norm: Optional[float] = None,
        max_gradient_value: Optional[float] = None,
        log_loss_scale: bool = False,
    ):
        """
        Args:
            enabled: If True, enables mixed precision training. Forced to
                True during ``setup`` when running on CSX.
            fp16_type: Half precision type. One of "float16", "bfloat16",
                "cbfloat16".
            precision_opt_level: Precision optimization level. If not None,
                sets the global precision optimization level.
            loss_scaling_factor: A fixed loss scale, or "dynamic" to use
                dynamic loss scaling.
            initial_loss_scale: Initial loss scale handed to the gradient
                scaler when one is created for dynamic loss scaling.
            steps_per_increase: Number of steps before increasing the loss
                scale.
            min_loss_scale: Minimum loss scale (forwarded to the CSX
                gradient scaler only).
            max_loss_scale: Maximum loss scale (forwarded to the CSX
                gradient scaler only).
            max_gradient_norm: Maximum gradient norm for gradient clipping.
                Mutually exclusive with ``max_gradient_value``.
            max_gradient_value: Maximum gradient value for gradient clipping.
                Mutually exclusive with ``max_gradient_norm``.
            log_loss_scale: If True, log the gradient scaler's loss scale.
        """
        self.scaler = None

        self.enabled = enabled
        self.fp16_type = fp16_type
        self.precision_opt_level = precision_opt_level
        self.loss_scaling_factor = loss_scaling_factor
        self.initial_loss_scale = initial_loss_scale
        self.steps_per_increase = steps_per_increase
        self.min_loss_scale = min_loss_scale
        self.max_loss_scale = max_loss_scale
        self.max_gradient_norm = max_gradient_norm
        self.max_gradient_value = max_gradient_value
        self.log_loss_scale = log_loss_scale

        if precision_opt_level is not None:
            cstorch.backends.csx.precision.optimization_level = (
                precision_opt_level
            )

    def setup(self, trainer):
        """Resolves the half dtype and creates the gradient scaler.

        Args:
            trainer: The trainer this callback is attached to. Its
                ``backend`` and ``logger`` attributes are used.

        Raises:
            ValueError: If the gradient clipping settings are invalid, if
                ``fp16_type`` is not a supported value, or if "cbfloat16" is
                requested on CSX without dynamic loss scaling.
        """
        # pylint: disable=attribute-defined-outside-init
        self.backend = trainer.backend

        # Clipping is applied whether or not a gradient scaler exists, so the
        # thresholds must be checked before the early return below.
        self._validate_gradient_clipping()

        if not self.enabled:
            if not self.backend.is_csx:
                return
            warn(
                "Mixed precision must be enabled for CSX. Setting enabled = True"
            )
            self.enabled = True

        valid_dtypes = ["float16", "bfloat16", "cbfloat16"]
        if self.fp16_type not in valid_dtypes:
            raise ValueError(
                f"Invalid value for \"fp16_type\". Expected one of {valid_dtypes}. "
                f"Got: {self.fp16_type}."
            )

        if self.fp16_type == "cbfloat16":
            if not self.backend.is_csx:
                # Keep the fallback as a string: the checks that follow (and
                # any later call to setup) compare against string names.
                self.fp16_type = "bfloat16"
                warn(
                    "cbfloat16 is only supported on CSX. Setting half dtype to bfloat16."
                )
            elif self.loss_scaling_factor != "dynamic":
                raise ValueError(
                    "In order to use cbfloat16, dynamic loss scaling must be enabled. "
                    "Otherwise, gradients might underflow/overflow in the middle of "
                    "training and cause NaNs. Please set `loss_scaling_factor` to "
                    "`dynamic` to use cbfloat16."
                )
        elif self.backend.is_cpu and self.fp16_type != "bfloat16":
            self.fp16_type = "bfloat16"
            warn(
                "Mixed precision on CPU is only supported with bfloat16. "
                "Setting half dtype to bfloat16."
            )

        cstorch.amp.set_half_dtype(self.fp16_type)

        self.scaler = self._build_grad_scaler(trainer)

    def _validate_gradient_clipping(self):
        """Raises ValueError if the gradient clipping settings are invalid."""
        max_gradient_norm = self.max_gradient_norm
        max_gradient_value = self.max_gradient_value

        if max_gradient_norm is not None and max_gradient_norm <= 0.0:
            raise ValueError(
                f"max_gradient_norm has to be a positive float. Got "
                f"{max_gradient_norm}"
            )
        if max_gradient_value is not None and max_gradient_value <= 0.0:
            raise ValueError(
                f"max_gradient_value has to be a positive float. Got "
                f"{max_gradient_value}"
            )
        if max_gradient_norm is not None and max_gradient_value is not None:
            raise ValueError(
                f"Gradients can be clipped by norm(={max_gradient_norm}) or by "
                f"value(={max_gradient_value}), but not both. "
                f"Do not set both `max_gradient_norm` and `max_gradient_value`."
            )

    def _build_grad_scaler(self, trainer):
        """Returns the gradient scaler for the resolved dtype and backend,
        or None when no gradient scaling is used.
        """
        if self.fp16_type == "bfloat16":
            if self.loss_scaling_factor == "dynamic":
                trainer.logger.info(
                    "No need to use DLS for loss when half dtype is bfloat16. "
                    "Disabling gradient scaling."
                )
            return None

        if self.backend.is_csx:
            return cstorch.amp.GradScaler(
                loss_scale=self.loss_scaling_factor,
                init_scale=self.initial_loss_scale,
                steps_per_increase=self.steps_per_increase,
                min_loss_scale=self.min_loss_scale,
                max_loss_scale=self.max_loss_scale,
            )

        if self.loss_scaling_factor == "dynamic":
            scaler_kwargs = {"growth_interval": self.steps_per_increase}
            # Only override the initial scale when one was provided; passing
            # None would break the scaler once it materializes its scale.
            if self.initial_loss_scale is not None:
                scaler_kwargs["init_scale"] = self.initial_loss_scale
            return torch.cuda.amp.GradScaler(**scaler_kwargs)

        # Static loss scaling: the very large growth interval keeps the scale
        # from being increased.
        return torch.cuda.amp.GradScaler(
            init_scale=self.loss_scaling_factor,
            growth_interval=2**63 - 1,
        )

    def autocast_context_manager(self):
        """Returns the autocast context manager for the forward pass.

        A no-op context is returned when mixed precision is disabled or when
        running on CSX; otherwise ``cstorch.amp.autocast()`` is returned.
        """
        if not self.enabled or self.backend.is_csx:
            return nullcontext()
        return cstorch.amp.autocast()

    def backward(self, loss):
        """Performs the backward pass, scaling the loss first if a gradient
        scaler is in use.

        Args:
            loss: Loss tensor.
        """
        if self.scaler is not None:
            self.scaler.scale(loss).backward()
        else:
            loss.backward()

    def on_before_optimizer_step(self, trainer, model, optimizer):
        """Unscales the optimizer's gradients if a gradient scaler is in use."""
        if self.scaler is not None:
            self.scaler.unscale_(optimizer)

    def clip_gradients(self, optimizer):
        """Clips the gradients by norm or by value, if either is configured.

        Clipping does not depend on a gradient scaler being present, so it is
        also applied when no loss scaling is used (e.g. bfloat16).

        Args:
            optimizer: The optimizer whose parameters' gradients are clipped.
        """
        if self.max_gradient_norm is None and self.max_gradient_value is None:
            return

        params = [
            p
            for param_group in optimizer.param_groups
            for p in param_group["params"]
        ]
        if self.max_gradient_norm is not None:
            torch.nn.utils.clip_grad_norm_(params, self.max_gradient_norm)
        else:
            torch.nn.utils.clip_grad_value_(params, self.max_gradient_value)

    def optimizer_step(self, optimizer):
        """Performs the optimization step, going through the gradient scaler
        (and updating it) when one is in use.

        Args:
            optimizer: The optimizer to step.
        """
        if self.scaler is not None:
            self.scaler.step(optimizer)
            self.scaler.update()
        else:
            optimizer.step()

    def on_train_batch_end(self, trainer, model, outputs, batch, batch_idx):
        """Logs the current loss scale if ``log_loss_scale`` is set and a
        gradient scaler is in use.
        """
        if self.scaler is not None and self.log_loss_scale:
            trainer.log_metrics(loss_scale=self.scaler.get_scale())

    def on_save_checkpoint(self, trainer, state_dict):
        """Stores the gradient scaler state under the "grad_scaler" key."""
        if self.scaler is not None:
            state_dict["grad_scaler"] = self.scaler.state_dict()

    def on_load_checkpoint(self, trainer, state_dict):
        """Restores the gradient scaler state from the "grad_scaler" key, if
        a gradient scaler is in use and the key is present.
        """
        if self.scaler is None:
            return

        # TODO: handle conversion between CUDA grad scaler and cstorch grad scaler
        if "grad_scaler" in state_dict:
            self.scaler.load_state_dict(state_dict["grad_scaler"])

            trainer.logger.info(
                "Gradient scaler state found in checkpoint and loaded successfully."
            )
        else:
            trainer.logger.info(
                "Gradient scaler state not found in the checkpoint. "
                "Using default initialized state."
            )