Source code for cerebras_pytorch.amp.grad_scaler

# Copyright 2016-2023 Cerebras Systems
# SPDX-License-Identifier: BSD-3-Clause

"""Cerebras Gradient Scaler implementation"""
import warnings
from contextlib import ExitStack, contextmanager
from enum import Enum, auto
from typing import Union

import torch

import cerebras_pytorch.amp as amp
from cerebras_pytorch.backend import current_backend_impl
from cerebras_pytorch.utils._tensor import conditional_update

from ._amp_state import _amp_state, maybe_print


class OptState(Enum):
    """
    An enum to specify the optimizer's current state regarding whether it has
    been scaled or not
    """

    READY = auto()
    SCALED = auto()
    UNSCALED = auto()
    STEPPED = auto()

    def is_unscaled(self):
        """Returns true if the state is unscaled"""
        return self == OptState.UNSCALED
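
# Over one training iteration, each optimizer's `_amp_stash.state` roughly
# cycles READY -> SCALED (GradScaler.scale) -> UNSCALED (GradScaler.unscale_,
# possibly invoked implicitly by step) -> STEPPED (GradScaler.step) -> READY
# (GradScaler.update).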


class GradScaler:
    """
    Facilitates mixed precision training and dynamic loss scaling (DLS),
    optionally combined with global gradient clipping (GCC).

    For more details please see docs for amp.initialize.

    Args:
        loss_scale: If loss_scale == "dynamic", then configure dynamic loss
            scaling. Otherwise, it is the loss scale value used in static
            loss scaling.
        init_scale: The initial loss scale value if loss_scale == "dynamic"
        steps_per_increase: The number of steps after which to increase the
            loss scaling condition
        min_loss_scale: The minimum loss scale value that can be chosen by
            dynamic loss scaling
        max_loss_scale: The maximum loss scale value that can be chosen by
            dynamic loss scaling
        overflow_tolerance: The maximum fraction of steps involving infinite
            or undefined values in the gradient we allow. We reduce the loss
            scale if the tolerance is exceeded
        max_gradient_norm: The maximum gradient norm to use for global
            gradient clipping. Only applies in the DLS + GCC case. If GCC is
            not enabled, then this parameter has no effect
    """

    warned_unscaling_non_fp32_grad = False

    def __init__(
        self,
        loss_scale: Union[str, float] = None,
        init_scale: float = None,
        steps_per_increase: int = None,
        min_loss_scale: float = None,
        max_loss_scale: float = None,
        overflow_tolerance: float = 0.0,
        max_gradient_norm: float = None,
    ):
        fp16_type = _amp_state.half_dtype_str
        default_max_loss_scale_value = (
            2.0 ** 31 if fp16_type == "cbfloat16" else 2.0 ** 15
        )

        loss_scale = loss_scale if loss_scale else 1.0
        init_scale = init_scale if init_scale else default_max_loss_scale_value
        steps_per_increase = steps_per_increase if steps_per_increase else 2000
        min_loss_scale = min_loss_scale if min_loss_scale else 2.0 ** -14
        max_loss_scale = (
            max_loss_scale if max_loss_scale else default_max_loss_scale_value
        )

        self.loss_is_scaled = loss_scale != 1.0

        self.backend = current_backend_impl()

        if loss_scale == "dynamic":
            if min_loss_scale < 2.0 ** -14:
                raise ValueError("min_loss_scale too small")
            if overflow_tolerance < 0:
                raise ValueError(
                    "loss scaling counter threshold must be set >= 0"
                )

            self.dynamic = True
            self._loss_scale = torch.tensor(
                min(max_loss_scale, init_scale), dtype=torch.float32,
            )
            self._steps_since_rescale = torch.tensor(0, dtype=torch.int64)
            self._overflows_since_rescale = torch.tensor(0, dtype=torch.int64)

            self._overflow_tolerance = overflow_tolerance
            self._max_gradient_norm = max_gradient_norm
            if max_gradient_norm:
                warnings.warn(
                    "Using global gradient clipping built into GradScaler "
                    "is deprecated. Use torch.nn.utils.clip_grad_norm_"
                )

            # Will be set in `_unscale_helper`
            self._squared_local_norms = []
            # Will be set in `update_scale`
            self.isfinite = None
        else:
            self.dynamic = False
            self._loss_scale = loss_scale
            self.isfinite = True
            max_gradient_norm = None

        self._max_loss_scale = max_loss_scale
        self._min_loss_scale = min_loss_scale
        self._steps_per_increase = steps_per_increase

        self.global_norm = None

        self.backend.setup_grad_scaler(self)

        for optimizer in self.backend.optimizer_registry:
            amp.setup_optimizer(optimizer)
            optimizer._amp_stash.state = OptState.READY

    def state_dict(self, destination=None):
        """
        Returns a dictionary containing the state to be saved to a checkpoint
        """
        if not self.backend.backend_type.is_csx:
            return {}

        if self.dynamic:
            return {
                "loss_scale": self._loss_scale,
                "steps_since_rescale": self._steps_since_rescale,
                "overflows_since_rescale": self._overflows_since_rescale,
            }
        else:
            return {"loss_scale": self._loss_scale}

    def load_state_dict(self, state_dict):
        """
        Loads the state dictionary into the current params
        """

        def load_param(param, param_name):
            # Only load if the key exists in the state_dict
            if param_name in state_dict:
                value = state_dict[param_name]
                if isinstance(param, torch.Tensor):
                    if isinstance(value, torch.Tensor):
                        # Only move to device if the param device is not CPU.
                        # Otherwise keep the original value's device
                        if (
                            value.device.type != param.device.type
                            and param.device.type != "cpu"
                        ):
                            return value.to(param.device)
                        return value
                    else:
                        return torch.tensor(value, dtype=param.dtype).to(
                            param.device
                        )
                else:
                    return value
            else:
                return param

        self._loss_scale = load_param(self._loss_scale, "loss_scale")
        if self.dynamic:
            self._steps_since_rescale = load_param(
                self._steps_since_rescale, "steps_since_rescale"
            )
            self._overflows_since_rescale = load_param(
                self._overflows_since_rescale, "overflows_since_rescale"
            )

    def scale(self, loss: torch.Tensor):
        """Scales the loss in preparation of the backwards pass"""
        # TODO: handle the case of outputs being iterable
        # which is supported by the torch interface
        if not self.backend.backend_type.is_csx:
            return loss

        with self.backend.name_scope("grad_scaler.scale"):
            loss = self.backend.pre_backward(loss)

            if (not self.dynamic) and self._loss_scale == 1.0:
                # Mark optimizers as having been unscaled since there is
                # no scaling to be done
                for optimizer in self.backend.optimizer_registry:
                    # pylint: disable=protected-access
                    optimizer._amp_stash.state = OptState.UNSCALED
                return loss.float()

            for optimizer in self.backend.optimizer_registry:
                # pylint: disable=protected-access
                if optimizer._amp_stash.state == OptState.READY:
                    optimizer._prepare_amp_backward()
                    optimizer._amp_stash.state = OptState.SCALED
                    continue
                if optimizer._amp_stash.state != OptState.SCALED:
                    raise RuntimeError(
                        "Optimizer parameter gradients already scaled"
                    )

            return loss.float() * self._loss_scale

    def get_scale(self):
        """Return the loss scale"""
        return self._loss_scale

    def _unscale_helper(self, model_grads, master_grads, scale):
        for model, master in zip(model_grads, master_grads):
            if model is not None:
                # copy_ probably internally short-circuits this
                if master is not model:
                    master.copy_(model)

        if not self.dynamic and scale == 1.0:
            return

        if not GradScaler.warned_unscaling_non_fp32_grad:
            for master in master_grads:
                if master.dtype != torch.float32:
                    maybe_print(
                        f"Attempting to unscale a grad with type {master.type()}. "
                        f"Unscaling non-fp32 grads may indicate an error. "
                        f"When using Amp, you don't need to call .half() on your model."
                    )
                    GradScaler.warned_unscaling_non_fp32_grad = True

        if self.dynamic:
            inv_scale = torch.tensor(1.0, dtype=torch.float32) / scale
        else:
            inv_scale = torch.tensor(1.0 / scale, dtype=torch.float32)
        for master in master_grads:
            master.mul_(inv_scale)

        if self.dynamic:
            # Use CS1 compatible algorithm for detecting NaN/inf by using the
            # global L2 norm of all gradients
            norms_squared = [torch.sum(g * g) for g in master_grads]
            self._squared_local_norms.extend(norms_squared)

    def _unscale(
        self,
        model_grads,
        master_grads,
        unused_scale,
        models_are_masters=False,
        scale_override=None,
    ):
        scale = self._loss_scale
        if scale_override is not None:
            scale = scale_override

        if self.dynamic or not models_are_masters or scale != 1.0:
            self._unscale_helper(model_grads, master_grads, scale)

    def _unscale_with_stashed_python(
        self, model_grads, stashed_master_grads, master_grads, a, b
    ):
        # pylint: disable=missing-function-docstring
        raise NotImplementedError("stashed grads not supported")

    def _unscale_with_stashed(
        self,
        model_grads,
        stashed_master_grads,
        master_grads,
        scale_override=None,
    ):
        # pylint: disable=missing-function-docstring
        raise NotImplementedError("stashed grads not supported")

    def unscale_(self, optimizer):
        """Unscales the optimizer's parameter gradients in place"""
        # Go unscale all the gradients
        # pylint: disable=protected-access
        if optimizer._amp_stash.state == OptState.UNSCALED:
            return  # no-op
        elif optimizer._amp_stash.state == OptState.STEPPED:
            raise RuntimeError("unscale_() is being called after step().")

        # if not dynamic, short circuit to match the implicit context manager case
        if (not self.dynamic) and self._loss_scale == 1.0:
            optimizer._amp_stash.state = OptState.UNSCALED
            return

        with self.backend.name_scope("grad_scaler.unscale_"):
            optimizer._post_amp_backward(self)
            optimizer._amp_stash.params_have_scaled_gradients = False

        optimizer._amp_stash.state = OptState.UNSCALED

    # Reconsider this name because `_handle_undefined_state` is kind of nonsense.
    @contextmanager
    def _handle_undefined_state(self, optimizer):
        if not self.dynamic:
            yield
            return

        # Rather than conditionally execute the optimizer using python flow
        # control, we must trace the execution of the optimizer. However, if
        # the gradients were not finite, the state should not be updated.
        # Therefore, all state updates (parameters and optimizer state) must
        # be conditional. Use the conditional_update context manager for each
        # tensor which may be updated.
        with ExitStack() as all_conditional_updates:
            for group in optimizer.param_groups:
                for p in group["params"]:
                    if p.grad is None:
                        continue
                    all_conditional_updates.enter_context(
                        conditional_update(p, self.isfinite)
                    )
                    for state in optimizer.state[p].values():
                        all_conditional_updates.enter_context(
                            conditional_update(state, self.isfinite)
                        )

            # Update optimizer parameters and state
            yield

    def step_if_finite(self, optimizer, *args, **kwargs):
        """
        Conditionalizes the call to optimizer.step(*args, **kwargs) so that it
        only takes effect if this GradScaler detected finite grads.

        Args:
            optimizer (cerebras_pytorch.optim.Optimizer): Optimizer that
                applies the gradients.
            args: Any arguments.
            kwargs: Any keyword arguments.

        Returns:
            The result of optimizer.step()
        """
        # combine static and dynamic next
        with self._handle_undefined_state(optimizer):
            with amp.disable_casts():
                return optimizer.step(*args, **kwargs)

    def clip_gradients_and_return_isfinite(self, optimizers):
        """
        Clip the optimizers' parameter gradients and return whether or not
        the gradient norm is finite
        """
        # Compute the global norm from all squared local norms
        if not self.global_norm:
            self.global_norm = torch.sqrt(
                torch.sum(torch.stack(self._squared_local_norms))
            )

        def float32(value):
            return torch.tensor(
                value, dtype=torch.float32, device=self.global_norm.device
            )

        # self.isfinite = torch.isfinite(self.global_norm)
        # TODO: torch.isfinite^ hits a lowering error! so use:
        self.isfinite = self.global_norm < float32(float("inf"))

        if self._max_gradient_norm:
            # Then we're doing combo GGC + DLS
            # https://github.com/pytorch/pytorch/blob/release/1.9/torch/nn/utils/clip_grad.py#L56-L59
            clip_coef = float32(self._max_gradient_norm) / (
                self.global_norm + 1e-6
            )
            clip_coef = torch.where(clip_coef < 1, clip_coef, 1.0)
            for optimizer in optimizers:
                for group in optimizer.param_groups:
                    for p in group["params"]:
                        if p.grad is None:
                            continue
                        p.grad.detach().mul_(clip_coef)

        return self.isfinite
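
    # The combined GGC + DLS clipping above follows the standard global-norm
    # clipping rule; as a sketch of the math (not additional behavior):
    #
    #     global_norm = sqrt(sum_i ||g_i||^2)    over all unscaled gradients g_i
    #     clip_coef   = min(1, max_gradient_norm / (global_norm + 1e-6))
    #     g_i        <- clip_coef * g_i           applied in place
    #
    # so gradients are only ever shrunk, never amplified, and the 1e-6 term
    # guards against division by zero when every gradient is exactly zero.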

    def step(self, optimizer, *args, **kwargs):
        """
        `Step` carries out the following two operations:

        1. Internally invokes ``unscale_(optimizer)`` (unless `unscale_` was
           explicitly called for ``optimizer`` earlier in the iteration). As
           part of the `unscale_`, gradients are checked for infs/NaNs.
        2. Invokes ``optimizer.step()`` using the unscaled gradients, ensuring
           that the previous optimizer state and params carry over if we
           encounter NaNs in the gradients.

        ``*args`` and ``**kwargs`` are forwarded to ``optimizer.step()``.
        Returns the return value of ``optimizer.step(*args, **kwargs)``.

        Args:
            optimizer (cerebras_pytorch.optim.Optimizer): Optimizer that
                applies the gradients.
            args: Any arguments.
            kwargs: Any keyword arguments.
        """
        # pylint: disable=protected-access
        if optimizer._amp_stash.state == OptState.STEPPED:
            raise RuntimeError(
                "step() has already been called since the last update()."
            )

        # must unscale all optimizers prior to step and update
        # so the global grad norm can be computed correctly
        # guaranteed to also unscale the given optimizer if needed
        for _optimizer in self.backend.optimizer_registry:
            if _optimizer._amp_stash.state == OptState.READY:
                self.unscale_(_optimizer)

        with self.backend.name_scope("grad_scaler.step"):
            if self.dynamic and self.isfinite is None:
                self.isfinite = self.clip_gradients_and_return_isfinite(
                    self.backend.optimizer_registry
                )

            # Run the optimizer's base step if self.isfinite is true
            return_val = self.step_if_finite(optimizer, *args, **kwargs)

        optimizer._amp_stash.state = OptState.STEPPED

        return return_val

    def update_scale(self, optimizers):
        """
        Update the scales of the optimizers
        """
        if not self.dynamic:
            return

        # Compute the global norm from all squared local norms
        if not self.global_norm:
            self.global_norm = torch.sqrt(
                torch.sum(torch.stack(self._squared_local_norms))
            )

        def float32(value):
            return torch.tensor(
                value, dtype=torch.float32, device=self.global_norm.device
            )

        def int64(value):
            return torch.tensor(
                value, dtype=torch.int64, device=self.global_norm.device
            )

        # Reset local norms for next iteration
        self._squared_local_norms = []

        # integer representation of isfinite
        isfinite_int = self.isfinite.long()

        # Increment the step counter
        self._steps_since_rescale.add_(1)
        # If overflow, increment the overflow counter
        self._overflows_since_rescale.add_(1 - isfinite_int)

        ratio = (
            self._overflows_since_rescale.float()
            / self._steps_since_rescale.float()
        )

        # Decrease loss scale
        # decrease loss scaling condition:
        # 1 if we've exceeded our overflow tolerance
        # 0 if we haven't hit too many overflows
        overflow_tolerance_exceeded = (
            float32(self._overflow_tolerance) < ratio
        ).long()
        # decrease loss scale 2x if we're decreasing, otherwise unchanged
        loss_scale_divisor = (1 + overflow_tolerance_exceeded).float()
        self._loss_scale.div_(loss_scale_divisor)
        # reset counters
        reset_because_decreasing = 1 - overflow_tolerance_exceeded
        self._overflows_since_rescale.mul_(reset_because_decreasing)
        self._steps_since_rescale.mul_(reset_because_decreasing)

        # Increase loss scale
        # (done purposefully after the decrease logic in case the counters
        # were reset)
        # increase loss scaling condition:
        # 1 if we've exceeded our steps-per-increase counter
        # 0 if we haven't yet
        increase_counter_exceeded = (
            int64(self._steps_per_increase) < self._steps_since_rescale
        ).long()
        # increase loss scale 2x if we're increasing, otherwise unchanged
        loss_scale_multiplier = (1 + increase_counter_exceeded).float()
        self._loss_scale.mul_(loss_scale_multiplier)
        # reset counters
        reset_because_increasing = 1 - increase_counter_exceeded
        self._overflows_since_rescale.mul_(reset_because_increasing)
        self._steps_since_rescale.mul_(reset_because_increasing)

        # clamp loss scale to within min/max
        max_ls = float32(self._max_loss_scale)
        self._loss_scale.copy_(
            torch.where(self._loss_scale < max_ls, self._loss_scale, max_ls)
        )
        min_ls = float32(self._min_loss_scale)
        self._loss_scale.copy_(
            torch.where(min_ls < self._loss_scale, self._loss_scale, min_ls)
        )
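
    # For reference, the branchless tensor arithmetic in `update_scale` above
    # is equivalent to the following eager-mode control flow (an illustrative
    # sketch only; the traced version must avoid python branching):
    #
    #     steps += 1
    #     overflows += 0 if isfinite else 1
    #     if overflows / steps > overflow_tolerance:
    #         loss_scale /= 2            # too many overflows: back off
    #         steps = overflows = 0
    #     elif steps > steps_per_increase:
    #         loss_scale *= 2            # long stable stretch: grow the scale
    #         steps = overflows = 0
    #     loss_scale = min(max(loss_scale, min_loss_scale), max_loss_scale)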

    def update(self, new_scale=None):
        """Update the gradient scaler after all optimizers have been stepped"""
        if new_scale:
            raise ValueError(
                "cstorch.amp.GradScaler does not support providing a `new_scale`"
            )

        # Update scale
        if self.dynamic or self._loss_scale != 1.0:
            with self.backend.name_scope("grad_scaler.update"):
                self.update_scale(self.backend.optimizer_registry)

        # pylint: disable=protected-access,no-member
        _amp_state.handle._clear_cache()

        # clear all data from this iteration for the next
        self.isfinite = None
        for optimizer in self.backend.optimizer_registry:
            optimizer._amp_stash.state = OptState.READY
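

# Typical usage (an illustrative sketch; `model`, `optimizer`, `loss_fn`, and
# the surrounding training-loop scaffolding are assumed to exist in the
# calling training script and are not part of this module):
#
#     import cerebras_pytorch as cstorch
#
#     scaler = cstorch.amp.GradScaler(loss_scale="dynamic")
#
#     def training_step(batch, labels):
#         optimizer.zero_grad()
#         loss = loss_fn(model(batch), labels)
#         scaler.scale(loss).backward()   # scale the loss before backward
#         scaler.step(optimizer)          # unscale grads, step only if finite
#         scaler.update()                 # adjust the dynamic loss scale
#         return loss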