Source code for cerebras_pytorch.optim.optimizer

# Copyright 2016-2023 Cerebras Systems
# SPDX-License-Identifier: BSD-3-Clause

"""The Cerebras base optimizer class"""
from abc import ABC, abstractmethod

import torch

from cerebras_pytorch.backend import current_backend_impl


class Optimizer(torch.optim.Optimizer, ABC):
    """
    The abstract Cerebras base optimizer class.

    Enforces that the `preinitialize` method is implemented, wherein the
    optimizer state should be initialized ahead of time.
    """

    def __init__(self, *args, enable_global_step: bool = False, **kwargs):
        """
        Args:
            enable_global_step: If True, the optimizer will keep track of the
                global step for each parameter.
        """
        super().__init__(*args, **kwargs)
        self.enable_global_step = enable_global_step

        self.backend = current_backend_impl()
        with self.backend.device:
            self.preinitialize()

            if enable_global_step:
                for group in self.param_groups:
                    for p in group["params"]:
                        self.state[p]["step"] = torch.tensor(
                            0.0, dtype=torch.float32
                        ).to(p.device)

        self._lr_scheduler_registry = []

        self.backend.register_optimizer(self)
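
    # Construction sketch (assumes a hypothetical concrete subclass
    # `MySubclassOptimizer` taking an `lr` argument; neither name is part of
    # cerebras_pytorch). Passing `enable_global_step=True` creates the
    # per-parameter `step` tensors consumed by `increment_global_step`:
    #
    #     opt = MySubclassOptimizer(
    #         model.parameters(), lr=1e-3, enable_global_step=True
    #     )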

    def increment_global_step(self, p):
        """
        Increments the global step by 1 and returns the current value of the
        global step tensor in torch.float32 format.
        """
        if "step" not in self.state[p]:
            raise RuntimeError(
                "No global step in the state. "
                "Please pass in `enable_global_step=True` "
                "to initialize the global step"
            )

        self.state[p]["step"] += 1.0
        return self.state[p]["step"]
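
    # Usage sketch (hypothetical subclass code, not part of this class):
    # inside a subclass' `step`, the per-parameter step tensor can drive
    # step-dependent math such as bias correction, where `beta1` is an
    # assumed hyperparameter of the hypothetical subclass:
    #
    #     global_step = self.increment_global_step(p)
    #     bias_correction = 1.0 - beta1 ** global_step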

    def state_dict(self, *args, **kwargs):
        return super().state_dict(*args, **kwargs)

    def load_state_dict(self, state_dict):
        with self.backend.device:
            super().load_state_dict(state_dict)

        self.backend.post_optimizer_load_state_dict(self)
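
    # Round-trip sketch: saving and restoring goes through the standard
    # torch.optim interface; the `with self.backend.device` context above
    # ensures the restored state lands on the backend device:
    #
    #     ckpt = opt.state_dict()
    #     opt.load_state_dict(ckpt)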

    def visit_state(self, fn):
        """
        Applies a lambda to each stateful value. If the lambda returns a
        non-None value, the stored state entry is replaced with it.
        """
        for state in self.state.values():
            for key, val in state.items():
                new_val = fn(val)
                if new_val is not None:
                    state[key] = new_val
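
    # Example (assumed usage): cast every tensor-valued state entry to half
    # precision, e.g. before checkpointing. Returning None from the callable
    # leaves an entry unchanged:
    #
    #     opt.visit_state(
    #         lambda val: val.half() if torch.is_tensor(val) else None
    #     )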

    @abstractmethod
    def state_names_to_sparsify(self):
        """
        Return the names of per-parameter states that need to be sparsified
        when applying sparsity to the underlying parameters.
        """

    @abstractmethod
    def preinitialize(self):
        """
        The optimizer state must be initialized ahead of time in order to
        capture the full compute graph in the first iteration. This method
        must be overridden to perform the state preinitialization.
        """

    @abstractmethod
    def step(self, closure=None):
        """
        Perform the optimizer step itself. Note, no new state should be
        created in this function. All state must be created ahead of time in
        `preinitialize` and only updated in this method.
        """
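

# A minimal, illustrative sketch of a concrete subclass, written against only
# the abstract interface above. The class name `_SketchSGD` and its
# hyperparameters are assumptions for illustration, not part of
# cerebras_pytorch. It demonstrates the key contract: every state tensor is
# created in `preinitialize`, and `step` only updates existing state in place.
class _SketchSGD(Optimizer):
    def __init__(self, params, lr=0.01, momentum=0.9, **kwargs):
        defaults = dict(lr=lr, momentum=momentum)
        super().__init__(params, defaults, **kwargs)

    def preinitialize(self):
        # Create every state tensor up front so the first traced iteration
        # captures the full compute graph.
        for group in self.param_groups:
            for p in group["params"]:
                self.state[p]["momentum_buffer"] = torch.zeros_like(p)

    def state_names_to_sparsify(self):
        # The momentum buffer mirrors the parameter, so it should share the
        # parameter's sparsity pattern.
        return ["momentum_buffer"]

    @torch.no_grad()
    def step(self, closure=None):
        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            lr = group["lr"]
            momentum = group["momentum"]
            for p in group["params"]:
                if p.grad is None:
                    continue
                buf = self.state[p]["momentum_buffer"]
                # Update pre-created state in place; no new keys are added.
                buf.mul_(momentum).add_(p.grad)
                p.add_(buf, alpha=-lr)

        return loss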