Source code for cerebras.modelzoo.trainer.callbacks.model

# Copyright 2022 Cerebras Systems.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Model Callback class."""

import logging
from typing import Callable, Union
from warnings import warn

import torch

import cerebras.pytorch as cstorch
from cerebras.modelzoo.common.utils.model.mup_utils import is_mup
from cerebras.modelzoo.trainer.callbacks import Callback


[docs]class ModelCallback(Callback): """Callback class that handles setting up and compiling the model.""" def __init__( self, model: Union[Callable[[], torch.nn.Module], torch.nn.Module], ): """ Args: model: The model to train. It must be one of the following: - If a callable is passed, it is assumed to be a function that takes in no arguments returns a torch.nn.Module. - If a torch.nn.Module is passed, it is used as is. """ self.model = model def setup(self, trainer): if callable(self.model) and not isinstance(self.model, torch.nn.Module): with trainer.backend.device: trainer.model = self.model() elif isinstance(self.model, torch.nn.Module): trainer.model = self.model else: raise ValueError( f"Expected model to be a torch.nn.Module or a callable that " f"returns a torch.nn.Module, but got {type(self.model)}." ) if is_mup(trainer.model): logging.info(f"This is a muP configured run") trainer.compiled_model = cstorch.compile(trainer.model, trainer.backend) def on_train_start(self, trainer, model, train_dataloader, loop, loop_idx): model.train() def on_validate_start(self, trainer, model, val_dataloader, loop): model.eval() def on_before_backward(self, trainer, model, outputs): if "loss" not in outputs: raise ValueError( f"Expected the {model.__class__.__name__}'s forward call to " f"return a dictionary with a 'loss' key. " f"Got: {sorted(outputs.keys())}" ) def on_save_checkpoint(self, trainer, state_dict): state_dict["model"] = trainer.model.state_dict() def on_load_checkpoint(self, trainer, state_dict): if "model" not in state_dict: warn( f"Checkpoint does not contain a model state dict. " f"Model state was not loaded" ) else: # This check is required for backward compatibility with checkpoints # saved with older versions of ModelZoo (pre rel-2.0.0) # We check that the model state dict keys start with "model." # and if they don't, we load the state dict into the model's model if hasattr(trainer.model, "model") and not all( k.startswith("model.") for k in state_dict["model"].keys() ): trainer.model.model.load_state_dict( state_dict["model"], strict=not trainer.checkpoint.disable_strict_checkpoint_loading, ) # This should be the case that is used for all checkpoints saved # post rel-2.0.0 else: trainer.model.load_state_dict( state_dict["model"], strict=not trainer.checkpoint.disable_strict_checkpoint_loading, ) trainer.logger.info( f"Model state found in checkpoint and loaded successfully." )