Source code for cerebras.modelzoo.config_manager.config_classes.base.model_config

# Copyright 2022 Cerebras Systems.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Config classes of Model Configs

"""

from dataclasses import dataclass
from typing import List, Literal, Optional, Union

from cerebras.modelzoo.config_manager.config_classes.base.base_config import (
    BaseConfig,
)


[docs]@dataclass
class InitializerConfig(BaseConfig):
    name: str = Literal[
        "constant",
        "ones",
        "zeros",
        "eye" "uniform",
        "normal",
        "xavier_normal",
        "glorot_normal",  # alias for `xavier_normal`
        "xavier_uniform",
        "glorot_uniform",  # alias for `xavier_uniform`
        "truncated_normal",
        "variance_scaling",
        "lecun_normal",
        "lecun_uniform",
        "kaiming_normal",
        "kaiming_uniform",
    ]
    mean: Optional[float] = None
    std: Optional[float] = None
    a: Optional[float] = None
    b: Optional[float] = None
    nonlinearity: Optional[
        Literal[
            "linear",
            "conv1d",
            "conv2d",
            "conv3d",
            "conv_transpose1d",
            "conv_transpose2d",
            "conv_transpose3d",
            "sigmoid",
            "tanh",
            "relu",
            "leaky_relu",
        ]
    ] = None
    mode: Optional[str] = None
    scale: Optional[float] = None
    distribution: Optional[str] = None
    gain: Optional[float] = None


[docs]@dataclass
class NormKWArgsConfig(BaseConfig):
    pass


[docs]@dataclass
class LoraConfig:
    r: int = 0
    "Rank of LoRA matrix projections"
    alpha: int = 1
    "Scaling factor (see paper for additional details)"
    dropout: float = 0.0
    "Dropout to apply to LoRA updates"
    fan_in_fan_out: bool = False
    merge_weights: bool = True
    """Determines whether lora weights should be merged/folded
    into underlying layers"""
    target_modules: Optional[list] = None
    """A list of module names that must all exist in layers
    that will be converted to LoRA. For example, setting target_modules
    to ["TransformerDecoderLayer", "Linear"] would mean that all linear
    layers that were children of a TransformerDecoderLayer would be
    converted to LoRA."""


[docs]@dataclass
class CompressionConfig(BaseConfig):
    format: Literal["mx8-e4m3", "mx8-e3m4"]
    "mx8 Compression formats"
    param_filter: Union[str, List[str]]
    """A glob or list of glob expressions to match against parameter names
    that are to be compressed with format"""


[docs]@dataclass
class SelectiveGradConfig(BaseConfig):
    param_filter: Optional[Union[str, List[str]]] = None
    """A glob or list of glob expressions to match against parameter names
    that are to have the selective gradient mask applied"""
    init_method: str = "outlier"
    "An initialization method that represents the mask to apply"


[docs]@dataclass()
class ModelConfig(BaseConfig):
    mixed_precision: bool = False
    "Enable to run the model in mixed precision mode"

    fp16_type: Optional[Literal["bfloat16", "float16", "cbfloat16"]] = None
    "Type of 16bit precision used"

    boundary_casting: Optional[bool] = False
    lora_params: Optional[Union[LoraConfig, List[LoraConfig]]] = None

    compression: Optional[Union[CompressionConfig, List[CompressionConfig]]] = (
        None
    )
    "Weight compression configuration as a single dictionary or a list of dictionaries"

    selective_grad: Optional[
        Union[SelectiveGradConfig, List[SelectiveGradConfig]]
    ] = None
    "Selective gradient configuration as a single dictionary or a list of dictionaries"