Source code for cerebras.modelzoo.common.utils.model.lora

# Copyright 2022 Cerebras Systems.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This code is adapted from
# https://github.com/microsoft/LoRA/blob/main/loralib/layers.py
#
# Copyright 2022 Cerebras Systems.
#
#  ------------------------------------------------------------------------------------------
#  Copyright (c) Microsoft Corporation. All rights reserved.
#  Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
#  ------------------------------------------------------------------------------------------

import logging
import math
from dataclasses import dataclass
from typing import List, Optional, Set, Union

import torch
import torch.nn as nn
import torch.nn.functional as F


@dataclass
class LoraConfig:
    r"""
    r: Rank of LoRA matrix projections
    alpha: Scaling factor (see paper for additional details)
    dropout: Dropout to apply to LoRA updates
    fan_in_fan_out: Set to True if the layer being replaced stores its weight
        as (fan_in, fan_out) rather than (fan_out, fan_in)
    merge_weights: Determines whether lora weights should be merged/folded
        into underlying layers
    target_modules: A list of module names that must all exist in layers
        that will be converted to LoRA. For example, setting target_modules
        to ["TransformerDecoderLayer", "Linear"] would mean that all linear
        layers that were children of a TransformerDecoderLayer would be
        converted to LoRA.
    """

    r: int = 0
    alpha: int = 1
    dropout: float = 0.0
    fan_in_fan_out: bool = False
    merge_weights: bool = False
    target_modules: Optional[list] = None
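
# Illustrative configuration (hypothetical values): convert every nn.Linear
# nested under a TransformerDecoderLayer into a rank-8 LoRA layer with scaling
# factor alpha / r = 16 / 8 = 2:
#
#     lora_config = LoraConfig(
#         r=8,
#         alpha=16,
#         dropout=0.05,
#         target_modules=["TransformerDecoderLayer", "Linear"],
#     )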


def disable_lora_merge_weights(lora_params_dict: Union[dict, List[dict]]):
    r"""Sets merge_weights=False in LoRA parameters. This is helpful during
    eval mode to ensure that the weights don't get folded prior to checkpoint
    loading.
    """

    def _disable_merge_weights(params, printed_already=False):
        if params["merge_weights"] and not printed_already:
            logging.warning(
                "Automatically switching LoRA merge_weights to False in order "
                "to run evals."
            )
            printed_already = True
        params["merge_weights"] = False
        return printed_already

    if isinstance(lora_params_dict, list):
        printed = False
        for params in lora_params_dict:
            printed = _disable_merge_weights(params, printed)
    else:
        _disable_merge_weights(lora_params_dict)
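
# Illustrative usage (hypothetical params dict): given
#     params = {"r": 8, "alpha": 16, "dropout": 0.0, "merge_weights": True}
# disable_lora_merge_weights(params) logs the warning once and leaves
# params["merge_weights"] == False, so evals run on unmerged weights.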


class LoRALayer:
    r"""
    Base LoRA layer
    From https://github.com/microsoft/LoRA/blob/main/loralib/layers.py.
    """
    def __init__(
        self,
        r: int,
        lora_alpha: int,
        lora_dropout: float,
        merge_weights: bool,
    ):
        self.r = r
        self.lora_alpha = lora_alpha
        # Optional dropout
        if lora_dropout > 0.0:
            self.lora_dropout = nn.Dropout(p=lora_dropout)
        else:
            self.lora_dropout = lambda x: x
        # Mark the weight as unmerged
        self.merged = False
        self.merge_weights = merge_weights


class LoRA_Embedding(nn.Embedding, LoRALayer):
    r"""
    LoRA embedding layer
    From https://github.com/microsoft/LoRA/blob/main/loralib/layers.py.
    """
    def __init__(
        self,
        num_embeddings: int,
        embedding_dim: int,
        r: int = 0,
        lora_alpha: int = 1,
        merge_weights: bool = True,
        **kwargs,
    ):
        nn.Embedding.__init__(self, num_embeddings, embedding_dim, **kwargs)
        LoRALayer.__init__(
            self,
            r=r,
            lora_alpha=lora_alpha,
            lora_dropout=0,
            merge_weights=merge_weights,
        )
        # Actual trainable parameters
        if r > 0:
            self.lora_A = nn.Parameter(
                self.weight.new_zeros((r, num_embeddings))
            )
            self.lora_B = nn.Parameter(
                self.weight.new_zeros((embedding_dim, r))
            )
            self.scaling = self.lora_alpha / self.r
            # Freezing the pre-trained weight matrix
            self.weight.requires_grad = False
        self.reset_parameters()

    def reset_parameters(self):
        nn.Embedding.reset_parameters(self)
        if hasattr(self, 'lora_A'):
            # Initialize A to zero and B from a normal distribution
            nn.init.zeros_(self.lora_A)
            nn.init.normal_(self.lora_B)

    def train(self, mode: bool = True):
        nn.Embedding.train(self, mode)
        if mode:
            if self.merge_weights and self.merged:
                # Make sure that the weights are not merged
                if self.r > 0:
                    self.weight.data -= (self.lora_B @ self.lora_A).transpose(
                        0, 1
                    ) * self.scaling
                self.merged = False
        else:
            if self.merge_weights and not self.merged:
                # Merge the weights and mark it
                if self.r > 0:
                    self.weight.data += (self.lora_B @ self.lora_A).transpose(
                        0, 1
                    ) * self.scaling
                self.merged = True

    def forward(self, x: torch.Tensor):
        if self.r > 0 and not self.merged:
            result = nn.Embedding.forward(self, x)
            if self.r > 0:
                after_A = F.embedding(
                    x,
                    self.lora_A.transpose(0, 1),
                    self.padding_idx,
                    self.max_norm,
                    self.norm_type,
                    self.scale_grad_by_freq,
                    self.sparse,
                )
                result += (after_A @ self.lora_B.transpose(0, 1)) * self.scaling
            return result
        else:
            return nn.Embedding.forward(self, x)
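
    # Note: with r > 0 the effective embedding table is
    #     W_eff = W + (lora_B @ lora_A).T * (lora_alpha / r)
    # where lora_A has shape (r, num_embeddings) and lora_B has shape
    # (embedding_dim, r), so the low-rank update matches W's
    # (num_embeddings, embedding_dim) shape. train(True) subtracts the update
    # if it was previously folded into W, while eval() with merge_weights=True
    # folds it in so forward() reduces to a single embedding lookup.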


class LoRA_Linear(nn.Linear, LoRALayer):
    r"""
    LoRA linear layer
    From https://github.com/microsoft/LoRA/blob/main/loralib/layers.py.
    """
    def __init__(
        self,
        in_features: int,
        out_features: int,
        r: int = 0,
        lora_alpha: int = 1,
        lora_dropout: float = 0.0,
        fan_in_fan_out: bool = False,
        merge_weights: bool = True,
        **kwargs,
    ):
        nn.Linear.__init__(self, in_features, out_features, **kwargs)
        LoRALayer.__init__(
            self,
            r=r,
            lora_alpha=lora_alpha,
            lora_dropout=lora_dropout,
            merge_weights=merge_weights,
        )
        self.fan_in_fan_out = fan_in_fan_out
        # Actual trainable parameters
        if r > 0:
            self.lora_A = nn.Parameter(self.weight.new_zeros((r, in_features)))
            self.lora_B = nn.Parameter(self.weight.new_zeros((out_features, r)))
            self.scaling = self.lora_alpha / self.r
            # Freezing the pre-trained weight matrix
            self.weight.requires_grad = False
        self.reset_parameters()
        if fan_in_fan_out:
            self.weight.data = self.weight.data.transpose(0, 1)

    def reset_parameters(self):
        nn.Linear.reset_parameters(self)
        if hasattr(self, 'lora_A'):
            # Initialize A the same way as the default for nn.Linear and B to zero
            nn.init.kaiming_uniform_(self.lora_A, a=math.sqrt(5))
            nn.init.zeros_(self.lora_B)

    def train(self, mode: bool = True):
        def T(w):
            return w.transpose(0, 1) if self.fan_in_fan_out else w

        nn.Linear.train(self, mode)
        if mode:
            if self.merge_weights and self.merged:
                # Make sure that the weights are not merged
                if self.r > 0:
                    self.weight.data -= (
                        T(self.lora_B @ self.lora_A) * self.scaling
                    )
                self.merged = False
        else:
            if self.merge_weights and not self.merged:
                # Merge the weights and mark it
                if self.r > 0:
                    self.weight.data += (
                        T(self.lora_B @ self.lora_A) * self.scaling
                    )
                self.merged = True

    def forward(self, x: torch.Tensor):
        def T(w):
            return w.transpose(0, 1) if self.fan_in_fan_out else w

        if self.r > 0 and not self.merged:
            result = F.linear(x, T(self.weight), bias=self.bias)
            if self.r > 0:
                result += (
                    self.lora_dropout(x)
                    @ self.lora_A.transpose(0, 1)
                    @ self.lora_B.transpose(0, 1)
                ) * self.scaling
            return result
        else:
            return F.linear(x, T(self.weight), bias=self.bias)
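
    # Note: in the unmerged case the forward pass computes
    #     y = F.linear(x, T(W), bias) + (dropout(x) @ lora_A.T @ lora_B.T) * scaling
    # where scaling = lora_alpha / r, lora_A is (r, in_features) and lora_B is
    # (out_features, r). Ignoring dropout and the fan_in_fan_out transpose,
    # this equals F.linear(x, W + lora_B @ lora_A * scaling, bias), i.e. the
    # same weight that train(False) folds into self.weight when
    # merge_weights=True, so merged inference adds no extra cost.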


def get_lora_config_for_module(
    lora_params: Union[LoraConfig, List[LoraConfig]], module_names: List[str]
) -> Optional[LoraConfig]:
    r"""
    Gets lora parameters for a particular module

    Args:
        lora_params: LoRA top-level config.
        module_names: Hierarchical list of module names.
    Returns:
        lora parameters (LoraConfig) for the given module if applicable or
        None if the module is not targeted.
    """
    lora_params_list = (
        lora_params if isinstance(lora_params, list) else [lora_params]
    )
    for group_params in lora_params_list:
        target_modules = group_params.target_modules
        if target_modules is None or all(
            [e in module_names for e in target_modules]
        ):
            return group_params
    return None
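
# Illustrative example (module names are hypothetical): with
#     cfg = LoraConfig(r=8, target_modules=["TransformerDecoderLayer", "Linear"])
# get_lora_config_for_module(cfg, ["MyModel", "TransformerDecoderLayer", "Linear"])
# returns cfg because every target name appears in the hierarchy, while
# get_lora_config_for_module(cfg, ["MyModel", "Embedding"]) returns None.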


def make_model_lora(
    model: nn.Module, lora_params_dict: Union[dict, List[dict]]
):
    r"""
    Create a Low Rank Adaptation (LoRA) model from a non-LoRA model. Note that
    the original non-LoRA model may be modified through this process.

    Args:
        model: Initial model to make LoRA
        lora_params_dict: LoRA parameters (in the form of a dict or list of
            dicts) which dictate how the supplied model will be converted into
            a LoRA model. The parameters should align with LoraConfig.
    Returns:
        LoRA model
    """
    if isinstance(lora_params_dict, list):
        lora_params = [LoraConfig(**e) for e in lora_params_dict]
    else:
        lora_params = LoraConfig(**lora_params_dict)

    loraified_modules = set()
    lora_model = make_model_lora_helper(
        model, lora_params, [], loraified_modules
    )
    if len(loraified_modules) == 0:
        raise RuntimeError(
            f"No modules were converted to LoRA. Please ensure that the "
            f"target_modules listed in the lora_params are valid."
        )
    logging.info(
        f"All layers matching the following module names were converted to LoRA"
        f": {loraified_modules}"
    )

    for n, p in lora_model.named_parameters():
        if not n.endswith(".lora_A") and not n.endswith(".lora_B"):
            p.requires_grad = False
    return lora_model


def make_model_lora_helper(
    model: nn.Module,
    lora_params: Union[LoraConfig, List[LoraConfig]],
    module_names: List[str],
    loraified_modules: Set[str],
):
    module_names = module_names + [type(model).__name__]

    for name, child in model.named_children():
        model.add_module(
            name,
            make_model_lora_helper(
                child, lora_params, module_names, loraified_modules
            ),
        )

    module_lora_params = get_lora_config_for_module(lora_params, module_names)
    if module_lora_params is not None and isinstance(model, nn.Embedding):
        loraified_modules.add(".".join(module_names))
        lora_embedding = LoRA_Embedding(
            # Embedding Args:
            model.num_embeddings,
            model.embedding_dim,
            padding_idx=model.padding_idx,
            max_norm=model.max_norm,
            norm_type=model.norm_type,
            scale_grad_by_freq=model.scale_grad_by_freq,
            sparse=model.sparse,
            device=model.weight.device,
            dtype=model.weight.dtype,
            # LoRA Args:
            r=module_lora_params.r,
            lora_alpha=module_lora_params.alpha,
            merge_weights=module_lora_params.merge_weights,
        )
        with torch.no_grad():
            lora_embedding.weight.copy_(model.weight)
        del model
        return lora_embedding
    elif module_lora_params is not None and isinstance(model, nn.Linear):
        loraified_modules.add(".".join(module_names))
        lora_linear = LoRA_Linear(
            # Linear Args:
            model.in_features,
            model.out_features,
            bias=model.bias is not None,
            device=model.weight.device,
            dtype=model.weight.dtype,
            # LoRA Args:
            r=module_lora_params.r,
            lora_alpha=module_lora_params.alpha,
            lora_dropout=module_lora_params.dropout,
            fan_in_fan_out=module_lora_params.fan_in_fan_out,
            merge_weights=module_lora_params.merge_weights,
        )
        with torch.no_grad():
            lora_linear.weight.copy_(model.weight)
        if model.bias is not None:
            with torch.no_grad():
                lora_linear.bias.copy_(model.bias)
        del model
        return lora_linear
    else:
        return model
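

# Illustrative usage sketch (assumes a toy nn.Sequential model; hypothetical
# values, not exercised anywhere else in this module): convert the nn.Linear
# layers to LoRA and confirm that only the low-rank lora_A / lora_B matrices
# remain trainable.
if __name__ == "__main__":
    toy_model = nn.Sequential(nn.Linear(16, 32), nn.ReLU(), nn.Linear(32, 16))
    lora_params = {
        "r": 4,
        "alpha": 8,
        "dropout": 0.0,
        "merge_weights": False,
        "target_modules": ["Sequential", "Linear"],
    }
    lora_model = make_model_lora(toy_model, lora_params)
    # Expect ['0.lora_A', '0.lora_B', '2.lora_A', '2.lora_B'] to stay trainable.
    print([n for n, p in lora_model.named_parameters() if p.requires_grad])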