# Source code for modelzoo.common.pytorch.model_utils.checkpoint_converters.opt_hf_cs

# Copyright 2022 Cerebras Systems.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
import re
from typing import Tuple

import torch

from modelzoo.common.pytorch.model_utils.checkpoint_converters.base_converter import (
    BaseCheckpointConverter_CS_CS,
    BaseCheckpointConverter_HF_CS,
    BaseConfigConverter,
    BaseConfigConverter_CS_CS,
    BaseConfigConverter_HF_CS,
    ConfigConversionError,
    ConversionRule,
    EquivalentSubkey,
    FormatVersions,
)
from modelzoo.common.pytorch.model_utils.checkpoint_converters.helper import (
    convert_use_rms_layer_norm_helper,
)


class Converter_OPT_Attention_HF_CS17(BaseCheckpointConverter_HF_CS):
    """Rename OPT attention projection keys between HF and CS 1.7.

    Each rule pairs a Hugging Face projection name with its CS name and
    renames the matching ``.weight`` / ``.bias`` entry through
    ``replaceKey``. This converter has no config converter of its own.
    """

    def __init__(self):
        super().__init__()
        # (HF name, CS name) for each projection of the attention block.
        projection_names = (
            ("q_proj", "proj_q_dense_layer"),
            ("k_proj", "proj_k_dense_layer"),
            ("v_proj", "proj_v_dense_layer"),
            ("out_proj", "proj_output_dense_layer"),
        )
        # Raw string: "\." in a normal literal is an invalid escape sequence.
        self.rules = [
            ConversionRule(
                [EquivalentSubkey(hf_name, cs_name), r"\.(?:weight|bias)"],
                action=self.replaceKey,
            )
            for hf_name, cs_name in projection_names
        ]

    @staticmethod
    def formats() -> Tuple[FormatVersions, FormatVersions]:
        """Return the (HF, CS 1.7) format pair handled by this converter."""
        return (FormatVersions("hf"), FormatVersions("cs-1.7"))

    @staticmethod
    def get_config_converter_class() -> BaseConfigConverter:
        """Return ``None``: no config converter is associated with this class."""
        return None


class Converter_OPT_Headless_HF_CS17(BaseCheckpointConverter_HF_CS):
    """Convert between the headless HF ``OPTModel`` and the CS 1.7 OPT model.

    The HF side has no language model head while the CS side does, so a
    default-initialized ``lm_head`` is created when converting from HF
    (see ``post_model_convert``). ``lm_head`` and ``ln_f`` keys are declared
    as existing only on the right-hand (CS) side.
    """

    def __init__(self):
        super().__init__()
        # Raw strings: "\." and "\d" are invalid escapes in normal literals.
        weight_or_bias = r"\.(?:weight|bias)"

        def layer_rule(hf_name, cs_name):
            # Rule for one per-decoder-layer parameter; fresh subkey objects
            # are built for every rule, as in the hand-written form.
            return ConversionRule(
                [
                    EquivalentSubkey(
                        "decoder.layers", "transformer_decoder.layers"
                    ),
                    r"\.\d+\.",
                    EquivalentSubkey(hf_name, cs_name),
                    weight_or_bias,
                ],
                action=self.replaceKey,
            )

        self.rules = [
            # word embeddings
            ConversionRule(
                [
                    EquivalentSubkey(
                        "decoder.embed_tokens",
                        "embedding_layer.word_embeddings",
                    ),
                    weight_or_bias,
                ],
                action=self.replaceKey,
            ),
            ConversionRule(
                [
                    EquivalentSubkey(
                        "decoder.embed_positions",
                        "embedding_layer.position_embeddings",
                    ),
                    weight_or_bias,
                ],
                action=self.replaceKey,
            ),
            # final layer norm
            ConversionRule(
                [
                    EquivalentSubkey(
                        "decoder.final_layer_norm", "transformer_decoder.norm"
                    ),
                    weight_or_bias,
                ],
                action=self.replace_final_norm,
            ),
            # attention (delegated to the nested attention converter)
            ConversionRule(
                [
                    EquivalentSubkey(
                        "decoder.layers", "transformer_decoder.layers"
                    ),
                    r"\.\d+\.",
                    EquivalentSubkey("self_attn.", "self_attn."),
                    Converter_OPT_Attention_HF_CS17(),
                ],
                action=None,
            ),
            # attention norm
            layer_rule("self_attn_layer_norm", "norm1"),
            # ffn norm
            layer_rule("final_layer_norm", "norm3"),
            # intermediate ffn
            layer_rule("fc1", "ffn.ffn.0.linear_layer"),
            layer_rule("fc2", "ffn.ffn.1.linear_layer"),
            # keys that only exist on the CS side
            ConversionRule([r"lm_head\.(?:weight|bias)"], exists="right"),
            ConversionRule([r"ln_f\.(?:weight|bias)"], exists="right"),
        ]

    def replace_final_norm(
        self,
        old_key,
        new_key,
        old_state_dict,
        new_state_dict,
        from_index,
        action_fn_args,
    ):
        """Copy the final layer norm and, when coming from HF, mirror it to ``ln_f``.

        Args:
            old_key: Key of the tensor in ``old_state_dict``.
            new_key: Key to write in ``new_state_dict``.
            old_state_dict: Source state dict.
            new_state_dict: Destination state dict (mutated in place).
            from_index: 0 when converting from HF, 1 when converting from CS.
            action_fn_args: Unused by this action.
        """
        new_state_dict[new_key] = old_state_dict[old_key]
        # CS 1.7 has both "ln_f" and "transformer_decoder.norm"
        # we need to copy the original ("ln_f") too:
        if from_index == 0:
            ln_f_key = re.sub(r"transformer_decoder\.norm\.", "ln_f.", new_key)
            new_state_dict[ln_f_key] = old_state_dict[old_key]

    def pre_model_convert(
        self,
        old_state_dict,
        new_state_dict,
        configs,
        from_index,
        drop_unmatched_keys,
    ):
        """Warn about the missing head (HF -> CS) or tie weights (CS -> HF).

        Args:
            old_state_dict: Source state dict; may be mutated to fill in a
                tied word-embedding weight.
            new_state_dict: Destination state dict (not touched here).
            configs: Pair of configs; ``configs[1]["model"]`` is read.
            from_index: 0 when converting from HF, 1 when converting from CS.
            drop_unmatched_keys: Not used by this hook.
        """
        if from_index == 0:
            logging.warning(
                "{} OPT has a language model head (lm_head) "
                "while {} OPTModel does not. Initializing lm_head to default.".format(
                    self.formats()[1], self.formats()[0]
                )
            )

        # Manually tie weights
        if from_index == 1 and configs[1]["model"]["share_embedding_weights"]:
            # The `0` default makes a missing key fail the `is None` test, so
            # only a key that is present with a None value gets filled in.
            if (
                old_state_dict.get("embedding_layer.word_embeddings.weight", 0)
                is None
            ):
                old_state_dict[
                    "embedding_layer.word_embeddings.weight"
                ] = old_state_dict["lm_head.weight"]

    def post_model_convert(
        self,
        old_state_dict,
        new_state_dict,
        configs,
        from_index,
        drop_unmatched_keys,
    ):
        """Create a default ``lm_head`` when converting from HF, then defer to the base.

        Args:
            old_state_dict: Source state dict.
            new_state_dict: Destination state dict; gains ``lm_head.weight``
                (and ``lm_head.bias`` if the CS config enables it).
            configs: Pair of configs; ``configs[1]["model"]`` is read.
            from_index: 0 when converting from HF, 1 when converting from CS.
            drop_unmatched_keys: Forwarded to the base implementation.
        """
        if from_index == 0:
            # We are converting from HF OPTModel (which is headless) -> CS OPTModel (which has a head)
            # We need to create 'lm_head' and init to default values
            vocab_size, embed_dim = new_state_dict[
                "embedding_layer.word_embeddings.weight"
            ].shape
            lm_head_weight = torch.zeros((vocab_size, embed_dim))
            lm_head_weight.normal_(mean=0.0, std=0.02)
            new_state_dict["lm_head.weight"] = lm_head_weight
            if configs[1]["model"]["use_bias_in_output"]:
                lm_head_bias = torch.zeros(vocab_size)
                new_state_dict["lm_head.bias"] = lm_head_bias
        super().post_model_convert(
            old_state_dict,
            new_state_dict,
            configs,
            from_index,
            drop_unmatched_keys,
        )

    @staticmethod
    def formats() -> Tuple[FormatVersions, FormatVersions]:
        """Return the (HF, CS 1.7) format pair handled by this converter."""
        return (FormatVersions("hf"), FormatVersions("cs-1.7"))

    @classmethod
    def converter_note(cls) -> str:
        """Return a human-readable note describing this conversion."""
        return (
            "{} OPTModel <-> {} GPT2LMHeadModel (configured as OPT)\n"
            "The HF model doesn't contain a language model head while the CS "
            "one does. When converting to CS, the exported checkpoint will "
            "contain a language model head initialized to default random "
            "values. When converting to HF, the language model head will be "
            "dropped."
        ).format(cls.formats()[0], cls.formats()[1])

    @staticmethod
    def get_config_converter_class() -> BaseConfigConverter:
        """Return the config converter class paired with this converter."""
        return ConfigConverter_OPTModel_HF_CS17


class Converter_OPT_Headless_HF_CS18(Converter_OPT_Headless_HF_CS17):
    """Headless OPT converter between HF and CS 1.8 / 1.9 checkpoints.

    Reuses the CS 1.7 rules, trying them first on unprefixed keys and then
    on keys that carry a ``model.`` prefix on the CS side.
    """

    def __init__(self):
        super().__init__()
        # Order matters: the unprefixed rule is listed before the prefixed one.
        self.rules = [
            # Catch checkpoints from Pytorch 2.0 API
            ConversionRule([Converter_OPT_Headless_HF_CS17()], action=None),
            # Catch checkpoints from 1.7/1.8
            ConversionRule(
                [
                    EquivalentSubkey("", "model."),
                    Converter_OPT_Headless_HF_CS17(),
                ],
                action=None,
            ),
        ]

    @staticmethod
    def formats() -> Tuple[FormatVersions, FormatVersions]:
        """Return the (HF, CS 1.8/1.9) format pair handled by this converter."""
        return (FormatVersions("hf"), FormatVersions("cs-1.8", "cs-1.9"))

    @classmethod
    def converter_note(cls) -> str:
        """Return a human-readable note describing this conversion."""
        return (
            "{} OPTModel <-> {} GPT2LMHeadModel (configured as OPT)\n"
            "The HF model doesn't contain a language model head while the CS "
            "one does. When converting to CS, the exported checkpoint will "
            "contain a language model head initialized to default random "
            "values. When converting to HF, the language model head will be "
            "dropped."
        ).format(cls.formats()[0], cls.formats()[1])

    @staticmethod
    def get_config_converter_class() -> BaseConfigConverter:
        """Return the config converter class paired with this converter."""
        return ConfigConverter_OPTModel_HF_CS18


class Converter_OPT_LMHeadModel_HF_CS17(BaseCheckpointConverter_HF_CS):
    """Convert between HF ``OPTForCausalLM`` and the CS 1.7 OPT model.

    ``lm_head`` keys are renamed directly; every other key is handled by the
    headless converter, with the HF-side ``model.`` prefix mapped to no
    prefix on the CS side.
    """

    def __init__(self):
        super().__init__()
        self.rules = [
            # Raw string: "\." in a normal literal is an invalid escape.
            ConversionRule(
                [r"lm_head\.(?:weight|bias)"], action=self.replaceKey
            ),
            ConversionRule(
                [
                    EquivalentSubkey("model.", ""),
                    Converter_OPT_Headless_HF_CS17(),
                ],
                action=None,
            ),
        ]

    def pre_model_convert(
        self,
        old_state_dict,
        new_state_dict,
        configs,
        from_index,
        drop_unmatched_keys,
    ):
        """Fill in a tied word-embedding weight before converting from CS.

        Args:
            old_state_dict: Source state dict; may be mutated in place.
            new_state_dict: Destination state dict (not touched here).
            configs: Pair of configs; ``configs[1]["model"]`` is read.
            from_index: 0 when converting from HF, 1 when converting from CS.
            drop_unmatched_keys: Not used by this hook.
        """
        # Manually tie weights
        if from_index == 1 and configs[1]["model"]["share_embedding_weights"]:
            # The `0` default makes a missing key fail the `is None` test, so
            # only a key that is present with a None value gets filled in.
            if (
                old_state_dict.get("embedding_layer.word_embeddings.weight", 0)
                is None
            ):
                old_state_dict[
                    "embedding_layer.word_embeddings.weight"
                ] = old_state_dict["lm_head.weight"]

    @staticmethod
    def formats() -> Tuple[FormatVersions, FormatVersions]:
        """Return the (HF, CS 1.7) format pair handled by this converter."""
        return (FormatVersions("hf"), FormatVersions("cs-1.7"))

    @classmethod
    def converter_note(cls) -> str:
        """Return a human-readable note describing this conversion."""
        return (
            "{} OPTForCausalLM <-> {} GPT2LMHeadModel (configured as OPT)\n"
        ).format(cls.formats()[0], cls.formats()[1])

    @staticmethod
    def get_config_converter_class() -> BaseConfigConverter:
        """Return the config converter class paired with this converter."""
        return ConfigConverter_OPTModel_HF_CS17


class Converter_OPT_LMHeadModel_HF_CS18(BaseCheckpointConverter_HF_CS):
    """Convert between HF ``OPTForCausalLM`` and CS 1.8 / 1.9 OPT checkpoints.

    Delegates to the CS 1.7 LM-head converter, first on unprefixed keys and
    then on keys that carry a ``model.`` prefix on the CS side.
    """

    def __init__(self):
        super().__init__()
        # Order matters: the unprefixed rule is listed before the prefixed one.
        self.rules = [
            # Catch checkpoints from Pytorch 2.0 API
            ConversionRule(
                [Converter_OPT_LMHeadModel_HF_CS17()], action=None
            ),
            # Catch checkpoints from 1.7/1.8
            ConversionRule(
                [
                    EquivalentSubkey("", "model."),
                    Converter_OPT_LMHeadModel_HF_CS17(),
                ],
                action=None,
            ),
        ]

    @staticmethod
    def formats() -> Tuple[FormatVersions, FormatVersions]:
        """Return the (HF, CS 1.8/1.9) format pair handled by this converter."""
        return (FormatVersions("hf"), FormatVersions("cs-1.8", "cs-1.9"))

    @classmethod
    def converter_note(cls) -> str:
        """Return a human-readable note describing this conversion."""
        return (
            "{} OPTForCausalLM <-> {} GPT2LMHeadModel (configured as OPT)\n"
        ).format(cls.formats()[0], cls.formats()[1])

    @staticmethod
    def get_config_converter_class() -> BaseConfigConverter:
        """Return the config converter class paired with this converter."""
        return ConfigConverter_OPTModel_HF_CS18


class ConfigConverter_OPTModel_HF_CS17(BaseConfigConverter_HF_CS):
    """Convert OPT model configs between HF and CS 1.7.

    Rules either rename a key (``replaceKey``), assert that a key holds a
    fixed value (``assert_factory_fn``), or run one of the custom actions
    defined on this class. Defaults applied before and after conversion are
    registered for each side in ``__init__``.
    """

    def __init__(self):
        super().__init__()
        # Short alias. The first argument is presumably the index of the
        # format the assertion applies to (0 = HF, 1 = CS) -- TODO confirm
        # against BaseConfigConverter.assert_factory_fn.
        expect = BaseConfigConverter.assert_factory_fn

        self.rules = [
            ConversionRule(["model_type"], action=expect(0, "opt")),
            # Embedding
            ConversionRule(["vocab_size"], action=self.replaceKey),
            ConversionRule(
                ["position_embedding_type"],
                exists="right",
                action=expect(1, "learned"),
            ),
            ConversionRule(
                ["use_position_embedding"],
                exists="right",
                action=expect(1, True),
            ),
            ConversionRule(
                ["position_embedding_offset"],
                exists="right",
                action=expect(1, 2),
            ),
            ConversionRule(
                [EquivalentSubkey("embd_pdrop", "embedding_dropout_rate")],
                action=self.replaceKey,
            ),
            ConversionRule(
                [
                    EquivalentSubkey(
                        "tie_word_embeddings", "share_embedding_weights"
                    )
                ],
                action=self.replaceKey,
            ),
            # Decoder Block
            ConversionRule(["hidden_size"], action=self.replaceKey),
            ConversionRule(
                [EquivalentSubkey("num_attention_heads", "num_heads")],
                action=self.replaceKey,
            ),
            ConversionRule(
                [EquivalentSubkey("num_hidden_layers", "num_hidden_layers")],
                action=self.replaceKey,
            ),
            ConversionRule(
                [
                    EquivalentSubkey(
                        "max_position_embeddings", "max_position_embeddings"
                    )
                ],
                action=self.replaceKey,
            ),
            ConversionRule(
                ["attention_type"], action=expect(1, "scaled_dot_product"),
            ),
            ConversionRule(
                [
                    EquivalentSubkey(
                        "enable_bias", "use_projection_bias_in_attention"
                    )
                ],
                action=self.convert_bias,
            ),
            ConversionRule(
                [EquivalentSubkey("enable_bias", "use_ffn_bias_in_attention")],
                action=self.convert_bias,
            ),
            ConversionRule(
                [EquivalentSubkey("enable_bias", "use_ffn_bias")],
                action=self.convert_bias,
            ),
            ConversionRule(
                [EquivalentSubkey("ffn_dim", "filter_size")],
                action=self.replaceKey,
            ),
            ConversionRule(
                [EquivalentSubkey("activation_function", "nonlinearity")],
                action=self.replaceKey,
            ),
            ConversionRule(
                [
                    EquivalentSubkey(
                        "attention_dropout", "attention_dropout_rate"
                    )
                ],
                action=self.replaceKey,
            ),
            ConversionRule(
                [EquivalentSubkey("dropout", "dropout_rate")],
                action=self.replaceKey,
            ),
            ConversionRule(["use_bias_in_output"], action=expect(1, False)),
            ConversionRule(
                [EquivalentSubkey("init_std", "initializer_range")],
                action=self.replaceKey,
            ),
            ConversionRule(["embedding_layer_norm"], action=expect(1, False)),
            ConversionRule(["fixed_sparse_attention"], action=expect(1, None)),
            ConversionRule(
                ["do_layer_norm_before"],
                # False isn't supported since HF removes final layer norm
                action=expect(0, True),
            ),
            ConversionRule(["norm_first"], action=expect(1, True)),
            ConversionRule(["use_ff_layer1_dropout"], action=expect(1, False)),
            ConversionRule(
                ["scale_attn_by_inverse_layer_idx"], action=expect(0, False),
            ),
            ConversionRule(
                ["reorder_and_upcast_attn"], action=expect(0, False),
            ),
            ConversionRule(["layer_norm_epsilon"], action=self.replaceKey),
            ConversionRule(
                ["word_embed_proj_dim"],
                exists="left",
                action=self.assert_word_embed_proj_dim,
            ),
            ConversionRule(["layerdrop"], action=expect(0, 0.0)),
            ConversionRule(
                ["layer_norm_elementwise_affine"], action=expect(0, True),
            ),
            ConversionRule(
                ["_remove_final_layer_norm"], action=expect(0, False),
            ),
            ConversionRule(
                ["attention_module"], action=expect(1, "aiayn_attention"),
            ),
            ConversionRule(["use_rms_norm"], action=expect(1, False)),
        ]

        # Defaults filled in on the HF side before conversion.
        self.pre_convert_defaults[0].update(
            {
                "vocab_size": 50272,
                "hidden_size": 768,
                "num_hidden_layers": 12,
                "ffn_dim": 3072,
                "num_attention_heads": 12,
                "activation_function": "relu",
                "max_position_embeddings": 2048,
                "do_layer_norm_before": True,
                "dropout": 0.1,
                "attention_dropout": 0.0,
                "init_std": 0.02,
                "layer_norm_epsilon": 1e-5,
                "tie_word_embeddings": True,
                "enable_bias": True,
            }
        )
        # Defaults filled in on the CS side before conversion.
        self.pre_convert_defaults[1].update(
            {
                "max_position_embeddings": 1024,
                "position_embedding_offset": 0,
                "share_embedding_weights": True,
                "dropout_rate": 0.1,
                "nonlinearity": "gelu",
                "layer_norm_epsilon": 1.0e-5,
                "use_ffn_bias": True,
                "use_projection_bias_in_attention": True,
                "use_ffn_bias_in_attention": True,
                "initializer_range": 0.02,
                "norm_first": True,
            },
        )

        # Values added to the converted config for each side.
        self.post_convert_defaults[0].update({"model_type": "opt"})
        self.post_convert_defaults[1].update(
            {
                "use_bias_in_output": False,
                "attention_type": "scaled_dot_product",
                "position_embedding_offset": 2,
            }
        )

    def convert_attention_type(
        self,
        old_key,
        new_key,
        old_state_dict,
        new_state_dict,
        from_index,
        action_fn_args,
    ):
        """Translate between a boolean "scaled" flag and a CS attention type name.

        No rule registered by this class references this action; it is kept
        for callers outside this block.

        Args:
            old_key: Key to read from ``old_state_dict``.
            new_key: Key to write in ``new_state_dict``.
            old_state_dict: Source config dict.
            new_state_dict: Destination config dict (mutated in place).
            from_index: 0 maps a truthy/falsy value to
                ``"scaled_dot_product"`` / ``"dot_product"``; otherwise the
                name is mapped back to a boolean.
            action_fn_args: Unused by this action.
        """
        if from_index == 0:
            new_state_dict[new_key] = (
                "scaled_dot_product"
                if old_state_dict[old_key]
                else "dot_product"
            )
        else:
            assert (
                old_state_dict[old_key] == "scaled_dot_product"
                or old_state_dict[old_key] == "dot_product"
            )
            new_state_dict[new_key] = old_state_dict[old_key].startswith(
                "scaled_"
            )

    def assert_word_embed_proj_dim(
        self,
        old_key,
        new_key,
        old_state_dict,
        new_state_dict,
        from_index,
        action_fn_args,
    ):
        """Reject configs whose ``word_embed_proj_dim`` differs from ``hidden_size``.

        Args:
            old_key: Key holding the projection dimension in ``old_state_dict``.
            new_key: Unused; nothing is written to the new config.
            old_state_dict: Source config dict; ``hidden_size`` is also read.
            new_state_dict: Unused.
            from_index: Unused.
            action_fn_args: Unused.

        Raises:
            ConfigConversionError: If the two dimensions are not equal.
        """
        if old_state_dict[old_key] != old_state_dict["hidden_size"]:
            raise ConfigConversionError(
                "CS only supports word_embed_proj_dim = hidden_size"
            )

    def convert_bias(
        self,
        old_key,
        new_key,
        old_state_dict,
        new_state_dict,
        from_index,
        action_fn_args,
    ):
        """Map HF ``enable_bias`` to the three CS bias flags and back.

        Args:
            old_key: Key to read from ``old_state_dict``.
            new_key: Key to write when converting to HF.
            old_state_dict: Source config dict.
            new_state_dict: Destination config dict (mutated in place).
            from_index: 0 when converting from HF, 1 when converting from CS.
            action_fn_args: Unused by this action.

        Raises:
            ConfigConversionError: When converting to HF and a CS bias flag
                disagrees with the value already written to ``new_key``.
        """
        if from_index == 0:
            # enable_bias in HF controls all three of the following:
            bias_enabled = old_state_dict[old_key]
            new_state_dict["use_projection_bias_in_attention"] = bias_enabled
            new_state_dict["use_ffn_bias_in_attention"] = bias_enabled
            new_state_dict["use_ffn_bias"] = bias_enabled
            return

        if (
            new_key in new_state_dict
            and new_state_dict[new_key] != old_state_dict[old_key]
        ):
            # We have already set 'enable_bias' and see a param that conflicts
            # with this setting. Adjacent literals are used so no source
            # indentation ends up inside the message.
            raise ConfigConversionError(
                "The following params must all be set the same when "
                "converting to HF: use_projection_bias_in_attention, "
                "use_ffn_bias_in_attention, use_ffn_bias"
            )
        new_state_dict[new_key] = old_state_dict[old_key]

    def pre_config_convert(
        self, config, from_index,
    ):
        """Run the base pre-conversion step, then default ``ffn_dim`` for HF configs.

        Args:
            config: Config dict to prepare.
            from_index: 0 when converting from HF, 1 when converting from CS.

        Returns:
            The prepared config; for HF input a missing or ``None``
            ``ffn_dim`` is set to ``4 * hidden_size``.
        """
        config = super().pre_config_convert(config, from_index)

        if from_index == 0:
            if "ffn_dim" not in config or config["ffn_dim"] is None:
                config["ffn_dim"] = 4 * config["hidden_size"]

        return config

    @staticmethod
    def formats() -> Tuple[FormatVersions, FormatVersions]:
        """Return the (HF, CS 1.7) format pair handled by this converter."""
        return (FormatVersions("hf"), FormatVersions("cs-1.7"))


class ConfigConverter_OPTModel_HF_CS18(ConfigConverter_OPTModel_HF_CS17):
    """OPT config converter between HF and CS 1.8 / 1.9.

    Inherits every rule and default from the CS 1.7 converter, including its
    constructor; only the advertised format pair differs.
    """

    @staticmethod
    def formats() -> Tuple[FormatVersions, FormatVersions]:
        """Return the (HF, CS 1.8/1.9) format pair handled by this converter."""
        return (FormatVersions("hf"), FormatVersions("cs-1.8", "cs-1.9"))


class Converter_OPT_LMHeadModel_CS18_CS20(BaseCheckpointConverter_CS_CS):
    """Checkpoint converter between CS 1.8 / 1.9 and CS 2.0 for OPT.

    A single catch-all rule passes every key through ``replaceKey``.
    """

    def __init__(self):
        super().__init__()
        # Model didn't change between 1.8/1.9 and 2.0. Copy all keys.
        self.rules = [
            ConversionRule([r".*"], action=self.replaceKey),
        ]

    @classmethod
    def converter_note(cls) -> str:
        """Return a human-readable note describing this conversion."""
        return "GPT2LMHeadModel class"

    @staticmethod
    def formats() -> Tuple[FormatVersions, FormatVersions]:
        """Return the (CS 1.8/1.9, CS 2.0) format pair handled by this converter."""
        return (FormatVersions("cs-1.8", "cs-1.9"), FormatVersions("cs-2.0"))

    @staticmethod
    def get_config_converter_class() -> BaseConfigConverter:
        """Return the config converter class paired with this converter."""
        return ConfigConverter_OPTModel_CS18_CS20


class ConfigConverter_OPTModel_CS18_CS20(BaseConfigConverter_CS_CS):
    """OPT config converter between CS 1.8 / 1.9 and CS 2.0.

    ``use_rms_norm`` and ``norm_type`` are translated by a dedicated action;
    every other key goes through the catch-all ``replaceKey`` rule.
    """

    def __init__(self):
        super().__init__()
        # Only difference between 1.8/1.9 and 2.0 is introduction of norm_type
        # The specific rule is listed before the catch-all one.
        self.rules = [
            ConversionRule(
                [EquivalentSubkey("use_rms_norm", "norm_type")],
                action=self.convert_use_rms_layer_norm,
            ),
            ConversionRule([r".*"], action=self.replaceKey),
        ]

        # Defaults applied before conversion when the key is absent.
        self.pre_convert_defaults[0]["use_rms_norm"] = False
        self.pre_convert_defaults[1]["norm_type"] = "layernorm"

    def convert_use_rms_layer_norm(self, *args):
        """Forward the rule action arguments to the shared RMS-norm helper."""
        convert_use_rms_layer_norm_helper(self, *args)

    @staticmethod
    def formats() -> Tuple[FormatVersions, FormatVersions]:
        """Return the (CS 1.8/1.9, CS 2.0) format pair handled by this converter."""
        return (FormatVersions("cs-1.8", "cs-1.9"), FormatVersions("cs-2.0"))


class Converter_OPT_Headless_HF_CS20(Converter_OPT_Headless_HF_CS18):
    """Headless OPT converter between HF and CS 2.0 checkpoints.

    Inherits the rules and constructor of the CS 1.8 converter; only the
    format pair and the paired config converter differ.
    """

    @staticmethod
    def formats() -> Tuple[FormatVersions, FormatVersions]:
        """Return the (HF, CS 2.0) format pair handled by this converter."""
        return (FormatVersions("hf"), FormatVersions("cs-2.0"))

    @staticmethod
    def get_config_converter_class() -> BaseConfigConverter:
        """Return the config converter class paired with this converter."""
        return ConfigConverter_OPTModel_HF_CS20


class Converter_OPT_LMHeadModel_HF_CS20(Converter_OPT_LMHeadModel_HF_CS18):
    """OPT LM-head converter between HF and CS 2.0 checkpoints.

    Inherits the rules and constructor of the CS 1.8 converter; only the
    format pair and the paired config converter differ.
    """

    @staticmethod
    def formats() -> Tuple[FormatVersions, FormatVersions]:
        """Return the (HF, CS 2.0) format pair handled by this converter."""
        return (FormatVersions("hf"), FormatVersions("cs-2.0"))

    @staticmethod
    def get_config_converter_class() -> BaseConfigConverter:
        """Return the config converter class paired with this converter."""
        return ConfigConverter_OPTModel_HF_CS20


class ConfigConverter_OPTModel_HF_CS20(ConfigConverter_OPTModel_HF_CS18):
    """OPT config converter between HF and CS 2.0.

    Adds a ``norm_type`` assertion in front of the inherited rules.
    """

    def __init__(self):
        super().__init__()
        # The new rule is placed ahead of every inherited rule.
        norm_type_rule = ConversionRule(
            ["norm_type"],
            action=BaseConfigConverter.assert_factory_fn(1, "layernorm"),
        )
        self.rules = [norm_type_rule, *self.rules]

    @staticmethod
    def formats() -> Tuple[FormatVersions, FormatVersions]:
        """Return the (HF, CS 2.0) format pair handled by this converter."""
        return (FormatVersions("hf"), FormatVersions("cs-2.0"))