# Copyright 2022 Cerebras Systems.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""GPT Inference script built using the cstorch API"""
import argparse
import inspect
import logging
import os
import sys
from copy import deepcopy
from pathlib import Path
from warnings import warn

import numpy as np
import torch

# isort: off
sys.path.append(os.path.join(os.path.dirname(__file__), "../../.."))
# isort: on
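
# The path manipulation above makes the top-level `modelzoo` package
# importable when this script is run directly from a Model Zoo checkout.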


def get_parser():
    parser = argparse.ArgumentParser(
        "Script for running inference for GPT style models", add_help=False
    )
    parser.add_argument(
        "--inference_steps",
        type=int,
        default=None,
        help="Specifies the number of steps to run for inference.",
    )
    return parser
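
# The parser above is handed to the shared Model Zoo CLI through
# `extra_args_parser_fn` in main(), so `--inference_steps` shows up alongside
# the standard run arguments and is validated with the rest of the runconfig.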


def get_cluster_config(params):
    from modelzoo.common.pytorch.run_cstorch_flow import (
        get_cluster_config as _get_cluster_config,
    )

    cs_config = _get_cluster_config(params)

    if cs_config.num_csx is not None and cs_config.num_csx != 1:
        raise ValueError(
            "Multi-box inference is not yet supported. "
            "Please specify num_csx=1 in the runconfig"
        )

    if cs_config.max_act_per_csx is not None and cs_config.max_act_per_csx > 1:
        warn("max_act_per_csx is forced to 1 for inference")

    cs_config.max_act_per_csx = 1

    if (
        cs_config.num_workers_per_csx is not None
        and cs_config.num_workers_per_csx > 1
    ):
        warn("num_workers_per_csx is forced to 1 for inference")

    cs_config.num_workers_per_csx = 1

    return cs_config


def main():
    from modelzoo.common.pytorch.utils import (
        RunConfigParamsValidator,
        setup_artifact_dir,
        setup_logging,
    )
    from modelzoo.common.run_utils.cli_parser import get_params_from_args
    from modelzoo.common.run_utils.utils import DeviceType

    # Parse args
    parent = inspect.getouterframes(inspect.currentframe())[1]
    run_dir = os.path.dirname(os.path.abspath(parent.filename))

    parser_fn = lambda: [get_parser()]
    parser_args = {
        "parser_epilog": (
            "Please run 'python run_gpt_inference.py CSX -h'. \n \n"
            "Here is an example command for running on CSX: \n \n"
            " python run_gpt_inference.py CSX --params /path/to/params --checkpoint_path "
            "/path/to/checkpoint \n \n"
            "Note that inference is currently only supported for device CSX"
        ),
        "csx_parser_epilog": (
            "To see a complete list of all available arguments, \n"
            "please run 'python run_gpt_inference.py CSX -h'. \n\n"
            "Here is an example command for running with CSX: \n \n"
            " python run_gpt_inference.py CSX --params /path/to/params --checkpoint_path "
            "/path/to/checkpoint \n \n"
            "Inference flow resides in the Cerebras Model Zoo. Please specify --python_paths and \n"
            "--mount_dirs here or in your params.yaml under the 'runconfig' section with \n"
            "the path to the directory in which the Cerebras Model Zoo resides. \n"
        ),
        "modes": ["inference"],
    }

    params = get_params_from_args(
        run_dir,
        argv=sys.argv[1:],
        extra_args_parser_fn=parser_fn,
        device_type=DeviceType.CSX,
        **parser_args,
    )

    # Set default model parameters
    from modelzoo.transformers.pytorch.gpt2.utils import set_defaults

    set_defaults(params)
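
    # set_defaults fills in any GPT-2 model/runconfig fields the params YAML
    # leaves unset with the defaults used elsewhere in the Model Zoo.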

    # Validate runconfig
    runconfig = params["runconfig"]
    RunConfigParamsValidator(parser_fn).validate(runconfig)

    log_steps = runconfig.get("log_steps")

    # Set up logging level and env vars
    artifact_dir = Path(
        setup_artifact_dir(runconfig["model_dir"], mode="inference")
    )
    setup_logging(
        runconfig.get("logging"),
        runconfig.get("streamer_logging"),
        logging_dir=artifact_dir,
    )

    # Initialize the backend
    cs_config = get_cluster_config(params)

    from torch.utils._pytree import tree_map

    import cerebras_pytorch as cstorch
    from modelzoo.common.pytorch.half_dtype import set_half_dtype_from_params
    from modelzoo.common.pytorch.run_cstorch_flow import get_model_checkpoint
    from modelzoo.transformers.pytorch.gpt2.model import Gpt2Model

    compile_only = runconfig.get("compile_only", False)
    validate_only = runconfig.get("validate_only", False)

    backend = cstorch.backend(
        "CSX",
        artifact_dir=artifact_dir,
        compile_dir=runconfig.get("compile_dir"),
        compile_only=compile_only,
        validate_only=validate_only,
        use_cs_grad_accum=False,  # grad acc not supported in inference
    )
    backend.device.config.lazy_initialization = runconfig.get(
        "lazy_initialization", True
    )
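
    # Flags above: `compile_only` compiles the model without running it,
    # `validate_only` stops after the initial validation/trace pass, and
    # `lazy_initialization` (default True) defers weight initialization on the
    # CSX device. (Brief descriptions inferred from the runconfig names; see
    # the cstorch documentation for authoritative definitions.)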

    # Set the 16 bit dtype we want the automatic mixed precision module to use
    set_half_dtype_from_params(params["model"])
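
    # For example, the params YAML typically selects the dtype with something
    # like the following (hypothetical snippet):
    #   model:
    #     fp16_type: "bfloat16"   # or "float16"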

    class GptInferenceModel(Gpt2Model):
        def __init__(self, params):
            params = deepcopy(params)
            if "start_token" not in params["model"]:
                raise KeyError(
                    "Inference requires a start token. "
                    "Please provide `start_token` in the model params."
                )
            if "end_token" not in params["model"]:
                raise KeyError(
                    "Inference requires an end token. "
                    "Please provide `end_token` in the model params."
                )

            self.loop_dim = params["model"].pop("loop_dim", 1)
            self.start_token = params["model"].pop("start_token")
            self.end_token = params["model"].pop("end_token")
            self.max_tokens = params["model"].pop("max_tokens", None)
            super().__init__(params)

        def forward(self, data):
            """The forward pass on the input data. This method
            returns the predictions of the network as tokens.
            """
            if "input_ids" not in data:
                raise KeyError(
                    "GPT-2 model expects these data fields: input_ids"
                )
            elif data["input_ids"].dtype != torch.int32:
                raise TypeError(
                    "The dtype for all inputs should be torch.int32"
                )

            input_ids = data["input_ids"]
            # Note: attention_mask is a misnomer in this model; its contents are
            # ignored and only its shape is used.
            lm_logits = self.model(
                input_ids=input_ids,
                attention_mask=input_ids,  # doesn't actually mask anything
            )
            predictions = torch.argmax(lm_logits, dim=-1).int()
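
            # run_implicit_autoregressive_loop marks a generation loop on the
            # device: starting from the first `start_token` found along
            # `loop_dim`, the argmax predictions are fed back into `input_ids`
            # position by position until `end_token` is produced or
            # `max_tokens` tokens have been generated (behavior summarized
            # from the argument names; see the cstorch.experimental docs).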
            cstorch.experimental.run_implicit_autoregressive_loop(
                input_tensor=input_ids,
                output_tensor=predictions,
                loop_dim=self.loop_dim,
                start_token=self.start_token,
                stop_token=self.end_token,
                max_tokens=self.max_tokens,
            )
            return predictions

    # Initialize model
    with backend.device:
        model = GptInferenceModel(params)

    compiled_model = cstorch.compile(model, backend)
    compiled_model.eval()
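
    # cstorch.compile wraps the model so it is traced and compiled for the CSX
    # backend when execution starts; constructing the model under
    # `backend.device` lets its weights be created through the Cerebras device
    # rather than materialized eagerly on the host.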

    # Load weights
    checkpoint_path = get_model_checkpoint(runconfig)
    if checkpoint_path:
        logging.info(f"Loading weights from checkpoint {checkpoint_path}")
        state_dict = cstorch.load(checkpoint_path)
        model.load_state_dict(state_dict["model"], strict=True)
    else:
        raise RuntimeError(
            "Expected a checkpoint to load for inference but got none."
        )

    predictions_dir = artifact_dir / "predictions"
    predictions_dir.mkdir(parents=True, exist_ok=True)
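
    # NOTE: `inference_input_dataloader` (the input-pipeline callable for this
    # model) must be defined or imported elsewhere; cstorch calls it with
    # `params` on each worker to construct the actual dataloader.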
    dataloader = cstorch.utils.data.DataLoader(
        inference_input_dataloader, params
    )

    writer = cstorch.utils.tensorboard.SummaryWriter(
        log_dir=runconfig.get("summary_dir", artifact_dir / "inference")
    )

    executor = cstorch.utils.data.DataExecutor(
        dataloader,
        num_steps=runconfig.get("inference_steps"),
        cs_config=cs_config,
        writer=writer,
    )

    global_step = 0
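
    # Functions decorated with @cstorch.trace are captured as part of the
    # compiled graph and executed on the Cerebras system, not eagerly on the
    # host.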
    @cstorch.trace
    def inference_step(batch):
        return compiled_model(batch)
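
    # Step closures run on the host once the step's outputs have been streamed
    # back from the system, so `predictions` arrives here as concrete tensors
    # that can be converted to numpy and written to disk.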
    @cstorch.step_closure
    def post_inference_step(predictions, step):
        is_log_step = executor.on_final_iteration or (
            log_steps and global_step % log_steps == 0
        )

        if is_log_step:
            rate = executor.profiler.rate_tracker.rate
            global_rate = executor.profiler.rate_tracker.global_rate

            logging.info(
                f"| Inference Device={backend.device}, "
                f"Step={step}, "
                f"Rate={rate:.2f} samples/sec, "
                f"GlobalRate={global_rate:.2f} samples/sec"
            )

        # Save the predictions to a file
        np.savez(
            predictions_dir / f"prediction_{global_step}.npz",
            predictions=tree_map(cstorch.to_numpy, predictions),
            global_step=global_step,
        )
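
    # Iterating the executor streams batches through the system; the
    # try/finally ensures throughput stats are logged even if the run stops
    # early.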
    try:
        for step, batch in enumerate(executor, start=1):
            predictions = inference_step(batch)

            global_step += 1

            post_inference_step(predictions, step)
    finally:
        if not (compile_only or validate_only) and executor.profiler:
            # Log the total samples and wall-clock time reported by the
            # profiler's rate tracker
            total_samples = int(executor.profiler.rate_tracker.total_samples)
            total_time = executor.profiler.rate_tracker.total_time
            logging.info(
                f"Processed {total_samples} sample(s) "
                f"in {total_time} seconds."
            )


if __name__ == '__main__':
    main()