Source code for cerebras.modelzoo.data_preparation.nlp.gpt2.data_processor_utils

# Copyright 2022 Cerebras Systems.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import random

import numpy as np

from cerebras.modelzoo.data_preparation.nlp.tokenizers.BPETokenizer import (
    BPETokenizer,
)


def training_data_generator(
    input_files,
    vocab_file,
    encoder_file,
    max_sequence_length,
    buffer_size=1e6,
    overlap_size=None,
    short_seq_prob=0,
    inverted_mask=False,
    add_special_tokens=True,
    eos_token="<|endoftext|>",
    pad_token="<|endoftext|>",
    input_ids_dtype="int32",
    input_mask_dtype="int32",
    labels_dtype="int32",
):
    """
    Generator function used to create the input dataset for GPT2Model.

    :param list[str] input_files: List of input files.
    :param str vocab_file: Vocabulary file, to build tokenization from.
    :param str encoder_file: Encoder file, mapping word-pieces to
        token IDs for tokenization.
    :param int max_sequence_length: Maximum length of the sequence to generate.
    :param int buffer_size: Read buffer size, in number of token IDs.
        Defaults to 1e6.
    :param int overlap_size: Size of the overlap when forming sequences from
        buffered token IDs in a sliding-window fashion. Defaults to None,
        which sets the overlap to max_sequence_length/4.
    :param float short_seq_prob: Probability of generating a short sequence.
        Defaults to 0. Sometimes we want to use shorter sequences to minimize
        the mismatch between pre-training and fine-tuning.
    :param bool inverted_mask: If set to False, the mask has 0's on padded
        positions and 1's elsewhere. Otherwise, the mask is "inverted", so
        that 1's are on padded positions and 0's elsewhere.
    :param bool add_special_tokens: If True, seed the buffer with the EOS
        token and append it to each document. Defaults to True.
    :param str eos_token: End of sequence token. Defaults to "<|endoftext|>".
    :param str pad_token: Pad token. Defaults to "<|endoftext|>".
    :param str input_ids_dtype: Type of input ids. Defaults to "int32".
    :param str input_mask_dtype: Type of mask. Defaults to "int32".
    :param str labels_dtype: Type of labels. Defaults to "int32".
    :returns: yields tuples (features, file_index), where features is a dict
        with "input_ids", "attention_mask", and "labels" arrays of length
        max_sequence_length.
    """
    assert (
        eos_token == "<|endoftext|>" and pad_token == "<|endoftext|>"
    ), "EOS and PAD tokens are given by '<|endoftext|>' for now."

    num_input_files = len(input_files)
    rng = random.Random()

    tokenizer = BPETokenizer(vocab_file, encoder_file)

    # id("<|endoftext|>") = 50256
    eos_id = 50256
    pad_id = 50256

    def _generate_train_example(token_ids):
        return _create_features_labels(
            token_ids,
            max_sequence_length,
            short_seq_prob,
            inverted_mask,
            pad_id,
            input_ids_dtype,
            input_mask_dtype,
            labels_dtype,
            rng,
        )

    if overlap_size is None:
        overlap_size = int(max_sequence_length / 4)
    assert overlap_size >= 0, "overlap_size must be non-negative."

    buffer_token_ids = [eos_id] if add_special_tokens else []

    for _file_num, _file in enumerate(input_files):
        with open(_file, 'r') as _fin:
            document_text = _fin.read()
            token_ids = tokenizer.encode(document_text)

            # skip empty documents
            if len(token_ids) == 0:
                continue

            if add_special_tokens:
                token_ids.append(eos_id)
            buffer_token_ids.extend(token_ids)

            # if buffer_size hasn't been reached
            # and the current document is not the last one,
            # keep adding elements to the buffer
            if (
                len(buffer_token_ids) < buffer_size
                and _file_num < num_input_files - 1
            ):
                continue

            # generate sequences from the buffer
            # in a sliding-window fashion
            start_idx = 0
            end_idx = max_sequence_length + 1
            while end_idx <= len(buffer_token_ids):
                # need n+1 tokens to generate
                # a (feature, label) pair of length n
                yield _generate_train_example(
                    buffer_token_ids[start_idx:end_idx]
                ), _file_num
                start_idx = end_idx - overlap_size - 1
                end_idx = start_idx + max_sequence_length + 1

            # generate the last example from the buffer
            if start_idx < len(buffer_token_ids) - 1:
                yield _generate_train_example(
                    buffer_token_ids[-max_sequence_length - 1 :],
                ), _file_num

            buffer_token_ids = [eos_id] if add_special_tokens else []
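
# Usage sketch for the generator above (illustrative only): the file names,
# vocab/encoder paths, and max_sequence_length below are assumptions, not
# assets shipped with this module; substitute your own text files and GPT-2
# BPE vocabulary/encoder files.
def _example_training_data_generator():
    gen = training_data_generator(
        input_files=["doc_0.txt", "doc_1.txt"],  # hypothetical text files
        vocab_file="gpt2-vocab.bpe",             # hypothetical BPE vocab file
        encoder_file="gpt2-encoder.json",        # hypothetical encoder file
        max_sequence_length=128,
    )
    for features, file_num in gen:
        # Each yielded features dict holds NumPy arrays of length
        # max_sequence_length; file_num is the index of the source file.
        print(file_num, features["input_ids"].shape)
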
def _create_features_labels(
    token_ids,
    max_sequence_length,
    short_seq_prob=0,
    inverted_mask=False,
    pad_id=0,
    input_ids_dtype="int32",
    input_mask_dtype="int32",
    labels_dtype="int32",
    rng=random.Random(),
):
    """
    Given a list of token_ids, generate input sequence and labels.
    """
    assert len(token_ids) >= 2, "token_ids must have at least 2 elements."

    if rng.random() < short_seq_prob:
        token_ids = token_ids[0 : rng.randint(2, max_sequence_length - 1)]

    input_ids = token_ids[:-1]
    labels = token_ids[1:]
    input_mask = [1] * len(input_ids)

    # padding
    num_pad = max_sequence_length - len(input_ids)
    padding = [pad_id] * num_pad

    input_ids.extend(padding)
    labels.extend(padding)
    input_mask.extend([0] * num_pad)

    # assertions to ensure correct output shapes
    assert (
        len(input_ids) == max_sequence_length
        and len(labels) == max_sequence_length
        and len(input_mask) == max_sequence_length
    ), "Wrong sequence length"

    # create feature dict
    features = dict()
    features["input_ids"] = getattr(np, input_ids_dtype)(input_ids)
    features["attention_mask"] = getattr(np, input_mask_dtype)(input_mask)

    if inverted_mask:
        features["attention_mask"] = np.equal(
            features["attention_mask"], 0
        ).astype(features["attention_mask"].dtype)

    labels = getattr(np, labels_dtype)(labels)
    features["labels"] = labels

    return features
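
# Worked example for _create_features_labels (illustrative only; pad_id=0 is
# chosen purely for readability). With token_ids=[10, 11, 12] and
# max_sequence_length=4, inputs and labels are shifted by one position and
# padded on the right:
#   input_ids      -> [10, 11,  0,  0]
#   labels         -> [11, 12,  0,  0]
#   attention_mask -> [ 1,  1,  0,  0]
def _example_create_features_labels():
    features = _create_features_labels(
        token_ids=[10, 11, 12],
        max_sequence_length=4,
        pad_id=0,
    )
    print(features["input_ids"])       # [10 11  0  0]
    print(features["attention_mask"])  # [1 1 0 0]
    print(features["labels"])          # [11 12  0  0]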