Source code for cerebras.modelzoo.data_preparation.nlp.bert.mlm_only_processor

# Copyright 2022 Cerebras Systems.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import random
from functools import reduce

import spacy
from tqdm import tqdm

from cerebras.modelzoo.data_preparation.nlp.tokenizers.Tokenization import (
    FullTokenizer,
)
from cerebras.modelzoo.data_preparation.utils import (
    convert_to_unicode,
    create_masked_lm_predictions,
    pad_instance_to_max_seq_length,
    text_to_tokenized_documents,
)


class MLMOnlyInstance:
    """
    A single training MLMOnly instance.

    :param list tokens: List of tokens for the MLM example.
    :param list masked_lm_positions: List of masked LM positions for the example.
    :param list masked_lm_labels: List of masked LM labels for the example.
    """

    def __init__(
        self,
        tokens,
        masked_lm_positions,
        masked_lm_labels,
    ):
        self.tokens = tokens
        self.masked_lm_labels = masked_lm_labels
        self.masked_lm_positions = masked_lm_positions

    def __str__(self):
        tokens = " ".join([convert_to_unicode(x) for x in self.tokens])
        mlm_positions = " ".join([str(x) for x in self.masked_lm_positions])
        mlm_labels = " ".join(
            [convert_to_unicode(x) for x in self.masked_lm_labels]
        )
        s = "MLMOnlyInstance: \n"
        s += f"tokens: {tokens}\n"
        s += f"masked_lm_positions: {mlm_positions}\n"
        s += f"masked_lm_labels: {mlm_labels}\n"
        s += "\n"
        return s

    def __repr__(self):
        return self.__str__()

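# Illustrative sketch of the parallel lists an MLMOnlyInstance holds (the token
# values below are assumed for the illustration, not taken from any dataset):
# positions 2 and 5 of ``tokens`` were masked, and ``masked_lm_labels`` stores
# the original tokens that the masks replaced.
#
#   tokens              = ["[CLS]", "the", "[MASK]", "sat", "on", "[MASK]", "mat", "[SEP]"]
#   masked_lm_positions = [2, 5]
#   masked_lm_labels    = ["cat", "the"]
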
def data_generator(
    metadata_files,
    vocab_file,
    do_lower,
    disable_masking,
    mask_whole_word,
    max_seq_length,
    max_predictions_per_seq,
    masked_lm_prob,
    dupe_factor,
    output_type_shapes,
    multiple_docs_in_single_file=False,
    multiple_docs_separator="\n",
    single_sentence_per_line=False,
    buffer_size=1e6,
    min_short_seq_length=None,
    overlap_size=None,
    short_seq_prob=0,
    spacy_model="en_core_web_sm",
    inverted_mask=False,
    allow_cross_document_examples=True,
    document_separator_token="[SEP]",
    seed=None,
    input_files_prefix="",
):
    """
    Generator function used to create the input dataset for an MLM-only model.

    1. Generate raw examples with tokens based on "overlap_size",
       "max_sequence_length", "allow_cross_document_examples" and
       "document_separator_token", using a sliding-window approach.
       The exact steps are detailed in the "_create_examples_from_document" function.
    2. Mask the raw examples based on "max_predictions_per_seq".
    3. Pad the masked example to "max_sequence_length" if it is shorter than the MSL.

    :param str or list[str] metadata_files: A string or a list of strings, each
        pointing to a metadata file. A metadata file contains file paths for
        flat-text cleaned documents, one file path per line.
    :param str vocab_file: Vocabulary file, to build tokenization from.
    :param bool do_lower: Boolean value indicating if words should be
        converted to lowercase or not.
    :param bool disable_masking: Whether masking should be disabled.
    :param bool mask_whole_word: If True, all subtokens corresponding to a word
        will be masked.
    :param int max_seq_length: Maximum length of the sequence to generate.
    :param int max_predictions_per_seq: Maximum number of masked tokens in a sequence.
    :param float masked_lm_prob: Proportion of tokens to be masked.
    :param int dupe_factor: Number of times to duplicate the dataset with
        different static masks.
    :param dict output_type_shapes: Dictionary indicating the shapes of
        different outputs.
    :param bool multiple_docs_in_single_file: True when a single text file
        contains multiple documents separated by <multiple_docs_separator>.
    :param str multiple_docs_separator: String which separates multiple
        documents in a single text file.
    :param single_sentence_per_line: True when the document is already split
        into sentences, with one sentence per line, and there is no need for
        further sentence segmentation of a document.
    :param int buffer_size: Number of tokens to be processed at a time.
    :param int min_short_seq_length: When short_seq_prob > 0, this number
        indicates the least number of tokens that each example should have,
        i.e. the num_tokens (excluding pad) would be in the range
        [min_short_seq_length, MSL].
    :param int overlap_size: Number of tokens that overlap with the previous
        example when processing the buffer with a sliding-window approach.
        If None, defaults to an overlap of max_seq_len/4.
    :param int short_seq_prob: Probability of a short sequence. Defaults to 0.
        Sometimes we want to use shorter sequences to minimize the mismatch
        between pre-training and fine-tuning.
    :param spacy_model: spaCy model to load, i.e. shortcut link, package name
        or path. Used to segment text into sentences.
    :param bool inverted_mask: If set to False, has 0's on padded positions and
        1's elsewhere. Otherwise, "inverts" the mask, so that 1's are on padded
        positions and 0's elsewhere.
    :param bool allow_cross_document_examples: If True, the sequences can
        contain tokens from the next document.
    :param str document_separator_token: String to separate tokens from one
        document and the next when sequences span documents.
    :param int seed: Random seed.
    :param str input_files_prefix: Prefix to be added to paths of the input files.

    :returns: yields training examples (feature, [])
    """
    ## Set defaults if values passed are None
    if overlap_size is None:
        overlap_size = int(max_seq_length / 4)
        print(
            f"--- Setting overlap_size to {overlap_size} since None value passed"
        )

    if not allow_cross_document_examples and document_separator_token:
        print(
            f"--- Since example cannot span documents "
            f"(allow_cross_document_examples: {allow_cross_document_examples}),"
            f" document_separator_token: {document_separator_token} will be ignored"
        )

    if min_short_seq_length is None:
        min_short_seq_length = 2 + overlap_size
    elif (min_short_seq_length < (2 + overlap_size)) or (
        min_short_seq_length > max_seq_length - 2
    ):
        raise ValueError(
            f"The min_short_seq_len param {min_short_seq_length} is invalid. \n"
            f"Allowed values are [{2 + overlap_size}, {max_seq_length - 2})"
        )

    # define tokenizer
    tokenizer = FullTokenizer(vocab_file, do_lower)
    vocab_words = tokenizer.get_vocab_words()
    if do_lower:
        document_separator_token = document_separator_token.lower()
    assert (
        document_separator_token in vocab_words
    ), f" document_separator_token: {document_separator_token} not present in vocab file"

    rng = random.Random(seed)

    # get all text files by reading metadata files
    if isinstance(metadata_files, str):
        metadata_files = [metadata_files]

    input_files = []
    for _file in metadata_files:
        with open(_file, "r") as _fin:
            input_files.extend(_fin.readlines())

    input_files = [x.strip() for x in input_files if x]
    num_input_files = len(input_files)

    rng.shuffle(input_files)

    def _generate_train_feature(example):
        if disable_masking:
            return example
        else:
            return create_masked_lm_features(
                example,
                vocab_words,
                max_seq_length,
                mask_whole_word,
                max_predictions_per_seq,
                masked_lm_prob,
                document_separator_token,
                rng,
                tokenizer,
                output_type_shapes,
                inverted_mask,
            )

    current_buffer_length = 0
    buffer_documents = []

    # to speed up processing, load the spacy module once here and
    # disable the unused pipeline stages
    nlp = spacy.load(spacy_model, disable=['tagger', 'ner'])

    for _ in tqdm(range(dupe_factor)):
        # Reset buffers
        prev_tokens = []
        for _file_num, _file in enumerate(input_files):
            _fin_path = os.path.abspath(os.path.join(input_files_prefix, _file))
            with open(_fin_path, "r") as _fin:
                _fin_data = _fin.read()

            processed_doc, num_tokens = text_to_tokenized_documents(
                _fin_data,
                tokenizer,
                multiple_docs_in_single_file,
                multiple_docs_separator,
                single_sentence_per_line,
                nlp,
            )
            # Flatten one level
            buffer_documents.extend(
                [
                    reduce(lambda x, y: x + y, doc_list)
                    for doc_list in processed_doc
                ]
            )
            current_buffer_length += num_tokens

            # Continue if we don't have enough tokens
            if (
                current_buffer_length < buffer_size
                and _file_num < num_input_files - 1
            ):
                continue

            rng.shuffle(buffer_documents)

            # When enough tokens are available, yield examples
            for document_index, document in enumerate(buffer_documents):
                _example_generator = _create_examples_from_document(
                    document,
                    allow_cross_document_examples,
                    document_separator_token,
                    overlap_size,
                    prev_tokens,
                    max_seq_length,
                    short_seq_prob,
                    min_short_seq_length,
                    rng,
                )
                for example, prev_tokens in _example_generator:
                    if example:
                        yield _generate_train_feature(example)

            # Reset buffer lengths, buffer etc
            buffer_documents = []
            current_buffer_length = 0

        # Last few tokens remaining after processing all input_files
        if prev_tokens:
            yield _generate_train_feature(["[CLS]"] + prev_tokens + ["[SEP]"])

def create_masked_lm_features(
    example,
    vocab_words,
    max_seq_length,
    mask_whole_word,
    max_predictions_per_seq,
    masked_lm_prob,
    document_separator_token,
    rng,
    tokenizer,
    output_type_shapes,
    inverted_mask,
):
    exclude_from_masking = list(
        set(["[CLS]", "[SEP]", document_separator_token])
    )
    (
        masked_example_tokens,
        masked_lm_positions,
        masked_lm_labels,
    ) = create_masked_lm_predictions(
        example,
        vocab_words,
        mask_whole_word,
        max_predictions_per_seq,
        masked_lm_prob,
        rng,
        exclude_from_masking,
    )

    masked_lm_instance = MLMOnlyInstance(
        tokens=masked_example_tokens,
        masked_lm_positions=masked_lm_positions,
        masked_lm_labels=masked_lm_labels,
    )

    # pad to MSL
    feature, label = pad_instance_to_max_seq_length(
        instance=masked_lm_instance,
        mlm_only=True,
        tokenizer=tokenizer,
        max_seq_length=max_seq_length,
        max_predictions_per_seq=max_predictions_per_seq,
        output_type_shapes=output_type_shapes,
        inverted_mask=inverted_mask,
    )

    return feature, label

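# Worked example of the sliding-window logic implemented in
# _create_examples_from_document below (illustrative numbers, not defaults):
# with max_seq_length = 8 (so max_num_tokens = 6), overlap_size = 2,
# short_seq_prob = 0 and a 10-token document t0 .. t9:
#
#   window 1: start_idx = 0, end_idx = 5          -> [CLS] t0 t1 t2 t3 t4 t5 [SEP]
#   window 2: start_idx = 5 - 2 + 1 = 4, end_idx = 9
#                                                 -> [CLS] t4 t5 t6 t7 t8 t9 [SEP]
#   the next window would start at 9 - 2 + 1 = 8 with end_idx = 13 >= 10, so the
#   loop exits; t8 t9 are either carried over as prev_tokens (when
#   allow_cross_document_examples is True) or emitted as a short final example.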
def _create_examples_from_document(
    document,
    allow_cross_document_examples,
    document_separator_token,
    overlap_size,
    prev_tokens,
    max_seq_length,
    short_seq_prob,
    min_short_seq_length,
    rng,
):
    # Process for generating an example
    # 1. The text from metadata files is read and accumulated in a buffer
    #    until the buffer_size limit is hit. Note that documents are always
    #    read entirely before the buffer_size limit is checked.
    # 2. Next, reading one document at a time from the buffer,
    #    we slide a window of size "max_sequence_length" and construct an example.
    # 3. If "overlap_size" is set, then when generating the next example,
    #    the window is slid back by "overlap_size" and the next example is constructed.
    # 4. If an example can span multiple documents,
    #    i.e. "allow_cross_document_examples" is set to True,
    #    then we use "document_separator_token" to separate tokens
    #    from the two documents,
    #    i.e. [CLS] <tokens-doc1> <document_separator_token> <tokens-doc2> [SEP]
    # 5. The last remaining tokens are used to construct the final example;
    #    this example will usually have fewer tokens than "max_sequence_length"
    #    and will be padded to "max_sequence_length".

    max_num_tokens = max_seq_length - 2
    start_idx = 0

    # We usually want to fill up the entire sequence since we are padding
    # to `max_seq_length` anyways, so short sequences are generally not
    # needed. However, we sometimes (i.e., short_seq_prob = 0.1 == 10% of
    # the time) want to use shorter sequences to minimize the mismatch
    # between pre-training and fine-tuning. The `target_seq_len` is just a
    # rough target however, whereas `max_seq_length` is a hard limit
    target_seq_len = max_num_tokens
    if rng.random() < short_seq_prob:
        target_seq_len = rng.randint(min_short_seq_length, max_num_tokens)

    assert (
        len(prev_tokens) <= max_seq_length
    ), "Number of leftover tokens i.e len(prev_tokens) > max_seq_length"
    # NOTE: prev_tokens cannot be more than MSL.
    # Basically, we do a windowing to construct examples and "overlap_size"
    # refers to the amount we push the window back.
    # "buffer_size", on the other hand, enables us to process more than one
    # document at once if the document contains fewer tokens.
    # So, at a single time we can process "buffer_size" number of tokens
    if prev_tokens:
        document = prev_tokens + [document_separator_token] + document
        prev_tokens = []

    start_idx = 0  # inclusive of this element
    end_idx = start_idx + target_seq_len - 1  # inclusive of this element

    while end_idx < len(document):
        example = document[
            start_idx : end_idx + 1
        ]  # All elements from start_idx to end_idx (inclusive)

        # add special tokens for input start and end
        assert (
            len(example) > overlap_size
        ), f"Length of example {len(example)} less than overlap_size {overlap_size}"
        assert (
            len(example) <= max_num_tokens
        ), f"Length of example greater than max_num_tokens {max_num_tokens}"
        example.insert(0, "[CLS]")
        example.append("[SEP]")

        yield example, prev_tokens

        start_idx = end_idx - overlap_size + 1

        # Recalculate target_seq_len
        if rng.random() < short_seq_prob:
            target_seq_len = rng.randint(min_short_seq_length, max_num_tokens)

        end_idx = start_idx + target_seq_len - 1

        assert (
            end_idx > 0
        ), f" When generating example, end_idx {end_idx} is less than zero."
        assert (
            start_idx >= 0
        ), f" When generating example, start_idx {start_idx} is less than zero."

    if allow_cross_document_examples:
        example = []
        prev_tokens = document[start_idx:]
    else:
        example = ["[CLS]"] + document[start_idx:] + ["[SEP]"]
        prev_tokens = []

    yield example, prev_tokens

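# Minimal usage sketch for data_generator. The metadata/vocab paths below are
# hypothetical placeholders, and the spaCy model named by `spacy_model`
# (default "en_core_web_sm") must be installed for segmentation to work. With
# disable_masking=True the generator yields raw token-list examples and
# output_type_shapes is never consulted, so it can be left as None here.
if __name__ == "__main__":
    examples = data_generator(
        metadata_files="/path/to/metadata.txt",  # hypothetical path
        vocab_file="/path/to/vocab.txt",  # hypothetical path
        do_lower=True,
        disable_masking=True,
        mask_whole_word=False,
        max_seq_length=128,
        max_predictions_per_seq=20,
        masked_lm_prob=0.15,
        dupe_factor=1,
        output_type_shapes=None,  # only used when masking is enabled
        seed=0,
    )
    # Print the first few generated examples.
    for i, example in enumerate(examples):
        print(example)
        if i >= 2:
            break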