Source code for cerebras.modelzoo.data.nlp.bert.BertSumCSVDataProcessor

# Copyright 2022 Cerebras Systems.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Processor for PyTorch BERT fine tuning - Summarization.
"""
import csv

import numpy as np
import torch

from cerebras.modelzoo.common.input_utils import get_streaming_batch_size
from cerebras.modelzoo.common.registry import registry
from cerebras.modelzoo.data.common.input_utils import (
    check_sharding_sanity,
    get_data_for_task,
    num_tasks,
    task_id,
)
from cerebras.modelzoo.data.nlp.bert.bert_utils import (
    build_vocab,
    get_meta_data,
    shard_and_shuffle_data,
)
from cerebras.modelzoo.data_preparation.utils import (
    convert_str_to_int_list,
    pad_input_sequence,
)


@registry.register_datasetprocessor("BertSumCSVDataProcessor")
class BertSumCSVDataProcessor(torch.utils.data.IterableDataset):
    """
    Reads a CSV file containing `input_token_ids` and `label_ids`, and
    creates `attention_mask` and `segment_ids` on the fly.

    :param dict params: dict containing input parameters for creating dataset.
    """
    def __init__(self, params):
        super(BertSumCSVDataProcessor, self).__init__()

        # Input params.
        self.meta_data = get_meta_data(params["data_dir"])

        self.meta_data_values = list(self.meta_data.values())
        self.meta_data_filenames = list(self.meta_data.keys())
        # Please note the appending of [0], 0 for the header
        self.meta_data_values_cum_sum = np.cumsum([0] + self.meta_data_values)

        self.num_examples = sum(map(int, self.meta_data.values()))
        self.batch_size = get_streaming_batch_size(params["batch_size"])

        self.num_batches = self.num_examples // self.batch_size
        assert (
            self.num_batches
        ), "Dataset does not contain enough samples for one batch. Please choose a smaller batch size."

        self.num_tasks = num_tasks()
        self.num_batch_per_task = self.num_batches // self.num_tasks
        assert (
            self.num_batch_per_task
        ), "Dataset cannot be evenly distributed across the given tasks. Please choose fewer tasks to run with."

        self.num_examples_per_task = self.num_batch_per_task * self.batch_size
        self.files_in_task = get_data_for_task(
            task_id(),
            self.meta_data_values_cum_sum,
            self.num_examples_per_task,
            self.meta_data_values,
            self.meta_data_filenames,
        )

        self.shuffle = params.get("shuffle", True)
        self.shuffle_seed = params.get("shuffle_seed", None)
        self.shuffle_buffer = params.get("shuffle_buffer", 10 * self.batch_size)
        self.mask_whole_word = params.get("mask_whole_word", False)
        self.do_lower = params.get("do_lower", False)

        # Multi-processing params.
        self.num_workers = params.get("num_workers", 0)
        self.drop_last = params.get("drop_last", True)
        self.prefetch_factor = params.get("prefetch_factor", 10)
        self.persistent_workers = params.get("persistent_workers", True)

        # Check that our sharding will produce at least one batch
        check_sharding_sanity(
            [num_examples for _, num_examples, _ in self.files_in_task],
            self.batch_size,
            self.num_workers,
            self.drop_last,
        )

        self.special_tokens = {
            "oov_token": "[UNK]",
            "class_token": "[CLS]",
            "pad_token": "[PAD]",
            "document_separator_token": "[SEP]",
        }
        if self.do_lower:
            self.special_tokens = {
                key: value.lower()
                for key, value in self.special_tokens.items()
            }

        # Get vocab file and size.
        self.vocab_file = params["vocab_file"]
        self.vocab, self.vocab_size = build_vocab(
            self.vocab_file, self.do_lower, self.special_tokens["oov_token"]
        )

        # Init tokenizer.
        self.tokenize = self.vocab.forward

        # Getting indices for special tokens.
        self.special_tokens_indices = {
            key: self.tokenize([value])[0]
            for key, value in self.special_tokens.items()
        }

        # Padding indices.
        self.pad_id = params.get(
            "pad_id", self.special_tokens_indices["pad_token"]
        )
        assert (
            self.pad_id >= 0
        ), f"`pad_id` must be non-negative, got {self.pad_id}"

        self.max_sequence_length = params["max_sequence_length"]
        self.max_cls_tokens = params["max_cls_tokens"]

        self.csv_files_per_task_per_worker = []
        self.processed_buffers = 0
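    # Note: `params` must provide "data_dir", "batch_size", "vocab_file",
    # "max_sequence_length", and "max_cls_tokens"; the remaining keys read
    # above ("shuffle", "shuffle_seed", "shuffle_buffer", "mask_whole_word",
    # "do_lower", "num_workers", "drop_last", "prefetch_factor",
    # "persistent_workers", "pad_id") are optional and fall back to the
    # defaults shown in `__init__`.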
    def create_dataloader(self):
        """
        Creates and returns the dataloader object.
        """
        dataloader = torch.utils.data.DataLoader(
            self,
            batch_size=self.batch_size,
            num_workers=self.num_workers,
            drop_last=self.drop_last,
            prefetch_factor=self.prefetch_factor if self.num_workers else None,
            persistent_workers=(
                self.persistent_workers if self.num_workers else False
            ),
        )
        return dataloader
    def load_buffer(self):
        """
        Generator to read the data in chunks of size `data_buffer`.

        :returns: Yields the data stored in the `data_buffer`.
        """
        data_buffer = []

        while self.processed_buffers < len(self.csv_files_per_task_per_worker):
            (
                current_file_path,
                num_examples,
                start_id,
            ) = self.csv_files_per_task_per_worker[self.processed_buffers]

            with open(current_file_path, "r", newline="") as fid:
                data_reader = csv.DictReader(
                    fid, delimiter=",", quoting=csv.QUOTE_MINIMAL
                )

                for row_id, row in enumerate(data_reader):
                    if row_id < start_id:
                        continue
                    if row_id >= start_id + num_examples:
                        break

                    if not self.shuffle:
                        yield row
                    else:
                        # Reservoir-style shuffling: once the buffer is full,
                        # yield a random buffered row and replace it with the
                        # incoming row.
                        if len(data_buffer) < self.shuffle_buffer:
                            data_buffer.append(row)
                        else:
                            index = self.rng.randrange(self.shuffle_buffer)
                            yield data_buffer[index]
                            data_buffer[index] = row

                self.processed_buffers += 1

        if self.shuffle:
            # Flush the remaining buffered rows in random order.
            self.rng.shuffle(data_buffer)
            for ind in range(len(data_buffer)):
                yield data_buffer[ind]
            data_buffer = []
    def __len__(self):
        """
        Returns the length of the dataset on each slurm task.
        """
        return self.num_examples_per_task

    def __iter__(self):
        """
        Iterator over the data to construct input features.

        :return: Yields a training feature dict with:
            * np.array[int.32] input_ids: Numpy array with input token indices.
                Shape: (`max_sequence_length`).
            * np.array[int.32] labels: Numpy array with labels.
                Shape: (`max_cls_tokens`).
            * np.array[int.32] attention_mask
                Shape: (`max_sequence_length`).
            * np.array[int.32] token_type_ids: Numpy array with segment indices.
                Shape: (`max_sequence_length`).
            * np.array[int.32] cls_indices: Numpy array with class indices.
                Shape: (`max_cls_tokens`).
            * np.array[float.32] cls_weights: Numpy array with class weights.
                Shape: (`max_cls_tokens`).
        """
        (
            self.processed_buffers,
            self.csv_files_per_task_per_worker,
            self.shuffle_seed,
            self.rng,
        ) = shard_and_shuffle_data(
            self.files_in_task,
            self.shuffle,
            self.shuffle_seed,
        )

        # Iterate over the data rows to create input features.
        for data_row in self.load_buffer():
            # `data_row` is a dict with keys:
            # ["input_token_ids", "labels", "segment_ids", "cls_indices"].
            (
                input_token_ids,
                labels,
                segment_ids,
                cls_indices,
            ) = tuple(map(convert_str_to_int_list, data_row.values()))

            feature = create_bertsum_feature(
                input_token_ids,
                segment_ids,
                cls_indices,
                labels,
                self.max_sequence_length,
                self.max_cls_tokens,
                self.pad_id,
            )

            yield feature
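
# Example usage (a sketch only; the paths and sizes below are hypothetical and
# must point at your own preprocessed BertSum CSV data):
#
#     processor = BertSumCSVDataProcessor(
#         {
#             "data_dir": "./bertsum_csv/",
#             "vocab_file": "./vocab.txt",
#             "batch_size": 32,
#             "max_sequence_length": 1024,
#             "max_cls_tokens": 50,
#         }
#     )
#     dataloader = processor.create_dataloader()
#     batch = next(iter(dataloader))
#     # `batch` maps "input_ids", "attention_mask", "token_type_ids", "labels",
#     # "cls_indices", and "cls_weights" to tensors with a leading batch dim.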
def create_bertsum_feature(
    input_ids,
    segment_ids,
    cls_indices,
    labels,
    max_sequence_length,
    max_cls_tokens,
    pad_id,
):
    """
    Creates the feature dict for the bertsum model after applying padding.

    :param list input_ids: Token ids to pad.
    :param list segment_ids: Segment ids to pad.
    :param list cls_indices: Class ids to pad.
    :param list labels: Labels to pad.
    :param int max_sequence_length: Maximum sequence length.
    :param int max_cls_tokens: Max class tokens.
    :param int pad_id: Padding id.
    :return: dict for feature which includes keys:
        * 'input_ids': Numpy array with input token indices.
            shape: (`max_sequence_length`), dtype: int32.
        * 'attention_mask': Numpy array with attention mask.
            shape: (`max_sequence_length`), dtype: int32.
        * 'token_type_ids': Numpy array with segment ids.
            shape: (`max_sequence_length`), dtype: int32.
        * 'labels': Numpy array with labels.
            shape: (`max_cls_tokens`), dtype: int32.
        * 'cls_indices': Numpy array with class indices.
            shape: (`max_cls_tokens`), dtype: int32.
        * 'cls_weights': Numpy array with class weights.
            shape: (`max_cls_tokens`), dtype: float32.
    """
    input_ids = pad_input_sequence(input_ids, pad_id, max_sequence_length)
    labels = pad_input_sequence(labels, pad_id, max_cls_tokens)
    input_mask = np.not_equal(input_ids, pad_id).astype(np.int32)
    segment_ids = pad_input_sequence(segment_ids, pad_id, max_sequence_length)
    cls_indices = pad_input_sequence(cls_indices, pad_id, max_cls_tokens)
    cls_weights = np.not_equal(cls_indices, pad_id).astype(np.float32)

    feature = {
        "input_ids": input_ids,
        "token_type_ids": segment_ids,
        "attention_mask": input_mask,
        "labels": labels,
        "cls_indices": cls_indices,
        "cls_weights": cls_weights,
    }

    return feature
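
# A minimal, self-contained sketch exercising `create_bertsum_feature` with
# made-up toy ids (the token ids, label values, and `pad_id=0` below are
# assumptions for illustration, not values prescribed by this module):
if __name__ == "__main__":
    toy_feature = create_bertsum_feature(
        input_ids=[5, 17, 23, 6],  # hypothetical token ids
        segment_ids=[0, 0, 0, 0],
        cls_indices=[0],  # position(s) of [CLS] tokens in `input_ids`
        labels=[1],  # one label per [CLS] token
        max_sequence_length=8,
        max_cls_tokens=2,
        pad_id=0,
    )
    # Print each padded feature's dtype and shape for a quick sanity check.
    for name, value in toy_feature.items():
        print(name, np.asarray(value).dtype, np.asarray(value).shape)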