# Source: modelzoo.transformers.pytorch.layers_api_demo.data
# Copyright 2022 Cerebras Systems.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import random
import torch
from torch.utils.data import DataLoader, Dataset
from modelzoo.common.pytorch.input_utils import get_streaming_batch_size
# vocabulary that represents the English alphabet a to z
VOCABS = [i for i in range(26)]
class AlphabetDataset(Dataset):
    """In-memory dataset of fixed-length token windows over the alphabet vocabulary.

    Each sample is a list of ``seq_length + 1`` token ids; ``collate`` splits the
    extra token into a (input, shifted-target) pair for language-model training.
    """

    def __init__(self, data, seq_length):
        """Store the pre-generated samples.

        Args:
            data: list of token-id sequences, each of length ``seq_length + 1``.
            seq_length: length of the model input/target sequences.
        """
        # Bug fix: the constructor was missing, but __len__/__getitem__ read
        # self.data, collate reads self.seq_length, and callers construct the
        # class with (data, seq_length).
        self.data = data
        self.seq_length = seq_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

    def collate(self, unpacked_data):
        """Batch raw samples into the feature dict the model consumes.

        Args:
            unpacked_data: list of samples, each ``seq_length + 1`` ints.

        Returns:
            dict with int32 tensors: ``input_ids`` (first seq_length tokens),
            ``target_ids`` (last seq_length tokens, i.e. inputs shifted by one),
            and an all-ones ``attention_mask`` of shape (batch, seq_length).
        """
        tensor_data = torch.tensor(unpacked_data, dtype=torch.int32)
        attention_mask = torch.ones(
            (tensor_data.shape[0], self.seq_length), dtype=torch.int32
        )
        return {
            "input_ids": tensor_data[:, :-1].contiguous(),
            "target_ids": tensor_data[:, 1:].contiguous(),
            "attention_mask": attention_mask,
        }
def train_input_dataloader(params):
    """Build a shuffled DataLoader of random alphabet windows for training.

    Args:
        params: nested config dict; reads ``runconfig.seed`` and, under
            ``train_input``: ``num_samples``, ``seq_length``, ``batch_size``.

    Returns:
        torch.utils.data.DataLoader yielding dicts of int32 tensors
        (``input_ids``, ``target_ids``, ``attention_mask``).
    """
    seed = params["runconfig"]["seed"]
    torch.manual_seed(seed)
    # Bug fix: the sample windows below are drawn with the stdlib `random`
    # module, so it must be seeded too — torch.manual_seed alone did not make
    # the generated data reproducible across runs.
    random.seed(seed)

    input_params = params["train_input"]
    num_samples = input_params["num_samples"]
    seq_length = input_params["seq_length"]
    # Per-worker batch size when streaming to multiple devices.
    batch_size = get_streaming_batch_size(input_params["batch_size"])

    train_data = []
    for _ in range(num_samples):
        # Each sample is seq_length + 1 tokens: collate splits it into
        # input_ids (first seq_length) and target_ids (last seq_length).
        start_index = random.randint(0, len(VOCABS) - seq_length - 1)
        end_index = start_index + seq_length + 1
        train_data.append(VOCABS[start_index:end_index])

    train_dataset = AlphabetDataset(train_data, seq_length)
    train_dataloader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=train_dataset.collate,
    )
    return train_dataloader
def eval_input_dataloader(params):
    """Build a DataLoader of random alphabet windows for evaluation.

    Args:
        params: nested config dict; reads ``runconfig.seed`` and, under
            ``eval_input``: ``num_samples``, ``seq_length``, ``batch_size``.

    Returns:
        torch.utils.data.DataLoader yielding dicts of int32 tensors
        (``input_ids``, ``target_ids``, ``attention_mask``).
    """
    seed = params["runconfig"]["seed"]
    torch.manual_seed(seed)
    # Bug fix: the sample windows below are drawn with the stdlib `random`
    # module, so it must be seeded too — torch.manual_seed alone did not make
    # the generated data reproducible across runs.
    random.seed(seed)

    input_params = params["eval_input"]
    num_samples = input_params["num_samples"]
    seq_length = input_params["seq_length"]
    # NOTE(review): unlike train_input_dataloader, this does not go through
    # get_streaming_batch_size — confirm that is intentional for eval.
    batch_size = input_params["batch_size"]

    test_data = []
    for _ in range(num_samples):
        # Each sample is seq_length + 1 tokens: collate splits it into
        # input_ids (first seq_length) and target_ids (last seq_length).
        start_index = random.randint(0, len(VOCABS) - seq_length - 1)
        end_index = start_index + seq_length + 1
        test_data.append(VOCABS[start_index:end_index])

    test_dataset = AlphabetDataset(test_data, seq_length)
    # NOTE(review): shuffle=True is unusual for an eval loader (order varies
    # per epoch) — kept as in the original; confirm it is intended.
    test_dataloader = DataLoader(
        test_dataset,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=test_dataset.collate,
    )
    return test_dataloader