Source code for modelzoo.transformers.data_processing.scripts.chunk_preprocessing.chunk_data_preprocessor

# Copyright 2022 Cerebras Systems.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
This module implements a generic data preprocessor called `ChunkDataPreprocessor`.
It internally uses `DataFrame` and `DataReader` to read and process data.
"""

import math
import multiprocessing
import os
import sys
import time
from multiprocessing import Event, Process, Queue, Value, cpu_count
from multiprocessing.synchronize import Event
from threading import Event, Thread
from typing import Any, Dict, Tuple

import h5py
import numpy as np
from tqdm import tqdm

from modelzoo.common.input.utils import check_and_create_output_dirs
from modelzoo.transformers.data_processing.scripts.chunk_preprocessing.data_reader import (
from modelzoo.transformers.data_processing.scripts.chunk_preprocessing.lm_data_token_generator import (
from modelzoo.transformers.data_processing.scripts.hdf5_preprocessing.utils import (
from modelzoo.transformers.data_processing.tokenizers.BPETokenizer import (
from modelzoo.transformers.data_processing.tokenizers.HFTokenizer import (

def get_compression_factor(filename: str) -> int:
    """
    Calculate and return the compression factor based on a file's extension.

    Args:
        filename (str): The name of the file.

    Returns:
        int: Compression factor. Returns 3 for all compressed and parquet
            formats, otherwise returns 1 for uncompressed formats.
    """
    # `str.endswith` accepts a tuple of suffixes, so no explicit loop (and no
    # shadowing of the `format` builtin) is needed.
    compressed_suffixes = (
        ".jsonl.zst",
        ".jsonl.zst.tar",
        ".json.gz",
        ".parquet",
    )
    return 3 if filename.endswith(compressed_suffixes) else 1
def update_progress(
    pbar: "tqdm",
    progress_counter: "Value",
    total_chunks: int,
    start_time: float,
    stop_event: "Event",
) -> None:
    """
    Update the progress bar based on the current progress.

    Polls the shared counter roughly every 0.5 seconds until `stop_event` is
    set, then performs one last refresh so that chunks completed during the
    final polling interval are still reflected in the bar.

    Args:
        pbar (tqdm): The progress bar instance.
        progress_counter (Value): A shared counter to track progress across processes.
        total_chunks (int): Total chunks to process.
        start_time (float): The start time of the process.
        stop_event (Event): Event to signal when to stop updating progress.

    Returns:
        None
    """

    def _refresh() -> None:
        # Bring the bar in line with the shared counter and refresh its text.
        newly_processed = progress_counter.value - pbar.n
        if newly_processed <= 0:
            return
        pbar.update(newly_processed)
        if not pbar.n:
            # The bar did not advance, so there is nothing to average over.
            return

        elapsed_time = time.time() - start_time
        avg_time_per_chunk = elapsed_time / pbar.n
        # More chunks than `total_chunks` may be reported; never show a
        # negative remaining time in that case.
        estimated_remaining = avg_time_per_chunk * max(
            total_chunks - pbar.n, 0
        )

        # Update progress bar description with processed/total chunks
        pbar.set_description(f"Processing {pbar.n}/{total_chunks} chunks")

        # Update the progress bar postfix with avg processing time and estimated time
        pbar.set_postfix(
            avg_time=f"{avg_time_per_chunk:.4f}s/chunk",
            est_remaining=f"{estimated_remaining:.2f}s",
            refresh=True,
        )

    while not stop_event.is_set():
        _refresh()
        # Waiting on the event (instead of sleeping) lets the thread exit as
        # soon as it is signalled rather than after the full interval.
        stop_event.wait(0.5)
    _refresh()
[docs]class ChunkDataPreprocessor:
def __init__(self, params, logger):
    """
    Store the configuration and logger, reset the bookkeeping counters and
    process the parameters.

    Args:
        params (dict): Configuration parameters.
        logger (Logger): Logging interface.
    """
    # Bookkeeping defaults.
    self.chunks_processed = 0
    self.running_avg_processing_time = 0
    self.json_params_file = None

    self.logger = logger
    self.params = params

    # Derive every remaining attribute from `params`.
    self.process_params()
def process_params(self) -> None:
    """
    Run every parameter-initialization step, in a fixed order.

    The unused-parameter check runs last, after the other steps have had the
    chance to consume their entries.
    """
    initialization_steps = (
        "setup_output_directory",
        "handle_metadata_files",
        "process_setup_params",
        "process_dataset_params",
        "process_processing_params",
        "initialize_miscellaneous_attributes",
        "check_unused_params",
    )
    for step_name in initialization_steps:
        getattr(self, step_name)()
def setup_output_directory(self) -> None:
    """
    Set up the output directory based on provided configuration.

    Reads `output_dir` (default "./output/") from the setup params and
    `max_chunk_size` (default 64, logged as kB) from the processing params.
    Unless `resume_from_checkpoint` is set, the output directory is
    checked/created via `check_and_create_output_dirs`. The parameters are
    then dumped to `data_params.json` inside the output directory, and the
    checkpoint path is set to `checkpoint.txt` in the same directory.
    """
    setup_params = self.params["setup"]
    processing_params = self.params["processing"]

    self.output_dir = setup_params.get("output_dir", "./output/")
    if not processing_params.get("resume_from_checkpoint", False):
        check_and_create_output_dirs(self.output_dir, filetype="h5")
    self.logger.info(f"\nWriting data to {self.output_dir}.\n")

    self.max_chunk_size = processing_params.get("max_chunk_size", 64)
    self.logger.info(f"\nChunk size in kB: {self.max_chunk_size}.\n")

    self.json_params_file = os.path.join(self.output_dir, "data_params.json")
    self.checkpoint_path = os.path.join(self.output_dir, "checkpoint.txt")
    dump_args(self.params, self.json_params_file)
def handle_metadata_files(self) -> None:
    """
    Handle metadata files based on provided configuration.

    Pops `metadata_files` (a comma-separated string) and `input_dir` from the
    setup params, resolves them through `get_files` and stores the sorted
    result in `self.input_files`.
    """
    setup_params = self.params["setup"]

    metadata_files = setup_params.pop("metadata_files", None)
    if metadata_files:
        # Strip surrounding whitespace and drop empty entries so that values
        # such as "a.txt, b.txt" resolve to real paths.
        metadata_files = [
            name.strip() for name in metadata_files.split(",") if name.strip()
        ]

    input_dir = setup_params.pop("input_dir", None)
    self.input_files = sorted(
        get_files(input_dir=input_dir, metadata_files=metadata_files)
    )
def process_setup_params(self) -> None:
    """
    Setup the number of processes based on provided configuration.

    Pops `processes` (0 means "use all CPUs") and `dataset_processor` from the
    setup params. One process is reserved for reading and one for writing; the
    rest tokenize. Also creates the inter-process queues and resolves the name
    of the token generator class.

    Raises:
        KeyError: If `dataset_processor` is not one of the supported names.
    """
    setup_params = self.params["setup"]

    self.processes = setup_params.pop("processes", 0)
    if self.processes == 0:
        self.processes = cpu_count()

    self.reader_process_num = 1
    self.tokenize_process_num = self.processes - 2
    self.writer_process_num = 1

    # One input queue and one output queue per tokenizer process.
    self.tokenizer_queues = None
    self.writer_queues = None
    if self.tokenize_process_num > 0:
        self.tokenizer_queues = [
            Queue() for _ in range(self.tokenize_process_num)
        ]
        self.writer_queues = [
            Queue() for _ in range(self.tokenize_process_num)
        ]
    # Always created: `process_files` and `file_split_process_dataset` use it
    # even when there are no dedicated tokenizer processes.
    self.stats_queue = Queue()

    ds_processor = setup_params.pop("dataset_processor", "LMDataPreprocessor")
    token_generator_map = {
        "LMDataPreprocessor": "LMDataTokenGenerator",
        "SummarizationPreprocessor": "SummarizationTokenGenerator",
        "FIMDataPreprocessor": "FIMTokenGenerator",
    }
    try:
        self.token_generator_name = token_generator_map[ds_processor]
    except KeyError:
        raise KeyError(
            f"Unsupported `dataset_processor` {ds_processor!r}. Acceptable "
            f"values are: {', '.join(token_generator_map)}."
        ) from None

    # Only LM-style generators hand leftover prefixes back through a queue.
    if self.token_generator_name in (
        "LMDataTokenGenerator",
        "FIMTokenGenerator",
    ):
        self.prefix_queue = Queue()
def check_unused_params(self) -> None:
    """
    Warn about configuration entries that no initialization step consumed.

    Every key still present under `setup` (other than `output_dir`) and every
    key still present under `dataset` is reported through the logger.
    """
    leftover_setup = [
        name for name in self.params["setup"] if name != "output_dir"
    ]
    if leftover_setup:
        self.logger.warning(
            f"\nThe following setup params are unused: {', '.join(leftover_setup)}"
        )

    leftover_dataset = self.params["dataset"]
    if leftover_dataset:
        joined_names = ", ".join(leftover_dataset.keys())
        self.logger.warning(
            "The following dataset params are unused: " + joined_names
        )
def process_dataset_params(self) -> None:
    """
    Read the dataset section of the configuration.

    `jsonl_key`, `prompt_key` and `completion_key` are popped from the dataset
    params; `sep_token` and `keys` are read but left in place. The keys to
    extract from each record (`self.data_keys`) are resolved with this
    priority: explicit comma-separated `keys`, then the prompt/completion
    pair, then `jsonl_key`, then the default `"text"`.
    """
    dataset_params = self.params["dataset"]

    self.jsonl_key = dataset_params.pop("jsonl_key", None)
    # Summarization specific fields
    self.prompt_key = dataset_params.pop("prompt_key", None)
    self.completion_key = dataset_params.pop("completion_key", None)

    generator_name = self.token_generator_name
    if generator_name in ("LMDataTokenGenerator", "FIMTokenGenerator"):
        assert (
            self.prompt_key is None and self.completion_key is None
        ), f"Prompt/Completion key can't be provided when performing LM or FIM tasks. Provided prompt_key: {self.prompt_key}, completion_key: {self.completion_key}"
    elif generator_name == "SummarizationTokenGenerator":
        assert (
            self.jsonl_key is None
        ), f"jsonl key can't be provided when performing summarization tasks. Provided jsonl_key: {self.jsonl_key}"

    self.sep_token = dataset_params.get("sep_token", None)

    requested_keys = dataset_params.get("keys", None)
    if requested_keys:
        resolved_keys = [part.strip() for part in requested_keys.split(",")]
    elif self.prompt_key and self.completion_key:
        resolved_keys = [self.prompt_key, self.completion_key]
    elif self.jsonl_key:
        resolved_keys = [self.jsonl_key]
    else:
        # default jsonl key is text
        resolved_keys = ["text"]
    self.data_keys = resolved_keys
def process_processing_params(self) -> None:
    """
    Read the processing section of the configuration and set up the tokenizer.

    `output_name`, `resume_from_checkpoint`, `display_pbar` and
    `write_in_batch` are popped from the processing params; `max_seq_length`,
    `shuffle` and `shuffle_seed` are read but left in place. When shuffling is
    enabled the numpy seed is set. Finally the tokenizer is initialized and,
    if a separator token was configured, it is added to the tokenizer.
    """
    processing_params = self.params["processing"]

    # Options consumed here: (attribute / key name, default value).
    consumed_options = (
        ("output_name", "examples"),
        ("resume_from_checkpoint", False),
        ("display_pbar", True),
        ("write_in_batch", False),
    )
    for option_name, default_value in consumed_options:
        setattr(
            self, option_name, processing_params.pop(option_name, default_value)
        )

    self.max_seq_length = processing_params.get("max_seq_length", 2048)

    self.shuffle = processing_params.get("shuffle", False)
    if self.shuffle:
        self.shuffle_seed = processing_params.get("shuffle_seed", 0)
        self.set_shuffle_seed()

    self.initialize_tokenizer(processing_params)

    if not self.sep_token:
        return
    self.add_token(self.sep_token)
    self.logger.warning(
        f"A sep token {self.sep_token} was added to tokenizer. This "
        "will change the vocab size. If you are using a pretrained "
        "model, you will need to avoid adding this."
    )
def set_shuffle_seed(self):
    """Seed numpy's global random generator with `self.shuffle_seed`."""
    seed = self.shuffle_seed
    np.random.seed(seed)
def add_token(self, token):
    """
    Add token to the tokenizer

    Args:
        token (str): token to be added to the tokenizer
    """
    if self.tokenizer_type == "gpt2tokenizer":
        self.tokenizer.add_token(token)
    elif self.tokenizer_type == "neoxtokenizer":
        self.tokenizer.add_token([token])
    elif self.tokenizer_type == "huggingfacetokenizer":
        # Hugging Face tokenizers expose `add_tokens` (plural); they have no
        # `add_token` method.
        self.tokenizer.add_tokens([token])
def initialize_tokenizer(self, processing_params: Dict[str, Any]) -> None:
    """
    Initialize tokenizer based on the provided `tokenizer_type` parameter.

    After the tokenizer-specific initializer has set `self.tokenizer`,
    `self.eos_id` and `self.pad_id`, user-supplied `eos_id` / `pad_id` values
    (if any) override them, and the token generator named by
    `self.token_generator_name` is looked up in this module and instantiated.

    Args:
        processing_params (Dict[str, Any]): Dictionary of processing parameters.

    Raises:
        AssertionError: If `tokenizer_type` is not provided.
        NotImplementedError: If `tokenizer_type` is not a supported value.
    """
    self.tokenizer_type = processing_params.pop(
        "tokenizer_type", "none"
    ).lower()
    assert (
        self.tokenizer_type != "none"
    ), "`tokenizer_type` is missing, please provide it using `args.tokenizer_type`."

    if self.tokenizer_type == "gpt2tokenizer":
        self.initialize_gpt2tokenizer(processing_params)
    elif self.tokenizer_type == "neoxtokenizer":
        self.initialize_neoxtokenizer(processing_params)
    elif self.tokenizer_type == "huggingfacetokenizer":
        self.initialize_huggingfacetokenizer(processing_params)
    else:
        raise NotImplementedError(
            f"{self.tokenizer_type} is not implemented. Acceptable values are: `gpt2tokenizer`, `neoxtokenizer`, `huggingfacetokenizer`."
        )

    # Override eos id and pad id from user args
    user_eos_id = processing_params.get("eos_id")
    if user_eos_id is not None:  # important as id could be set to 0
        self.logger.info(
            f"Overriding the eos id {self.eos_id} from the tokenizer with supplied eos id: {processing_params['eos_id']}."
        )
        self.eos_id = user_eos_id
        self.pad_id = user_eos_id  # set pad id same as eos id

    user_pad_id = processing_params.get("pad_id")
    if user_pad_id is not None:
        self.logger.info(
            f"Overriding the pad id {self.pad_id} from the tokenizer with supplied pad id: {processing_params['pad_id']}."
        )
        self.pad_id = user_pad_id
        if (
            self.pad_id != self.eos_id
            and self.tokenizer_type == "gpt2tokenizer"
        ):
            self.logger.info(
                f"Pad id {self.pad_id} supplied from command line is different from eos id {self.eos_id}. For GPT2 tokenizer, pad id and eos id must be the same. Setting pad id to eos id."
            )
            self.pad_id = self.eos_id

    # The token generator class is resolved by name from this module's
    # namespace, so it must be imported at module level.
    token_generator_cls = getattr(
        sys.modules[__name__], self.token_generator_name
    )
    self.token_generator = token_generator_cls(
        self.params, self.tokenizer, self.eos_id, self.pad_id
    )
def initialize_gpt2tokenizer(
    self, processing_params: Dict[str, Any]
) -> None:
    """
    Initialize GPT-2 tokenizer.

    Pops `vocab_file` and `encoder_file` from the processing params, builds a
    `BPETokenizer` from them and uses the id of `<|endoftext|>` as both the
    eos id and the pad id.

    Args:
        processing_params (Dict[str, Any]): Dictionary of processing parameters.

    Raises:
        AssertionError: If `vocab_file` or `encoder_file` is missing.
    """
    vocab_file = processing_params.pop("vocab_file", None)
    encoder_file = processing_params.pop("encoder_file", None)

    # Validate before constructing the tokenizer so that a missing argument
    # produces the helpful message below rather than a failure inside
    # `BPETokenizer`.
    assert (
        vocab_file
    ), "`vocab_file` is missing, please provide it using `args.vocab_file`."
    assert (
        encoder_file
    ), "`encoder_file` is missing, please provide it using `args.encoder_file`."

    self.tokenizer = BPETokenizer(vocab_file, encoder_file)
    end_of_text_id = self.tokenizer.get_token_id("<|endoftext|>")
    self.eos_id = end_of_text_id
    self.pad_id = end_of_text_id
def initialize_neoxtokenizer(
    self, processing_params: Dict[str, Any]
) -> None:
    """
    Build the NeoX tokenizer from the configured encoder file.

    Pops `encoder_file` from the processing params, wraps it in an
    `HFTokenizer` and records the eos id. The pad id falls back to the eos id
    when the tokenizer does not define one.

    Args:
        processing_params (Dict[str, Any]): Dictionary of processing parameters.
    """
    encoder_file = processing_params.pop("encoder_file", None)
    assert (
        encoder_file
    ), "`encoder_file` is missing, please provide it using `args.encoder_file`."

    tokenizer = HFTokenizer(encoder_file)
    self.tokenizer = tokenizer
    self.eos_id = tokenizer.eos_id

    if tokenizer.pad_id is None:
        self.pad_id = self.eos_id
    else:
        self.pad_id = tokenizer.pad_id
def initialize_huggingfacetokenizer(
    self, processing_params: Dict[str, Any]
) -> None:
    """
    Initialize Hugging Face tokenizer.

    Pops `huggingface_tokenizer` from the processing params and loads it with
    `AutoTokenizer.from_pretrained`. The pad id falls back to the eos id when
    the tokenizer does not define a pad token.

    Args:
        processing_params (Dict[str, Any]): Dictionary of processing parameters.

    Raises:
        AssertionError: If `huggingface_tokenizer` is missing.
    """
    tokenizer_name = processing_params.pop("huggingface_tokenizer", None)
    # Same validation style as the other tokenizer initializers, instead of a
    # bare KeyError from `pop`.
    assert (
        tokenizer_name
    ), "`huggingface_tokenizer` is missing, please provide it using `args.huggingface_tokenizer`."

    # Imported lazily so `transformers` is only needed for this tokenizer type.
    from transformers import AutoTokenizer

    self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    self.eos_id = self.tokenizer.eos_token_id
    self.pad_id = (
        self.eos_id
        if self.tokenizer.pad_token_id is None
        else self.tokenizer.pad_token_id
    )
def initialize_miscellaneous_attributes(self) -> None:
    """
    Reset the remaining bookkeeping attributes.
    """
    # Total number of sequences in the current dataset.
    self.n_examples = 0
def get_params_file(self) -> str:
    """
    Return the location of the dumped JSON parameters file.

    Returns:
        str: Path to the JSON parameters file.
    """
    params_path = self.json_params_file
    return params_path
def get_output_dir(self) -> str:
    """
    Return the directory the processed data is written to.

    Returns:
        str: Path to the output directory.
    """
    output_path = self.output_dir
    return output_path
def calculate_total_size(self) -> int:
    """
    Calculate the total size of all input files, taking compression factors
    into consideration.

    Each file's on-disk size is multiplied by the factor returned by
    `get_compression_factor` for its name, so the result is an estimate
    whenever any input file is in a compressed format.

    Returns:
        int: The total size of all input files in bytes.
    """
    self.logger.info(f"Input files = {self.input_files}")
    total_size = sum(
        os.path.getsize(file) * get_compression_factor(file)
        for file in self.input_files
    )
    return total_size
def calculate_total_chunks(self, total_size: int) -> int:
    """
    Work out how many chunks are needed to hold `total_size` bytes.

    `self.max_chunk_size` is multiplied by 1024 to obtain the chunk capacity
    in bytes; the result is rounded up so a partial chunk counts as one.

    Parameters:
        total_size (int): The total size of the data in bytes.

    Returns:
        int: Total number of chunks.
    """
    bytes_per_chunk = self.max_chunk_size * 1024
    chunk_count = total_size / bytes_per_chunk
    return math.ceil(chunk_count)
def task_split_read_checkpoint(self):
    """
    Read the checkpoint used by the task-split pipeline.

    The checkpoint file holds three comma-separated integers: file index,
    number of documents processed and number of hdf5 files written.

    Returns:
        Tuple[int, int, int]: The stored values, or `(0, 0, 0)` when not
        resuming, when the file does not exist, or when it cannot be parsed.
    """
    if self.resume_from_checkpoint and os.path.isfile(self.checkpoint_path):
        try:
            with open(self.checkpoint_path, "r") as file:
                file_idx, doc_idx, hdf5_idx = [
                    int(i) for i in file.read().split(", ")
                ]
            self.logger.info(
                f"Resuming from file number: {file_idx}, "
                f"with number of documents processed: {doc_idx} and number of hdf5 files written = {hdf5_idx}"
            )
            return file_idx, doc_idx, hdf5_idx
        except (OSError, ValueError) as e:
            # if checkpoint path is at initialization,
            # file may exist, but no data might be written in the file
            # in that event, do not do anything, go to the final return
            self.logger.error(e)
    return 0, 0, 0


def file_split_read_checkpoint(self):
    """
    Read the per-process checkpoints used by the file-split pipeline.

    Process `pid` stores its checkpoint next to `self.checkpoint_path`, in a
    file whose name ends with `_process_{pid}.txt`.

    Returns:
        List[Tuple[int, int, int]]: One `(file_idx, doc_idx, hdf5_idx)` entry
        per process; `(0, 0, 0)` for every process without a usable checkpoint
        or when not resuming.
    """
    process_checkpoints = [(0, 0, 0) for _ in range(self.processes)]
    root, _ = os.path.splitext(self.checkpoint_path)
    for pid in range(self.processes):
        process_checkpoint_path = root + f'_process_{pid}.txt'
        if not (
            self.resume_from_checkpoint
            and os.path.isfile(process_checkpoint_path)
        ):
            continue
        try:
            with open(process_checkpoint_path, "r") as file:
                file_idx, doc_idx, hdf5_idx = [
                    int(i) for i in file.read().split(", ")
                ]
            process_checkpoints[pid] = (file_idx, doc_idx, hdf5_idx)
            self.logger.info(
                f"Process {pid} resuming from file number: {file_idx}, "
                f"with number of documents processed: {doc_idx} and number of hdf5 files written = {hdf5_idx}"
            )
        except (OSError, ValueError) as e:
            # if checkpoint path is at initialization,
            # file may exist, but no data might be written in the file
            # in that event, keep the default entry for this process
            self.logger.error(e)
    return process_checkpoints


def verify_hdf5_files(self, chunk_data):
    """
    Sanity-check the tokenized data of one chunk before it is written.

    Checks that every example has shape `(3, max_seq_length)` (skipped when
    `max_seq_length` is -1) and that every value is smaller than the
    tokenizer's vocabulary size.

    Args:
        chunk_data (tuple): `(chunk_number, df_chunk)` pair.

    Raises:
        AssertionError: If either check fails.
    """
    chunk_number, df_chunk = chunk_data
    vocab_size = self.get_vocab_size()
    expected_shape = (3, self.max_seq_length)
    data_arr = np.concatenate(df_chunk.tokenized_data, axis=0)
    data_shape = data_arr.shape
    assert data_shape[1:] == expected_shape or self.max_seq_length == -1, (
        f"Error in dataframe with number {chunk_number}, conversion is corrupted as the "
        f"shape of example is unexpected. Expected:"
        f" {expected_shape}, received {data_shape[1:]}."
    )
    assert (data_arr < vocab_size).all(), (
        f"Error in dataframe with number {chunk_number}, conversion is corrupted as the "
        f"input ids are greater than vocab size."
        f"Please ensure that a correct tokenizer is used "
        f"and the eos_id and pad_id are correct within the "
        f"tokenizer vocabulary size."
    )


def write_remaining_prefix(self, output_file_name):
    """
    Collect the leftover prefixes of all tokenizer processes and write them.

    Only applies when the token generator is an `LMDataTokenGenerator`. The
    prefix queue is drained until one sentinel (`None`) per tokenizer process
    has been received; the concatenated prefix is then encoded, verified and
    written either to `output_file_name` or, in shuffle mode, appended through
    `DataFrame.append_to_hdf5`.

    Args:
        output_file_name (str): Target hdf5 file. The chunk number is taken
            from the last `_`-separated component of its base name.

    Returns:
        Tuple[int, dict]: Number of sequences written and the token statistics
        of the encoded prefix (all zero when nothing was written).
    """
    prefix_sequences = 0
    prefix_stats = {
        "num_pad_tokens": 0,
        "num_masked_tokens": 0,
        "loss_valid_tokens": 0,
        "num_tokens": 0,
    }
    pid = os.getpid()
    if not isinstance(self.token_generator, LMDataTokenGenerator):
        return prefix_sequences, prefix_stats

    # Drain the queue until every tokenizer process has sent its sentinel.
    prefix = []
    sentinels_received = 0
    while True:
        curr_prefix = self.prefix_queue.get()
        if curr_prefix is None:
            sentinels_received += 1
            if sentinels_received == self.tokenize_process_num:
                break
        else:
            prefix = prefix + curr_prefix

    if len(prefix) > 0:
        (
            encoded_prefix,
            prefix_stats,
        ) = self.token_generator.encode_leftover_prefix(prefix)
        if len(encoded_prefix) > 0:
            base_name, _ = os.path.splitext(output_file_name)
            chunk_number = int(base_name.split('_')[-1])
            df_chunk = DataFrame(self.data_keys)
            df_chunk.tokenized_data.append(encoded_prefix)
            self.verify_hdf5_files((chunk_number, df_chunk))
            if not self.shuffle:
                with h5py.File(output_file_name, "w") as h5f:
                    df_chunk.save_to_hdf5(h5f, self.write_in_batch)
                    prefix_sequences += int(h5f.attrs["n_examples"])
            else:
                n_examples = df_chunk.append_to_hdf5(
                    self.output_dir, self.total_chunks, pid
                )
                prefix_sequences += n_examples
    return prefix_sequences, prefix_stats
def process_files(
    self, file_paths, process_idx, checkpoint_args, progress_counter,
) -> None:
    """
    Process the given files, tokenize the data chunks, and save to HDF5 format.

    Runs inside a worker process of the file-split pipeline. After each
    written chunk the per-process checkpoint file is rewritten. When all files
    are consumed, any leftover prefix (LM-style generators only) and the
    accumulated statistics are sent to the parent through `self.prefix_queue`
    and `self.stats_queue`, each followed by a `None` sentinel.

    Parameters:
    - file_paths: list of file_paths.
    - process_idx: Index of current process among all process spawned for file split
    - checkpoint_args: `(file_idx, doc_start_idx, hdf5_written)` to resume from.
    - progress_counter: Shared `multiprocessing.Value` counting processed chunks.

    Returns:
    - None. Results are reported through the queues described above.
    """
    cum_data_stats = {
        "discarded": 0,
        "processed": 0,
        "successful": 0,
        "raw_chars_count": 0,
        "raw_bytes_count": 0,
        "num_pad_tokens": 0,
        "num_masked_tokens": 0,
        "loss_valid_tokens": 0,
        "num_tokens": 0,
        "normalized_chars_count": 0,
        "normalized_bytes_count": 0,
    }
    total_examples = 0

    # Initial setup
    reader = Reader(
        file_paths,
        max_chunk_size=self.max_chunk_size * 1024,
        logger=self.logger,
        keys=self.data_keys,
    )
    file_idx, doc_start_idx, hdf5_written = checkpoint_args
    process_chunk_number = hdf5_written

    # The checkpoint path does not change between chunks; build it once.
    root, _ = os.path.splitext(self.checkpoint_path)
    process_checkpoint_path = root + f'_process_{process_idx}.txt'

    def _count_chunk() -> None:
        # `+=` on a shared Value is a read-modify-write; several worker
        # processes update this counter, so hold its lock while doing so.
        with progress_counter.get_lock():
            progress_counter.value += 1

    for df_chunk in reader.stream_data((file_idx, doc_start_idx)):
        # Tokenize chunk
        df_chunk.tokenize(
            self.token_generator, self.pad_id,
        )
        cum_data_stats = {
            key: cum_data_stats[key] + df_chunk.data_stats[key]
            for key in df_chunk.data_stats
        }

        if df_chunk.tokenized_data == []:
            # Nothing to write for this chunk, but it still counts as done.
            process_chunk_number += 1
            _count_chunk()
            continue

        # Save chunk to HDF5
        if not self.shuffle:
            output_file_name = os.path.join(
                self.output_dir,
                f"output_chunk_{process_idx}_{df_chunk.file_idx}_{df_chunk.start_doc_idx}_{process_chunk_number}.h5",
            )
            with h5py.File(output_file_name, "w") as h5f:
                df_chunk.save_to_hdf5(h5f, self.write_in_batch)
                total_examples += int(h5f.attrs["n_examples"])
        else:
            n_examples = df_chunk.append_to_hdf5(
                self.output_dir, self.total_chunks, process_idx
            )
            total_examples += n_examples

        with open(process_checkpoint_path, "w") as file:
            file.write(
                f"{df_chunk.file_idx}, {df_chunk.end_doc_idx + 1}, {process_chunk_number+1}"
            )
        process_chunk_number += 1
        _count_chunk()

    if isinstance(self.token_generator, LMDataTokenGenerator):
        if self.token_generator.prefix != []:
            self.prefix_queue.put(self.token_generator.prefix)
        self.prefix_queue.put(None)

    cum_data_stats["examples"] = total_examples
    self.stats_queue.put(cum_data_stats)
    self.stats_queue.put(None)
def file_split_process_dataset(self) -> dict:
    """
    Process the dataset by splitting files across multiple processes.

    Input files are dealt round-robin to `self.processes` workers, each
    running `process_files`. The parent first drains the statistics queue
    (one sentinel per worker means every worker has finished its files and
    written its last checkpoint), then writes the leftover prefix for LM-style
    generators, and only then joins the workers.

    Returns:
        dict: Accumulated data statistics, including `examples`,
        `average_chars_per_sequence` and `average_bytes_per_sequence`.
    """
    start_time = time.time()
    self.tokenize_process_num = self.processes
    total_size = self.calculate_total_size()
    self.total_chunks = self.calculate_total_chunks(total_size)
    final_data_stats = {
        "discarded": 0,
        "processed": 0,
        "successful": 0,
        "raw_chars_count": 0,
        "raw_bytes_count": 0,
        "num_pad_tokens": 0,
        "num_masked_tokens": 0,
        "loss_valid_tokens": 0,
        "num_tokens": 0,
        "normalized_chars_count": 0,
        "normalized_bytes_count": 0,
        "examples": 0,
    }

    process_file_lists = [[] for _ in range(self.processes)]
    process_checkpoints = self.file_split_read_checkpoint()
    hdf5_files_written = sum(
        checkpoint_args[-1] for checkpoint_args in process_checkpoints
    )
    for idx, file in enumerate(self.input_files):
        process_file_lists[idx % self.processes].append(file)

    # Setup the shared progress counter
    progress_counter = multiprocessing.Value("i", hdf5_files_written)

    # Using tqdm for progress bar
    with tqdm(
        total=self.total_chunks, desc="Processing", dynamic_ncols=True
    ) as pbar:
        # To signal the progress update thread when to stop
        stop_event = Event()
        progress_thread = Thread(
            target=update_progress,
            args=(
                pbar,
                progress_counter,
                self.total_chunks,
                start_time,
                stop_event,
            ),
        )
        progress_thread.start()
        try:
            processes = [
                Process(
                    target=self.process_files,
                    args=(
                        files,
                        pid,
                        process_checkpoints[pid],
                        progress_counter,
                    ),
                )
                for pid, files in enumerate(process_file_lists)
            ]
            for p in processes:
                p.start()

            # Drain the statistics queue before joining: a process that has
            # put items on a queue may not exit until they are consumed.
            sentinels_received = 0
            while sentinels_received < self.processes:
                data_stats = self.stats_queue.get()
                if data_stats is None:
                    sentinels_received += 1
                    continue
                for key, value in data_stats.items():
                    final_data_stats[key] = (
                        final_data_stats.get(key, 0) + value
                    )

            ## write remaining prefix from all processes
            if isinstance(self.token_generator, LMDataTokenGenerator):
                ## Set prefix df chunk file index to be same as last chunks file index
                ## and it's starting doc be one greater then previous chunk's ending doc index.
                # Read the last worker's checkpoint now that it has finished,
                # and regardless of `resume_from_checkpoint`, so the prefix
                # file name continues after that worker's last chunk.
                root, _ = os.path.splitext(self.checkpoint_path)
                last_checkpoint_path = (
                    root + f'_process_{self.processes - 1}.txt'
                )
                try:
                    with open(last_checkpoint_path, "r") as file:
                        (
                            prefix_df_chunk_file_idx,
                            prefix_df_chunk_start_doc_idx,
                            prefix_df_chunk_idx,
                        ) = [int(i) for i in file.read().split(", ")]
                except (OSError, ValueError):
                    # No usable checkpoint: fall back to the position the
                    # last worker started from.
                    (
                        prefix_df_chunk_file_idx,
                        prefix_df_chunk_start_doc_idx,
                        prefix_df_chunk_idx,
                    ) = process_checkpoints[-1]

                output_file_name = os.path.join(
                    self.output_dir,
                    f"output_chunk_{self.processes-1}_{prefix_df_chunk_file_idx}_{prefix_df_chunk_start_doc_idx}_{prefix_df_chunk_idx}.h5",
                )
                prefix_examples, stats = self.write_remaining_prefix(
                    output_file_name
                )
                for key in (
                    "loss_valid_tokens",
                    "num_tokens",
                    "num_pad_tokens",
                    "num_masked_tokens",
                ):
                    final_data_stats[key] += stats[key]
                final_data_stats["examples"] += prefix_examples

            # Both queues are drained, so the workers can be joined.
            for p in processes:
                p.join()
        finally:
            stop_event.set()  # Signal the progress update thread to stop
            progress_thread.join()  # Wait for the progress update thread to finish

    elapsed_time = time.time() - start_time
    self.logger.info(
        f"The process_dataset function took {elapsed_time:.2f} seconds to complete."
    )

    final_data_stats["average_chars_per_sequence"] = 0
    final_data_stats["average_bytes_per_sequence"] = 0
    try:
        final_data_stats["average_chars_per_sequence"] = math.ceil(
            final_data_stats["raw_chars_count"] / final_data_stats["examples"]
        )
        final_data_stats["average_bytes_per_sequence"] = math.ceil(
            final_data_stats["raw_bytes_count"] / final_data_stats["examples"]
        )
    except ZeroDivisionError:
        # Handle the division by zero error
        self.logger.info("No output hdf5 files were created .")

    return final_data_stats
def reader_process(self, checkpoint_args: Tuple) -> None:
    """
    Stream chunks from the input files into the tokenizer queues.

    Chunks are handed out round-robin, starting with the first tokenizer
    queue, and each is tagged with a running chunk number that continues from
    the checkpointed hdf5 index. A `None` sentinel is placed on every queue
    once the input is exhausted.

    Args:
        checkpoint_args (Tuple[int, int, int]): File index, doc start index, and hdf5 index.
    """
    reader = Reader(
        self.input_files,
        max_chunk_size=self.max_chunk_size * 1024,
        logger=self.logger,
        keys=self.data_keys,
    )
    file_idx, doc_start_idx, hdf5_idx = checkpoint_args

    chunk_stream = reader.stream_data((file_idx, doc_start_idx))
    for offset, df_chunk in enumerate(chunk_stream):
        # `offset` counts chunks read in this run, so the first chunk always
        # goes to the first tokenizer queue.
        target_queue = self.tokenizer_queues[
            offset % (self.tokenize_process_num)
        ]
        # Send chunk number with df_chunk
        target_queue.put((hdf5_idx + offset, df_chunk))

    # Place sentinel values in each tokenizer queue to indicate end of reading
    for tokenizer_queue in self.tokenizer_queues:
        tokenizer_queue.put(None)
def tokenizer_process(self, idx: int) -> None:
    """
    Tokenize chunks from one tokenizer queue and forward them to the writer.

    Runs until a `None` sentinel is read. On the sentinel, an LM-style token
    generator's leftover prefix (if any) is pushed to the prefix queue followed
    by a `None`, and a `None` is forwarded to the matching writer queue.

    Args:
        idx (int): Index of the tokenizer queue to read from and of the writer
            queue to forward tokenized chunks to.
    """
    source_queue = self.tokenizer_queues[idx]
    sink_queue = self.writer_queues[idx]

    for chunk_data in iter(source_queue.get, None):
        # Unpack chunk number and data frame chunk
        chunk_number, df_chunk = chunk_data
        df_chunk.tokenize(self.token_generator, self.pad_id)
        # Forward the chunk number along with the df_chunk to the writer
        sink_queue.put((chunk_number, df_chunk))

    # Sentinel received: hand over any leftover prefix, then signal the writer.
    if isinstance(self.token_generator, LMDataTokenGenerator):
        if self.token_generator.prefix != []:
            self.prefix_queue.put(self.token_generator.prefix)
        self.prefix_queue.put(None)
    sink_queue.put(None)
def writer_process(self, progress_counter: "Value[int]",) -> None:
    """
    Process that writes tokenized data to HDF5 format.

    Reads the writer queues round-robin, verifies and writes each chunk,
    rewrites the checkpoint file after every written chunk and, once one
    sentinel per tokenizer process has been received, writes the leftover
    prefix (LM-style generators only). The accumulated statistics are then put
    on `self.stats_queue`, followed by a `None` sentinel.

    Args:
        progress_counter (Value[int]): Shared counter tracking number of processed chunks.
    """
    cum_data_stats = {
        "discarded": 0,
        "processed": 0,
        "successful": 0,
        "raw_chars_count": 0,
        "raw_bytes_count": 0,
        "num_pad_tokens": 0,
        "num_masked_tokens": 0,
        "loss_valid_tokens": 0,
        "num_tokens": 0,
        "normalized_chars_count": 0,
        "normalized_bytes_count": 0,
        "examples": 0,
    }
    total_examples = 0
    sentinels_received = 0
    tokenizer_idx = 0
    # Last (file_idx, next_doc_idx, next_chunk_number) written to the
    # checkpoint file by this run; None until a chunk has been written.
    last_checkpoint = None

    while True:
        chunk_data = self.writer_queues[tokenizer_idx].get()
        tokenizer_idx = (tokenizer_idx + 1) % (self.tokenize_process_num)

        if chunk_data is None:
            sentinels_received += 1
            if sentinels_received < self.tokenize_process_num:
                continue

            ## write remaining prefix from all processes
            if isinstance(self.token_generator, LMDataTokenGenerator,):
                # Name the prefix file after the position following the last
                # written chunk. The checkpoint reader returns (0, 0, 0) unless
                # resuming, which matches the name pattern of a first chunk,
                # so it is only used when nothing was written in this run.
                if last_checkpoint is None:
                    last_checkpoint = self.task_split_read_checkpoint()
                (
                    prefix_df_chunk_file_idx,
                    prefix_df_chunk_start_doc_idx,
                    prefix_df_chunk_idx,
                ) = last_checkpoint
                output_file_name = os.path.join(
                    self.output_dir,
                    f"output_chunk_{0}_{prefix_df_chunk_file_idx}_{prefix_df_chunk_start_doc_idx}_{prefix_df_chunk_idx}.h5",
                )
                prefix_examples, stats = self.write_remaining_prefix(
                    output_file_name
                )
                for key in (
                    "loss_valid_tokens",
                    "num_tokens",
                    "num_pad_tokens",
                    "num_masked_tokens",
                ):
                    cum_data_stats[key] += stats[key]
                total_examples += prefix_examples
            break

        chunk_number, df_chunk = chunk_data
        output_file_name = os.path.join(
            self.output_dir,
            f"output_chunk_{0}_{df_chunk.file_idx}_{df_chunk.start_doc_idx}_{chunk_number}.h5",
        )
        cum_data_stats = {
            key: cum_data_stats[key] + df_chunk.data_stats[key]
            for key in df_chunk.data_stats
        }

        if df_chunk.tokenized_data == []:
            # Nothing to write for this chunk, but it still counts as done.
            progress_counter.value += 1
            continue

        self.verify_hdf5_files(chunk_data)
        if not self.shuffle:
            with h5py.File(output_file_name, "w") as h5f:
                df_chunk.save_to_hdf5(h5f, self.write_in_batch)
                total_examples += int(h5f.attrs["n_examples"])
        else:
            n_examples = df_chunk.append_to_hdf5(
                self.output_dir, self.total_chunks, 0
            )
            total_examples += n_examples

        last_checkpoint = (
            df_chunk.file_idx,
            df_chunk.end_doc_idx + 1,
            chunk_number + 1,
        )
        with open(self.checkpoint_path, "w") as file:
            file.write(
                f"{df_chunk.file_idx}, {df_chunk.end_doc_idx + 1}, {chunk_number+1}"
            )
        progress_counter.value += 1

    cum_data_stats["examples"] = total_examples
    self.stats_queue.put(cum_data_stats)
    self.stats_queue.put(None)
def task_split_process_dataset(self) -> dict:
    """
    Split the dataset processing tasks across multiple processes.

    The calling process acts as the reader; `self.tokenize_process_num`
    tokenizer processes and one writer process are spawned. After they finish,
    the statistics published by the writer are collected.

    Returns:
        dict: Accumulated data statistics, including `examples`,
        `average_chars_per_sequence` and `average_bytes_per_sequence`.
    """
    start_time = time.time()
    total_size = self.calculate_total_size()
    self.logger.info(f"Total size of dataset: {total_size} bytes")
    self.total_chunks = self.calculate_total_chunks(total_size)
    self.logger.info(f"Total chunks to process: {self.total_chunks}")

    final_data_stats = {
        "discarded": 0,
        "processed": 0,
        "successful": 0,
        "raw_chars_count": 0,
        "raw_bytes_count": 0,
        "num_pad_tokens": 0,
        "num_masked_tokens": 0,
        "loss_valid_tokens": 0,
        "num_tokens": 0,
        "normalized_chars_count": 0,
        "normalized_bytes_count": 0,
        "examples": 0,
    }

    (
        file_index,
        doc_start_index,
        hdf5_written,
    ) = self.task_split_read_checkpoint()
    checkpoint_args = (file_index, doc_start_index, hdf5_written)
    progress_counter = multiprocessing.Value("i", hdf5_written)

    # Log process information
    self.logger.info(f"Total processes: {self.processes}")
    self.logger.info("Reader processes: 1")
    self.logger.info(f"Tokenizer processes: {self.processes - 2}")
    self.logger.info("Writer processes: 1")

    # Initialize and start processes
    tokenizers = [
        Process(target=self.tokenizer_process, args=(idx,),)
        for idx in range(self.tokenize_process_num)
    ]
    writer = Process(target=self.writer_process, args=(progress_counter,),)
    for t in tokenizers:
        t.start()
    writer.start()

    # Use tqdm for the progress bar
    with tqdm(
        total=self.total_chunks, desc="Processing", dynamic_ncols=True
    ) as pbar:
        stop_event = Event()
        progress_thread = Thread(
            target=update_progress,
            args=(
                pbar,
                progress_counter,
                self.total_chunks,
                start_time,
                stop_event,
            ),
        )
        progress_thread.start()
        try:
            # Reading happens in this process; the children consume the queues.
            self.reader_process(checkpoint_args)
            for t in tokenizers:
                t.join()
            writer.join()
        finally:
            # Always stop the progress thread, even if reading failed, so it
            # cannot keep polling forever.
            stop_event.set()
            progress_thread.join()

    elapsed_time = time.time() - start_time
    self.logger.info(
        f"The process_dataset function took {elapsed_time:.2f} seconds to complete."
    )

    # There is a single writer, so a single sentinel ends the collection.
    while True:
        data_stats = self.stats_queue.get()
        if data_stats is None:
            break
        final_data_stats = {
            key: final_data_stats[key] + data_stats[key] for key in data_stats
        }

    final_data_stats["average_chars_per_sequence"] = 0
    final_data_stats["average_bytes_per_sequence"] = 0
    try:
        final_data_stats["average_chars_per_sequence"] = math.ceil(
            final_data_stats["raw_chars_count"] / final_data_stats["examples"]
        )
        final_data_stats["average_bytes_per_sequence"] = math.ceil(
            final_data_stats["raw_bytes_count"] / final_data_stats["examples"]
        )
    except ZeroDivisionError:
        # Handle the division by zero error
        self.logger.info("No output hdf5 files were created .")

    return final_data_stats
def process_dataset(self) -> dict:
    """
    Run the preprocessing pipeline and return its statistics.

    With fewer than three processes the file-split method is used; otherwise
    the task-split method is used.
    """
    if self.processes >= 3:
        pipeline = self.task_split_process_dataset
    else:
        pipeline = self.file_split_process_dataset
    return pipeline()
def get_vocab_size(self):
    """
    Get tokenizer vocabulary size

    Returns:
        vocab_size (int): Number of entries in the tokenizer vocabulary.

    Raises:
        NotImplementedError: If `self.tokenizer_type` is not a supported value.
    """
    if self.tokenizer_type == "gpt2tokenizer":
        return len(self.tokenizer.encoder)
    if self.tokenizer_type == "neoxtokenizer":
        return self.tokenizer.tokenizer.get_vocab_size()
    if self.tokenizer_type == "huggingfacetokenizer":
        # `vocab_size` on Hugging Face tokenizers excludes added tokens, while
        # `len(tokenizer)` includes them; tokens added through `add_token`
        # must count as valid ids.
        return len(self.tokenizer)
    raise NotImplementedError(
        f"{self.tokenizer_type} is not implemented. Acceptable values are: `gpt2tokenizer`, `neoxtokenizer`, `huggingfacetokenizer`."
    )