# Copyright 2022 Cerebras Systems.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse

def add_common_parser_args(parser):
    """Register the options shared by every preprocessing mode on ``parser``.

    Args:
        parser: An ``argparse.ArgumentParser`` (or sub-parser) to extend.

    Returns:
        The same ``parser`` object, with the common options added.
        ``--metadata_files`` and ``--vocab_file`` are required; every other
        option has a default.
    """
    # --- Input location and document layout -------------------------------
    parser.add_argument(
        "--metadata_files",
        type=str,
        required=True,
        nargs='+',
        help="path to text file containing a list of file names "
        "corresponding to the raw input documents to be "
        "processed and stored; can handle multiple metadata files "
        "separated by space",
    )
    parser.add_argument(
        "--multiple_docs_in_single_file",
        action="store_true",
        help="Pass this flag when a single text file contains multiple"
        " documents separated by <multiple_docs_separator>",
    )
    parser.add_argument(
        "--multiple_docs_separator",
        type=str,
        default="\n",
        help="String which separates multiple documents in a single text file. "
        "If newline character, pass \\n. "
        "There can only be one separator string for all the documents.",
    )
    parser.add_argument(
        "--single_sentence_per_line",
        action="store_true",
        help="Pass this flag when the document is already split into sentences with "
        "one sentence in each line and there is no requirement for "
        "further sentence segmentation of a document",
    )
    parser.add_argument(
        '--input_files_prefix',
        type=str,
        default="",
        help='prefix to be added to paths of the input files. '
        'For example, can be a directory where raw data is stored '
        'if the paths are relative',
    )
    parser.add_argument(
        "--vocab_file", type=str, required=True, help="path to vocabulary"
    )
    parser.add_argument(
        "--split_num",
        type=int,
        default=1000,
        help="number of input files to read at a given time for processing. "
        "Defaults to 1000.",
    )

    # --- Tokenization and sequence construction ---------------------------
    parser.add_argument(
        "--do_lower_case",
        action="store_true",
        help="pass this flag to lower case the input text; should be "
        "True for uncased models and False for cased models",
    )
    parser.add_argument(
        "--max_seq_length",
        type=int,
        default=128,
        help="maximum sequence length",
    )
    parser.add_argument(
        "--short_seq_prob",
        type=float,
        default=0.1,
        help="probability of creating sequences which are shorter "
        "than the maximum sequence length",
    )
    parser.add_argument(
        "--min_short_seq_length",
        type=int,
        default=None,
        help="The minimum number of tokens to be present in an example "
        "if short sequence probability > 0. "
        "If None, defaults to 2. "
        "Allowed values are [2, max_seq_length - 3)",
    )

    # --- Masking -----------------------------------------------------------
    parser.add_argument(
        "--masked_lm_prob",
        type=float,
        default=0.15,
        help="masked LM probability",
    )
    parser.add_argument(
        "--max_predictions_per_seq",
        type=int,
        default=20,
        help="maximum number of masked LM predictions per sequence",
    )
    parser.add_argument(
        "--spacy_model",
        type=str,
        default="en_core_web_sm",
        help="spaCy model to load, i.e. shortcut link, package name or path.",
    )
    parser.add_argument(
        "--mask_whole_word",
        action="store_true",
        help="whether to use whole word masking rather than per-WordPiece "
        "masking.",
    )

    # --- Output and execution ---------------------------------------------
    parser.add_argument(
        "--output_dir",
        type=str,
        default=None,
        help="directory where HDF5 files will be stored.",
    )
    parser.add_argument(
        "--num_output_files",
        type=int,
        default=10,
        help="number of output files in total i.e each process writes "
        "num_output_files//num_processes number of files. "
        "Defaults to 10.",
    )
    parser.add_argument(
        "--name",
        type=str,
        default="preprocessed_data",
        help="name of the dataset; i.e. prefix to use for hdf5 file names. "
        "Defaults to 'preprocessed_data'.",
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=0,
        help="random seed. Defaults to 0.",
    )
    parser.add_argument(
        "--num_processes",
        type=int,
        default=0,
        help="Number of parallel processes to use, defaults to cpu count",
    )
    return parser
def add_mlm_only_specific_args(parser):
    """Register the common options plus the MLM-only options on ``parser``.

    Args:
        parser: An ``argparse.ArgumentParser`` (or sub-parser) to extend.

    Returns:
        The parser returned by ``add_common_parser_args``, extended with
        ``--overlap_size``, ``--buffer_size``,
        ``--allow_cross_document_examples``, ``--document_separator_token``
        and the hidden ``--__mode`` switch (default ``"mlm_only"``).
    """
    parser = add_common_parser_args(parser)
    parser.add_argument(
        "--overlap_size",
        type=int,
        default=None,
        help="overlap size for generating sequences from buffered data for "
        "mlm only sequences. "
        "Defaults to None, which sets the overlap to max_seq_len/4.",
    )
    parser.add_argument(
        "--buffer_size",
        type=int,
        # argparse only runs `type` on string defaults, so the default must
        # already be an int; a float literal such as 1e6 would be stored as-is.
        default=1000000,
        help="buffer_size number of elements to be processed at a time",
    )
    parser.add_argument(
        "--allow_cross_document_examples",
        action="store_true",
        help="Pass this flag when examples can cross document boundaries",
    )
    parser.add_argument(
        "--document_separator_token",
        type=str,
        default="[SEP]",
        help="If examples can span documents, "
        "use this separator to indicate separate tokens of current and next document",
    )
    # This is a suppressed argument that will not show in --help.
    # Users MUST NOT specify this arg. It is used to switch between the two modes
    # defined to generate HDF5 files.
    parser.add_argument(
        '--__mode', default="mlm_only", help=argparse.SUPPRESS, required=False
    )
    return parser
def add_mlm_nsp_specific_args(parser):
    """Register the common options plus the MLM+NSP mode marker on ``parser``.

    Args:
        parser: An ``argparse.ArgumentParser`` (or sub-parser) to extend.

    Returns:
        The parser returned by ``add_common_parser_args``, with the hidden
        ``--__mode`` option added (default ``"mlm_nsp"``).
    """
    nsp_parser = add_common_parser_args(parser)

    # Hidden option: kept out of --help and not meant to be set by users.
    # Its default tells the rest of the pipeline which of the two HDF5
    # generation modes was selected.
    hidden_mode_options = {
        "default": "mlm_nsp",
        "help": argparse.SUPPRESS,
        "required": False,
    }
    nsp_parser.add_argument('--__mode', **hidden_mode_options)

    return nsp_parser
def create_arg_parser():
    """Build the top-level parser with the ``mlm_only`` and ``mlm_nsp`` sub-commands.

    Returns:
        An ``argparse.ArgumentParser`` using ``ArgumentDefaultsHelpFormatter``
        whose sub-commands have been populated by
        ``add_mlm_only_specific_args`` and ``add_mlm_nsp_specific_args``.
    """
    root = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    commands = root.add_subparsers()

    # Sub-command name -> function that fills in its options; registered in
    # this order.
    populators = (
        ("mlm_only", add_mlm_only_specific_args),
        ("mlm_nsp", add_mlm_nsp_specific_args),
    )
    for command_name, populate in populators:
        populate(commands.add_parser(command_name))

    return root
def get_parser_args():
    """Build the argument parser and parse the process's command line.

    Returns:
        The ``argparse.Namespace`` produced by calling ``parse_args()`` with
        no explicit argument list on the parser from ``create_arg_parser``.
    """
    return create_arg_parser().parse_args()