modelzoo.transformers.data_processing.scripts.chunk_preprocessing.data_reader.Reader#

class modelzoo.transformers.data_processing.scripts.chunk_preprocessing.data_reader.Reader[source]#

Bases: object

Initialize the Reader instance.

Parameters

file_list (List[str]) – List of file paths to be read.
max_chunk_size (int) – Maximum chunk size for accumulated data.
logger (logging.Logger) – Logger instance passed to the reader.
keys (Optional[List[str]]) – List of keys to filter data. Defaults to ['text'].

Methods

`accumulate_and_yield`	Accumulate data and yield in chunks.
`handle_jsonl`	Handle JSONL data and yield processed entries.
`read_jsongz`	Read and process gzipped JSON file.
`read_jsonl`	Read and process JSONL file.
`read_jsonl_tar`	Read and process TAR archive containing ZST compressed JSONL files.
`read_jsonl_zst`	Read and process ZST compressed JSONL file.
`read_parquet`	Read and process Parquet file.
`read_txt`	Read and process text file.
`stream_data`	Stream and process data from multiple file formats.

__init__(file_list: List[str], max_chunk_size: int, logger: logging.Logger, keys: Optional[List[str]] = None) → None[source]#

Initialize the Reader instance.

Parameters

file_list (List[str]) – List of file paths to be read.
max_chunk_size (int) – Maximum chunk size for accumulated data.
logger (logging.Logger) – Logger instance passed to the reader.
keys (Optional[List[str]]) – List of keys to filter data. Defaults to ['text'].

accumulate_and_yield(data_gen: Iterator[Dict[str, Any]], file_idx) → Iterator[Any][source]#

Accumulate data and yield in chunks.

Parameters

data_gen (Iterator[Dict[str, Any]]) – Generator yielding data entries.
file_idx (int) – Current file index.

Returns

Yields accumulated data chunks.

Return type

Iterator[Any]

handle_jsonl(jsonl_reader: Any, start_doc_idx: int, get_meta: bool, autojoin_paragraphs: bool, para_joiner: str) → Iterator[Dict[str, Any]][source]#

Handle JSONL data and yield processed entries.

Parameters

jsonl_reader (Any) – The JSONL reader object.
start_doc_idx (int) – The current document starting index.
get_meta (bool) – Flag to determine if metadata should be extracted.
autojoin_paragraphs (bool) – Flag to auto join paragraphs.
para_joiner (str) – Paragraph joiner string.

Returns

Yields processed data entries.

Return type

Iterator[Dict[str, Any]]

read_jsongz(file: str, checkpoint_args: tuple) → Iterator[Any][source]#

Read and process gzipped JSON file.

Parameters

file (str) – Path to the .json.gz file.
checkpoint_args (tuple) – Contains the current file starting index and the current document starting index.

Returns

Yields processed data entries.

Return type

Iterator[Any]

read_jsonl(file: str, checkpoint_args: tuple, get_meta: bool = False, autojoin_paragraphs: bool = True, para_joiner: str = '\n\n') → Iterator[Any][source]#

Read and process JSONL file.

Parameters

file (str) – Path to the .jsonl file.
checkpoint_args (tuple) – Contains the current file starting index and the current document starting index.
get_meta (bool) – Flag to determine if metadata should be extracted.
autojoin_paragraphs (bool) – Flag to auto join paragraphs.
para_joiner (str) – Paragraph joiner string.

Returns

Yields processed data entries.

Return type

Iterator[Any]

read_jsonl_tar(file: str, checkpoint_args: tuple, get_meta: bool = False, autojoin_paragraphs: bool = True, para_joiner: str = '\n\n') → Iterator[Any][source]#

Read and process TAR archive containing ZST compressed JSONL files.

Parameters

file (str) – Path to the .jsonl.zst.tar file.
checkpoint_args (tuple) – Contains the current file starting index and the current document starting index.
get_meta (bool) – Flag to determine if metadata should be extracted.
autojoin_paragraphs (bool) – Flag to auto join paragraphs.
para_joiner (str) – Paragraph joiner string.

Returns

Yields processed data entries.

Return type

Iterator[Any]

read_jsonl_zst(file: str, checkpoint_args: tuple, get_meta: bool = False, autojoin_paragraphs: bool = True, para_joiner: str = '\n\n') → Iterator[Any][source]#

Read and process ZST compressed JSONL file.

Parameters

file (str) – Path to the .jsonl.zst file.
checkpoint_args (tuple) – Contains the current file starting index and the current document starting index.
get_meta (bool) – Flag to determine if metadata should be extracted.
autojoin_paragraphs (bool) – Flag to auto join paragraphs.
para_joiner (str) – Paragraph joiner string.

Returns

Yields processed data entries.

Return type

Iterator[Any]

read_parquet(file: str, checkpoint_args: tuple) → Iterator[Any][source]#

Read and process Parquet file.

Parameters

file (str) – Path to the .parquet file.
checkpoint_args (tuple) – Contains the current file starting index and the current document starting index.

Returns

Yields processed data rows.

Return type

Iterator[Any]

read_txt(file: str, checkpoint_args: tuple) → Iterator[Any][source]#

Read and process text file.

Parameters

file (str) – Path to the .txt file.
checkpoint_args (tuple) – Contains the current file starting index and the current document starting index.

Returns

Yields processed data lines.

Return type

Iterator[Any]

stream_data(checkpoint_args, get_meta: bool = False) → Iterator[Any][source]#

Stream and process data from multiple file formats.

Parameters

get_meta (bool) – Flag to determine if metadata should be extracted.
checkpoint_args (tuple) – Contains the current file starting index and the current document starting index.

Returns

Yields processed data chunks.

Return type

Iterator[Any]

modelzoo.transformers.data_processing.scripts.chunk_preprocessing.data_reader.DataFrame

modelzoo.transformers.data_processing.scripts.chunk_preprocessing.fim_data_token_generator