Source code for cerebras.modelzoo.data_preparation.nlp.slimpajama.preprocessing.normalize_text

# Copyright 2022 Cerebras Systems.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Script that normalizes SlimPajama text records: it fixes mojibake and applies
Unicode NFC normalization via ftfy.
"""
import argparse
import io
import json
from itertools import repeat
from math import ceil
from multiprocessing import Pool, cpu_count
from os import listdir, makedirs, path

import ftfy
import jsonlines
import zstandard
from tqdm import tqdm


def parse_args():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument(
        "-d",
        "--data_dir",
        type=str,
        required=True,
        help="Path to directory containing data files.",
    )
    parser.add_argument(
        "-t",
        "--target_dir",
        type=str,
        help="Path to directory where normalized data files will be stored.",
    )
    parser.add_argument(
        "--zst",
        action="store_true",
        help="Files with zst compression.",
    )
    parser.add_argument(
        "--idx",
        type=int,
        default=-1,
        help="Index for splitting files in a directory for multiple parallel runs.",
    )
    return parser.parse_args()

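# Example CLI invocation (an illustrative sketch, not part of the module: the
# script name and paths are placeholders; --zst and --idx are only needed for
# zstd-compressed inputs and sharded parallel runs, respectively):
#
#   python normalize_text.py \
#       --data_dir /path/to/raw_jsonl \
#       --target_dir /path/to/normalized_jsonl \
#       --zst \
#       --idx 0
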
def recreate_dataset(params):
    # Each worker receives a list of files, the parsed CLI args, and its index.
    files, args, process_no = params
    pbar = tqdm(
        desc=f"Parsed 0 input files. Files written ",
        disable=False,
    )
    for _file in files:
        file_path = path.join(args.data_dir, _file)
        target_path = path.join(args.target_dir, _file)
        if args.zst:
            # Stream-decompress the .zst input, normalize each record's text,
            # and re-compress to the target path.
            with open(file_path, 'rb') as fh:
                dcctx = zstandard.ZstdDecompressor()
                reader = io.BufferedReader(dcctx.stream_reader(fh))
                rdr = jsonlines.Reader(reader)
                with open(target_path, "wb") as f:
                    cctx = zstandard.ZstdCompressor()
                    wrt = cctx.stream_writer(f)
                    writer = io.BufferedWriter(wrt)
                    for ob in rdr:
                        doc = ob["text"]
                        doc = ftfy.fix_text(doc, normalization="NFC")
                        record = {
                            "text": doc,
                            "pred_label": ob["pred_label"],
                            "pred_label_prob": ob["pred_label_prob"],
                            "wiki_prob": ob["wiki_prob"],
                            "source": ob["source"],
                        }
                        s = json.dumps(record) + "\n"
                        writer.write(s.encode("utf-8"))
                    writer.flush()
                    wrt.flush(zstandard.FLUSH_FRAME)
        else:
            # Plain .jsonl input: normalize the text and copy the metadata through.
            with jsonlines.open(file_path) as rdr:
                with open(target_path, "w") as f:
                    for ob in rdr:
                        doc = ob["text"]
                        doc = ftfy.fix_text(doc, normalization="NFC")
                        record = {"text": doc, "meta": ob["meta"]}
                        f.write(json.dumps(record) + "\n")
    return True

def normalize_text(args):
    makedirs(args.target_dir, exist_ok=True)
    files = sorted(listdir(args.data_dir))
    files = list(filter(lambda file_: '.jsonl' in file_, files))
    # Optionally process only a 64-file slice so multiple runs can work in parallel.
    if args.idx != -1:
        files = files[args.idx * 64 : (args.idx + 1) * 64]

    # Split the file list into one chunk per worker process.
    n_proc = cpu_count()
    n_chunks = ceil(len(files) / n_proc)
    remain = len(files) % n_proc
    if n_chunks == 1 and remain:
        n_proc = remain
        print(f"resetting to {n_proc} for number of processes")
    files = [files[i : i + n_chunks] for i in range(0, len(files), n_chunks)]

    with Pool(processes=n_proc) as pool:
        pbar = tqdm(
            pool.imap(
                recreate_dataset,
                zip(
                    files,
                    repeat(args),
                    range(len(files)),
                ),
            ),
            total=len(files),
        )
        for test in pbar:
            pbar.update()
            if test:
                continue

if __name__ == "__main__":
    args = parse_args()
    normalize_text(args)
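
For reference, a minimal sketch of driving normalize_text programmatically instead of via the CLI. It assumes the package is installed so the module is importable under the path shown in the title above, and that data_dir contains .jsonl files with the fields read by recreate_dataset; the placeholder paths and the Namespace fields simply mirror the arguments defined in parse_args.

    from argparse import Namespace

    from cerebras.modelzoo.data_preparation.nlp.slimpajama.preprocessing.normalize_text import (
        normalize_text,
    )

    # Mirror the CLI arguments defined in parse_args(); paths are placeholders.
    args = Namespace(
        data_dir="/path/to/raw_jsonl",      # directory of input .jsonl(.zst) files
        target_dir="/path/to/normalized",   # output directory (created if missing)
        zst=False,                          # True if inputs are zstd-compressed
        idx=-1,                             # -1 processes all files; >=0 takes a 64-file slice
    )

    # Guard is needed because normalize_text spawns a multiprocessing Pool.
    if __name__ == "__main__":
        normalize_text(args)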