# Source code for cerebras.modelzoo.data_preparation.nlp.transformer.create_meta

# Copyright 2022 Cerebras Systems.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Create meta file for transformer in pytorch.
Stores meta file in source directory (`src_dir`).
"""
import argparse
import os
from subprocess import run


def _count_newlines(path, chunk_size=1 << 20):
    """Return the number of newline characters in the file at `path`.

    This matches the semantics of `wc -l`: a final line that is not
    terminated by a newline is not counted.
    """
    total = 0
    with open(path, "rb") as fin:
        # Read fixed-size binary chunks so large files are never held in
        # memory at once and no text decoding is needed.
        for chunk in iter(lambda: fin.read(chunk_size), b""):
            total += chunk.count(b"\n")
    return total


def _target_file_name(src_file_name):
    """Return `src_file_name` with its last "en" replaced by "de".

    Only the last occurrence is replaced so that names containing "en"
    more than once (e.g. `sentences.en`) keep their stem intact. For
    names with a single "en" the result is the same as splitting on "en".

    Raises:
        ValueError: if `src_file_name` does not contain "en".
    """
    head, marker, tail = src_file_name.rpartition("en")
    if not marker:
        raise ValueError(
            f"Cannot derive target file name: '{src_file_name}' "
            "does not contain 'en'."
        )
    return f"{head}de{tail}"


def main():
    """Write `meta.dat` into `--src_dir` and print the total example count.

    Every regular file in `--src_dir` (except a `meta.dat` left over from a
    previous run) contributes one line of the form
    `<src_dir>/<name> <tgt_dir>/<name with "en" -> "de"> <num_lines>`.
    Lines are written in sorted file-name order, separated by newlines,
    with no trailing newline after the last one.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument(
        "--src_dir",
        type=str,
        required=True,
        help="Path to the original source language dataset.",
    )
    parser.add_argument(
        "--tgt_dir",
        type=str,
        required=True,
        help="Path to the translated target language dataset.",
    )
    args = parser.parse_args()

    meta_file_name = "meta.dat"
    lines = []
    total_num_examples = 0
    for file_name in sorted(os.listdir(args.src_dir)):
        # The meta file lives in `src_dir` itself; skip it so that running
        # the script again does not treat it as a dataset file.
        if file_name == meta_file_name:
            continue
        src_path = os.path.join(args.src_dir, file_name)
        # Sub-directories and other non-regular entries hold no examples.
        if not os.path.isfile(src_path):
            continue
        num_examples = _count_newlines(src_path)
        total_num_examples += num_examples
        lines.append(
            f"{args.src_dir}/{file_name} "
            f"{args.tgt_dir}/{_target_file_name(file_name)} {num_examples}"
        )

    with open(os.path.join(args.src_dir, meta_file_name), "w") as fout:
        fout.write("\n".join(lines))

    print(f"Number of examples: {total_num_examples}.")


# Run the meta file generation only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()