[misc] update pre-commit and run all files (#4752)

* [misc] update pre-commit

* [misc] run pre-commit

* [misc] remove useless configuration files

* [misc] ignore cuda for clang-format
Author: Hongxin Liu
Date: 2023-09-19 14:20:26 +08:00
Committed by: GitHub
Parent: 3c6b831c26
Commit: 079bf3cb26
1268 changed files with 50037 additions and 38444 deletions

@@ -1,7 +1,6 @@
 import argparse
 import multiprocessing
 import os
 import socket
 import time
 from random import shuffle
@@ -29,8 +28,7 @@ def get_raw_instance(document, max_sequence_length=512):
     curr_seq = []
     sz_idx = 0
     while sz_idx < len(sizes):
-        if len(curr_seq) + sizes[sz_idx] <= max_sequence_length_allowed:  # or len(curr_seq)==0:
+        if len(curr_seq) + sizes[sz_idx] <= max_sequence_length_allowed:  # or len(curr_seq)==0:
             curr_seq += document[sz_idx]
             sz_idx += 1
         elif sizes[sz_idx] >= max_sequence_length_allowed:
@@ -43,7 +41,7 @@ def get_raw_instance(document, max_sequence_length=512):
             result_list.append(curr_seq)
             curr_seq = []

-    if len(curr_seq) > max_sequence_length_allowed / 2:  # /2
+    if len(curr_seq) > max_sequence_length_allowed / 2:  # /2
         result_list.append(curr_seq)

     # num_instance=int(len(big_list)/max_sequence_length_allowed)+1
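
(The three hunks above only touch fragments of get_raw_instance. For orientation, here is a minimal standalone sketch of the greedy packing loop they reformat, assuming document is a list of token-id lists; the elif branch details and the two reserved special-token positions are assumptions, not quoted from the repository.)

# Illustrative sketch only, not the repository's exact implementation.
def pack_document(document, max_sequence_length=512):
    # Assumed: reserve two positions for special tokens ([CLS]/[SEP]).
    max_allowed = max_sequence_length - 2
    sizes = [len(sentence) for sentence in document]
    result_list, curr_seq = [], []
    sz_idx = 0
    while sz_idx < len(sizes):
        if len(curr_seq) + sizes[sz_idx] <= max_allowed:
            # The next sentence still fits into the current sequence.
            curr_seq += document[sz_idx]
            sz_idx += 1
        elif sizes[sz_idx] >= max_allowed:
            # Assumed handling of a single over-long sentence: flush and truncate.
            if curr_seq:
                result_list.append(curr_seq)
                curr_seq = []
            result_list.append(document[sz_idx][:max_allowed])
            sz_idx += 1
        else:
            # The sentence does not fit: close the current sequence and retry it.
            result_list.append(curr_seq)
            curr_seq = []
    if len(curr_seq) > max_allowed / 2:  # keep the tail only if it is reasonably full
        result_list.append(curr_seq)
    return result_list
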
@@ -58,33 +56,30 @@ def get_raw_instance(document, max_sequence_length=512):
 def split_numpy_chunk(path, tokenizer, pretrain_data, host):
     documents = []
     instances = []

     s = time.time()
-    with open(path, encoding='utf-8') as fd:
+    with open(path, encoding="utf-8") as fd:
         document = []
         for i, line in enumerate(tqdm(fd)):
             line = line.strip()
             # document = line
             # if len(document.split("<sep>")) <= 3:
             # continue
-            if len(line) > 0 and line[:2] == "]]":  # This is end of document
+            if len(line) > 0 and line[:2] == "]]":  # This is end of document
                 documents.append(document)
                 document = []
             elif len(line) >= 2:
                 document.append(line)
         if len(document) > 0:
             documents.append(document)
-    print('read_file ', time.time() - s)
+    print("read_file ", time.time() - s)

     # documents = [x for x in documents if x]
     # print(len(documents))
     # print(len(documents[0]))
     # print(documents[0][0:10])

-    import multiprocessing
-    from typing import List

     ans = []
     for docs in tqdm(documents):
@@ -98,7 +93,7 @@ def split_numpy_chunk(path, tokenizer, pretrain_data, host):
         instances.extend(raw_ins)
     del ans

-    print('len instance', len(instances))
+    print("len instance", len(instances))

     sen_num = len(instances)
     seq_len = 512
@@ -114,7 +109,7 @@ def split_numpy_chunk(path, tokenizer, pretrain_data, host):
         segment_ids[index] = mask_dict[2]
         masked_lm_output[index] = mask_dict[3]

-    with h5py.File(f'/output/{host}.h5', 'w') as hf:
+    with h5py.File(f"/output/{host}.h5", "w") as hf:
         hf.create_dataset("input_ids", data=input_ids)
         hf.create_dataset("input_mask", data=input_ids)
         hf.create_dataset("segment_ids", data=segment_ids)
@@ -124,45 +119,44 @@ def split_numpy_chunk(path, tokenizer, pretrain_data, host):
 def split_numpy_chunk_pool(input_path, output_path, pretrain_data, worker, dupe_factor, seq_len, file_name):
-    if os.path.exists(os.path.join(output_path, f'{file_name}.h5')):
-        print(f'{file_name}.h5 exists')
+    if os.path.exists(os.path.join(output_path, f"{file_name}.h5")):
+        print(f"{file_name}.h5 exists")
         return

     documents = []
     instances = []

     s = time.time()
-    with open(input_path, 'r', encoding='utf-8') as fd:
+    with open(input_path, "r", encoding="utf-8") as fd:
         document = []
         for i, line in enumerate(tqdm(fd)):
             line = line.strip()
-            if len(line) > 0 and line[:2] == "]]":  # This is end of document
+            if len(line) > 0 and line[:2] == "]]":  # This is end of document
                 documents.append(document)
                 document = []
             elif len(line) >= 2:
                 document.append(line)
         if len(document) > 0:
             documents.append(document)
-    print(f'read_file cost {time.time() - s}, length is {len(documents)}')
+    print(f"read_file cost {time.time() - s}, length is {len(documents)}")

     ans = []
     s = time.time()
     pool = multiprocessing.Pool(worker)
     encoded_doc = pool.imap_unordered(pretrain_data.tokenize, documents, 100)
-    for index, res in tqdm(enumerate(encoded_doc, start=1), total=len(documents), colour='cyan'):
+    for index, res in tqdm(enumerate(encoded_doc, start=1), total=len(documents), colour="cyan"):
         ans.append(res)
     pool.close()
     print((time.time() - s) / 60)

     del documents

     instances = []
-    for a in tqdm(ans, colour='MAGENTA'):
+    for a in tqdm(ans, colour="MAGENTA"):
         raw_ins = get_raw_instance(a, max_sequence_length=seq_len)
         instances.extend(raw_ins)
     del ans

-    print('len instance', len(instances))
+    print("len instance", len(instances))

     new_instances = []
     for _ in range(dupe_factor):
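
(As a side note on the worker-pool pattern this function uses, here is a condensed, self-contained sketch; tokenize_fn is a hypothetical stand-in for pretrain_data.tokenize and must be picklable.)

import multiprocessing

from tqdm import tqdm


def tokenize_all(documents, tokenize_fn, worker=32, chunksize=100):
    # Fan documents out to `worker` processes and track progress with tqdm.
    results = []
    with multiprocessing.Pool(worker) as pool:
        # imap_unordered yields results as soon as any worker finishes,
        # so ordering is not preserved (matching the script above).
        for res in tqdm(pool.imap_unordered(tokenize_fn, documents, chunksize),
                        total=len(documents), colour="cyan"):
            results.append(res)
    return results
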
@@ -171,7 +165,7 @@ def split_numpy_chunk_pool(input_path, output_path, pretrain_data, worker, dupe_
     shuffle(new_instances)
     instances = new_instances
-    print('after dupe_factor, len instance', len(instances))
+    print("after dupe_factor, len instance", len(instances))

     sentence_num = len(instances)
     input_ids = np.zeros([sentence_num, seq_len], dtype=np.int32)
@@ -182,7 +176,7 @@ def split_numpy_chunk_pool(input_path, output_path, pretrain_data, worker, dupe_
     s = time.time()
     pool = multiprocessing.Pool(worker)
     encoded_docs = pool.imap_unordered(pretrain_data.create_training_instance, instances, 32)
-    for index, mask_dict in tqdm(enumerate(encoded_docs), total=len(instances), colour='blue'):
+    for index, mask_dict in tqdm(enumerate(encoded_docs), total=len(instances), colour="blue"):
         input_ids[index] = mask_dict[0]
         input_mask[index] = mask_dict[1]
         segment_ids[index] = mask_dict[2]
@@ -190,7 +184,7 @@ def split_numpy_chunk_pool(input_path, output_path, pretrain_data, worker, dupe_
     pool.close()
     print((time.time() - s) / 60)

-    with h5py.File(os.path.join(output_path, f'{file_name}.h5'), 'w') as hf:
+    with h5py.File(os.path.join(output_path, f"{file_name}.h5"), "w") as hf:
         hf.create_dataset("input_ids", data=input_ids)
         hf.create_dataset("input_mask", data=input_mask)
         hf.create_dataset("segment_ids", data=segment_ids)
@@ -199,50 +193,48 @@ def split_numpy_chunk_pool(input_path, output_path, pretrain_data, worker, dupe_
     del instances


-if __name__ == '__main__':
+if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument('--tokenizer_path', type=str, required=True, default=10, help='path of tokenizer')
-    parser.add_argument('--seq_len', type=int, default=512, help='sequence length')
-    parser.add_argument('--max_predictions_per_seq',
-                        type=int,
-                        default=80,
-                        help='number of shards, e.g., 10, 50, or 100')
-    parser.add_argument('--input_path', type=str, required=True, help='input path of shard which has split sentence')
-    parser.add_argument('--output_path', type=str, required=True, help='output path of h5 contains token id')
-    parser.add_argument('--backend',
-                        type=str,
-                        default='python',
-                        help='backend of mask token, python, c++, numpy respectively')
+    parser.add_argument("--tokenizer_path", type=str, required=True, default=10, help="path of tokenizer")
+    parser.add_argument("--seq_len", type=int, default=512, help="sequence length")
     parser.add_argument(
-        '--dupe_factor',
+        "--max_predictions_per_seq", type=int, default=80, help="number of shards, e.g., 10, 50, or 100"
+    )
+    parser.add_argument("--input_path", type=str, required=True, help="input path of shard which has split sentence")
+    parser.add_argument("--output_path", type=str, required=True, help="output path of h5 contains token id")
+    parser.add_argument(
+        "--backend", type=str, default="python", help="backend of mask token, python, c++, numpy respectively"
+    )
+    parser.add_argument(
+        "--dupe_factor",
         type=int,
         default=1,
-        help='specifies how many times the preprocessor repeats to create the input from the same article/document')
-    parser.add_argument('--worker', type=int, default=32, help='number of process')
-    parser.add_argument('--server_num', type=int, default=10, help='number of servers')
+        help="specifies how many times the preprocessor repeats to create the input from the same article/document",
+    )
+    parser.add_argument("--worker", type=int, default=32, help="number of process")
+    parser.add_argument("--server_num", type=int, default=10, help="number of servers")

     args = parser.parse_args()

     tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_path)
-    pretrain_data = PreTrainingDataset(tokenizer,
-                                       args.seq_len,
-                                       args.backend,
-                                       max_predictions_per_seq=args.max_predictions_per_seq)
+    pretrain_data = PreTrainingDataset(
+        tokenizer, args.seq_len, args.backend, max_predictions_per_seq=args.max_predictions_per_seq
+    )

     data_len = len(os.listdir(args.input_path))
     for i in range(data_len):
-        input_path = os.path.join(args.input_path, f'{i}.txt')
+        input_path = os.path.join(args.input_path, f"{i}.txt")
         if os.path.exists(input_path):
             start = time.time()
-            print(f'process {input_path}')
-            split_numpy_chunk_pool(input_path, args.output_path, pretrain_data, args.worker, args.dupe_factor,
-                                   args.seq_len, i)
+            print(f"process {input_path}")
+            split_numpy_chunk_pool(
+                input_path, args.output_path, pretrain_data, args.worker, args.dupe_factor, args.seq_len, i
+            )
             end_ = time.time()
-            print(u'memory%.4f GB' % (psutil.Process(os.getpid()).memory_info().rss / 1024 / 1024 / 1024))
-            print(f'has cost {(end_ - start) / 60}')
-            print('-' * 100)
-            print('')
+            print("memory%.4f GB" % (psutil.Process(os.getpid()).memory_info().rss / 1024 / 1024 / 1024))
+            print(f"has cost {(end_ - start) / 60}")
+            print("-" * 100)
+            print("")

     # if you have multiple server, you can use code below or modify code to openmpi