-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpreprocess.py
34 lines (27 loc) · 981 Bytes
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
import pickle
from transformers import BertTokenizer
import argparse
import os
def preprocess(args):
    """Tokenize every text file in a directory and pickle the results.

    Each regular file directly inside ``args.data_path`` is read line by
    line. Every line is passed to the ``bert-base-uncased`` tokenizer's
    ``encode`` (with ``add_special_tokens=False``) and stored as
    ``{"encoded_txt": <encode output>}``. The dicts for one input file are
    written, as a single pickled list, into ``args.save_data_path`` under
    the input's name up to its first ``.`` plus ``.pickle``.

    Args:
        args: Object exposing ``data_path`` (input directory) and
            ``save_data_path`` (output directory) attributes.
    """
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    path = args.data_path
    save_path = args.save_data_path

    # Create the output directory up front so the first write cannot fail on
    # a fresh checkout. An empty string means "current directory", which
    # os.makedirs rejects, so it is skipped.
    if save_path:
        os.makedirs(save_path, exist_ok=True)

    for file_name in os.listdir(path):
        # os.path.join works whether or not the directory argument ends with
        # a path separator; plain string concatenation did not.
        source_file = os.path.join(path, file_name)
        if not os.path.isfile(source_file):
            # Subdirectories (and other non-regular entries) cannot be
            # opened as text; skip them instead of aborting the whole run.
            continue

        # Explicit encoding keeps the result independent of the platform's
        # default locale.
        with open(source_file, "r", encoding="utf-8") as src:
            lines = src.readlines()

        ret = [
            {"encoded_txt": tokenizer.encode(line, add_special_tokens=False)}
            for line in lines
        ]

        # Output naming is kept as "text before the first dot" so existing
        # consumers of the pickles keep finding the same file names.
        save_file = file_name.split(".")[0] + ".pickle"
        with open(os.path.join(save_path, save_file), "wb") as dst:
            pickle.dump(ret, dst)
if __name__ == "__main__":
    # Command-line entry point: parse the directory options and run the
    # preprocessing step.
    parser = argparse.ArgumentParser()
    parser.add_argument("--raw_data_path", default="./raw_data/", type=str)
    parser.add_argument("--data_path", default="./data/", type=str)
    cli_args = parser.parse_args()

    # preprocess() reads ``data_path`` as its input directory and
    # ``save_data_path`` as its output directory, but the parser exposes
    # ``raw_data_path`` / ``data_path``. Passing the parsed namespace straight
    # through raised AttributeError on ``save_data_path``. Translate the CLI
    # names to the attribute names preprocess() consumes: raw data is the
    # input, ``--data_path`` is where the pickles are written.
    # NOTE(review): mapping inferred from the flag names and defaults - confirm.
    preprocess(
        argparse.Namespace(
            data_path=cli_args.raw_data_path,
            save_data_path=cli_args.data_path,
        )
    )