-
Notifications
You must be signed in to change notification settings - Fork 0
/
script_regex.py
87 lines (78 loc) · 2.61 KB
/
script_regex.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import pandas as pd
import copy
import glob
import os
import sys
def ja_regex(encoding="utf-8"):
    """Strip tabs and ``<p>``/``</p>`` tags from the Japanese transcripts.

    Every ``*.txt`` file in ``./ted_script_ja`` is rewritten to a file of the
    same name in ``./ted_script_ja_regex`` (created if missing) with every tab
    character and every literal ``<p>`` / ``</p>`` removed.  Files are handled
    in ascending order of the integer that precedes the first ``_`` in their
    name, and each file name is printed as it is processed.

    Args:
        encoding: Text encoding used for both reading and writing.  Defaults
            to UTF-8 so the Japanese text does not depend on the platform's
            locale encoding.

    Raises:
        ValueError: If the part of a file name before the first ``_`` is not
            an integer.
    """
    print("日本語スクリプトの正規表現処理を行います")
    out_dir = "./ted_script_ja_regex"
    # exist_ok avoids the check-then-create race of isdir() + mkdir().
    os.makedirs(out_dir, exist_ok=True)

    # Collect (number, name, path) tuples instead of a dict keyed by number,
    # so two files sharing a numeric prefix are both processed.
    numbered = []
    for path in glob.glob('./ted_script_ja/*.txt'):
        # basename() copes with whichever separator glob returns.
        name = os.path.basename(path)
        numbered.append((int(name.split("_")[0]), name, path))
    numbered.sort()

    for _, name, path in numbered:
        print(name)
        # Context managers close both handles even if a read/write fails.
        with open(path, "r", encoding=encoding) as src, \
                open(os.path.join(out_dir, name), "w", encoding=encoding) as dst:
            for line in src:
                line = line.replace("\t", "")
                line = line.replace("<p>", "")
                line = line.replace("</p>", "")
                dst.write(line)
def en_regex(encoding="utf-8"):
    """Strip tabs and ``<p>``/``</p>`` tags from the English transcripts.

    Every ``*.txt`` file in ``./ted_script_en`` is rewritten to a file of the
    same name in ``./ted_script_en_regex`` (created if missing) with every tab
    character and every literal ``<p>`` / ``</p>`` removed.  Files are handled
    in ascending order of the integer that precedes the first ``_`` in their
    name, and each file name is printed as it is processed.

    Args:
        encoding: Text encoding used for both reading and writing.  Defaults
            to UTF-8 so the result does not depend on the platform's locale
            encoding.

    Raises:
        ValueError: If the part of a file name before the first ``_`` is not
            an integer.
    """
    print("英語スクリプトの正規表現処理を行います")
    out_dir = "./ted_script_en_regex"
    # exist_ok avoids the check-then-create race of isdir() + mkdir().
    os.makedirs(out_dir, exist_ok=True)

    # Collect (number, name, path) tuples instead of a dict keyed by number,
    # so two files sharing a numeric prefix are both processed.
    numbered = []
    for path in glob.glob('./ted_script_en/*.txt'):
        # basename() copes with whichever separator glob returns.
        name = os.path.basename(path)
        numbered.append((int(name.split("_")[0]), name, path))
    numbered.sort()

    for _, name, path in numbered:
        print(name)
        # Context managers close both handles even if a read/write fails.
        with open(path, "r", encoding=encoding) as src, \
                open(os.path.join(out_dir, name), "w", encoding=encoding) as dst:
            for line in src:
                line = line.replace("\t", "")
                line = line.replace("<p>", "")
                line = line.replace("</p>", "")
                dst.write(line)
# Script entry point: clean the English transcripts first, then the Japanese ones.
if __name__ == "__main__":
    for run_cleanup in (en_regex, ja_regex):
        run_cleanup()