-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathprocess_data.py
69 lines (53 loc) · 2.18 KB
/
process_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import csv
import re

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
def collect_ppdb(path='ppdb_all.txt', min_score=3.0):
    """Collect paraphrase pairs from a ``|||``-delimited PPDB file.

    A usable line has at least three ``|||``-separated fields: the source
    phrase first, the target phrase second and a numeric score last.  Lines
    that are blank, have fewer fields, or whose last field is not a number
    are skipped instead of aborting the whole run.

    Args:
        path: File to read. Defaults to ``ppdb_all.txt`` in the working
            directory.
        min_score: Inclusive lower bound on the score of a pair to keep.

    Returns:
        A ``(sources, targets)`` tuple of equal-length lists of strings, with
        the whitespace around each phrase removed.
    """
    sources = []
    targets = []
    # Read-only mode: the file is never written, and 'r+' needlessly
    # requires write permission on the dataset.
    with open(path, 'r') as f:
        for line in f:
            fields = line.split('|||')
            # Source, target and a trailing score are all required.
            if len(fields) < 3:
                continue
            try:
                score = float(fields[-1])
            except ValueError:
                # Last field is not a score (e.g. a header or damaged line).
                continue
            if score >= min_score:
                # Strip the padding around the '|||' delimiter so it does
                # not leak into the phrases.
                sources.append(fields[0].strip())
                targets.append(fields[1].strip())
    return sources, targets
def collect_quora(path='quora_duplicate_questions.tsv'):
    """Collect the question pairs flagged as duplicates in the Quora TSV.

    The file is read as tab-separated values with a header row.  The last
    column is treated as the duplicate flag and the two columns before it as
    the paired questions.  Only rows whose flag equals 1 are kept, and rows
    where either question is missing are dropped so that callers always
    receive strings.

    Args:
        path: TSV file to read. Defaults to ``quora_duplicate_questions.tsv``
            in the working directory.

    Returns:
        A ``(sources, targets)`` tuple of equal-length lists of strings.
    """
    data = pd.read_csv(path, sep="\t")
    # Only collect true paraphrases (flag column == 1).
    duplicates = data[data.iloc[:, -1] == 1]
    # An empty cell is parsed as NaN (a float); drop such rows because the
    # later regex clean-up only accepts text.
    pairs = duplicates.iloc[:, [-3, -2]].dropna()
    sources = pairs.iloc[:, 0].astype(str).tolist()
    targets = pairs.iloc[:, 1].astype(str).tolist()
    return sources, targets
def collect_language_net(path='2016_Oct_10--2017_Jan_08_paraphrase.txt'):
    """Collect paraphrase pairs from a two-column, tab-separated text file.

    Only lines that split into exactly two tab-separated fields are used;
    every other line is ignored.

    Args:
        path: File to read. Defaults to
            ``2016_Oct_10--2017_Jan_08_paraphrase.txt`` in the working
            directory.

    Returns:
        A ``(sources, targets)`` tuple of equal-length lists of strings, with
        the whitespace around each sentence removed.
    """
    sources = []
    targets = []
    # Read-only mode: the file is never written, and 'r+' fails on
    # read-only datasets.
    with open(path, 'r') as f:
        for line in f:
            fields = line.split('\t')
            if len(fields) != 2:
                continue
            source, target = fields
            sources.append(source.strip())
            targets.append(target.strip())
    return sources, targets
def save_to_file(out_file, sources, targets):
    """Write paraphrase pairs to ``out_file`` as two-column CSV and close it.

    Before writing, every run of non-word characters that is immediately
    followed by a space is removed from each string, together with that
    space.  Rows are emitted through ``csv.writer`` so that a field still
    containing a comma, quote or newline is quoted instead of corrupting the
    two-column layout; fields without such characters are written exactly as
    ``source,target``.

    Args:
        out_file: Writable text file object. It is always closed before this
            function returns, including when an error occurs.
        sources: Sequence of source strings.
        targets: Sequence of target strings, the same length as ``sources``.

    Raises:
        ValueError: If ``sources`` and ``targets`` differ in length.
    """
    # NOTE(review): this pattern also swallows the space, so 'one, two'
    # becomes 'onetwo'. Kept unchanged here -- confirm that gluing the
    # neighbouring words is intended.
    cleaner = re.compile(r'\W+ ')
    try:
        if len(sources) != len(targets):
            raise ValueError(
                'sources and targets must have the same length '
                '({} != {})'.format(len(sources), len(targets))
            )
        # '\n' terminator keeps the line endings of the plain-write version.
        writer = csv.writer(out_file, lineterminator='\n')
        for source, target in zip(sources, targets):
            writer.writerow((cleaner.sub('', source), cleaner.sub('', target)))
    finally:
        # Callers hand over ownership of the file; close it even on failure.
        out_file.close()
def main():
    """Build the train/validation/test CSV files from all three corpora.

    The pairs returned by ``collect_ppdb``, ``collect_quora`` and
    ``collect_language_net`` are concatenated and split twice: 5% of the
    pairs are held out first, and 20% of that held-out part becomes the test
    set while the rest becomes the validation set.  The results are written
    to ``train_data_all.csv``, ``val_data_all.csv`` and ``test_data_all.csv``
    in the working directory.

    NOTE: no ``random_state`` is passed to ``train_test_split``, so the
    split differs from run to run.
    """
    ppdb_sources, ppdb_targets = collect_ppdb()
    quora_sources, quora_targets = collect_quora()
    ln_sources, ln_targets = collect_language_net()

    sources = ppdb_sources + quora_sources + ln_sources
    targets = ppdb_targets + quora_targets + ln_targets

    source_train, source_val, target_train, target_val = train_test_split(
        sources, targets, test_size=0.05
    )
    source_val, source_test, target_val, target_test = train_test_split(
        source_val, target_val, test_size=0.2
    )

    splits = (
        ('train_data_all.csv', source_train, target_train),
        ('val_data_all.csv', source_val, target_val),
        ('test_data_all.csv', source_test, target_test),
    )
    # Open each output only once its data is ready, so a failure while
    # reading the inputs neither truncates existing outputs nor leaks open
    # handles.
    for path, split_sources, split_targets in splits:
        # save_to_file closes the file itself; the second close on leaving
        # the with-block is a harmless no-op.
        with open(path, 'w') as out_file:
            save_to_file(out_file, split_sources, split_targets)


if __name__ == '__main__':
    main()