forked from huggingface/pytorch-openai-transformer-lm
-
Notifications
You must be signed in to change notification settings - Fork 1
/
datasets.py
51 lines (45 loc) · 1.69 KB
/
datasets.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import os
import csv
import numpy as np
from tqdm import tqdm
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
seed = 3535999445
def _rocstories(path):
with open(path, encoding='utf_8') as f:
f = csv.reader(f)
st = []
ct1 = []
ct2 = []
y = []
for i, line in enumerate(tqdm(list(f), ncols=80, leave=False)):
if i > 0:
s = ' '.join(line[1:5])
c1 = line[5]
c2 = line[6]
st.append(s)
ct1.append(c1)
ct2.append(c2)
y.append(int(line[-1])-1)
return st, ct1, ct2, y
def rocstories(data_dir, n_train=1497, n_valid=374):
storys, comps1, comps2, ys = _rocstories(os.path.join(data_dir, 'cloze_test_val__spring2016 - cloze_test_ALL_val.csv'))
teX1, teX2, teX3, _ = _rocstories(os.path.join(data_dir, 'cloze_test_test__spring2016 - cloze_test_ALL_test.csv'))
tr_storys, va_storys, tr_comps1, va_comps1, tr_comps2, va_comps2, tr_ys, va_ys = train_test_split(storys, comps1, comps2, ys, test_size=n_valid, random_state=seed)
trX1, trX2, trX3 = [], [], []
trY = []
for s, c1, c2, y in zip(tr_storys, tr_comps1, tr_comps2, tr_ys):
trX1.append(s)
trX2.append(c1)
trX3.append(c2)
trY.append(y)
vaX1, vaX2, vaX3 = [], [], []
vaY = []
for s, c1, c2, y in zip(va_storys, va_comps1, va_comps2, va_ys):
vaX1.append(s)
vaX2.append(c1)
vaX3.append(c2)
vaY.append(y)
trY = np.asarray(trY, dtype=np.int32)
vaY = np.asarray(vaY, dtype=np.int32)
return (trX1, trX2, trX3, trY), (vaX1, vaX2, vaX3, vaY), (teX1, teX2, teX3)