-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathpreprocess.py
106 lines (97 loc) · 3.48 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import pandas as pd
import sys
from pathlib import Path
import os
import csv
import shutil
# Names of the NER corpora to preprocess; each is expected as a
# subdirectory of input/NERdataset/ containing *.tsv files.
DATASET_LIST = [
    'NCBI-disease',
    'BC2GM',
    'BC4CHEMD',
    'BC5CDR-chem',
    'BC5CDR-disease',
    'JNLPBA',
    'linnaeus',
    's800'
]


def make_dir():
    '''
    Create the per-dataset output and temp directories.

    Uses os.makedirs(exist_ok=True) instead of isdir+mkdir: it also creates
    missing parent directories (os.mkdir raised FileNotFoundError when
    input/NERdataset_preproc/ itself did not exist, since __main__ only
    creates the temp root), and it avoids the check-then-create race.
    '''
    for dataset in DATASET_LIST:
        os.makedirs("input/NERdataset_preproc/" + dataset + "/", exist_ok=True)
        os.makedirs("input/NERdataset_preproc_temp/" + dataset + "/", exist_ok=True)
def replace_blank_lines_with_token(FILEPATH, TEMPPATH):
    '''
    Copy FILEPATH to TEMPPATH, replacing each blank line with the special
    token line "[newline]<TAB>X".

    Blank lines here denote the ends of sentences. They are taken as NaN by
    pd.read_csv, but missing data is also NaN; the sentinel token keeps the
    two cases distinguishable for do_preprocess.

    FILEPATH: path of the original word<TAB>label TSV file.
    TEMPPATH: path the token-substituted copy is written to (overwritten).
    '''
    # 'with' guarantees both handles are closed even if a write fails
    # (the original open()/close() pairs leaked handles on error).
    with open(FILEPATH, "r") as src, open(TEMPPATH, "w+") as dst:
        for line in src:
            if line.isspace():
                dst.write('[newline]\tX\n')
            else:
                dst.write(line)
def preprocess():
    '''
    Run the full pipeline for every dataset in DATASET_LIST: substitute the
    blank-line sentence separators with the sentinel token (written into the
    temp tree), then assemble sentences into the preprocessed output tree.
    '''
    for name in DATASET_LIST:
        src_dir = Path("input/NERdataset/" + name)
        out_dir = Path("input/NERdataset_preproc/" + name)
        tmp_dir = Path("input/NERdataset_preproc_temp/" + name)
        for tsv in Path(src_dir).glob('*.tsv'):
            src_file = str(src_dir / tsv.name)
            out_file = str(out_dir / tsv.name)
            tmp_file = str(tmp_dir / tsv.name)
            replace_blank_lines_with_token(src_file, tmp_file)
            print(src_file, '->', out_file)
            do_preprocess(tmp_file, out_file)
        print(name + ' Done!')
def do_preprocess(PATH, NEWPATH):
    '''
    Convert a token-per-line TSV into a CSV of sentences and their tags.

    The input has one word<TAB>label pair per line, with sentence boundaries
    marked by the '[newline]' sentinel (see replace_blank_lines_with_token).
    Output df:
    Sentence | Tags
    ['Bert','is','a','good','model'] : ['B','O','O','O','O']

    A terminating '.' (labelled 'O') is appended to any sentence that does
    not already end with one.

    Raises:
        ValueError: if a sentence's first label is 'I' (a named entity
            overlapping a sentence boundary), if sentence and label lengths
            diverge, or if a row has no label field.
    '''
    df = pd.read_csv(PATH, delimiter='\t', names=['word', 'label'],
                     quoting=csv.QUOTE_NONE)
    new_data = []
    sent, label_list = [], []
    for r in df.itertuples():
        if r[1] != r[1]:  # NaN != NaN — skip genuinely missing-data rows
            continue
        if r[1] == '[newline]':  # sentence boundary sentinel
            if len(sent) == 0:
                continue
            if sent[-1] != '.':
                sent.append('.')
                label_list.append('O')
            new_data.append([sent, label_list])
            # Validate with explicit raises: assert is stripped under -O,
            # and the original assert after the raise was unreachable.
            if len(sent) != len(label_list):
                raise ValueError('sentence and label list lengths differ')
            # Check no named entity overlaps a sentence end.
            if label_list[0] == 'I':
                print('Should not happen, found -> ', sent, label_list)
                raise ValueError('The first label of a sentence is I')
            sent, label_list = [], []
        else:
            sent.append(r[1])
            try:
                label_list.append(r[2])
            except IndexError:  # narrowed from bare except
                print(r)
                raise ValueError('out of range?')
    df_new = pd.DataFrame(new_data, columns=['sentence', 'labels'])
    df_new.to_csv(NEWPATH, index=False)
if __name__ == "__main__":
    # Temp tree holds the token-substituted copies; it is created before
    # the run and torn down afterwards.
    temp_root = "input/NERdataset_preproc_temp/"
    if not os.path.isdir(temp_root):
        os.mkdir(temp_root)
    make_dir()
    preprocess()
    if os.path.isdir(temp_root):
        shutil.rmtree(temp_root)