-
Notifications
You must be signed in to change notification settings - Fork 0
/
conll_df.py
executable file
·230 lines (197 loc) · 7 KB
/
conll_df.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
import pandas as pd
# UD 1.0
CONLL_COLUMNS = ['i', 'w', 'l', 'p', 'n', 'm', 'g', 'f', 'd', 'c']
# UD 2.0
CONLL_COLUMNS_V2 = ['i', 'w', 'l', 'x', 'p', 'm', 'g', 'f', 'e', 'o']
# possible morphological attributes
MORPH_ATTS = ['type',
'animacy',
#'gender',
'number'
"Abbr",
"Animacy",
"Aspect",
"Case",
"Definite",
"Degree",
"Evident",
"Foreign",
"Gender",
"Mood",
"NumType",
"Number",
"Person",
"Polarity",
"Polite",
"Poss",
"PronType",
"Reflex",
"Tense",
"VerbForm",
"Voice",
"Type"]
def _make_sent_csv(sentstring, fname, meta, splitter, i, skip_meta=False):
"""
Take one CONLL-U sentence and add all metadata to each row
Return: str (CSV data) and dict (sent level metadata)
"""
fixed_lines = []
raw_lines = sentstring.splitlines()
for line in raw_lines:
if not line:
continue
if line.startswith('#'):
if not '=' in line:
line = line + " = null" # add null value to metadata without =
if not skip_meta:
try:
k, v = line.lstrip('# ').split(splitter, 1)
except ValueError:
k, v = line.lstrip('# ').split(splitter.strip(), 1)
meta[k.lower().strip()] = v.strip()
else:
line = '%s\t%s\t%s' % (fname, i, line)
fixed_lines.append(line)
return '\n'.join(fixed_lines), meta
def _add_governors_to_df(df):
"""
Add governor info to a DF. Increases memory usage quite a bit.
"""
# save the original index
i = df.index.get_level_values('i')
# add g
dfg = df.set_index('g', append=True)
# remove i
dfg = dfg.reset_index('i')
dfg = df.loc[dfg.index]
dfg = dfg[['w', 'l', 'p', 'f']]
dfg['i'] = i
dfg = dfg.set_index('i', append=True)
dfg.index.names = ['file', 's', 'g', 'i']
dfg = dfg.reset_index('g', drop=True)
for c in list(dfg.columns):
try:
dfg[c] = dfg[c].cat.add_categories(['ROOT'])
except (AttributeError, ValueError):
pass
dfg = dfg.fillna('ROOT')
dfg.columns = ['gw', 'gl', 'gp', 'gf']
dfg = df.join(dfg, how="inner")
return dfg
def conll_df(path,
corpus_name=False,
corp_folder=False,
v2="auto",
skip_morph=False,
skip_meta=False,
add_gov=False,
drop=['text', 'newdoc id'],
file_index=True,
categories=True,
extra_fields='auto',
drop_redundant=True,
**kwargs):
"""
Optimised CONLL-U reader for v2.0 data
Args:
path (str): the file to prepare
Returns:
pd.DataFrame: 2d array representation of file data
"""
import os
import re
try:
from io import StringIO
except ImportError:
from StringIO import StringIO
splitter = ' = ' if v2 else '='
encoding=kwargs.get('encoding')
with open(path, 'r', encoding=encoding) as fo:
data = fo.read().strip('\n')
if v2 == 'auto':
v2 = 'sent_id = ' in data[:9999]
fname = os.path.basename(path)
# metadata that applies filewide
# a little bonus for those with annual data
basedict = {}
if not skip_meta:
year = re.search(r'[12][0-9][0-9][0-9]', fname)
if year:
basedict['year'] = year.group(0)
sents = [x for x in data.split('\n\n') if len([y for y in x.splitlines() if not y.startswith("#")]) >= 2] # only process sentences with 2 or more tokens
sents_meta = [_make_sent_csv(sstring, fname, dict(basedict), splitter, i, skip_meta=skip_meta) \
for i, sstring in enumerate(sents, start=1)]
sents, metadata = zip(*sents_meta)
# make the sent df
sents = '\n\n'.join(sents)
sents = StringIO(sents)
if v2:
cols = ['file', 's'] + CONLL_COLUMNS_V2
else:
cols = ['file', 's'] + CONLL_COLUMNS
df = pd.read_csv(sents, sep="\t", header=None, names=cols, quoting=kwargs.pop('quoting', 3),
index_col=[0, 1, 2], engine='c', na_filter=False, **kwargs)
if v2 and not skip_morph:
df['m'] = df['m'].fillna('')
df['o'] = df['o'].fillna('')
if extra_fields == 'auto':
# evil line to get all possible keys in the final column
extra_fields = list(df['o'].str.extractall(r'(?:^|\|)([^=]+?)=')[0].unique())
cats = MORPH_ATTS + extra_fields
if 'SpaceAfter' not in cats:
cats.append('SpaceAfter')
cats = list(set(cats))
om = df['o'].str.cat(df['m'], sep='|').str.strip('|_')
# this is a very slow list comp, but i can't think of a better way to do it.
# the 'extractall' solution makes columns for not just the value, but the key...
extra = [om.str.extract('%s=([^|$]+)' % cat.title(), expand=True) for cat in cats]
extra = pd.concat(extra, axis=1)
extra.columns = cats
df = pd.concat([df, extra], axis=1)
# make and join the meta df
if not skip_meta:
metadata = {i: d for i, d in enumerate(metadata, start=1)}
metadata = pd.DataFrame(metadata).T
metadata.index.name = 's'
df = metadata.join(df, how='inner')
# we never want these to show up as a dataframe column
#badcols = ['sent_id', 's', 'i', 'file']
badcols = ['s', 'i', 'file']
# if we aren't parsing morph and extra columns, we should at least keep them
if not skip_morph:
badcols += ['o', 'm']
if drop:
badcols = badcols + drop
df = df.drop(badcols, axis=1, errors='ignore')
# some evil code to handle conll-u files where g col could be a string
if 'g' in df.columns:
df['g'] = df['g'].fillna(0)
if df['g'].dtype in [object, str]:
df['g'] = df['g'].str.replace('_', '0').astype(int)
df['g'] = df['g'].astype(int)
df = df.fillna('_')
# attempt to categorise data
if categories:
for c in list(df.columns):
if c in ['g', 'date']:
continue
try:
df[c] = df[c].astype('category')
except:
pass
if add_gov:
df = _add_governors_to_df(df)
if not file_index:
df.index = df.index.droplevel('file')
if drop_redundant:
empty_cols = []
for c in df.columns:
if len(df[c].unique()) == 1:
empty_cols.append(c)
df = df.drop(empty_cols, axis=1)
#reorder columns so that important things are first
firsts = CONLL_COLUMNS_V2 if v2 else CONLL_COLUMNS
firsts = [i for i in firsts if i in list(df.columns)]
lasts = [i for i in list(df.columns) if i not in firsts]
df = df[firsts + lasts]
return df