-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathxml2standoff
executable file
·318 lines (273 loc) · 13.7 KB
/
xml2standoff
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Copyright (c) 2024 Pavel Vondřička <[email protected]>
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; version 2
# dated June, 1991.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
import html
import re
from io import StringIO
def scan_to(text, ptr, mark):
    """
    Scan text up to the given mark and return preceding contents and the mark

    Parameters:
    -----------
    text : str
        Text string
    ptr : integer
        Pointer to the current character position in the text (used as scan start position);
        expected to be non-negative
    mark : str
        (Sub)string to seek until

    Returns:
    --------
    segment : str
        Text segment from the start (original ptr position) to the start of the searched *mark*
        (or to the end of the text if the mark has not been found)
    mark : str
        The found mark or an empty string if not found until EOF
    ptr : integer
        Pointer to the position right after the found *mark* (or to the end of the text if not found)
    """
    start = ptr
    # an empty mark "matches" immediately at the start position and consumes nothing
    if not mark:
        return '', '', start
    # str.find searches in one library call instead of slicing the text once per character
    found = text.find(mark, start)
    if found < 0:
        # mark not found: the rest of the text is the segment; never move the pointer backwards
        end = max(start, len(text))
        return text[start:end], '', end
    return text[start:found], mark, found + len(mark)
def text_continues(text, ptr, str):
    """
    Check whether the text at position `ptr` goes on with the given string

    Parameters:
    -----------
    text : str
        Text string
    ptr : integer
        Character position in the text where the comparison starts
    str : str
        String expected at that position
        (NOTE: this name shadows the builtin `str` inside this function; kept for interface compatibility)

    Returns:
    --------
    result : boolean
        True if the slice of `text` starting at `ptr` equals the given string, False otherwise
    """
    # compare the window of the same length as the tested string
    window_end = ptr + len(str)
    window = text[ptr:window_end]
    return window == str
def re_partition(pattern, string):
    """
    Split a string around the first match of a regular expression (regex analogue of `str.partition`)

    Parameters:
    -----------
    pattern : str
        Regular expression to search for
    string : str
        String to be split

    Returns:
    --------
    head : str
        Part of the string before the first match (the whole string if there is no match)
    separator : str
        The matched text (empty string if there is no match)
    tail : str
        Part of the string after the first match (empty string if there is no match)
    """
    found = re.search(pattern, string)
    if found is None:
        # no match: mimic str.partition and return the input followed by two empty strings
        return string, '', ''
    head = string[:found.start()]
    tail = string[found.end():]
    return head, found.group(0), tail
def split_xml(text, txtelements=None, excludes=None, keep_linebreaks=False):
    """
    Split XML into plaintext and stand-off XML markup description

    Parameters:
    -----------
    text : str
        XML input as text string
    txtelements : list of str
        list of names of elements containing text to be extracted
    excludes : list of str
        list of names of elements to be skipped
    keep_linebreaks : boolean
        keep linebreaks within text elements or remove them?

    Returns:
    --------
    plaintext : str
        Extracted plaintext contents (str) stripped of all XML mark-up
    markup : list of dict
        XML markup description in the form of list of XML element descriptions:
        - name (str) : name of the XML element
        - contents (str) : other contents of the XML element tag (attributes with values, XML comment contents etc.)
        - start (integer) : character position in the plain text where the XML element's span starts
        - end (integer) : character position in the plain text where the XML element's span ends
        - level (integer) : level of nesting (depth) in the XML tree
        - order (integer) : order of element in the XML tree
        - csep (str): (optional) whitespace between the tag name and its contents (stored only if it is not a single space)
        - text (str): (optional) text contents at the beginning of the element (if not a text element, i.e. it has not been extracted into plain text file)
        - tail (str): (optional) text contents at the end of the element (have not been extracted into plain text file)
        - skip (int): (optional) skip following plain text contents at the start of the element (provisional line break)
        - cut (int): (optional) cut plain text contents from the end of the element (provisional line break)

    Raises:
    -------
    Exception
        If an end tag does not match the most recently opened element, or if an end tag occurs
        while no element is open
    """
    # buffer for the extracted plaintext
    plaintext = StringIO()
    # character counter for the extracted plain text contents
    charcnt = 0
    # stack of open element spans
    heap = []
    # buffer for the XML mark-up description
    markup = []
    # XML element level and order counters
    slevel = 0
    order = 0
    # pointer to the current character position in the source XML string
    ptr = 0
    # content aware extraction:
    # depth within text elements (may be nested)
    txtel_depth = 0
    # depth within excluded elements (may be nested)
    exclude_depth = 0
    # tuple with a current mark-up element description and its attribute to keep the text which shall not be extracted
    text_store = None
    # shall we remove line breaks within text elements?
    normalize_linebreaks = txtelements is not None and not keep_linebreaks
    lbre = re.compile(r"[\n\r]+")
    while True:
        # scan to the beginning of next XML mark-up and add the contents to the plain text buffer
        (contents, mark, ptr) = scan_to(text, ptr, "<")
        contents = html.unescape(contents)
        if text_store is None:
            # default: extract text contents into the output plain text file
            if normalize_linebreaks:
                contents = lbre.sub(" ", contents)
            charcnt += len(contents)
            plaintext.write(contents)
        else:
            # mark-up element description to store/keep contents within
            text_store[0][text_store[1]] = contents
            text_store = None
        if mark:
            # treat XML elements, comments and processing instructions and their tag contents
            if text_continues(text, ptr, '?'):
                # processing instruction
                (element, mark, ptr) = scan_to(text, ptr, "?>")
                (ename, csep, econtents) = re_partition(r"(\s+)", element)
                econtents += '?'
            elif text_continues(text, ptr, '!--'):
                # XML comment
                (element, mark, ptr) = scan_to(text, ptr, "-->")
                ename = '!--'
                # a comment has no name/contents separator: reset it explicitly, otherwise the value
                # from the previous tag would leak in (or the name would be unbound for a leading comment)
                csep = ''
                econtents = element[3:] + '--'
            else:
                # XML element
                (element, mark, ptr) = scan_to(text, ptr, ">")
                (ename, csep, econtents) = re_partition(r"(\s+)", element)
            # store the mark-up data into the mark-up buffer
            if ename.startswith("/"):
                # end tag: add ending position and append the element span description into the mark-up buffer
                ename = ename.lstrip("/")
                if not heap:
                    # stray end tag: report it as an XML error instead of a bare IndexError from pop()
                    raise Exception("XML error at char {0}: End of {1} found, but no element is open."
                                    .format(charcnt, ename))
                last = heap.pop()
                slevel -= 1
                if ename != last['name']:
                    raise Exception("XML error at char {0}: Element {1} not closed before end of {2}."
                                    .format(charcnt, last['name'], ename))
                last['end'] = charcnt
                markup.append(last)
                # is the ending element a text element or an excluded element?
                if txtelements is not None:
                    if ename in txtelements:
                        txtel_depth -= 1
                        if exclude_depth == 0:
                            # insert provisional line break into the plain text that will be cut in the reverse conversion (unless within excluded scope)
                            charcnt += 1
                            plaintext.write("\n")
                            last['cut'] = 1
                            last['end'] = charcnt
                    # outside the scope of text elements: do not extract following text anymore, but keep it within the mark-up description
                    if txtel_depth == 0:
                        text_store = (last, 'tail')
                if excludes is not None:
                    if ename in excludes:
                        exclude_depth -= 1
                    # still within the scope of excluded elements: no text extraction
                    if exclude_depth > 0:
                        text_store = (last, 'tail')
            else:
                # start tag, empty element, comment or processing instruction
                order += 1
                if ename.startswith("?") or ename.startswith("!") or ename.endswith("/") or econtents.endswith("/"):
                    # zero-span: empty tags, comments and proc. instructions have no end tag and can be inserted into the buffer immediately
                    eldesc = {"name": ename, "contents": econtents,
                              "start": charcnt, "end": charcnt, "level": slevel, 'order': order}
                    if csep and csep != ' ':
                        eldesc['csep'] = csep
                    # should the empty element add a provisional line break?
                    if txtelements is not None and exclude_depth == 0 and ename.endswith("/") and ename[:-1] in txtelements:
                        charcnt += 1
                        plaintext.write("\n")
                        eldesc['skip'] = 1
                    markup.append(eldesc)
                    if (txtelements is not None and txtel_depth == 0) or (excludes is not None and exclude_depth > 0):
                        text_store = (eldesc, 'tail')
                else:
                    # pair start tags: add description to the stack to be inserted into the buffer when a matching end tag is found
                    eldesc = {"name": ename, "contents": econtents,
                              "start": charcnt, "level": slevel, 'order': order}
                    if csep and csep != ' ':
                        eldesc['csep'] = csep
                    # if included elements are defined, check whether we shall extract the text contents or store/keep them within the mark-up description
                    if txtelements is not None:
                        if ename in txtelements:
                            txtel_depth += 1
                            text_store = None
                            if exclude_depth == 0:
                                # insert provisional line break into the plain text that will be skipped in the reverse conversion
                                charcnt += 1
                                plaintext.write("\n")
                                eldesc['skip'] = 1
                        elif txtel_depth == 0:
                            text_store = (eldesc, 'text')
                    # if excluded elements are defined, keep text contents within their description and do not extract them
                    if excludes is not None:
                        if ename in excludes:
                            exclude_depth += 1
                        if exclude_depth > 0:
                            text_store = (eldesc, 'text')
                    heap.append(eldesc)
                    slevel += 1
        if not mark:
            # end of XML contents (or unterminated tag at EOF): exit
            break
    return plaintext.getvalue(), markup
if __name__ == '__main__':
    """
    Split XML into plain text and stand-off XML mark-up description
    ===============================================================
    Input: input XML file name (UTF-8 encoding)
    (Only XML elements (with attributes), comments and processing instructions are treated. Anything else is ignored.)
    Output: creates two new files with the same base name as the input file, but with the extension `.txt` and `.json`
    Options: '-t <text_elements>' List of comma separated names of basic text elements (e.g. paragraph level elements). Turns
    on content aware extraction where only text contents from the specified elements is extracted into the plain text
    output. Provisional line breaks are added to the plain text output between the text fragments, which will be removed
    later by `standoff2xml`. The text elements may also be nested within each other.
    '-kl' Keep line breaks within the specified text elements. By default, line breaks within the text elements are
    (irreversibly) converted to spaces. (This option has thus no effect if `-t` is not used!)
    '-e <element_names>' List of comma separated names of elements to ignore/skip when extracting plain text contents.
    Anything within their scope is ignored, even nested elements of the type specified by `-t`.
    """
    import sys
    import argparse
    import json
    from pathlib import Path

    parser = argparse.ArgumentParser(description="Split XML into plain text and stand-off XML mark-up")
    parser.add_argument("infile", help="input XML file name (UTF-8)")
    parser.add_argument("-t", "--text-elements", help="text elements to extract contents from", type=str)
    parser.add_argument("-e", "--exclude-elements", help="elements to ignore", type=str)
    parser.add_argument("-kl", "--keep-linebreaks", help="keep line breaks within text elements", action="store_true")
    args = parser.parse_args()

    if args.keep_linebreaks and args.text_elements is None:
        # '-kl' without '-t' is treated as a usage error: report it and abort with a non-zero exit status
        sys.stderr.write("WARNING: Option '-kl/--keep-linebreaks' has no effect unless text elements are specified.\n")
        # sys.exit is always available; the bare exit() helper is only installed by the `site` module
        sys.exit(1)

    # output files share the base name of the input file
    xmlin = Path(args.infile)
    txtout = xmlin.with_suffix('.txt')
    jsonout = xmlin.with_suffix('.json')
    includes = args.text_elements.split(',') if args.text_elements else None
    excludes = args.exclude_elements.split(',') if args.exclude_elements else None

    # read the whole input file
    with xmlin.open(encoding='utf-8') as infile:
        text = infile.read()

    # generate separated plain text contents and mark-up description
    plaintext, markup = split_xml(text, includes, excludes, args.keep_linebreaks)

    # write out plain text contents (the `with` statement closes the file)
    with txtout.open('w', encoding='utf-8') as txtfile:
        txtfile.write(plaintext)

    # write out XML mark-up description as JSON (the `with` statement closes the file)
    with jsonout.open('w', encoding='utf-8') as jsonfile:
        json.dump(markup, jsonfile, indent=0)