-
Notifications
You must be signed in to change notification settings - Fork 0
/
variable_extractor.py
88 lines (66 loc) · 2.32 KB
/
variable_extractor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
from Naked.toolshed.shell import muterun_js
from lxml import etree
import click
import os
# Absolute path of the directory containing this file; used below to locate
# the bundled katex/parse.js script regardless of the caller's working directory.
CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
def unparse(node):
    """Render a parsed node back into a brace-delimited, LaTeX-like string.

    Args:
        node: Either a ``str`` (returned unchanged) or an element-tree style
            node exposing ``tag``, ``text``, indexing and iteration over its
            children.

    Returns:
        ``'{base}_{sub}'`` for ``msub``, ``'{base}^{exp}'`` for ``msup``, the
        concatenated children wrapped in braces for ``mrow``, and the node's
        own ``text`` (which may be ``None``) for every other tag.
    """
    if isinstance(node, str):
        return node
    if node.tag == 'msub':
        return '{%s}_{%s}' % (unparse(node[0]), unparse(node[1]))
    if node.tag == 'msup':
        return '{%s}^{%s}' % (unparse(node[0]), unparse(node[1]))
    if node.tag == 'mrow':
        # A child without text unparses to None.  The %s formatting above
        # renders that as the literal 'None'; str() makes this branch do the
        # same instead of raising TypeError inside str.join.
        return '{' + ''.join(str(unparse(child)) for child in node) + '}'
    return node.text
def get_variables(latex):
    """Yield the variables found in a LaTeX math expression.

    The expression is passed to the Node script ``katex/parse.js`` located
    next to this file.  The script's standard output is parsed as XML and
    searched for ``msup``, ``msub`` and ``mi`` elements.

    Args:
        latex: LaTeX source of the expression.

    Yields:
        ``-1`` (once, after which the generator stops) when the script prints
        ``-1`` or nothing, or when its output has no ``semantics`` element.
        Otherwise, in this order:

        * every ``msup`` element, then every ``msub`` element, whose unparsed
          form does not contain ``'None'`` (yielded as elements, not strings);
        * the text of every ``mi`` element that was not collected into an
          n-gram and whose text is present and does not contain ``'None'``;
        * every non-empty n-gram (a run of consecutive ``mi`` children of one
          ``mrow``) joined into a single string.
    """
    from shlex import quote  # local import: only this function needs it

    # Placeholders for characters that would not survive the command line;
    # presumably decoded again by parse.js -- TODO confirm against the script.
    processed = (
        latex.replace(' ', '[space]')
        .replace('\\', '[backslash]')
        .replace('(', '{')
        .replace(')', '}')
    )
    # NOTE(review): per the Naked documentation, muterun_js concatenates the
    # script path and the arguments into one shell command line, so the
    # user-supplied text must be quoted to keep shell metacharacters
    # (; | $ ' * ...) literal.  shlex.quote targets POSIX shells only.
    if os.name == 'posix':
        processed = quote(processed)
    response = muterun_js(os.path.join(CURRENT_DIR, 'katex/parse.js'), processed)
    if not response.stdout or response.stdout == b'-1\n':
        yield -1
        return

    tree = etree.fromstring(response.stdout)
    semantics = tree.xpath('.//semantics')
    if not semantics:
        # Report output without a <semantics> element through the same
        # failure sentinel instead of an opaque IndexError.
        yield -1
        return
    ast = semantics[0]

    # Give every descendant a unique id so elements can be tracked in a set.
    for index, element in enumerate(ast.xpath('.//*')):
        element.attrib['id'] = str(index)

    operators = ('=', '+', '-', '*', '/')
    grouped_ids = set()  # ids of <mi> elements already covered by an n-gram
    ngrams = []
    for row in ast.xpath('.//mrow'):
        ngrams.append([])
        for child in row:
            if child.tag == 'mi' and child.text:
                grouped_ids.add(child.attrib['id'])
                if child.text in operators:
                    # An operator ends the current run of identifiers.
                    ngrams.append([])
                    continue
                ngrams[-1].append(child.text)
            else:
                # Any other child also ends the current run.
                ngrams.append([])

    # All superscripts first, then all subscripts, as elements.
    for tag in ('msup', 'msub'):
        for node in tree.xpath('.//' + tag):
            if 'None' not in unparse(node):
                yield node

    for identifier in tree.xpath('.//mi'):
        if identifier.attrib['id'] in grouped_ids:
            continue
        text = identifier.text
        # An <mi> without text has nothing to report; testing it with ``in``
        # would raise TypeError.
        if text is not None and 'None' not in text:
            yield text

    for gram in ngrams:
        if gram:
            yield ''.join(gram)
@click.command()
@click.option('--latex', required=True, help='Source latex code to be parsed')
def get_variables_cli(latex):
    '''Variable extractor that finds variables in the Latex code'''
    # The docstring above is shown by click as the command's --help text, so
    # implementation notes are kept in comments.
    for g in get_variables(latex):
        # get_variables yields the integer -1 when the input could not be
        # parsed; unparse() only accepts strings and element nodes, so report
        # the failure as a click error (non-zero exit) instead of crashing.
        if isinstance(g, int):
            raise click.ClickException('could not parse the given latex code')
        click.echo(unparse(g))
# Run the click command only when this file is executed as a script.
if __name__ == '__main__':
    get_variables_cli()