-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathLatexParser.py
120 lines (104 loc) · 3.41 KB
/
LatexParser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
"""
Parse a LaTeX expression and create a MathObject: a tree of connected nodes
representing the mathematical expression.
Author: John Bell
Email: [email protected]
"""
import os
import re
import logging
import traceback
import weakref
from collections import namedtuple
from functools import wraps
import pprint
from itertools import groupby
from lxml import etree
from MathObject import MathElement
#logging.basicConfig(level=logging.DEBUG)
# Import-time side effect: root logging is configured to write DEBUG-level
# records to 'latex.log' (relative to the current working directory).
logging.basicConfig(
    level=logging.DEBUG,
    filename='latex.log',
)

# Lightweight (type, value) record describing one lexed token.
Token = namedtuple('Token', ('type', 'value'))
def parse_latex(latex):
    """Parse a LaTeX string into a populated ``MathElement`` tree.

    The input is tokenized, the flat token stream is nested by ``_group``,
    and the nested tree is handed to ``MathElement.parse_token_tree``.
    As a side effect the nested token tree and the resulting MathML are
    printed to stdout.

    Args:
        latex: LaTeX source text.

    Returns:
        The root ``MathElement`` after it has parsed the token tree.
    """
    # Math-mode delimiters are dropped. Passing this list replaces the
    # tokenizer's default ignore list, so 'WS' tokens are kept here.
    skipped_types = ['MM_DMATH', 'MM_BILMATH', 'MM_EINMATH']
    token_stream = tokenizer(latex, skipped_types)

    root = MathElement()
    nested_tokens = _group(token_stream)
    pprint.pprint(nested_tokens)

    root.parse_token_tree(nested_tokens)
    mathml = root.to_mathml()
    print(etree.tostring(mathml, pretty_print=True, encoding='unicode'))
    return root
# Ordered token patterns. The alternatives are tried left to right at each
# position, so specific commands must come before the generic COM and SYMB
# fallbacks.
_TOKEN_PATTERNS = (
    r'(?P<NUM>[0-9]+\.?[0-9]*)',
    r'(?P<MM_B_ALIGN>\\begin\{align\*?\})',
    r'(?P<MM_E_ALIGN>\\end\{align\*?\})',
    r'(?P<MM_DMATH>\\\[|\$\$|\\\])',
    r'(?P<MM_BILMATH>^\$)',
    r'(?P<MM_EINMATH>\$$)',
    r'(?P<BEG>\\begin\{[^}]*\})',
    r'(?P<END>\\end\{[^}]*\})',
    r'(?P<NL>\\\\)',
    r'(?P<MP>\\[{}|])',
    r'(?P<TEXT>\\text\w*)',
    r'(?P<MATHFONT>\\math\w*\{[^}]*})',
    r'(?P<GB>\{)',
    r'(?P<GE>\})',
    r'(?P<ROOT>\\sqrt\[([^]]*)\])',
    r'(?P<SQRT>\\sqrt)',
    r'(?P<SUB>\_)',
    r'(?P<SUP>\^)',
    r'(?P<LEFT>\\left(?!\w))',
    r'(?P<RIGHT>\\right(?!\w))',
    r'(?P<FRAC>\\frac)',
    r'(?P<BIN>\\binom)',
    r'(?P<OVERUNDER>\\over[a-z]*|\\under[a-z]*)',
    r'(?P<COM>\\[A-Za-z]+|\\[,:;\s])',
    r'(?P<WS>\s+)',
    r'(?P<AMP>&)',
    r'(?P<SYMB>.)',
)

# Compiled once at import time instead of being re-joined on every call.
_TOKEN_REGEX = re.compile('|'.join(_TOKEN_PATTERNS))

# Record type of the yielded tokens; built once rather than once per call.
_LexToken = namedtuple('Token', ['type', 'value'])


def tokenizer(latex, ignore_types=('WS',)):
    """Lazily split a LaTeX string into ``(type, value)`` tokens.

    Matching is anchored at the current position and repeated until the end
    of the string or the first position where no pattern matches. ``WS``
    matches any whitespace and ``SYMB`` matches any character other than a
    newline, so every character is covered by some pattern.

    Args:
        latex: LaTeX source text.
        ignore_types: Container of token type names to drop from the output.
            Defaults to whitespace only. Passing a value replaces the
            default, it does not extend it.

    Yields:
        Named tuples with fields ``type`` (the name of the matching pattern
        group, e.g. ``'NUM'``, ``'FRAC'``, ``'SYMB'``) and ``value`` (the
        matched text).
    """
    scanner = _TOKEN_REGEX.scanner(latex)
    for match in iter(scanner.match, None):
        token_type = match.lastgroup
        if token_type in ignore_types:
            continue
        yield _LexToken(token_type, match.group())
def _group(tokens):
begin_group_tokens = ['GB', 'BEG', 'LEFT', 'MM_B_ALIGN']
end_group_tokens = ['GE', 'END', 'RIGHT', 'MM_E_ALIGN']
l_tokens = [t for t in tokens]
#print(l_tokens)
tokens = iter(l_tokens)
group = []
stack = []
tree = []
for i, token in enumerate(tokens):
if token.type in begin_group_tokens:
if len(stack) == 0:
if token.type != 'GB':
group.append(token)
if group:
#print('Group : ', group)
tree.extend(group)
group = []
else:
group.append(token)
stack.append(token)
elif token.type in end_group_tokens:
stack.pop()
group.append(token)
if group and len(stack) == 0:
#print('Group : ', group)
tree.append(_group(iter(group[:-1])))
group = []
if token.type != 'GE':
group.append(token)
else:
group.append(token)
if group:
tree.extend(group)
return tree