Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

code clean up on backend ec2, push up to end of SP23 #51

Open
wants to merge 7 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@ __pycache__/
ml-model/formula_images/
ml-model/output/
ml-model/crop_formula_images/

ml-model/yolov5/config.py
.vscode/

output.zip
*.zip
Expand All @@ -19,6 +20,7 @@ data/
simese_data/
venv/
ml-model/model.pt
ml-model/yolov5/preprocess_data/
training_data/
*.png
im2latex/
Expand All @@ -29,3 +31,4 @@ venv/
ml-model/paths_output.csv

ml-model/web/__pycache__/
datasets/
22 changes: 22 additions & 0 deletions ml-model/model/image_to_latex.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
from rapid_latex_ocr import LatexOCR

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you delete these files from the PR? Lots of people have made changes to these files, and including them here will screw things up.

# Paths to the ONNX model components and the tokenizer file used by LatexOCR.
image_resizer_path = 'models/image_resizer.onnx'
encoder_path = 'models/encoder.onnx'
decoder_path = 'models/decoder.onnx'
tokenizer_json = 'models/tokenizer.json'

model = LatexOCR(
    image_resizer_path=image_resizer_path,
    encoder_path=encoder_path,
    decoder_path=decoder_path,
    tokenizer_json=tokenizer_json,
)

# The model is called with the raw bytes of the image file.
img_path = "tests/test_files/6.png"
with open(img_path, "rb") as f:
    data = f.read()

# Calling the model yields a pair: the recognised result and an elapsed value.
result, elapse = model(data)

print(result)
# Sample output: {\frac{x^{2}}{a^{2}}}-{\frac{y^{2}}{b^{2}}}=1

print(elapse)
# Sample output: 0.4131628000000003
93 changes: 93 additions & 0 deletions ml-model/model/latex_to_tree.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
import sympy as sp
from sympy.parsing.latex import parse_latex
from zss import Node, distance
import networkx as nx
import matplotlib.pyplot as plt


Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Delete this file as well

def zss_to_nx(node, graph=None, parent=None):
    """Copy a ZSS-style tree into a graph, one graph node per tree node.

    Every tree node is keyed by ``id(node)`` and stores its ``label`` as a
    node attribute; an edge is added from each parent to each of its children.

    Args:
        node: Root of the (sub)tree; must expose ``label`` and ``children``.
        graph: Graph to populate. A new ``nx.DiGraph`` is created when omitted.
        parent: Tree node that ``node`` hangs from, or ``None`` if ``node``
            should get no incoming edge.

    Returns:
        The populated graph (the same object as ``graph`` when one was given).
    """
    if graph is None:
        graph = nx.DiGraph()

    # Explicit stack instead of recursion. Children are pushed in reverse so
    # that nodes and edges are still added in left-to-right pre-order.
    pending = [(node, parent)]
    while pending:
        current, owner = pending.pop()
        graph.add_node(id(current), label=current.label)
        if owner is not None:
            graph.add_edge(id(owner), id(current))
        pending.extend((kid, current) for kid in reversed(current.children))
    return graph


# Two sample LaTeX expressions to compare.
# latex_expr1 stands in for the query expression.
latex_expr1 = r"\nabla J(\theta) = \frac{1}{m} \sum_{i=1}^m (h_\theta(x^{(i)}) - y^{(i)}) x^{(i)}"
# latex_expr2 stands in for the expression OCR'd from the file.
latex_expr2 = r"\nabla J(\Theta) = \frac{1}{m} \sum_{i=1}^m (h_\theta(z^{(i)}) - y^{(i)}) z^{(i)}"

# Parse both LaTeX strings into SymPy expressions (query first, then OCR).
sympy_expr1, sympy_expr2 = parse_latex(latex_expr1), parse_latex(latex_expr2)


def sympy_to_zss(expr):
    """Convert a SymPy expression into a tree of ``zss.Node`` objects.

    Args:
        expr: A SymPy expression.

    Returns:
        The root ``Node``. Symbols and numbers become leaves labelled with
        ``str(expr)``; every other expression becomes a node labelled with
        ``str(expr.func)`` whose children are its converted ``args``, in order.
    """
    # Leaves: symbols and numbers carry no sub-expressions worth expanding.
    if isinstance(expr, (sp.Symbol, sp.Number)):
        return Node(str(expr))

    # Interior node: label by the expression head, then convert each operand.
    root = Node(str(expr.func))
    for operand in expr.args:
        root.addkid(sympy_to_zss(operand))
    return root


# Turn each SymPy expression into a ZSS tree (query first, then OCR).
zss_tree1, zss_tree2 = sympy_to_zss(sympy_expr1), sympy_to_zss(sympy_expr2)
# Uncomment to inspect the trees:
# print(zss_tree1)
# print(zss_tree2)

# Mirror both ZSS trees as networkx graphs.
nx_tree1, nx_tree2 = zss_to_nx(zss_tree1), zss_to_nx(zss_tree2)


def hierarchy_pos(G, root=None, width=1., vert_gap=0.2, vert_loc=0, xcenter=0.5):
    """Compute a top-down tree layout for ``G`` starting at ``root``.

    Thin public wrapper that forwards every argument to ``_hierarchy_pos``.

    Args:
        G: Graph to lay out.
        root: Node placed at the top of the layout.
        width: Horizontal space allotted to the root's subtree.
        vert_gap: Vertical distance between consecutive levels.
        vert_loc: Vertical coordinate of the root.
        xcenter: Horizontal coordinate of the root.

    Returns:
        Whatever ``_hierarchy_pos`` returns: a mapping of node to ``(x, y)``.
    """
    return _hierarchy_pos(
        G, root,
        width=width, vert_gap=vert_gap, vert_loc=vert_loc, xcenter=xcenter,
    )


def _hierarchy_pos(G, root, width=1., vert_gap=0.2, vert_loc=0, xcenter=0.5, pos=None, parent=None, parsed=None):
    """Recursively assign ``(x, y)`` positions for a top-down tree layout.

    ``root`` is placed at ``(xcenter, vert_loc)``. Its children split
    ``width`` into equal slices centred on ``xcenter``, each child sits in the
    middle of its slice one ``vert_gap`` lower, and the procedure repeats
    inside every slice.

    Args:
        G: Graph to lay out; only ``G.neighbors`` is used.
        root: Node whose subtree is being positioned.
        width: Horizontal space allotted to this subtree.
        vert_gap: Vertical distance between a node and its children.
        vert_loc: Vertical coordinate of ``root``.
        xcenter: Horizontal coordinate of ``root``.
        pos: Mapping filled in so far; a fresh dict is created when ``None``.
        parent: Node this call was reached from, or ``None`` at the top.
        parsed: Never read; only forwarded to recursive calls. Kept for
            signature compatibility. Defaults to ``None`` rather than a
            mutable ``[]`` so no list is shared between calls.

    Returns:
        The ``pos`` mapping of node to ``(x, y)``, including the subtree.
    """
    if pos is None:
        pos = {}
    pos[root] = (xcenter, vert_loc)

    children = list(G.neighbors(root))
    # For graphs that are not DiGraphs the node we came from is listed among
    # the neighbours; drop it so the walk does not go back up.
    if not isinstance(G, nx.DiGraph) and parent is not None:
        children.remove(parent)

    if children:
        dx = width / len(children)
        # Start half a slice left of the subtree's left edge so the first
        # increment lands on the centre of the first slice.
        nextx = xcenter - width / 2 - dx / 2
        for child in children:
            nextx += dx
            pos = _hierarchy_pos(G, child, width=dx, vert_gap=vert_gap,
                                 vert_loc=vert_loc - vert_gap, xcenter=nextx,
                                 pos=pos, parent=root, parsed=parsed)
    return pos


def draw_tree(tree):
    """Draw ``tree`` with matplotlib using the hierarchical layout.

    Args:
        tree: Graph whose nodes carry a ``'label'`` attribute. The first node
            in ``tree.nodes()`` is used as the root of the layout.
    """
    # The first node listed by the graph is taken to be the root.
    root = list(tree.nodes())[0]
    layout = hierarchy_pos(tree, root=root)
    node_labels = nx.get_node_attributes(tree, 'label')
    nx.draw(
        tree, layout,
        labels=node_labels, with_labels=True,
        node_size=3000, node_color='lightblue', font_size=10,
    )
    plt.show()


# Draw the trees (uncomment to visualise them)
# draw_tree(nx_tree1)
# draw_tree(nx_tree2)

# Compare the ZSS trees.
# Inserting or removing a node costs 10; the update cost is a constant 1.
# NOTE(review): the update cost is 1 even when both labels are equal, so
# matched nodes still add to the total. The original comment ("make update
# non-zero to see difference in more updated tree vs. not") suggests this is
# deliberate; confirm before relying on the absolute value.
# The result is stored under its own name so the imported ``distance``
# function is not overwritten and can still be called afterwards.
tree_distance = distance(
    zss_tree1, zss_tree2, get_children=Node.get_children,
    insert_cost=lambda node: 10,
    remove_cost=lambda node: 10,
    update_cost=lambda a, b: 1,
)
print(tree_distance)  # Output the tree edit distance
Loading
Loading