Skip to content
This repository has been archived by the owner on Jan 8, 2025. It is now read-only.

Commit

Permalink
Merge branch 'ml4ai:main' into main
Browse files Browse the repository at this point in the history
  • Loading branch information
bigglesandginger authored Apr 23, 2024
2 parents 5e5787f + 6cda598 commit 22fe06d
Show file tree
Hide file tree
Showing 48 changed files with 845 additions and 283 deletions.
91 changes: 0 additions & 91 deletions .drone.yml

This file was deleted.

7 changes: 5 additions & 2 deletions .github/workflows/tests-and-docs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -81,14 +81,17 @@ jobs:
working-directory: .
run: |
# retrieve latest model for img2mml component
curl -L https://artifacts.askem.lum.ai/skema/img2mml/models/cnn_xfmer_arxiv_im2mml_with_fonts_boldface_best.pt > skema/img2mml/trained_models/cnn_xfmer_arxiv_im2mml_with_fonts_boldface_best.pt
pip install huggingface_hub
python scripts/retrieve_model_ci.py
# Install askem
pip install ".[all]"
# Install tree-sitter parser (for Python component unit tests)
- name: Install tree-sitter parsers
working-directory: .
run: python skema/program_analysis/tree_sitter_parsers/build_parsers.py --ci --all


# docs (API)
# generate python docs using pdoc
- name: "Create documentation for Python components (API docs)"
Expand Down
2 changes: 1 addition & 1 deletion Dockerfile.skema-py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ ENV PATH="/root/.cargo/bin:${PATH}"
RUN pip install wheel
RUN pip install six
# Download ML model (~150MB)
RUN curl -L https://artifacts.askem.lum.ai/skema/img2mml/models/cnn_xfmer_arxiv_im2mml_with_fonts_boldface_best.pt > skema/img2mml/trained_models/cnn_xfmer_arxiv_im2mml_with_fonts_boldface_best.pt
RUN pip install huggingface_hub && python scripts/retrieve_model_ci.py
RUN tree /app
#RUN pip install ".[all]"
# exclude dependencies for docs
Expand Down
2 changes: 1 addition & 1 deletion docs/dev/env.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ conda activate skema
# Install tree-sitter parsers
python skema/program_analysis/tree_sitter_parsers/build_parsers.py --all
# download the checkpoint for the img2mml service
curl -L https://artifacts.askem.lum.ai/skema/img2mml/models/cnn_xfmer_arxiv_im2mml_with_fonts_boldface_best.pt > skema/img2mml/trained_models/cnn_xfmer_arxiv_im2mml_with_fonts_boldface_best.pt
python scripts/retrieve_model_ci.py
# mathjax deps for img2mml
(cd skema/img2mml/data_generation && npm install)
```
Expand Down
11 changes: 6 additions & 5 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,19 +15,19 @@ dependencies=[
"numpy",
"dill==0.3.7",
"networkx==2.8.8",
"PyYAML",
"PyYAML==6.*",
"tree-sitter==0.20.4",
"neo4j==5.14.1",
"requests",
"beautifulsoup4", # used to remove comments etc from pMML before sending to MORAE
"beautifulsoup4==4.12.*", # used to remove comments etc from pMML before sending to MORAE
"typing_extensions", # see https://github.com/pydantic/pydantic/issues/5821#issuecomment-1559196859
"fastapi~=0.100.0",
"starlette",
"httpx",
"pydantic>=2.0.0",
"uvicorn",
"python-multipart",
"func_timeout"
"func_timeout==4.3.5"
]
# The Python program analysis pipeline does not currently work with Python 3.9
# or 3.10. This may change in the future.
Expand Down Expand Up @@ -57,7 +57,7 @@ isa = [
]

# shared ML dependencies
ml = ["torch==2.0.1", "torchvision==0.15.2", "beartype==0.15.0"]
ml = ["torch==2.0.1", "torchvision==0.15.2", "beartype==0.15.0", "huggingface_hub"]

# Im2MML dependencies. The img2mml service converts equation images to MathML.
# See the skema/img2mml directory.
Expand Down Expand Up @@ -100,6 +100,7 @@ all = ["skema[core]", "skema[dev]", "skema[doc]", "skema[demo]", "skema[annotati
"skema.rest" = "skema/rest"
"skema.skema_py" = "skema/skema_py"
"skema.utils" = "skema/utils"
"skema.data" = "skema/data"

# re-map skema/text_reading/python to skema.text_reading
#"skema.text_reading" = "skema/text_reading/python"
Expand All @@ -110,7 +111,7 @@ all = ["skema[core]", "skema[dev]", "skema[doc]", "skema[demo]", "skema[annotati

[tool.setuptools.package-data]
# needed to ensure models are included in package/discoverable
"*" = ["*.json", "vocab.txt", "*.pt", "*.png", "*.html", "*.yml", "*.yaml"]
"*" = ["*.json", "*vocab.txt", "*.pt", "*.png", "*.html", "*.yml", "*.yaml", "*.zip"]

[tool.setuptools.dynamic]
readme = {file = ["README.md"], content-type = "text/markdown"}
Expand Down
31 changes: 31 additions & 0 deletions scripts/retrieve_model_ci.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import os
from pathlib import Path

from huggingface_hub import hf_hub_download

def retrieve_model(model_path=None) -> str:
    """
    Return the path to the img2mml checkpoint, downloading it if it is missing.

    Args:
        model_path (str, optional): Path to the img2mml model file. If it is
            None or does not exist, the default checkpoint location is used
            instead. Defaults to None.

    Returns:
        str: Path to the model file.
    """
    REPO_NAME = "lum-ai/img2mml"
    MODEL_NAME = "cnn_xfmer_arxiv_im2mml_with_fonts_boldface_best.pt"

    # A caller-supplied checkpoint that already exists is used as-is.
    if model_path is not None and os.path.exists(model_path):
        return str(model_path)

    # This script lives in <repo>/scripts, while the CI workflow, Dockerfile
    # and README all place the checkpoint in
    # <repo>/skema/img2mml/trained_models. Resolve the default location
    # relative to the repository root rather than to the scripts directory.
    repo_root = Path(__file__).resolve().parents[1]
    default_path = repo_root / "skema" / "img2mml" / "trained_models" / MODEL_NAME

    # Only hit the network when the default checkpoint is not already on disk.
    if not default_path.exists():
        print("Downloading the model checkpoint from HuggingFace...")
        hf_hub_download(
            repo_id=REPO_NAME,
            filename=MODEL_NAME,
            local_dir=default_path.parent,
            local_dir_use_symlinks=False,
        )

    return str(default_path)


if __name__ == "__main__":
    # Run the download only when executed as a script, not on import.
    retrieve_model()
Binary file added skema/data/program_analysis/ABM-COVID-ABS.zip
Binary file not shown.
Binary file added skema/data/program_analysis/ABM-COmplexVID-19.zip
Binary file not shown.
Binary file added skema/data/program_analysis/ABM-Covasim.zip
Binary file not shown.
Binary file added skema/data/program_analysis/ABM-REINA.zip
Binary file not shown.
Binary file added skema/data/program_analysis/Bucky.zip
Binary file not shown.
Binary file added skema/data/program_analysis/CHIME-SIR-model.zip
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file added skema/data/program_analysis/Climlab.zip
Binary file not shown.
Binary file not shown.
Binary file added skema/data/program_analysis/Generated-Halfar.zip
Binary file not shown.
Binary file added skema/data/program_analysis/MechBayes.zip
Binary file not shown.
Binary file added skema/data/program_analysis/SIDARTHE.zip
Binary file not shown.
Binary file not shown.
Binary file added skema/data/program_analysis/Simple-SIR.zip
Binary file not shown.
Binary file added skema/data/program_analysis/TIE-GCM.zip
Binary file not shown.
3 changes: 3 additions & 0 deletions skema/data/program_analysis/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from pathlib import Path

# Directory that holds this package, i.e. where the bundled *.zip model
# archives sit next to this __init__.py.
MODEL_ZIP_ROOT_PATH = Path(__file__).parents[0]
Binary file added skema/data/program_analysis/cism_v3.zip
Binary file not shown.
Binary file added skema/data/program_analysis/climlab-v2.zip
Binary file not shown.
Binary file added skema/data/program_analysis/code_sir.zip
Binary file not shown.
Binary file added skema/data/program_analysis/examples_python.zip
Binary file not shown.
11 changes: 5 additions & 6 deletions skema/img2mml/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,24 +12,23 @@ service was developed by Deepsana Shahi, Adarsh Pyarelal and Liang Zhang.

The model itself is not checked into the repository, but you can get it from
here:
https://artifacts.askem.lum.ai/skema/img2mml/models/cnn_xfmer_arxiv_im2mml_with_fonts_boldface_best.pt
https://huggingface.co/lum-ai/img2mml/blob/main/cnn_xfmer_arxiv_im2mml_with_fonts_boldface_best.pt

Place the model file in the `trained_models` directory.

The curl command below should do the trick.
The Python command below should do the trick.

```
curl -L https://artifacts.askem.lum.ai/skema/img2mml/models/cnn_xfmer_arxiv_im2mml_with_fonts_boldface_best.pt > trained_models/cnn_xfmer_arxiv_im2mml_with_fonts_boldface_best.pt
python ../../scripts/retrieve_model_ci.py
```

If you have the checkpoint in the `trained_models` directory already and hope to update it, please run the above curl command that will replace the previous one.
If a checkpoint already exists in the `trained_models` directory, the Python command above will not download it again. To update the checkpoint, delete the existing file first and then re-run the command.

To update the model name or path, please make the following modifications to support updating the img2mml service and the corresponding Docker operations:

1. Modify the ENV variable of `SKEMA_IMG2MML_MODEL_PATH`.
2. Update the path settings in the "retrieve latest model for img2mml component" section of `skema/.github/workflows/tests-and-docs.yml`.
3. Adjust the curl command in the test_equation_reading section of `skema/.drone.yml` to download the checkpoint.
4. Update the download checkpoint path in `skema/img2mml/README.md`.
3. Update the download checkpoint path in `skema/img2mml/README.md`.

These changes will ensure that the necessary files and paths are updated correctly.

Expand Down
10 changes: 5 additions & 5 deletions skema/img2mml/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from PIL import Image
from io import BytesIO

from huggingface_hub import hf_hub_download

def retrieve_model(model_path=None) -> str:
"""
Expand All @@ -25,7 +26,7 @@ def retrieve_model(model_path=None) -> str:
str: Path to the loaded model file.
"""
cwd = Path(__file__).parents[0]
MODEL_BASE_ADDRESS = "https://artifacts.askem.lum.ai/skema/img2mml/models"
REPO_NAME = "lum-ai/img2mml"
MODEL_NAME = "cnn_xfmer_arxiv_im2mml_with_fonts_boldface_best.pt"
# If the model path is none or doesn't exist, the default model will be downloaded from server.
if model_path is None or not os.path.exists(model_path):
Expand All @@ -34,10 +35,9 @@ def retrieve_model(model_path=None) -> str:
# Check if the model file already exists
if not os.path.exists(model_path):
# If the file doesn't exist, download it from the specified URL
url = f"{MODEL_BASE_ADDRESS}/{MODEL_NAME}"
print(f"Downloading the model checkpoint from {url}...")
urllib.request.urlretrieve(url, model_path)

print(f"Downloading the model checkpoint from HuggingFace...")
hf_hub_download(repo_id=REPO_NAME, filename=MODEL_NAME, local_dir=model_path.parent, local_dir_use_symlinks=False)

return str(model_path)


Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
import pytest
from tempfile import TemporaryDirectory
from pathlib import Path

from skema.program_analysis.CAST.fortran.ts2cast import TS2CAST
from skema.program_analysis.CAST2FN.model.cast import (
Assignment,
Var,
Name,
CASTLiteralValue,
ModelIf,
Operator,
ScalarType
)

def cond_compound1():
    """Return Fortran source whose `if` condition joins two comparisons with `.and.`."""
    fortran_source = """
program cond_compound1
integer :: a = 3
if (a .gt. 1 .and. a .lt. 10) then
a = 40
end if
end program cond_compound1
"""
    return fortran_source


def generate_cast(test_file_string):
    """Write the given source to a temporary `source.f95` file, run TS2CAST on
    it, and return the first element of the resulting `out_cast`."""
    with TemporaryDirectory() as tmp_dir:
        fortran_file = Path(tmp_dir, "source.f95")
        fortran_file.write_text(test_file_string)
        translator = TS2CAST(str(fortran_file))
        casts = translator.out_cast

    first_cast = casts[0]
    return first_cast

def test_cond_compound1():
    """Check the CAST produced for an `if` whose condition is `a .gt. 1 .and. a .lt. 10`."""
    cast = generate_cast(cond_compound1())
    program_body = cast.nodes[0].body

    # First statement: `integer :: a = 3` -> Assignment of literal '3' to `a`.
    init_assign = program_body[0]
    assert isinstance(init_assign, Assignment)
    assert isinstance(init_assign.left, Var)
    init_target = init_assign.left.val
    assert isinstance(init_target, Name)
    assert init_target.name == "a"
    assert init_target.id == 0

    init_value = init_assign.right
    assert isinstance(init_value, CASTLiteralValue)
    assert init_value.value_type == ScalarType.INTEGER
    assert init_value.value == '3'

    # Second statement: the `if`. Its condition is itself a ModelIf that
    # encodes the `.and.` of the two comparisons.
    outer_if = program_body[1]
    and_if = outer_if.expr
    assert isinstance(outer_if, ModelIf)
    assert isinstance(and_if, ModelIf)

    # Condition of the inner ModelIf: `a .gt. 1`.
    gt_cmp = and_if.expr
    assert isinstance(gt_cmp, Operator)
    assert gt_cmp.op == ".gt."
    assert len(gt_cmp.operands) == 2
    gt_rhs = gt_cmp.operands[1]
    assert isinstance(gt_rhs, CASTLiteralValue)
    assert gt_rhs.value_type == ScalarType.INTEGER
    assert gt_rhs.value == "1"

    gt_lhs = gt_cmp.operands[0]
    assert isinstance(gt_lhs, Name)
    assert gt_lhs.name == "a"
    assert gt_lhs.id == 0

    # Body of the inner ModelIf: `a .lt. 10`.
    assert len(and_if.body) == 1
    lt_cmp = and_if.body[0]
    assert isinstance(lt_cmp, Operator)
    assert lt_cmp.op == ".lt."
    assert len(lt_cmp.operands) == 2
    lt_lhs = lt_cmp.operands[0]
    assert isinstance(lt_lhs, Name)
    assert lt_lhs.name == "a"
    assert lt_lhs.id == 0

    lt_rhs = lt_cmp.operands[1]
    assert isinstance(lt_rhs, CASTLiteralValue)
    assert lt_rhs.value_type == ScalarType.INTEGER
    assert lt_rhs.value == "10"

    # Else-branch of the inner ModelIf: a boolean literal False.
    assert len(and_if.orelse) == 1
    false_branch = and_if.orelse[0]
    assert isinstance(false_branch, CASTLiteralValue)
    assert false_branch.value_type == ScalarType.BOOLEAN
    assert false_branch.value == False

    # Body of the outer `if`: the single assignment `a = 40`.
    then_body = outer_if.body
    assert len(then_body) == 1
    then_assign = then_body[0]
    assert isinstance(then_assign, Assignment)
    assert isinstance(then_assign.left, Var)
    assert then_assign.left.val.name == "a"
    assert then_assign.left.val.id == 0

    then_value = then_assign.right
    assert isinstance(then_value, CASTLiteralValue)
    assert then_value.value_type == ScalarType.INTEGER
    assert then_value.value == '40'

    # The source has no `else`, so the outer orelse is empty.
    outer_else = outer_if.orelse
    assert len(outer_else) == 0
Loading

0 comments on commit 22fe06d

Please sign in to comment.