Skip to content
This repository has been archived by the owner on Jan 8, 2025. It is now read-only.

Commit

Permalink
Merge branch 'ml4ai:main' into main
Browse files Browse the repository at this point in the history
  • Loading branch information
Sorrento110 authored Jul 17, 2023
2 parents 561a414 + 02f01dd commit 0bcd945
Show file tree
Hide file tree
Showing 17 changed files with 560 additions and 74 deletions.
5 changes: 3 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,9 @@ dependencies=[
"tree-sitter",
"requests",
"beautifulsoup4", # used to remove comments etc from pMML before sending to MORAE
"typing_extensions==4.5.0", # see https://github.com/pydantic/pydantic/issues/5821#issuecomment-1559196859
"fastapi",
"typing_extensions", # see https://github.com/pydantic/pydantic/issues/5821#issuecomment-1559196859
"fastapi~=0.100.0",
"pydantic~=2.0.0",
"uvicorn",
"python-multipart"
]
Expand Down
2 changes: 1 addition & 1 deletion skema/img2mml/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
class LatexEquation(BaseModel):
tex_src: str = Field(title="LaTeX equation", description="The LaTeX equation to process")
class Config:
schema_extra = {
json_schema_extra = {
"example": {
"tex_src": "\\frac{\\partial x}{\\partial t} = {\\alpha x} - {\\beta x y}",
},
Expand Down
97 changes: 90 additions & 7 deletions skema/rest/integrated_text_reading_proxy.py
Original file line number Diff line number Diff line change
Expand Up @@ -236,7 +236,10 @@ def parquet_to_json(path):


def cosmos_client(name: str, data: BinaryIO):
""" Posts a pdf to COSMOS and returns the JSON representation of the parquet file """
"""
Posts a pdf to COSMOS and returns the JSON representation of the parquet file
"""

# Create POST request to COSMOS server
# Prep the pdf data for upload
Expand Down Expand Up @@ -404,6 +407,21 @@ async def integrated_text_extractions(
annotate_skema: bool = True,
annotate_mit: bool = True,
) -> TextReadingAnnotationsOutput:
"""
### Python example
```
params = {
"annotate_skema":True,
"annotate_mit": True
}
files = [("pdfs", ("paper.txt", open("paper.txt", "rb")))]
    response = requests.post(f"{URL}/text-reading/integrated-text-extractions", params=params, files=files)
if response.status_code == 200:
data = response.json()
```
"""
# Get the input plain texts
texts = texts.texts

Expand All @@ -429,6 +447,22 @@ async def integrated_pdf_extractions(
annotate_skema: bool = True,
annotate_mit: bool = True
) -> TextReadingAnnotationsOutput:
"""
### Python example
```
params = {
"annotate_skema":True,
"annotate_mit": True
}
files = [("pdfs", ("ijerp.pdf", open("ijerp.pdf", "rb")))]
    response = requests.post(f"{URL}/text-reading/integrated-pdf-extractions", params=params, files=files)
if response.status_code == 200:
data = response.json()
```
"""
# TODO: Make this handle multiple pdf files in parallel
# Call COSMOS on the pdfs
cosmos_data = list()
Expand All @@ -453,10 +487,19 @@ async def integrated_pdf_extractions(
@router.post(
"/cosmos_to_json",
status_code=200,
description="Calls COSMOS on a pdf and converts the data into json"
)
async def cosmos_to_json(pdf: UploadFile) -> List[Dict]:
""" Calls COSMOS on a pdf and converts the data into json """
""" Calls COSMOS on a pdf and converts the data into json
### Python example
```
response = requests.post(f"{endpoint}/text-reading/cosmos_to_json",
files=[
("pdf", ("ijerp.pdf", open("ijerph-18-09027.pdf", 'rb')))
]
)
```
"""
return cosmos_client(pdf.filename, pdf.file)


Expand All @@ -467,7 +510,18 @@ async def cosmos_to_json(pdf: UploadFile) -> List[Dict]:
)
async def ground_to_mira(k: int, queries: MiraGroundingInputs, response: Response) -> List[
List[MiraGroundingOutputItem]]:
""" Proxy to the MIRA grounding functionality on the SKEMA TR service """
""" Proxy to the MIRA grounding functionality on the SKEMA TR service
### Python example
```
    queries = {"queries": ["infected", "susceptible"]}
params = {"k": 5}
response = requests.post(f"{endpoint}/text-reading/ground_to_mira", params=params, json=queries)
if response.status_code == 200:
results = response.json()
```
"""
params = {
"k": k
}
Expand All @@ -488,7 +542,18 @@ async def ground_to_mira(k: int, queries: MiraGroundingInputs, response: Respons

@router.post("/cards/get_model_card")
async def get_model_card(text_file: UploadFile, code_file: UploadFile, response: Response):
""" Calls the model card endpoint from MIT's pipeline """
""" Calls the model card endpoint from MIT's pipeline
### Python example
```
files = {
"text_file": ('text_file.txt", open("text_file.txt", 'rb')),
"code_file": ('code_file.py", open("code_file.py", 'rb')),
}
response = requests.post(f"{endpoint}/text-reading/cards/get_model_card", files=files)
```
"""

params = {
"gpt_key": OPENAI_KEY,
Expand All @@ -504,11 +569,29 @@ async def get_model_card(text_file: UploadFile, code_file: UploadFile, response:
return inner_response.json()

@router.post("/cards/get_data_card")
async def get_model_card(csv_file: UploadFile, doc_file: UploadFile, response: Response):
""" Calls the data card endpoint from MIT's pipeline """
async def get_data_card(smart:bool, csv_file: UploadFile, doc_file: UploadFile, response: Response):
"""
Calls the data card endpoint from MIT's pipeline.
Smart run provides better results but may result in slow response times as a consequence of extra GPT calls.
### Python example
```
params = {
"smart": False
}
files = {
"csv_file": ('csv_file.csv", open("csv_file.csv", 'rb')),
"doc_file": ('doc_file.txt", open("doc_file.txt", 'rb')),
}
    response = requests.post(f"{endpoint}/text-reading/cards/get_data_card", params=params, files=files)
```
"""

params = {
"gpt_key": OPENAI_KEY,
"smart": smart
}
files = {
"csv_file": (csv_file.filename, csv_file.file, "text/csv"),
Expand Down
19 changes: 18 additions & 1 deletion skema/rest/metal_proxy.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,24 @@ def link_amr(amr_type: str,
similarity_threshold: float = 0.5,
amr_file: UploadFile = File(...),
text_extractions_file: UploadFile = File(...)):
""" Links an AMR to a text extractions file """
""" Links an AMR to a text extractions file
### Python example
```
params = {
"amr_type": "petrinet"
}
files = {
"amr_file": ("amr.json", open("amr.json"), "application/json"),
"text_extractions_file": ("extractions.json", open("extractions.json"), "application/json")
}
response = requests.post(f"{ENDPOINT}/metal/link_amr", params=params, files=files)
if response.status_code == 200:
enriched_amr = response.json()
```
"""

# Load the AMR
amr = json.load(amr_file.file)
Expand Down
2 changes: 1 addition & 1 deletion skema/rest/proxies.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
)

# Text Reading services
MIT_TR_ADDRESS = os.environ.get("MIT_TR_ADDRESS", "https://mit-tr.askem.lum.ai")
MIT_TR_ADDRESS = os.environ.get("MIT_TR_ADDRESS", "http://100.26.10.46")
SKEMA_TR_ADDRESS = os.environ.get("SKEMA_TR_ADDRESS", "http://hopper.sista.arizona.edu")
OPENAI_KEY = os.environ.get("OPENAI_KEY", "YOU_FORGOT_TO_SET_OPENAI_KEY")
COSMOS_ADDRESS = os.environ.get("COSMOS_ADDRESS", "http://cosmos0002.chtc.wisc.edu:8089")
73 changes: 53 additions & 20 deletions skema/rest/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

from askem_extractions.data_model import AttributeCollection
from pydantic import BaseModel, Field

# see https://github.com/pydantic/pydantic/issues/5821#issuecomment-1559196859
from typing_extensions import Literal

Expand Down Expand Up @@ -42,14 +43,36 @@ class HealthStatus(BaseModel):
class EquationImagesToAMR(BaseModel):
# FIXME: will this work or do we need base64?
images: List[eqn2mml_schema.ImageBytes]
model: Literal["regnet", "petrinet"] = Field(description="The model type")
model: Literal["regnet", "petrinet"] = Field(
description="The model type", example="petrinet"
)


class EquationLatexToAMR(BaseModel):
equations: List[str] = Field(description="Equations in LaTeX",
example=["\\frac{\\partial x}{\\partial t} = {\\alpha x} - {\\beta x y}",
"\\frac{\\partial y}{\\partial t} = {\\alpha x y} - {\\gamma y}"])
model: Literal["regnet", "petrinet"] = Field(description="The model type", example="regnet")
equations: List[str] = Field(
description="Equations in LaTeX",
example=[
r"\frac{\partial x}{\partial t} = {\alpha x} - {\beta x y}",
r"\frac{\partial y}{\partial t} = {\alpha x y} - {\gamma y}",
],
)
model: Literal["regnet", "petrinet"] = Field(
description="The model type", example="regnet"
)


class MmlToAMR(BaseModel):
equations: List[str] = Field(
description="Equations in pMML",
example=[
"<math><mfrac><mrow><mi>d</mi><mi>Susceptible</mi></mrow><mrow><mi>d</mi><mi>t</mi></mrow></mfrac><mo>=</mo><mo>−</mo><mi>Infection</mi><mi>Infected</mi><mi>Susceptible</mi></math>",
"<math><mfrac><mrow><mi>d</mi><mi>Infected</mi></mrow><mrow><mi>d</mi><mi>t</mi></mrow></mfrac><mo>=</mo><mo>−</mo><mi>Recovery</mi><mi>Infected</mi><mo>+</mo><mi>Infection</mi><mi>Infected</mi><mi>Susceptible</mi></math>",
"<math><mfrac><mrow><mi>d</mi><mi>Recovered</mi></mrow><mrow><mi>d</mi><mi>t</mi></mrow></mfrac><mo>=</mo><mi>Recovery</mi><mi>Infected</mi></math>",
],
)
model: Literal["regnet", "petrinet"] = Field(
description="The model type", example="petrinet"
)


class CodeSnippet(BaseModel):
Expand All @@ -74,29 +97,28 @@ class MiraGroundingInputs(BaseModel):

class MiraGroundingOutputItem(BaseModel):
class MiraDKGConcept(BaseModel):
id: str = Field(
description="DKG element id",
example="apollosv:00000233"
)
id: str = Field(description="DKG element id", example="apollosv:00000233")
name: str = Field(
description="Canonical name of the concept",
example="infected population"
description="Canonical name of the concept", example="infected population"
)
description: Optional[str] = Field(
description="Long winded description of the concept",
example="A population of only infected members of one species."
example="A population of only infected members of one species.",
)
synonyms: List[str] = Field(
description="Any alternative name to the cannonical one for the concept",
example=["Ill individuals", "The sick and ailing"]
example=[["Ill individuals", "The sick and ailing"]],
)
embedding: List[float] = Field(
description="Word embedding of the underlying model for the concept"
)

def __hash__(self):
return hash(tuple([self.id, tuple(self.synonyms), tuple(self.embedding)]))

score: float = Field(
description="Cosine similarity of the embedding representation of the input with that of the DKG element",
example=0.7896
example=0.7896,
)
groundingConcept: MiraDKGConcept = Field(
description="DKG concept associated to the query",
Expand All @@ -109,8 +131,8 @@ class MiraDKGConcept(BaseModel):
0.01590670458972454,
0.03795482963323593,
-0.08787763118743896,
]
)
],
),
)


Expand All @@ -136,6 +158,9 @@ class TextReadingError(BaseModel):
example="Out of memory error",
)

def __hash__(self):
return hash(f"{self.pipeline}-{self.message}")


class TextReadingDocumentResults(BaseModel):
data: Optional[AttributeCollection] = Field(
Expand All @@ -149,24 +174,32 @@ class TextReadingDocumentResults(BaseModel):
example=[TextReadingError(pipeline="MIT", message="Unauthorized API key")],
)

def __hash__(self):
return hash(
tuple([self.data, "NONE" if self.errors is None else tuple(self.errors)])
)


class TextReadingAnnotationsOutput(BaseModel):
"""Contains the TR document results for all the documents submitted for annotation"""

outputs: List[TextReadingDocumentResults] = Field(
name="outputs",
description="Contains the results of TR annotations for each input document. There is one entry per input and "
"inputs and outputs are matched by the same index in the list",
"inputs and outputs are matched by the same index in the list",
example=[
TextReadingDocumentResults(data=AttributeCollection(attributes=[])),
TextReadingDocumentResults(
errors=[TextReadingError(pipeline="SKEMA", message="Dummy error")]
data=AttributeCollection(attributes=[]), errors=None
),
TextReadingDocumentResults(
data=AttributeCollection(attributes=[]),
errors=[TextReadingError(pipeline="SKEMA", message="Dummy error")],
),
],
)

generalized_errors: Optional[List[TextReadingError]] = Field(
name="generalized_errors",
description="Any pipeline-wide errors, not specific to a particular input",
example=[TextReadingError(pipeline="MIT", message="API quota exceeded")]
example=[TextReadingError(pipeline="MIT", message="API quota exceeded")],
)
15 changes: 15 additions & 0 deletions skema/rest/workflows.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,21 @@ async def equations_to_amr(data: schema.EquationLatexToAMR):
)
return res.json()

# pmml -> amr
@router.post("/pmml/equations-to-amr", summary="Equations pMML → AMR")
async def equations_to_amr(data: schema.MmlToAMR):

payload = {"mathml": data.equations, "model": data.model}
res = requests.put(f"{SKEMA_RS_ADDESS}/mathml/amr", json=payload)
if res.status_code != 200:
return JSONResponse(
status_code=400,
content={
"error": f"MORAE PUT /mathml/amr failed to process payload",
"payload": payload,
},
)
return res.json()

# code snippets -> fn -> petrinet amr
@router.post("/code/snippets-to-pn-amr", summary="Code snippets → PetriNet AMR")
Expand Down
Loading

0 comments on commit 0bcd945

Please sign in to comment.