Skip to content
This repository has been archived by the owner on Jan 8, 2025. It is now read-only.

Commit

Permalink
Merge branch 'ml4ai:main' into main
Browse files Browse the repository at this point in the history
  • Loading branch information
Sorrento110 authored Jul 17, 2023
2 parents 561a414 + 02f01dd commit 0bcd945
Show file tree
Hide file tree
Showing 17 changed files with 560 additions and 74 deletions.
5 changes: 3 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,9 @@ dependencies=[
"tree-sitter",
"requests",
"beautifulsoup4", # used to remove comments etc from pMML before sending to MORAE
"typing_extensions==4.5.0", # see https://github.com/pydantic/pydantic/issues/5821#issuecomment-1559196859
"fastapi",
"typing_extensions", # see https://github.com/pydantic/pydantic/issues/5821#issuecomment-1559196859
"fastapi~=0.100.0",
"pydantic~=2.0.0",
"uvicorn",
"python-multipart"
]
Expand Down
2 changes: 1 addition & 1 deletion skema/img2mml/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
class LatexEquation(BaseModel):
tex_src: str = Field(title="LaTeX equation", description="The LaTeX equation to process")
class Config:
schema_extra = {
json_schema_extra = {
"example": {
"tex_src": "\\frac{\\partial x}{\\partial t} = {\\alpha x} - {\\beta x y}",
},
Expand Down
97 changes: 90 additions & 7 deletions skema/rest/integrated_text_reading_proxy.py
Original file line number Diff line number Diff line change
Expand Up @@ -236,7 +236,10 @@ def parquet_to_json(path):


def cosmos_client(name: str, data: BinaryIO):
""" Posts a pdf to COSMOS and returns the JSON representation of the parquet file """
"""
Posts a pdf to COSMOS and returns the JSON representation of the parquet file
"""

# Create POST request to COSMOS server
# Prep the pdf data for upload
Expand Down Expand Up @@ -404,6 +407,21 @@ async def integrated_text_extractions(
annotate_skema: bool = True,
annotate_mit: bool = True,
) -> TextReadingAnnotationsOutput:
"""
### Python example
```
params = {
"annotate_skema":True,
"annotate_mit": True
}
files = [("pdfs", ("paper.txt", open("paper.txt", "rb")))]
    response = requests.post(f"{URL}/text-reading/integrated-text-extractions", params=params, files=files)
if response.status_code == 200:
data = response.json()
```
"""
# Get the input plain texts
texts = texts.texts

Expand All @@ -429,6 +447,22 @@ async def integrated_pdf_extractions(
annotate_skema: bool = True,
annotate_mit: bool = True
) -> TextReadingAnnotationsOutput:
"""
### Python example
```
params = {
"annotate_skema":True,
"annotate_mit": True
}
files = [("pdfs", ("ijerp.pdf", open("ijerp.pdf", "rb")))]
    response = requests.post(f"{URL}/text-reading/integrated-pdf-extractions", params=params, files=files)
if response.status_code == 200:
data = response.json()
```
"""
# TODO: Make this handle multiple pdf files in parallel
# Call COSMOS on the pdfs
cosmos_data = list()
Expand All @@ -453,10 +487,19 @@ async def integrated_pdf_extractions(
@router.post(
"/cosmos_to_json",
status_code=200,
description="Calls COSMOS on a pdf and converts the data into json"
)
async def cosmos_to_json(pdf: UploadFile) -> List[Dict]:
""" Calls COSMOS on a pdf and converts the data into json """
""" Calls COSMOS on a pdf and converts the data into json
### Python example
```
response = requests.post(f"{endpoint}/text-reading/cosmos_to_json",
files=[
("pdf", ("ijerp.pdf", open("ijerph-18-09027.pdf", 'rb')))
]
)
```
"""
return cosmos_client(pdf.filename, pdf.file)


Expand All @@ -467,7 +510,18 @@ async def cosmos_to_json(pdf: UploadFile) -> List[Dict]:
)
async def ground_to_mira(k: int, queries: MiraGroundingInputs, response: Response) -> List[
List[MiraGroundingOutputItem]]:
""" Proxy to the MIRA grounding functionality on the SKEMA TR service """
""" Proxy to the MIRA grounding functionality on the SKEMA TR service
### Python example
```
    queries = {"queries": ["infected", "susceptible"]}
params = {"k": 5}
response = requests.post(f"{endpoint}/text-reading/ground_to_mira", params=params, json=queries)
if response.status_code == 200:
results = response.json()
```
"""
params = {
"k": k
}
Expand All @@ -488,7 +542,18 @@ async def ground_to_mira(k: int, queries: MiraGroundingInputs, response: Respons

@router.post("/cards/get_model_card")
async def get_model_card(text_file: UploadFile, code_file: UploadFile, response: Response):
""" Calls the model card endpoint from MIT's pipeline """
""" Calls the model card endpoint from MIT's pipeline
### Python example
```
files = {
"text_file": ('text_file.txt", open("text_file.txt", 'rb')),
"code_file": ('code_file.py", open("code_file.py", 'rb')),
}
response = requests.post(f"{endpoint}/text-reading/cards/get_model_card", files=files)
```
"""

params = {
"gpt_key": OPENAI_KEY,
Expand All @@ -504,11 +569,29 @@ async def get_model_card(text_file: UploadFile, code_file: UploadFile, response:
return inner_response.json()

@router.post("/cards/get_data_card")
async def get_model_card(csv_file: UploadFile, doc_file: UploadFile, response: Response):
""" Calls the data card endpoint from MIT's pipeline """
async def get_data_card(smart:bool, csv_file: UploadFile, doc_file: UploadFile, response: Response):
"""
Calls the data card endpoint from MIT's pipeline.
Smart run provides better results but may result in slow response times as a consequence of extra GPT calls.
### Python example
```
params = {
"smart": False
}
files = {
"csv_file": ('csv_file.csv", open("csv_file.csv", 'rb')),
"doc_file": ('doc_file.txt", open("doc_file.txt", 'rb')),
}
    response = requests.post(f"{endpoint}/text-reading/cards/get_data_card", params=params, files=files)
```
"""

params = {
"gpt_key": OPENAI_KEY,
"smart": smart
}
files = {
"csv_file": (csv_file.filename, csv_file.file, "text/csv"),
Expand Down
19 changes: 18 additions & 1 deletion skema/rest/metal_proxy.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,24 @@ def link_amr(amr_type: str,
similarity_threshold: float = 0.5,
amr_file: UploadFile = File(...),
text_extractions_file: UploadFile = File(...)):
""" Links an AMR to a text extractions file """
""" Links an AMR to a text extractions file
### Python example
```
params = {
"amr_type": "petrinet"
}
files = {
"amr_file": ("amr.json", open("amr.json"), "application/json"),
"text_extractions_file": ("extractions.json", open("extractions.json"), "application/json")
}
response = requests.post(f"{ENDPOINT}/metal/link_amr", params=params, files=files)
if response.status_code == 200:
enriched_amr = response.json()
```
"""

# Load the AMR
amr = json.load(amr_file.file)
Expand Down
2 changes: 1 addition & 1 deletion skema/rest/proxies.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
)

# Text Reading services
MIT_TR_ADDRESS = os.environ.get("MIT_TR_ADDRESS", "https://mit-tr.askem.lum.ai")
MIT_TR_ADDRESS = os.environ.get("MIT_TR_ADDRESS", "http://100.26.10.46")
SKEMA_TR_ADDRESS = os.environ.get("SKEMA_TR_ADDRESS", "http://hopper.sista.arizona.edu")
OPENAI_KEY = os.environ.get("OPENAI_KEY", "YOU_FORGOT_TO_SET_OPENAI_KEY")
COSMOS_ADDRESS = os.environ.get("COSMOS_ADDRESS", "http://cosmos0002.chtc.wisc.edu:8089")
73 changes: 53 additions & 20 deletions skema/rest/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

from askem_extractions.data_model import AttributeCollection
from pydantic import BaseModel, Field

# see https://github.com/pydantic/pydantic/issues/5821#issuecomment-1559196859
from typing_extensions import Literal

Expand Down Expand Up @@ -42,14 +43,36 @@ class HealthStatus(BaseModel):
class EquationImagesToAMR(BaseModel):
# FIXME: will this work or do we need base64?
images: List[eqn2mml_schema.ImageBytes]
model: Literal["regnet", "petrinet"] = Field(description="The model type")
model: Literal["regnet", "petrinet"] = Field(
description="The model type", example="petrinet"
)


class EquationLatexToAMR(BaseModel):
equations: List[str] = Field(description="Equations in LaTeX",
example=["\\frac{\\partial x}{\\partial t} = {\\alpha x} - {\\beta x y}",
"\\frac{\\partial y}{\\partial t} = {\\alpha x y} - {\\gamma y}"])
model: Literal["regnet", "petrinet"] = Field(description="The model type", example="regnet")
equations: List[str] = Field(
description="Equations in LaTeX",
example=[
r"\frac{\partial x}{\partial t} = {\alpha x} - {\beta x y}",
r"\frac{\partial y}{\partial t} = {\alpha x y} - {\gamma y}",
],
)
model: Literal["regnet", "petrinet"] = Field(
description="The model type", example="regnet"
)


class MmlToAMR(BaseModel):
equations: List[str] = Field(
description="Equations in pMML",
example=[
"<math><mfrac><mrow><mi>d</mi><mi>Susceptible</mi></mrow><mrow><mi>d</mi><mi>t</mi></mrow></mfrac><mo>=</mo><mo>−</mo><mi>Infection</mi><mi>Infected</mi><mi>Susceptible</mi></math>",
"<math><mfrac><mrow><mi>d</mi><mi>Infected</mi></mrow><mrow><mi>d</mi><mi>t</mi></mrow></mfrac><mo>=</mo><mo>−</mo><mi>Recovery</mi><mi>Infected</mi><mo>+</mo><mi>Infection</mi><mi>Infected</mi><mi>Susceptible</mi></math>",
"<math><mfrac><mrow><mi>d</mi><mi>Recovered</mi></mrow><mrow><mi>d</mi><mi>t</mi></mrow></mfrac><mo>=</mo><mi>Recovery</mi><mi>Infected</mi></math>",
],
)
model: Literal["regnet", "petrinet"] = Field(
description="The model type", example="petrinet"
)


class CodeSnippet(BaseModel):
Expand All @@ -74,29 +97,28 @@ class MiraGroundingInputs(BaseModel):

class MiraGroundingOutputItem(BaseModel):
class MiraDKGConcept(BaseModel):
id: str = Field(
description="DKG element id",
example="apollosv:00000233"
)
id: str = Field(description="DKG element id", example="apollosv:00000233")
name: str = Field(
description="Canonical name of the concept",
example="infected population"
description="Canonical name of the concept", example="infected population"
)
description: Optional[str] = Field(
description="Long winded description of the concept",
example="A population of only infected members of one species."
example="A population of only infected members of one species.",
)
synonyms: List[str] = Field(
description="Any alternative name to the cannonical one for the concept",
example=["Ill individuals", "The sick and ailing"]
example=[["Ill individuals", "The sick and ailing"]],
)
embedding: List[float] = Field(
description="Word embedding of the underlying model for the concept"
)

def __hash__(self):
return hash(tuple([self.id, tuple(self.synonyms), tuple(self.embedding)]))

score: float = Field(
description="Cosine similarity of the embedding representation of the input with that of the DKG element",
example=0.7896
example=0.7896,
)
groundingConcept: MiraDKGConcept = Field(
description="DKG concept associated to the query",
Expand All @@ -109,8 +131,8 @@ class MiraDKGConcept(BaseModel):
0.01590670458972454,
0.03795482963323593,
-0.08787763118743896,
]
)
],
),
)


Expand All @@ -136,6 +158,9 @@ class TextReadingError(BaseModel):
example="Out of memory error",
)

def __hash__(self):
return hash(f"{self.pipeline}-{self.message}")


class TextReadingDocumentResults(BaseModel):
data: Optional[AttributeCollection] = Field(
Expand All @@ -149,24 +174,32 @@ class TextReadingDocumentResults(BaseModel):
example=[TextReadingError(pipeline="MIT", message="Unauthorized API key")],
)

def __hash__(self):
return hash(
tuple([self.data, "NONE" if self.errors is None else tuple(self.errors)])
)


class TextReadingAnnotationsOutput(BaseModel):
"""Contains the TR document results for all the documents submitted for annotation"""

outputs: List[TextReadingDocumentResults] = Field(
name="outputs",
description="Contains the results of TR annotations for each input document. There is one entry per input and "
"inputs and outputs are matched by the same index in the list",
"inputs and outputs are matched by the same index in the list",
example=[
TextReadingDocumentResults(data=AttributeCollection(attributes=[])),
TextReadingDocumentResults(
errors=[TextReadingError(pipeline="SKEMA", message="Dummy error")]
data=AttributeCollection(attributes=[]), errors=None
),
TextReadingDocumentResults(
data=AttributeCollection(attributes=[]),
errors=[TextReadingError(pipeline="SKEMA", message="Dummy error")],
),
],
)

generalized_errors: Optional[List[TextReadingError]] = Field(
name="generalized_errors",
description="Any pipeline-wide errors, not specific to a particular input",
example=[TextReadingError(pipeline="MIT", message="API quota exceeded")]
example=[TextReadingError(pipeline="MIT", message="API quota exceeded")],
)
15 changes: 15 additions & 0 deletions skema/rest/workflows.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,21 @@ async def equations_to_amr(data: schema.EquationLatexToAMR):
)
return res.json()

# pmml -> amr
@router.post("/pmml/equations-to-amr", summary="Equations pMML → AMR")
async def equations_to_amr(data: schema.MmlToAMR):

payload = {"mathml": data.equations, "model": data.model}
res = requests.put(f"{SKEMA_RS_ADDESS}/mathml/amr", json=payload)
if res.status_code != 200:
return JSONResponse(
status_code=400,
content={
"error": f"MORAE PUT /mathml/amr failed to process payload",
"payload": payload,
},
)
return res.json()

# code snippets -> fn -> petrinet amr
@router.post("/code/snippets-to-pn-amr", summary="Code snippets → PetriNet AMR")
Expand Down
Loading

0 comments on commit 0bcd945

Please sign in to comment.