Skip to content

Commit

Permalink
Merge pull request #51 from the-deep-nlp/feature/new_endpoints
Browse files Browse the repository at this point in the history
Feature/new endpoints
  • Loading branch information
ranjan-stha authored Dec 5, 2024
2 parents 91f74f9 + 75d58d5 commit f86c3d9
Show file tree
Hide file tree
Showing 21 changed files with 3,345 additions and 820 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ jobs:
CSRF_TRUSTED_ORIGINS: ''
SUMMARIZATION_V3_ECS_ENDPOINT: ''
ENTRYEXTRACTION_ECS_ENDPOINT: ''
ENTRYEXTRACTION_LLM_ECS_ENDPOINT: ''
GEOLOCATION_ECS_ENDPOINT: ''
TOPICMODEL_ECS_ENDPOINT: ''

Expand Down Expand Up @@ -88,6 +89,7 @@ jobs:
RELIABILITY_MODEL_ID: ''
RELIABILITY_MODEL_VERSION: ''

OPENAI_API_KEY: ''
steps:
- name: Checkout
uses: actions/checkout@v3
Expand Down
25 changes: 0 additions & 25 deletions .github/workflows/flake8.yml

This file was deleted.

156 changes: 156 additions & 0 deletions analysis_module/mock_templates.py
Original file line number Diff line number Diff line change
Expand Up @@ -607,6 +607,69 @@
]
}

# Mock callback payload for the LLM-based entry classification endpoint.
# The opaque keys under "model_tags" (e.g. "pillar-0", "t31unid23fntmwrb")
# are framework widget/tag identifiers — presumably matching the analysis
# framework referenced below; TODO confirm against the real NLP response.
MOCK_ENTRY_CLASSIFICATION_LLM = {
    "client_id": "entry-classification-llm-client-6000",
    # This model prediction refers to framework_id: 1623,
    # entry_id: 510021, project_id: 2587 (2021 IFRC ESSN Turkey).
    "model_tags": {
        "element1": {
            "pillar-0": {
                "subpillar-4": {
                    "sector-1": [
                        "t31unid23fntmwrb"
                    ],
                    "sector-4": [
                        "subsector-1"
                    ]
                },
                "subpillar-3": {
                    "sector-4": [
                        "subsector-1"
                    ]
                }
            },
            "pillar-1": {
                "subpillar-1": {
                    "sector-4": [
                        "subsector-1"
                    ]
                }
            }
        },
        "overview-matrix1dWidget-d48u7z4yohwuu7zg": {
            "8lowwhswgb5j9f5s": {
                "gjsbosuej330kl45": True,
                "iwsqjtrs2u5z8qgk": True
            }
        },
        "element0": {
            "pillar-3": {
                "subpillar-21": True,
                "subpillar-20": True
            }
        }
    },
    # Sample geolocation entities with character offsets into the entry text
    # and point coordinates (latitude/longitude).
    "geolocations": [
        {
            "entity": "Somalia",
            "meta": {
                "offset_start": 88,
                "offset_end": 94,
                "latitude": -10,
                "longitude": -55
            }
        },
        {
            "entity": "Portugal",
            "meta": {
                "offset_start": 183,
                "offset_end": 191,
                "latitude": 39.6945,
                "longitude": -8.13057
            }
        }
    ]
}

"""
it's a huge output (and it can be bigger than this one). Maybe we can truncate it.
Expand Down Expand Up @@ -1745,3 +1808,96 @@
}
]
}

# Mock callback payload for the LLM-based entry extraction endpoint:
# document metadata plus per-block text, classification tags and geolocations.
# The opaque identifiers are framework widget/tag ids — presumably from the
# framework referenced below; TODO confirm against the real NLP response.
MOCK_ENTRY_EXTRACTION_LLM = {
    # This model prediction refers to framework_id: 1623,
    # lead_id 67027, url: 'https://reliefweb.int/sites/reliefweb.int/files/resources/UNHCR-Turkey-Operational-Update-October-2019.pdf' # noqa
    # project_id: 2587 (2021 IFRC ESSN Turkey)
    "client_id": "entry-classification-llm-client-6000",
    "metadata": {
        "total_pages": 10,
        "total_words_count": 5876
    },
    # One entry per extracted text block, in reading order ("textOrder").
    "blocks": [{
        "type": "text",
        "text": "4 million Refugees and asylum-seekers in Turkey including over 3.6 million Syrian nationals and close to 400,000 registered refugees and asylum-seekers of other nationalities. Over 98% of Syrian refugees live across Turkey in 81 provinces",
        "page": 0,
        "textOrder": 2,
        "relevant": True,
        "prediction_status": True,
        "classification": {
            "element1": {
                "pillar-0": {
                    "o9kyhltzmplk0a1k": {
                        "sector-9": []
                    }
                },
                "46bg6n1o50obgx77": {
                    "v2kfnyjbn41vv46j": {
                        "sector-9": []
                    }
                }
            },
            "overview-matrix1dWidget-d48u7z4yohwuu7zg": {
                "8lowwhswgb5j9f5s": {
                    "qycslaise1s014vm": True,
                    "7wps5hbnemt59dv9": True
                }
            },
            "element0": {
                "pillar-0": {
                    "nxjm8rsprb9fu2wq": True
                },
                "kyiciutprwct1vph": {
                    "4ftpwnssu2ugeekk": True
                }
            }
        },
        # Geolocations may carry null coordinates when only the entity span
        # (offsets) could be resolved.
        "geolocations": [
            {
                "entity": "Niger",
                "meta": {
                    "offset_start": 88,
                    "offset_end": 94,
                    "latitude": -10,
                    "longitude": -55
                }
            },
            {
                "entity": "Nigeria",
                "meta": {
                    "offset_start": 183,
                    "offset_end": 191,
                    "latitude": None,
                    "longitude": None
                }
            }
        ],
    },
    {
        "type": "text",
        "text": "9,700 Refugees departed for resettlement in 2019 as of end of October, over 78 per cent of whom are Syrians",
        "page": 0,
        "textOrder": 3,
        "relevant": True,
        "prediction_status": True,
        "classification": {
            "element1": {
                "pillar-0": {
                    "o9kyhltzmplk0a1k": {
                        "sector-9": []
                    }
                }
            },
            "overview-matrix1dWidget-d48u7z4yohwuu7zg": {
                "8lowwhswgb5j9f5s": True
            },
            "element0": {
                "kyiciutprwct1vph": {
                    "4ftpwnssu2ugeekk": True
                }
            }
        }
    }
    ]
}
96 changes: 94 additions & 2 deletions analysis_module/mockserver.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,11 @@

from core.models import NLPRequest
from core_server.settings import ENDPOINT_NAME
from .mock_templates import MOCK_ENTRY_CLASSIFICATION, MOCK_ENTRY_CLASSIFICATION_FORMATTED, MOCK_GEOLOCATION # noqa
from .mock_templates import (MOCK_ENTRY_CLASSIFICATION,
MOCK_ENTRY_CLASSIFICATION_LLM,
MOCK_ENTRY_CLASSIFICATION_FORMATTED,
MOCK_ENTRY_EXTRACTION_LLM,
MOCK_GEOLOCATION) # noqa
from .utils import send_callback_url_request


Expand Down Expand Up @@ -497,13 +501,76 @@ def process_entry_extraction_mock(body) -> Any:
logger.error("Could not send data to callback url", exc_info=True)


def entry_extraction_llm_mock(body) -> Any:
    """Accept an LLM entry-extraction request and schedule the mock task.

    The actual processing runs asynchronously via celery with a short delay;
    the caller immediately gets an acknowledgement body and an HTTP 200 code.
    """
    # Defer the heavy lifting to the celery task, triggered after 2 seconds.
    process_entry_extraction_llm_mock.apply_async(args=(body,), countdown=2)
    acknowledgement = json.dumps({"status": "Successfully received the request."})
    return acknowledgement, 200


@shared_task
def process_entry_extraction_llm_mock(body) -> Any:
    """Celery task: simulate LLM entry extraction and notify the callback URL.

    For each document in ``body["documents"]`` a mock extraction payload is
    built from ``MOCK_ENTRY_EXTRACTION_LLM``, saved locally, and the resulting
    file path is POSTed to ``body["callback_url"]``. Silently returns when
    either the documents list or the callback URL is missing. Delivery
    failures are logged, never raised.
    """
    documents = body.get("documents") or []
    callback_url = body.get("callback_url")
    if not documents or not callback_url:
        return

    for document in documents:
        client_id = document["client_id"]
        text_extraction_id = document["text_extraction_id"]
        # Build the payload as a shallow copy of the mock template instead of
        # calling .update() on it: the original code mutated the shared
        # module-level MOCK_ENTRY_EXTRACTION_LLM constant, leaking one
        # request's ids into every later task run.
        random_entry_extraction_classification = {
            **MOCK_ENTRY_EXTRACTION_LLM,
            "classification_model_info": {
                "name": "llm_model",
                "version": "1.0.0"
            },
            "client_id": client_id,
            "entry_extraction_id": "73f9ca13-deb2-4f39-8e86-a856490bfc0d",  # random
            "text_extraction_id": text_extraction_id
        }
        filepath = save_data_local_and_get_url(
            "entry_extraction", client_id, random_entry_extraction_classification
        )

        # NOTE(review): the text_extraction_id is not easy to retrieve when the
        # request was made with a "url". In both cases (url or
        # text_extraction_id) the text was already extracted, and the id cannot
        # easily be recovered from the presigned url; with an id-based request
        # the right document can be fetched directly.
        callback_data = {
            "client_id": client_id,
            "entry_extraction_classification_path": filepath,
            "text_extraction_id": text_extraction_id,
            "status": 1
        }
        try:
            requests.post(
                callback_url,
                json=callback_data,
                timeout=30,
            )
            logger.info("Successfully sent data to callback url for entry extraction.")
        except Exception:
            logger.error("Could not send data to callback url", exc_info=True)


def entry_classification_mock(body) -> Any:
    """Accept an entry-classification request and schedule the mock task.

    Processing happens asynchronously via celery with a short delay; the
    caller immediately gets an acknowledgement body and an HTTP 200 code.
    """
    # Defer the work to the celery task, triggered after 2 seconds.
    process_entry_classification_mock.apply_async(args=(body,), countdown=2)
    acknowledgement = json.dumps({"status": "Successfully received the request."})
    return acknowledgement, 200


def entry_classification_llm_mock(body) -> Any:
    """Accept an LLM entry-classification request and schedule the mock task.

    Processing happens asynchronously via celery with a short delay; the
    caller immediately gets an acknowledgement body and an HTTP 200 code.
    """
    # Defer the work to the celery task, triggered after 2 seconds.
    process_entry_classification_llm_mock.apply_async(args=(body,), countdown=2)
    acknowledgement = json.dumps({"status": "Successfully received the request."})
    return acknowledgement, 200


@shared_task
def process_entry_classification_mock(body) -> Any:
callback_payload = MOCK_ENTRY_CLASSIFICATION
Expand All @@ -527,6 +594,29 @@ def process_entry_classification_mock(body) -> Any:
logger.error("Could not send data to callback url", exc_info=True)


@shared_task
def process_entry_classification_llm_mock(body) -> Any:
    """Celery task: simulate LLM entry classification and notify the callback.

    Builds a mock classification payload from ``MOCK_ENTRY_CLASSIFICATION_LLM``
    for the first entry's client id and POSTs it to ``body["callback_url"]``.
    Delivery failures are logged, never raised.
    """
    # Build the payload as a shallow copy of the mock template instead of
    # calling .update() on it: the original code mutated the shared
    # module-level MOCK_ENTRY_CLASSIFICATION_LLM constant, leaking one
    # request's client_id into every later task run.
    callback_payload = {
        **MOCK_ENTRY_CLASSIFICATION_LLM,
        "client_id": body["entries"][0]["client_id"],
        "model_info": {
            "id": "llm_model",
            "version": "1.0.0"
        },
        "prediction_status": True
    }
    callback_url = body["callback_url"]
    try:
        requests.post(
            callback_url,
            json=callback_payload,
            timeout=30
        )
        logger.info("Successfully sent data to callback url for entry classification")
    except Exception:
        logger.error("Could not send data to callback url", exc_info=True)


TYPE_ACTIONS_MOCK = {
"topicmodel": topicmodeling_mock_model,
"summarization": summarization_mock_model,
Expand All @@ -535,7 +625,9 @@ def process_entry_classification_mock(body) -> Any:
"geolocation": geolocation_mock_model,
"text-extraction": text_extraction_mock,
"entry-extraction-classification": entry_extraction_mock,
"entry-classification": entry_classification_mock
"entry-extraction-classification-llm": entry_extraction_llm_mock,
"entry-classification": entry_classification_mock,
"entry-classification-llm": entry_classification_llm_mock
}


Expand Down
21 changes: 21 additions & 0 deletions analysis_module/serializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,16 @@ class PredictionRequestSerializer(serializers.Serializer):
mock = serializers.BooleanField(default=False)


class PredictionRequestSerializerV2(serializers.Serializer):
    """Request payload for v2 (LLM-based) entry classification predictions.

    Extends the v1 payload with a framework id and organization metadata.
    """
    entries = PredictionEntrySerializer(many=True)
    af_id = serializers.IntegerField()  # presumably the analysis framework id — confirm with callers
    project_id = serializers.IntegerField()
    publishing_organization = serializers.CharField()
    authoring_organization = serializers.ListField()
    callback_url = serializers.CharField()  # URL to POST results back to
    mock = serializers.BooleanField(default=False)  # when True, serve the mock response


class ExtractionDocumentSerializer(serializers.Serializer):
url = serializers.CharField()
client_id = serializers.CharField()
Expand Down Expand Up @@ -137,8 +147,19 @@ def to_representation(self, value):


class EntryExtractionSerializer(serializers.Serializer):
    """Request payload for entry extraction over a set of documents."""
    documents = DocumentEntryExtractionUnionField()
    callback_url = serializers.CharField()  # URL to POST results back to
    request_type = serializers.ChoiceField(
        choices=ExtractionRequestTypeChoices,
        default=ExtractionRequestTypeChoices.USER,
    )
    mock = serializers.BooleanField(default=False)  # when True, serve the mock response


class EntryExtractionSerializerLLM(serializers.Serializer):
documents = DocumentEntryExtractionUnionField()
af_id = serializers.IntegerField()
project_id = serializers.IntegerField()
callback_url = serializers.CharField()
request_type = serializers.ChoiceField(
choices=ExtractionRequestTypeChoices,
Expand Down
Loading

0 comments on commit f86c3d9

Please sign in to comment.