Updated the Dockerfile (Python version) and use parallel processing of dataframes using mapply
the-deep-nlp · Mar 18, 2024 · 6ac0e2c · 6ac0e2c
1 parent f4fce97
commit 6ac0e2c
Show file tree

Hide file tree

Showing 4 changed files with 507 additions and 324 deletions.
diff --git a/handlers/ecs/topicmodeling/Dockerfile b/handlers/ecs/topicmodeling/Dockerfile
@@ -1,4 +1,4 @@
-FROM python:3.8-slim-buster
+FROM python:3.10-slim-buster
 
 LABEL maintainer="[email protected]"
 

diff --git a/handlers/ecs/topicmodeling/app.py b/handlers/ecs/topicmodeling/app.py
@@ -12,6 +12,7 @@
 from fastapi import FastAPI, BackgroundTasks
 from pydantic import BaseModel
 from botocore.exceptions import ClientError
+import mapply
 from topic_generator import TopicGenerator
 from topic_generator_llm import TopicGenerationLLM
 from nlp_modules_utils import (
@@ -26,6 +27,7 @@
 )
 
 logging.getLogger().setLevel(logging.INFO)
+mapply.init(chunk_size=1, progressbar=False)
 
 SENTRY_DSN = os.environ.get("SENTRY_DSN")
 ENVIRONMENT = os.environ.get("ENVIRONMENT")
@@ -155,7 +157,7 @@ def _get_embeddings(
         finetuned_task: str = "['first_level_tags']",
         return_type: str = "default_analyis",
         embeddings_return_type: str = "array",
-        batch_size: int = 10
+        batch_size: int = 25
     ):
         """
         Calculates the embeddings of the entries
@@ -233,7 +235,7 @@ def select_most_relevant_excerpts(self, df):
         for v in data_json.values():
             v["Representation"] = " ".join(set(v["Representation"]))
         new_df = pd.DataFrame.from_dict(data_json, orient="index")
-        new_df["label"] = new_df.apply(self.generate_llm_topic, axis=1)
+        new_df["label"] = new_df.mapply(self.generate_llm_topic, axis=1)
         new_df.drop(columns=["Representation", "Document"], inplace=True)
         return new_df.to_dict(orient="index")
 
@@ -242,7 +244,7 @@ def generate_llm_topic(self, x: pd.DataFrame, max_excerpts: int=20):
         Generate the short topic using LLM based on keywords
         The excerpts are restricted to first 20 (default)
         """
-        topic_generation = TopicGenerationLLM(x["Document"][:max_excerpts], x["Representation"][:max_excerpts])
+        topic_generation = TopicGenerationLLM(x["Document"][:max_excerpts], x["Representation"])
         return topic_generation.topic_generator_handler()
 
     def dispatch_results(self, status, presigned_url=None):