Skip to content

Commit

Permalink
Updated the Dockerfile (Python version) and used parallel processing of dataframes using mapply
Browse files Browse the repository at this point in the history
  • Loading branch information
ranjan-stha committed Mar 18, 2024
1 parent f4fce97 commit 6ac0e2c
Show file tree
Hide file tree
Showing 4 changed files with 507 additions and 324 deletions.
2 changes: 1 addition & 1 deletion handlers/ecs/topicmodeling/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM python:3.8-slim-buster
FROM python:3.10-slim-buster

LABEL maintainer="[email protected]"

Expand Down
8 changes: 5 additions & 3 deletions handlers/ecs/topicmodeling/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from fastapi import FastAPI, BackgroundTasks
from pydantic import BaseModel
from botocore.exceptions import ClientError
import mapply
from topic_generator import TopicGenerator
from topic_generator_llm import TopicGenerationLLM
from nlp_modules_utils import (
Expand All @@ -26,6 +27,7 @@
)

logging.getLogger().setLevel(logging.INFO)
mapply.init(chunk_size=1, progressbar=False)

SENTRY_DSN = os.environ.get("SENTRY_DSN")
ENVIRONMENT = os.environ.get("ENVIRONMENT")
Expand Down Expand Up @@ -155,7 +157,7 @@ def _get_embeddings(
finetuned_task: str = "['first_level_tags']",
return_type: str = "default_analyis",
embeddings_return_type: str = "array",
batch_size: int = 10
batch_size: int = 25
):
"""
Calculates the embeddings of the entries
Expand Down Expand Up @@ -233,7 +235,7 @@ def select_most_relevant_excerpts(self, df):
for v in data_json.values():
v["Representation"] = " ".join(set(v["Representation"]))
new_df = pd.DataFrame.from_dict(data_json, orient="index")
new_df["label"] = new_df.apply(self.generate_llm_topic, axis=1)
new_df["label"] = new_df.mapply(self.generate_llm_topic, axis=1)
new_df.drop(columns=["Representation", "Document"], inplace=True)
return new_df.to_dict(orient="index")

Expand All @@ -242,7 +244,7 @@ def generate_llm_topic(self, x: pd.DataFrame, max_excerpts: int=20):
Generate the short topic using LLM based on keywords
The excerpts are restricted to first 20 (default)
"""
topic_generation = TopicGenerationLLM(x["Document"][:max_excerpts], x["Representation"][:max_excerpts])
topic_generation = TopicGenerationLLM(x["Document"][:max_excerpts], x["Representation"])
return topic_generation.topic_generator_handler()

def dispatch_results(self, status, presigned_url=None):
Expand Down
Loading

0 comments on commit 6ac0e2c

Please sign in to comment.