From 3650d32b5dec4a8fa7e6eece55b7f0b06a2dc889 Mon Sep 17 00:00:00 2001 From: zachary_royals Date: Wed, 28 Jun 2023 11:53:17 -0400 Subject: [PATCH] backend and readme improvements --- README.md | 3 +- app.py | 0 backend/app/api/routes.py | 17 +++----- backend/app/utils/document_utils.py | 11 +++-- prompts.py | 67 ----------------------------- 5 files changed, 15 insertions(+), 83 deletions(-) delete mode 100644 app.py delete mode 100644 prompts.py diff --git a/README.md b/README.md index 436c19e..77b488b 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,8 @@ # zelta-challenge -## How it works +[Project Overview](https://www.notion.so/Zelta-AI-Challenge-59fd8050b8714b41bc055fe6784a9449?pvs=4) -App allows transcript to be loaded with insights returned from the API. ## How to run the project diff --git a/app.py b/app.py deleted file mode 100644 index e69de29..0000000 diff --git a/backend/app/api/routes.py b/backend/app/api/routes.py index 9e492fa..88c38e9 100644 --- a/backend/app/api/routes.py +++ b/backend/app/api/routes.py @@ -19,9 +19,6 @@ router = APIRouter() -# class Document(BaseModel): -# file: UploadFile = File() - @router.get("/", response_class=HTMLResponse) def root(): @@ -50,24 +47,22 @@ def root(): @router.post("/analyze") async def analyze_document(file: UploadFile) -> dict: filename = file.filename - + breakpoint() loop = asyncio.get_event_loop() with ThreadPoolExecutor() as executor: if filename.endswith(".pdf"): - # run blocking operations in a thread pool + ### unable to extract pdf_bytes = await file.read() # read file into bytes - - # write bytes to a temporary file - temp_pdf_file = tempfile.NamedTemporaryFile(delete=False) + temp_pdf_file = tempfile.NamedTemporaryFile(delete=False) # write bytes to a temporary file temp_pdf_file.write(pdf_bytes) extracted_text = await loop.run_in_executor( executor, extract_text_from_pdf, temp_pdf_file.name ) + temp_pdf_file.close() - os.unlink(temp_pdf_file.name) chunks = await loop.run_in_executor( @@ -78,10 +73,10 @@ async def analyze_document(file: UploadFile) -> dict: # run tasks in parallel tasks = [loop.run_in_executor(executor, chat, product_prompt_template.format_messages(text=chunk)) for chunk in chunks] insights = await asyncio.gather(*tasks) - + + #append insights into final product prompt summary = final_product_prompt_template.format_messages(text=insights) chat = ChatOpenAI(temperature=0.0, model="gpt-4") - # run blocking operations in a thread pool final_insights = await loop.run_in_executor(executor, chat, summary) return final_insights diff --git a/backend/app/utils/document_utils.py b/backend/app/utils/document_utils.py index 6d930d6..a6103e1 100644 --- a/backend/app/utils/document_utils.py +++ b/backend/app/utils/document_utils.py @@ -1,21 +1,26 @@ from langchain.text_splitter import CharacterTextSplitter -# from PyPDF2 import PdfReader import pdfplumber +import math +# from PyPDF2 import PdfReader + from nltk.tokenize import word_tokenize, sent_tokenize -def extract_text_from_pdf(pdf): +def extract_text_from_pdf(pdf) -> str: with pdfplumber.open(pdf) as pdf_reader: text = "\n".join(page.extract_text() for page in pdf_reader.pages) return text def split_into_chunks(text: str) -> list[str]: + chunk_size = len(text) / 2 + chunk_overlap = math.floor(chunk_size * .05) text_splitter = CharacterTextSplitter( - separator="\n", chunk_size=12000, chunk_overlap=1200, length_function=len + separator="\n", chunk_size=chunk_size, chunk_overlap=chunk_overlap, length_function=len ) chunks = text_splitter.split_text(text) return chunks + # def split_into_chunks(text: str, max_token_count: int=7000) -> list[str]: # sentences = sent_tokenize(text) # chunks = [] diff --git a/prompts.py b/prompts.py deleted file mode 100644 index 7d73222..0000000 --- a/prompts.py +++ /dev/null @@ -1,67 +0,0 @@ -from langchain.prompts import ChatPromptTemplate - - -product_prompt_string = """\ -For the following text, extract the following information from speaker 2 and remove any spaces between letters of each word. - - -delivery_days: How many days did it take for the product \ -to arrive? If this information is not found, output -1. - -price_value: Extract any sentences about the value or price,\ -and output them as a comma separated Python list. - -customer_negative_feedback: Extract any problems customers are facing with the current product \ -If this information is not found, output -1. - -feature_requests: Extract any sentences about feature requests,\ -and output them as a comma separated Python list. - -competitor_mentions: Extract any sentences about the competition\ -and output them as a comma separated Python list. - - -Format the output as JSON with the following keys: -delivery_days -price_value -customer_negative_feedback -feature_requests -competitor_mentions - -text: {text} -""" - - -final_prompt_string = """\ -For the following text, distill the following information from from the text elements, please ignore negative values and remove brackets: - -delivery_days: How many days did it take for the product \ -to arrive? If this information is not found, output -1. - -price_value: Extract any sentences about the value or price,\ -and output them as a comma separated Python list. - -customer_negative_feedback: Extract any problems customers are facing with the current product \ -If this information is not found, output -1. - -feature_requests: Extract any sentences about feature requests,\ -and output them as a comma separated Python list. - -competitor_mentions: Extract any sentences about the competition\ -and output them as a comma separated Python list. - - -Format the output as JSON with the following keys: -delivery_days -price_value -customer_negative_feedback -feature_requests -competitor_mentions - -text: {text} -""" - -product_prompt_template = ChatPromptTemplate.from_template(product_prompt_string) - - -final_product_prompt_template = ChatPromptTemplate.from_template(product_prompt_string)