From 3650d32b5dec4a8fa7e6eece55b7f0b06a2dc889 Mon Sep 17 00:00:00 2001
From: zachary_royals <royalszachary@gmail.com>
Date: Wed, 28 Jun 2023 11:53:17 -0400
Subject: [PATCH] backend and readme improvements

---
 README.md                           |  3 +-
 app.py                              |  0
 backend/app/api/routes.py           | 17 +++-----
 backend/app/utils/document_utils.py | 11 +++--
 prompts.py                          | 67 -----------------------------
 5 files changed, 15 insertions(+), 83 deletions(-)
 delete mode 100644 app.py
 delete mode 100644 prompts.py

diff --git a/README.md b/README.md
index 436c19e..77b488b 100644
--- a/README.md
+++ b/README.md
@@ -1,9 +1,8 @@
 # zelta-challenge
 
 
-## How it works 
+[Project Overview](https://www.notion.so/Zelta-AI-Challenge-59fd8050b8714b41bc055fe6784a9449?pvs=4)
 
-App allows transcript to be loaded with insights returned from the API.
 
 ## How to run the project
 
diff --git a/app.py b/app.py
deleted file mode 100644
index e69de29..0000000
diff --git a/backend/app/api/routes.py b/backend/app/api/routes.py
index 9e492fa..88c38e9 100644
--- a/backend/app/api/routes.py
+++ b/backend/app/api/routes.py
@@ -19,9 +19,6 @@
 router = APIRouter()
 
 
-# class Document(BaseModel):
-#     file: UploadFile = File()
-
 
 @router.get("/", response_class=HTMLResponse)
 def root():
@@ -50,24 +47,22 @@ def root():
 @router.post("/analyze")
 async def analyze_document(file: UploadFile) -> dict:
     filename = file.filename
-
+
     loop = asyncio.get_event_loop()
 
     with ThreadPoolExecutor() as executor:
         if filename.endswith(".pdf"):
-            # run blocking operations in a thread pool
 
+            # FIXME: "unable to extract" -- presumably PDF text extraction fails here; confirm
             pdf_bytes = await file.read()  # read file into bytes
-
-            # write bytes to a temporary file
-            temp_pdf_file = tempfile.NamedTemporaryFile(delete=False)
+            temp_pdf_file = tempfile.NamedTemporaryFile(delete=False) # write bytes to a temporary file
             temp_pdf_file.write(pdf_bytes)
 
             extracted_text = await loop.run_in_executor(
                 executor, extract_text_from_pdf, temp_pdf_file.name
             )
+            
             temp_pdf_file.close()
-
             os.unlink(temp_pdf_file.name)
 
             chunks = await loop.run_in_executor(
@@ -78,10 +73,10 @@ async def analyze_document(file: UploadFile) -> dict:
             # run tasks in parallel
             tasks = [loop.run_in_executor(executor, chat, product_prompt_template.format_messages(text=chunk)) for chunk in chunks]
             insights = await asyncio.gather(*tasks)
-     
+
+            # Feed the per-chunk insights into the final product prompt.
             summary = final_product_prompt_template.format_messages(text=insights)
             chat = ChatOpenAI(temperature=0.0, model="gpt-4")
-            # run blocking operations in a thread pool
             final_insights = await loop.run_in_executor(executor, chat, summary)
 
             return final_insights
diff --git a/backend/app/utils/document_utils.py b/backend/app/utils/document_utils.py
index 6d930d6..a6103e1 100644
--- a/backend/app/utils/document_utils.py
+++ b/backend/app/utils/document_utils.py
@@ -1,21 +1,26 @@
 
 from langchain.text_splitter import CharacterTextSplitter
-# from PyPDF2 import PdfReader
 import pdfplumber
+import math
+# from PyPDF2 import PdfReader
+
 from nltk.tokenize import word_tokenize, sent_tokenize
 
-def extract_text_from_pdf(pdf):
+def extract_text_from_pdf(pdf) -> str:
     with pdfplumber.open(pdf) as pdf_reader:
         text = "\n".join(page.extract_text() for page in pdf_reader.pages)
     return text
 
 
 def split_into_chunks(text: str) -> list[str]:
+    chunk_size = len(text) / 2
+    chunk_overlap = math.floor(chunk_size * .05)
     text_splitter = CharacterTextSplitter(
-        separator="\n", chunk_size=12000, chunk_overlap=1200, length_function=len
+        separator="\n", chunk_size=chunk_size, chunk_overlap=chunk_overlap, length_function=len
     )
     chunks = text_splitter.split_text(text)
     return chunks
+
 # def split_into_chunks(text: str, max_token_count: int=7000) -> list[str]:
 #     sentences = sent_tokenize(text)
 #     chunks = []
diff --git a/prompts.py b/prompts.py
deleted file mode 100644
index 7d73222..0000000
--- a/prompts.py
+++ /dev/null
@@ -1,67 +0,0 @@
-from langchain.prompts import ChatPromptTemplate
-
-
-product_prompt_string = """\
-For the following text, extract the following information from speaker 2 and remove any spaces between letters of each word. 
-
-
-delivery_days: How many days did it take for the product \
-to arrive? If this information is not found, output -1.
-
-price_value: Extract any sentences about the value or price,\
-and output them as a comma separated Python list.
-
-customer_negative_feedback: Extract any problems customers are facing with the current product \
-If this information is not found, output -1.
-
-feature_requests: Extract any sentences about feature requests,\
-and output them as a comma separated Python list.
-
-competitor_mentions: Extract any sentences about the competition\
-and output them as a comma separated Python list.
-
-
-Format the output as JSON with the following keys:
-delivery_days
-price_value
-customer_negative_feedback
-feature_requests
-competitor_mentions
-
-text: {text}
-"""
-
-
-final_prompt_string = """\
-For the following text, distill the following information from from the text elements, please ignore negative values and remove brackets:
-
-delivery_days: How many days did it take for the product \
-to arrive? If this information is not found, output -1.
-
-price_value: Extract any sentences about the value or price,\
-and output them as a comma separated Python list.
-
-customer_negative_feedback: Extract any problems customers are facing with the current product \
-If this information is not found, output -1.
-
-feature_requests: Extract any sentences about feature requests,\
-and output them as a comma separated Python list.
-
-competitor_mentions: Extract any sentences about the competition\
-and output them as a comma separated Python list.
-
-
-Format the output as JSON with the following keys:
-delivery_days
-price_value
-customer_negative_feedback
-feature_requests
-competitor_mentions
-
-text: {text}
-"""
-
-product_prompt_template = ChatPromptTemplate.from_template(product_prompt_string)
-
-
-final_product_prompt_template = ChatPromptTemplate.from_template(product_prompt_string)