From c805162f624ff75d1b5fd308cfaf423baafde8a2 Mon Sep 17 00:00:00 2001
From: zachary_royals <royalszachary@gmail.com>
Date: Wed, 28 Jun 2023 14:26:21 -0400
Subject: [PATCH 1/4] loose ends removed

---
 backend/app/api/prompts.py          | 12 +++++------
 backend/app/api/routes.py           | 31 +++++++++++------------------
 backend/app/utils/document_utils.py |  7 +++----
 3 files changed, 21 insertions(+), 29 deletions(-)

diff --git a/backend/app/api/prompts.py b/backend/app/api/prompts.py
index 2aa232a..3325c26 100644
--- a/backend/app/api/prompts.py
+++ b/backend/app/api/prompts.py
@@ -6,19 +6,19 @@
 
 
 delivery_days: How many days did it take for the product \
-to arrive? If this information is not found, output is "not found"., do not use -1
+to arrive? If this information is not found, output is "not found"., 
 
 price_value: Extract any sentences about the value or price,\
-and output them as a comma separated Python list. If this information is not found, output is "not found", do not use -1
+and output them as a comma separated Python list. If this information is not found, output is "not found"
 
 customer_negative_feedback: Extract any problems customers are facing with the current product \
-If this information is not found,  output is "not found", do not use -1
+If this information is not found,  output is "not found"
 
 feature_requests: Extract any sentences about feature requests,\
-and output them as a comma separated Python list. If this information is not found, output is "not found", do not use -1
+and output them as a comma separated Python list. If this information is not found, output is "not found"
 
 competitor_mentions: Extract any sentences about the competition\
-and output them as a comma separated Python list. If this information is not found, output is "not found", do not use -1
+and output them as a comma separated Python list. If this information is not found, output is "not found"
 
 
 Format the output as JSON with the following keys:
@@ -33,7 +33,7 @@
 
 
 final_prompt_string = """\
-For the following text, distill the following information from from the key attributes and maintain spacing between words , please ignore negative  or "not found" values:
+For the following text, amalgamate the values from the text into a single json output, keep the attributes provided below. please ignore "not found" values:
 
 delivery_days: How many days did it take for the product \
 to arrive? If this information is not found, output is "not found", do not use -1
diff --git a/backend/app/api/routes.py b/backend/app/api/routes.py
index 88c38e9..9e57558 100644
--- a/backend/app/api/routes.py
+++ b/backend/app/api/routes.py
@@ -1,25 +1,21 @@
 import asyncio
 import os
-import tempfile
-
+import time
 from fastapi import APIRouter, UploadFile
 from fastapi.responses import HTMLResponse
 from langchain.chat_models import ChatOpenAI
 from concurrent.futures import ThreadPoolExecutor
 
 from dotenv import load_dotenv, find_dotenv
-from prompts import product_prompt_template, final_product_prompt_template
+from .prompts import product_prompt_template, final_product_prompt_template
 from utils.document_utils import extract_text_from_pdf, split_into_chunks
 
-
 _ = load_dotenv(find_dotenv())
 
 OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
 
 router = APIRouter()
 
-
-
 @router.get("/", response_class=HTMLResponse)
 def root():
     return """
@@ -46,24 +42,16 @@ def root():
 
 @router.post("/analyze")
 async def analyze_document(file: UploadFile) -> dict:
+    start = time.time()
     filename = file.filename
-    breakpoint()
     loop = asyncio.get_event_loop()
 
     with ThreadPoolExecutor() as executor:
         if filename.endswith(".pdf"):
 
-            ### unable to extract 
-            pdf_bytes = await file.read()  # read file into bytes
-            temp_pdf_file = tempfile.NamedTemporaryFile(delete=False) # write bytes to a temporary file
-            temp_pdf_file.write(pdf_bytes)
-
             extracted_text = await loop.run_in_executor(
-                executor, extract_text_from_pdf, temp_pdf_file.name
+                executor, extract_text_from_pdf, file.file
             )
-            
-            temp_pdf_file.close()
-            os.unlink(temp_pdf_file.name)
 
             chunks = await loop.run_in_executor(
                 executor, split_into_chunks, extracted_text
@@ -71,14 +59,19 @@ async def analyze_document(file: UploadFile) -> dict:
             chat = ChatOpenAI(temperature=0.0, model="gpt-4")
             
             # run tasks in parallel
+            start_chat_tasks = time.time()
             tasks = [loop.run_in_executor(executor, chat, product_prompt_template.format_messages(text=chunk)) for chunk in chunks]
             insights = await asyncio.gather(*tasks)
+            print(f'Time taken for initial chunk insights: {time.time() - start_chat_tasks} seconds')
 
             #append insights into final product prompt
-            summary = final_product_prompt_template.format_messages(text=insights)
-            chat = ChatOpenAI(temperature=0.0, model="gpt-4")
-            final_insights = await loop.run_in_executor(executor, chat, summary)
+            start_agg_insights = time.time()
+            agg_insights = final_product_prompt_template.format_messages(text=insights)
+            final_insights = await loop.run_in_executor(executor, chat, agg_insights)
+            print(f'Time taken for final insights: {time.time() - start_agg_insights} seconds')
 
+            execution_time = time.time() - start
+            print(f'Time taken: {execution_time} seconds')
             return final_insights
 
         elif file.endswith(".txt"):
diff --git a/backend/app/utils/document_utils.py b/backend/app/utils/document_utils.py
index a6103e1..910757a 100644
--- a/backend/app/utils/document_utils.py
+++ b/backend/app/utils/document_utils.py
@@ -1,7 +1,6 @@
 
 from langchain.text_splitter import CharacterTextSplitter
 import pdfplumber
-import math
 # from PyPDF2 import PdfReader
 
 from nltk.tokenize import word_tokenize, sent_tokenize
@@ -12,13 +11,13 @@ def extract_text_from_pdf(pdf) -> str:
     return text
 
 
-def split_into_chunks(text: str) -> list[str]:
-    chunk_size = len(text) / 2
-    chunk_overlap = math.floor(chunk_size * .05)
+def split_into_chunks(text: str, chunk_size: int=8000, chunk_overlap: int=400) -> list[str]:
+
     text_splitter = CharacterTextSplitter(
         separator="\n", chunk_size=chunk_size, chunk_overlap=chunk_overlap, length_function=len
     )
     chunks = text_splitter.split_text(text)
+    print(f'There are {len(chunks)} chunks with a chunk size of {chunk_size} and an overlap size of {chunk_overlap}')
     return chunks
 
 # def split_into_chunks(text: str, max_token_count: int=7000) -> list[str]:

From 848aa9b538effbe3aed00e87c11c84a5ed72a9c9 Mon Sep 17 00:00:00 2001
From: zachary_royals <royalszachary@gmail.com>
Date: Mon, 10 Jul 2023 12:56:02 -0400
Subject: [PATCH 2/4] guardrails package and rail file added.

---
 backend/app/api/routes.py             | 33 +++++++++++++++------------
 backend/app/api/sales_transcript.rail | 27 ++++++++++++++++++++++
 2 files changed, 45 insertions(+), 15 deletions(-)
 create mode 100644 backend/app/api/sales_transcript.rail

diff --git a/backend/app/api/routes.py b/backend/app/api/routes.py
index 9e57558..e68f68b 100644
--- a/backend/app/api/routes.py
+++ b/backend/app/api/routes.py
@@ -1,6 +1,8 @@
 import asyncio
 import os
 import time
+import openai
+
 from fastapi import APIRouter, UploadFile
 from fastapi.responses import HTMLResponse
 from langchain.chat_models import ChatOpenAI
@@ -9,6 +11,9 @@
 from dotenv import load_dotenv, find_dotenv
 from .prompts import product_prompt_template, final_product_prompt_template
 from utils.document_utils import extract_text_from_pdf, split_into_chunks
+from rich import print
+
+import guardrails as gd
 
 _ = load_dotenv(find_dotenv())
 
@@ -48,31 +53,29 @@ async def analyze_document(file: UploadFile) -> dict:
 
     with ThreadPoolExecutor() as executor:
         if filename.endswith(".pdf"):
-
             extracted_text = await loop.run_in_executor(
                 executor, extract_text_from_pdf, file.file
             )
 
+            guard = gd.Guard.from_rail('/Users/Zachary_Royals/Code/zelta-challenge/backend/app/api/sales_transcript.rail')
+            
             chunks = await loop.run_in_executor(
                 executor, split_into_chunks, extracted_text
             )
-            chat = ChatOpenAI(temperature=0.0, model="gpt-4")
-            
             # run tasks in parallel
-            start_chat_tasks = time.time()
-            tasks = [loop.run_in_executor(executor, chat, product_prompt_template.format_messages(text=chunk)) for chunk in chunks]
-            insights = await asyncio.gather(*tasks)
-            print(f'Time taken for initial chunk insights: {time.time() - start_chat_tasks} seconds')
-
-            #append insights into final product prompt
-            start_agg_insights = time.time()
-            agg_insights = final_product_prompt_template.format_messages(text=insights)
-            final_insights = await loop.run_in_executor(executor, chat, agg_insights)
-            print(f'Time taken for final insights: {time.time() - start_agg_insights} seconds')
-
+            for chunk in chunks:
+                raw_llm_output, validated_output = guard(
+                                                    openai.ChatCompletion.create,
+                                                    prompt_params={"sales_transcript": chunk},
+                                                    engine="chat-gpt4",
+                                                    max_tokens=1024,
+                                                    temperature=0.3,
+                                                    
+                                            )
+                
             execution_time = time.time() - start
             print(f'Time taken: {execution_time} seconds')
-            return final_insights
+            return validated_output
 
         elif file.endswith(".txt"):
             return "This is a text file."
diff --git a/backend/app/api/sales_transcript.rail b/backend/app/api/sales_transcript.rail
new file mode 100644
index 0000000..6a8011e
--- /dev/null
+++ b/backend/app/api/sales_transcript.rail
@@ -0,0 +1,27 @@
+<rail version="0.1">
+
+<output>
+    <object name="customer_feedback">
+        <string name="delivery_days" description="delivery days of the product"/>
+        <string name="price_value" description="Any sentences about the price or value"/>
+        <string name="feature_requests" description="Any sentences about features or improvements that could be added"/>
+        <list name="competitor_mentions" description="Mentions of a competitor. Each company should be classified into separate item in the list.">
+            <object>
+                <string name="name" description="Name of the company" />
+                <string name="advantages" description="Describes what strengths of the competition's company has to offer" />
+                <string name="disadvtanges" description="Describes what the competitor's company is bad at" />
+            </object>
+        </list>
+    </object>
+</output>
+
+
+<prompt>
+
+Given the following sales transcript extract the following information from speaker two, please extract a dictionary that contains the customer's feedback.
+
+{{sales_transcript}}
+
+</prompt>
+
+</rail>
\ No newline at end of file

From 4de906a726e72d4fc44a94ecf25878f8634147c2 Mon Sep 17 00:00:00 2001
From: zachary_royals <royalszachary@gmail.com>
Date: Wed, 12 Jul 2023 11:56:10 -0400
Subject: [PATCH 3/4] output_schema added to rail file, chunking change

---
 backend/app/api/routes.py             | 13 ++++++++-----
 backend/app/api/sales_transcript.rail | 24 +++++++++++++-----------
 backend/app/utils/document_utils.py   |  2 +-
 3 files changed, 22 insertions(+), 17 deletions(-)

diff --git a/backend/app/api/routes.py b/backend/app/api/routes.py
index e68f68b..bc83875 100644
--- a/backend/app/api/routes.py
+++ b/backend/app/api/routes.py
@@ -63,16 +63,19 @@ async def analyze_document(file: UploadFile) -> dict:
                 executor, split_into_chunks, extracted_text
             )
             # run tasks in parallel
+            validated_outputs = []
             for chunk in chunks:
-                raw_llm_output, validated_output = guard(
+                _, validated_output = guard(
                                                     openai.ChatCompletion.create,
                                                     prompt_params={"sales_transcript": chunk},
-                                                    engine="chat-gpt4",
-                                                    max_tokens=1024,
-                                                    temperature=0.3,
+                                                    model="gpt-4",
+                                                    max_tokens=6000,
+                                                    temperature=0.0,
                                                     
                                             )
-                
+                validated_outputs.append(validated_output)
+            
+            # TODO: add a follow-up prompt to distill the collected validated outputs?
             execution_time = time.time() - start
             print(f'Time taken: {execution_time} seconds')
             return validated_output
diff --git a/backend/app/api/sales_transcript.rail b/backend/app/api/sales_transcript.rail
index 6a8011e..ccd93c2 100644
--- a/backend/app/api/sales_transcript.rail
+++ b/backend/app/api/sales_transcript.rail
@@ -1,27 +1,29 @@
 <rail version="0.1">
 
 <output>
-    <object name="customer_feedback">
+    <object name="customer_feedback" description="customer feedback coming from speaker two">
         <string name="delivery_days" description="delivery days of the product"/>
-        <string name="price_value" description="Any sentences about the price or value"/>
-        <string name="feature_requests" description="Any sentences about features or improvements that could be added"/>
-        <list name="competitor_mentions" description="Mentions of a competitor. Each company should be classified into separate item in the list.">
-            <object>
-                <string name="name" description="Name of the company" />
-                <string name="advantages" description="Describes what strengths of the competition's company has to offer" />
-                <string name="disadvtanges" description="Describes what the competitor's company is bad at" />
-            </object>
-        </list>
+        <string name="price_value" description="Mentions about the price to value"/>
+        <string name="feature_requests" description="Features or improvements that could be added"/>
     </object>
 </output>
 
 
 <prompt>
 
-Given the following sales transcript extract the following information from speaker two, please extract a dictionary that contains the customer's feedback.
+
+Given the following document, construct a JSON that follows that correct schema.
+
 
 {{sales_transcript}}
 
+@xml_prefix_prompt  
+
+
+{{output_schema}}  
+
+
+@json_suffix_prompt
 </prompt>
 
 </rail>
\ No newline at end of file
diff --git a/backend/app/utils/document_utils.py b/backend/app/utils/document_utils.py
index 910757a..92417a7 100644
--- a/backend/app/utils/document_utils.py
+++ b/backend/app/utils/document_utils.py
@@ -11,7 +11,7 @@ def extract_text_from_pdf(pdf) -> str:
     return text
 
 
-def split_into_chunks(text: str, chunk_size: int=8000, chunk_overlap: int=400) -> list[str]:
+def split_into_chunks(text: str, chunk_size: int=6000, chunk_overlap: int=400) -> list[str]:
 
     text_splitter = CharacterTextSplitter(
         separator="\n", chunk_size=chunk_size, chunk_overlap=chunk_overlap, length_function=len

From 46cd4922cb612f16d2a1f5ac2c2034befb844a13 Mon Sep 17 00:00:00 2001
From: Luca Blight <46580497+Luca-Blight@users.noreply.github.com>
Date: Wed, 12 Jul 2023 11:58:04 -0400
Subject: [PATCH 4/4] Delete .DS_Store

removed .DS_Store
---
 .DS_Store | Bin 6148 -> 0 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 .DS_Store

diff --git a/.DS_Store b/.DS_Store
deleted file mode 100644
index 6d25650cd55bea37daf28ffe8a37dcab6f949329..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 6148
zcmeHKK~BRk5S%R)ZRw>a#04oYsKgIK)gCx-fd>SVR#3Gx6%-+Ex$_6U!23ACtZjuf
z4L5`k+Ktv@uXoq>IErHcGJ2ez00RJBs$#3hqCvD@bw^skqMhP$WVl9#Tg;H`c$?ub
zGN60c=WZof<C%N+wSURkGSAXwp3*~Kk5lUK!wGA^K7H4Gh8kJ(eXIF)kh;tZz8Nb_
zx4q@K=6%47_sDTa+~A>U1bMH;^l*U*=QzeWamd)(C~p3*mHBs=d&caW#{4hLU-j;n
z-U(|@oL$4Z-Abxahh@~9>%Jk&D@N+LDKW=_S?0<g?P-k_U12pBn<mD9F<=b*5Cc53
zRl0qTRvH7wfH6=qpx=jxsu%|>Ji1Q@3wH$|wrO_4y8bL`OcXE<Sa@U)MSLjHhf26&
zh!3Yd3Vw0G!lMs|gv*D7ktN(v#Es7Uk#mQ{9<4M6jDc+i_Iz`o=l|sU`~P;3wTuB{
z;9oHy?eXP!#3hBZb!TyU)<)DTs*3s*9?K9`+)+$lJ&KR0PH2y`LW~0z9%-T2kAS1W
K3S;0`8TbSiE`DtQ