diff --git a/.DS_Store b/.DS_Store
deleted file mode 100644
index 6d25650..0000000
Binary files a/.DS_Store and /dev/null differ
diff --git a/backend/app/api/prompts.py b/backend/app/api/prompts.py
index 2aa232a..3325c26 100644
--- a/backend/app/api/prompts.py
+++ b/backend/app/api/prompts.py
@@ -6,19 +6,19 @@
 delivery_days: How many days did it take for the product \
-to arrive? If this information is not found, output is "not found"., do not use -1
+to arrive? If this information is not found, output is "not found".

 price_value: Extract any sentences about the value or price,\
-and output them as a comma separated Python list. If this information is not found, output is "not found", do not use -1
+and output them as a comma separated Python list. If this information is not found, output is "not found"

 customer_negative_feedback: Extract any problems customers are facing with the current product \
-If this information is not found, output is "not found", do not use -1
+If this information is not found, output is "not found"

 feature_requests: Extract any sentences about feature requests,\
-and output them as a comma separated Python list. If this information is not found, output is "not found", do not use -1
+and output them as a comma separated Python list. If this information is not found, output is "not found"

 competitor_mentions: Extract any sentences about the competition\
-and output them as a comma separated Python list. If this information is not found, output is "not found", do not use -1
+and output them as a comma separated Python list. If this information is not found, output is "not found"

 Format the output as JSON with the following keys:
@@ -33,7 +33,7 @@
 final_prompt_string = """\
-For the following text, distill the following information from from the key attributes and maintain spacing between words , please ignore negative or "not found" values:
+For the following text, amalgamate the values from the text into a single JSON output, keeping the attributes provided below. Please ignore "not found" values:

 delivery_days: How many days did it take for the product \
 to arrive? If this information is not found, output is "not found", do not use -1
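For orientation, here is a minimal sketch of how these prompt strings might plug into the `product_prompt_template` object that `routes.py` imports. The wiring is not shown in this diff; LangChain's `ChatPromptTemplate` is an assumption (though strongly implied by the `format_messages(text=chunk)` call removed from `routes.py` below), and `prompt_string` is an abbreviated stand-in for the full attribute prompt above:

```python
# Hypothetical sketch -- not part of the diff. Assumes LangChain's
# ChatPromptTemplate; prompt_string abbreviates the full prompt above.
from langchain.prompts import ChatPromptTemplate

prompt_string = """\
delivery_days: How many days did it take for the product \
to arrive? If this information is not found, output is "not found".

Format the output as JSON with the following keys:
delivery_days

text: {text}
"""

# routes.py calls product_prompt_template.format_messages(text=chunk)
product_prompt_template = ChatPromptTemplate.from_template(prompt_string)
messages = product_prompt_template.format_messages(text="Arrived in 3 days, great value.")
print(messages[0].content)
```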
diff --git a/backend/app/api/routes.py b/backend/app/api/routes.py
index 88c38e9..bc83875 100644
--- a/backend/app/api/routes.py
+++ b/backend/app/api/routes.py
@@ -1,6 +1,7 @@
 import asyncio
 import os
-import tempfile
+import time
+import openai

 from fastapi import APIRouter, UploadFile
 from fastapi.responses import HTMLResponse
@@ -8,9 +9,11 @@
 from concurrent.futures import ThreadPoolExecutor
 from dotenv import load_dotenv, find_dotenv

-from prompts import product_prompt_template, final_product_prompt_template
+from .prompts import product_prompt_template, final_product_prompt_template
 from utils.document_utils import extract_text_from_pdf, split_into_chunks

+from rich import print
+import guardrails as gd

 _ = load_dotenv(find_dotenv())
@@ -18,8 +21,6 @@
 router = APIRouter()

-
-
 @router.get("/", response_class=HTMLResponse)
 def root():
     return """
@@ -46,40 +47,38 @@ def root():

 @router.post("/analyze")
 async def analyze_document(file: UploadFile) -> dict:
+    start = time.time()
     filename = file.filename
-    breakpoint()
     loop = asyncio.get_event_loop()

     with ThreadPoolExecutor() as executor:
         if filename.endswith(".pdf"):
-
-            ### unable to extract
-            pdf_bytes = await file.read()  # read file into bytes
-            temp_pdf_file = tempfile.NamedTemporaryFile(delete=False)  # write bytes to a temporary file
-            temp_pdf_file.write(pdf_bytes)
-
             extracted_text = await loop.run_in_executor(
-                executor, extract_text_from_pdf, temp_pdf_file.name
+                executor, extract_text_from_pdf, file.file
             )
-
-            temp_pdf_file.close()
-            os.unlink(temp_pdf_file.name)
+            guard = gd.Guard.from_rail('/Users/Zachary_Royals/Code/zelta-challenge/backend/app/api/sales_transcript.rail')
+
             chunks = await loop.run_in_executor(
                 executor, split_into_chunks, extracted_text
             )
-            chat = ChatOpenAI(temperature=0.0, model="gpt-4")
-            # run tasks in parallel
-            tasks = [loop.run_in_executor(executor, chat, product_prompt_template.format_messages(text=chunk)) for chunk in chunks]
-            insights = await asyncio.gather(*tasks)
-
-            # append insights into final product prompt
-            summary = final_product_prompt_template.format_messages(text=insights)
-            chat = ChatOpenAI(temperature=0.0, model="gpt-4")
-            final_insights = await loop.run_in_executor(executor, chat, summary)
-
-            return final_insights
+            validated_outputs = []
+            for chunk in chunks:
+                _, validated_output = guard(
+                    openai.ChatCompletion.create,
+                    prompt_params={"sales_transcript": chunk},
+                    model="gpt-4",
+                    max_tokens=6000,
+                    temperature=0.0,
+                )
+                validated_outputs.append(validated_output)
+
+            # additional prompt to distill the collection of validated outputs?
+            execution_time = time.time() - start
+            print(f'Time taken: {execution_time} seconds')
+            return validated_outputs

         elif file.endswith(".txt"):
             return "This is a text file."
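One portability note on the new Guardrails wiring: `Guard.from_rail` is handed an absolute path on the author's machine. A small sketch of an alternative (an assumption, not something this diff does) that resolves the rail spec relative to the module:

```python
# Sketch: locate sales_transcript.rail relative to this module rather than
# via an absolute path, so /analyze works on any checkout of the repo.
from pathlib import Path

import guardrails as gd

RAIL_PATH = Path(__file__).with_name("sales_transcript.rail")
guard = gd.Guard.from_rail(str(RAIL_PATH))
```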
diff --git a/backend/app/api/sales_transcript.rail b/backend/app/api/sales_transcript.rail
new file mode 100644
index 0000000..ccd93c2
--- /dev/null
+++ b/backend/app/api/sales_transcript.rail
@@ -0,0 +1,29 @@
+<rail version="0.1">
+
+<output>
+
+    <string name="delivery_days"/>
+    <string name="price_value"/>
+    <string name="customer_negative_feedback"/>
+    <string name="feature_requests"/>
+    <string name="competitor_mentions"/>
+
+</output>
+
+
+<prompt>
+Given the following document, construct a JSON that follows the correct schema.
+
+
+{{sales_transcript}}
+
+@xml_prefix_prompt
+
+
+{{output_schema}}
+
+
+@json_suffix_prompt
+
+</prompt>
+</rail>
\ No newline at end of file
diff --git a/backend/app/utils/document_utils.py b/backend/app/utils/document_utils.py
index a6103e1..92417a7 100644
--- a/backend/app/utils/document_utils.py
+++ b/backend/app/utils/document_utils.py
@@ -1,7 +1,6 @@
 from langchain.text_splitter import CharacterTextSplitter
 import pdfplumber
-import math

 # from PyPDF2 import PdfReader
 from nltk.tokenize import word_tokenize, sent_tokenize
@@ -12,13 +11,13 @@ def extract_text_from_pdf(pdf) -> str:
     return text


-def split_into_chunks(text: str) -> list[str]:
-    chunk_size = len(text) / 2
-    chunk_overlap = math.floor(chunk_size * .05)
+def split_into_chunks(text: str, chunk_size: int = 6000, chunk_overlap: int = 400) -> list[str]:
+
     text_splitter = CharacterTextSplitter(
         separator="\n",
         chunk_size=chunk_size,
         chunk_overlap=chunk_overlap,
         length_function=len
     )
     chunks = text_splitter.split_text(text)
+    print(f'There are {len(chunks)} chunks with a chunk size of {chunk_size} and an overlap size of {chunk_overlap}')
     return chunks

 # def split_into_chunks(text: str, max_token_count: int=7000) -> list[str]:
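For reference, the revised chunking helper can be exercised on its own. A quick usage sketch (the helper is the function from the diff; the sample transcript is fabricated for illustration):

```python
# Usage sketch for the revised split_into_chunks (function as in the diff).
from langchain.text_splitter import CharacterTextSplitter

def split_into_chunks(text: str, chunk_size: int = 6000, chunk_overlap: int = 400) -> list[str]:
    text_splitter = CharacterTextSplitter(
        separator="\n",               # split on newlines first
        chunk_size=chunk_size,        # target size, measured by length_function
        chunk_overlap=chunk_overlap,  # characters shared between adjacent chunks
        length_function=len,          # size in characters, not tokens
    )
    chunks = text_splitter.split_text(text)
    print(f'There are {len(chunks)} chunks with a chunk size of {chunk_size} and an overlap size of {chunk_overlap}')
    return chunks

# Fabricated transcript: ~2,000 short lines, so several chunks at the defaults.
transcript = "\n".join(f"Speaker {i % 2}: line {i} of the call." for i in range(2000))
chunks = split_into_chunks(transcript)
```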