From c805162f624ff75d1b5fd308cfaf423baafde8a2 Mon Sep 17 00:00:00 2001 From: zachary_royals Date: Wed, 28 Jun 2023 14:26:21 -0400 Subject: [PATCH 1/4] loose ends removed --- backend/app/api/prompts.py | 12 +++++------ backend/app/api/routes.py | 31 +++++++++++------------------ backend/app/utils/document_utils.py | 7 +++---- 3 files changed, 21 insertions(+), 29 deletions(-) diff --git a/backend/app/api/prompts.py b/backend/app/api/prompts.py index 2aa232a..3325c26 100644 --- a/backend/app/api/prompts.py +++ b/backend/app/api/prompts.py @@ -6,19 +6,19 @@ delivery_days: How many days did it take for the product \ -to arrive? If this information is not found, output is "not found"., do not use -1 +to arrive? If this information is not found, output is "not found"., price_value: Extract any sentences about the value or price,\ -and output them as a comma separated Python list. If this information is not found, output is "not found", do not use -1 +and output them as a comma separated Python list. If this information is not found, output is "not found" customer_negative_feedback: Extract any problems customers are facing with the current product \ -If this information is not found, output is "not found", do not use -1 +If this information is not found, output is "not found" feature_requests: Extract any sentences about feature requests,\ -and output them as a comma separated Python list. If this information is not found, output is "not found", do not use -1 +and output them as a comma separated Python list. If this information is not found, output is "not found" competitor_mentions: Extract any sentences about the competition\ -and output them as a comma separated Python list. If this information is not found, output is "not found", do not use -1 +and output them as a comma separated Python list. If this information is not found, output is "not found" Format the output as JSON with the following keys: @@ -33,7 +33,7 @@ final_prompt_string = """\ -For the following text, distill the following information from from the key attributes and maintain spacing between words , please ignore negative or "not found" values: +For the following text, amalgamate the values from the text into a single json output, keep the attributes provided below. please ignore "not found" values: delivery_days: How many days did it take for the product \ to arrive? If this information is not found, output is "not found", do not use -1 diff --git a/backend/app/api/routes.py b/backend/app/api/routes.py index 88c38e9..9e57558 100644 --- a/backend/app/api/routes.py +++ b/backend/app/api/routes.py @@ -1,25 +1,21 @@ import asyncio import os -import tempfile - +import time from fastapi import APIRouter, UploadFile from fastapi.responses import HTMLResponse from langchain.chat_models import ChatOpenAI from concurrent.futures import ThreadPoolExecutor from dotenv import load_dotenv, find_dotenv -from prompts import product_prompt_template, final_product_prompt_template +from .prompts import product_prompt_template, final_product_prompt_template from utils.document_utils import extract_text_from_pdf, split_into_chunks - _ = load_dotenv(find_dotenv()) OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") router = APIRouter() - - @router.get("/", response_class=HTMLResponse) def root(): return """ @@ -46,24 +42,16 @@ def root(): @router.post("/analyze") async def analyze_document(file: UploadFile) -> dict: + start = time.time() filename = file.filename - breakpoint() loop = asyncio.get_event_loop() with ThreadPoolExecutor() as executor: if filename.endswith(".pdf"): - ### unable to extract - pdf_bytes = await file.read() # read file into bytes - temp_pdf_file = tempfile.NamedTemporaryFile(delete=False) # write bytes to a temporary file - temp_pdf_file.write(pdf_bytes) - extracted_text = await loop.run_in_executor( - executor, extract_text_from_pdf, temp_pdf_file.name + executor, extract_text_from_pdf, file.file ) - - temp_pdf_file.close() - os.unlink(temp_pdf_file.name) chunks = await loop.run_in_executor( executor, split_into_chunks, extracted_text @@ -71,14 +59,19 @@ async def analyze_document(file: UploadFile) -> dict: chat = ChatOpenAI(temperature=0.0, model="gpt-4") # run tasks in parallel + start_chat_tasks = time.time() tasks = [loop.run_in_executor(executor, chat, product_prompt_template.format_messages(text=chunk)) for chunk in chunks] insights = await asyncio.gather(*tasks) + print(f'Time taken for initial chunk insights: {time.time() - start_chat_tasks} seconds') #append insights into final product prompt - summary = final_product_prompt_template.format_messages(text=insights) - chat = ChatOpenAI(temperature=0.0, model="gpt-4") - final_insights = await loop.run_in_executor(executor, chat, summary) + start_agg_insights = time.time() + agg_insights = final_product_prompt_template.format_messages(text=insights) + final_insights = await loop.run_in_executor(executor, chat, agg_insights) + print(f'Time taken for final insights: {time.time() - start_agg_insights} seconds') + execution_time = time.time() - start + print(f'Time taken: {execution_time} seconds') return final_insights elif file.endswith(".txt"): diff --git a/backend/app/utils/document_utils.py b/backend/app/utils/document_utils.py index a6103e1..910757a 100644 --- a/backend/app/utils/document_utils.py +++ b/backend/app/utils/document_utils.py @@ -1,7 +1,6 @@ from langchain.text_splitter import CharacterTextSplitter import pdfplumber -import math # from PyPDF2 import PdfReader from nltk.tokenize import word_tokenize, sent_tokenize @@ -12,13 +11,13 @@ def extract_text_from_pdf(pdf) -> str: return text -def split_into_chunks(text: str) -> list[str]: - chunk_size = len(text) / 2 - chunk_overlap = math.floor(chunk_size * .05) +def split_into_chunks(text: str, chunk_size: int=8000, chunk_overlap: int=400) -> list[str]: + text_splitter = CharacterTextSplitter( separator="\n", chunk_size=chunk_size, chunk_overlap=chunk_overlap, length_function=len ) chunks = text_splitter.split_text(text) + print(f'There are {len(chunks)} chunks with a chunk size of {chunk_size} and an overlap size of {chunk_overlap}') return chunks # def split_into_chunks(text: str, max_token_count: int=7000) -> list[str]: From 848aa9b538effbe3aed00e87c11c84a5ed72a9c9 Mon Sep 17 00:00:00 2001 From: zachary_royals Date: Mon, 10 Jul 2023 12:56:02 -0400 Subject: [PATCH 2/4] guardrails package and rail file added. --- backend/app/api/routes.py | 33 +++++++++++++++------------ backend/app/api/sales_transcript.rail | 27 ++++++++++++++++++++++ 2 files changed, 45 insertions(+), 15 deletions(-) create mode 100644 backend/app/api/sales_transcript.rail diff --git a/backend/app/api/routes.py b/backend/app/api/routes.py index 9e57558..e68f68b 100644 --- a/backend/app/api/routes.py +++ b/backend/app/api/routes.py @@ -1,6 +1,8 @@ import asyncio import os import time +import openai + from fastapi import APIRouter, UploadFile from fastapi.responses import HTMLResponse from langchain.chat_models import ChatOpenAI @@ -9,6 +11,9 @@ from dotenv import load_dotenv, find_dotenv from .prompts import product_prompt_template, final_product_prompt_template from utils.document_utils import extract_text_from_pdf, split_into_chunks +from rich import print + +import guardrails as gd _ = load_dotenv(find_dotenv()) @@ -48,31 +53,29 @@ async def analyze_document(file: UploadFile) -> dict: with ThreadPoolExecutor() as executor: if filename.endswith(".pdf"): - extracted_text = await loop.run_in_executor( executor, extract_text_from_pdf, file.file ) + guard = gd.Guard.from_rail('/Users/Zachary_Royals/Code/zelta-challenge/backend/app/api/sales_transcript.rail') + chunks = await loop.run_in_executor( executor, split_into_chunks, extracted_text ) - chat = ChatOpenAI(temperature=0.0, model="gpt-4") - # run tasks in parallel - start_chat_tasks = time.time() - tasks = [loop.run_in_executor(executor, chat, product_prompt_template.format_messages(text=chunk)) for chunk in chunks] - insights = await asyncio.gather(*tasks) - print(f'Time taken for initial chunk insights: {time.time() - start_chat_tasks} seconds') - - #append insights into final product prompt - start_agg_insights = time.time() - agg_insights = final_product_prompt_template.format_messages(text=insights) - final_insights = await loop.run_in_executor(executor, chat, agg_insights) - print(f'Time taken for final insights: {time.time() - start_agg_insights} seconds') - + for chunk in chunks: + raw_llm_output, validated_output = guard( + openai.ChatCompletion.create, + prompt_params={"sales_transcript": chunk}, + engine="chat-gpt4", + max_tokens=1024, + temperature=0.3, + + ) + execution_time = time.time() - start print(f'Time taken: {execution_time} seconds') - return final_insights + return validated_output elif file.endswith(".txt"): return "This is a text file." diff --git a/backend/app/api/sales_transcript.rail b/backend/app/api/sales_transcript.rail new file mode 100644 index 0000000..6a8011e --- /dev/null +++ b/backend/app/api/sales_transcript.rail @@ -0,0 +1,27 @@ + + + + + + + + + + + + + + + + + + + + +Given the following sales transcript extract the following information from speaker two, please extract a dictionary that contains the customer's feedback. + +{{sales_transcript}} + + + + \ No newline at end of file From 4de906a726e72d4fc44a94ecf25878f8634147c2 Mon Sep 17 00:00:00 2001 From: zachary_royals Date: Wed, 12 Jul 2023 11:56:10 -0400 Subject: [PATCH 3/4] output_schema added to rail file, chunking change --- backend/app/api/routes.py | 13 ++++++++----- backend/app/api/sales_transcript.rail | 24 +++++++++++++----------- backend/app/utils/document_utils.py | 2 +- 3 files changed, 22 insertions(+), 17 deletions(-) diff --git a/backend/app/api/routes.py b/backend/app/api/routes.py index e68f68b..bc83875 100644 --- a/backend/app/api/routes.py +++ b/backend/app/api/routes.py @@ -63,16 +63,19 @@ async def analyze_document(file: UploadFile) -> dict: executor, split_into_chunks, extracted_text ) # run tasks in parallel + validated_outputs = [] for chunk in chunks: - raw_llm_output, validated_output = guard( + _, validated_output = guard( openai.ChatCompletion.create, prompt_params={"sales_transcript": chunk}, - engine="chat-gpt4", - max_tokens=1024, - temperature=0.3, + model="gpt-4", + max_tokens=6000, + temperature=0.0, ) - + validated_outputs.append(validated_output) + + # additional prompt to still collection of validated outputs? execution_time = time.time() - start print(f'Time taken: {execution_time} seconds') return validated_output diff --git a/backend/app/api/sales_transcript.rail b/backend/app/api/sales_transcript.rail index 6a8011e..ccd93c2 100644 --- a/backend/app/api/sales_transcript.rail +++ b/backend/app/api/sales_transcript.rail @@ -1,27 +1,29 @@ - + - - - - - - - - - + + -Given the following sales transcript extract the following information from speaker two, please extract a dictionary that contains the customer's feedback. + +Given the following document, construct a JSON that follows that correct schema. + {{sales_transcript}} +@xml_prefix_prompt + + +{{output_schema}} + + +@json_suffix_prompt \ No newline at end of file diff --git a/backend/app/utils/document_utils.py b/backend/app/utils/document_utils.py index 910757a..92417a7 100644 --- a/backend/app/utils/document_utils.py +++ b/backend/app/utils/document_utils.py @@ -11,7 +11,7 @@ def extract_text_from_pdf(pdf) -> str: return text -def split_into_chunks(text: str, chunk_size: int=8000, chunk_overlap: int=400) -> list[str]: +def split_into_chunks(text: str, chunk_size: int=6000, chunk_overlap: int=400) -> list[str]: text_splitter = CharacterTextSplitter( separator="\n", chunk_size=chunk_size, chunk_overlap=chunk_overlap, length_function=len From 46cd4922cb612f16d2a1f5ac2c2034befb844a13 Mon Sep 17 00:00:00 2001 From: Luca Blight <46580497+Luca-Blight@users.noreply.github.com> Date: Wed, 12 Jul 2023 11:58:04 -0400 Subject: [PATCH 4/4] Delete .DS_Store removed .DS_Store --- .DS_Store | Bin 6148 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 .DS_Store diff --git a/.DS_Store b/.DS_Store deleted file mode 100644 index 6d25650cd55bea37daf28ffe8a37dcab6f949329..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHKK~BRk5S%R)ZRw>a#04oYsKgIK)gCx-fd>SVR#3Gx6%-+Ex$_6U!23ACtZjuf z4L5`k+Ktv@uXoq>IErHcGJ2ez00RJBs$#3hqCvD@bw^skqMhP$WVl9#Tg;H`c$?ub zGN60c=WZofU1bMH;^l*U*=QzeWamd)(C~p3*mHBs=d&caW#{4hLU-j;n z-U(|@oL$4Z-Abxahh@~9>%Jk&D@N+LDKW=_S?0Ji1Q@3wH$|wrO_4y8bL`OcXE