-
Notifications
You must be signed in to change notification settings - Fork 23
/
doc-gpt.py
70 lines (65 loc) · 2.69 KB
/
doc-gpt.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import os
import openai
import pypdf
import streamlit as st
from langchain.llms import OpenAIChat
from langchain.vectorstores import FAISS
from langchain.chains import VectorDBQA
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.document_loaders import UnstructuredPDFLoader
@st.cache_data
def split_pdf(fpath, chunk_chars=4000, overlap=50):
    """
    Pre-process PDF into overlapping text chunks.

    The text of every page is appended, in page order, to a running buffer.
    Whenever the buffer grows beyond `chunk_chars` characters, its first
    `chunk_chars` characters are emitted as a chunk and the buffer is advanced
    by `chunk_chars - overlap`, so consecutive chunks share `overlap`
    characters.

    Some code from: https://github.com/whitead/paper-qa/blob/main/paperqa/readers.py

    Args:
        fpath: PDF source handed directly to `pypdf.PdfReader`.
        chunk_chars: Maximum number of characters per chunk.
        overlap: Number of characters repeated at the start of the next chunk.
            Must be smaller than `chunk_chars`.

    Returns:
        List of chunk strings. The list is empty when the extracted text is no
        longer than `overlap` characters.

    Raises:
        ValueError: If `overlap` is not smaller than `chunk_chars`; the buffer
            would then never shrink and the chunking loop would not terminate.
    """
    if overlap >= chunk_chars:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than chunk_chars ({chunk_chars})"
        )

    st.info("`Reading and splitting doc ...`")
    reader = pypdf.PdfReader(fpath)
    step = chunk_chars - overlap
    splits = []
    split = ""
    for page in reader.pages:
        # `or ""` keeps the concatenation safe if a page yields no text object.
        split += page.extract_text() or ""
        while len(split) > chunk_chars:
            splits.append(split[:chunk_chars])
            split = split[step:]
    # A tail no longer than `overlap` is skipped: after at least one emitted
    # chunk it only repeats text that chunk already contains.
    if len(split) > overlap:
        splits.append(split[:chunk_chars])
    return splits
@st.cache_resource
def create_ix(splits):
    """
    Build the vector DB index for the PDF text chunks.

    Shows a progress note in the app, then embeds `splits` with a
    default-constructed `OpenAIEmbeddings` and stores them in a FAISS index.

    Args:
        splits: Text chunks to index (the list produced by `split_pdf`).

    Returns:
        The vector store returned by `FAISS.from_texts`.
    """
    st.info("`Building index ...`")
    # NOTE(review): no key is passed explicitly; presumably the embeddings
    # client picks up OPENAI_API_KEY from the environment - confirm.
    return FAISS.from_texts(splits, OpenAIEmbeddings())
# Sidebar: branding, credentials and chunking options
st.sidebar.image("Img/reading.jpg")
api_key = st.sidebar.text_input("`OpenAI API Key:`", type="password")
st.sidebar.write("`By:` [@RLanceMartin](https://twitter.com/RLanceMartin)")
# Export whatever was typed (possibly an empty string) as the process-wide key.
os.environ["OPENAI_API_KEY"] = api_key
chunk_chars = st.sidebar.radio(
    "`Choose chunk size for splitting`",
    (2000, 3000, 4000),
    index=1,
)
st.sidebar.info(
    "`Larger chunk size can produce better answers, but may hit ChatGPT context limit (4096 tokens)`"
)

# Main page: greeting and document upload
st.header("`doc-gpt`")
st.info("`Hello! I am a ChatGPT connected to whatever document you upload.`")
uploaded_file_pdf = st.file_uploader(
    "`Upload PDF File:` ",
    type=["pdf"],
    accept_multiple_files=False,
)

if not (uploaded_file_pdf and api_key):
    # Both a document and a key are needed before any work is attempted.
    st.info("`Please enter OpenAI Key and upload pdf file`")
else:
    # Chunk the uploaded document
    chunks = split_pdf(uploaded_file_pdf, chunk_chars)
    if not chunks:
        st.warning(
            "Error with reading pdf, often b/c it is a scanned image of text. Try another file.",
            icon="⚠️",
        )
    else:
        # Index the chunks and wire ChatGPT to the index through a QA chain
        index = create_ix(chunks)
        chat_model = OpenAIChat(temperature=0)
        qa_chain = VectorDBQA.from_chain_type(
            chat_model,
            chain_type="stuff",
            vectorstore=index,
        )
        question = st.text_input(
            "`Please ask a question:` ",
            "What is this document about?",
        )
        try:
            answer = qa_chain.run(question)
            st.info(f"`{answer}`")
        except openai.error.InvalidRequestError:
            # Limitation w/ ChatGPT: 4096 token context length
            # https://github.com/acheong08/ChatGPT/discussions/649
            st.warning(
                "Error with model request, often due to context length. Try reducing chunk size.",
                icon="⚠️",
            )