# p1.py
import streamlit as st
import pdfplumber
from llama_index.core import VectorStoreIndex, Document, Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
# Set the embedding model to a Sentence-Transformers DistilBERT model for
# sentence-level embeddings (the full hub id, including the
# "sentence-transformers/" prefix, is needed to resolve on the Hugging Face hub)
Settings.embed_model = HuggingFaceEmbedding(
    model_name="sentence-transformers/distilbert-base-nli-stsb-mean-tokens"
)
# Function to process a PDF and extract text
def process_pdf(file):
    with pdfplumber.open(file) as pdf:
        text = "\n".join([page.extract_text() for page in pdf.pages if page.extract_text()])
    return text
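
# Note: pdfplumber's page.extract_text() returns None for pages without a text
# layer (e.g. scanned images), which is why the comprehension above filters
# those pages out; OCR would be needed to recover text from scans.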
# Function to create a LlamaIndex from a list of documents using Sentence-BERT embeddings
def create_llama_index(documents):
    # Split documents into smaller chunks for better retrieval
    chunked_documents = []
    for doc in documents:
        text = doc.text
        chunks = [text[i:i + 500] for i in range(0, len(text), 500)]  # Chunking text into fixed 500-character pieces
        for chunk in chunks:
            chunked_documents.append(Document(text=chunk, metadata=doc.metadata))
    # Create the index using the embedding model configured in Settings
    index = VectorStoreIndex.from_documents(chunked_documents)
    return index
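
# An alternative to the fixed 500-character slicing above: llama_index ships a
# SentenceSplitter node parser that chunks on sentence boundaries, which tends
# to keep retrieved passages coherent. A minimal sketch, assuming the current
# llama_index API:
#
#   from llama_index.core.node_parser import SentenceSplitter
#   splitter = SentenceSplitter(chunk_size=512, chunk_overlap=50)
#   nodes = splitter.get_nodes_from_documents(documents)
#   index = VectorStoreIndex(nodes)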
# Function to query the LlamaIndex
def query_llama_index(question, index):
    retriever = index.as_retriever()
    response = retriever.retrieve(question)
    if response:
        # Access the text content from the first (top-ranked) response node
        return response[0].node.get_text()
    return "No relevant context found."
# Streamlit App Configuration
st.set_page_config(page_title="PDF Q&A Chatbot", layout="wide")
# Sidebar for File Upload
st.sidebar.title("📂 File Upload")
st.sidebar.write("Upload PDFs to process:")
uploaded_files = st.sidebar.file_uploader("Upload PDFs for Q&A", type="pdf", accept_multiple_files=True)
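
# st.file_uploader returns in-memory UploadedFile objects; pdfplumber.open()
# accepts file-like objects, so process_pdf() can consume them directly.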
documents = []
if uploaded_files:
    for uploaded_file in uploaded_files:
        raw_text = process_pdf(uploaded_file)
        document = Document(text=raw_text, metadata={"source": uploaded_file.name})
        documents.append(document)
    st.sidebar.success(f"Processed {len(uploaded_files)} PDFs successfully!")
    # Create the index using Sentence-BERT embeddings
    index = create_llama_index(documents)
    st.sidebar.info("Documents have been indexed and are ready for Q&A!")
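
    # Streamlit reruns this script on every interaction, so the index above is
    # rebuilt each time. A sketch of caching it with st.cache_resource (the
    # build_index helper name is illustrative):
    #
    #   @st.cache_resource
    #   def build_index(_docs):
    #       return create_llama_index(_docs)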
# Main App Content
st.title("🤖 Chatbot with PDF Knowledge")
st.write("Ask questions based on the uploaded PDFs in the sidebar.")
if uploaded_files and documents:
    question = st.text_input("Type your question here:")
    if question:
        with st.spinner("Fetching the answer..."):
            answer = query_llama_index(question, index)
        if answer:
            st.write(f"**Answer:** {answer}")
        else:
            st.warning("No relevant context found for your query.")
else:
    st.info("Please upload PDFs in the sidebar to get started.")