# p1.py
import streamlit as st
import pdfplumber
from llama_index.core import VectorStoreIndex, Document, Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
# Set the embedding model to a Sentence-Transformers DistilBERT model for
# sentence-level embeddings (the full hub id, including the
# "sentence-transformers/" prefix, is needed to resolve on the Hugging Face hub)
Settings.embed_model = HuggingFaceEmbedding(
    model_name="sentence-transformers/distilbert-base-nli-stsb-mean-tokens"
)
# Function to process a PDF and extract text
def process_pdf(file):
    with pdfplumber.open(file) as pdf:
        text = "\n".join([page.extract_text() for page in pdf.pages if page.extract_text()])
    return text
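
# Note: pdfplumber's page.extract_text() returns None for pages without a text
# layer (e.g. scanned images), which is why the comprehension above filters
# those pages out; OCR would be needed to recover text from scans.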
# Function to create a LlamaIndex from a list of documents using Sentence-BERT embeddings
def create_llama_index(documents):
    # Split documents into smaller chunks for better retrieval
    chunked_documents = []
    for doc in documents:
        text = doc.text
        chunks = [text[i:i + 500] for i in range(0, len(text), 500)]  # Chunking text into fixed 500-character pieces
        for chunk in chunks:
            chunked_documents.append(Document(text=chunk, metadata=doc.metadata))
    # Create the index using the embedding model configured in Settings
    index = VectorStoreIndex.from_documents(chunked_documents)
    return index
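
# An alternative to the fixed 500-character slicing above: llama_index ships a
# SentenceSplitter node parser that chunks on sentence boundaries, which tends
# to keep retrieved passages coherent. A minimal sketch, assuming the current
# llama_index API:
#
#   from llama_index.core.node_parser import SentenceSplitter
#   splitter = SentenceSplitter(chunk_size=512, chunk_overlap=50)
#   nodes = splitter.get_nodes_from_documents(documents)
#   index = VectorStoreIndex(nodes)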
# Function to query the LlamaIndex
def query_llama_index(question, index):
    retriever = index.as_retriever()
    response = retriever.retrieve(question)
    if response:
        # Access the text content from the first (top-ranked) response node
        return response[0].node.get_text()
    return "No relevant context found."
# Streamlit App Configuration
st.set_page_config(page_title="PDF Q&A Chatbot", layout="wide")
# Sidebar for File Upload
st.sidebar.title("📂 File Upload")
st.sidebar.write("Upload PDFs to process:")
uploaded_files = st.sidebar.file_uploader("Upload PDFs for Q&A", type="pdf", accept_multiple_files=True)
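
# st.file_uploader returns in-memory UploadedFile objects; pdfplumber.open()
# accepts file-like objects, so process_pdf() can consume them directly.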
documents = []
if uploaded_files:
    for uploaded_file in uploaded_files:
        raw_text = process_pdf(uploaded_file)
        document = Document(text=raw_text, metadata={"source": uploaded_file.name})
        documents.append(document)
    st.sidebar.success(f"Processed {len(uploaded_files)} PDFs successfully!")
    # Create the index using Sentence-BERT embeddings
    index = create_llama_index(documents)
    st.sidebar.info("Documents have been indexed and are ready for Q&A!")
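
    # Streamlit reruns this script on every interaction, so the index above is
    # rebuilt each time. A sketch of caching it with st.cache_resource (the
    # build_index helper name is illustrative):
    #
    #   @st.cache_resource
    #   def build_index(_docs):
    #       return create_llama_index(_docs)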
# Main App Content
st.title("🤖 Chatbot with PDF Knowledge")
st.write("Ask questions based on the uploaded PDFs in the sidebar.")
if uploaded_files and documents:
    question = st.text_input("Type your question here:")
    if question:
        with st.spinner("Fetching the answer..."):
            answer = query_llama_index(question, index)
        if answer:
            st.write(f"**Answer:** {answer}")
        else:
            st.warning("No relevant context found for your query.")
else:
    st.info("Please upload PDFs in the sidebar to get started.")