From 86294814cd887819952ed97a521af6c4d5f00b9f Mon Sep 17 00:00:00 2001
From: daavoo
Date: Fri, 15 Nov 2024 20:47:14 +0100
Subject: [PATCH] Update demo

---
Review notes (text in this section is not part of the commit message):
* load_pdf: the reader was created inside `with open(...)` but the pages
  were extracted after that block had closed the file, so extraction for
  path inputs ran against a closed file and fell into the `except` branch.
  The path or stream is now handed to PdfReader directly.
* demo/app.py: an unexpected MIME type or a failed PDF extraction no longer
  crashes on an unbound/None `raw_text`; an error message is shown instead.
* Line breaks were restored from a whitespace-collapsed copy. Indentation of
  the JSON/TOML context lines is a best guess (`git apply --ignore-whitespace`
  may be needed), and the `index` blob ids are those of the original commit.

 .github/.devcontainer.json                    |  2 +-
 demo/app.py                                   | 29 ++++++++++++++++++-
 pyproject.toml                                |  4 +++
 .../preprocessing/data_loaders.py             | 21 ++++++++++----
 4 files changed, 49 insertions(+), 7 deletions(-)

diff --git a/.github/.devcontainer.json b/.github/.devcontainer.json
index 1248f42..b467d3b 100644
--- a/.github/.devcontainer.json
+++ b/.github/.devcontainer.json
@@ -7,5 +7,5 @@
     },
     "packages": ["libgl1-mesa-dev"]
   },
-  "postCreateCommand": "pip install -e '.[tests]'"
+  "postCreateCommand": "pip install -e '.[demo]'"
 }
diff --git a/demo/app.py b/demo/app.py
index 79cd6e2..96e5102 100644
--- a/demo/app.py
+++ b/demo/app.py
@@ -1,3 +1,30 @@
 import streamlit as st
 
-st.title("Blueprint Demo")
+from opennotebookllm.preprocessing.data_loaders import load_pdf
+from opennotebookllm.preprocessing.data_cleaners import clean_html, clean_pdf
+
+
+uploaded_file = st.file_uploader("Choose a file", type=["pdf", "html"])
+
+if uploaded_file is not None:
+    # Stays None when the MIME type is unexpected or extraction fails.
+    raw_text = None
+    if uploaded_file.type == "text/html":
+        raw_text = uploaded_file.getvalue().decode("utf-8")
+        clean_text = clean_html(raw_text)
+    elif uploaded_file.type == "application/pdf":
+        # load_pdf logs the failure and returns None on unreadable PDFs.
+        raw_text = load_pdf(uploaded_file)
+        if raw_text is not None:
+            clean_text = clean_pdf(raw_text)
+
+    if raw_text is None:
+        st.error("Could not extract text from the uploaded file.")
+    else:
+        col1, col2 = st.columns(2)
+        with col1:
+            st.title("Raw Text")
+            st.write(raw_text[:200])
+        with col2:
+            st.title("Cleaned Text")
+            st.write(clean_text[:200])
diff --git a/pyproject.toml b/pyproject.toml
index 3a4dd18..1f97d12 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -26,6 +26,10 @@ tests = [
   "pytest-sugar>=0.9.6",
 ]
 
+demo = [
+  "streamlit"
+]
+
 [project.urls]
 Documentation = "https://mozilla-ai.github.io/OpenNotebookLLM/"
 Issues = "https://github.com/mozilla-ai/OpenNotebookLLM/issues"
diff --git a/src/opennotebookllm/preprocessing/data_loaders.py b/src/opennotebookllm/preprocessing/data_loaders.py
index d2575c0..5a05170 100644
--- a/src/opennotebookllm/preprocessing/data_loaders.py
+++ b/src/opennotebookllm/preprocessing/data_loaders.py
@@ -1,14 +1,25 @@
+from io import BytesIO
+
 import PyPDF2
 import PyPDF2.errors
-
 from loguru import logger
 
 
-def load_pdf(pdf_file: str) -> str | None:
+def load_pdf(pdf_file: str | BytesIO) -> str | None:
+    """Extract the text of every page of a PDF.
+
+    Args:
+        pdf_file: Path to a PDF file, or a binary stream holding one.
+
+    Returns:
+        The page texts joined with newlines, or None when the PDF cannot
+        be read (the exception is logged).
+    """
     try:
-        with open(pdf_file, "rb") as file:
-            pdf_reader = PyPDF2.PdfReader(file)
-            return "\n".join(page.extract_text() for page in pdf_reader.pages)
+        # PdfReader takes a path or a stream. Do not open the path here and
+        # close it before extract_text() runs: pages are read on demand.
+        pdf_reader = PyPDF2.PdfReader(pdf_file)
+        return "\n".join(page.extract_text() for page in pdf_reader.pages)
     except Exception as e:
         logger.exception(e)
         return None