Update demo

mozilla-ai · Nov 15, 2024 · 8629481 · 8629481
1 parent d2b276c
commit 8629481
Show file tree

Hide file tree

Showing 4 changed files with 34 additions and 7 deletions.
diff --git a/.github/.devcontainer.json b/.github/.devcontainer.json
@@ -7,5 +7,5 @@
       },
       "packages": ["libgl1-mesa-dev"]
     },
-    "postCreateCommand": "pip install -e '.[tests]'"
+    "postCreateCommand": "pip install -e '.[demo]'"
 }
diff --git a/demo/app.py b/demo/app.py
@@ -1,3 +1,22 @@
 import streamlit as st
 
-st.title("Blueprint Demo")
+from opennotebookllm.preprocessing.data_loaders import load_pdf
+from opennotebookllm.preprocessing.data_cleaners import clean_html, clean_pdf
+
+
+uploaded_file = st.file_uploader("Choose a file", type=["pdf", "html"])
+# Preview the raw and cleaned text side by side once a file has been uploaded.
+if uploaded_file is not None:
+    raw_text = None  # stays None for an unexpected MIME type
+    if uploaded_file.type == "text/html":
+        raw_text = uploaded_file.getvalue().decode("utf-8")
+        clean_text = clean_html(raw_text)
+    elif uploaded_file.type == "application/pdf":
+        raw_text = load_pdf(uploaded_file)  # None when the PDF cannot be parsed
+        clean_text = clean_pdf(raw_text) if raw_text is not None else None
+    if raw_text is None:
+        st.error("Could not read the uploaded file.")
+        st.stop()  # halt this run so the preview below never sees None
+    for col, name, text in zip(st.columns(2), ("Raw", "Cleaned"), (raw_text, clean_text)):
+        col.title(f"{name} Text")
+        col.write(text[:200])
diff --git a/pyproject.toml b/pyproject.toml
@@ -26,6 +26,10 @@ tests = [
   "pytest-sugar>=0.9.6",
 ]
 
+demo = [
+  "streamlit"
+]
+
 [project.urls]
 Documentation = "https://mozilla-ai.github.io/OpenNotebookLLM/"
 Issues = "https://github.com/mozilla-ai/OpenNotebookLLM/issues"

diff --git a/src/opennotebookllm/preprocessing/data_loaders.py b/src/opennotebookllm/preprocessing/data_loaders.py
@@ -1,14 +1,18 @@
+from io import BytesIO
+
 import PyPDF2
 import PyPDF2.errors
-
 from loguru import logger
 
 
-def load_pdf(pdf_file: str) -> str | None:
+def load_pdf(pdf_file: str | BytesIO) -> str | None:
+    """Return the text of all pages joined by newlines, or None on any error."""
     try:
-        with open(pdf_file, "rb") as file:
-            pdf_reader = PyPDF2.PdfReader(file)
-            return "\n".join(page.extract_text() for page in pdf_reader.pages)
+        if isinstance(pdf_file, str):
+            # Extract inside the `with`: pages are parsed lazily from the handle.
+            with open(pdf_file, "rb") as file:
+                return "\n".join(p.extract_text() for p in PyPDF2.PdfReader(file).pages)
+        return "\n".join(p.extract_text() for p in PyPDF2.PdfReader(pdf_file).pages)
     except Exception as e:
         logger.exception(e)
         return None