Skip to content

Commit

Permalink
Update demo
Browse files Browse the repository at this point in the history
  • Loading branch information
daavoo committed Nov 15, 2024
1 parent d2b276c commit 8629481
Show file tree
Hide file tree
Showing 4 changed files with 34 additions and 7 deletions.
2 changes: 1 addition & 1 deletion .github/.devcontainer.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,5 @@
},
"packages": ["libgl1-mesa-dev"]
},
"postCreateCommand": "pip install -e '.[tests]'"
"postCreateCommand": "pip install -e '.[demo]'"
}
21 changes: 20 additions & 1 deletion demo/app.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,22 @@
# Streamlit demo: upload a PDF or HTML file and show the beginning of the raw
# extracted text next to the beginning of its cleaned version.
import streamlit as st

from opennotebookllm.preprocessing.data_cleaners import clean_html, clean_pdf
from opennotebookllm.preprocessing.data_loaders import load_pdf

# Number of leading characters of each text shown in the preview columns.
PREVIEW_CHARS = 200

st.title("Blueprint Demo")

uploaded_file = st.file_uploader("Choose a file", type=["pdf", "html"])

if uploaded_file is not None:
    # Stays None unless one of the branches below produces text, so the
    # preview is never rendered from unbound or missing values.
    raw_text = None
    clean_text = None

    if uploaded_file.type == "text/html":
        raw_text = uploaded_file.getvalue().decode("utf-8")
        clean_text = clean_html(raw_text)
    elif uploaded_file.type == "application/pdf":
        # load_pdf returns None when the PDF cannot be read.
        raw_text = load_pdf(uploaded_file)
        if raw_text is None:
            st.error("Could not extract text from the uploaded PDF.")
        else:
            clean_text = clean_pdf(raw_text)
    else:
        # The reported type did not match either supported MIME type
        # (previously this path crashed with a NameError further down).
        st.error(f"Unsupported file type: {uploaded_file.type!r}")

    if raw_text is not None and clean_text is not None:
        col1, col2 = st.columns(2)
        with col1:
            st.title("Raw Text")
            st.write(raw_text[:PREVIEW_CHARS])
        with col2:
            st.title("Cleaned Text")
            st.write(clean_text[:PREVIEW_CHARS])
4 changes: 4 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,10 @@ tests = [
"pytest-sugar>=0.9.6",
]

demo = [
"streamlit"
]

[project.urls]
Documentation = "https://mozilla-ai.github.io/OpenNotebookLLM/"
Issues = "https://github.com/mozilla-ai/OpenNotebookLLM/issues"
Expand Down
14 changes: 9 additions & 5 deletions src/opennotebookllm/preprocessing/data_loaders.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,18 @@
from io import BytesIO

import PyPDF2
import PyPDF2.errors

from loguru import logger


def _extract_pdf_text(stream) -> str:
    """Return the text of every page read from *stream*, joined by newlines."""
    pdf_reader = PyPDF2.PdfReader(stream)
    return "\n".join(page.extract_text() for page in pdf_reader.pages)


def load_pdf(pdf_file: str | BytesIO) -> str | None:
    """Extract the text of a PDF given as a file path or a binary stream.

    Args:
        pdf_file: Path to a PDF file on disk, or an already-open binary
            file-like object (e.g. an in-memory ``BytesIO``) holding the PDF.

    Returns:
        The extracted text of all pages joined with newlines, or ``None`` if
        anything goes wrong while opening or parsing the file (the exception
        is logged, not raised).
    """
    try:
        if isinstance(pdf_file, str):
            with open(pdf_file, "rb") as file:
                # Extraction must finish while the handle is still open:
                # the reader pulls page data from the stream on demand, so
                # iterating the pages after the ``with`` block exits would
                # hit a closed file.
                return _extract_pdf_text(file)
        return _extract_pdf_text(pdf_file)
    except Exception as e:
        # Best-effort loader: callers get None instead of an exception.
        logger.exception(e)
        return None
Expand Down

0 comments on commit 8629481

Please sign in to comment.