Skip to content

Commit

Permalink
Add API Reference
Browse files Browse the repository at this point in the history
  • Loading branch information
daavoo committed Nov 19, 2024
1 parent acd50a9 commit 2a8f005
Show file tree
Hide file tree
Showing 5 changed files with 71 additions and 9 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/docs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ jobs:
git config user.email 'github-actions[bot]@users.noreply.github.com'
- name: Install requirements
run: pip install -r docs/requirements.txt
run: pip install '.[docs]'

- name: Publish docs

Expand Down
3 changes: 3 additions & 0 deletions docs/api.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# API Reference

::: opennotebookllm.preprocessing.data_cleaners
3 changes: 0 additions & 3 deletions docs/requirements.txt

This file was deleted.

8 changes: 8 additions & 0 deletions mkdocs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,16 @@ site_name: Blueprint Docs

nav:
- Home: index.md
- API Reference: api.md

theme:
name: material
palette:
primary: deep orange

plugins:
- mkdocstrings:
handlers:
python:
options:
show_root_heading: true
64 changes: 59 additions & 5 deletions src/opennotebookllm/preprocessing/data_cleaners.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,28 +3,82 @@


def clean_with_regex(text: str) -> str:
    r"""Clean text using regular expressions.

    This function removes:
        - URLs (``http://`` and ``https://``)
        - emails
        - special characters (anything other than ASCII letters, digits,
          whitespace and the punctuation ``. , ! ? ; : " '``)
        - extra spaces (whitespace runs are collapsed to a single space and
          leading/trailing whitespace is stripped)

    Examples:
        >>> clean_with_regex("\xa0Hello, world! http://example.com")
        'Hello, world!'

    Args:
        text (str): The text to clean.

    Returns:
        str: The cleaned text.
    """
    # Strip URLs first, while their punctuation is still intact.
    # NOTE: inside the character class, `$-_` is a *range* (ASCII 0x24-0x5F),
    # not three literals. That range is what lets the pattern consume `/`,
    # `:`, `=`, `?` and friends in paths and query strings, so do not "fix" it
    # by escaping the hyphen.
    text = re.sub(
        r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+",
        "",
        text,
    )
    # Emails must go before the special-character pass, which would delete
    # the `@` and leave the two halves of the address behind.
    text = re.sub(r"[\w\.-]+@[\w\.-]+\.[\w]+", "", text)
    # Keep only ASCII alphanumerics, whitespace and basic punctuation.
    text = re.sub(r'[^a-zA-Z0-9\s.,!?;:"\']', "", text)
    # Normalise whitespace last: the removals above leave gaps behind.
    text = re.sub(r"\s+", " ", text).strip()
    return text


def clean_html(text: str) -> str:
    """Clean HTML text.

    This function removes the following elements:
        - scripts (``<script>``)
        - styles (``<style>``)
        - links (``<link>``)
        - meta tags (``<meta>``)

    The remaining markup is reduced to its text content.

    In addition, it calls [clean_with_regex][opennotebookllm.preprocessing.data_cleaners.clean_with_regex].

    Examples:
        >>> clean_html("<html><body><p>Hello, world! </p></body></html>")
        'Hello, world!'

    Args:
        text (str): The HTML text to clean.

    Returns:
        str: The cleaned text.
    """
    soup = BeautifulSoup(text, "html.parser")
    # Calling the soup with a list of names finds every matching tag;
    # decompose() removes each one from the tree.
    for tag in soup(["script", "style", "link", "meta"]):
        tag.decompose()
    text = soup.get_text()
    return clean_with_regex(text)


def clean_markdown_image(text: str) -> str:
    """Remove Markdown image tags from the text.

    Matches ``![alt](target)`` and ``![alt](target "title")`` and deletes the
    whole tag. Text around the tag, including whitespace, is left untouched.

    Examples:
        >>> clean_markdown_image('Before ![alt text](image.jpg "Image Title") after')
        'Before  after'

    Args:
        text (str): The Markdown text to process.

    Returns:
        str: The text without Markdown image tags.
    """
    return re.sub(r'!\[.*?\]\(.*?(".*?")?\)', "", text)


def clean_markdown(text: str) -> str:
    """Clean Markdown text.

    This function removes:
        - markdown images

    In addition, it calls [clean_with_regex][opennotebookllm.preprocessing.data_cleaners.clean_with_regex].

    Examples:
        >>> clean_markdown('# Title with image ![alt text](image.jpg "Image Title")')
        'Title with image'

    Args:
        text (str): The Markdown text to clean.

    Returns:
        str: The cleaned text.
    """
    # Images must be removed before the generic pass: clean_with_regex would
    # strip the `[`, `]`, `(`, `)` delimiters and leave the alt text and
    # target behind as plain words.
    text = clean_markdown_image(text)

    return clean_with_regex(text)

0 comments on commit 2a8f005

Please sign in to comment.