Add API Reference

mozilla-ai · Nov 19, 2024 · 2a8f005 · 2a8f005
1 parent acd50a9
commit 2a8f005
Show file tree

Hide file tree

Showing 5 changed files with 71 additions and 9 deletions.
diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml
@@ -27,7 +27,7 @@ jobs:
           git config user.email 'github-actions[bot]@users.noreply.github.com'
 
       - name: Install requirements
-        run: pip install -r docs/requirements.txt
+        run: pip install '.[docs]'
 
       - name: Publish docs
 

diff --git a/docs/api.md b/docs/api.md
@@ -0,0 +1,3 @@
+# API Reference
+
+::: opennotebookllm.preprocessing.data_cleaners
diff --git a/docs/requirements.txt b/docs/requirements.txt
diff --git a/mkdocs.yml b/mkdocs.yml
@@ -2,8 +2,16 @@ site_name: Blueprint Docs
 
 nav:
   - Home: index.md
+  - API Reference: api.md
 
 theme:
   name: material
   palette:
     primary: deep orange
+
+plugins:
+- mkdocstrings:
+    handlers:
+      python:
+        options:
+          show_root_heading: true
diff --git a/src/opennotebookllm/preprocessing/data_cleaners.py b/src/opennotebookllm/preprocessing/data_cleaners.py
@@ -3,28 +3,82 @@
 
 
 def clean_with_regex(text: str) -> str:
-    text = re.sub(r"\s+", " ", text).strip()
+    """
+    Clean text using regular expressions.
+
+    This function removes:
+        - URLs
+        - emails
+        - special characters
+        - extra spaces
+
+    Examples:
+        >>> clean_with_regex("\xa0Hello,   world! http://example.com")
+        'Hello, world!'
+
+    Args:
+        text (str): The text to clean.
+
+    Returns:
+        str: The cleaned text.
+    """
     text = re.sub(
         r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+",
         "",
         text,
     )
     text = re.sub(r"[\w\.-]+@[\w\.-]+\.[\w]+", "", text)
     text = re.sub(r'[^a-zA-Z0-9\s.,!?;:"\']', "", text)
+    text = re.sub(r"\s+", " ", text).strip()
     return text
 
 
 def clean_html(text: str) -> str:
+    """Clean HTML text.
+
+    This function removes:
+        - `<script>` elements
+        - `<style>` elements
+        - `<link>` elements
+        - `<meta>` elements
+
+    In addition, it calls [clean_with_regex][opennotebookllm.preprocessing.data_cleaners.clean_with_regex].
+
+    Examples:
+        >>> clean_html("<html><body><p>Hello,  world!  </p></body></html>")
+        'Hello, world!'
+
+    Args:
+        text (str): The HTML text to clean.
+
+    Returns:
+        str: The cleaned text.
+    """
     soup = BeautifulSoup(text, "html.parser")
     for tag in soup(["script", "style", "link", "meta"]):
         tag.decompose()
     text = soup.get_text()
     return clean_with_regex(text)
 
 
-def clean_markdown_image(text: str) -> str:
-    return re.sub(r'!\[.*?\]\(.*?(".*?")?\)', "", text)
+def clean_markdown(text: str) -> str:
+    """Clean Markdown text.
 
+    This function removes:
+        - markdown images
 
-def clean_markdown(text: str) -> str:
-    return clean_with_regex(clean_markdown_image(text))
+    In addition, it calls [clean_with_regex][opennotebookllm.preprocessing.data_cleaners.clean_with_regex].
+
+    Examples:
+        >>> clean_markdown('# Title   with image ![alt text](image.jpg "Image Title")')
+        'Title with image'
+
+    Args:
+        text (str): The Markdown text to clean.
+
+    Returns:
+        str: The cleaned text.
+    """
+    text = re.sub(r'!\[.*?\]\(.*?(".*?")?\)', "", text)
+
+    return clean_with_regex(text)
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		# API Reference

		::: opennotebookllm.preprocessing.data_cleaners