diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml index 8c78bf0..198ba9a 100644 --- a/.github/workflows/docs.yaml +++ b/.github/workflows/docs.yaml @@ -27,7 +27,7 @@ jobs: git config user.email 'github-actions[bot]@users.noreply.github.com' - name: Install requirements - run: pip install -r docs/requirements.txt + run: pip install '.[docs]' - name: Publish docs diff --git a/docs/api.md b/docs/api.md new file mode 100644 index 0000000..c304571 --- /dev/null +++ b/docs/api.md @@ -0,0 +1,3 @@ +# API Reference + +::: opennotebookllm.preprocessing.data_cleaners diff --git a/docs/requirements.txt b/docs/requirements.txt deleted file mode 100644 index c584e19..0000000 --- a/docs/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -mkdocs -mkdocs-material -mkdocstrings[python] diff --git a/mkdocs.yml b/mkdocs.yml index 3855709..cbf0a8d 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -2,8 +2,16 @@ site_name: Blueprint Docs nav: - Home: index.md + - API Reference: api.md theme: name: material palette: primary: deep orange + +plugins: +- mkdocstrings: + handlers: + python: + options: + show_root_heading: true diff --git a/src/opennotebookllm/preprocessing/data_cleaners.py b/src/opennotebookllm/preprocessing/data_cleaners.py index bc4c640..2b35b1f 100644 --- a/src/opennotebookllm/preprocessing/data_cleaners.py +++ b/src/opennotebookllm/preprocessing/data_cleaners.py @@ -3,7 +3,25 @@ def clean_with_regex(text: str) -> str: - text = re.sub(r"\s+", " ", text).strip() + """ + Clean text using regular expressions. + + This function removes: + - URLs + - emails + - special characters + - extra spaces + + Examples: + >>> clean_with_regex("\xa0Hello, world! http://example.com") + "Hello, world!" + + Args: + text (str): The text to clean. + + Returns: + str: The cleaned text. 
+ """ text = re.sub( r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+", "", @@ -11,10 +29,31 @@ def clean_with_regex(text: str) -> str: ) text = re.sub(r"[\w\.-]+@[\w\.-]+\.[\w]+", "", text) text = re.sub(r'[^a-zA-Z0-9\s.,!?;:"\']', "", text) + text = re.sub(r"\s+", " ", text).strip() return text def clean_html(text: str) -> str: + """Clean HTML text. + + This function removes: + - scripts + - styles + - links + - meta tags + + In addition, it calls [clean_with_regex][opennotebookllm.preprocessing.data_cleaners.clean_with_regex]. + + Examples: + >>> clean_html("
Hello, world!
") + "Hello, world!" + + Args: + text (str): The HTML text to clean. + + Returns: + str: The cleaned text. + """ soup = BeautifulSoup(text, "html.parser") for tag in soup(["script", "style", "link", "meta"]): tag.decompose() @@ -22,9 +61,24 @@ def clean_html(text: str) -> str: return clean_with_regex(text) -def clean_markdown_image(text: str) -> str: - return re.sub(r'!\[.*?\]\(.*?(".*?")?\)', "", text) +def clean_markdown(text: str) -> str: + """Clean Markdown text. + This function removes: + - markdown images -def clean_markdown(text: str) -> str: - return clean_with_regex(clean_markdown_image(text)) + In addition, it calls [clean_with_regex][opennotebookllm.preprocessing.data_cleaners.clean_with_regex]. + + Examples: + >>> clean_markdown('# Title with image ![alt text](image.jpg "Image Title")') + "Title with image" + + Args: + text (str): The Markdown text to clean. + + Returns: + str: The cleaned text. + """ + text = re.sub(r'!\[.*?\]\(.*?(".*?")?\)', "", text) + + return clean_with_regex(text)