Add link cleaner

rhel-lightspeed · Oct 18, 2024 · e22f9c8 · e22f9c8
1 parent 20f201f
commit e22f9c8
Show file tree

Hide file tree

Showing 3 changed files with 47 additions and 0 deletions.
diff --git a/tests/test_cleaner.py b/tests/test_cleaner.py
@@ -0,0 +1,16 @@
+"""Tests for cleaner functions."""
+
+from textprep import cleaner
+
+
+def test_clean_links():
+    """Inline and reference links are flattened to text followed by their URLs."""
+    markdown = """
+Here is a [link](https://example.com) and here is another [reference link][1].
+
+[1]: https://example.com/reference
+"""
+    expected = (
+        "Here is a link (https://example.com) and here is another reference link. (https://example.com/reference)"
+    )
+    assert cleaner.clean_links(markdown) == expected
diff --git a/textprep/cleaner.py b/textprep/cleaner.py
@@ -0,0 +1,17 @@
+"""Clean up various parts of documents."""
+
+import re
+
+
+def clean_links(content: str) -> str:
+    """Rewrite the markdown links in content as plain text and strip the result.
+
+    "[text](URL)" becomes "text (URL)", "[text][ref]" becomes "text", and each
+    "[ref]: URL" definition joins the line before it as " (URL)"; http(s) only.
+    """
+    content = re.sub(r"\[([^\]]+)\]\((https?://[^)]+)\)", r"\1 (\2)", content)
+    # Also accept the collapsed form "[text][]", whose label is empty.
+    content = re.sub(r"\[([^\]]+)\]\[[^\]]*\]", r"\1", content)
+    # \A lets a definition on the very first line match; it has no "\n" before it.
+    content = re.sub(r"(?:\A|\n)\s*\[[^\]]+\]:\s*(https?://\S+)", r" (\1)", content)
+    return content.strip()
diff --git a/textprep/errata.py b/textprep/errata.py
@@ -0,0 +1,14 @@
+"""Parse errata documents."""
+
+from textprep.splitter import parse_markdown
+
+EXCLUDED_SECTIONS = ["Updated Packages"]  # not referenced by the demo below
+
+if __name__ == "__main__":
+    # Demo run on a sample erratum; explicit UTF-8 avoids locale-dependent decoding.
+    with open("tests/example_docs/RHSA-2022_0886.md", encoding="utf-8") as f:
+        errata_doc = parse_markdown(f.read())
+    for section in errata_doc["content"]:
+        print(section.metadata)
+        print(section.page_content)
+        print()