"""Markdown link cleanup, its test, and the errata parsing entry point.

Each section below is labelled with the file it belongs to:
``tests/test_cleaner.py``, ``textprep/cleaner.py`` and ``textprep/errata.py``.
"""

import re

# --- tests/test_cleaner.py: tests for cleaner functions ---------------------


def test_clean_links():
    """Inline and reference links are rewritten as plain text plus URLs."""
    content = """
Here is a [link](https://example.com) and here is another [reference link][1].

[1]: https://example.com/reference
"""
    result = clean_links(content)
    assert result == (
        "Here is a link (https://example.com) and here is another reference link. (https://example.com/reference)"
    )


# --- textprep/cleaner.py: clean up various parts of documents ---------------

# Inline link: [link text](URL). Captures the link text and the http(s) URL.
_INLINE_LINK_RE = re.compile(r"\[([^\]]+)\]\((https?://[^)]+)\)")

# Reference link usage: [link text][ref], or the collapsed form [link text][].
_REFERENCE_LINK_RE = re.compile(r"\[([^\]]+)\]\[[^\]]*\]")

# Reference definition: [ref]: URL. It may sit on the very first line of the
# content (no preceding newline) or on any later line.
_REFERENCE_DEFINITION_RE = re.compile(r"(?:^|\n)\s*\[[^\]]+\]:\s*(https?://\S+)")


def clean_links(content: str) -> str:
    """Rewrite markdown links in *content* as plain text.

    - Inline links ``[text](URL)`` become ``text (URL)``.
    - Reference links ``[text][ref]`` and collapsed ``[text][]`` become
      ``text``.
    - Reference definitions ``[ref]: URL`` are not dropped: each one becomes
      `` (URL)`` appended to the text that precedes it, so the URL stays in
      the output at the position where the definition was.

    Only ``http://`` and ``https://`` URLs are recognised.

    Args:
        content: Markdown text to clean.

    Returns:
        The rewritten text with leading and trailing whitespace stripped.
    """
    content = _INLINE_LINK_RE.sub(r"\1 (\2)", content)
    content = _REFERENCE_LINK_RE.sub(r"\1", content)
    # The match swallows the newline(s) and indentation in front of the
    # definition, which is what joins the URL onto the preceding line.
    content = _REFERENCE_DEFINITION_RE.sub(r" (\1)", content)
    return content.strip()


# --- textprep/errata.py: parse errata documents -----------------------------

# Section titles to exclude; not referenced by the code visible here.
EXCLUDED_SECTIONS = ["Updated Packages"]

if __name__ == "__main__":
    # Imported here because only the script entry point uses it.
    from textprep.splitter import parse_markdown

    with open("tests/example_docs/RHSA-2022_0886.md", encoding="utf-8") as f:
        errata_doc = parse_markdown(f.read())

    # Dump every parsed section: its metadata, then its text, then a blank line.
    for section in errata_doc["content"]:
        print(section.metadata)
        print(section.page_content)
        print()