Skip to content
This repository has been archived by the owner on Nov 11, 2024. It is now read-only.

Commit

Permalink
Add link cleaner
Browse files: browse the repository at this point in the history
  • Loading branch information
major committed Oct 18, 2024
1 parent 20f201f commit e22f9c8
Show file tree
Hide file tree
Showing 3 changed files with 47 additions and 0 deletions.
16 changes: 16 additions & 0 deletions tests/test_cleaner.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
"""Tests for cleaner functions."""

from textprep import cleaner


def test_clean_links():
    """Check that inline and reference links are flattened to plain text."""
    markdown = """
Here is a [link](https://example.com) and here is another [reference link][1].
[1]: https://example.com/reference
"""
    # The reference definition is folded onto the end of the preceding line,
    # and surrounding whitespace is stripped.
    expected = (
        "Here is a link (https://example.com) and here is another reference link. (https://example.com/reference)"
    )
    assert cleaner.clean_links(markdown) == expected
17 changes: 17 additions & 0 deletions textprep/cleaner.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
"""Clean up various parts of documents."""

import re


def clean_links(content: str) -> str:
    """Clean up links in the markdown content.

    Three rewrites are applied in order:

    1. Inline links ``[text](http://url)`` become ``text (http://url)``.
    2. Reference links ``[text][ref]``, including the collapsed form
       ``[text][]``, are reduced to ``text``.
    3. Reference definitions ``[ref]: http://url`` are replaced by
       `` (http://url)``, which joins them onto the end of the preceding
       line. A definition on the very first line is handled as well.

    Only ``http://`` and ``https://`` URLs are recognised.

    Args:
        content: Markdown text to clean.

    Returns:
        The rewritten text with leading and trailing whitespace stripped.
    """
    # Inline link pattern: [link text](URL)
    content = re.sub(r"\[([^\]]+)\]\((https?://[^)]+)\)", r"\1 (\2)", content)

    # Reference link pattern: [link text][ref], or collapsed [link text][]
    content = re.sub(r"\[([^\]]+)\]\[[^\]]*\]", r"\1", content)

    # Reference-style definitions: [ref]: URL.  Anchoring on either the start
    # of the string or a newline keeps a definition on the first line from
    # being left behind; the leading whitespace (including the newline) is
    # consumed so the URL lands on the previous line.
    content = re.sub(
        r"(?:\A|\n)\s*\[[^\]]+\]:\s*(https?://\S+)", r" (\1)", content
    )

    return content.strip()
14 changes: 14 additions & 0 deletions textprep/errata.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
"""Parse errata documents."""

from textprep.splitter import parse_markdown

# NOTE(review): not referenced by the code visible here; presumably the titles
# of sections to skip when processing an erratum -- confirm against callers.
EXCLUDED_SECTIONS = ["Updated Packages"]


def main() -> None:
    """Parse the example erratum document and print each parsed section."""
    # Explicit encoding so decoding does not depend on the platform locale.
    with open("tests/example_docs/RHSA-2022_0886.md", encoding="utf-8") as f:
        errata_doc = parse_markdown(f.read())

    # Each entry under "content" exposes ``metadata`` and ``page_content``.
    for section in errata_doc["content"]:
        print(section.metadata)
        print(section.page_content)
        print()


if __name__ == "__main__":
    main()

0 comments on commit e22f9c8

Please sign in to comment.