Add errata parsing

rhel-lightspeed · Oct 18, 2024 · b44b8e9 · b44b8e9
1 parent c70ae73
commit b44b8e9
Show file tree

Hide file tree

Showing 5 changed files with 321 additions and 17 deletions.
diff --git a/pyproject.toml b/pyproject.toml
@@ -92,3 +92,10 @@ preview = true
 
 [tool.ruff.per-file-ignores]
 "tests/*" = ["S101"]
+
+[tool.coverage.report]
+skip_empty = true
+
+[tool.coverage.run]
+branch = true
+source = ["textprep"]
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -2,17 +2,25 @@
 
 import pytest
 
+EXAMPLE_DOCS_DIR = "tests/example_docs"
+EXAMPLE_DOCS = {
+    "errata": "RHSA-2022_0886.md",
+}
+
 
 def load_example_doc(doctype: str):
     """Load an example document."""
-    example_docs = {
-        "errata": "RHSA-2022_0886.md",
-    }
-    with open(f"tests/example_docs/{example_docs[doctype]}") as f:
+    with open(f"{EXAMPLE_DOCS_DIR}/{EXAMPLE_DOCS[doctype]}") as f:
         return f.read()
 
 
 @pytest.fixture
 def errata_doc():
     """Load an example errata document."""
     return load_example_doc("errata")
+
+
+@pytest.fixture
+def errata_doc_path():
+    """Return the path to an example errata document."""
+    return f"{EXAMPLE_DOCS_DIR}/{EXAMPLE_DOCS["errata"]}"
diff --git a/tests/test_errata.py b/tests/test_errata.py
@@ -0,0 +1,172 @@
+"""Tests for parsing errata files."""
+
+import pytest
+from langchain_core.documents import Document
+
+from textprep.errata import (
+    clean_bugzillas,
+    clean_description,
+    clean_solution,
+    get_affected_products,
+    get_section_content,
+    load_errata,
+    parse,
+)
+
+
+def test_load_errata(tmp_path):
+    content = """+++
+title = '''Super important errata right here'''
++++
+
+# Most important heading
+
+First bit of content.
+"""
+    d = tmp_path / "sub"
+    d.mkdir()
+    p = d / "errata.md"
+    p.write_text(content, encoding="utf-8")
+
+    # Load valid markdown.
+    result = load_errata(p)
+    assert result["frontmatter"]["title"] == "Super important errata right here"
+    assert result["content"][0].page_content.strip() == "First bit of content."
+
+    # Load some JSON to throw an exception.
+    p.write_text('{"key": "Wait a minute, this is JSON!}')
+    with pytest.raises(ValueError):
+        result = load_errata(p)
+
+    # Delete the file and try to load it again.
+    p.unlink()
+    with pytest.raises(FileNotFoundError):
+        result = load_errata(p)
+
+
+def test_get_section_content(errata_doc):
+    sections = [
+        Document(metadata={"Header 1": "First"}, page_content="First section content"),
+        Document(metadata={"Header 2": "Second"}, page_content="Second section content"),
+    ]
+
+    assert get_section_content(sections, "First") == "First section content"
+    assert get_section_content(sections, "Second") == "Second section content"
+    assert get_section_content(sections, "Third") == ""
+
+
+def test_clean_bugzillas():
+    content = """
+- [BZ - 2044863](https://bugzilla.redhat.com/bugzilla/show_bug.cgi?id=2044863)
+- [BZ - 2044864](https://bugzilla.redhat.com/bugzilla/show_bug.cgi?id=2044864)
+- [BZ - 2044865](https://bugzilla.redhat.com/bugzilla/show_bug.cgi?id=2044865)
+"""
+    result = clean_bugzillas(content)
+    expected = """This update fixes the these bugs:
+
+- BZ 2044863 found at https://bugzilla.redhat.com/bugzilla/show_bug.cgi?id=2044863
+- BZ 2044864 found at https://bugzilla.redhat.com/bugzilla/show_bug.cgi?id=2044864
+- BZ 2044865 found at https://bugzilla.redhat.com/bugzilla/show_bug.cgi?id=2044865"""
+    assert result == expected
+
+
+def test_clean_description():
+    content = """This is an important errata!
+
+Security Fix(es):
+
+- A very special CVE
+
+For more details about the security issue(s), blah blah blah."""
+    result = clean_description(content)
+    print(result)
+    expected = (
+        "This is an important errata!\n\nThis update fixes the following security issues:\n\n- A very special CVE"
+    )
+    assert result == expected
+
+
+def test_clean_solution():
+    # The newlines should be removed and the link should be cleaned on this boilerplate
+    # text.
+    content = """For details on how to apply this update, which includes the changes described in this advisory, refer to:
+
+<https://access.redhat.com/articles/11258>"""
+    result = clean_solution(content)
+    assert result.endswith("refer to: https://access.redhat.com/articles/11258")
+
+    # This should be left untouched since it's unique content.
+    content = "This is unique content about a solution you should know!"
+    assert clean_solution(content) == content
+
+
+def test_get_affected_products():
+    product_keys = [
+        "Red Hat CodeReady Linux Builder for ARM 64 - Extended Update Support",
+        "Red Hat CodeReady Linux Builder for ARM 64",
+    ]
+    product_detail = [
+        "Red Hat Enterprise Linux|Red Hat CodeReady Linux Builder for ARM 64|8|aarch64",
+        "Red Hat Enterprise Linux|Red Hat CodeReady Linux Builder for ARM 64 - Extended Update Support|8.8|aarch64",
+        "Red Hat Enterprise Linux|Red Hat CodeReady Linux Builder for ARM 64 - Extended Update Support|8.6|aarch64",
+    ]
+
+    result = get_affected_products(product_keys, product_detail)
+
+    # These two matching ones should be joined.
+    expected = "- Red Hat CodeReady Linux Builder for ARM 64 - Extended Update Support versions 8.6 and 8.8"
+    assert expected in result
+
+    # This one should be separate.
+    expected = "- Red Hat CodeReady Linux Builder for ARM 64 version 8"
+    assert expected in result
+
+
+def test_parse_functional(errata_doc_path):
+    result = parse(errata_doc_path)
+    expected = """RHSA-2022:0886 - Moderate: virt:rhel and virt-devel:rhel security update
+
+Published: 2022-03-15T09:10:17Z
+
+Access this document at this URL: https://access.redhat.com/errata/RHSA-2022:0886
+
+An update for the virt:rhel and virt-devel:rhel modules is now available for Red Hat Enterprise Linux 8.
+
+Red Hat Product Security has rated this update as having a security impact of Moderate. A Common Vulnerability Scoring System (CVSS) base score, which gives a detailed severity rating, is available for each vulnerability from the CVE link(s) in the References section.
+
+Kernel-based Virtual Machine (KVM) offers a full virtualization solution for Linux on numerous hardware platforms. The virt:rhel module contains packages which provide user-space components used to run virtual machines using KVM. The packages also provide APIs for managing and interacting with the virtualized
+systems.
+
+This update fixes the following security issues:
+
+- QEMU: virtiofsd: potential privilege escalation via CVE-2018-13405 (CVE-2022-0358)
+
+For details on how to apply this update, which includes the changes described in this advisory, refer to: https://access.redhat.com/articles/11258
+
+This errata affects the following products:
+
+- Red Hat CodeReady Linux Builder for ARM 64 - Extended Update Support versions 8.6 and 8.8
+- Red Hat CodeReady Linux Builder for ARM 64 version 8
+- Red Hat CodeReady Linux Builder for IBM z Systems - Extended Update Support versions 8.6 and 8.8
+- Red Hat CodeReady Linux Builder for IBM z Systems version 8
+- Red Hat CodeReady Linux Builder for Power, little endian - Extended Update Support versions 8.6 and 8.8
+- Red Hat CodeReady Linux Builder for Power, little endian version 8
+- Red Hat CodeReady Linux Builder for x86_64 - Extended Update Support versions 8.6 and 8.8
+- Red Hat CodeReady Linux Builder for x86_64 version 8
+- Red Hat Enterprise Linux Server - AUS version 8.6
+- Red Hat Enterprise Linux Server - TUS versions 8.6 and 8.8
+- Red Hat Enterprise Linux Server for Power LE - Update Services for SAP Solutions versions 8.6 and 8.8
+- Red Hat Enterprise Linux for ARM 64 - Extended Update Support versions 8.6 and 8.8
+- Red Hat Enterprise Linux for ARM 64 version 8
+- Red Hat Enterprise Linux for IBM z Systems - Extended Update Support versions 8.6 and 8.8
+- Red Hat Enterprise Linux for IBM z Systems version 8
+- Red Hat Enterprise Linux for Power, little endian - Extended Update Support versions 8.6 and 8.8
+- Red Hat Enterprise Linux for Power, little endian version 8
+- Red Hat Enterprise Linux for x86_64 - Extended Update Support versions 8.6 and 8.8
+- Red Hat Enterprise Linux for x86_64 - Update Services for SAP Solutions versions 8.6 and 8.8
+- Red Hat Enterprise Linux for x86_64 version 8
+
+This update fixes the these bugs:
+
+- BZ 2044863 found at https://bugzilla.redhat.com/bugzilla/show_bug.cgi?id=2044863"""
+    assert result == expected
diff --git a/textprep/cleaner.py b/textprep/cleaner.py
@@ -14,9 +14,7 @@ def clean_links(content: str) -> str:
     # Remove reference-style definitions: [ref]: URL
     content = re.sub(r"\n\s*\[[^\]]+\]:\s*(http[s]?:\/\/[^\s]+)", r" (\1)", content)
 
-    return content.strip()
-
+    # Use regex to find URLs enclosed in angle brackets and remove the brackets
+    content = re.sub(r"<(https?://[^>]+)>", r"\1", content)
 
-def remove_sections(sections: list, excluded_sections: list) -> list:
-    """Remove sections from markdown documents."""
-    return [x for x in sections if next(iter(x.metadata.values())) not in excluded_sections]
+    return content.strip()
diff --git a/textprep/errata.py b/textprep/errata.py
@@ -1,14 +1,133 @@
 """Parse errata documents."""
 
+import os
+import re
+
+from textprep.cleaner import clean_links
 from textprep.splitter import parse_markdown
 
-EXCLUDED_SECTIONS = ["Updated Packages"]
 
-if __name__ == "__main__":
-    with open("tests/example_docs/RHSA-2022_0886.md") as f:
-        errata_doc = parse_markdown(f.read())
+def load_errata(path: str) -> dict:
+    """Load an errata document from a file."""
+    if not os.path.isfile(path):
+        raise FileNotFoundError()
+
+    with open(path) as f:
+        raw_text = f.read()
+
+    try:
+        parsed = parse_markdown(raw_text)
+    except ValueError:
+        print(f"Error parsing file at {path}")
+        raise
+
+    return parsed
+
+
+def get_section_content(sections: list, section_name: str) -> str:
+    """Get the content of a section by its name."""
+    for section in sections:
+        if next(iter(section.metadata.values())) == section_name:
+            return str(section.page_content.strip())
+
+    return ""
+
+
+def clean_bugzillas(section: str) -> str:
+    """Clean up bugzilla references in the errata."""
+    pattern = r"\[([^\]]+)\]\((http[s]?:\/\/[^\)]+)\)"
+    matches = re.findall(pattern, section)
+
+    bugzillas = []
+    for match in matches:
+        bz_name = match[0].replace(" - ", " ")
+        bz_url = match[1]
+        bugzillas.append(f"- {bz_name} found at {bz_url}")
+
+    return "This update fixes the these bugs:\n\n" + "\n".join(bugzillas)
+
+
+def clean_description(section: str) -> str:
+    """Clean up an errata description."""
+    # Be specific about what we're fixing.
+    section = section.replace("Security Fix(es):", "This update fixes the following security issues:")
+
+    # Remove the boilerplate line about checking references.
+    section = "\n".join([
+        x for x in section.split("\n") if not x.startswith("For more details about the security issue")
+    ])
+
+    return section.strip()
+
+
+def clean_solution(section: str) -> str:
+    """Clean up an errata solution.
+
+    Some errata have specific instructions included in the errata itself while others
+    just link to a solution article.
+
+    TODO: It would be nice to pull in the solution article here instead of a link.
+    """
+    if section.startswith("For details on how to apply this update"):
+        section = section.replace("\n\n", " ")
+
+    return clean_links(section).strip()
+
+
+def get_affected_products(product_keys: list, product_detail: list) -> str:
+    """Get the affected products from the frontmatter.
+
+    This is tricky because the portal_product_filter contains a pipe delimited table,
+    but the names of products are fully repeated for every version and architecture.
+    That's a *lot* of redundant information to hand off to the embedding model and LLM.
+
+    The portal_product_names list contains the keys that go along with the
+    portal_product_filter table, so we can use them to generate a more concise list.
+    """
+    product_detail = [x.split("|") for x in product_detail]
+
+    product_pieces = []
+    for product_key in product_keys:
+        # Get all the affected versions from the affected products detail list that
+        # match the current product key.
+        versions = sorted([x[2] for x in product_detail if x[1] == product_key])
+
+        # Add the version(s) to the end of the product key.
+        if len(versions) > 1:
+            product_pieces.append(f"- {product_key} versions {" and ".join(versions)}")
+        else:
+            product_pieces.append(f"- {product_key} version {versions[0]}")
+
+    product_text = "\n".join(sorted(product_pieces))
+
+    return "This errata affects the following products:\n\n" + product_text
+
+
+def parse(path: str) -> str:
+    """Parse an errata document into a clean format."""
+    errata_doc = load_errata(path)
+
+    metadata = errata_doc["frontmatter"]
+    sections = errata_doc["content"]
+
+    description = get_section_content(sections, "Description")
+    solution = get_section_content(sections, "Solution")
+    bugzillas = get_section_content(sections, "Fixes")
+    products = get_affected_products(
+        metadata["extra"]["portal_product_names"], metadata["extra"]["portal_product_filter"]
+    )
+
+    clean_doc_pieces = [
+        metadata["extra"]["original_title"],
+        f"Published: {metadata["extra"]["issued"]}",
+        f"Access this document at this URL: https://access.redhat.com{metadata["path"]}",
+        metadata["extra"]["portal_summary"],
+        clean_description(description),
+        clean_solution(solution),
+        products,
+        clean_bugzillas(bugzillas),
+    ]
+
+    clean_doc = "\n\n".join(clean_doc_pieces)
 
-    for section in errata_doc["content"]:
-        print(section.metadata)
-        print(section.page_content)
-        print()
+    return clean_doc