Skip to content
This repository has been archived by the owner on Nov 11, 2024. It is now read-only.

Commit

Permalink
Add errata parsing
Browse files Browse the repository at this point in the history
  • Loading branch information
major committed Oct 18, 2024
1 parent c70ae73 commit b44b8e9
Show file tree
Hide file tree
Showing 5 changed files with 321 additions and 17 deletions.
7 changes: 7 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -92,3 +92,10 @@ preview = true

[tool.ruff.per-file-ignores]
"tests/*" = ["S101"]

[tool.coverage.report]
skip_empty = true

[tool.coverage.run]
branch = true
source = ["textprep"]
16 changes: 12 additions & 4 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,25 @@

import pytest

# Relative path of the directory that holds the sample documents used by the tests.
EXAMPLE_DOCS_DIR = "tests/example_docs"
# Maps a document type name to the file name of its sample inside EXAMPLE_DOCS_DIR.
EXAMPLE_DOCS = {
    "errata": "RHSA-2022_0886.md",
}


def load_example_doc(doctype: str) -> str:
    """Load an example document.

    Args:
        doctype: Key into ``EXAMPLE_DOCS`` naming the kind of sample document.

    Returns:
        The full text of the sample document.

    Raises:
        KeyError: If ``doctype`` has no entry in ``EXAMPLE_DOCS``.
        OSError: If the sample file cannot be opened.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(f"{EXAMPLE_DOCS_DIR}/{EXAMPLE_DOCS[doctype]}", encoding="utf-8") as f:
        return f.read()


@pytest.fixture
def errata_doc():
    """Provide the text of the example errata document."""
    doc_text = load_example_doc("errata")
    return doc_text


@pytest.fixture
def errata_doc_path():
    """Return the path of the example errata document.

    The dictionary key is written with single quotes because reusing the
    enclosing quote character inside an f-string replacement field only
    parses on Python 3.12 and newer.
    """
    return f"{EXAMPLE_DOCS_DIR}/{EXAMPLE_DOCS['errata']}"
172 changes: 172 additions & 0 deletions tests/test_errata.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,172 @@
"""Tests for parsing errata files."""

import pytest
from langchain_core.documents import Document

from textprep.errata import (
clean_bugzillas,
clean_description,
clean_solution,
get_affected_products,
get_section_content,
load_errata,
parse,
)


def test_load_errata(tmp_path):
    """Check load_errata on valid markdown, unparseable input, and a missing file."""
    # Document with a +++-delimited frontmatter block followed by one heading
    # and one line of body text.
    content = """+++
title = '''Super important errata right here'''
+++
# Most important heading
First bit of content.
"""
    d = tmp_path / "sub"
    d.mkdir()
    p = d / "errata.md"
    p.write_text(content, encoding="utf-8")

    # Load valid markdown.
    result = load_errata(p)
    assert result["frontmatter"]["title"] == "Super important errata right here"
    assert result["content"][0].page_content.strip() == "First bit of content."

    # Load some JSON to throw an exception.
    p.write_text('{"key": "Wait a minute, this is JSON!}')
    with pytest.raises(ValueError):
        result = load_errata(p)

    # Delete the file and try to load it again.
    p.unlink()
    with pytest.raises(FileNotFoundError):
        result = load_errata(p)


def test_get_section_content():
    """Verify sections are found by heading and a missing heading yields ""."""
    # The sections are built inline, so no document fixture is requested; asking
    # for one would read the example file from disk without ever using it.
    sections = [
        Document(metadata={"Header 1": "First"}, page_content="First section content"),
        Document(metadata={"Header 2": "Second"}, page_content="Second section content"),
    ]

    assert get_section_content(sections, "First") == "First section content"
    assert get_section_content(sections, "Second") == "Second section content"
    assert get_section_content(sections, "Third") == ""


def test_clean_bugzillas():
    """Check that markdown bug links become plain-text bullets under a heading."""
    # Input: a markdown list of [name](url) links, one per bug.
    content = """
- [BZ - 2044863](https://bugzilla.redhat.com/bugzilla/show_bug.cgi?id=2044863)
- [BZ - 2044864](https://bugzilla.redhat.com/bugzilla/show_bug.cgi?id=2044864)
- [BZ - 2044865](https://bugzilla.redhat.com/bugzilla/show_bug.cgi?id=2044865)
"""
    result = clean_bugzillas(content)
    # NOTE(review): clean_bugzillas separates its heading from the bullets with
    # "\n\n", while this literal shows a single newline -- a blank line may be
    # missing here; confirm against the real file.
    expected = """This update fixes the these bugs:
- BZ 2044863 found at https://bugzilla.redhat.com/bugzilla/show_bug.cgi?id=2044863
- BZ 2044864 found at https://bugzilla.redhat.com/bugzilla/show_bug.cgi?id=2044864
- BZ 2044865 found at https://bugzilla.redhat.com/bugzilla/show_bug.cgi?id=2044865"""
    assert result == expected


def test_clean_description():
    """Check the heading rewording and removal of the "For more details" line."""
    # NOTE(review): the expected value below has blank lines ("\n\n") between
    # paragraphs, but this literal shows single newlines -- blank lines may be
    # missing here; confirm against the real file.
    content = """This is an important errata!
Security Fix(es):
- A very special CVE
For more details about the security issue(s), blah blah blah."""
    result = clean_description(content)
    # NOTE(review): looks like a leftover debugging print; the assertion below
    # does not depend on it.
    print(result)
    expected = (
        "This is an important errata!\n\nThis update fixes the following security issues:\n\n- A very special CVE"
    )
    assert result == expected


def test_clean_solution():
    """Check boilerplate solutions are flattened and unique ones are left alone."""
    # The newlines should be removed and the link should be cleaned on this boilerplate
    # text.
    # NOTE(review): clean_solution only collapses blank lines ("\n\n"); this
    # literal shows a single newline before the link, so a blank line may be
    # missing here -- confirm against the real file.
    content = """For details on how to apply this update, which includes the changes described in this advisory, refer to:
<https://access.redhat.com/articles/11258>"""
    result = clean_solution(content)
    assert result.endswith("refer to: https://access.redhat.com/articles/11258")

    # This should be left untouched since it's unique content.
    content = "This is unique content about a solution you should know!"
    assert clean_solution(content) == content


def test_get_affected_products():
    """Check that versions are grouped per product name in the summary text."""
    names = [
        "Red Hat CodeReady Linux Builder for ARM 64 - Extended Update Support",
        "Red Hat CodeReady Linux Builder for ARM 64",
    ]
    # Pipe-delimited rows; the second field is the product name and the third
    # is the version.
    rows = [
        "Red Hat Enterprise Linux|Red Hat CodeReady Linux Builder for ARM 64|8|aarch64",
        "Red Hat Enterprise Linux|Red Hat CodeReady Linux Builder for ARM 64 - Extended Update Support|8.8|aarch64",
        "Red Hat Enterprise Linux|Red Hat CodeReady Linux Builder for ARM 64 - Extended Update Support|8.6|aarch64",
    ]

    summary = get_affected_products(names, rows)

    expected_lines = (
        # The two rows sharing a product name are merged into one line.
        "- Red Hat CodeReady Linux Builder for ARM 64 - Extended Update Support versions 8.6 and 8.8",
        # The product with a single row gets its own line.
        "- Red Hat CodeReady Linux Builder for ARM 64 version 8",
    )
    for line in expected_lines:
        assert line in summary


def test_parse_functional(errata_doc_path):
    """Run parse on the example errata file and compare the whole output."""
    result = parse(errata_doc_path)
    # NOTE(review): parse joins its pieces with "\n\n", but this literal shows
    # single newlines between them -- blank lines may be missing here; confirm
    # against the real file.
    expected = """RHSA-2022:0886 - Moderate: virt:rhel and virt-devel:rhel security update
Published: 2022-03-15T09:10:17Z
Access this document at this URL: https://access.redhat.com/errata/RHSA-2022:0886
An update for the virt:rhel and virt-devel:rhel modules is now available for Red Hat Enterprise Linux 8.
Red Hat Product Security has rated this update as having a security impact of Moderate. A Common Vulnerability Scoring System (CVSS) base score, which gives a detailed severity rating, is available for each vulnerability from the CVE link(s) in the References section.
Kernel-based Virtual Machine (KVM) offers a full virtualization solution for Linux on numerous hardware platforms. The virt:rhel module contains packages which provide user-space components used to run virtual machines using KVM. The packages also provide APIs for managing and interacting with the virtualized
systems.
This update fixes the following security issues:
- QEMU: virtiofsd: potential privilege escalation via CVE-2018-13405 (CVE-2022-0358)
For details on how to apply this update, which includes the changes described in this advisory, refer to: https://access.redhat.com/articles/11258
This errata affects the following products:
- Red Hat CodeReady Linux Builder for ARM 64 - Extended Update Support versions 8.6 and 8.8
- Red Hat CodeReady Linux Builder for ARM 64 version 8
- Red Hat CodeReady Linux Builder for IBM z Systems - Extended Update Support versions 8.6 and 8.8
- Red Hat CodeReady Linux Builder for IBM z Systems version 8
- Red Hat CodeReady Linux Builder for Power, little endian - Extended Update Support versions 8.6 and 8.8
- Red Hat CodeReady Linux Builder for Power, little endian version 8
- Red Hat CodeReady Linux Builder for x86_64 - Extended Update Support versions 8.6 and 8.8
- Red Hat CodeReady Linux Builder for x86_64 version 8
- Red Hat Enterprise Linux Server - AUS version 8.6
- Red Hat Enterprise Linux Server - TUS versions 8.6 and 8.8
- Red Hat Enterprise Linux Server for Power LE - Update Services for SAP Solutions versions 8.6 and 8.8
- Red Hat Enterprise Linux for ARM 64 - Extended Update Support versions 8.6 and 8.8
- Red Hat Enterprise Linux for ARM 64 version 8
- Red Hat Enterprise Linux for IBM z Systems - Extended Update Support versions 8.6 and 8.8
- Red Hat Enterprise Linux for IBM z Systems version 8
- Red Hat Enterprise Linux for Power, little endian - Extended Update Support versions 8.6 and 8.8
- Red Hat Enterprise Linux for Power, little endian version 8
- Red Hat Enterprise Linux for x86_64 - Extended Update Support versions 8.6 and 8.8
- Red Hat Enterprise Linux for x86_64 - Update Services for SAP Solutions versions 8.6 and 8.8
- Red Hat Enterprise Linux for x86_64 version 8
This update fixes the these bugs:
- BZ 2044863 found at https://bugzilla.redhat.com/bugzilla/show_bug.cgi?id=2044863"""
    assert result == expected
8 changes: 3 additions & 5 deletions textprep/cleaner.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,7 @@ def clean_links(content: str) -> str:
# Remove reference-style definitions: [ref]: URL
content = re.sub(r"\n\s*\[[^\]]+\]:\s*(http[s]?:\/\/[^\s]+)", r" (\1)", content)

return content.strip()

# Use regex to find URLs enclosed in angle brackets and remove the brackets
content = re.sub(r"<(https?://[^>]+)>", r"\1", content)

def remove_sections(sections: list, excluded_sections: list) -> list:
"""Remove sections from markdown documents."""
return [x for x in sections if next(iter(x.metadata.values())) not in excluded_sections]
return content.strip()
135 changes: 127 additions & 8 deletions textprep/errata.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,133 @@
"""Parse errata documents."""

import os
import re

from textprep.cleaner import clean_links
from textprep.splitter import parse_markdown

# Section headings intended to be left out of parsed errata.
# NOTE(review): nothing in the visible code reads this constant -- confirm it
# is still needed.
EXCLUDED_SECTIONS = ["Updated Packages"]

if __name__ == "__main__":
with open("tests/example_docs/RHSA-2022_0886.md") as f:
errata_doc = parse_markdown(f.read())
def load_errata(path: str) -> dict:
    """Load an errata document from a file.

    Args:
        path: Location of the errata markdown file.

    Returns:
        Whatever ``parse_markdown`` produces for the file's text; callers in
        this module read its ``"frontmatter"`` and ``"content"`` keys.

    Raises:
        FileNotFoundError: If ``path`` is not an existing regular file.
        ValueError: Re-raised from ``parse_markdown`` when the text cannot be
            parsed.
    """
    if not os.path.isfile(path):
        # Name the offending path so the failure is actionable.
        raise FileNotFoundError(f"Errata file not found: {path}")

    # Read as UTF-8 explicitly rather than relying on the locale's default
    # encoding.
    with open(path, encoding="utf-8") as f:
        raw_text = f.read()

    try:
        return parse_markdown(raw_text)
    except ValueError:
        print(f"Error parsing file at {path}")
        raise


def get_section_content(sections: list, section_name: str) -> str:
    """Get the content of a section by its name.

    Args:
        sections: Objects exposing ``metadata`` (a mapping) and
            ``page_content`` (text).
        section_name: Heading to look for; it is compared against the first
            value in each section's ``metadata``.

    Returns:
        The stripped ``page_content`` of the first matching section, or ``""``
        when no section matches.
    """
    for section in sections:
        # A section with empty metadata has no heading to compare, and calling
        # next() on its empty iterator would raise StopIteration.
        if not section.metadata:
            continue
        if next(iter(section.metadata.values())) == section_name:
            return str(section.page_content.strip())

    return ""


def clean_bugzillas(section: str) -> str:
    """Clean up bugzilla references in the errata.

    Every markdown link ``[name](url)`` found in ``section`` becomes one bullet
    of the form ``- name found at url``, with `` - `` inside the name collapsed
    to a single space.

    Args:
        section: Text of the section that lists the fixed bugs.

    Returns:
        A heading followed by one bullet per link, or ``""`` when the section
        contains no links, so callers never get a heading with nothing under it.
    """
    pattern = r"\[([^\]]+)\]\((http[s]?:\/\/[^\)]+)\)"
    matches = re.findall(pattern, section)

    if not matches:
        return ""

    bugzillas = []
    for bz_label, bz_url in matches:
        bz_name = bz_label.replace(" - ", " ")
        bugzillas.append(f"- {bz_name} found at {bz_url}")

    return "This update fixes the these bugs:\n\n" + "\n".join(bugzillas)


def clean_description(section: str) -> str:
    """Tidy the description section of an errata.

    The "Security Fix(es):" label is replaced with a full sentence, and every
    line that starts with the "For more details about the security issue"
    boilerplate is dropped. The result is stripped of surrounding whitespace.
    """
    boilerplate_prefix = "For more details about the security issue"

    # Spell out what the list that follows actually contains.
    reworded = section.replace("Security Fix(es):", "This update fixes the following security issues:")

    kept_lines = []
    for line in reworded.split("\n"):
        if line.startswith(boilerplate_prefix):
            continue
        kept_lines.append(line)

    return "\n".join(kept_lines).strip()


def clean_solution(section: str) -> str:
    """Tidy the solution section of an errata.

    Some errata carry their own instructions, while others only point at a
    solution article. When the text opens with the standard "For details on
    how to apply this update" sentence, blank lines are collapsed into spaces
    so the sentence and its link read as one line. Links are cleaned in every
    case.

    TODO: It would be nice to pull in the solution article here instead of a link.
    """
    boilerplate_start = "For details on how to apply this update"
    text = section.replace("\n\n", " ") if section.startswith(boilerplate_start) else section
    return clean_links(text).strip()


def _version_sort_key(version: str) -> list:
    """Build a sort key that orders dotted versions numerically (8.6 before 8.10)."""
    key = []
    for part in version.split("."):
        if part.isascii() and part.isdigit():
            key.append((0, int(part), ""))
        else:
            # Tagged separately so text is never compared against a number.
            key.append((1, 0, part))
    return key


def get_affected_products(product_keys: list, product_detail: list) -> str:
    """Get the affected products from the frontmatter.

    This is tricky because the portal_product_filter contains a pipe-delimited
    table, but the names of products are fully repeated for every version and
    architecture. That's a *lot* of redundant information to hand off to the
    embedding model and LLM.

    The portal_product_names list contains the keys that go along with the
    portal_product_filter table, so we can use it to generate a more concise
    list.

    Args:
        product_keys: Product names to report on.
        product_detail: Pipe-delimited rows whose second field is the product
            name and whose third field is the version.

    Returns:
        A heading followed by one sorted bullet per product key. Each bullet
        lists the product's distinct versions in numeric order; a product with
        no matching rows is listed by name only.
    """
    # Group the versions by product name in a single pass. A set collapses the
    # repeats that come from having one row per architecture, and rows too
    # short to carry a version are ignored.
    versions_by_product = {}
    for row in (x.split("|") for x in product_detail):
        if len(row) > 2:
            versions_by_product.setdefault(row[1], set()).add(row[2])

    product_pieces = []
    for product_key in product_keys:
        versions = sorted(versions_by_product.get(product_key, ()), key=_version_sort_key)

        # Add the version(s) to the end of the product key.
        if len(versions) > 1:
            joined_versions = " and ".join(versions)
            product_pieces.append(f"- {product_key} versions {joined_versions}")
        elif versions:
            product_pieces.append(f"- {product_key} version {versions[0]}")
        else:
            product_pieces.append(f"- {product_key}")

    product_text = "\n".join(sorted(product_pieces))

    return "This errata affects the following products:\n\n" + product_text


def parse(path: str) -> str:
    """Parse an errata document into a clean format.

    Args:
        path: Location of the errata markdown file.

    Returns:
        The title, publication date, URL, summary, cleaned description,
        cleaned solution, affected products and fixed bugs, joined with blank
        lines. Pieces that come out empty are left out so a missing section
        does not leave a gap in the text.

    Raises:
        FileNotFoundError: Propagated from ``load_errata`` for a missing file.
        ValueError: Propagated from ``load_errata`` for unparseable content.
        KeyError: If the frontmatter lacks one of the fields read below.
    """
    errata_doc = load_errata(path)

    metadata = errata_doc["frontmatter"]
    extra = metadata["extra"]
    sections = errata_doc["content"]

    description = get_section_content(sections, "Description")
    solution = get_section_content(sections, "Solution")
    bugzillas = get_section_content(sections, "Fixes")
    products = get_affected_products(extra["portal_product_names"], extra["portal_product_filter"])

    # Keys inside the f-strings use single quotes: reusing the enclosing quote
    # character in a replacement field only parses on Python 3.12 and newer.
    clean_doc_pieces = [
        extra["original_title"],
        f"Published: {extra['issued']}",
        f"Access this document at this URL: https://access.redhat.com{metadata['path']}",
        extra["portal_summary"],
        clean_description(description),
        clean_solution(solution),
        products,
        clean_bugzillas(bugzillas),
    ]

    return "\n\n".join(piece for piece in clean_doc_pieces if piece)

0 comments on commit b44b8e9

Please sign in to comment.