This repository has been archived by the owner on Nov 11, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
5 changed files
with
321 additions
and
17 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,172 @@ | ||
"""Tests for parsing errata files.""" | ||
|
||
import pytest | ||
from langchain_core.documents import Document | ||
|
||
from textprep.errata import ( | ||
clean_bugzillas, | ||
clean_description, | ||
clean_solution, | ||
get_affected_products, | ||
get_section_content, | ||
load_errata, | ||
parse, | ||
) | ||
|
||
|
||
def test_load_errata(tmp_path):
    """Exercise load_errata with valid markdown, unparsable text, and a missing file."""
    errata_dir = tmp_path / "sub"
    errata_dir.mkdir()
    errata_file = errata_dir / "errata.md"

    markdown = """+++
title = '''Super important errata right here'''
+++
# Most important heading
First bit of content.
"""
    errata_file.write_text(markdown, encoding="utf-8")

    # A well-formed document exposes its frontmatter and its section content.
    loaded = load_errata(errata_file)
    assert loaded["frontmatter"]["title"] == "Super important errata right here"
    first_section = loaded["content"][0]
    assert first_section.page_content.strip() == "First bit of content."

    # JSON is not errata markdown, so loading it must raise.
    errata_file.write_text('{"key": "Wait a minute, this is JSON!}')
    with pytest.raises(ValueError):
        load_errata(errata_file)

    # With the file removed, loading must report that it is missing.
    errata_file.unlink()
    with pytest.raises(FileNotFoundError):
        load_errata(errata_file)
|
||
|
||
def test_get_section_content(errata_doc):
    """Sections are found by heading text; an unknown heading gives an empty string."""
    # The errata_doc fixture is requested but not referenced in this body.
    first = Document(metadata={"Header 1": "First"}, page_content="First section content")
    second = Document(metadata={"Header 2": "Second"}, page_content="Second section content")
    sections = [first, second]

    expectations = {
        "First": "First section content",
        "Second": "Second section content",
        "Third": "",
    }
    for heading, content in expectations.items():
        assert get_section_content(sections, heading) == content
|
||
|
||
def test_clean_bugzillas():
    """Markdown bugzilla links become a plain-text list under a fixed heading."""
    content = """
- [BZ - 2044863](https://bugzilla.redhat.com/bugzilla/show_bug.cgi?id=2044863)
- [BZ - 2044864](https://bugzilla.redhat.com/bugzilla/show_bug.cgi?id=2044864)
- [BZ - 2044865](https://bugzilla.redhat.com/bugzilla/show_bug.cgi?id=2044865)
"""
    result = clean_bugzillas(content)

    # clean_bugzillas separates the heading from the list with a blank line, so the
    # expected text spells out every newline explicitly rather than relying on the
    # layout of a triple-quoted literal.
    expected = (
        "This update fixes the these bugs:\n\n"
        "- BZ 2044863 found at https://bugzilla.redhat.com/bugzilla/show_bug.cgi?id=2044863\n"
        "- BZ 2044864 found at https://bugzilla.redhat.com/bugzilla/show_bug.cgi?id=2044864\n"
        "- BZ 2044865 found at https://bugzilla.redhat.com/bugzilla/show_bug.cgi?id=2044865"
    )
    assert result == expected
|
||
|
||
def test_clean_description():
    """The security heading is reworded and the trailing boilerplate line is dropped."""
    # Paragraphs are separated by blank lines, which is what the expected output
    # below preserves; the newlines are written explicitly so they cannot be lost.
    content = (
        "This is an important errata!\n\n"
        "Security Fix(es):\n\n"
        "- A very special CVE\n\n"
        "For more details about the security issue(s), blah blah blah."
    )
    result = clean_description(content)
    expected = (
        "This is an important errata!\n\nThis update fixes the following security issues:\n\n- A very special CVE"
    )
    assert result == expected
|
||
|
||
def test_clean_solution():
    """Boilerplate solutions are flattened to one line; unique solutions are untouched."""
    # The newlines should be removed and the link should be cleaned on this boilerplate
    # text. clean_solution only collapses a blank-line break ("\n\n"), so the input
    # spells that break out explicitly.
    content = (
        "For details on how to apply this update, which includes the changes described in this advisory, refer to:"
        "\n\n"
        "<https://access.redhat.com/articles/11258>"
    )
    result = clean_solution(content)
    assert result.endswith("refer to: https://access.redhat.com/articles/11258")

    # This should be left untouched since it's unique content.
    content = "This is unique content about a solution you should know!"
    assert clean_solution(content) == content
|
||
|
||
def test_get_affected_products():
    """Versions of the same product are merged onto one line; others stay separate."""
    arm_builder = "Red Hat CodeReady Linux Builder for ARM 64"
    arm_builder_eus = "Red Hat CodeReady Linux Builder for ARM 64 - Extended Update Support"

    product_keys = [arm_builder_eus, arm_builder]
    product_detail = [
        "Red Hat Enterprise Linux|Red Hat CodeReady Linux Builder for ARM 64|8|aarch64",
        "Red Hat Enterprise Linux|Red Hat CodeReady Linux Builder for ARM 64 - Extended Update Support|8.8|aarch64",
        "Red Hat Enterprise Linux|Red Hat CodeReady Linux Builder for ARM 64 - Extended Update Support|8.6|aarch64",
    ]

    result = get_affected_products(product_keys, product_detail)

    # The two Extended Update Support rows collapse into a single entry.
    merged_entry = "- Red Hat CodeReady Linux Builder for ARM 64 - Extended Update Support versions 8.6 and 8.8"
    assert merged_entry in result

    # The plain product has one version and keeps its own entry.
    single_entry = "- Red Hat CodeReady Linux Builder for ARM 64 version 8"
    assert single_entry in result
|
||
|
||
def test_parse_functional(errata_doc_path):
    """Run parse() on the document at the errata_doc_path fixture and compare the full output."""
    result = parse(errata_doc_path)
    # NOTE(review): parse() joins its pieces with a blank line ("\n\n") and the product
    # and bugzilla headings are each followed by a blank line. Confirm that the blank
    # lines inside this literal are intact in the real file before trusting this view.
    expected = """RHSA-2022:0886 - Moderate: virt:rhel and virt-devel:rhel security update
Published: 2022-03-15T09:10:17Z
Access this document at this URL: https://access.redhat.com/errata/RHSA-2022:0886
An update for the virt:rhel and virt-devel:rhel modules is now available for Red Hat Enterprise Linux 8.
Red Hat Product Security has rated this update as having a security impact of Moderate. A Common Vulnerability Scoring System (CVSS) base score, which gives a detailed severity rating, is available for each vulnerability from the CVE link(s) in the References section.
Kernel-based Virtual Machine (KVM) offers a full virtualization solution for Linux on numerous hardware platforms. The virt:rhel module contains packages which provide user-space components used to run virtual machines using KVM. The packages also provide APIs for managing and interacting with the virtualized
systems.
This update fixes the following security issues:
- QEMU: virtiofsd: potential privilege escalation via CVE-2018-13405 (CVE-2022-0358)
For details on how to apply this update, which includes the changes described in this advisory, refer to: https://access.redhat.com/articles/11258
This errata affects the following products:
- Red Hat CodeReady Linux Builder for ARM 64 - Extended Update Support versions 8.6 and 8.8
- Red Hat CodeReady Linux Builder for ARM 64 version 8
- Red Hat CodeReady Linux Builder for IBM z Systems - Extended Update Support versions 8.6 and 8.8
- Red Hat CodeReady Linux Builder for IBM z Systems version 8
- Red Hat CodeReady Linux Builder for Power, little endian - Extended Update Support versions 8.6 and 8.8
- Red Hat CodeReady Linux Builder for Power, little endian version 8
- Red Hat CodeReady Linux Builder for x86_64 - Extended Update Support versions 8.6 and 8.8
- Red Hat CodeReady Linux Builder for x86_64 version 8
- Red Hat Enterprise Linux Server - AUS version 8.6
- Red Hat Enterprise Linux Server - TUS versions 8.6 and 8.8
- Red Hat Enterprise Linux Server for Power LE - Update Services for SAP Solutions versions 8.6 and 8.8
- Red Hat Enterprise Linux for ARM 64 - Extended Update Support versions 8.6 and 8.8
- Red Hat Enterprise Linux for ARM 64 version 8
- Red Hat Enterprise Linux for IBM z Systems - Extended Update Support versions 8.6 and 8.8
- Red Hat Enterprise Linux for IBM z Systems version 8
- Red Hat Enterprise Linux for Power, little endian - Extended Update Support versions 8.6 and 8.8
- Red Hat Enterprise Linux for Power, little endian version 8
- Red Hat Enterprise Linux for x86_64 - Extended Update Support versions 8.6 and 8.8
- Red Hat Enterprise Linux for x86_64 - Update Services for SAP Solutions versions 8.6 and 8.8
- Red Hat Enterprise Linux for x86_64 version 8
This update fixes the these bugs:
- BZ 2044863 found at https://bugzilla.redhat.com/bugzilla/show_bug.cgi?id=2044863"""
    assert result == expected
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,14 +1,133 @@ | ||
"""Parse errata documents.""" | ||
|
||
import os | ||
import re | ||
|
||
from textprep.cleaner import clean_links | ||
from textprep.splitter import parse_markdown | ||
|
||
# NOTE(review): not referenced by any code visible in this module; presumably the
# section headings to leave out of the cleaned document -- confirm before relying on it.
EXCLUDED_SECTIONS = ["Updated Packages"]


if __name__ == "__main__":
    # Manual run against the example advisory under tests/example_docs. The file is
    # decoded explicitly as UTF-8 so the result does not depend on the platform's
    # default encoding.
    with open("tests/example_docs/RHSA-2022_0886.md", encoding="utf-8") as f:
        errata_doc = parse_markdown(f.read())
def load_errata(path: str) -> dict:
    """Load an errata document from a file and parse it as markdown.

    Args:
        path: Location of the errata markdown file. Anything accepted by
            ``os.path.isfile`` and ``open`` works, including ``pathlib.Path``.

    Returns:
        Whatever ``parse_markdown`` returns for the file's text. Code in this
        module reads its ``"frontmatter"`` and ``"content"`` keys.

    Raises:
        FileNotFoundError: If ``path`` is not an existing regular file.
        ValueError: Re-raised from ``parse_markdown`` when the text cannot be parsed.
    """
    if not os.path.isfile(path):
        # Name the offending path so the failure is actionable in logs.
        raise FileNotFoundError(f"Errata file not found: {path}")

    # Decode explicitly as UTF-8 rather than the platform default encoding.
    with open(path, encoding="utf-8") as f:
        raw_text = f.read()

    try:
        return parse_markdown(raw_text)
    except ValueError:
        # Report which file failed, then let the caller handle the error.
        print(f"Error parsing file at {path}")
        raise
|
||
|
||
def get_section_content(sections: list, section_name: str) -> str:
    """Get the content of a section by its name.

    Each section is expected to expose a ``metadata`` mapping and a
    ``page_content`` string. A section matches when the first value in its
    metadata mapping equals ``section_name``.

    Args:
        sections: Parsed sections to search, in document order.
        section_name: Heading text to look for.

    Returns:
        The stripped content of the first matching section, or an empty string
        when no section matches.
    """
    for section in sections:
        # A section with no metadata (nothing to match on) is skipped instead of
        # letting next() raise StopIteration.
        first_heading = next(iter(section.metadata.values()), None)
        if first_heading == section_name:
            return str(section.page_content.strip())

    return ""
|
||
|
||
def clean_bugzillas(section: str) -> str:
    """Clean up bugzilla references in the errata.

    Every markdown link ``[name](http...)`` found in ``section`` becomes a line of
    the form ``- name found at url``, with ``" - "`` inside the name replaced by a
    single space.

    Args:
        section: Raw text of the section that lists the fixed bugs.

    Returns:
        A heading followed by a blank line and one line per link, or an empty
        string when the section contains no links, so that no heading is emitted
        without a list under it.
    """
    pattern = r"\[([^\]]+)\]\((http[s]?:\/\/[^\)]+)\)"
    matches = re.findall(pattern, section)
    if not matches:
        return ""

    bugzillas = [f"- {name.replace(' - ', ' ')} found at {url}" for name, url in matches]

    # NOTE(review): "the these" is a typo, but tests compare against this exact
    # heading, so it must be changed together with them.
    return "This update fixes the these bugs:\n\n" + "\n".join(bugzillas)
|
||
|
||
def clean_description(section: str) -> str:
    """Tidy the description section of an errata.

    The "Security Fix(es):" heading is reworded into a full sentence, every line
    that begins with the "For more details about the security issue" boilerplate
    is removed, and surrounding whitespace is stripped from the result.
    """
    # Say explicitly what the update fixes.
    reworded = section.replace("Security Fix(es):", "This update fixes the following security issues:")

    # Drop the boilerplate line that only points at the references.
    kept_lines = []
    for line in reworded.split("\n"):
        if line.startswith("For more details about the security issue"):
            continue
        kept_lines.append(line)

    return "\n".join(kept_lines).strip()
|
||
|
||
def clean_solution(section: str) -> str:
    """Tidy the solution section of an errata.

    Some errata carry their own instructions, while others only point at a
    solution article. When the text opens with the standard "For details on how
    to apply this update" boilerplate, its blank-line breaks are collapsed into
    single spaces. In every case the text is passed through ``clean_links`` and
    stripped.

    TODO: It would be nice to pull in the solution article here instead of a link.
    """
    is_boilerplate = section.startswith("For details on how to apply this update")
    text = section.replace("\n\n", " ") if is_boilerplate else section

    return clean_links(text).strip()
|
||
|
||
def _version_sort_key(version: str) -> list:
    """Build a sort key that orders dotted versions numerically (8.6 before 8.10)."""
    # Numeric parts sort before non-numeric ones; the fixed tuple shape keeps every
    # pair of keys comparable.
    return [(0, int(part), "") if part.isdecimal() else (1, 0, part) for part in version.split(".")]


def get_affected_products(product_keys: list, product_detail: list) -> str:
    """Get the affected products from the frontmatter.

    This is tricky because the portal_product_filter contains a pipe delimited table,
    but the names of products are fully repeated for every version and architecture.
    That's a *lot* of redundant information to hand off to the embedding model and LLM.

    The portal_product_names list contains the keys that go along with the
    portal_product_filter table, so we can use it to generate a more concise list.

    Args:
        product_keys: Product names to report on.
        product_detail: Pipe delimited rows; the second field is the product name
            and the third field is the version.

    Returns:
        A heading, a blank line, and one sorted line per product key listing its
        distinct versions in ascending order. A product key with no matching rows
        is listed without a version.
    """
    # Group the distinct versions by product name in one pass over the table. The
    # set removes repeats of a version that appears once per architecture.
    versions_by_product: dict = {}
    for row in product_detail:
        fields = row.split("|")
        if len(fields) > 2:
            versions_by_product.setdefault(fields[1], set()).add(fields[2])

    product_pieces = []
    for product_key in product_keys:
        # Get all the affected versions that match the current product key.
        versions = sorted(versions_by_product.get(product_key, ()), key=_version_sort_key)

        # Add the version(s) to the end of the product key.
        if len(versions) > 1:
            product_pieces.append(f"- {product_key} versions {' and '.join(versions)}")
        elif versions:
            product_pieces.append(f"- {product_key} version {versions[0]}")
        else:
            # No detail rows for this key: still name the product instead of
            # failing on an empty version list.
            product_pieces.append(f"- {product_key}")

    product_text = "\n".join(sorted(product_pieces))

    return "This errata affects the following products:\n\n" + product_text
|
||
|
||
def parse(path: str) -> str:
    """Parse an errata document into a clean format.

    Args:
        path: Location of the errata markdown file, passed to ``load_errata``.

    Returns:
        The title, publication date, URL, summary, cleaned description, cleaned
        solution, affected products, and fixed bugs, in that order, separated by
        blank lines. Pieces that come out empty are left out so they do not leave
        gaps in the text.

    Raises:
        FileNotFoundError, ValueError: Propagated from ``load_errata``.
        KeyError: If the frontmatter lacks one of the keys read below.
    """
    errata_doc = load_errata(path)

    metadata = errata_doc["frontmatter"]
    sections = errata_doc["content"]
    extra = metadata["extra"]

    description = get_section_content(sections, "Description")
    solution = get_section_content(sections, "Solution")
    bugzillas = get_section_content(sections, "Fixes")
    products = get_affected_products(extra["portal_product_names"], extra["portal_product_filter"])

    # Single quotes inside the f-strings keep them valid on interpreters older
    # than Python 3.12.
    clean_doc_pieces = [
        extra["original_title"],
        f"Published: {extra['issued']}",
        f"Access this document at this URL: https://access.redhat.com{metadata['path']}",
        extra["portal_summary"],
        clean_description(description),
        clean_solution(solution),
        products,
        clean_bugzillas(bugzillas),
    ]

    # No per-section debug printing here: this function only builds and returns text.
    return "\n\n".join(piece for piece in clean_doc_pieces if piece)