From 74d2cde36bcb49de563e4d9d0d2fb823a9eee681 Mon Sep 17 00:00:00 2001
From: William Coe
Date: Sun, 14 Aug 2022 19:35:28 +0800
Subject: [PATCH] Support stripping metadata from updates.

Updates metadata can be quite large and can cause systems with low memory
to OOM while refreshing it. Add support for dropping architectures that
are not in use by any system the mirror serves from the metadata.

This is achieved in much the same way as snapshotting is implemented:
pull the metadata, rewrite it, update values such as checksum and size so
DNF is happy to consume it, and add it to repomd.xml.

Some refactoring was performed, as much of the code used for snapshotting
is also useful here.
---
 rpm_s3_mirror.spec          |   1 +
 rpm_s3_mirror/config.py     |   4 +-
 rpm_s3_mirror/mirror.py     |  32 ++++++--
 rpm_s3_mirror/repository.py | 152 +++++++++++++++++++++++++++++-------
 rpm_s3_mirror/s3.py         |   1 -
 5 files changed, 155 insertions(+), 35 deletions(-)

diff --git a/rpm_s3_mirror.spec b/rpm_s3_mirror.spec
index 2747253..ecee73a 100644
--- a/rpm_s3_mirror.spec
+++ b/rpm_s3_mirror.spec
@@ -14,6 +14,7 @@ Requires: python3-dateutil
 Requires: python3-boto3
 Requires: python3-lxml
 Requires: systemd
+Requires: zchunk
 
 %undefine _missing_build_ids_terminate_build
 
diff --git a/rpm_s3_mirror/config.py b/rpm_s3_mirror/config.py
index ad25310..834adbd 100644
--- a/rpm_s3_mirror/config.py
+++ b/rpm_s3_mirror/config.py
@@ -16,6 +16,7 @@ DEFAULTS = {
     "scratch_dir": "/var/tmp/",
     "max_workers": 4,
+    "trim_updates_to_arches": [],
 }
 
 
@@ -31,6 +32,7 @@ class Config:
     scratch_dir = None
     max_workers = None
     upstream_repositories = None
+    trim_updates_to_arches = None
     _config = DEFAULTS
 
     def __init__(self):
@@ -63,7 +65,7 @@ def _populate_required(self):
                     raise ConfigError(f"Missing required environment variable: {key.upper()}")
                 else:
                     continue
-            elif key == "upstream_repositories":
+            elif key in ("upstream_repositories", "trim_updates_to_arches"):
                 value = value.split(",")
             elif key == "max_workers":
                 value = int(value)
diff --git a/rpm_s3_mirror/mirror.py b/rpm_s3_mirror/mirror.py
index 70a86c6..256619f 100644
--- a/rpm_s3_mirror/mirror.py
+++ b/rpm_s3_mirror/mirror.py
@@ -1,9 +1,10 @@
 # Copyright (c) 2020 Aiven, Helsinki, Finland. https://aiven.io/
 
 import logging
+import os
 import re
 from contextlib import suppress
-from os.path import join
+from os.path import join, basename
 from tempfile import TemporaryDirectory
 import time
@@ -132,10 +133,31 @@ def _sync_repository(self, upstream_repository: RPMRepository):
             manifest_path = join(manifest_location, "manifest.json")
             self.s3.put_manifest(location=manifest_path, manifest=manifest)
 
-            # Finally, overwrite the repomd.xml file to make our changes live
-            # Copy from the cached file in /var/tmp, because sync_packages may take a bit of time, and repomd.xml may
-            # change in upstream.
-            self.s3.overwrite_repomd(repomd_xml_local_path=cache_xml_path, base_url=upstream_repository.base_url)
+            # The metadata can be quite large and cause dnf to use excessive memory processing
+            # the updates. As an optimization, support dropping architectures that we know
+            # we don't need.
+            if self.config.trim_updates_to_arches:
+                self.log.info("Trimming metadata to: %s", self.config.trim_updates_to_arches)
+                metadata_scratch = os.path.join(temp_dir, "metadata")
+                os.makedirs(metadata_scratch, exist_ok=True)
+                with open(cache_xml_path, "rb") as f:
+                    repodata_files = upstream_repository.strip_metadata(
+                        xml_bytes=f.read(),
+                        target_arches=self.config.trim_updates_to_arches,
+                        scratch_dir=metadata_scratch,
+                    )
+                base_path = urlparse(upstream_repository.base_url).path[1:]  # need to strip the leading slash
+                for file_path in repodata_files.upload_files:
+                    dest_path = join(base_path, "repodata", basename(file_path))
+                    self.log.info("Uploading: %s -> %s", file_path, dest_path)
+                    self.s3.put_object(
+                        local_path=file_path,
+                        key=dest_path,
+                    )
+            else:
+                # Overwrite the repomd.xml file from upstream to make our changes live.
+                self.log.info("Overwriting repomd.xml")
+                self.s3.overwrite_repomd(repomd_xml_local_path=cache_xml_path, base_url=upstream_repository.base_url)
         self.log.info("Updated mirror with %s packages", len(new_packages))
         self.stats.gauge(
             metric="s3_mirror_sync_seconds",
diff --git a/rpm_s3_mirror/repository.py b/rpm_s3_mirror/repository.py
index b63469b..2a94a59 100644
--- a/rpm_s3_mirror/repository.py
+++ b/rpm_s3_mirror/repository.py
@@ -1,9 +1,13 @@
 # Copyright (c) 2020 Aiven, Helsinki, Finland. https://aiven.io/
-
+import dataclasses
+import os
+import re
+import subprocess
 from collections import namedtuple
 from datetime import datetime
-from typing import Iterator, Dict
+from typing import Iterator, Dict, Optional, Tuple
 from urllib.parse import urlparse
+from xml.etree.ElementTree import ElementTree
 
 from lxml.etree import fromstring, Element, tostring  # pylint: disable=no-name-in-module
 from lxml.etree import XMLParser  # pylint: disable=no-name-in-module
@@ -126,20 +129,86 @@ def __iter__(self) -> Iterator[Package]:
         "checksum",
     ],
 )
-SnapshotPrimary = namedtuple(
-    "SnapshotPrimary",
-    [
-        "open_checksum",
-        "checksum",
-        "checksum_type",
-        "size",
-        "open_size",
-        "local_path",
-        "location",
-    ],
-)
-Snapshot = namedtuple("Snapshot", ["sync_files", "upload_files"])
+RepoDataFiles = namedtuple("RepoDataFiles", ["sync_files", "upload_files"])
+
+
+@dataclasses.dataclass
+class SectionMetadata:
+    size: int
+    open_size: int
+    open_checksum: str
+    checksum: str
+    local_path: str
+    location: str
+    checksum_type: str = "sha256"
+    header_checksum: Optional[str] = None
+    header_size: Optional[int] = None
+
+
+class UpdateInfoSection:
+    def __init__(self, path: str):
+        # TODO: Add support for .xz metadata.
+        if not path.endswith(".zck"):
+            raise ValueError("Only .zck files are supported")
+        self.path = path
+
+    def strip_to_arches(self, arches, scratch_dir):
+        result = subprocess.run(["unzck", self.path, "--stdout"], stdout=subprocess.PIPE, check=True)
+        root = safe_parse_xml(result.stdout)
+        self._strip(root, arches)
+        return self._update_metadata(root, scratch_dir)
+
+    def _update_metadata(self, root, scratch_dir):
+        stripped_path = os.path.join(scratch_dir, "stripped.xml")
+        ElementTree(root).write(stripped_path)
+
+        # Take the open checksum and size.
+        sha256_out = subprocess.check_output(["sha256sum", stripped_path], text=True)
+        open_checksum = sha256_out.split()[0]
+        open_size = os.path.getsize(stripped_path)
+
+        # Now compress and take the compressed checksum and size.
+        compressed_stripped_path = os.path.join(scratch_dir, "stripped.xml.zck")
+        subprocess.check_call(["zck", stripped_path, "-o", compressed_stripped_path])
+        sha256_compressed_out = subprocess.check_output(["sha256sum", compressed_stripped_path], text=True)
+        checksum = sha256_compressed_out.split()[0]
+        size = os.path.getsize(compressed_stripped_path)
+
+        # We also need some ZChunk-specific metadata.
+        header_out = subprocess.check_output(["zck_read_header", compressed_stripped_path], text=True)
+        header_checksum, header_size = self._parse_zck_read_header(output=header_out)
+        final_path = os.path.join(scratch_dir, f"{checksum}-updateinfo.xml.zck")
+
+        # Rename it to match the naming format of the other sections.
+        os.rename(compressed_stripped_path, final_path)
+
+        return SectionMetadata(
+            size=size,
+            open_size=open_size,
+            header_size=header_size,
+            header_checksum=header_checksum,
+            open_checksum=open_checksum,
+            checksum=checksum,
+            local_path=final_path,
+            location=f"repodata/{os.path.basename(final_path)}",
+        )
+
+    def _strip(self, root, arches):
+        for update_element in root:
+            for collection in update_element.find("pkglist"):
+                for package in collection.getchildren():
+                    if package.get("arch") not in arches:
+                        collection.remove(package)
+
+    def _parse_zck_read_header(self, output):
+        checksum_match = re.search("Header checksum: (?P<checksum>.*$)", output, flags=re.MULTILINE)
+        if not checksum_match:
+            raise ValueError(f"Failed to locate checksum in output: {output}")
+        size_match = re.search("Header size: (?P<size>.*$)", output, flags=re.MULTILINE)
+        if not size_match:
+            raise ValueError(f"Failed to locate size in output: {output}")
+        return checksum_match.groupdict()["checksum"], int(size_match.groupdict()["size"])
 
 
 class RPMRepository:
@@ -176,17 +245,45 @@ def exists(self):
             return True
         return False
 
-    def get_repodata(self):
-        response = self._req(self.session.get, "repodata/repomd.xml")
-        repodata = self.parse_repomd(xml=safe_parse_xml(response.content))
+    def get_repodata(self, xml_bytes=None):
+        if xml_bytes is None:
+            xml_bytes = self._req(self.session.get, "repodata/repomd.xml").content
+        repodata = self.parse_repomd(xml=safe_parse_xml(xml_bytes))
         return repodata
 
+    def strip_metadata(
+        self,
+        xml_bytes: bytes,
+        target_arches: Tuple[str, ...],
+        scratch_dir: str,
+    ):
+        sync_files, upload_files = [], []
+        repomd_xml = safe_parse_xml(xml_bytes)
+        repodata = self.parse_repomd(xml=repomd_xml)
+        for key, section in repodata.items():
+            if key.startswith("updateinfo_zck"):
+                with self._req(self.session.get, path=section.location, stream=True) as request:
+                    local_path = download_repodata_section(section, request, destination_dir=scratch_dir)
+                update_section = UpdateInfoSection(path=local_path)
+                rewritten_section = update_section.strip_to_arches(arches=target_arches, scratch_dir=scratch_dir)
+                self._rewrite_repomd(repomd_xml=repomd_xml, snapshot=rewritten_section, section_name=key)
+                upload_files.append(rewritten_section.local_path)
+        repomd_xml_path = join(scratch_dir, "repomd.xml")
+        with open(repomd_xml_path, "wb+") as out:
+            out.write(tostring(repomd_xml, encoding="utf-8"))
+        upload_files.append(repomd_xml_path)
+
+        return RepoDataFiles(
+            sync_files=sync_files,
+            upload_files=upload_files,
+        )
+
     def create_snapshot(self, scratch_dir):
         response = self._req(self.session.get, "repodata/repomd.xml")
         repomd_xml = safe_parse_xml(response.content)
         repodata = self.parse_repomd(xml=repomd_xml)
         snapshot_primary = self._rewrite_primary(temp_dir=scratch_dir, primary=repodata["primary"])
-        self._rewrite_repomd(repomd_xml=repomd_xml, snapshot=snapshot_primary)
+        self._rewrite_repomd(repomd_xml=repomd_xml, snapshot=snapshot_primary, section_name="primary")
         repomd_xml_path = join(scratch_dir, "repomd.xml")
         with open(repomd_xml_path, "wb+") as out:
             out.write(tostring(repomd_xml, encoding="utf-8"))
@@ -199,7 +296,7 @@ def create_snapshot(self, scratch_dir):
                 or section.location.endswith("modules.yaml.gz")
             ):
                 sync_files.append(urlparse(join(self.base_url, section.location)).path)
-        return Snapshot(
+        return RepoDataFiles(
             sync_files=sync_files,
             upload_files=[repomd_xml_path, snapshot_primary.local_path],
         )
@@ -230,7 +327,7 @@ def _rewrite_primary(self, temp_dir, primary: RepodataSection):
         with open(local_path, "wb+") as out:
             out.write(compressed_xml)
 
-        return SnapshotPrimary(
+        return SectionMetadata(
             open_checksum=open_checksum,
             checksum=compressed_sha256,
             checksum_type="sha256",
@@ -240,14 +337,9 @@
             location=f"repodata/{basename(local_path)}",
         )
 
-    def _rewrite_repomd(self, repomd_xml: Element, snapshot: SnapshotPrimary):
-        for element in repomd_xml.findall("repo:*", namespaces=namespaces):
-            # We only support *.xml.gz files currently
-            if element.attrib.get("type", None) not in {"primary", "filelists", "other", "modules", "updateinfo"}:
-                repomd_xml.remove(element)
-
+    def _rewrite_repomd(self, repomd_xml: Element, snapshot: SectionMetadata, section_name: str):
         # Rewrite the XML with correct metadata for our changed primary.xml
-        for element in repomd_xml.find("repo:data[@type='primary']", namespaces=namespaces):
+        for element in repomd_xml.find(f"repo:data[@type='{section_name}']", namespaces=namespaces):
             _, _, key = element.tag.partition("}")
             if key == "checksum":
                 element.text = snapshot.checksum
@@ -259,6 +351,10 @@ def create_snapshot(self, scratch_dir):
                 element.text = str(snapshot.size)
             elif key == "open-size":
                 element.text = str(snapshot.open_size)
+            elif key == "header-size" and snapshot.header_size is not None:
+                element.text = str(snapshot.header_size)
+            elif key == "header-checksum" and snapshot.header_checksum is not None:
+                element.text = snapshot.header_checksum
 
     def _extract_package_list(self, primary: RepodataSection) -> PackageList:
         with self._req(self.session.get, path=primary.location, stream=True) as request:
diff --git a/rpm_s3_mirror/s3.py b/rpm_s3_mirror/s3.py
index 7eea595..ea3f6ef 100644
--- a/rpm_s3_mirror/s3.py
+++ b/rpm_s3_mirror/s3.py
@@ -81,7 +81,6 @@ def sync_packages(
 
     def overwrite_repomd(self, repomd_xml_local_path, base_url):
         url = f"{base_url}repodata/repomd.xml"
        remote_path = urlparse(url).path
-        self.log.info("Overwriting repomd.xml")
         self.put_object(repomd_xml_local_path, remote_path, cache_age=0)
 
     def archive_repomd(self, base_url, location):
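
The heart of the trim is the pkglist filtering that UpdateInfoSection._strip
performs on the decompressed updateinfo XML. Below is a minimal,
self-contained sketch of the same traversal, using the stdlib ElementTree
instead of the project's safe_parse_xml helper; the XML snippet and
architecture set are invented for the example:

import xml.etree.ElementTree as ET

UPDATEINFO = """\
<updates>
  <update>
    <pkglist>
      <collection>
        <package arch="x86_64" name="foo"/>
        <package arch="ppc64le" name="foo"/>
        <package arch="noarch" name="foo-doc"/>
      </collection>
    </pkglist>
  </update>
</updates>
"""

def strip_to_arches(root, arches):
    # Same traversal as UpdateInfoSection._strip: update -> pkglist -> collection -> package.
    for update in root:
        for collection in update.find("pkglist"):
            for package in list(collection):  # iterate over a copy so removal is safe
                if package.get("arch") not in arches:
                    collection.remove(package)

root = ET.fromstring(UPDATEINFO)
strip_to_arches(root, {"x86_64", "noarch"})
print(ET.tostring(root, encoding="unicode"))  # the ppc64le entry is gone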
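
The rewrite only keeps DNF happy because every size and checksum that
repomd.xml advertises for the section is recomputed after stripping. The
patch shells out to sha256sum; the same four values could equally be
computed in-process with hashlib. A sketch, assuming the rewritten XML and
its zchunk-compressed counterpart already exist on disk (both paths are
placeholders):

import hashlib
import os

def sha256_file(path):
    sha256 = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1024 * 1024), b""):
            sha256.update(chunk)
    return sha256.hexdigest()

stripped_path = "/var/tmp/metadata/stripped.xml"        # placeholder: rewritten, uncompressed XML
compressed_path = "/var/tmp/metadata/stripped.xml.zck"  # placeholder: output of `zck stripped.xml -o ...`

open_checksum = sha256_file(stripped_path)   # feeds <open-checksum> in repomd.xml
open_size = os.path.getsize(stripped_path)   # feeds <open-size>
checksum = sha256_file(compressed_path)      # feeds <checksum> and prefixes the renamed file
size = os.path.getsize(compressed_path)      # feeds <size>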
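
Finally, _rewrite_repomd patches the matching <data> element in repomd.xml
via lxml's namespaced path lookup. A sketch of the same update against a
dummy repomd document, assuming the standard repodata namespace
http://linux.duke.edu/metadata/repo behind the repo: prefix (the checksum
and size values are dummies):

from lxml.etree import fromstring, tostring

NAMESPACES = {"repo": "http://linux.duke.edu/metadata/repo"}
REPOMD = b"""<repomd xmlns="http://linux.duke.edu/metadata/repo">
  <data type="updateinfo_zck">
    <checksum type="sha256">old-checksum</checksum>
    <size>1</size>
  </data>
</repomd>"""

repomd_xml = fromstring(REPOMD)
for element in repomd_xml.find("repo:data[@type='updateinfo_zck']", namespaces=NAMESPACES):
    _, _, key = element.tag.partition("}")  # drop the "{namespace}" prefix from the tag
    if key == "checksum":
        element.text = "new-checksum"
    elif key == "size":
        element.text = "183201"
print(tostring(repomd_xml, encoding="unicode"))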