Fix for e06f965879ab MARC file deletion migration (PP-773) (#1563)

e06f965879ab was hard to test, since it involves both the DB and S3. After the migration ran on Minotaur and Cerberus, there are a couple changes needed: - The latest code puts the MARC file URL in Representation.mirror_url and Representation.url but older versions of the code only put the url in Representation.url. So older files were failing this migration because mirror_url is NULL. So its safe to just use URL for everything (verified this with a query against production DBs). - Some records have NULL in url (as well as mirror_url). I'm not sure how / when this happened. But there is no way we can delete the file in this case, so it doesn't fail the migration, just outputs a log message. - The URLs are stored URL escaped, and S3 keys are expected not to be escaped, so we have to call unquote on the keys. - Don't fail the migration if we can't parse the key out of the URL. Just log what happened, don't delete that file and move on.
ThePalaceProject · Jan 10, 2024 · 88de5fb · 88de5fb
1 parent 8536f29
commit 88de5fb
Show file tree

Hide file tree

Showing 2 changed files with 31 additions and 42 deletions.
diff --git a/alembic/versions/20231206_e06f965879ab_marc_s3_file_cleanup.py b/alembic/versions/20231206_e06f965879ab_marc_s3_file_cleanup.py
@@ -6,7 +6,7 @@
 
 """
 from typing import Optional
-from urllib.parse import urlparse
+from urllib.parse import unquote, urlparse
 
 from alembic import op
 from core.migration.util import migration_logger
@@ -19,28 +19,33 @@
 depends_on = None
 
 
-def parse_key_from_url(url: str, bucket: str) -> Optional[str]:
+def parse_key_from_url(url: Optional[str], bucket: str) -> Optional[str]:
     """Parse the key from a URL.
 
     :param url: The URL to parse.
     :return: The key, or None if the URL is not a valid S3 URL.
     """
+    if url is None:
+        return None
+
     parsed_url = urlparse(url)
 
     if f"/{bucket}/" in parsed_url.path:
-        return parsed_url.path.split(f"/{bucket}/", 1)[1]
-
-    if bucket in parsed_url.netloc:
-        return parsed_url.path.lstrip("/")
+        key = parsed_url.path.split(f"/{bucket}/", 1)[1]
+    elif bucket in parsed_url.netloc:
+        key = parsed_url.path.lstrip("/")
+    else:
+        return None
 
-    return None
+    # The key stored in the DB is URL encoded, so we need to decode it
+    return unquote(key)
 
 
 def upgrade() -> None:
     # Before removing the cachedmarcfiles table, we want to clean up
     # the cachedmarcfiles stored in s3.
     #
-    # Note: if you are running this migration on a development system and you want
+    # Note: if you are running this migration on a development system, and you want
     # to skip deleting these files you can just comment out the migration code below.
     services = container_instance()
     public_s3 = services.storage.public()
@@ -49,7 +54,7 @@ def upgrade() -> None:
     # Check if there are any cachedmarcfiles in s3
     connection = op.get_bind()
     cached_files = connection.execute(
-        "SELECT r.mirror_url FROM cachedmarcfiles cmf JOIN representations r ON cmf.representation_id = r.id"
+        "SELECT r.url FROM cachedmarcfiles cmf JOIN representations r ON cmf.representation_id = r.id"
     ).all()
     if public_s3 is None and len(cached_files) > 0:
         raise RuntimeError(
@@ -58,11 +63,12 @@ def upgrade() -> None:
 
     keys_to_delete = []
     for cached_file in cached_files:
-        url = cached_file.mirror_url
+        url = cached_file.url
         bucket = public_s3.bucket
         key = parse_key_from_url(url, bucket)
         if key is None:
-            raise RuntimeError(f"Unexpected URL format: {url} (bucket: {bucket})")
+            log.info(f"Skipping cachedmarcfile with invalid URL: {url}")
+            continue
         generated_url = public_s3.generate_url(key)
         if generated_url != url:
             raise RuntimeError(f"URL mismatch: {url} != {generated_url}")

diff --git a/tests/migration/test_20231206_e06f965879ab.py b/tests/migration/test_20231206_e06f965879ab.py
@@ -21,7 +21,7 @@ class CreateCachedMarcFile:
     def __call__(
         self,
         connection: Connection,
-        url: str,
+        url: Optional[str],
         library_id: Optional[int] = None,
         lane_id: Optional[int] = None,
     ) -> Tuple[int, int]:
@@ -43,9 +43,9 @@ def __call__(
 
         return representation_id, file_id
 
-    def representation(self, connection: Connection, url: str) -> int:
+    def representation(self, connection: Connection, url: Optional[str]) -> int:
         row = connection.execute(
-            "INSERT INTO representations (media_type, mirror_url) "
+            "INSERT INTO representations (media_type, url) "
             "VALUES ('application/marc', %s) returning id",
             url,
         ).first()
@@ -104,31 +104,6 @@ def test_migration_no_s3_integration(
     )
 
 
-def test_migration_bucket_url_not_found(
-    alembic_runner: MigrationContext,
-    alembic_engine: Engine,
-    create_cachedmarcfile: CreateCachedMarcFile,
-    caplog: LogCaptureFixture,
-) -> None:
-    alembic_runner.migrate_down_to(MIGRATION_ID)
-    alembic_runner.migrate_down_one()
-
-    container = container_instance()
-    mock_storage = MagicMock(spec=S3Service)
-    mock_storage.bucket = "test-bucket42"
-
-    # If we can't parse the key from the URL, the migration should fail
-    with alembic_engine.connect() as connection:
-        create_cachedmarcfile(connection, "http://s3.amazonaws.com/test-bucket/1.mrc")
-
-    with pytest.raises(RuntimeError) as excinfo, container.storage.public.override(
-        mock_storage
-    ):
-        alembic_runner.migrate_up_one()
-
-    assert "Unexpected URL format" in str(excinfo.value)
-
-
 def test_migration_bucket_url_different(
     alembic_runner: MigrationContext,
     alembic_engine: Engine,
@@ -188,7 +163,13 @@ def test_migration_success(
             lane_id=lane_id,
             url=url2,
         )
-        url3 = "http://test-bucket.custom-domain.com/3.mrc"
+        create_cachedmarcfile(
+            connection,
+            library_id=library_id,
+            lane_id=lane_id,
+            url=None,
+        )
+        url3 = "https://test-bucket.s3.us-west-2.amazonaws.com/test-1/2023-02-17%2006%3A38%3A01.837167%2B00%3A00-2023-03-21%2005%3A41%3A28.262257%2B00%3A00/Fiction.mrc"
         create_cachedmarcfile(
             connection,
             library_id=library_id,
@@ -216,12 +197,14 @@ def test_migration_success(
     assert mock_storage.delete.call_args_list == [
         call("1.mrc"),
         call("2.mrc"),
-        call("3.mrc"),
+        call(
+            "test-1/2023-02-17 06:38:01.837167+00:00-2023-03-21 05:41:28.262257+00:00/Fiction.mrc"
+        ),
     ]
 
     # But the representations and coveragerecords should still be there
     with alembic_engine.connect() as connection:
-        assert connection.execute("SELECT id FROM representations").rowcount == 4
+        assert connection.execute("SELECT id FROM representations").rowcount == 5
         assert connection.execute("SELECT id FROM coveragerecords").rowcount == 2
 
     # The next migration takes care of those