Skip to content

Commit

Permalink
Fix for e06f965879ab MARC file deletion migration (PP-773) (#1563)
Browse files Browse the repository at this point in the history
e06f965879ab was hard to test, since it involves both the DB and S3. After the migration ran on Minotaur and Cerberus, there are a couple changes needed:

-    The latest code puts the MARC file URL in Representation.mirror_url and Representation.url but older versions of the code only put the url in Representation.url. So older files were failing this migration because mirror_url is NULL. So its safe to just use URL for everything (verified this with a query against production DBs).
-    Some records have NULL in url (as well as mirror_url). I'm not sure how / when this happened. But there is no way we can delete the file in this case, so it doesn't fail the migration, just outputs a log message.
-    The URLs are stored URL escaped, and S3 keys are expected not to be escaped, so we have to call unquote on the keys.
-    Don't fail the migration if we can't parse the key out of the URL. Just log what happened, don't delete that file and move on.
  • Loading branch information
jonathangreen committed Jan 10, 2024
1 parent 8536f29 commit 88de5fb
Show file tree
Hide file tree
Showing 2 changed files with 31 additions and 42 deletions.
28 changes: 17 additions & 11 deletions alembic/versions/20231206_e06f965879ab_marc_s3_file_cleanup.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
"""
from typing import Optional
from urllib.parse import urlparse
from urllib.parse import unquote, urlparse

from alembic import op
from core.migration.util import migration_logger
Expand All @@ -19,28 +19,33 @@
depends_on = None


def parse_key_from_url(url: str, bucket: str) -> Optional[str]:
def parse_key_from_url(url: Optional[str], bucket: str) -> Optional[str]:
"""Parse the key from a URL.
:param url: The URL to parse.
:return: The key, or None if the URL is not a valid S3 URL.
"""
if url is None:
return None

parsed_url = urlparse(url)

if f"/{bucket}/" in parsed_url.path:
return parsed_url.path.split(f"/{bucket}/", 1)[1]

if bucket in parsed_url.netloc:
return parsed_url.path.lstrip("/")
key = parsed_url.path.split(f"/{bucket}/", 1)[1]
elif bucket in parsed_url.netloc:
key = parsed_url.path.lstrip("/")
else:
return None

return None
# The key stored in the DB is URL encoded, so we need to decode it
return unquote(key)


def upgrade() -> None:
# Before removing the cachedmarcfiles table, we want to clean up
# the cachedmarcfiles stored in s3.
#
# Note: if you are running this migration on a development system and you want
# Note: if you are running this migration on a development system, and you want
# to skip deleting these files you can just comment out the migration code below.
services = container_instance()
public_s3 = services.storage.public()
Expand All @@ -49,7 +54,7 @@ def upgrade() -> None:
# Check if there are any cachedmarcfiles in s3
connection = op.get_bind()
cached_files = connection.execute(
"SELECT r.mirror_url FROM cachedmarcfiles cmf JOIN representations r ON cmf.representation_id = r.id"
"SELECT r.url FROM cachedmarcfiles cmf JOIN representations r ON cmf.representation_id = r.id"
).all()
if public_s3 is None and len(cached_files) > 0:
raise RuntimeError(
Expand All @@ -58,11 +63,12 @@ def upgrade() -> None:

keys_to_delete = []
for cached_file in cached_files:
url = cached_file.mirror_url
url = cached_file.url
bucket = public_s3.bucket
key = parse_key_from_url(url, bucket)
if key is None:
raise RuntimeError(f"Unexpected URL format: {url} (bucket: {bucket})")
log.info(f"Skipping cachedmarcfile with invalid URL: {url}")
continue
generated_url = public_s3.generate_url(key)
if generated_url != url:
raise RuntimeError(f"URL mismatch: {url} != {generated_url}")
Expand Down
45 changes: 14 additions & 31 deletions tests/migration/test_20231206_e06f965879ab.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ class CreateCachedMarcFile:
def __call__(
self,
connection: Connection,
url: str,
url: Optional[str],
library_id: Optional[int] = None,
lane_id: Optional[int] = None,
) -> Tuple[int, int]:
Expand All @@ -43,9 +43,9 @@ def __call__(

return representation_id, file_id

def representation(self, connection: Connection, url: str) -> int:
def representation(self, connection: Connection, url: Optional[str]) -> int:
row = connection.execute(
"INSERT INTO representations (media_type, mirror_url) "
"INSERT INTO representations (media_type, url) "
"VALUES ('application/marc', %s) returning id",
url,
).first()
Expand Down Expand Up @@ -104,31 +104,6 @@ def test_migration_no_s3_integration(
)


def test_migration_bucket_url_not_found(
alembic_runner: MigrationContext,
alembic_engine: Engine,
create_cachedmarcfile: CreateCachedMarcFile,
caplog: LogCaptureFixture,
) -> None:
alembic_runner.migrate_down_to(MIGRATION_ID)
alembic_runner.migrate_down_one()

container = container_instance()
mock_storage = MagicMock(spec=S3Service)
mock_storage.bucket = "test-bucket42"

# If we can't parse the key from the URL, the migration should fail
with alembic_engine.connect() as connection:
create_cachedmarcfile(connection, "http://s3.amazonaws.com/test-bucket/1.mrc")

with pytest.raises(RuntimeError) as excinfo, container.storage.public.override(
mock_storage
):
alembic_runner.migrate_up_one()

assert "Unexpected URL format" in str(excinfo.value)


def test_migration_bucket_url_different(
alembic_runner: MigrationContext,
alembic_engine: Engine,
Expand Down Expand Up @@ -188,7 +163,13 @@ def test_migration_success(
lane_id=lane_id,
url=url2,
)
url3 = "http://test-bucket.custom-domain.com/3.mrc"
create_cachedmarcfile(
connection,
library_id=library_id,
lane_id=lane_id,
url=None,
)
url3 = "https://test-bucket.s3.us-west-2.amazonaws.com/test-1/2023-02-17%2006%3A38%3A01.837167%2B00%3A00-2023-03-21%2005%3A41%3A28.262257%2B00%3A00/Fiction.mrc"
create_cachedmarcfile(
connection,
library_id=library_id,
Expand Down Expand Up @@ -216,12 +197,14 @@ def test_migration_success(
assert mock_storage.delete.call_args_list == [
call("1.mrc"),
call("2.mrc"),
call("3.mrc"),
call(
"test-1/2023-02-17 06:38:01.837167+00:00-2023-03-21 05:41:28.262257+00:00/Fiction.mrc"
),
]

# But the representations and coveragerecords should still be there
with alembic_engine.connect() as connection:
assert connection.execute("SELECT id FROM representations").rowcount == 4
assert connection.execute("SELECT id FROM representations").rowcount == 5
assert connection.execute("SELECT id FROM coveragerecords").rowcount == 2

# The next migration takes care of those
Expand Down

0 comments on commit 88de5fb

Please sign in to comment.