From 4dbdc6261e4c60d85f53440ae7085ba750b81eac Mon Sep 17 00:00:00 2001
From: Stephen Astels
Date: Tue, 19 Mar 2024 16:20:04 -0400
Subject: [PATCH] delete from s3 in subbatches

---
 app/aws/s3.py            |  9 ++++++---
 tests/app/aws/test_s3.py | 30 +++++++++++++++++++++++++++++-
 2 files changed, 35 insertions(+), 4 deletions(-)

diff --git a/app/aws/s3.py b/app/aws/s3.py
index 2e8b283904..da9e9450ad 100644
--- a/app/aws/s3.py
+++ b/app/aws/s3.py
@@ -60,10 +60,13 @@ def get_job_metadata_from_s3(service_id, job_id):
     return obj.get()["Metadata"]
 
 
-def remove_jobs_from_s3(jobs):
+# AWS has a limit of 1000 objects per delete_objects call
+def remove_jobs_from_s3(jobs, batch_size=1000):
     bucket = resource("s3").Bucket(current_app.config["CSV_UPLOAD_BUCKET_NAME"])
-    object_keys = [FILE_LOCATION_STRUCTURE.format(job.service_id, job.id) for job in jobs]
-    bucket.delete_objects(Delete={"Objects": [{"Key": key} for key in object_keys]})
+
+    for start in range(0, len(jobs), batch_size):
+        object_keys = [FILE_LOCATION_STRUCTURE.format(job.service_id, job.id) for job in jobs[start : start + batch_size]]
+        bucket.delete_objects(Delete={"Objects": [{"Key": key} for key in object_keys]})
 
 
 def get_s3_bucket_objects(bucket_name, subfolder="", older_than=7, limit_days=2):
diff --git a/tests/app/aws/test_s3.py b/tests/app/aws/test_s3.py
index bae56c3f45..02de33cbba 100644
--- a/tests/app/aws/test_s3.py
+++ b/tests/app/aws/test_s3.py
@@ -1,6 +1,6 @@
 import uuid
 from datetime import datetime, timedelta
-from unittest.mock import call
+from unittest.mock import Mock, call
 
 import pytest
 import pytz
@@ -12,6 +12,7 @@
     get_list_of_files_by_suffix,
     get_s3_bucket_objects,
     get_s3_file,
+    remove_jobs_from_s3,
     remove_transformed_dvla_file,
     upload_job_to_s3,
 )
@@ -214,3 +215,30 @@ def test_upload_job_to_s3(notify_api, mocker):
         bucket_name=current_app.config["CSV_UPLOAD_BUCKET_NAME"],
         file_location=f"service-{service_id}-notify/{upload_id}.csv",
     )
+
+
+def test_remove_jobs_from_s3(notify_api, mocker):
+    mock = Mock()
+    mocker.patch("app.aws.s3.resource", return_value=mock)
+    jobs = [
+        type("Job", (object,), {"service_id": "foo", "id": "j1"}),
+        type("Job", (object,), {"service_id": "foo", "id": "j2"}),
+        type("Job", (object,), {"service_id": "foo", "id": "j3"}),
+        type("Job", (object,), {"service_id": "foo", "id": "j4"}),
+        type("Job", (object,), {"service_id": "foo", "id": "j5"}),
+    ]
+
+    remove_jobs_from_s3(jobs, batch_size=2)
+
+    mock.assert_has_calls(
+        [
+            call.Bucket(current_app.config["CSV_UPLOAD_BUCKET_NAME"]),
+            call.Bucket().delete_objects(
+                Delete={"Objects": [{"Key": "service-foo-notify/j1.csv"}, {"Key": "service-foo-notify/j2.csv"}]}
+            ),
+            call.Bucket().delete_objects(
+                Delete={"Objects": [{"Key": "service-foo-notify/j3.csv"}, {"Key": "service-foo-notify/j4.csv"}]}
+            ),
+            call.Bucket().delete_objects(Delete={"Objects": [{"Key": "service-foo-notify/j5.csv"}]}),
+        ]
+    )