diff --git a/scripts/find_collated_zips.py b/scripts/find_collated_zips.py
index b683e491..005494bf 100644
--- a/scripts/find_collated_zips.py
+++ b/scripts/find_collated_zips.py
@@ -30,9 +30,10 @@
 import re
 
 
-def get_zipfile_list(bucket_name, access_key, secret_key, s3_host, get_contents_metadata):
+def get_key_lists(bucket_name, access_key, secret_key, s3_host, get_contents_metadata):
     zipfile_list = []
     contents_list = []
+    all_keys_list = []
     s3 = bm.get_resource(access_key, secret_key, s3_host)
     s3_client = bm.get_client(access_key, secret_key, s3_host)
     bucket = s3.Bucket(bucket_name)
@@ -41,21 +42,31 @@ def get_zipfile_list(bucket_name, access_key, secret_key, s3_host, get_contents_
     paginator = s3_client.get_paginator('list_objects_v2')
     page_iterator = paginator.paginate(Bucket=bucket_name)
-
+    key_count = 0
+    zipfile_count = 0
     for page in page_iterator:
         if 'Contents' in page:
             for obj in page['Contents']:
+                key_count += 1
                 key = obj['Key']
                 if pattern.match(key):
+                    zipfile_count += 1
                     zipfile_list.append(key)
                     if get_contents_metadata:
                         contents = bucket.Object(key).get()['Metadata']['zip-contents'].split(',')
-                        print(f'{key}: {contents}')
-                    else:
-                        print(f'{key}')
-
-
-    return zipfile_list, contents
+                        contents_list.append(np.array(contents))
+                        # print(f'{key}: {contents}')
+                    # else:
+                else:
+                    all_keys_list.append(key)
+        print(f'Keys found: {key_count}, Zip files found: {zipfile_count}', end='\r')
+        if key_count >= 10000:
+            break
+    return np.array(zipfile_list), np.array(contents_list, dtype=object), np.array(all_keys_list)
+
+def verify_zip_contents(bucket_name, access_key, secret_key, s3_host, zipfile_df, all_keys_df):
+    print(zipfile_df)
+
 
 
 def main():
     epilog = ''
@@ -70,7 +81,8 @@ def error(self, message):
         formatter_class=argparse.RawDescriptionHelpFormatter,
     )
     parser.add_argument('--bucket-name','-b', type=str, help='Name of the S3 bucket.', required=True)
-    parser.add_argument('--list-contents','-l', action='store_true', help='List the contents of the zip files from metadata exist in the bucket.')
+    parser.add_argument('--list-contents','-l', action='store_true', help='List the contents of the zip files.')
+    parser.add_argument('--verify-contents','-v', action='store_true', help='Verify the contents of the zip files from metadata exist in the bucket.')
 
     args = parser.parse_args()
     bucket_name = args.bucket_name
@@ -78,6 +90,10 @@
         list_contents = True
     else:
         list_contents = False
+    if args.verify_contents:
+        verify_contents = True
+    else:
+        verify_contents = False
 
     # Setup bucket object
     s3_host = 'echo.stfc.ac.uk'
@@ -96,14 +112,25 @@
         print(f'Bucket {bucket_name} not found in {s3_host}.')
         sys.exit()
 
-    zipfile_list, zipfile_contents = get_zipfile_list(bucket_name, access_key, secret_key, s3_host, list_contents)
+    zipfiles, zipfile_contents, all_keys = get_key_lists(bucket_name, access_key, secret_key, s3_host, list_contents)
 
     if list_contents:
         for i, contents in enumerate(zipfile_contents):
-            print(f'Zip file: {zipfile_list[i]}, {contents}')
-    else:
-        for zipfile in zipfile_list:
-            print(zipfile)
+            print(f'Zip file: {zipfiles[i]}, {contents}')
+    # else:
+    #     for zipfile in zipfiles:
+    #         print(zipfile)
+
+    if verify_contents:
+        print('Verifying zip file contents...')
+        zipfiles = np.array(zipfiles)
+        zipfile_contents = np.array(zipfile_contents, dtype=object)
+        print(zipfiles.shape, zipfile_contents.shape)
+        zipfile_df = pd.DataFrame(np.array([zipfiles, zipfile_contents], dtype=object).reshape(2,len(zipfiles)).T, columns=['zipfile','contents'])
+        del zipfiles, zipfile_contents
+        all_keys_df = pd.DataFrame(all_keys, columns=['key'])
+        del all_keys
+        verify_zip_contents(bucket_name, access_key, secret_key, s3_host, zipfile_df, all_keys_df)
 
 if __name__ == '__main__':
     main()
\ No newline at end of file