From 51179afd5b3557913a678a9fe9e0445a0b435fe3 Mon Sep 17 00:00:00 2001 From: Kyle Burton Date: Wed, 31 May 2023 10:49:20 -0500 Subject: [PATCH 1/6] Logs progress on validation --- .pre-commit-config.yaml | 2 +- scripts/validate.py | 14 +++++++++++++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index c54bed54..fd978026 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -12,6 +12,6 @@ repos: - id: no-commit-to-branch args: [--branch, develop, --branch, master, --pattern, release/.*] - repo: https://github.com/psf/black - rev: 20.8b1 + rev: 22.3.0 hooks: - id: black diff --git a/scripts/validate.py b/scripts/validate.py index 8e4db22e..07953e3a 100644 --- a/scripts/validate.py +++ b/scripts/validate.py @@ -1,4 +1,6 @@ import json +import math + import boto3 import botocore from cdislogging import get_logger @@ -19,6 +21,15 @@ def resume_logger(filename=None): logger = get_logger("Validation", filename) +def log_file_progress(manifest_idx, current_file, total_files): + progress = (current_file / total_files) * 100 + progress = math.floor(progress / 10) * 10 # Round down to nearest 10 + if progress % 10 == 0: + logger.info( + f"Manifest #: {manifest_idx} - Processing {current_file}/{total_files} files ({progress}% complete)" + ) + + def run(global_config): """ Given manifests run validation process to check if all the objects exist and are indexed correctly @@ -115,7 +126,7 @@ def run(global_config): manifest_file = manifest_file.strip() files = utils.get_fileinfo_list_from_s3_manifest(manifest_file) fail_list = [] - for fi in files: + for i, fi in enumerate(files): del fi["url"] fi["aws_url"], fi["gs_url"], fi["indexd_url"] = None, None, None @@ -181,6 +192,7 @@ def run(global_config): fi["id"], fi["gs_url"], fi["indexd_url"] ) ) + log_file_progress(idx, i, len(files)) if total_gs_index_failures + total_gs_copy_failures == 0: logger.info( From 024836c1b52988137bd5e35a1947e563fb1ff1f1 Mon Sep 17 00:00:00 2001 From: MaribelleHGomez Date: Mon, 17 Jul 2023 09:41:25 -0500 Subject: [PATCH 2/6] small changes --- scripts/validate.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/scripts/validate.py b/scripts/validate.py index 8e4db22e..f0aaba29 100644 --- a/scripts/validate.py +++ b/scripts/validate.py @@ -1,10 +1,11 @@ import json import boto3 import botocore -from cdislogging import get_logger + from urllib.parse import urlparse -from indexclient.client import IndexClient +from cdislogging import get_logger +from scripts import file_utils from scripts import utils from scripts.errors import UserError @@ -127,6 +128,7 @@ def run(global_config): logger.error("There is no indexd record for {}".format(fi["id"])) # validate aws + # TO NOTE: doing extra checks here because of logic of open prod accounts for buckets aws_bucket = utils.get_aws_bucket_name(fi, PROJECT_ACL) object_path = "{}/{}/{}".format(aws_bucket, fi["id"], fi["file_name"]) object_path_2 = "{}/{}/{}".format( @@ -291,12 +293,14 @@ def run(global_config): return pass_validation -def _pass_preliminary_check(FORCE_CREATE_MANIFEST, manifest_files): +def _pass_preliminary_check(manifest_files: str, FORCE_CREATE_MANIFEST=False): """ Check if manifests are in the manifest bucket - - 'FORCE_CREATE_MANIFEST': True, False command arg parameter - 'manifest_files': 's3://input/active_manifest.tsv, s3://input/legacy_manifest.tsv' + Args: + manifest_files(str): + Expression to match (for example: >5, ==3, ==this_guid) + Returns: + None """ session = boto3.session.Session() From 7b4ec89740a35cf8c8bedcb236b9fa75af371b43 Mon Sep 17 00:00:00 2001 From: MaribelleHGomez Date: Mon, 17 Jul 2023 09:51:31 -0500 Subject: [PATCH 3/6] hold off on adding utils --- scripts/validate.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/validate.py b/scripts/validate.py index 651c6976..77b88d15 100644 --- a/scripts/validate.py +++ b/scripts/validate.py @@ -7,7 +7,8 @@ from urllib.parse import urlparse from cdislogging import get_logger -from scripts import file_utils + +# from scripts import file_utils from scripts import utils from scripts.errors import UserError From 4ea3c78a8f5c5a316ee25cdf910c40537f85f8c8 Mon Sep 17 00:00:00 2001 From: MaribelleHGomez Date: Tue, 25 Jul 2023 12:17:52 -0500 Subject: [PATCH 4/6] print(bucket --- scripts/validate.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/validate.py b/scripts/validate.py index 77b88d15..2e3f36a5 100644 --- a/scripts/validate.py +++ b/scripts/validate.py @@ -324,6 +324,7 @@ def _pass_preliminary_check(manifest_files: str, FORCE_CREATE_MANIFEST=False): parsed = urlparse(url) bucket_name = parsed.netloc key = parsed.path.strip("/") + print(bucket_name) s3.meta.client.head_object(Bucket=bucket_name, Key=key) except botocore.exceptions.ClientError as e: error_code = int(e.response["Error"]["Code"]) From 65edde26e06140830610d7ba03ce44832b9eb055 Mon Sep 17 00:00:00 2001 From: MaribelleHGomez Date: Tue, 25 Jul 2023 12:24:48 -0500 Subject: [PATCH 5/6] properly this time --- scripts/validate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/validate.py b/scripts/validate.py index 2e3f36a5..af0e4e06 100644 --- a/scripts/validate.py +++ b/scripts/validate.py @@ -324,7 +324,7 @@ def _pass_preliminary_check(manifest_files: str, FORCE_CREATE_MANIFEST=False): parsed = urlparse(url) bucket_name = parsed.netloc key = parsed.path.strip("/") - print(bucket_name) + logger.info(f"Bucket name: {bucket_name}") s3.meta.client.head_object(Bucket=bucket_name, Key=key) except botocore.exceptions.ClientError as e: error_code = int(e.response["Error"]["Code"]) From 8bfec76a5de360cc891c751171550550edb4f158 Mon Sep 17 00:00:00 2001 From: MaribelleHGomez Date: Tue, 25 Jul 2023 12:33:07 -0500 Subject: [PATCH 6/6] fix param --- scripts/validate.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/validate.py b/scripts/validate.py index af0e4e06..581b79cc 100644 --- a/scripts/validate.py +++ b/scripts/validate.py @@ -83,7 +83,7 @@ def run(global_config): "number of output manifests and number of manifest_files are not the same" ) - if not _pass_preliminary_check(FORCE_CREATE_MANIFEST, manifest_files): + if not _pass_preliminary_check(manifest_files, FORCE_CREATE_MANIFEST): raise UserError("The input does not pass the preliminary check") logger.info("scan all copied objects") @@ -321,6 +321,7 @@ def _pass_preliminary_check(manifest_files: str, FORCE_CREATE_MANIFEST=False): for url in manifest_files: try: + logger.info(f"{manifest_files}, {url}") parsed = urlparse(url) bucket_name = parsed.netloc key = parsed.path.strip("/")