From 24f528a4b430a1e8739dd0ae5dc82689c3c17623 Mon Sep 17 00:00:00 2001 From: lchen-2101 <73617864+lchen-2101@users.noreply.github.com> Date: Wed, 23 Oct 2024 10:02:37 -0400 Subject: [PATCH] stats for parquet --- src/regtech_data_validator/validator.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/regtech_data_validator/validator.py b/src/regtech_data_validator/validator.py index 455e36f..c0ea76a 100644 --- a/src/regtech_data_validator/validator.py +++ b/src/regtech_data_validator/validator.py @@ -18,6 +18,8 @@ import shutil import os import boto3.session +from datetime import datetime +import psutil from regtech_data_validator.phase_validations import ( get_phase_1_schema_for_lei, @@ -177,6 +179,8 @@ def validate_batch_parquet( batch_count: int = 1, max_errors=1000000, ): + + start = datetime.now() has_syntax_errors = False syntax_schema = get_phase_1_schema_for_lei(context) syntax_checks = [check for col_schema in syntax_schema.columns.values() for check in col_schema.checks] @@ -216,6 +220,9 @@ def validate_batch_parquet( ): yield validation_results + print(f"Total time parquet: {(datetime.now() - start).total_seconds()} seconds") + print(f"Total Memory: {psutil.Process(os.getpid()).memory_info().rss / (1024*1024)}MB") + # This function is a Generator, and will yield the results of each batch of processing, along with the # phase (SYNTACTICAL/LOGICAL) that the findings were found. Callers of this function will want to @@ -227,8 +234,6 @@ def validate_batch_csv( batch_count: int = 1, max_errors=1000000, ): - from datetime import datetime - import psutil start = datetime.now() has_syntax_errors = False