Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: correctly offset the index for batched validation #280

Merged
merged 4 commits into from
Nov 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions src/regtech_data_validator/validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,7 @@ def validate(

if check_output is not None:
# Filter data not associated with failed Check, and update index for merging with findings_df
check_output = check_output.with_columns(pl.col('index').add(row_start))
failed_records_df = _filter_valid_records(submission_df, check_output, fields)
failed_record_fields_df = _records_to_fields(failed_records_df)
findings = _add_validation_metadata(failed_record_fields_df, check)
Expand All @@ -133,16 +134,16 @@ def validate(
if check_findings:
findings_df = pl.concat(check_findings)

updated_df = add_uid(findings_df, submission_df)
updated_df = add_uid(findings_df, submission_df, row_start)
return updated_df


# Add the uid for the record throwing the error/warning to the error dataframe
def add_uid(results_df: pl.DataFrame, submission_df: pl.DataFrame, offset: int = 0) -> pl.DataFrame:
    """Attach the ``uid`` of each offending submission row to the findings.

    Args:
        results_df: Findings frame with a ``record_no`` column. The values are
            shifted by ``- 1 - offset`` to obtain row positions, so they are
            presumably 1-based record numbers that already include the batch
            offset -- confirm against the caller.
        submission_df: The submission rows for the batch being validated;
            must contain a ``uid`` column.
        offset: Amount subtracted from the record numbers so they become row
            positions inside ``submission_df``. Defaults to ``0`` so existing
            two-argument calls keep working.

    Returns:
        ``results_df`` unchanged when it is empty; otherwise ``results_df``
        with a ``uid`` column gathered from ``submission_df``.
    """
    if results_df.is_empty():
        return results_df

    # Translate record numbers into positions local to this batch before
    # gathering; without the offset the lookup would index past the batch.
    uid_records = results_df['record_no'] - 1 - offset
    return results_df.with_columns(submission_df['uid'].gather(uid_records).alias('uid'))

Expand Down
3 changes: 2 additions & 1 deletion tests/test_sample_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,8 @@ def test_all_logic_errors(self):
vresults = []
for vresult in validate_batch_csv(ALL_LOGIC_ERRORS):
vresults.append(vresult)

# 3 phases
assert len(vresults) == 3
results = pl.concat([vr.findings for vr in vresults], how="diagonal")

logic_schema = get_phase_2_schema_for_lei()
Expand Down
Loading