From de26c42174fcab9651e1c7bd15541243459d0ac7 Mon Sep 17 00:00:00 2001
From: lchen <73617864+lchen-2101@users.noreply.github.com>
Date: Thu, 14 Nov 2024 10:36:42 -0800
Subject: [PATCH 1/3] fix: correctly offset the index for batched validation

---
 src/regtech_data_validator/validator.py |  7 +++---
 tests/test_sample_data.py               | 29 ++++++++++++++++++++++++-
 2 files changed, 32 insertions(+), 4 deletions(-)

diff --git a/src/regtech_data_validator/validator.py b/src/regtech_data_validator/validator.py
index 917f62a..a32c309 100644
--- a/src/regtech_data_validator/validator.py
+++ b/src/regtech_data_validator/validator.py
@@ -121,6 +121,7 @@ def validate(
 
                 if check_output is not None:
                     # Filter data not associated with failed Check, and update index for merging with findings_df
+                    check_output = check_output.with_columns(pl.col('index').add(row_start))
                     failed_records_df = _filter_valid_records(submission_df, check_output, fields)
                     failed_record_fields_df = _records_to_fields(failed_records_df)
                     findings = _add_validation_metadata(failed_record_fields_df, check)
@@ -133,16 +134,16 @@ def validate(
             if check_findings:
                 findings_df = pl.concat(check_findings)
 
-    updated_df = add_uid(findings_df, submission_df)
+    updated_df = add_uid(findings_df, submission_df, row_start)
     return updated_df
 
 
 # Add the uid for the record throwing the error/warning to the error dataframe
-def add_uid(results_df: pl.DataFrame, submission_df: pl.DataFrame) -> pl.DataFrame:
+def add_uid(results_df: pl.DataFrame, submission_df: pl.DataFrame, offset: int) -> pl.DataFrame:
     if results_df.is_empty():
         return results_df
 
-    uid_records = results_df['record_no'] - 1
+    uid_records = results_df['record_no'] - 1 - offset
     results_df = results_df.with_columns(submission_df['uid'].gather(uid_records).alias('uid'))
     return results_df
 
diff --git a/tests/test_sample_data.py b/tests/test_sample_data.py
index 3033a50..cac9cf6 100644
--- a/tests/test_sample_data.py
+++ b/tests/test_sample_data.py
@@ -44,7 +44,8 @@ def test_all_logic_errors(self):
         vresults = []
         for vresult in validate_batch_csv(ALL_LOGIC_ERRORS):
             vresults.append(vresult)
-
+        # 3 phases
+        assert len(vresults) == 3
         results = pl.concat([vr.findings for vr in vresults], how="diagonal")
 
         logic_schema = get_phase_2_schema_for_lei()
@@ -85,3 +86,29 @@ def test_all_logic_warnings(self):
         # check that the findings validation_id Series contains at least 1 of every logic warning check id
         assert len(set(results['validation_id'].to_list()).difference(set(logic_checks))) == 0
         assert results.select(pl.col('phase').eq(ValidationPhase.LOGICAL.value).all()).item()
+
+    def test_all_logic_errors_batched(self):
+        vresults = []
+        for vresult in validate_batch_csv(ALL_LOGIC_ERRORS, batch_size=3):
+            vresults.append(vresult)
+        # 3 phases with 3 batches
+        assert len(vresults) == 9
+        results = pl.concat([vr.findings for vr in vresults], how="diagonal")
+
+        logic_schema = get_phase_2_schema_for_lei()
+        register_schema = get_register_schema()
+        logic_checks = [
+            check.title
+            for col_schema in logic_schema.columns.values()
+            for check in col_schema.checks
+            if check.severity == Severity.ERROR
+        ]
+        logic_checks.extend(
+            [check.title for col_schema in register_schema.columns.values() for check in col_schema.checks]
+        )
+
+        results = results.filter(pl.col('validation_type') == 'Error')
+
+        # check that the findings validation_id Series contains at least 1 of every logic error check id
+        assert len(set(results['validation_id'].to_list()).difference(set(logic_checks))) == 0
+        assert results.select(pl.col('phase').eq(ValidationPhase.LOGICAL.value).all()).item()
\ No newline at end of file

From 02b303d26cf4a713ad7e723168af144ad40ba0d6 Mon Sep 17 00:00:00 2001
From: lchen <73617864+lchen-2101@users.noreply.github.com>
Date: Thu, 14 Nov 2024 10:38:49 -0800
Subject: [PATCH 2/3] fix: add missing newline at end of test file (linter)

---
 tests/test_sample_data.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_sample_data.py b/tests/test_sample_data.py
index cac9cf6..0b15d5e 100644
--- a/tests/test_sample_data.py
+++ b/tests/test_sample_data.py
@@ -111,4 +111,4 @@ def test_all_logic_errors_batched(self):
 
         # check that the findings validation_id Series contains at least 1 of every logic error check id
         assert len(set(results['validation_id'].to_list()).difference(set(logic_checks))) == 0
-        assert results.select(pl.col('phase').eq(ValidationPhase.LOGICAL.value).all()).item()
\ No newline at end of file
+        assert results.select(pl.col('phase').eq(ValidationPhase.LOGICAL.value).all()).item()

From a41f7b39640277c6c1d808913fbfbbba8f76b1a6 Mon Sep 17 00:00:00 2001
From: lchen <73617864+lchen-2101@users.noreply.github.com>
Date: Thu, 14 Nov 2024 11:39:59 -0800
Subject: [PATCH 3/3] test: remove batched test; polars csv batch reader no
 longer respects user-specified batch_size

---
 tests/test_sample_data.py | 26 --------------------------
 1 file changed, 26 deletions(-)

diff --git a/tests/test_sample_data.py b/tests/test_sample_data.py
index 0b15d5e..a56211b 100644
--- a/tests/test_sample_data.py
+++ b/tests/test_sample_data.py
@@ -86,29 +86,3 @@ def test_all_logic_warnings(self):
         # check that the findings validation_id Series contains at least 1 of every logic warning check id
         assert len(set(results['validation_id'].to_list()).difference(set(logic_checks))) == 0
         assert results.select(pl.col('phase').eq(ValidationPhase.LOGICAL.value).all()).item()
-
-    def test_all_logic_errors_batched(self):
-        vresults = []
-        for vresult in validate_batch_csv(ALL_LOGIC_ERRORS, batch_size=3):
-            vresults.append(vresult)
-        # 3 phases with 3 batches
-        assert len(vresults) == 9
-        results = pl.concat([vr.findings for vr in vresults], how="diagonal")
-
-        logic_schema = get_phase_2_schema_for_lei()
-        register_schema = get_register_schema()
-        logic_checks = [
-            check.title
-            for col_schema in logic_schema.columns.values()
-            for check in col_schema.checks
-            if check.severity == Severity.ERROR
-        ]
-        logic_checks.extend(
-            [check.title for col_schema in register_schema.columns.values() for check in col_schema.checks]
-        )
-
-        results = results.filter(pl.col('validation_type') == 'Error')
-
-        # check that the findings validation_id Series contains at least 1 of every logic error check id
-        assert len(set(results['validation_id'].to_list()).difference(set(logic_checks))) == 0
-        assert results.select(pl.col('phase').eq(ValidationPhase.LOGICAL.value).all()).item()