From 82e9f0fbe5d130cac5e4c5e487d4368180a42f12 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Am=C3=A9lie=20Rondot?= Date: Mon, 5 Feb 2024 17:51:15 +0100 Subject: [PATCH 01/15] TDD: add test case for ignore header case in validating resource with schema_sync option --- tests/validator/resource/test_schema.py | 34 +++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/tests/validator/resource/test_schema.py b/tests/validator/resource/test_schema.py index 616f0a2728..b05fe9f003 100644 --- a/tests/validator/resource/test_schema.py +++ b/tests/validator/resource/test_schema.py @@ -4,6 +4,7 @@ import frictionless from frictionless import Checklist, Detector, FrictionlessException, Schema, fields +from frictionless import Dialect from frictionless.resources import TableResource # General @@ -373,3 +374,36 @@ def test_resource_with_missing_required_header_with_schema_sync_is_true_issue_16 report.flatten(["rowNumber", "fieldNumber", "fieldName", "type"]) == tc["expected_flattened_report"] ) + + +def test_validate_resource_ignoring_header_case_issue_1635(): + schema_descriptor = { + "$schema": "https://frictionlessdata.io/schemas/table-schema.json", + "fields": [ + { + "name": "AA", + "title": "Field A", + "type": "string", + "constraints": {"required": True}, + }, + {"name": "BB", "title": "Field B", "type": "string"}, + {"name": "CC", "title": "Field C", "type": "string"}, + ], + } + + source = [["aa", "bb", "cc"], ["a", "b", "c"]] + report = frictionless.validate( + source=source, + schema=Schema.from_descriptor(schema_descriptor), + detector=Detector(schema_sync=True), + dialect=Dialect(header_case=False) + ) + assert report.valid + + report = frictionless.validate( + source=source, + schema=Schema.from_descriptor(schema_descriptor), + detector=Detector(schema_sync=True) + ) + assert not report.valid + assert (report.flatten(["rowNumber", "fieldNumber", "fieldName", "type"])) == [[None, 4, "AA", "missing-label"]] From f832751b45d661baab0cc4741124e26bf1bbbe90 
Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Am=C3=A9lie=20Rondot?= Date: Mon, 5 Feb 2024 17:52:10 +0100 Subject: [PATCH 02/15] TDD: solve new test case --- frictionless/detector/detector.py | 20 ++++++++++++++++---- frictionless/resources/table.py | 6 ++++-- 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/frictionless/detector/detector.py b/frictionless/detector/detector.py index dd656427d5..59d773a8b0 100644 --- a/frictionless/detector/detector.py +++ b/frictionless/detector/detector.py @@ -298,6 +298,7 @@ def detect_schema( labels: Optional[List[str]] = None, schema: Optional[Schema] = None, field_candidates: List[Dict[str, Any]] = settings.DEFAULT_FIELD_CANDIDATES, + **options: Any ) -> Schema: """Detect schema from fragment @@ -408,17 +409,28 @@ def detect_schema( if len(labels) != len(set(labels)): note = '"schema_sync" requires unique labels in the header' raise FrictionlessException(note) - mapping = {field.name: field for field in schema.fields} # type: ignore + if options["header_case"]: + mapping = {field.name: field for field in schema.fields} # type: ignore + else: + mapping = {field.name.lower(): field for field in schema.fields} # type: ignore schema.clear_fields() for name in labels: - field = mapping.get(name) + if options["header_case"]: + field = mapping.get(name) + else: + field = mapping.get(name.lower()) if not field: field = Field.from_descriptor({"name": name, "type": "any"}) schema.add_field(field) # For required fields that are missing for _, field in mapping.items(): - if field and field.required and field.name not in labels: - schema.add_field(field) + if options["header_case"]: + if field and field.required and field.name not in labels: + schema.add_field(field) + else: + if field and field.required and field.name.lower() not in [ + label.lower() for label in labels]: + schema.add_field(field) # Patch schema if self.schema_patch: diff --git a/frictionless/resources/table.py b/frictionless/resources/table.py index 
98a76f3473..4ba335b459 100644 --- a/frictionless/resources/table.py +++ b/frictionless/resources/table.py @@ -204,6 +204,7 @@ def __open_schema(self): labels=self.labels, schema=self.schema, field_candidates=system.detect_field_candidates(), + header_case=self.dialect.header_case ) self.stats.fields = len(self.schema.fields) @@ -388,14 +389,15 @@ def row_stream(): # NB: missing required labels are not included in the # field_info parameter used for row creation - if self.detector.schema_sync: + if self.detector.schema_sync and self.dialect.header_case: for field in self.schema.fields: if field.name not in self.labels and field.name in field_info["names"]: field_index = field_info["names"].index(field.name) del field_info["names"][field_index] del field_info["objects"][field_index] del field_info["mapping"][field.name] - # # Create row stream + + # Create row stream self.__row_stream = row_stream() # Read From 36b99ae4e29454edd5d97b561e1ae3469db1f7f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Am=C3=A9lie=20Rondot?= Date: Mon, 5 Feb 2024 17:55:38 +0100 Subject: [PATCH 03/15] Refacto: refactoring test_schema --- tests/validator/resource/test_schema.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/tests/validator/resource/test_schema.py b/tests/validator/resource/test_schema.py index b05fe9f003..e3db265894 100644 --- a/tests/validator/resource/test_schema.py +++ b/tests/validator/resource/test_schema.py @@ -392,18 +392,23 @@ def test_validate_resource_ignoring_header_case_issue_1635(): } source = [["aa", "bb", "cc"], ["a", "b", "c"]] + + schema = Schema.from_descriptor(schema_descriptor) + + detector = Detector(schema_sync=True) + report = frictionless.validate( - source=source, - schema=Schema.from_descriptor(schema_descriptor), - detector=Detector(schema_sync=True), + source, + schema=schema, + detector=detector, dialect=Dialect(header_case=False) ) assert report.valid report = frictionless.validate( - source=source, - 
schema=Schema.from_descriptor(schema_descriptor), - detector=Detector(schema_sync=True) + source, + schema=schema, + detector=detector, ) assert not report.valid assert (report.flatten(["rowNumber", "fieldNumber", "fieldName", "type"])) == [[None, 4, "AA", "missing-label"]] From dd5edc583ecaeb3eaa586334f9c01d69ca55af00 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Am=C3=A9lie=20Rondot?= Date: Mon, 5 Feb 2024 17:57:34 +0100 Subject: [PATCH 04/15] Linting: format files --- frictionless/detector/detector.py | 12 ++++++++---- frictionless/resources/table.py | 2 +- tests/validator/resource/test_schema.py | 15 +++++++-------- 3 files changed, 16 insertions(+), 13 deletions(-) diff --git a/frictionless/detector/detector.py b/frictionless/detector/detector.py index 59d773a8b0..7b53f9d23c 100644 --- a/frictionless/detector/detector.py +++ b/frictionless/detector/detector.py @@ -298,7 +298,7 @@ def detect_schema( labels: Optional[List[str]] = None, schema: Optional[Schema] = None, field_candidates: List[Dict[str, Any]] = settings.DEFAULT_FIELD_CANDIDATES, - **options: Any + **options: Any, ) -> Schema: """Detect schema from fragment @@ -412,7 +412,7 @@ def detect_schema( if options["header_case"]: mapping = {field.name: field for field in schema.fields} # type: ignore else: - mapping = {field.name.lower(): field for field in schema.fields} # type: ignore + mapping = {field.name.lower(): field for field in schema.fields} # type: ignore schema.clear_fields() for name in labels: if options["header_case"]: @@ -428,8 +428,12 @@ def detect_schema( if field and field.required and field.name not in labels: schema.add_field(field) else: - if field and field.required and field.name.lower() not in [ - label.lower() for label in labels]: + if ( + field + and field.required + and field.name.lower() + not in [label.lower() for label in labels] + ): schema.add_field(field) # Patch schema diff --git a/frictionless/resources/table.py b/frictionless/resources/table.py index 4ba335b459..b143529b33 
100644 --- a/frictionless/resources/table.py +++ b/frictionless/resources/table.py @@ -204,7 +204,7 @@ def __open_schema(self): labels=self.labels, schema=self.schema, field_candidates=system.detect_field_candidates(), - header_case=self.dialect.header_case + header_case=self.dialect.header_case, ) self.stats.fields = len(self.schema.fields) diff --git a/tests/validator/resource/test_schema.py b/tests/validator/resource/test_schema.py index e3db265894..cd954f551d 100644 --- a/tests/validator/resource/test_schema.py +++ b/tests/validator/resource/test_schema.py @@ -3,8 +3,8 @@ import pytest import frictionless -from frictionless import Checklist, Detector, FrictionlessException, Schema, fields -from frictionless import Dialect +from frictionless import Checklist, Detector, Dialect, FrictionlessException, Schema +from frictionless import fields from frictionless.resources import TableResource # General @@ -394,14 +394,11 @@ def test_validate_resource_ignoring_header_case_issue_1635(): source = [["aa", "bb", "cc"], ["a", "b", "c"]] schema = Schema.from_descriptor(schema_descriptor) - + detector = Detector(schema_sync=True) report = frictionless.validate( - source, - schema=schema, - detector=detector, - dialect=Dialect(header_case=False) + source, schema=schema, detector=detector, dialect=Dialect(header_case=False) ) assert report.valid @@ -411,4 +408,6 @@ def test_validate_resource_ignoring_header_case_issue_1635(): detector=detector, ) assert not report.valid - assert (report.flatten(["rowNumber", "fieldNumber", "fieldName", "type"])) == [[None, 4, "AA", "missing-label"]] + assert (report.flatten(["rowNumber", "fieldNumber", "fieldName", "type"])) == [ + [None, 4, "AA", "missing-label"] + ] From 125e4514c9cedbcc38350013cb3da409ba3122f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Am=C3=A9lie=20Rondot?= Date: Tue, 6 Feb 2024 09:59:38 +0100 Subject: [PATCH 05/15] TDD: add test case for missing label in tabular data and ignoring case: test fails --- 
tests/validator/resource/test_schema.py | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/tests/validator/resource/test_schema.py b/tests/validator/resource/test_schema.py index cd954f551d..380cb6da0c 100644 --- a/tests/validator/resource/test_schema.py +++ b/tests/validator/resource/test_schema.py @@ -369,12 +369,34 @@ def test_resource_with_missing_required_header_with_schema_sync_is_true_issue_16 tc["source"], schema=schema, detector=Detector(schema_sync=True) ) report = frictionless.validate(resource) - print(report.flatten(["rowNumber", "fieldNumber", "fieldName", "type"])) assert ( report.flatten(["rowNumber", "fieldNumber", "fieldName", "type"]) == tc["expected_flattened_report"] ) + # Ignore case + schema_descriptor_3 = { + "$schema": "https://frictionlessdata.io/schemas/table-schema.json", + "fields": [ + {"name": "Aa", "constraints": {"required": True}}, + {"name": "BB", "constraints": {"required": True}}, + {"name": "cc"} + ] + } + schema = Schema.from_descriptor(schema_descriptor_3) + source = [["bb", "CC"], ["foo", "bar"]] + report = frictionless.validate( + source, + schema=schema, + detector=Detector(schema_sync=True), + dialect=Dialect(header_case=False) + ) + assert not report.valid + # Expect one single error misisng-label related to missing column 'Aa' + assert (report.flatten(["rowNumber", "fieldNumber", "fieldName", "type"])) == [ + [None, 3, "Aa", "missing-label"] + ] + def test_validate_resource_ignoring_header_case_issue_1635(): schema_descriptor = { From a114d6c3b5ff6bcced05384c2fe28db04ee100f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Am=C3=A9lie=20Rondot?= Date: Tue, 6 Feb 2024 10:50:57 +0100 Subject: [PATCH 06/15] TDD: fix missing required field with 'header_case=False' dialect and 'schema_sync=True' options --- frictionless/resources/table.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/frictionless/resources/table.py b/frictionless/resources/table.py 
index b143529b33..a855f494bb 100644 --- a/frictionless/resources/table.py +++ b/frictionless/resources/table.py @@ -389,13 +389,21 @@ def row_stream(): # NB: missing required labels are not included in the # field_info parameter used for row creation - if self.detector.schema_sync and self.dialect.header_case: + if self.detector.schema_sync: for field in self.schema.fields: - if field.name not in self.labels and field.name in field_info["names"]: - field_index = field_info["names"].index(field.name) - del field_info["names"][field_index] - del field_info["objects"][field_index] - del field_info["mapping"][field.name] + if self.dialect.header_case: + if field.name not in self.labels and field.name in field_info["names"]: + field_index = field_info["names"].index(field.name) + del field_info["names"][field_index] + del field_info["objects"][field_index] + del field_info["mapping"][field.name] + else: # Ignore case + if field.name.lower() not in [label.lower() for label in self.labels] \ + and field.name.lower() in [field_info_name.lower() for field_info_name in field_info["names"]]: + field_index = field_info["names"].index(field.name) + del field_info["names"][field_index] + del field_info["objects"][field_index] + del field_info["mapping"][field.name] # Create row stream self.__row_stream = row_stream() From 227e386611bf206631b201016bf6488eacabb483 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Am=C3=A9lie=20Rondot?= Date: Tue, 6 Feb 2024 10:52:06 +0100 Subject: [PATCH 07/15] Linting: format files and sort imports --- frictionless/resources/table.py | 12 +++++++++--- tests/validator/resource/test_schema.py | 10 +++++----- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/frictionless/resources/table.py b/frictionless/resources/table.py index a855f494bb..03093cb989 100644 --- a/frictionless/resources/table.py +++ b/frictionless/resources/table.py @@ -392,14 +392,20 @@ def row_stream(): if self.detector.schema_sync: for field in self.schema.fields: if 
self.dialect.header_case: - if field.name not in self.labels and field.name in field_info["names"]: + if ( + field.name not in self.labels + and field.name in field_info["names"] + ): field_index = field_info["names"].index(field.name) del field_info["names"][field_index] del field_info["objects"][field_index] del field_info["mapping"][field.name] else: # Ignore case - if field.name.lower() not in [label.lower() for label in self.labels] \ - and field.name.lower() in [field_info_name.lower() for field_info_name in field_info["names"]]: + if field.name.lower() not in [ + label.lower() for label in self.labels + ] and field.name.lower() in [ + field_info_name.lower() for field_info_name in field_info["names"] + ]: field_index = field_info["names"].index(field.name) del field_info["names"][field_index] del field_info["objects"][field_index] diff --git a/tests/validator/resource/test_schema.py b/tests/validator/resource/test_schema.py index 380cb6da0c..d7aa9ff674 100644 --- a/tests/validator/resource/test_schema.py +++ b/tests/validator/resource/test_schema.py @@ -378,10 +378,10 @@ def test_resource_with_missing_required_header_with_schema_sync_is_true_issue_16 schema_descriptor_3 = { "$schema": "https://frictionlessdata.io/schemas/table-schema.json", "fields": [ - {"name": "Aa", "constraints": {"required": True}}, - {"name": "BB", "constraints": {"required": True}}, - {"name": "cc"} - ] + {"name": "Aa", "constraints": {"required": True}}, + {"name": "BB", "constraints": {"required": True}}, + {"name": "cc"}, + ], } schema = Schema.from_descriptor(schema_descriptor_3) source = [["bb", "CC"], ["foo", "bar"]] @@ -389,7 +389,7 @@ def test_resource_with_missing_required_header_with_schema_sync_is_true_issue_16 source, schema=schema, detector=Detector(schema_sync=True), - dialect=Dialect(header_case=False) + dialect=Dialect(header_case=False), ) assert not report.valid # Expect one single error misisng-label related to missing column 'Aa' From 
4431d951d9765b2a67530cf3c25e9de587e0ed92 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Am=C3=A9lie=20Rondot?= Date: Tue, 6 Feb 2024 11:04:25 +0100 Subject: [PATCH 08/15] Refacto: refactoring test --- tests/validator/resource/test_schema.py | 86 ++++++++++++------------- 1 file changed, 42 insertions(+), 44 deletions(-) diff --git a/tests/validator/resource/test_schema.py b/tests/validator/resource/test_schema.py index d7aa9ff674..69a3acd36b 100644 --- a/tests/validator/resource/test_schema.py +++ b/tests/validator/resource/test_schema.py @@ -374,62 +374,60 @@ def test_resource_with_missing_required_header_with_schema_sync_is_true_issue_16 == tc["expected_flattened_report"] ) - # Ignore case - schema_descriptor_3 = { - "$schema": "https://frictionlessdata.io/schemas/table-schema.json", - "fields": [ - {"name": "Aa", "constraints": {"required": True}}, - {"name": "BB", "constraints": {"required": True}}, - {"name": "cc"}, - ], - } - schema = Schema.from_descriptor(schema_descriptor_3) - source = [["bb", "CC"], ["foo", "bar"]] - report = frictionless.validate( - source, - schema=schema, - detector=Detector(schema_sync=True), - dialect=Dialect(header_case=False), - ) - assert not report.valid - # Expect one single error misisng-label related to missing column 'Aa' - assert (report.flatten(["rowNumber", "fieldNumber", "fieldName", "type"])) == [ - [None, 3, "Aa", "missing-label"] - ] - def test_validate_resource_ignoring_header_case_issue_1635(): schema_descriptor = { "$schema": "https://frictionlessdata.io/schemas/table-schema.json", "fields": [ { - "name": "AA", + "name": "aa", "title": "Field A", "type": "string", "constraints": {"required": True}, }, - {"name": "BB", "title": "Field B", "type": "string"}, + { + "name": "BB", + "title": "Field B", + "type": "string", + "constraints": {"required": True}, + }, {"name": "CC", "title": "Field C", "type": "string"}, ], } - source = [["aa", "bb", "cc"], ["a", "b", "c"]] - - schema = Schema.from_descriptor(schema_descriptor) - - 
detector = Detector(schema_sync=True) - - report = frictionless.validate( - source, schema=schema, detector=detector, dialect=Dialect(header_case=False) - ) - assert report.valid - - report = frictionless.validate( - source, - schema=schema, - detector=detector, - ) - assert not report.valid - assert (report.flatten(["rowNumber", "fieldNumber", "fieldName", "type"])) == [ - [None, 4, "AA", "missing-label"] + test_cases = [ + { + "source": [["AA", "bb", "cc"], ["a", "b", "c"]], + "dialect": Dialect(header_case=False), + "expected_valid_report": True, + "expected_flattened_report": [], + }, + { + "source": [["AA", "bb", "cc"], ["a", "b", "c"]], + "dialect": Dialect(header_case=True), + "expected_valid_report": False, + "expected_flattened_report": [ + [None, 4, "aa", "missing-label"], + [None, 5, "BB", "missing-label"], + ], + }, + { + "source": [["bb", "CC"], ["foo", "bar"]], + "dialect": Dialect(header_case=False), + "expected_valid_report": False, + "expected_flattened_report": [[None, 3, "aa", "missing-label"]], + }, ] + + for tc in test_cases: + resource = TableResource( + tc["source"], + schema=Schema.from_descriptor(schema_descriptor), + detector=Detector(schema_sync=True), + dialect=tc["dialect"], + ) + report = frictionless.validate(resource) + assert report.valid == tc["expected_valid_report"] + assert (report.flatten(["rowNumber", "fieldNumber", "fieldName", "type"])) == tc[ + "expected_flattened_report" + ] From 5107e53a577537d4627481b5241fa90bda00de83 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Am=C3=A9lie=20Rondot?= Date: Fri, 9 Feb 2024 15:02:06 +0100 Subject: [PATCH 09/15] Refacto: refactoring test --- tests/validator/resource/test_schema.py | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/tests/validator/resource/test_schema.py b/tests/validator/resource/test_schema.py index 69a3acd36b..458d25ca3b 100644 --- a/tests/validator/resource/test_schema.py +++ b/tests/validator/resource/test_schema.py @@ -391,31 +391,30 
@@ def test_validate_resource_ignoring_header_case_issue_1635(): "type": "string", "constraints": {"required": True}, }, - {"name": "CC", "title": "Field C", "type": "string"}, ], } test_cases = [ { - "source": [["AA", "bb", "cc"], ["a", "b", "c"]], - "dialect": Dialect(header_case=False), + "source": [["AA", "bb"], ["a", "b"]], + "header_case": False, "expected_valid_report": True, "expected_flattened_report": [], }, { - "source": [["AA", "bb", "cc"], ["a", "b", "c"]], - "dialect": Dialect(header_case=True), + "source": [["AA", "bb"], ["a", "b"]], + "header_case": True, "expected_valid_report": False, "expected_flattened_report": [ - [None, 4, "aa", "missing-label"], - [None, 5, "BB", "missing-label"], + [None, 3, "aa", "missing-label"], + [None, 4, "BB", "missing-label"], ], }, { - "source": [["bb", "CC"], ["foo", "bar"]], - "dialect": Dialect(header_case=False), + "source": [["bb"], ["foo"]], + "header_case": False, "expected_valid_report": False, - "expected_flattened_report": [[None, 3, "aa", "missing-label"]], + "expected_flattened_report": [[None, 2, "aa", "missing-label"]], }, ] @@ -424,7 +423,7 @@ def test_validate_resource_ignoring_header_case_issue_1635(): tc["source"], schema=Schema.from_descriptor(schema_descriptor), detector=Detector(schema_sync=True), - dialect=tc["dialect"], + dialect=Dialect(header_case=tc["header_case"]), ) report = frictionless.validate(resource) assert report.valid == tc["expected_valid_report"] From 2b9e6052b251a308590fc875be75a05d974629dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Am=C3=A9lie=20Rondot?= Date: Fri, 9 Feb 2024 16:10:58 +0100 Subject: [PATCH 10/15] Refacto: refactoring removing missing required label from field info part in '__open_row_stream()' 'TableResource' method --- frictionless/resources/table.py | 61 +++++++++++++++++++++------------ 1 file changed, 40 insertions(+), 21 deletions(-) diff --git a/frictionless/resources/table.py b/frictionless/resources/table.py index 03093cb989..89b9fcbc7d 100644 --- 
a/frictionless/resources/table.py +++ b/frictionless/resources/table.py @@ -5,6 +5,8 @@ import warnings from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union +from frictionless.schema.field import Field + from .. import errors, helpers, settings from ..analyzer import Analyzer from ..dialect import Dialect @@ -387,33 +389,50 @@ def row_stream(): # Yield row yield row - # NB: missing required labels are not included in the - # field_info parameter used for row creation if self.detector.schema_sync: + # Missing required labels are removed from the 'field_info' + # parameter used for row creation for field in self.schema.fields: - if self.dialect.header_case: - if ( - field.name not in self.labels - and field.name in field_info["names"] - ): - field_index = field_info["names"].index(field.name) - del field_info["names"][field_index] - del field_info["objects"][field_index] - del field_info["mapping"][field.name] - else: # Ignore case - if field.name.lower() not in [ - label.lower() for label in self.labels - ] and field.name.lower() in [ - field_info_name.lower() for field_info_name in field_info["names"] - ]: - field_index = field_info["names"].index(field.name) - del field_info["names"][field_index] - del field_info["objects"][field_index] - del field_info["mapping"][field.name] + self.remove_missing_required_label_from_field_info(field, field_info) # Create row stream self.__row_stream = row_stream() + def remove_missing_required_label_from_field_info( + self, field: Field, field_info: Dict[str, Any] + ): + is_case_sensitive = self.dialect.header_case + if self.field_is_missing( + field.name, field_info["names"], self.labels, is_case_sensitive + ): + self.remove_field_from_field_info(field.name, field_info) + + @staticmethod + def field_is_missing( + field_name: str, + expected_fields_names: List[str], + table_labels: types.ILabels, + case_sensitive: bool, + ) -> bool: + """Check if a schema field name is missing from the TableResource + labels.
+ """ + if not case_sensitive: + field_name = field_name.lower() + table_labels = [label.lower() for label in table_labels] + expected_fields_names = [ + field_name.lower() for field_name in expected_fields_names + ] + + return field_name not in table_labels and field_name in expected_fields_names + + @staticmethod + def remove_field_from_field_info(field_name: str, field_info: Dict[str, Any]): + field_index = field_info["names"].index(field_name) + del field_info["names"][field_index] + del field_info["objects"][field_index] + del field_info["mapping"][field_name] + # Read def read_cells(self, *, size: Optional[int] = None) -> List[List[Any]]: From 8ab835b094090a106aa79b46a8673733f57218de Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Am=C3=A9lie=20Rondot?= Date: Fri, 9 Feb 2024 17:41:43 +0100 Subject: [PATCH 11/15] Refacto: WIP --- frictionless/detector/detector.py | 96 ++++++++++++++++++++++--------- 1 file changed, 70 insertions(+), 26 deletions(-) diff --git a/frictionless/detector/detector.py b/frictionless/detector/detector.py index 7b53f9d23c..72eb2da7e5 100644 --- a/frictionless/detector/detector.py +++ b/frictionless/detector/detector.py @@ -3,7 +3,7 @@ import codecs import os from copy import copy, deepcopy -from typing import TYPE_CHECKING, Any, Dict, List, Optional +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union import attrs @@ -409,32 +409,21 @@ def detect_schema( if len(labels) != len(set(labels)): note = '"schema_sync" requires unique labels in the header' raise FrictionlessException(note) - if options["header_case"]: - mapping = {field.name: field for field in schema.fields} # type: ignore - else: - mapping = {field.name.lower(): field for field in schema.fields} # type: ignore - schema.clear_fields() - for name in labels: - if options["header_case"]: - field = mapping.get(name) - else: - field = mapping.get(name.lower()) - if not field: - field = Field.from_descriptor({"name": name, "type": "any"}) - schema.add_field(field) + + 
case_sensitive = options["header_case"] + + fields_mapped = self.mapped_schema_fields_names( + schema.fields, case_sensitive + ) + + self.add_fields_to_schema_among_labels( + fields_mapped, schema, labels, case_sensitive + ) + # For required fields that are missing - for _, field in mapping.items(): - if options["header_case"]: - if field and field.required and field.name not in labels: - schema.add_field(field) - else: - if ( - field - and field.required - and field.name.lower() - not in [label.lower() for label in labels] - ): - schema.add_field(field) + self.add_missing_required_labels_to_schema_fields( + fields_mapped, schema, labels, options["header_case"] + ) # Patch schema if self.schema_patch: @@ -449,3 +438,58 @@ def detect_schema( schema = Schema.from_descriptor(descriptor) return schema # type: ignore + + @staticmethod + def mapped_schema_fields_names( + fields: Union[List[None], List[Field]], case_sensitive: bool + ) -> Dict[str, Optional[Field]]: + """Create a dictionnary to map fields name with schema fields + considering case sensitivity + + Args: + fields (Union[List[None], List[Field]]): list of original + schema fields + case_sensitive (bool) + + Returns: + Dict[str, Optional[Field]] + """ + if case_sensitive: + return {field.name: field for field in fields} # type:ignore + else: + return {field.name.lower(): field for field in fields} # type: ignore + + @staticmethod + def add_fields_to_schema_among_labels( + fields_mapped: Dict[str, Optional[Field]], + schema: Schema, + labels: List[str], + case_sensitive: bool, + ): + schema.clear_fields() + for name in labels: + default_field = Field.from_descriptor({"name": name, "type": "any"}) + if case_sensitive: + field = fields_mapped.get(name, default_field) + else: + field = fields_mapped.get(name.lower(), default_field) + schema.add_field(field) # type: ignore + + @staticmethod + def add_missing_required_labels_to_schema_fields( + fields_map: Dict[str, Optional[Field]], + schema: Schema, + labels: 
List[str], + case_sensitive: bool, + ): + for _, field in fields_map.items(): + if case_sensitive: + if field and field.required and field.name not in labels: + schema.add_field(field) + else: + if ( + field + and field.required + and field.name.lower() not in [label.lower() for label in labels] + ): + schema.add_field(field) From 182d8c36f2cbbd9c917870ba553ebaad24cab464 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Am=C3=A9lie=20Rondot?= Date: Fri, 9 Feb 2024 17:47:57 +0100 Subject: [PATCH 12/15] Refacto: WIP2 --- frictionless/detector/detector.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/frictionless/detector/detector.py b/frictionless/detector/detector.py index 72eb2da7e5..10bfc8d267 100644 --- a/frictionless/detector/detector.py +++ b/frictionless/detector/detector.py @@ -483,6 +483,7 @@ def add_missing_required_labels_to_schema_fields( case_sensitive: bool, ): for _, field in fields_map.items(): + #TODO use self.field_name_not_in_labels if case_sensitive: if field and field.required and field.name not in labels: schema.add_field(field) @@ -493,3 +494,20 @@ def add_missing_required_labels_to_schema_fields( and field.name.lower() not in [label.lower() for label in labels] ): schema.add_field(field) + + @staticmethod + def field_name_not_in_labels( + field: Field, + labels: List[str], + case_sensitive: bool + ) -> bool: + if case_sensitive: + return field and field.required and field.name not in labels + else: + return ( + field + and field.required + and field.name.lower() not in [ + label.lower() for label in labels + ] + ) From 36210abb7e933231efe767e2f844ff4fd4d534f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Am=C3=A9lie=20Rondot?= Date: Sat, 10 Feb 2024 12:02:49 +0100 Subject: [PATCH 13/15] Refacto: refactoring creating schema fields among labels when using 'schema_sync' 'Detector' option --- frictionless/detector/detector.py | 53 +++++++++++++------------------ 1 file changed, 22 insertions(+), 31 deletions(-) diff --git 
a/frictionless/detector/detector.py b/frictionless/detector/detector.py index 10bfc8d267..31b02b87e7 100644 --- a/frictionless/detector/detector.py +++ b/frictionless/detector/detector.py @@ -3,7 +3,7 @@ import codecs import os from copy import copy, deepcopy -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union +from typing import TYPE_CHECKING, Any, Dict, List, Optional import attrs @@ -412,17 +412,21 @@ def detect_schema( case_sensitive = options["header_case"] - fields_mapped = self.mapped_schema_fields_names( - schema.fields, case_sensitive + assert schema + assert schema.fields + assert all(isinstance(field, Field) for field in schema.fields) + + mapped_fields = self.mapped_schema_fields_names( + schema.fields, case_sensitive # type: ignore ) self.add_fields_to_schema_among_labels( - fields_mapped, schema, labels, case_sensitive + mapped_fields, schema, labels, case_sensitive # type: ignore ) # For required fields that are missing self.add_missing_required_labels_to_schema_fields( - fields_mapped, schema, labels, options["header_case"] + mapped_fields, schema, labels, case_sensitive # type: ignore ) # Patch schema @@ -441,7 +445,7 @@ def detect_schema( @staticmethod def mapped_schema_fields_names( - fields: Union[List[None], List[Field]], case_sensitive: bool + fields: List[Field], case_sensitive: bool ) -> Dict[str, Optional[Field]]: """Create a dictionnary to map fields name with schema fields considering case sensitivity @@ -455,13 +459,13 @@ def mapped_schema_fields_names( Dict[str, Optional[Field]] """ if case_sensitive: - return {field.name: field for field in fields} # type:ignore + return {field.name: field for field in fields} else: - return {field.name.lower(): field for field in fields} # type: ignore + return {field.name.lower(): field for field in fields} @staticmethod def add_fields_to_schema_among_labels( - fields_mapped: Dict[str, Optional[Field]], + fields_mapped: Dict[str, Field], schema: Schema, labels: List[str], 
case_sensitive: bool, @@ -473,41 +477,28 @@ def add_fields_to_schema_among_labels( field = fields_mapped.get(name, default_field) else: field = fields_mapped.get(name.lower(), default_field) - schema.add_field(field) # type: ignore + schema.add_field(field) - @staticmethod def add_missing_required_labels_to_schema_fields( - fields_map: Dict[str, Optional[Field]], + self, + fields_map: Dict[str, Field], schema: Schema, labels: List[str], case_sensitive: bool, ): for _, field in fields_map.items(): - #TODO use self.field_name_not_in_labels - if case_sensitive: - if field and field.required and field.name not in labels: - schema.add_field(field) - else: - if ( - field - and field.required - and field.name.lower() not in [label.lower() for label in labels] - ): - schema.add_field(field) + if self.field_name_not_in_labels(field, labels, case_sensitive): + schema.add_field(field) @staticmethod def field_name_not_in_labels( - field: Field, - labels: List[str], - case_sensitive: bool + field: Field, labels: List[str], case_sensitive: bool ) -> bool: if case_sensitive: return field and field.required and field.name not in labels else: return ( - field - and field.required - and field.name.lower() not in [ - label.lower() for label in labels - ] + field + and field.required + and field.name.lower() not in [label.lower() for label in labels] ) From a830182e7bc29913cbed8a6f20df4d1c52a5bc36 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Am=C3=A9lie=20Rondot?= Date: Sat, 10 Feb 2024 12:08:14 +0100 Subject: [PATCH 14/15] Doc: docstring --- frictionless/detector/detector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/frictionless/detector/detector.py b/frictionless/detector/detector.py index 31b02b87e7..8722f69a10 100644 --- a/frictionless/detector/detector.py +++ b/frictionless/detector/detector.py @@ -453,7 +453,7 @@ def mapped_schema_fields_names( Args: fields (Union[List[None], List[Field]]): list of original schema fields - case_sensitive (bool) + 
case_sensitive (bool): True if case sensitive, False else Returns: Dict[str, Optional[Field]] From 7ab1fc10209a84142c450c51c73f79a8b0d324ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Am=C3=A9lie=20Rondot?= Date: Sat, 10 Feb 2024 12:11:31 +0100 Subject: [PATCH 15/15] Refacto: remove useless lines --- frictionless/detector/detector.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/frictionless/detector/detector.py b/frictionless/detector/detector.py index 8722f69a10..ba269d0a8a 100644 --- a/frictionless/detector/detector.py +++ b/frictionless/detector/detector.py @@ -495,10 +495,9 @@ def field_name_not_in_labels( field: Field, labels: List[str], case_sensitive: bool ) -> bool: if case_sensitive: - return field and field.required and field.name not in labels + return field.required and field.name not in labels else: return ( - field - and field.required + field.required and field.name.lower() not in [label.lower() for label in labels] )