Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix 1635 unexpected missing label error when ignoring header case #1638

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
88 changes: 78 additions & 10 deletions frictionless/detector/detector.py
Original file line number Diff line number Diff line change
Expand Up @@ -298,6 +298,7 @@ def detect_schema(
labels: Optional[List[str]] = None,
schema: Optional[Schema] = None,
field_candidates: List[Dict[str, Any]] = settings.DEFAULT_FIELD_CANDIDATES,
**options: Any,
) -> Schema:
"""Detect schema from fragment

Expand Down Expand Up @@ -408,17 +409,25 @@ def detect_schema(
if len(labels) != len(set(labels)):
note = '"schema_sync" requires unique labels in the header'
raise FrictionlessException(note)
mapping = {field.name: field for field in schema.fields} # type: ignore
schema.clear_fields()
for name in labels:
field = mapping.get(name)
if not field:
field = Field.from_descriptor({"name": name, "type": "any"})
schema.add_field(field)

case_sensitive = options["header_case"]

assert schema
assert schema.fields
assert all(isinstance(field, Field) for field in schema.fields)

mapped_fields = self.mapped_schema_fields_names(
schema.fields, case_sensitive # type: ignore
)

self.add_fields_to_schema_among_labels(
mapped_fields, schema, labels, case_sensitive # type: ignore
)

# For required fields that are missing
for _, field in mapping.items():
if field and field.required and field.name not in labels:
schema.add_field(field)
self.add_missing_required_labels_to_schema_fields(
mapped_fields, schema, labels, case_sensitive # type: ignore
)

# Patch schema
if self.schema_patch:
Expand All @@ -433,3 +442,62 @@ def detect_schema(
schema = Schema.from_descriptor(descriptor)

return schema # type: ignore

@staticmethod
def mapped_schema_fields_names(
fields: List[Field], case_sensitive: bool
) -> Dict[str, Optional[Field]]:
"""Create a dictionnary to map fields name with schema fields
considering case sensitivity

Args:
fields (Union[List[None], List[Field]]): list of original
schema fields
case_sensitive (bool): True if case sensitive, False else

Returns:
Dict[str, Optional[Field]]
"""
if case_sensitive:
return {field.name: field for field in fields}
else:
return {field.name.lower(): field for field in fields}

@staticmethod
def add_fields_to_schema_among_labels(
fields_mapped: Dict[str, Field],
schema: Schema,
labels: List[str],
case_sensitive: bool,
):
schema.clear_fields()
for name in labels:
default_field = Field.from_descriptor({"name": name, "type": "any"})
if case_sensitive:
field = fields_mapped.get(name, default_field)
else:
field = fields_mapped.get(name.lower(), default_field)
schema.add_field(field)

def add_missing_required_labels_to_schema_fields(
self,
fields_map: Dict[str, Field],
schema: Schema,
labels: List[str],
case_sensitive: bool,
):
for _, field in fields_map.items():
if self.field_name_not_in_labels(field, labels, case_sensitive):
schema.add_field(field)

@staticmethod
def field_name_not_in_labels(
field: Field, labels: List[str], case_sensitive: bool
) -> bool:
if case_sensitive:
return field.required and field.name not in labels
else:
return (
field.required
and field.name.lower() not in [label.lower() for label in labels]
)
51 changes: 43 additions & 8 deletions frictionless/resources/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
import warnings
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union

from frictionless.schema.field import Field

from .. import errors, helpers, settings
from ..analyzer import Analyzer
from ..dialect import Dialect
Expand Down Expand Up @@ -204,6 +206,7 @@ def __open_schema(self):
labels=self.labels,
schema=self.schema,
field_candidates=system.detect_field_candidates(),
header_case=self.dialect.header_case,
)
self.stats.fields = len(self.schema.fields)

Expand Down Expand Up @@ -386,18 +389,50 @@ def row_stream():
# Yield row
yield row

# NB: missing required labels are not included in the
# field_info parameter used for row creation
if self.detector.schema_sync:
# Missing required labels have in 'field_info'
# parameter used for row creation
for field in self.schema.fields:
if field.name not in self.labels and field.name in field_info["names"]:
field_index = field_info["names"].index(field.name)
del field_info["names"][field_index]
del field_info["objects"][field_index]
del field_info["mapping"][field.name]
# # Create row stream
self.remove_missing_required_label_from_field_info(field, field_info)

# Create row stream
self.__row_stream = row_stream()

def remove_missing_required_label_from_field_info(
self, field: Field, field_info: Dict[str, Any]
):
is_case_sensitive = self.dialect.header_case
if self.field_is_missing(
field.name, field_info["names"], self.labels, is_case_sensitive
):
self.remove_field_from_field_info(field.name, field_info)

@staticmethod
def field_is_missing(
field_name: str,
expected_fields_names: List[str],
table_labels: types.ILabels,
case_sensitive: bool,
) -> bool:
"""Check if a schema field name is missing from the TableResource
labels.
"""
if not case_sensitive:
field_name = field_name.lower()
table_labels = [label.lower() for label in table_labels]
expected_fields_names = [
field_name.lower() for field_name in expected_fields_names
]

return field_name not in table_labels and field_name in expected_fields_names

@staticmethod
def remove_field_from_field_info(field_name: str, field_info: Dict[str, Any]):
field_index = field_info["names"].index(field_name)
del field_info["names"][field_index]
del field_info["objects"][field_index]
del field_info["mapping"][field_name]

# Read

def read_cells(self, *, size: Optional[int] = None) -> List[List[Any]]:
Expand Down
61 changes: 59 additions & 2 deletions tests/validator/resource/test_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@
import pytest

import frictionless
from frictionless import Checklist, Detector, FrictionlessException, Schema, fields
from frictionless import Checklist, Detector, Dialect, FrictionlessException, Schema
from frictionless import fields
from frictionless.resources import TableResource

# General
Expand Down Expand Up @@ -368,8 +369,64 @@ def test_resource_with_missing_required_header_with_schema_sync_is_true_issue_16
tc["source"], schema=schema, detector=Detector(schema_sync=True)
)
report = frictionless.validate(resource)
print(report.flatten(["rowNumber", "fieldNumber", "fieldName", "type"]))
assert (
report.flatten(["rowNumber", "fieldNumber", "fieldName", "type"])
== tc["expected_flattened_report"]
)


def test_validate_resource_ignoring_header_case_issue_1635():
schema_descriptor = {
"$schema": "https://frictionlessdata.io/schemas/table-schema.json",
"fields": [
{
"name": "aa",
"title": "Field A",
"type": "string",
"constraints": {"required": True},
},
{
"name": "BB",
"title": "Field B",
"type": "string",
"constraints": {"required": True},
},
],
}

test_cases = [
{
"source": [["AA", "bb"], ["a", "b"]],
"header_case": False,
"expected_valid_report": True,
"expected_flattened_report": [],
},
{
"source": [["AA", "bb"], ["a", "b"]],
"header_case": True,
"expected_valid_report": False,
"expected_flattened_report": [
[None, 3, "aa", "missing-label"],
[None, 4, "BB", "missing-label"],
],
},
{
"source": [["bb"], ["foo"]],
"header_case": False,
"expected_valid_report": False,
"expected_flattened_report": [[None, 2, "aa", "missing-label"]],
},
]

for tc in test_cases:
resource = TableResource(
tc["source"],
schema=Schema.from_descriptor(schema_descriptor),
detector=Detector(schema_sync=True),
dialect=Dialect(header_case=tc["header_case"]),
)
report = frictionless.validate(resource)
assert report.valid == tc["expected_valid_report"]
assert (report.flatten(["rowNumber", "fieldNumber", "fieldName", "type"])) == tc[
"expected_flattened_report"
]
Loading