Skip to content

Commit

Permalink
Include dictionary features in Croissant (crs) fields
Browse files Browse the repository at this point in the history
  • Loading branch information
ccl-core committed Nov 12, 2024
1 parent a7f42bb commit 7bfac62
Show file tree
Hide file tree
Showing 2 changed files with 36 additions and 10 deletions.
13 changes: 13 additions & 0 deletions libs/libcommon/src/libcommon/croissant_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,19 @@ def feature_to_croissant_field(
"dataType": "sc:Integer",
"source": {"fileSet": {"@id": distribution_name}, "extract": {"column": column}},
}
# Field with sub-fields.
elif isinstance(feature, dict):
return {
"@type": "cr:Field",
"@id": field_name,
"name": field_name,
"description": f"Column '{column}' from the Hugging Face parquet file.",
"subField" : [
feature_to_croissant_field(
distribution_name, f"{field_name}/{subfeature_name}", column, sub_feature, add_transform=True, json_path=subfeature_name)
for subfeature_name, sub_feature in feature.items()
],
}
elif isinstance(feature, (Sequence, list)):
if isinstance(feature, Sequence):
sub_feature = feature.feature
Expand Down
33 changes: 23 additions & 10 deletions services/worker/tests/job_runners/dataset/test_croissant_crumbs.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,30 +128,43 @@ def test_get_croissant_crumbs_from_dataset_infos() -> None:
assert croissant_crumbs["recordSet"][3]["name"] == "record_set_user_squad_with_space_0"
assert isinstance(croissant_crumbs["recordSet"][1]["field"], list)
assert isinstance(squad_info["features"], dict)
assert "1 skipped column: answers" in croissant_crumbs["recordSet"][1]["description"]
assert "skipped column" not in croissant_crumbs["recordSet"][1]["description"]
assert croissant_crumbs["recordSet"][1]["@id"] == "record_set_user_squad_with_space"
assert croissant_crumbs["recordSet"][3]["@id"] == "record_set_user_squad_with_space_0"
for i in [1, 3]:
for field in croissant_crumbs["recordSet"][i]["field"]:
assert "source" in field
assert "fileSet" in field["source"]
assert "@id" in field["source"]["fileSet"]
assert field["source"]["fileSet"]["@id"]
assert "extract" in field["source"]
if "subField" not in field:
assert "source" in field
assert "fileSet" in field["source"]
assert "@id" in field["source"]["fileSet"]
assert field["source"]["fileSet"]["@id"]
assert "extract" in field["source"]
else:
for sub_field in field["subField"]:
assert "source" in sub_field
assert "fileSet" in sub_field["source"]
assert "@id" in sub_field["source"]["fileSet"]
assert sub_field["source"]["fileSet"]["@id"]
assert "extract" in sub_field["source"]
assert "transform" in sub_field["source"]
if field["description"] == "Split to which the example belongs to.":
assert "regex" in field["source"]["transform"]
assert field["source"]["extract"]["fileProperty"] == "fullpath"
assert field["references"]["field"]["@id"] == croissant_crumbs["recordSet"][i - 1]["field"][0]["@id"]
else:
assert field["source"]["extract"]["column"] == field["@id"].split("/")[-1]
if "subField" not in field:
assert field["source"]["extract"]["column"] == field["@id"].split("/")[-1]
else:
for sub_field in field["subField"]:
assert sub_field["source"]["extract"]["column"] == field["@id"].split("/")[-1]

# Test fields.
assert len(croissant_crumbs["recordSet"][1]["field"]) == 5
assert len(croissant_crumbs["recordSet"][3]["field"]) == 5
assert len(croissant_crumbs["recordSet"][1]["field"]) == 6
assert len(croissant_crumbs["recordSet"][3]["field"]) == 6
for field in croissant_crumbs["recordSet"][1]["field"]:
assert field["@type"] == "cr:Field"
assert field["dataType"] == "sc:Text"
assert len(croissant_crumbs["recordSet"][1]["field"]) == len(squad_info["features"])
assert len(croissant_crumbs["recordSet"][1]["field"]) == len(squad_info["features"]) + 1

# Test distribution.
assert "distribution" in croissant_crumbs
Expand Down

0 comments on commit 7bfac62

Please sign in to comment.