From 7fa0b0f57eea7aee8e5278597169e205473566e3 Mon Sep 17 00:00:00 2001 From: ccl-core Date: Tue, 3 Dec 2024 14:05:32 +0000 Subject: [PATCH 1/2] Avoid redundant names/descriptions --- libs/libcommon/src/libcommon/croissant_utils.py | 9 --------- .../src/worker/job_runners/dataset/croissant_crumbs.py | 7 ------- .../tests/job_runners/dataset/test_croissant_crumbs.py | 4 +--- 3 files changed, 1 insertion(+), 19 deletions(-) diff --git a/libs/libcommon/src/libcommon/croissant_utils.py b/libs/libcommon/src/libcommon/croissant_utils.py index d22cd12d3..a2eab5c9a 100644 --- a/libs/libcommon/src/libcommon/croissant_utils.py +++ b/libs/libcommon/src/libcommon/croissant_utils.py @@ -76,8 +76,6 @@ def feature_to_croissant_field( return { "@type": "cr:Field", "@id": field_name, - "name": field_name, - "description": f"Column '{column}' from the Hugging Face parquet file.", "dataType": HF_TO_CROISSANT_VALUE_TYPE[feature.dtype], "source": get_source(distribution_name, column, add_transform, json_path), } @@ -90,8 +88,6 @@ def feature_to_croissant_field( return { "@type": "cr:Field", "@id": field_name, - "name": field_name, - "description": f"Image column '{column}' from the Hugging Face parquet file.", "dataType": "sc:ImageObject", "source": source, } @@ -99,9 +95,6 @@ def feature_to_croissant_field( return { "@type": "cr:Field", "@id": field_name, - "name": field_name, - "description": f"ClassLabel column '{column}' from the Hugging Face parquet file.\nLabels:\n" - + ", ".join(f"{name} ({i})" for i, name in enumerate(feature.names)), "dataType": "sc:Integer", "source": get_source(distribution_name, column, add_transform, json_path), } @@ -110,8 +103,6 @@ def feature_to_croissant_field( return { "@type": "cr:Field", "@id": field_name, - "name": field_name, - "description": f"Column '{column}' from the Hugging Face parquet file.", "subField": [ feature_to_croissant_field( distribution_name, diff --git a/services/worker/src/worker/job_runners/dataset/croissant_crumbs.py b/services/worker/src/worker/job_runners/dataset/croissant_crumbs.py index 935fa22d1..36fce6e33 100644 --- a/services/worker/src/worker/job_runners/dataset/croissant_crumbs.py +++ b/services/worker/src/worker/job_runners/dataset/croissant_crumbs.py @@ -82,8 +82,6 @@ def get_croissant_crumbs_from_dataset_infos( { "@type": "cr:FileSet", "@id": distribution_name, - "name": distribution_name, - "description": "The underlying Parquet files as converted by Hugging Face (see: https://huggingface.co/docs/dataset-viewer/parquet).", "containedIn": {"@id": repo_name}, "encodingFormat": "application/x-parquet", "includes": f"{config}/*/*.parquet", @@ -99,8 +97,6 @@ def get_croissant_crumbs_from_dataset_infos( { "@type": "cr:Field", "@id": f"{split_record_set_name}/split_name", - "name": "split_name", - "description": "The name of the split.", "dataType": "sc:Text", } ) @@ -124,8 +120,6 @@ def get_croissant_crumbs_from_dataset_infos( { "@type": "cr:Field", "@id": f"{record_set_name}/split", - "name": f"{record_set_name}/split", - "description": "Split to which the example belongs to.", "dataType": "sc:Text", "source": { "fileSet": {"@id": distribution_name}, @@ -160,7 +154,6 @@ def get_croissant_crumbs_from_dataset_infos( { "@type": "cr:RecordSet", "@id": record_set_name, - "name": record_set_name, "description": description, "field": fields, } diff --git a/services/worker/tests/job_runners/dataset/test_croissant_crumbs.py b/services/worker/tests/job_runners/dataset/test_croissant_crumbs.py index aa139d906..0e80c29a8 100644 --- a/services/worker/tests/job_runners/dataset/test_croissant_crumbs.py +++ b/services/worker/tests/job_runners/dataset/test_croissant_crumbs.py @@ -124,8 +124,6 @@ def test_get_croissant_crumbs_from_dataset_infos() -> None: assert croissant_crumbs["recordSet"][i]["dataType"] == "cr:Split" assert croissant_crumbs["recordSet"][i]["key"]["@id"].endswith("name") assert croissant_crumbs["recordSet"][1]["@type"] == croissant_crumbs["recordSet"][3]["@type"] == "cr:RecordSet" - assert croissant_crumbs["recordSet"][1]["name"] == "record_set_user_squad_with_space" - assert croissant_crumbs["recordSet"][3]["name"] == "record_set_user_squad_with_space_0" assert isinstance(croissant_crumbs["recordSet"][1]["field"], list) assert isinstance(squad_info["features"], dict) assert "skipped column" not in croissant_crumbs["recordSet"][1]["description"] @@ -147,7 +145,7 @@ def test_get_croissant_crumbs_from_dataset_infos() -> None: assert sub_field["source"]["fileSet"]["@id"] assert "extract" in sub_field["source"] assert "transform" in sub_field["source"] - if field["description"] == "Split to which the example belongs to.": + if field["@id"].endswith("split"): assert "regex" in field["source"]["transform"] assert field["source"]["extract"]["fileProperty"] == "fullpath" assert field["references"]["field"]["@id"] == croissant_crumbs["recordSet"][i - 1]["field"][0]["@id"] From 60cb3541420ef424865cf9b38f1c9a5846ff9354 Mon Sep 17 00:00:00 2001 From: ccl-core Date: Tue, 3 Dec 2024 14:54:02 +0000 Subject: [PATCH 2/2] Update test_croissant_utils --- libs/libcommon/tests/test_croissant_utils.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/libs/libcommon/tests/test_croissant_utils.py b/libs/libcommon/tests/test_croissant_utils.py index 970e8d756..cd9dc277e 100644 --- a/libs/libcommon/tests/test_croissant_utils.py +++ b/libs/libcommon/tests/test_croissant_utils.py @@ -38,8 +38,6 @@ def test_truncate_features_from_croissant_crumbs_response(num_columns: int) -> N { "@type": "cr:Field", "@id": "field_name", - "name": "field_name", - "description": "Column 'column_name' from the Hugging Face parquet file.", "dataType": "sc:Integer", "source": {"fileSet": {"@id": "distribution_name"}, "extract": {"column": "column_name"}}, }, @@ -49,8 +47,6 @@ def test_truncate_features_from_croissant_crumbs_response(num_columns: int) -> N { "@type": "cr:Field", "@id": "field_name", - "name": "field_name", - "description": "Column 'column_name' from the Hugging Face parquet file.", "dataType": "sc:Integer", "source": {"fileSet": {"@id": "distribution_name"}, "extract": {"column": "column_name"}}, "repeated": True, @@ -61,8 +57,6 @@ def test_truncate_features_from_croissant_crumbs_response(num_columns: int) -> N { "@type": "cr:Field", "@id": "field_name", - "name": "field_name", - "description": "Column 'column_name' from the Hugging Face parquet file.", "dataType": "sc:Integer", "source": {"fileSet": {"@id": "distribution_name"}, "extract": {"column": "column_name"}}, "repeated": True,