Skip to content

Commit

Permalink
/croissant -> /croissant-crumbs (only specific fields) (#2643)
Browse files Browse the repository at this point in the history
* update in compatible-libraries

* update in docs

* bump compatible-libraries version

* remove fields that will be created on Hub's side

name, description, identifier, license and url will be created by
the Hub's API endpoint (see
https://huggingface.co/api/datasets/mnist/croissant).

This endpoint only returns Croissant-specific fields.

* fix test

* rename /croissant to /croissant-crumbs

We keep /croissant for compatibility, until we change it in moonlanding

Also: I didn't change in the docs yet, to not interfere with
#2642

* Croissant version is now 1.0, no more early draft

* rename datasets-server URL

* link to the Hub API docs

see huggingface/hub-docs#1259

* Update croissant.md

---------

Co-authored-by: Quentin Lhoest <[email protected]>
Co-authored-by: Quentin Lhoest <[email protected]>
  • Loading branch information
3 people authored Mar 28, 2024
1 parent 86a6759 commit 4dddea2
Show file tree
Hide file tree
Showing 4 changed files with 80 additions and 100 deletions.
10 changes: 2 additions & 8 deletions docs/source/croissant.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,9 @@ Datasets Server automatically generates the metadata in [Croissant](https://gith

Croissant is a metadata format built on top of [schema.org](https://schema.org/) aimed at describing datasets used for machine learning to help indexing, searching and loading them programmatically.

<Tip>

The [specification](https://github.com/mlcommons/croissant/blob/main/docs/croissant-spec.md) is still in early draft status. It may evolve in the future, and backward compatibility is not guaranteed.

</Tip>

## Get the metadata

This guide shows you how to use Hugging Face `/croissant` endpoint to retrieve the Croissant metadata associated to a dataset.
This guide shows you how to use the [Hugging Face `/croissant` endpoint](https://huggingface.co/docs/hub/api#get-apidatasetsrepoidcroissant) to retrieve the Croissant metadata associated with a dataset.

The `/croissant` endpoint takes the dataset name in the URL, for example for the `ibm/duorc` dataset:

Expand Down Expand Up @@ -58,7 +52,7 @@ curl https://huggingface.co/api/datasets/ibm/duorc/croissant \
</curl>
</inferencesnippet>

Under the hood it uses the `https://datasets-server.huggingface.co/croissant` endpoint and enriches it with the Hub metadata.
Under the hood it uses the `https://datasets-server.huggingface.co/croissant-crumbs` endpoint and enriches it with the Hub metadata.

The endpoint response is a [JSON-LD](https://json-ld.org/) containing the metadata in the Croissant format. For example, the [`ibm/duorc`](https://huggingface.co/datasets/ibm/duorc) dataset has two configurations, `ParaphraseRC` and `SelfRC` (see the [List splits and configurations](./splits) guide for more details about splits and configurations). The metadata links to their Parquet files and describes the type of each of the six columns: `plot_id`, `plot`, `title`, `question_id`, `question`, and `no_answer`:

Expand Down
22 changes: 19 additions & 3 deletions services/api/src/api/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
from starlette_prometheus import PrometheusMiddleware

from api.config import AppConfig, EndpointConfig
from api.routes.croissant import create_croissant_endpoint
from api.routes.croissant_crumbs import create_croissant_crumbs_endpoint
from api.routes.endpoint import EndpointsDefinition, create_endpoint
from api.routes.webhook import create_webhook_endpoint

Expand Down Expand Up @@ -104,8 +104,8 @@ def create_app_with_config(app_config: AppConfig, endpoint_config: EndpointConfi
for endpoint_name, step_by_input_type in endpoints_definition.step_by_input_type_and_endpoint.items()
] + [
Route(
"/croissant",
endpoint=create_croissant_endpoint(
"/croissant-crumbs",
endpoint=create_croissant_crumbs_endpoint(
hf_endpoint=app_config.common.hf_endpoint,
hf_token=app_config.common.hf_token,
blocked_datasets=app_config.common.blocked_datasets,
Expand All @@ -118,6 +118,22 @@ def create_app_with_config(app_config: AppConfig, endpoint_config: EndpointConfi
storage_clients=storage_clients,
),
),
Route(
"/croissant",
endpoint=create_croissant_crumbs_endpoint(
hf_endpoint=app_config.common.hf_endpoint,
hf_token=app_config.common.hf_token,
blocked_datasets=app_config.common.blocked_datasets,
hf_jwt_public_keys=hf_jwt_public_keys,
hf_jwt_algorithm=app_config.api.hf_jwt_algorithm,
external_auth_url=app_config.api.external_auth_url,
hf_timeout_seconds=app_config.api.hf_timeout_seconds,
max_age_long=app_config.api.max_age_long,
max_age_short=app_config.api.max_age_short,
storage_clients=storage_clients,
endpoint_name="croissant",
),
), # ^ will be deprecated soon
Route("/healthcheck", endpoint=healthcheck_endpoint),
Route("/metrics", endpoint=create_metrics_endpoint()),
# ^ called by Prometheus
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from collections.abc import Mapping
from http import HTTPStatus
from itertools import islice
from typing import Any, Optional, Union
from typing import Any, Optional

from datasets import ClassLabel, Features, Image, Value
from libapi.authentication import auth_check
Expand Down Expand Up @@ -64,24 +64,19 @@ def _escape_name(name: str, names: set[str]) -> str:
return escaped_name


def _extract_doi_tag(info: Mapping[str, Any]) -> Union[str, None]:
"""Extracts https://huggingface.co/docs/hub/en/doi."""
tags = info.get("tags", [])
if isinstance(tags, list):
for tag in tags:
if isinstance(tag, str) and tag.startswith("doi:"):
return tag.replace("doi:", "", 1)
return None


def _remove_none_values(json: Mapping[str, Any]) -> Mapping[str, Any]:
"""Removes None values in the first depth of a dict."""
return {k: v for k, v in json.items() if v is not None}


def get_croissant_from_dataset_infos(
def get_croissant_crumbs_from_dataset_infos(
dataset: str, infos: list[Mapping[str, Any]], partial: bool, full_jsonld: bool
) -> Mapping[str, Any]:
"""Generates the "crumbs" of the Croissant JSON-LD metadata from the dataset infos.
It's only a subset of the full JSON-LD metadata. See the Hugging Face API `/croissant` endpoint
to get the complete Croissant JSON-LD metadata.
"""
repo_name = "repo"
names: set[str] = set(repo_name)
distribution = [
Expand All @@ -97,13 +92,9 @@ def get_croissant_from_dataset_infos(
}
)
]
identifier = None
_license = None
record_set = []
for info in infos:
description_body = ""
_license = info.get("license")
identifier = _extract_doi_tag(info)
config = info["config_name"]
features = Features.from_dict(info["features"])
fields: list[dict[str, Any]] = []
Expand Down Expand Up @@ -233,12 +224,7 @@ def get_croissant_from_dataset_infos(
{
"@context": context,
"@type": "sc:Dataset",
"name": _escape_name(dataset, names),
"conformsTo": "http://mlcommons.org/croissant/1.0",
"description": f"{dataset} dataset hosted on Hugging Face and contributed by the HF Datasets community",
"identifier": identifier,
"license": _license,
"url": f"https://huggingface.co/datasets/{dataset}",
"distribution": distribution,
"recordSet": record_set,
}
Expand All @@ -253,7 +239,7 @@ def _get_full_jsonld_parameter(request: Request) -> bool:
return True


def create_croissant_endpoint(
def create_croissant_crumbs_endpoint(
hf_endpoint: str,
blocked_datasets: list[str],
hf_token: Optional[str] = None,
Expand All @@ -264,15 +250,16 @@ def create_croissant_endpoint(
max_age_long: int = 0,
max_age_short: int = 0,
storage_clients: Optional[list[StorageClient]] = None,
endpoint_name: str = "croissant-crumbs",
) -> Endpoint:
async def croissant_endpoint(request: Request) -> Response:
endpoint_name = "croissant"
async def croissant_crumbs_endpoint(request: Request) -> Response:
context = f"endpoint: {endpoint_name}"
method = "croissant_crumbs_endpoint"
revision: Optional[str] = None
with StepProfiler(method="croissant_endpoint", step="all", context=context):
with StepProfiler(method=method, step="all", context=context):
try:
with StepProfiler(
method="croissant_endpoint",
method=method,
step="validate parameters and get processing steps",
context=context,
):
Expand All @@ -282,7 +269,7 @@ async def croissant_endpoint(request: Request) -> Response:
if not are_valid_parameters([dataset]):
raise MissingRequiredParameterError("Parameter 'dataset' is required")
# if auth_check fails, it will raise an exception that will be caught below
with StepProfiler(method="croissant_endpoint", step="check authentication", context=context):
with StepProfiler(method=method, step="check authentication", context=context):
await auth_check(
dataset,
external_auth_url=external_auth_url,
Expand All @@ -292,7 +279,7 @@ async def croissant_endpoint(request: Request) -> Response:
hf_timeout_seconds=hf_timeout_seconds,
)
# getting result based on processing steps
with StepProfiler(method="croissant_endpoint", step="get info cache entry", context=context):
with StepProfiler(method=method, step="get info cache entry", context=context):
info_result = get_cache_entry_from_step(
processing_step_name=DATASET_INFO_KIND,
dataset=dataset,
Expand All @@ -311,17 +298,17 @@ async def croissant_endpoint(request: Request) -> Response:
if http_status == HTTPStatus.OK:
infos = list(islice(content["dataset_info"].values(), CROISSANT_MAX_CONFIGS))
partial = content["partial"]
with StepProfiler(method="croissant_endpoint", step="generate croissant json", context=context):
croissant = get_croissant_from_dataset_infos(
with StepProfiler(method=method, step="generate croissant crumbs json", context=context):
croissant_crumbs = get_croissant_crumbs_from_dataset_infos(
dataset=dataset,
infos=infos,
partial=partial,
full_jsonld=full_jsonld,
)
with StepProfiler(method="croissant_endpoint", step="generate OK response", context=context):
return get_json_ok_response(content=croissant, max_age=max_age_long, revision=revision)
with StepProfiler(method=method, step="generate OK response", context=context):
return get_json_ok_response(content=croissant_crumbs, max_age=max_age_long, revision=revision)
else:
with StepProfiler(method="croissant_endpoint", step="generate error response", context=context):
with StepProfiler(method=method, step="generate error response", context=context):
return get_json_error_response(
content=content,
status_code=http_status,
Expand All @@ -331,7 +318,7 @@ async def croissant_endpoint(request: Request) -> Response:
)
except Exception as e:
error = e if isinstance(e, ApiError) else UnexpectedApiError("Unexpected error.", e)
with StepProfiler(method="croissant_endpoint", step="generate API error response", context=context):
with StepProfiler(method=method, step="generate API error response", context=context):
return get_json_api_error_response(error=error, max_age=max_age_short, revision=revision)

return croissant_endpoint
return croissant_crumbs_endpoint
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import pytest

from api.routes.croissant import get_croissant_from_dataset_infos
from api.routes.croissant_crumbs import get_croissant_crumbs_from_dataset_infos

squad_info = {
"description": "Stanford Question Answering Dataset (SQuAD) is a reading comprehension dataset, consisting of questions posed by crowdworkers on a set of Wikipedia articles, where the answer to every question is a segment of text, or span, from the corresponding reading passage, or the question might be unanswerable.\n",
Expand Down Expand Up @@ -92,42 +92,40 @@


def test_get_croissant_context_from_dataset_infos() -> None:
croissant = get_croissant_from_dataset_infos(
croissant_crumbs = get_croissant_crumbs_from_dataset_infos(
"user/squad with space",
[squad_info, squad_info],
partial=False,
full_jsonld=False,
)
assert croissant["@context"] == v1_context
assert croissant_crumbs["@context"] == v1_context


def test_get_croissant_from_dataset_infos() -> None:
croissant = get_croissant_from_dataset_infos(
def test_get_croissant_crumbs_from_dataset_infos() -> None:
croissant_crumbs = get_croissant_crumbs_from_dataset_infos(
"user/squad with space",
[squad_info, squad_info],
partial=False,
full_jsonld=False,
)
assert "@context" in croissant
assert "@type" in croissant
assert "name" in croissant
assert croissant["name"] == "user_squad_with_space"
assert "@context" in croissant_crumbs
assert "@type" in croissant_crumbs

# Test recordSet.
assert "recordSet" in croissant
assert croissant["recordSet"]
assert isinstance(croissant["recordSet"], list)
assert len(croissant["recordSet"]) == 2
assert croissant["recordSet"][0]["@type"] == croissant["recordSet"][1]["@type"] == "cr:RecordSet"
assert croissant["recordSet"][0]["name"] == "record_set_user_squad_with_space"
assert croissant["recordSet"][1]["name"] == "record_set_user_squad_with_space_0"
assert isinstance(croissant["recordSet"][0]["field"], list)
assert "recordSet" in croissant_crumbs
assert croissant_crumbs["recordSet"]
assert isinstance(croissant_crumbs["recordSet"], list)
assert len(croissant_crumbs["recordSet"]) == 2
assert croissant_crumbs["recordSet"][0]["@type"] == croissant_crumbs["recordSet"][1]["@type"] == "cr:RecordSet"
assert croissant_crumbs["recordSet"][0]["name"] == "record_set_user_squad_with_space"
assert croissant_crumbs["recordSet"][1]["name"] == "record_set_user_squad_with_space_0"
assert isinstance(croissant_crumbs["recordSet"][0]["field"], list)
assert isinstance(squad_info["features"], dict)
assert "1 skipped column: answers" in croissant["recordSet"][0]["description"]
assert croissant["recordSet"][0]["@id"] == "record_set_user_squad_with_space"
assert croissant["recordSet"][1]["@id"] == "record_set_user_squad_with_space_0"
for i, _ in enumerate(croissant["recordSet"]):
for field in croissant["recordSet"][i]["field"]:
assert "1 skipped column: answers" in croissant_crumbs["recordSet"][0]["description"]
assert croissant_crumbs["recordSet"][0]["@id"] == "record_set_user_squad_with_space"
assert croissant_crumbs["recordSet"][1]["@id"] == "record_set_user_squad_with_space_0"
for i, _ in enumerate(croissant_crumbs["recordSet"]):
for field in croissant_crumbs["recordSet"][i]["field"]:
assert "source" in field
assert "fileSet" in field["source"]
assert "@id" in field["source"]["fileSet"]
Expand All @@ -136,41 +134,26 @@ def test_get_croissant_from_dataset_infos() -> None:
assert field["source"]["extract"]["column"] == field["@id"].split("/")[-1]

# Test fields.
assert len(croissant["recordSet"][0]["field"]) == 4
assert len(croissant["recordSet"][1]["field"]) == 4
for field in croissant["recordSet"][0]["field"]:
assert len(croissant_crumbs["recordSet"][0]["field"]) == 4
assert len(croissant_crumbs["recordSet"][1]["field"]) == 4
for field in croissant_crumbs["recordSet"][0]["field"]:
assert field["@type"] == "cr:Field"
assert field["dataType"] == "sc:Text"
assert len(croissant["recordSet"][0]["field"]) == len(squad_info["features"]) - 1
assert len(croissant_crumbs["recordSet"][0]["field"]) == len(squad_info["features"]) - 1

# Test distribution.
assert "distribution" in croissant
assert croissant["distribution"]
assert isinstance(croissant["distribution"], list)
assert croissant["distribution"][0]["@type"] == "cr:FileObject"
assert croissant["distribution"][1]["@type"] == "cr:FileSet"
assert croissant["distribution"][2]["@type"] == "cr:FileSet"
assert croissant["distribution"][0]["name"] == "repo"
for distribution in croissant["distribution"]:
assert "distribution" in croissant_crumbs
assert croissant_crumbs["distribution"]
assert isinstance(croissant_crumbs["distribution"], list)
assert croissant_crumbs["distribution"][0]["@type"] == "cr:FileObject"
assert croissant_crumbs["distribution"][1]["@type"] == "cr:FileSet"
assert croissant_crumbs["distribution"][2]["@type"] == "cr:FileSet"
assert croissant_crumbs["distribution"][0]["name"] == "repo"
for distribution in croissant_crumbs["distribution"]:
assert "@id" in distribution
if "containedIn" in distribution:
assert "@id" in distribution["containedIn"]

# Test others.
assert croissant["license"] == ["mit"]
assert croissant["identifier"] == "hf/123456789"

# If the parameter doesn't exist, check that it is not kept:
squad_licenseless_info = squad_info.copy()
del squad_licenseless_info["license"]
croissant = get_croissant_from_dataset_infos(
"user/squad with space",
[squad_licenseless_info, squad_licenseless_info],
partial=False,
full_jsonld=False,
)
assert "license" not in croissant


MAX_COLUMNS = 3

Expand All @@ -182,13 +165,13 @@ def test_get_croissant_from_dataset_infos() -> None:
(False, MAX_COLUMNS),
],
)
def test_get_croissant_from_dataset_infos_max_columns(full_jsonld: bool, num_columns: int) -> None:
with patch("api.routes.croissant.MAX_COLUMNS", MAX_COLUMNS):
croissant = get_croissant_from_dataset_infos(
def test_get_croissant_crumbs_from_dataset_infos_max_columns(full_jsonld: bool, num_columns: int) -> None:
with patch("api.routes.croissant_crumbs.MAX_COLUMNS", MAX_COLUMNS):
croissant_crumbs = get_croissant_crumbs_from_dataset_infos(
"user/squad with space",
[squad_info, squad_info],
partial=False,
full_jsonld=full_jsonld,
)
assert len(croissant["recordSet"][0]["field"]) == num_columns
assert full_jsonld or "max number of columns reached" in croissant["recordSet"][0]["description"]
assert len(croissant_crumbs["recordSet"][0]["field"]) == num_columns
assert full_jsonld or "max number of columns reached" in croissant_crumbs["recordSet"][0]["description"]

0 comments on commit 4dddea2

Please sign in to comment.