Skip to content

Commit

Permalink
/croissant -> /croissant-crumbs (only specific fields) (#2643)
Browse files Browse the repository at this point in the history
* update in compatible-libraries

* update in docs

* bump compatible-libraries version

* remove fields that will be created on Hub's side

name, description, identifier, license and url will be created by
the Hub's API endpoint (see
https://huggingface.co/api/datasets/mnist/croissant).

This endpoint only returns Croissant-specific fields.

* fix test

* rename /croissant to /croissant-crumbs

We keep /croissant for compatibility, until we change it in moonlanding

Also: I didn't change in the docs yet, to not interfere with
#2642

* Croissant version is now 1.0, no more early draft

* rename datasets-server URL

* link to the Hub API docs

see huggingface/hub-docs#1259

* Update croissant.md

---------

Co-authored-by: Quentin Lhoest <[email protected]>
Co-authored-by: Quentin Lhoest <[email protected]>
  • Loading branch information
3 people authored Mar 28, 2024
1 parent 86a6759 commit 4dddea2
Show file tree
Hide file tree
Showing 4 changed files with 80 additions and 100 deletions.
10 changes: 2 additions & 8 deletions docs/source/croissant.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,9 @@ Datasets Server automatically generates the metadata in [Croissant](https://gith

Croissant is a metadata format built on top of [schema.org](https://schema.org/) aimed at describing datasets used for machine learning to help indexing, searching and loading them programmatically.

<Tip>

The [specification](https://github.com/mlcommons/croissant/blob/main/docs/croissant-spec.md) is still in early draft status. It may evolve in the future, and backward compatibility is not guaranteed.

</Tip>

## Get the metadata

This guide shows you how to use Hugging Face `/croissant` endpoint to retrieve the Croissant metadata associated to a dataset.
This guide shows you how to use the [Hugging Face `/croissant` endpoint](https://huggingface.co/docs/hub/api#get-apidatasetsrepoidcroissant) to retrieve the Croissant metadata associated with a dataset.

The `/croissant` endpoint takes the dataset name in the URL, for example for the `ibm/duorc` dataset:

Expand Down Expand Up @@ -58,7 +52,7 @@ curl https://huggingface.co/api/datasets/ibm/duorc/croissant \
</curl>
</inferencesnippet>

Under the hood it uses the `https://datasets-server.huggingface.co/croissant` endpoint and enriches it with the Hub metadata.
Under the hood it uses the `https://datasets-server.huggingface.co/croissant-crumbs` endpoint and enriches it with the Hub metadata.

The endpoint response is a [JSON-LD](https://json-ld.org/) containing the metadata in the Croissant format. For example, the [`ibm/duorc`](https://huggingface.co/datasets/ibm/duorc) dataset has two configurations, `ParaphraseRC` and `SelfRC` (see the [List splits and configurations](./splits) guide for more details about splits and configurations). The metadata links to their Parquet files and describes the type of each of the six columns: `plot_id`, `plot`, `title`, `question_id`, `question`, and `no_answer`:

Expand Down
22 changes: 19 additions & 3 deletions services/api/src/api/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
from starlette_prometheus import PrometheusMiddleware

from api.config import AppConfig, EndpointConfig
from api.routes.croissant import create_croissant_endpoint
from api.routes.croissant_crumbs import create_croissant_crumbs_endpoint
from api.routes.endpoint import EndpointsDefinition, create_endpoint
from api.routes.webhook import create_webhook_endpoint

Expand Down Expand Up @@ -104,8 +104,8 @@ def create_app_with_config(app_config: AppConfig, endpoint_config: EndpointConfi
for endpoint_name, step_by_input_type in endpoints_definition.step_by_input_type_and_endpoint.items()
] + [
Route(
"/croissant",
endpoint=create_croissant_endpoint(
"/croissant-crumbs",
endpoint=create_croissant_crumbs_endpoint(
hf_endpoint=app_config.common.hf_endpoint,
hf_token=app_config.common.hf_token,
blocked_datasets=app_config.common.blocked_datasets,
Expand All @@ -118,6 +118,22 @@ def create_app_with_config(app_config: AppConfig, endpoint_config: EndpointConfi
storage_clients=storage_clients,
),
),
Route(
"/croissant",
endpoint=create_croissant_crumbs_endpoint(
hf_endpoint=app_config.common.hf_endpoint,
hf_token=app_config.common.hf_token,
blocked_datasets=app_config.common.blocked_datasets,
hf_jwt_public_keys=hf_jwt_public_keys,
hf_jwt_algorithm=app_config.api.hf_jwt_algorithm,
external_auth_url=app_config.api.external_auth_url,
hf_timeout_seconds=app_config.api.hf_timeout_seconds,
max_age_long=app_config.api.max_age_long,
max_age_short=app_config.api.max_age_short,
storage_clients=storage_clients,
endpoint_name="croissant",
),
), # ^ will be deprecated soon
Route("/healthcheck", endpoint=healthcheck_endpoint),
Route("/metrics", endpoint=create_metrics_endpoint()),
# ^ called by Prometheus
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from collections.abc import Mapping
from http import HTTPStatus
from itertools import islice
from typing import Any, Optional, Union
from typing import Any, Optional

from datasets import ClassLabel, Features, Image, Value
from libapi.authentication import auth_check
Expand Down Expand Up @@ -64,24 +64,19 @@ def _escape_name(name: str, names: set[str]) -> str:
return escaped_name


def _extract_doi_tag(info: Mapping[str, Any]) -> Union[str, None]:
"""Extracts https://huggingface.co/docs/hub/en/doi."""
tags = info.get("tags", [])
if isinstance(tags, list):
for tag in tags:
if isinstance(tag, str) and tag.startswith("doi:"):
return tag.replace("doi:", "", 1)
return None


def _remove_none_values(json: Mapping[str, Any]) -> Mapping[str, Any]:
"""Removes None values in the first depth of a dict."""
return {k: v for k, v in json.items() if v is not None}


def get_croissant_from_dataset_infos(
def get_croissant_crumbs_from_dataset_infos(
dataset: str, infos: list[Mapping[str, Any]], partial: bool, full_jsonld: bool
) -> Mapping[str, Any]:
"""Generates the "crumbs" of the Croissant JSON-LD metadata from the dataset infos.
It's only a subset of the full JSON-LD metadata. See the Hugging Face API `/croissant` endpoint
to get the complete Croissant JSON-LD metadata.
"""
repo_name = "repo"
names: set[str] = set(repo_name)
distribution = [
Expand All @@ -97,13 +92,9 @@ def get_croissant_from_dataset_infos(
}
)
]
identifier = None
_license = None
record_set = []
for info in infos:
description_body = ""
_license = info.get("license")
identifier = _extract_doi_tag(info)
config = info["config_name"]
features = Features.from_dict(info["features"])
fields: list[dict[str, Any]] = []
Expand Down Expand Up @@ -233,12 +224,7 @@ def get_croissant_from_dataset_infos(
{
"@context": context,
"@type": "sc:Dataset",
"name": _escape_name(dataset, names),
"conformsTo": "http://mlcommons.org/croissant/1.0",
"description": f"{dataset} dataset hosted on Hugging Face and contributed by the HF Datasets community",
"identifier": identifier,
"license": _license,
"url": f"https://huggingface.co/datasets/{dataset}",
"distribution": distribution,
"recordSet": record_set,
}
Expand All @@ -253,7 +239,7 @@ def _get_full_jsonld_parameter(request: Request) -> bool:
return True


def create_croissant_endpoint(
def create_croissant_crumbs_endpoint(
hf_endpoint: str,
blocked_datasets: list[str],
hf_token: Optional[str] = None,
Expand All @@ -264,15 +250,16 @@ def create_croissant_endpoint(
max_age_long: int = 0,
max_age_short: int = 0,
storage_clients: Optional[list[StorageClient]] = None,
endpoint_name: str = "croissant-crumbs",
) -> Endpoint:
async def croissant_endpoint(request: Request) -> Response:
endpoint_name = "croissant"
async def croissant_crumbs_endpoint(request: Request) -> Response:
context = f"endpoint: {endpoint_name}"
method = "croissant_crumbs_endpoint"
revision: Optional[str] = None
with StepProfiler(method="croissant_endpoint", step="all", context=context):
with StepProfiler(method=method, step="all", context=context):
try:
with StepProfiler(
method="croissant_endpoint",
method=method,
step="validate parameters and get processing steps",
context=context,
):
Expand All @@ -282,7 +269,7 @@ async def croissant_endpoint(request: Request) -> Response:
if not are_valid_parameters([dataset]):
raise MissingRequiredParameterError("Parameter 'dataset' is required")
# if auth_check fails, it will raise an exception that will be caught below
with StepProfiler(method="croissant_endpoint", step="check authentication", context=context):
with StepProfiler(method=method, step="check authentication", context=context):
await auth_check(
dataset,
external_auth_url=external_auth_url,
Expand All @@ -292,7 +279,7 @@ async def croissant_endpoint(request: Request) -> Response:
hf_timeout_seconds=hf_timeout_seconds,
)
# getting result based on processing steps
with StepProfiler(method="croissant_endpoint", step="get info cache entry", context=context):
with StepProfiler(method=method, step="get info cache entry", context=context):
info_result = get_cache_entry_from_step(
processing_step_name=DATASET_INFO_KIND,
dataset=dataset,
Expand All @@ -311,17 +298,17 @@ async def croissant_endpoint(request: Request) -> Response:
if http_status == HTTPStatus.OK:
infos = list(islice(content["dataset_info"].values(), CROISSANT_MAX_CONFIGS))
partial = content["partial"]
with StepProfiler(method="croissant_endpoint", step="generate croissant json", context=context):
croissant = get_croissant_from_dataset_infos(
with StepProfiler(method=method, step="generate croissant crumbs json", context=context):
croissant_crumbs = get_croissant_crumbs_from_dataset_infos(
dataset=dataset,
infos=infos,
partial=partial,
full_jsonld=full_jsonld,
)
with StepProfiler(method="croissant_endpoint", step="generate OK response", context=context):
return get_json_ok_response(content=croissant, max_age=max_age_long, revision=revision)
with StepProfiler(method=method, step="generate OK response", context=context):
return get_json_ok_response(content=croissant_crumbs, max_age=max_age_long, revision=revision)
else:
with StepProfiler(method="croissant_endpoint", step="generate error response", context=context):
with StepProfiler(method=method, step="generate error response", context=context):
return get_json_error_response(
content=content,
status_code=http_status,
Expand All @@ -331,7 +318,7 @@ async def croissant_endpoint(request: Request) -> Response:
)
except Exception as e:
error = e if isinstance(e, ApiError) else UnexpectedApiError("Unexpected error.", e)
with StepProfiler(method="croissant_endpoint", step="generate API error response", context=context):
with StepProfiler(method=method, step="generate API error response", context=context):
return get_json_api_error_response(error=error, max_age=max_age_short, revision=revision)

return croissant_endpoint
return croissant_crumbs_endpoint
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import pytest

from api.routes.croissant import get_croissant_from_dataset_infos
from api.routes.croissant_crumbs import get_croissant_crumbs_from_dataset_infos

squad_info = {
"description": "Stanford Question Answering Dataset (SQuAD) is a reading comprehension dataset, consisting of questions posed by crowdworkers on a set of Wikipedia articles, where the answer to every question is a segment of text, or span, from the corresponding reading passage, or the question might be unanswerable.\n",
Expand Down Expand Up @@ -92,42 +92,40 @@


def test_get_croissant_context_from_dataset_infos() -> None:
croissant = get_croissant_from_dataset_infos(
croissant_crumbs = get_croissant_crumbs_from_dataset_infos(
"user/squad with space",
[squad_info, squad_info],
partial=False,
full_jsonld=False,
)
assert croissant["@context"] == v1_context
assert croissant_crumbs["@context"] == v1_context


def test_get_croissant_from_dataset_infos() -> None:
croissant = get_croissant_from_dataset_infos(
def test_get_croissant_crumbs_from_dataset_infos() -> None:
croissant_crumbs = get_croissant_crumbs_from_dataset_infos(
"user/squad with space",
[squad_info, squad_info],
partial=False,
full_jsonld=False,
)
assert "@context" in croissant
assert "@type" in croissant
assert "name" in croissant
assert croissant["name"] == "user_squad_with_space"
assert "@context" in croissant_crumbs
assert "@type" in croissant_crumbs

# Test recordSet.
assert "recordSet" in croissant
assert croissant["recordSet"]
assert isinstance(croissant["recordSet"], list)
assert len(croissant["recordSet"]) == 2
assert croissant["recordSet"][0]["@type"] == croissant["recordSet"][1]["@type"] == "cr:RecordSet"
assert croissant["recordSet"][0]["name"] == "record_set_user_squad_with_space"
assert croissant["recordSet"][1]["name"] == "record_set_user_squad_with_space_0"
assert isinstance(croissant["recordSet"][0]["field"], list)
assert "recordSet" in croissant_crumbs
assert croissant_crumbs["recordSet"]
assert isinstance(croissant_crumbs["recordSet"], list)
assert len(croissant_crumbs["recordSet"]) == 2
assert croissant_crumbs["recordSet"][0]["@type"] == croissant_crumbs["recordSet"][1]["@type"] == "cr:RecordSet"
assert croissant_crumbs["recordSet"][0]["name"] == "record_set_user_squad_with_space"
assert croissant_crumbs["recordSet"][1]["name"] == "record_set_user_squad_with_space_0"
assert isinstance(croissant_crumbs["recordSet"][0]["field"], list)
assert isinstance(squad_info["features"], dict)
assert "1 skipped column: answers" in croissant["recordSet"][0]["description"]
assert croissant["recordSet"][0]["@id"] == "record_set_user_squad_with_space"
assert croissant["recordSet"][1]["@id"] == "record_set_user_squad_with_space_0"
for i, _ in enumerate(croissant["recordSet"]):
for field in croissant["recordSet"][i]["field"]:
assert "1 skipped column: answers" in croissant_crumbs["recordSet"][0]["description"]
assert croissant_crumbs["recordSet"][0]["@id"] == "record_set_user_squad_with_space"
assert croissant_crumbs["recordSet"][1]["@id"] == "record_set_user_squad_with_space_0"
for i, _ in enumerate(croissant_crumbs["recordSet"]):
for field in croissant_crumbs["recordSet"][i]["field"]:
assert "source" in field
assert "fileSet" in field["source"]
assert "@id" in field["source"]["fileSet"]
Expand All @@ -136,41 +134,26 @@ def test_get_croissant_from_dataset_infos() -> None:
assert field["source"]["extract"]["column"] == field["@id"].split("/")[-1]

# Test fields.
assert len(croissant["recordSet"][0]["field"]) == 4
assert len(croissant["recordSet"][1]["field"]) == 4
for field in croissant["recordSet"][0]["field"]:
assert len(croissant_crumbs["recordSet"][0]["field"]) == 4
assert len(croissant_crumbs["recordSet"][1]["field"]) == 4
for field in croissant_crumbs["recordSet"][0]["field"]:
assert field["@type"] == "cr:Field"
assert field["dataType"] == "sc:Text"
assert len(croissant["recordSet"][0]["field"]) == len(squad_info["features"]) - 1
assert len(croissant_crumbs["recordSet"][0]["field"]) == len(squad_info["features"]) - 1

# Test distribution.
assert "distribution" in croissant
assert croissant["distribution"]
assert isinstance(croissant["distribution"], list)
assert croissant["distribution"][0]["@type"] == "cr:FileObject"
assert croissant["distribution"][1]["@type"] == "cr:FileSet"
assert croissant["distribution"][2]["@type"] == "cr:FileSet"
assert croissant["distribution"][0]["name"] == "repo"
for distribution in croissant["distribution"]:
assert "distribution" in croissant_crumbs
assert croissant_crumbs["distribution"]
assert isinstance(croissant_crumbs["distribution"], list)
assert croissant_crumbs["distribution"][0]["@type"] == "cr:FileObject"
assert croissant_crumbs["distribution"][1]["@type"] == "cr:FileSet"
assert croissant_crumbs["distribution"][2]["@type"] == "cr:FileSet"
assert croissant_crumbs["distribution"][0]["name"] == "repo"
for distribution in croissant_crumbs["distribution"]:
assert "@id" in distribution
if "containedIn" in distribution:
assert "@id" in distribution["containedIn"]

# Test others.
assert croissant["license"] == ["mit"]
assert croissant["identifier"] == "hf/123456789"

# If the parameter doesn't exist, check that it is not kept:
squad_licenseless_info = squad_info.copy()
del squad_licenseless_info["license"]
croissant = get_croissant_from_dataset_infos(
"user/squad with space",
[squad_licenseless_info, squad_licenseless_info],
partial=False,
full_jsonld=False,
)
assert "license" not in croissant


MAX_COLUMNS = 3

Expand All @@ -182,13 +165,13 @@ def test_get_croissant_from_dataset_infos() -> None:
(False, MAX_COLUMNS),
],
)
def test_get_croissant_from_dataset_infos_max_columns(full_jsonld: bool, num_columns: int) -> None:
with patch("api.routes.croissant.MAX_COLUMNS", MAX_COLUMNS):
croissant = get_croissant_from_dataset_infos(
def test_get_croissant_crumbs_from_dataset_infos_max_columns(full_jsonld: bool, num_columns: int) -> None:
with patch("api.routes.croissant_crumbs.MAX_COLUMNS", MAX_COLUMNS):
croissant_crumbs = get_croissant_crumbs_from_dataset_infos(
"user/squad with space",
[squad_info, squad_info],
partial=False,
full_jsonld=full_jsonld,
)
assert len(croissant["recordSet"][0]["field"]) == num_columns
assert full_jsonld or "max number of columns reached" in croissant["recordSet"][0]["description"]
assert len(croissant_crumbs["recordSet"][0]["field"]) == num_columns
assert full_jsonld or "max number of columns reached" in croissant_crumbs["recordSet"][0]["description"]

0 comments on commit 4dddea2

Please sign in to comment.