From f45af9cf63ceabe9c99474456eb69b54ef58de8b Mon Sep 17 00:00:00 2001 From: Sylvain Lesage Date: Wed, 13 Mar 2024 10:54:52 +0100 Subject: [PATCH] Replace canonical datasets with community ones in the docs/tests (#2579) * replace glue with nyu-mll/glue (or mnist in one case) * remove unused pytest marks (somewhat unrelated to the PR, sorry) * fix ibm/duorc * fix assets URL * add statistics field in docs (from #2577) * replace emotion and c4 with their moved new names * replace squad with its moved version * rename datasets in openapi spec + add missing links to docs * fix test --- docs/source/filter.md | 6 +- docs/source/first_rows.md | 2 +- docs/source/info.md | 112 +++++-------- docs/source/openapi.json | 157 +++++++++--------- docs/source/rows.md | 6 +- docs/source/search.md | 4 +- docs/source/statistics.md | 12 +- docs/source/valid.md | 17 ++ e2e/pyproject.toml | 3 - e2e/tests/test_12_splits.py | 2 +- front/admin_ui/app.py | 26 ++- jobs/cache_maintenance/pyproject.toml | 3 - jobs/mongodb_migration/pyproject.toml | 3 - libs/libcommon/pyproject.toml | 3 +- libs/libcommon/tests/test_duckdb_utils.py | 6 +- libs/libcommon/tests/test_parquet_utils.py | 13 +- services/admin/pyproject.toml | 3 +- services/admin/tests/test_app_real.py | 2 +- services/api/pyproject.toml | 3 +- services/api/tests/test_app_real.py | 4 +- services/rows/pyproject.toml | 4 - services/search/pyproject.toml | 4 - services/worker/pyproject.toml | 3 +- .../job_runners/config/parquet_and_info.py | 4 +- .../config/test_parquet_and_info.py | 8 +- 25 files changed, 195 insertions(+), 215 deletions(-) diff --git a/docs/source/filter.md b/docs/source/filter.md index bd4a21aa6b..34c8b3bf27 100644 --- a/docs/source/filter.md +++ b/docs/source/filter.md @@ -12,7 +12,7 @@ This guide shows you how to use Datasets Server's `/filter` endpoint to filter r Feel free to also try it out with [ReDoc](https://redocly.github.io/redoc/?url=https://datasets-server.huggingface.co/openapi.json#operation/filterRows). The `/filter` endpoint accepts the following query parameters: -- `dataset`: the dataset name, for example `glue` or `mozilla-foundation/common_voice_10_0` +- `dataset`: the dataset name, for example `nyu-mll/glue` or `mozilla-foundation/common_voice_10_0` - `config`: the configuration name, for example `cola` - `split`: the split name, for example `train` - `where`: the filter condition @@ -88,7 +88,7 @@ The endpoint response is a JSON containing two keys (same format as [`/rows`](./ The rows are ordered by the row index. -For example, here are the `features` and the slice 150-151 of matching `rows` of the `duorc`/`SelfRC` train split for the `where` condition `no_answer=true`: +For example, here are the `features` and the slice 150-151 of matching `rows` of the `ibm/duorc`/`SelfRC` train split for the `where` condition `no_answer=true`: ```json { @@ -197,4 +197,4 @@ For example, here are the `features` and the slice 150-151 of matching `rows` of If the result has `partial: true` it means that the filtering couldn't be run on the full dataset because it's too big. -Indeed, the indexing for `/filter` can be partial if the dataset is bigger than 5GB. In that case, it only uses the first 5GB. \ No newline at end of file +Indeed, the indexing for `/filter` can be partial if the dataset is bigger than 5GB. In that case, it only uses the first 5GB.
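For reference, here is a minimal sketch of querying the `/filter` endpoint with Python's `requests`, in the same style as the other examples in these docs. It uses the renamed `ibm/duorc` dataset and the `no_answer=true` condition from the example above; the `API_TOKEN` placeholder and the exact `where` encoding are illustrative assumptions, not an authoritative client.

```python
import requests

API_TOKEN = "hf_xxx"  # placeholder; a token is only needed for gated/private datasets
headers = {"Authorization": f"Bearer {API_TOKEN}"}

API_URL = "https://datasets-server.huggingface.co/filter"
params = {
    "dataset": "ibm/duorc",
    "config": "SelfRC",
    "split": "train",
    "where": "no_answer=true",
    "offset": 150,
    "length": 2,
}

def query():
    # passing `params` lets requests handle URL encoding of the where clause
    response = requests.get(API_URL, params=params, headers=headers)
    return response.json()

data = query()
print(data["features"])  # the response has the same shape as /rows: features + rows
print(data["rows"])
```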
diff --git a/docs/source/first_rows.md b/docs/source/first_rows.md index 7943a11a65..7605f87c03 100644 --- a/docs/source/first_rows.md +++ b/docs/source/first_rows.md @@ -8,7 +8,7 @@ This guide shows you how to use Datasets Server's `/first-rows` endpoint to prev The `/first-rows` endpoint accepts three query parameters: -- `dataset`: the dataset name, for example `glue` or `mozilla-foundation/common_voice_10_0` +- `dataset`: the dataset name, for example `nyu-mll/glue` or `mozilla-foundation/common_voice_10_0` - `config`: the configuration name, for example `cola` - `split`: the split name, for example `train` diff --git a/docs/source/info.md b/docs/source/info.md index a7b75230d8..bc6dcbb4b8 100644 --- a/docs/source/info.md +++ b/docs/source/info.md @@ -51,76 +51,50 @@ The endpoint response is a JSON with the `dataset_info` key. Its structure and c ```json { - "dataset_info":{ - "description":"", - "citation":"", - "homepage":"", - "license":"", - "features":{ - "plot_id":{ - "dtype":"string", - "_type":"Value" - }, - "plot":{ - "dtype":"string", - "_type":"Value" - }, - "title":{ - "dtype":"string", - "_type":"Value" - }, - "question_id":{ - "dtype":"string", - "_type":"Value" - }, - "question":{ - "dtype":"string", - "_type":"Value" - }, - "answers":{ - "feature":{ - "dtype":"string", - "_type":"Value" - }, - "_type":"Sequence" - }, - "no_answer":{ - "dtype":"bool", - "_type":"Value" - } + "dataset_info": { + "description": "", + "citation": "", + "homepage": "", + "license": "", + "features": { + "plot_id": { "dtype": "string", "_type": "Value" }, + "plot": { "dtype": "string", "_type": "Value" }, + "title": { "dtype": "string", "_type": "Value" }, + "question_id": { "dtype": "string", "_type": "Value" }, + "question": { "dtype": "string", "_type": "Value" }, + "answers": { + "feature": { "dtype": "string", "_type": "Value" }, + "_type": "Sequence" }, - "builder_name":"parquet", - "dataset_name":"duorc", - "config_name":"SelfRC", - "version":{ - "version_str":"0.0.0", - "major":0, - "minor":0, - "patch":0 + "no_answer": { "dtype": "bool", "_type": "Value" } + }, + "builder_name": "parquet", + "dataset_name": "duorc", + "config_name": "SelfRC", + "version": { "version_str": "0.0.0", "major": 0, "minor": 0, "patch": 0 }, + "splits": { + "train": { + "name": "train", + "num_bytes": 248966361, + "num_examples": 60721, + "dataset_name": null }, - "splits":{ - "train":{ - "name":"train", - "num_bytes":248966361, - "num_examples":60721, - "dataset_name":null - }, - "validation":{ - "name":"validation", - "num_bytes":56359392, - "num_examples":12961, - "dataset_name":null - }, - "test":{ - "name":"test", - "num_bytes":51022318, - "num_examples":12559, - "dataset_name":null - } + "validation": { + "name": "validation", + "num_bytes": 56359392, + "num_examples": 12961, + "dataset_name": null }, - "download_size":21001846, - "dataset_size":356348071 - }, - "partial":false + "test": { + "name": "test", + "num_bytes": 51022318, + "num_examples": 12559, + "dataset_name": null + } + }, + "download_size": 21001846, + "dataset_size": 356348071 + }, + "partial": false } -``` \ No newline at end of file +``` diff --git a/docs/source/openapi.json b/docs/source/openapi.json index a0c6713085..9bab560cfd 100644 --- a/docs/source/openapi.json +++ b/docs/source/openapi.json @@ -1394,9 +1394,9 @@ "type": "string" }, "examples": { - "glue": { + "mnist": { "summary": "A canonical dataset", - "value": "glue" + "value": "mnist" }, "Helsinki-NLP/tatoeba_mt": { "summary": "A namespaced dataset", @@ -1414,7 +1414,7 
@@ }, "examples": { "cola": { - "summary": "A subset of the glue dataset", + "summary": "A subset of the nyu-mll/glue dataset", "value": "cola" }, "yangdong/ecqa": { @@ -1455,7 +1455,7 @@ }, "examples": { "cola": { - "summary": "A subset of the glue dataset", + "summary": "A subset of the nyu-mll/glue dataset", "value": "cola" }, "yangdong/ecqa": { @@ -1767,12 +1767,12 @@ } }, "splits for a single config": { - "summary": "emotion has two configs. Setting config=unsplit only returns the splits for this config.", - "description": "Try with https://datasets-server.huggingface.co/splits?dataset=emotion&config=unsplit.", + "summary": "dair-ai/emotion has two configs. Setting config=unsplit only returns the splits for this config.", + "description": "Try with https://datasets-server.huggingface.co/splits?dataset=dair-ai/emotion&config=unsplit.", "value": { "splits": [ { - "dataset": "emotion", + "dataset": "dair-ai/emotion", "config": "unsplit", "split": "train" } @@ -4054,8 +4054,8 @@ "$ref": "#/components/schemas/ParquetResponse" }, "examples": { - "duorc": { - "summary": "duorc: six parquet files, one per split", + "ibm/duorc": { + "summary": "ibm/duorc: six parquet files, one per split", "description": "Try with https://datasets-server.huggingface.co/parquet?dataset=ibm/duorc", "value": { "parquet_files": [ @@ -4323,110 +4323,113 @@ }, "partial parquet export": { "summary": "c4 (en): the parquet export is partial (first 5GB)", - "description": "Try with https://datasets-server.huggingface.co/parquet?dataset=c4&config=en", + "description": "Try with https://datasets-server.huggingface.co/parquet?dataset=allenai/c4&config=en", "value": { "parquet_files": [ { - "dataset": "c4", + "dataset": "allenai/c4", "config": "en", "split": "train", - "url": "https://huggingface.co/datasets/c4/resolve/refs%2Fconvert%2Fparquet/en/partial-train/0000.parquet", + "url": "https://huggingface.co/datasets/allenai/c4/resolve/refs%2Fconvert%2Fparquet/en/partial-train/0000.parquet", "filename": "0000.parquet", - "size": 309207547 + "size": 312302655 }, { - "dataset": "c4", + "dataset": "allenai/c4", "config": "en", "split": "train", - "url": "https://huggingface.co/datasets/c4/resolve/refs%2Fconvert%2Fparquet/en/partial-train/0001.parquet", + "url": "https://huggingface.co/datasets/allenai/c4/resolve/refs%2Fconvert%2Fparquet/en/partial-train/0001.parquet", "filename": "0001.parquet", - "size": 308665905 + "size": 314250060 }, { - "dataset": "c4", + "dataset": "allenai/c4", "config": "en", "split": "train", - "url": "https://huggingface.co/datasets/c4/resolve/refs%2Fconvert%2Fparquet/en/partial-train/0002.parquet", + "url": "https://huggingface.co/datasets/allenai/c4/resolve/refs%2Fconvert%2Fparquet/en/partial-train/0002.parquet", "filename": "0002.parquet", - "size": 309066442 + "size": 312268050 }, { - "dataset": "c4", + "dataset": "allenai/c4", "config": "en", "split": "train", - "url": "https://huggingface.co/datasets/c4/resolve/refs%2Fconvert%2Fparquet/en/partial-train/0003.parquet", + "url": "https://huggingface.co/datasets/allenai/c4/resolve/refs%2Fconvert%2Fparquet/en/partial-train/0003.parquet", "filename": "0003.parquet", - "size": 309257276 + "size": 312065965 }, { - "dataset": "c4", + "dataset": "allenai/c4", "config": "en", "split": "train", - "url": "https://huggingface.co/datasets/c4/resolve/refs%2Fconvert%2Fparquet/en/partial-train/0004.parquet", + "url": "https://huggingface.co/datasets/allenai/c4/resolve/refs%2Fconvert%2Fparquet/en/partial-train/0004.parquet", "filename": "0004.parquet", - "size": 
309040649 + "size": 308599130 }, { - "dataset": "c4", + "dataset": "allenai/c4", "config": "en", "split": "train", - "url": "https://huggingface.co/datasets/c4/resolve/refs%2Fconvert%2Fparquet/en/partial-train/0005.parquet", + "url": "https://huggingface.co/datasets/allenai/c4/resolve/refs%2Fconvert%2Fparquet/en/partial-train/0005.parquet", "filename": "0005.parquet", - "size": 308850445 + "size": 312308752 }, { - "dataset": "c4", + "dataset": "allenai/c4", "config": "en", "split": "train", - "url": "https://huggingface.co/datasets/c4/resolve/refs%2Fconvert%2Fparquet/en/partial-train/0006.parquet", + "url": "https://huggingface.co/datasets/allenai/c4/resolve/refs%2Fconvert%2Fparquet/en/partial-train/0006.parquet", "filename": "0006.parquet", - "size": 308432549 + "size": 313118966 }, { - "dataset": "c4", + "dataset": "allenai/c4", "config": "en", "split": "train", - "url": "https://huggingface.co/datasets/c4/resolve/refs%2Fconvert%2Fparquet/en/partial-train/0007.parquet", + "url": "https://huggingface.co/datasets/allenai/c4/resolve/refs%2Fconvert%2Fparquet/en/partial-train/0007.parquet", "filename": "0007.parquet", - "size": 308621018 + "size": 313275039 }, { - "dataset": "c4", + "dataset": "allenai/c4", "config": "en", "split": "train", - "url": "https://huggingface.co/datasets/c4/resolve/refs%2Fconvert%2Fparquet/en/partial-train/0008.parquet", + "url": "https://huggingface.co/datasets/allenai/c4/resolve/refs%2Fconvert%2Fparquet/en/partial-train/0008.parquet", "filename": "0008.parquet", - "size": 309109536 + "size": 312402829 }, { - "dataset": "c4", + "dataset": "allenai/c4", "config": "en", "split": "train", - "url": "https://huggingface.co/datasets/c4/resolve/refs%2Fconvert%2Fparquet/en/partial-train/0009.parquet", + "url": "https://huggingface.co/datasets/allenai/c4/resolve/refs%2Fconvert%2Fparquet/en/partial-train/0009.parquet", "filename": "0009.parquet", - "size": 300817682 + "size": 273854946 }, { - "dataset": "c4", + "dataset": "allenai/c4", "config": "en", "split": "validation", - "url": "https://huggingface.co/datasets/c4/resolve/refs%2Fconvert%2Fparquet/en/partial/validation/0000.parquet", + "url": "https://huggingface.co/datasets/allenai/c4/resolve/refs%2Fconvert%2Fparquet/en/partial-validation/0000.parquet", "filename": "0000.parquet", - "size": 308896113 + "size": 311994499 }, { - "dataset": "c4", + "dataset": "allenai/c4", "config": "en", "split": "validation", - "url": "https://huggingface.co/datasets/c4/resolve/refs%2Fconvert%2Fparquet/en/partial/validation/0001.parquet", + "url": "https://huggingface.co/datasets/allenai/c4/resolve/refs%2Fconvert%2Fparquet/en/partial-validation/0001.parquet", "filename": "0001.parquet", - "size": 200085262 + "size": 197281279 } ], - "pending": [], - "failed": [], + "features": { + "text": { "dtype": "string", "_type": "Value" }, + "timestamp": { "dtype": "string", "_type": "Value" }, + "url": { "dtype": "string", "_type": "Value" } + }, "partial": true - } + } }, "dataset where no parquet file could be created": { "summary": "When the parquet files cannot be created for a configuration, it's listed in 'failed'.", @@ -4834,12 +4837,12 @@ }, "config metadata": { "summary": "metadata for a dataset config", - "description": "Try with https://datasets-server.huggingface.co/info?dataset=glue&config=ax", + "description": "Try with https://datasets-server.huggingface.co/info?dataset=nyu-mll/glue&config=ax", "value": { "dataset_info": { - "description": "GLUE, the General Language Understanding Evaluation benchmark\n(https://gluebenchmark.com/) 
is a collection of resources for training,\nevaluating, and analyzing natural language understanding systems.\n\n", - "citation": "\n@inproceedings{wang2019glue,\n title={{GLUE}: A Multi-Task Benchmark and Analysis Platform for Natural Language Understanding},\n author={Wang, Alex and Singh, Amanpreet and Michael, Julian and Hill, Felix and Levy, Omer and Bowman, Samuel R.},\n note={In the Proceedings of ICLR.},\n year={2019}\n}\n", - "homepage": "https://gluebenchmark.com/diagnostics", + "description": "", + "citation": "", + "homepage": "", "license": "", "features": { "premise": { "dtype": "string", "_type": "Value" }, @@ -4850,32 +4853,20 @@ }, "idx": { "dtype": "int32", "_type": "Value" } }, - "builder_name": "glue", + "builder_name": "parquet", + "dataset_name": "glue", "config_name": "ax", - "version": { - "version_str": "1.0.0", - "description": "", - "major": 1, - "minor": 0, - "patch": 0 - }, + "version": { "version_str": "0.0.0", "major": 0, "minor": 0, "patch": 0 }, "splits": { "test": { "name": "test", - "num_bytes": 237694, + "num_bytes": 243791, "num_examples": 1104, - "dataset_name": "glue" - } - }, - "download_checksums": { - "https://dl.fbaipublicfiles.com/glue/data/AX.tsv": { - "num_bytes": 222257, - "checksum": null + "dataset_name": null } }, - "download_size": 222257, - "dataset_size": 237694, - "size_in_bytes": 459951 + "download_size": 80767, + "dataset_size": 243791 }, "partial": false } @@ -5005,8 +4996,8 @@ "summary": "Get the size of a dataset.", "description": "Returns the size (number of rows, storage) of the dataset. Use the optional config parameter to filter the response.", "externalDocs": { - "description": "See size (Hub docs). The doc is still missing for the endpoint, see https://github.com/huggingface/datasets-server/issues/1664.", - "url": "https://huggingface.co/docs/datasets-server/" + "description": "See size in the Hub docs.", + "url": "https://huggingface.co/docs/datasets-server/size" }, "operationId": "getSize", "security": [ @@ -5094,11 +5085,11 @@ }, "config size": { "summary": "size of a dataset config", - "description": "Try with https://datasets-server.huggingface.co/size?dataset=glue&config=ax", + "description": "Try with https://datasets-server.huggingface.co/size?dataset=nyu-mll/glue&config=ax", "value": { "size": { "config": { - "dataset": "glue", + "dataset": "nyu-mll/glue", "config": "ax", "num_bytes_original_files": 222257, "num_bytes_parquet_files": 80767, @@ -5108,7 +5099,7 @@ }, "splits": [ { - "dataset": "glue", + "dataset": "nyu-mll/glue", "config": "ax", "split": "test", "num_bytes_parquet_files": 80767, @@ -5431,8 +5422,8 @@ "summary": "Descriptive statistics of a split's columns", "description": "Returns descriptive statistics, such as min, max, average, histogram, of the columns of a split.", "externalDocs": { - "description": "See statistics (Hub docs). The doc is still missing for the endpoint, see https://github.com/huggingface/datasets-server/issues/1664.", - "url": "https://huggingface.co/docs/datasets-server/" + "description": "See statistics (Hub docs).", + "url": "https://huggingface.co/docs/datasets-server/statistics" }, "operationId": "getStatistics", "security": [ @@ -5775,9 +5766,9 @@ "partial": false } }, - "A split (glue) with a string (text) column": { + "A split (nyu-mll/glue) with a string (text) column": { "summary": "Statistics on a string column. 
The column 'hypothesis' contains more than 30 different strings, so the statistics are a histogram of the string lengths.", - "description": "Try with https://datasets-server.huggingface.co/statistics?dataset=glue&config=ax&split=test.", + "description": "Try with https://datasets-server.huggingface.co/statistics?dataset=nyu-mll/glue&config=ax&split=test.", "value": { "num_examples": 1104, "statistics": [ @@ -5867,9 +5858,9 @@ "partial": false } }, - "A split (hellaswag) with a string (label) column": { + "A split (Rowan/hellaswag) with a string (label) column": { "summary": "Statistics on a string column. The column 'label' contains less than 30 different strings, so each string is considered as a label, and the statistics are a count per label.", - "description": "Try with https://datasets-server.huggingface.co/statistics?dataset=hellaswag&config=default&split=train.", + "description": "Try with https://datasets-server.huggingface.co/statistics?dataset=Rowan/hellaswag&config=default&split=train.", "value": { "num_examples": 39905, "statistics": [ diff --git a/docs/source/rows.md b/docs/source/rows.md index 21c55e07cb..63209cad22 100644 --- a/docs/source/rows.md +++ b/docs/source/rows.md @@ -21,7 +21,7 @@ or [ReDoc](https://redocly.github.io/redoc/?url=https://datasets-server.huggingf The `/rows` endpoint accepts five query parameters: -- `dataset`: the dataset name, for example `glue` or `mozilla-foundation/common_voice_10_0` +- `dataset`: the dataset name, for example `nyu-mll/glue` or `mozilla-foundation/common_voice_10_0` - `config`: the configuration name, for example `cola` - `split`: the split name, for example `train` - `offset`: the offset of the slice, for example `150` @@ -159,7 +159,7 @@ Image and audio are represented by a URL that points to the file. Images are represented as a JSON object with three fields: -- `src`: URL to the image file +- `src`: URL to the image file. It's a [signed URL](https://docs.aws.amazon.com/AmazonCloudFront/latest/DeveloperGuide/private-content-signed-urls.html) that expires after a certain time. 
- `height`: height (in pixels) of the image - `width`: width (in pixels) of the image @@ -177,7 +177,7 @@ Here is an example of image, from the first row of the cifar100 dataset: "row_idx": 0, "row": { "img": { - "src": "https://datasets-server.huggingface.co/cached-assets/cifar100/--/main/--/cifar100/train/0/img/image.jpg", + "src": "https://datasets-server.huggingface.co/cached-assets/cifar100/--/aadb3af77e9048adbea6b47c21a81e47dd092ae5/--/cifar100/train/0/img/image.jpg?Expires=1710283469&Signature=A1v0cG07nuaBxYbuPR5EUZpJ9Se072SBDr4935gEsOESHGVyeqvd3qmvdsy1fuqbHk0dnx~p6MLtQ-hg3aCBOJ8eIJ5ItIoyYT4riJRuPQC0VFUb~b1maEwU8LRoXXuvrSysSz2QhBbC~ofv6cQudm~~bgGxXWAslDs180KnmPDsMU55ySsKyKQYNEkQKyuYvrGIJbFeg4lEps0f5CEwUstAwRAwlk~mzRpzUDBq7nJ~DcujTlllLv36nJX~too8mMnFn6dCn2nfGOFYwUiyYM73Czv-laLhVaIVUzcuJum90No~KNGzfYeFZpPqktA7MjCzRLf1gz5kA7wBqnY-8Q__&Key-Pair-Id=K3EI6M078Z3AC3", "height": 32, "width": 32 }, diff --git a/docs/source/search.md b/docs/source/search.md index 08351dd355..e1da5c2972 100644 --- a/docs/source/search.md +++ b/docs/source/search.md @@ -15,7 +15,7 @@ The text is searched in the columns of type `string`, even if the values are nes The `/search` endpoint accepts five query parameters: -- `dataset`: the dataset name, for example `glue` or `mozilla-foundation/common_voice_10_0` +- `dataset`: the dataset name, for example `nyu-mll/glue` or `mozilla-foundation/common_voice_10_0` - `config`: the configuration name, for example `cola` - `split`: the split name, for example `train` - `query`: the text to search @@ -71,7 +71,7 @@ The endpoint response is a JSON containing two keys (same format as [`/rows`](./ The rows are ordered by the row index, and the text strings matching the query are not highlighted. -For example, here are the `features` and the slice 150-151 of matching `rows` of the `duorc`/`SelfRC` train split for the query `dog`: +For example, here are the `features` and the slice 150-151 of matching `rows` of the `ibm/duorc`/`SelfRC` train split for the query `dog`: ```json { diff --git a/docs/source/statistics.md b/docs/source/statistics.md index 1d1a58dd76..dbbe137c84 100644 --- a/docs/source/statistics.md +++ b/docs/source/statistics.md @@ -8,18 +8,18 @@ Datasets Server provides a `/statistics` endpoint for fetching some basic statis The `/statistics` endpoint requires three query parameters: -- `dataset`: the dataset name, for example `glue` +- `dataset`: the dataset name, for example `nyu-mll/glue` - `config`: the configuration name, for example `cola` - `split`: the split name, for example `train` -Let's get some stats for `glue` dataset, `cola` config, `train` split: +Let's get some stats for `nyu-mll/glue` dataset, `cola` config, `train` split: ```python import requests headers = {"Authorization": f"Bearer {API_TOKEN}"} -API_URL = "https://datasets-server.huggingface.co/statistics?dataset=glue&config=cola&split=train" +API_URL = "https://datasets-server.huggingface.co/statistics?dataset=nyu-mll/glue&config=cola&split=train" def query(): response = requests.get(API_URL, headers=headers) return response.json() @@ -31,7 +31,7 @@ data = query() import fetch from "node-fetch"; async function query(data) { const response = await fetch( - "https://datasets-server.huggingface.co/statistics?dataset=glue&config=cola&split=train", + "https://datasets-server.huggingface.co/statistics?dataset=nyu-mll/glue&config=cola&split=train", { headers: { Authorization: `Bearer ${API_TOKEN}` }, method: "GET" @@ -47,7 +47,7 @@ query().then((response) => { ```curl -curl 
https://datasets-server.huggingface.co/statistics?dataset=glue&config=cola&split=train \ +curl https://datasets-server.huggingface.co/statistics?dataset=nyu-mll/glue&config=cola&split=train \ -X GET \ -H "Authorization: Bearer ${API_TOKEN}" ``` @@ -425,4 +425,4 @@ If string column does not satisfy the conditions to be treated as a `string_labe ```

- \ No newline at end of file + diff --git a/docs/source/valid.md b/docs/source/valid.md index 6508fd87d0..09f8b66ea0 100644 --- a/docs/source/valid.md +++ b/docs/source/valid.md @@ -68,6 +68,7 @@ The response looks like this if a dataset is valid: "preview": true, "search": true, "filter": true, + "statistics": true, } ``` @@ -79,6 +80,7 @@ The response looks like this if a dataset is valid but /search is not available "preview": true, "search": false, "filter": true, + "statistics": true, } ``` @@ -90,6 +92,19 @@ The response looks like this if a dataset is valid but /filter is not available "preview": true, "search": true, "filter": false, + "statistics": true, +} +``` + +Similarly, if the statistics are not available: + +```json +{ + "viewer": true, + "preview": true, + "search": true, + "filter": true, + "statistics": false, } ``` @@ -101,6 +116,7 @@ If only the first rows of a dataset are available, then the response looks like: "preview": true, "search": true, "filter": true, + "statistics": true, } ``` @@ -112,6 +128,7 @@ Finally, if the dataset is not valid at all, then the response is: "preview": false, "search": false, "filter": false, + "statistics": false, } ``` diff --git a/e2e/pyproject.toml b/e2e/pyproject.toml index 251609f5a1..36503bd8b3 100644 --- a/e2e/pyproject.toml +++ b/e2e/pyproject.toml @@ -24,9 +24,6 @@ requires = ["poetry-core>=1.0.0"] [tool.pytest.ini_options] filterwarnings = ["ignore::DeprecationWarning"] -markers = [ - "wip: tests being developed" -] [tool.mypy] strict = true diff --git a/e2e/tests/test_12_splits.py b/e2e/tests/test_12_splits.py index 65a89002b1..f439b4b6bf 100644 --- a/e2e/tests/test_12_splits.py +++ b/e2e/tests/test_12_splits.py @@ -10,7 +10,7 @@ "status,name,dataset,config,error_code", [ # (200, "all splits in a dataset", "ibm/duorc", None, None), - # (200, "splits for a single config", "emotion", "unsplit", None) + # (200, "splits for a single config", "dair-ai/emotion", "unsplit", None) ( 401, "inexistent dataset, and not authenticated", diff --git a/front/admin_ui/app.py b/front/admin_ui/app.py index f1ff741f39..2fbb3adb11 100644 --- a/front/admin_ui/app.py +++ b/front/admin_ui/app.py @@ -398,13 +398,20 @@ def query_jobs(pending_jobs_query): processing_step.job_type for processing_step in processing_graph.get_topologically_ordered_processing_steps() ] + def on_change_refresh_job_type(job_type): return processing_graph.get_processing_step(job_type).difficulty refresh_type = gr.Dropdown( - job_types, multiselect=False, type="value", label="job type", value=job_types[0] + job_types, + multiselect=False, + type="value", + label="job type", + value=job_types[0], + ) + refresh_dataset_name = gr.Textbox( + label="dataset", placeholder="allenai/c4" ) - refresh_dataset_name = gr.Textbox(label="dataset", placeholder="c4") refresh_config_name = gr.Textbox( label="config (optional)", placeholder="en" ) @@ -415,8 +422,17 @@ def on_change_refresh_job_type(job_type): "*you can select multiple values by separating them with commas, e.g. 
split='train, test'*" ) - refresh_difficulty = gr.Slider(0, 100, processing_graph.get_processing_step(job_types[0]).difficulty, step=10, interactive=True, label="difficulty") - refresh_type.change(on_change_refresh_job_type, refresh_type, refresh_difficulty) + refresh_difficulty = gr.Slider( + 0, + 100, + processing_graph.get_processing_step(job_types[0]).difficulty, + step=10, + interactive=True, + label="difficulty", + ) + refresh_type.change( + on_change_refresh_job_type, refresh_type, refresh_difficulty + ) refresh_priority = gr.Dropdown( ["low", "normal", "high"], @@ -551,7 +567,7 @@ def delete_and_recreate_dataset( outputs=delete_and_recreate_dataset_output, ) with gr.Tab("Dataset status"): - dataset_name = gr.Textbox(label="dataset", placeholder="c4") + dataset_name = gr.Textbox(label="dataset", placeholder="allenai/c4") dataset_status_button = gr.Button("Get dataset status") gr.Markdown("### Pending jobs") jobs_table = gr.DataFrame() diff --git a/jobs/cache_maintenance/pyproject.toml b/jobs/cache_maintenance/pyproject.toml index c9f312670d..8c108be8cf 100644 --- a/jobs/cache_maintenance/pyproject.toml +++ b/jobs/cache_maintenance/pyproject.toml @@ -24,9 +24,6 @@ requires = ["poetry-core>=1.0.0"] [tool.pytest.ini_options] filterwarnings = ["ignore::DeprecationWarning"] -markers = [ - "wip: tests being developed" -] [tool.mypy] strict = true diff --git a/jobs/mongodb_migration/pyproject.toml b/jobs/mongodb_migration/pyproject.toml index ded39be0cd..b58e2087a5 100644 --- a/jobs/mongodb_migration/pyproject.toml +++ b/jobs/mongodb_migration/pyproject.toml @@ -23,9 +23,6 @@ requires = ["poetry-core>=1.0.0"] [tool.pytest.ini_options] filterwarnings = ["ignore::DeprecationWarning"] -markers = [ - "wip: tests being developed" -] [tool.mypy] strict = true diff --git a/libs/libcommon/pyproject.toml b/libs/libcommon/pyproject.toml index 633ea4962d..ee08fcb4d1 100644 --- a/libs/libcommon/pyproject.toml +++ b/libs/libcommon/pyproject.toml @@ -51,8 +51,7 @@ requires = ["poetry-core>=1.0.0"] [tool.pytest.ini_options] filterwarnings = ["ignore::DeprecationWarning"] markers = [ - "real_dataset: tests on the Hub", - "wip: tests being developed" + "real_dataset: tests on the Hub" ] [tool.mypy] diff --git a/libs/libcommon/tests/test_duckdb_utils.py b/libs/libcommon/tests/test_duckdb_utils.py index 3d8da718be..ec24bec601 100644 --- a/libs/libcommon/tests/test_duckdb_utils.py +++ b/libs/libcommon/tests/test_duckdb_utils.py @@ -3,11 +3,11 @@ def test_duckdb_index_is_partial() -> None: assert duckdb_index_is_partial( - "https://hf.co/datasets/c4/resolve/refs%2Fconvert%2Fparquet/en/partial-train/index.duckdb" + "https://hf.co/datasets/canonical/resolve/refs%2Fconvert%2Fduckdb/en/partial-train/index.duckdb" ) assert duckdb_index_is_partial( - "https://hf.co/datasets/bigcode/the-stack/resolve/refs%2Fconvert%2Fparquet/default/train/partial-index.duckdb" + "https://hf.co/datasets/organization/not-canonical/resolve/refs%2Fconvert%2Fduckdb/default/train/partial-index.duckdb" ) assert not duckdb_index_is_partial( - "https://hf.co/datasets/squad/resolve/refs%2Fconvert%2Fparquet/plain_text/train/index.duckdb" + "https://hf.co/datasets/rajpurkar/squad/resolve/refs%2Fconvert%2Fduckdb/plain_text/train/index.duckdb" ) diff --git a/libs/libcommon/tests/test_parquet_utils.py b/libs/libcommon/tests/test_parquet_utils.py index 141bd1ab89..21b5285b39 100644 --- a/libs/libcommon/tests/test_parquet_utils.py +++ b/libs/libcommon/tests/test_parquet_utils.py @@ -402,13 +402,13 @@ def indexer( def test_parquet_export_is_partial() 
-> None: assert parquet_export_is_partial( - "https://hf.co/datasets/c4/resolve/refs%2Fconvert%2Fparquet/en/partial-train/0000.parquet" + "https://hf.co/datasets/canonical/resolve/refs%2Fconvert%2Fparquet/en/partial-train/0000.parquet" ) assert not parquet_export_is_partial( - "https://hf.co/datasets/bigcode/the-stack/resolve/refs%2Fconvert%2Fparquet/default/train/0000.parquet" + "https://hf.co/datasets/organization/not-canonical/resolve/refs%2Fconvert%2Fparquet/default/train/0000.parquet" ) assert not parquet_export_is_partial( - "https://hf.co/datasets/squad/resolve/refs%2Fconvert%2Fparquet/plain_text/train/0000.parquet" + "https://hf.co/datasets/rajpurkar/squad/resolve/refs%2Fconvert%2Fparquet/plain_text/train/0000.parquet" ) @@ -494,9 +494,12 @@ def test_indexer_schema_mistmatch_error( @pytest.mark.parametrize( "parquet_url,expected", [ - ("https://hf.co/datasets/squad/resolve/refs%2Fconvert%2Fparquet/plain_text/train/0000.parquet", "train"), ( - "https://hf.co/datasets/squad/resolve/refs%2Fconvert%2Fparquet/plain_text/partial-test/0000.parquet", + "https://hf.co/datasets/rajpurkar/squad/resolve/refs%2Fconvert%2Fparquet/plain_text/train/0000.parquet", + "train", + ), + ( + "https://hf.co/datasets/rajpurkar/squad/resolve/refs%2Fconvert%2Fparquet/plain_text/partial-test/0000.parquet", "partial-test", ), ], diff --git a/services/admin/pyproject.toml b/services/admin/pyproject.toml index 03d82fbe3a..4a3289e520 100644 --- a/services/admin/pyproject.toml +++ b/services/admin/pyproject.toml @@ -32,8 +32,7 @@ requires = ["poetry-core>=1.0.0"] [tool.pytest.ini_options] filterwarnings = ["ignore::DeprecationWarning"] markers = [ - "real_dataset: tests on the Hub", - "wip: tests being developed" + "real_dataset: tests on the Hub" ] [tool.mypy] diff --git a/services/admin/tests/test_app_real.py b/services/admin/tests/test_app_real.py index da2cf2ae46..649293f21a 100644 --- a/services/admin/tests/test_app_real.py +++ b/services/admin/tests/test_app_real.py @@ -43,7 +43,7 @@ def test_force_refresh( real_app_config: AppConfig, real_client: TestClient, ) -> None: - dataset = "glue" + dataset = "nyu-mll/glue" first_step = processing_graph.get_processing_steps(order="topological")[0] path = first_step.job_type response = real_client.request("post", f"/force-refresh/{path}?dataset={dataset}") diff --git a/services/api/pyproject.toml b/services/api/pyproject.toml index a0042e839f..e0d12592a5 100644 --- a/services/api/pyproject.toml +++ b/services/api/pyproject.toml @@ -31,8 +31,7 @@ requires = ["poetry-core>=1.0.0"] [tool.pytest.ini_options] filterwarnings = ["ignore::DeprecationWarning"] markers = [ - "real_dataset: tests on the Hub", - "wip: tests being developed" + "real_dataset: tests on the Hub" ] [tool.mypy] diff --git a/services/api/tests/test_app_real.py b/services/api/tests/test_app_real.py index 49bf96d257..658bc51700 100644 --- a/services/api/tests/test_app_real.py +++ b/services/api/tests/test_app_real.py @@ -46,7 +46,7 @@ def test_webhook_untrusted( ) -> None: payload = { "event": "add", - "repo": {"type": "dataset", "name": "glue", "gitalyUid": "123", "headSha": "revision"}, + "repo": {"type": "dataset", "name": "nyu-mll/glue", "gitalyUid": "123", "headSha": "revision"}, "scope": "repo", } response = real_client.post("/webhook", json=payload) @@ -57,7 +57,7 @@ def test_webhook_untrusted( def test_webhook_trusted(real_client: TestClient) -> None: payload = { "event": "add", - "repo": {"type": "dataset", "name": "glue", "gitalyUid": "123", "headSha": "revision"}, + "repo": {"type": "dataset", 
"name": "nyu-mll/glue", "gitalyUid": "123", "headSha": "revision"}, "scope": "repo", } response = real_client.post("/webhook", json=payload, headers={"x-webhook-secret": API_HF_WEBHOOK_SECRET}) diff --git a/services/rows/pyproject.toml b/services/rows/pyproject.toml index e5871f7949..f44936e9c3 100644 --- a/services/rows/pyproject.toml +++ b/services/rows/pyproject.toml @@ -32,10 +32,6 @@ requires = ["poetry-core>=1.0.0"] [tool.pytest.ini_options] filterwarnings = ["ignore::DeprecationWarning"] -markers = [ - "real_dataset: tests on the Hub", - "wip: tests being developed" -] [tool.mypy] strict = true diff --git a/services/search/pyproject.toml b/services/search/pyproject.toml index 5c2fbd003b..8c7146b58e 100644 --- a/services/search/pyproject.toml +++ b/services/search/pyproject.toml @@ -31,10 +31,6 @@ requires = ["poetry-core>=1.0.0"] [tool.pytest.ini_options] filterwarnings = ["ignore::DeprecationWarning"] -markers = [ - "real_dataset: tests on the Hub", - "wip: tests being developed" -] [tool.mypy] strict = true diff --git a/services/worker/pyproject.toml b/services/worker/pyproject.toml index 6f80c68cc5..6f67f13cee 100644 --- a/services/worker/pyproject.toml +++ b/services/worker/pyproject.toml @@ -72,8 +72,7 @@ requires = ["poetry-core>=1.0.0"] [tool.pytest.ini_options] filterwarnings = ["ignore::DeprecationWarning"] markers = [ - "real_dataset: tests on the Hub", - "wip: tests being developed" + "real_dataset: tests on the Hub" ] [tool.mypy] diff --git a/services/worker/src/worker/job_runners/config/parquet_and_info.py b/services/worker/src/worker/job_runners/config/parquet_and_info.py index e02e935fe9..6b1f2bf951 100644 --- a/services/worker/src/worker/job_runners/config/parquet_and_info.py +++ b/services/worker/src/worker/job_runners/config/parquet_and_info.py @@ -751,7 +751,7 @@ class limit_parquet_writes: Example of usage: ```python - builder = load_dataset_builder("squad") + builder = load_dataset_builder("rajpurkar/squad") max_dataset_size_bytes = 10_000_000 with limit_parquet_writes(builder, max_dataset_size_bytes=max_dataset_size_bytes) as limiter: builder.download_and_prepare(file_format="parquet") @@ -762,7 +762,7 @@ class limit_parquet_writes: the full dataset: ```python - builder = load_dataset_builder("squad") + builder = load_dataset_builder("rajpurkar/squad") max_dataset_size_bytes = 10_000_000 dl_manager = StreamingDownloadManager(...) 
for split_generator in builder._split_generators(dl_manager): diff --git a/services/worker/tests/job_runners/config/test_parquet_and_info.py b/services/worker/tests/job_runners/config/test_parquet_and_info.py index 02cca837c3..55c183be8f 100644 --- a/services/worker/tests/job_runners/config/test_parquet_and_info.py +++ b/services/worker/tests/job_runners/config/test_parquet_and_info.py @@ -855,12 +855,12 @@ def test_resolve_trust_remote_code() -> None: resolve_trust_remote_code("lhoestq/demo1", allow_list=["{{ALL_DATASETS_WITH_NO_NAMESPACE}}", "lhoestq/d*"]) is True ) - assert resolve_trust_remote_code("squad", allow_list=[]) is False - assert resolve_trust_remote_code("squad", allow_list=["{{ALL_DATASETS_WITH_NO_NAMESPACE}}"]) is True - assert resolve_trust_remote_code("squad", allow_list=["{{ALL_DATASETS_WITH_NO_NAMESPACE}}", "lhoestq/s*"]) is True + assert resolve_trust_remote_code("mnist", allow_list=[]) is False + assert resolve_trust_remote_code("mnist", allow_list=["{{ALL_DATASETS_WITH_NO_NAMESPACE}}"]) is True + assert resolve_trust_remote_code("mnist", allow_list=["{{ALL_DATASETS_WITH_NO_NAMESPACE}}", "lhoestq/s*"]) is True assert ( resolve_trust_remote_code( - "lhoestq/custom_squad", allow_list=["{{ALL_DATASETS_WITH_NO_NAMESPACE}}", "lhoestq/d*"] + "lhoestq/custom_mnist", allow_list=["{{ALL_DATASETS_WITH_NO_NAMESPACE}}", "lhoestq/d*"] ) is False )
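To make the allow-list semantics exercised by these assertions easier to follow, here is a small illustrative sketch. It is not the worker's actual `resolve_trust_remote_code` implementation; the handling of the `{{ALL_DATASETS_WITH_NO_NAMESPACE}}` sentinel and the glob matching via `fnmatch` are assumptions chosen only to reproduce the behaviour the test expects.

```python
from fnmatch import fnmatch

NO_NAMESPACE_SENTINEL = "{{ALL_DATASETS_WITH_NO_NAMESPACE}}"

def is_dataset_allowed(dataset: str, allow_list: list[str]) -> bool:
    """Hypothetical allow-list check: a dataset is allowed if any entry matches.

    - the sentinel entry matches every dataset without a namespace (no "/")
    - any other entry is treated as a glob pattern on the full dataset name
    """
    for entry in allow_list:
        if entry == NO_NAMESPACE_SENTINEL:
            if "/" not in dataset:
                return True
        elif fnmatch(dataset, entry):
            return True
    return False

# Mirrors the assertions in the test above (hypothetical helper, not the real API):
assert is_dataset_allowed("mnist", allow_list=[]) is False
assert is_dataset_allowed("mnist", allow_list=[NO_NAMESPACE_SENTINEL]) is True
assert is_dataset_allowed("lhoestq/demo1", allow_list=[NO_NAMESPACE_SENTINEL, "lhoestq/d*"]) is True
assert is_dataset_allowed("lhoestq/custom_mnist", allow_list=[NO_NAMESPACE_SENTINEL, "lhoestq/d*"]) is False
```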