From f45af9cf63ceabe9c99474456eb69b54ef58de8b Mon Sep 17 00:00:00 2001 From: Sylvain Lesage Date: Wed, 13 Mar 2024 10:54:52 +0100 Subject: [PATCH] Replace canonical datasets with community ones in the docs/tests (#2579) * replace glue with nyu-mll/glue (or mnist in one case) * remove unused pytest marks (somewhat unrelated to the PR, sorry) * fix ibm/duorc * fix assets URL * add statistics field in docs (from #2577) * replace emotion and c4 with their moved new names * replace squad with its moved version * rename datasets in openapi spec + add missing links to docs * fix test --- docs/source/filter.md | 6 +- docs/source/first_rows.md | 2 +- docs/source/info.md | 112 +++++-------- docs/source/openapi.json | 157 +++++++++--------- docs/source/rows.md | 6 +- docs/source/search.md | 4 +- docs/source/statistics.md | 12 +- docs/source/valid.md | 17 ++ e2e/pyproject.toml | 3 - e2e/tests/test_12_splits.py | 2 +- front/admin_ui/app.py | 26 ++- jobs/cache_maintenance/pyproject.toml | 3 - jobs/mongodb_migration/pyproject.toml | 3 - libs/libcommon/pyproject.toml | 3 +- libs/libcommon/tests/test_duckdb_utils.py | 6 +- libs/libcommon/tests/test_parquet_utils.py | 13 +- services/admin/pyproject.toml | 3 +- services/admin/tests/test_app_real.py | 2 +- services/api/pyproject.toml | 3 +- services/api/tests/test_app_real.py | 4 +- services/rows/pyproject.toml | 4 - services/search/pyproject.toml | 4 - services/worker/pyproject.toml | 3 +- .../job_runners/config/parquet_and_info.py | 4 +- .../config/test_parquet_and_info.py | 8 +- 25 files changed, 195 insertions(+), 215 deletions(-) diff --git a/docs/source/filter.md b/docs/source/filter.md index bd4a21aa6b..34c8b3bf27 100644 --- a/docs/source/filter.md +++ b/docs/source/filter.md @@ -12,7 +12,7 @@ This guide shows you how to use Datasets Server's `/filter` endpoint to filter r Feel free to also try it out with [ReDoc](https://redocly.github.io/redoc/?url=https://datasets-server.huggingface.co/openapi.json#operation/filterRows). The `/filter` endpoint accepts the following query parameters: -- `dataset`: the dataset name, for example `glue` or `mozilla-foundation/common_voice_10_0` +- `dataset`: the dataset name, for example `nyu-mll/glue` or `mozilla-foundation/common_voice_10_0` - `config`: the configuration name, for example `cola` - `split`: the split name, for example `train` - `where`: the filter condition @@ -88,7 +88,7 @@ The endpoint response is a JSON containing two keys (same format as [`/rows`](./ The rows are ordered by the row index. -For example, here are the `features` and the slice 150-151 of matching `rows` of the `duorc`/`SelfRC` train split for the `where` condition `no_answer=true`: +For example, here are the `features` and the slice 150-151 of matching `rows` of the `ibm/duorc`/`SelfRC` train split for the `where` condition `no_answer=true`: ```json { @@ -197,4 +197,4 @@ For example, here are the `features` and the slice 150-151 of matching `rows` of If the result has `partial: true` it means that the filtering couldn't be run on the full dataset because it's too big. -Indeed, the indexing for `/filter` can be partial if the dataset is bigger than 5GB. In that case, it only uses the first 5GB. \ No newline at end of file +Indeed, the indexing for `/filter` can be partial if the dataset is bigger than 5GB. In that case, it only uses the first 5GB.
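For reference, here is a minimal sketch of querying the `/filter` endpoint with Python's `requests`, in the same style as the other examples in these docs. It uses the renamed `ibm/duorc` dataset and the `no_answer=true` condition from the example above; the `API_TOKEN` placeholder and the exact `where` encoding are illustrative assumptions, not an authoritative client.

```python
import requests

API_TOKEN = "hf_xxx"  # placeholder; a token is only needed for gated/private datasets
headers = {"Authorization": f"Bearer {API_TOKEN}"}

API_URL = "https://datasets-server.huggingface.co/filter"
params = {
    "dataset": "ibm/duorc",
    "config": "SelfRC",
    "split": "train",
    "where": "no_answer=true",
    "offset": 150,
    "length": 2,
}

def query():
    # passing `params` lets requests handle URL encoding of the where clause
    response = requests.get(API_URL, params=params, headers=headers)
    return response.json()

data = query()
print(data["features"])  # the response has the same shape as /rows: features + rows
print(data["rows"])
```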
diff --git a/docs/source/first_rows.md b/docs/source/first_rows.md index 7943a11a65..7605f87c03 100644 --- a/docs/source/first_rows.md +++ b/docs/source/first_rows.md @@ -8,7 +8,7 @@ This guide shows you how to use Datasets Server's `/first-rows` endpoint to prev The `/first-rows` endpoint accepts three query parameters: -- `dataset`: the dataset name, for example `glue` or `mozilla-foundation/common_voice_10_0` +- `dataset`: the dataset name, for example `nyu-mll/glue` or `mozilla-foundation/common_voice_10_0` - `config`: the configuration name, for example `cola` - `split`: the split name, for example `train` diff --git a/docs/source/info.md b/docs/source/info.md index a7b75230d8..bc6dcbb4b8 100644 --- a/docs/source/info.md +++ b/docs/source/info.md @@ -51,76 +51,50 @@ The endpoint response is a JSON with the `dataset_info` key. Its structure and c ```json { - "dataset_info":{ - "description":"", - "citation":"", - "homepage":"", - "license":"", - "features":{ - "plot_id":{ - "dtype":"string", - "_type":"Value" - }, - "plot":{ - "dtype":"string", - "_type":"Value" - }, - "title":{ - "dtype":"string", - "_type":"Value" - }, - "question_id":{ - "dtype":"string", - "_type":"Value" - }, - "question":{ - "dtype":"string", - "_type":"Value" - }, - "answers":{ - "feature":{ - "dtype":"string", - "_type":"Value" - }, - "_type":"Sequence" - }, - "no_answer":{ - "dtype":"bool", - "_type":"Value" - } + "dataset_info": { + "description": "", + "citation": "", + "homepage": "", + "license": "", + "features": { + "plot_id": { "dtype": "string", "_type": "Value" }, + "plot": { "dtype": "string", "_type": "Value" }, + "title": { "dtype": "string", "_type": "Value" }, + "question_id": { "dtype": "string", "_type": "Value" }, + "question": { "dtype": "string", "_type": "Value" }, + "answers": { + "feature": { "dtype": "string", "_type": "Value" }, + "_type": "Sequence" }, - "builder_name":"parquet", - "dataset_name":"duorc", - "config_name":"SelfRC", - "version":{ - "version_str":"0.0.0", - "major":0, - "minor":0, - "patch":0 + "no_answer": { "dtype": "bool", "_type": "Value" } + }, + "builder_name": "parquet", + "dataset_name": "duorc", + "config_name": "SelfRC", + "version": { "version_str": "0.0.0", "major": 0, "minor": 0, "patch": 0 }, + "splits": { + "train": { + "name": "train", + "num_bytes": 248966361, + "num_examples": 60721, + "dataset_name": null }, - "splits":{ - "train":{ - "name":"train", - "num_bytes":248966361, - "num_examples":60721, - "dataset_name":null - }, - "validation":{ - "name":"validation", - "num_bytes":56359392, - "num_examples":12961, - "dataset_name":null - }, - "test":{ - "name":"test", - "num_bytes":51022318, - "num_examples":12559, - "dataset_name":null - } + "validation": { + "name": "validation", + "num_bytes": 56359392, + "num_examples": 12961, + "dataset_name": null }, - "download_size":21001846, - "dataset_size":356348071 - }, - "partial":false + "test": { + "name": "test", + "num_bytes": 51022318, + "num_examples": 12559, + "dataset_name": null + } + }, + "download_size": 21001846, + "dataset_size": 356348071 + }, + "partial": false } -``` \ No newline at end of file +``` diff --git a/docs/source/openapi.json b/docs/source/openapi.json index a0c6713085..9bab560cfd 100644 --- a/docs/source/openapi.json +++ b/docs/source/openapi.json @@ -1394,9 +1394,9 @@ "type": "string" }, "examples": { - "glue": { + "mnist": { "summary": "A canonical dataset", - "value": "glue" + "value": "mnist" }, "Helsinki-NLP/tatoeba_mt": { "summary": "A namespaced dataset", @@ -1414,7 +1414,7 
@@ }, "examples": { "cola": { - "summary": "A subset of the glue dataset", + "summary": "A subset of the nyu-mll/glue dataset", "value": "cola" }, "yangdong/ecqa": { @@ -1455,7 +1455,7 @@ }, "examples": { "cola": { - "summary": "A subset of the glue dataset", + "summary": "A subset of the nyu-mll/glue dataset", "value": "cola" }, "yangdong/ecqa": { @@ -1767,12 +1767,12 @@ } }, "splits for a single config": { - "summary": "emotion has two configs. Setting config=unsplit only returns the splits for this config.", - "description": "Try with https://datasets-server.huggingface.co/splits?dataset=emotion&config=unsplit.", + "summary": "dair-ai/emotion has two configs. Setting config=unsplit only returns the splits for this config.", + "description": "Try with https://datasets-server.huggingface.co/splits?dataset=dair-ai/emotion&config=unsplit.", "value": { "splits": [ { - "dataset": "emotion", + "dataset": "dair-ai/emotion", "config": "unsplit", "split": "train" } @@ -4054,8 +4054,8 @@ "$ref": "#/components/schemas/ParquetResponse" }, "examples": { - "duorc": { - "summary": "duorc: six parquet files, one per split", + "ibm/duorc": { + "summary": "ibm/duorc: six parquet files, one per split", "description": "Try with https://datasets-server.huggingface.co/parquet?dataset=ibm/duorc", "value": { "parquet_files": [ @@ -4323,110 +4323,113 @@ }, "partial parquet export": { "summary": "c4 (en): the parquet export is partial (first 5GB)", - "description": "Try with https://datasets-server.huggingface.co/parquet?dataset=c4&config=en", + "description": "Try with https://datasets-server.huggingface.co/parquet?dataset=allenai/c4&config=en", "value": { "parquet_files": [ { - "dataset": "c4", + "dataset": "allenai/c4", "config": "en", "split": "train", - "url": "https://huggingface.co/datasets/c4/resolve/refs%2Fconvert%2Fparquet/en/partial-train/0000.parquet", + "url": "https://huggingface.co/datasets/allenai/c4/resolve/refs%2Fconvert%2Fparquet/en/partial-train/0000.parquet", "filename": "0000.parquet", - "size": 309207547 + "size": 312302655 }, { - "dataset": "c4", + "dataset": "allenai/c4", "config": "en", "split": "train", - "url": "https://huggingface.co/datasets/c4/resolve/refs%2Fconvert%2Fparquet/en/partial-train/0001.parquet", + "url": "https://huggingface.co/datasets/allenai/c4/resolve/refs%2Fconvert%2Fparquet/en/partial-train/0001.parquet", "filename": "0001.parquet", - "size": 308665905 + "size": 314250060 }, { - "dataset": "c4", + "dataset": "allenai/c4", "config": "en", "split": "train", - "url": "https://huggingface.co/datasets/c4/resolve/refs%2Fconvert%2Fparquet/en/partial-train/0002.parquet", + "url": "https://huggingface.co/datasets/allenai/c4/resolve/refs%2Fconvert%2Fparquet/en/partial-train/0002.parquet", "filename": "0002.parquet", - "size": 309066442 + "size": 312268050 }, { - "dataset": "c4", + "dataset": "allenai/c4", "config": "en", "split": "train", - "url": "https://huggingface.co/datasets/c4/resolve/refs%2Fconvert%2Fparquet/en/partial-train/0003.parquet", + "url": "https://huggingface.co/datasets/allenai/c4/resolve/refs%2Fconvert%2Fparquet/en/partial-train/0003.parquet", "filename": "0003.parquet", - "size": 309257276 + "size": 312065965 }, { - "dataset": "c4", + "dataset": "allenai/c4", "config": "en", "split": "train", - "url": "https://huggingface.co/datasets/c4/resolve/refs%2Fconvert%2Fparquet/en/partial-train/0004.parquet", + "url": "https://huggingface.co/datasets/allenai/c4/resolve/refs%2Fconvert%2Fparquet/en/partial-train/0004.parquet", "filename": "0004.parquet", - "size": 
309040649 + "size": 308599130 }, { - "dataset": "c4", + "dataset": "allenai/c4", "config": "en", "split": "train", - "url": "https://huggingface.co/datasets/c4/resolve/refs%2Fconvert%2Fparquet/en/partial-train/0005.parquet", + "url": "https://huggingface.co/datasets/allenai/c4/resolve/refs%2Fconvert%2Fparquet/en/partial-train/0005.parquet", "filename": "0005.parquet", - "size": 308850445 + "size": 312308752 }, { - "dataset": "c4", + "dataset": "allenai/c4", "config": "en", "split": "train", - "url": "https://huggingface.co/datasets/c4/resolve/refs%2Fconvert%2Fparquet/en/partial-train/0006.parquet", + "url": "https://huggingface.co/datasets/allenai/c4/resolve/refs%2Fconvert%2Fparquet/en/partial-train/0006.parquet", "filename": "0006.parquet", - "size": 308432549 + "size": 313118966 }, { - "dataset": "c4", + "dataset": "allenai/c4", "config": "en", "split": "train", - "url": "https://huggingface.co/datasets/c4/resolve/refs%2Fconvert%2Fparquet/en/partial-train/0007.parquet", + "url": "https://huggingface.co/datasets/allenai/c4/resolve/refs%2Fconvert%2Fparquet/en/partial-train/0007.parquet", "filename": "0007.parquet", - "size": 308621018 + "size": 313275039 }, { - "dataset": "c4", + "dataset": "allenai/c4", "config": "en", "split": "train", - "url": "https://huggingface.co/datasets/c4/resolve/refs%2Fconvert%2Fparquet/en/partial-train/0008.parquet", + "url": "https://huggingface.co/datasets/allenai/c4/resolve/refs%2Fconvert%2Fparquet/en/partial-train/0008.parquet", "filename": "0008.parquet", - "size": 309109536 + "size": 312402829 }, { - "dataset": "c4", + "dataset": "allenai/c4", "config": "en", "split": "train", - "url": "https://huggingface.co/datasets/c4/resolve/refs%2Fconvert%2Fparquet/en/partial-train/0009.parquet", + "url": "https://huggingface.co/datasets/allenai/c4/resolve/refs%2Fconvert%2Fparquet/en/partial-train/0009.parquet", "filename": "0009.parquet", - "size": 300817682 + "size": 273854946 }, { - "dataset": "c4", + "dataset": "allenai/c4", "config": "en", "split": "validation", - "url": "https://huggingface.co/datasets/c4/resolve/refs%2Fconvert%2Fparquet/en/partial/validation/0000.parquet", + "url": "https://huggingface.co/datasets/allenai/c4/resolve/refs%2Fconvert%2Fparquet/en/partial-validation/0000.parquet", "filename": "0000.parquet", - "size": 308896113 + "size": 311994499 }, { - "dataset": "c4", + "dataset": "allenai/c4", "config": "en", "split": "validation", - "url": "https://huggingface.co/datasets/c4/resolve/refs%2Fconvert%2Fparquet/en/partial/validation/0001.parquet", + "url": "https://huggingface.co/datasets/allenai/c4/resolve/refs%2Fconvert%2Fparquet/en/partial-validation/0001.parquet", "filename": "0001.parquet", - "size": 200085262 + "size": 197281279 } ], - "pending": [], - "failed": [], + "features": { + "text": { "dtype": "string", "_type": "Value" }, + "timestamp": { "dtype": "string", "_type": "Value" }, + "url": { "dtype": "string", "_type": "Value" } + }, "partial": true - } + } }, "dataset where no parquet file could be created": { "summary": "When the parquet files cannot be created for a configuration, it's listed in 'failed'.", @@ -4834,12 +4837,12 @@ }, "config metadata": { "summary": "metadata for a dataset config", - "description": "Try with https://datasets-server.huggingface.co/info?dataset=glue&config=ax", + "description": "Try with https://datasets-server.huggingface.co/info?dataset=nyu-mll/glue&config=ax", "value": { "dataset_info": { - "description": "GLUE, the General Language Understanding Evaluation benchmark\n(https://gluebenchmark.com/) 
is a collection of resources for training,\nevaluating, and analyzing natural language understanding systems.\n\n", - "citation": "\n@inproceedings{wang2019glue,\n title={{GLUE}: A Multi-Task Benchmark and Analysis Platform for Natural Language Understanding},\n author={Wang, Alex and Singh, Amanpreet and Michael, Julian and Hill, Felix and Levy, Omer and Bowman, Samuel R.},\n note={In the Proceedings of ICLR.},\n year={2019}\n}\n", - "homepage": "https://gluebenchmark.com/diagnostics", + "description": "", + "citation": "", + "homepage": "", "license": "", "features": { "premise": { "dtype": "string", "_type": "Value" }, @@ -4850,32 +4853,20 @@ }, "idx": { "dtype": "int32", "_type": "Value" } }, - "builder_name": "glue", + "builder_name": "parquet", + "dataset_name": "glue", "config_name": "ax", - "version": { - "version_str": "1.0.0", - "description": "", - "major": 1, - "minor": 0, - "patch": 0 - }, + "version": { "version_str": "0.0.0", "major": 0, "minor": 0, "patch": 0 }, "splits": { "test": { "name": "test", - "num_bytes": 237694, + "num_bytes": 243791, "num_examples": 1104, - "dataset_name": "glue" - } - }, - "download_checksums": { - "https://dl.fbaipublicfiles.com/glue/data/AX.tsv": { - "num_bytes": 222257, - "checksum": null + "dataset_name": null } }, - "download_size": 222257, - "dataset_size": 237694, - "size_in_bytes": 459951 + "download_size": 80767, + "dataset_size": 243791 }, "partial": false } @@ -5005,8 +4996,8 @@ "summary": "Get the size of a dataset.", "description": "Returns the size (number of rows, storage) of the dataset. Use the optional config parameter to filter the response.", "externalDocs": { - "description": "See size (Hub docs). The doc is still missing for the endpoint, see https://github.com/huggingface/datasets-server/issues/1664.", - "url": "https://huggingface.co/docs/datasets-server/" + "description": "See size in the Hub docs.", + "url": "https://huggingface.co/docs/datasets-server/size" }, "operationId": "getSize", "security": [ @@ -5094,11 +5085,11 @@ }, "config size": { "summary": "size of a dataset config", - "description": "Try with https://datasets-server.huggingface.co/size?dataset=glue&config=ax", + "description": "Try with https://datasets-server.huggingface.co/size?dataset=nyu-mll/glue&config=ax", "value": { "size": { "config": { - "dataset": "glue", + "dataset": "nyu-mll/glue", "config": "ax", "num_bytes_original_files": 222257, "num_bytes_parquet_files": 80767, @@ -5108,7 +5099,7 @@ }, "splits": [ { - "dataset": "glue", + "dataset": "nyu-mll/glue", "config": "ax", "split": "test", "num_bytes_parquet_files": 80767, @@ -5431,8 +5422,8 @@ "summary": "Descriptive statistics of a split's columns", "description": "Returns descriptive statistics, such as min, max, average, histogram, of the columns of a split.", "externalDocs": { - "description": "See statistics (Hub docs). The doc is still missing for the endpoint, see https://github.com/huggingface/datasets-server/issues/1664.", - "url": "https://huggingface.co/docs/datasets-server/" + "description": "See statistics (Hub docs).", + "url": "https://huggingface.co/docs/datasets-server/statistics" }, "operationId": "getStatistics", "security": [ @@ -5775,9 +5766,9 @@ "partial": false } }, - "A split (glue) with a string (text) column": { + "A split (nyu-mll/glue) with a string (text) column": { "summary": "Statistics on a string column. 
The column 'hypothesis' contains more than 30 different strings, so the statistics are a histogram of the string lengths.", - "description": "Try with https://datasets-server.huggingface.co/statistics?dataset=glue&config=ax&split=test.", + "description": "Try with https://datasets-server.huggingface.co/statistics?dataset=nyu-mll/glue&config=ax&split=test.", "value": { "num_examples": 1104, "statistics": [ @@ -5867,9 +5858,9 @@ "partial": false } }, - "A split (hellaswag) with a string (label) column": { + "A split (Rowan/hellaswag) with a string (label) column": { "summary": "Statistics on a string column. The column 'label' contains less than 30 different strings, so each string is considered as a label, and the statistics are a count per label.", - "description": "Try with https://datasets-server.huggingface.co/statistics?dataset=hellaswag&config=default&split=train.", + "description": "Try with https://datasets-server.huggingface.co/statistics?dataset=Rowan/hellaswag&config=default&split=train.", "value": { "num_examples": 39905, "statistics": [ diff --git a/docs/source/rows.md b/docs/source/rows.md index 21c55e07cb..63209cad22 100644 --- a/docs/source/rows.md +++ b/docs/source/rows.md @@ -21,7 +21,7 @@ or [ReDoc](https://redocly.github.io/redoc/?url=https://datasets-server.huggingf The `/rows` endpoint accepts five query parameters: -- `dataset`: the dataset name, for example `glue` or `mozilla-foundation/common_voice_10_0` +- `dataset`: the dataset name, for example `nyu-mll/glue` or `mozilla-foundation/common_voice_10_0` - `config`: the configuration name, for example `cola` - `split`: the split name, for example `train` - `offset`: the offset of the slice, for example `150` @@ -159,7 +159,7 @@ Image and audio are represented by a URL that points to the file. Images are represented as a JSON object with three fields: -- `src`: URL to the image file +- `src`: URL to the image file. It's a [signed URL](https://docs.aws.amazon.com/AmazonCloudFront/latest/DeveloperGuide/private-content-signed-urls.html) that expires after a certain time. 
- `height`: height (in pixels) of the image - `width`: width (in pixels) of the image @@ -177,7 +177,7 @@ Here is an example of image, from the first row of the cifar100 dataset: "row_idx": 0, "row": { "img": { - "src": "https://datasets-server.huggingface.co/cached-assets/cifar100/--/main/--/cifar100/train/0/img/image.jpg", + "src": "https://datasets-server.huggingface.co/cached-assets/cifar100/--/aadb3af77e9048adbea6b47c21a81e47dd092ae5/--/cifar100/train/0/img/image.jpg?Expires=1710283469&Signature=A1v0cG07nuaBxYbuPR5EUZpJ9Se072SBDr4935gEsOESHGVyeqvd3qmvdsy1fuqbHk0dnx~p6MLtQ-hg3aCBOJ8eIJ5ItIoyYT4riJRuPQC0VFUb~b1maEwU8LRoXXuvrSysSz2QhBbC~ofv6cQudm~~bgGxXWAslDs180KnmPDsMU55ySsKyKQYNEkQKyuYvrGIJbFeg4lEps0f5CEwUstAwRAwlk~mzRpzUDBq7nJ~DcujTlllLv36nJX~too8mMnFn6dCn2nfGOFYwUiyYM73Czv-laLhVaIVUzcuJum90No~KNGzfYeFZpPqktA7MjCzRLf1gz5kA7wBqnY-8Q__&Key-Pair-Id=K3EI6M078Z3AC3", "height": 32, "width": 32 }, diff --git a/docs/source/search.md b/docs/source/search.md index 08351dd355..e1da5c2972 100644 --- a/docs/source/search.md +++ b/docs/source/search.md @@ -15,7 +15,7 @@ The text is searched in the columns of type `string`, even if the values are nes The `/search` endpoint accepts five query parameters: -- `dataset`: the dataset name, for example `glue` or `mozilla-foundation/common_voice_10_0` +- `dataset`: the dataset name, for example `nyu-mll/glue` or `mozilla-foundation/common_voice_10_0` - `config`: the configuration name, for example `cola` - `split`: the split name, for example `train` - `query`: the text to search @@ -71,7 +71,7 @@ The endpoint response is a JSON containing two keys (same format as [`/rows`](./ The rows are ordered by the row index, and the text strings matching the query are not highlighted. -For example, here are the `features` and the slice 150-151 of matching `rows` of the `duorc`/`SelfRC` train split for the query `dog`: +For example, here are the `features` and the slice 150-151 of matching `rows` of the `ibm/duorc`/`SelfRC` train split for the query `dog`: ```json { diff --git a/docs/source/statistics.md b/docs/source/statistics.md index 1d1a58dd76..dbbe137c84 100644 --- a/docs/source/statistics.md +++ b/docs/source/statistics.md @@ -8,18 +8,18 @@ Datasets Server provides a `/statistics` endpoint for fetching some basic statis The `/statistics` endpoint requires three query parameters: -- `dataset`: the dataset name, for example `glue` +- `dataset`: the dataset name, for example `nyu-mll/glue` - `config`: the configuration name, for example `cola` - `split`: the split name, for example `train` -Let's get some stats for `glue` dataset, `cola` config, `train` split: +Let's get some stats for `nyu-mll/glue` dataset, `cola` config, `train` split: ```python import requests headers = {"Authorization": f"Bearer {API_TOKEN}"} -API_URL = "https://datasets-server.huggingface.co/statistics?dataset=glue&config=cola&split=train" +API_URL = "https://datasets-server.huggingface.co/statistics?dataset=nyu-mll/glue&config=cola&split=train" def query(): response = requests.get(API_URL, headers=headers) return response.json() @@ -31,7 +31,7 @@ data = query() import fetch from "node-fetch"; async function query(data) { const response = await fetch( - "https://datasets-server.huggingface.co/statistics?dataset=glue&config=cola&split=train", + "https://datasets-server.huggingface.co/statistics?dataset=nyu-mll/glue&config=cola&split=train", { headers: { Authorization: `Bearer ${API_TOKEN}` }, method: "GET" @@ -47,7 +47,7 @@ query().then((response) => { ```curl -curl 
https://datasets-server.huggingface.co/statistics?dataset=glue&config=cola&split=train \ +curl https://datasets-server.huggingface.co/statistics?dataset=nyu-mll/glue&config=cola&split=train \ -X GET \ -H "Authorization: Bearer ${API_TOKEN}" ``` @@ -425,4 +425,4 @@ If string column does not satisfy the conditions to be treated as a `string_labe ```

- \ No newline at end of file + diff --git a/docs/source/valid.md b/docs/source/valid.md index 6508fd87d0..09f8b66ea0 100644 --- a/docs/source/valid.md +++ b/docs/source/valid.md @@ -68,6 +68,7 @@ The response looks like this if a dataset is valid: "preview": true, "search": true, "filter": true, + "statistics": true, } ``` @@ -79,6 +80,7 @@ The response looks like this if a dataset is valid but /search is not available "preview": true, "search": false, "filter": true, + "statistics": true, } ``` @@ -90,6 +92,19 @@ The response looks like this if a dataset is valid but /filter is not available "preview": true, "search": true, "filter": false, + "statistics": true, +} +``` + +Similarly, if the statistics are not available: + +```json +{ + "viewer": true, + "preview": true, + "search": true, + "filter": true, + "statistics": false, } ``` @@ -101,6 +116,7 @@ If only the first rows of a dataset are available, then the response looks like: "preview": true, "search": true, "filter": true, + "statistics": true, } ``` @@ -112,6 +128,7 @@ Finally, if the dataset is not valid at all, then the response is: "preview": false, "search": false, "filter": false, + "statistics": false, } ``` diff --git a/e2e/pyproject.toml b/e2e/pyproject.toml index 251609f5a1..36503bd8b3 100644 --- a/e2e/pyproject.toml +++ b/e2e/pyproject.toml @@ -24,9 +24,6 @@ requires = ["poetry-core>=1.0.0"] [tool.pytest.ini_options] filterwarnings = ["ignore::DeprecationWarning"] -markers = [ - "wip: tests being developed" -] [tool.mypy] strict = true diff --git a/e2e/tests/test_12_splits.py b/e2e/tests/test_12_splits.py index 65a89002b1..f439b4b6bf 100644 --- a/e2e/tests/test_12_splits.py +++ b/e2e/tests/test_12_splits.py @@ -10,7 +10,7 @@ "status,name,dataset,config,error_code", [ # (200, "all splits in a dataset", "ibm/duorc", None, None), - # (200, "splits for a single config", "emotion", "unsplit", None) + # (200, "splits for a single config", "dair-ai/emotion", "unsplit", None) ( 401, "inexistent dataset, and not authenticated", diff --git a/front/admin_ui/app.py b/front/admin_ui/app.py index f1ff741f39..2fbb3adb11 100644 --- a/front/admin_ui/app.py +++ b/front/admin_ui/app.py @@ -398,13 +398,20 @@ def query_jobs(pending_jobs_query): processing_step.job_type for processing_step in processing_graph.get_topologically_ordered_processing_steps() ] + def on_change_refresh_job_type(job_type): return processing_graph.get_processing_step(job_type).difficulty refresh_type = gr.Dropdown( - job_types, multiselect=False, type="value", label="job type", value=job_types[0] + job_types, + multiselect=False, + type="value", + label="job type", + value=job_types[0], + ) + refresh_dataset_name = gr.Textbox( + label="dataset", placeholder="allenai/c4" ) - refresh_dataset_name = gr.Textbox(label="dataset", placeholder="c4") refresh_config_name = gr.Textbox( label="config (optional)", placeholder="en" ) @@ -415,8 +422,17 @@ def on_change_refresh_job_type(job_type): "*you can select multiple values by separating them with commas, e.g. 
split='train, test'*" ) - refresh_difficulty = gr.Slider(0, 100, processing_graph.get_processing_step(job_types[0]).difficulty, step=10, interactive=True, label="difficulty") - refresh_type.change(on_change_refresh_job_type, refresh_type, refresh_difficulty) + refresh_difficulty = gr.Slider( + 0, + 100, + processing_graph.get_processing_step(job_types[0]).difficulty, + step=10, + interactive=True, + label="difficulty", + ) + refresh_type.change( + on_change_refresh_job_type, refresh_type, refresh_difficulty + ) refresh_priority = gr.Dropdown( ["low", "normal", "high"], @@ -551,7 +567,7 @@ def delete_and_recreate_dataset( outputs=delete_and_recreate_dataset_output, ) with gr.Tab("Dataset status"): - dataset_name = gr.Textbox(label="dataset", placeholder="c4") + dataset_name = gr.Textbox(label="dataset", placeholder="allenai/c4") dataset_status_button = gr.Button("Get dataset status") gr.Markdown("### Pending jobs") jobs_table = gr.DataFrame() diff --git a/jobs/cache_maintenance/pyproject.toml b/jobs/cache_maintenance/pyproject.toml index c9f312670d..8c108be8cf 100644 --- a/jobs/cache_maintenance/pyproject.toml +++ b/jobs/cache_maintenance/pyproject.toml @@ -24,9 +24,6 @@ requires = ["poetry-core>=1.0.0"] [tool.pytest.ini_options] filterwarnings = ["ignore::DeprecationWarning"] -markers = [ - "wip: tests being developed" -] [tool.mypy] strict = true diff --git a/jobs/mongodb_migration/pyproject.toml b/jobs/mongodb_migration/pyproject.toml index ded39be0cd..b58e2087a5 100644 --- a/jobs/mongodb_migration/pyproject.toml +++ b/jobs/mongodb_migration/pyproject.toml @@ -23,9 +23,6 @@ requires = ["poetry-core>=1.0.0"] [tool.pytest.ini_options] filterwarnings = ["ignore::DeprecationWarning"] -markers = [ - "wip: tests being developed" -] [tool.mypy] strict = true diff --git a/libs/libcommon/pyproject.toml b/libs/libcommon/pyproject.toml index 633ea4962d..ee08fcb4d1 100644 --- a/libs/libcommon/pyproject.toml +++ b/libs/libcommon/pyproject.toml @@ -51,8 +51,7 @@ requires = ["poetry-core>=1.0.0"] [tool.pytest.ini_options] filterwarnings = ["ignore::DeprecationWarning"] markers = [ - "real_dataset: tests on the Hub", - "wip: tests being developed" + "real_dataset: tests on the Hub" ] [tool.mypy] diff --git a/libs/libcommon/tests/test_duckdb_utils.py b/libs/libcommon/tests/test_duckdb_utils.py index 3d8da718be..ec24bec601 100644 --- a/libs/libcommon/tests/test_duckdb_utils.py +++ b/libs/libcommon/tests/test_duckdb_utils.py @@ -3,11 +3,11 @@ def test_duckdb_index_is_partial() -> None: assert duckdb_index_is_partial( - "https://hf.co/datasets/c4/resolve/refs%2Fconvert%2Fparquet/en/partial-train/index.duckdb" + "https://hf.co/datasets/canonical/resolve/refs%2Fconvert%2Fduckdb/en/partial-train/index.duckdb" ) assert duckdb_index_is_partial( - "https://hf.co/datasets/bigcode/the-stack/resolve/refs%2Fconvert%2Fparquet/default/train/partial-index.duckdb" + "https://hf.co/datasets/organization/not-canonical/resolve/refs%2Fconvert%2Fduckdb/default/train/partial-index.duckdb" ) assert not duckdb_index_is_partial( - "https://hf.co/datasets/squad/resolve/refs%2Fconvert%2Fparquet/plain_text/train/index.duckdb" + "https://hf.co/datasets/rajpurkar/squad/resolve/refs%2Fconvert%2Fduckdb/plain_text/train/index.duckdb" ) diff --git a/libs/libcommon/tests/test_parquet_utils.py b/libs/libcommon/tests/test_parquet_utils.py index 141bd1ab89..21b5285b39 100644 --- a/libs/libcommon/tests/test_parquet_utils.py +++ b/libs/libcommon/tests/test_parquet_utils.py @@ -402,13 +402,13 @@ def indexer( def test_parquet_export_is_partial() 
-> None: assert parquet_export_is_partial( - "https://hf.co/datasets/c4/resolve/refs%2Fconvert%2Fparquet/en/partial-train/0000.parquet" + "https://hf.co/datasets/canonical/resolve/refs%2Fconvert%2Fparquet/en/partial-train/0000.parquet" ) assert not parquet_export_is_partial( - "https://hf.co/datasets/bigcode/the-stack/resolve/refs%2Fconvert%2Fparquet/default/train/0000.parquet" + "https://hf.co/datasets/organization/not-canonical/resolve/refs%2Fconvert%2Fparquet/default/train/0000.parquet" ) assert not parquet_export_is_partial( - "https://hf.co/datasets/squad/resolve/refs%2Fconvert%2Fparquet/plain_text/train/0000.parquet" + "https://hf.co/datasets/rajpurkar/squad/resolve/refs%2Fconvert%2Fparquet/plain_text/train/0000.parquet" ) @@ -494,9 +494,12 @@ def test_indexer_schema_mistmatch_error( @pytest.mark.parametrize( "parquet_url,expected", [ - ("https://hf.co/datasets/squad/resolve/refs%2Fconvert%2Fparquet/plain_text/train/0000.parquet", "train"), ( - "https://hf.co/datasets/squad/resolve/refs%2Fconvert%2Fparquet/plain_text/partial-test/0000.parquet", + "https://hf.co/datasets/rajpurkar/squad/resolve/refs%2Fconvert%2Fparquet/plain_text/train/0000.parquet", + "train", + ), + ( + "https://hf.co/datasets/rajpurkar/squad/resolve/refs%2Fconvert%2Fparquet/plain_text/partial-test/0000.parquet", "partial-test", ), ], diff --git a/services/admin/pyproject.toml b/services/admin/pyproject.toml index 03d82fbe3a..4a3289e520 100644 --- a/services/admin/pyproject.toml +++ b/services/admin/pyproject.toml @@ -32,8 +32,7 @@ requires = ["poetry-core>=1.0.0"] [tool.pytest.ini_options] filterwarnings = ["ignore::DeprecationWarning"] markers = [ - "real_dataset: tests on the Hub", - "wip: tests being developed" + "real_dataset: tests on the Hub" ] [tool.mypy] diff --git a/services/admin/tests/test_app_real.py b/services/admin/tests/test_app_real.py index da2cf2ae46..649293f21a 100644 --- a/services/admin/tests/test_app_real.py +++ b/services/admin/tests/test_app_real.py @@ -43,7 +43,7 @@ def test_force_refresh( real_app_config: AppConfig, real_client: TestClient, ) -> None: - dataset = "glue" + dataset = "nyu-mll/glue" first_step = processing_graph.get_processing_steps(order="topological")[0] path = first_step.job_type response = real_client.request("post", f"/force-refresh/{path}?dataset={dataset}") diff --git a/services/api/pyproject.toml b/services/api/pyproject.toml index a0042e839f..e0d12592a5 100644 --- a/services/api/pyproject.toml +++ b/services/api/pyproject.toml @@ -31,8 +31,7 @@ requires = ["poetry-core>=1.0.0"] [tool.pytest.ini_options] filterwarnings = ["ignore::DeprecationWarning"] markers = [ - "real_dataset: tests on the Hub", - "wip: tests being developed" + "real_dataset: tests on the Hub" ] [tool.mypy] diff --git a/services/api/tests/test_app_real.py b/services/api/tests/test_app_real.py index 49bf96d257..658bc51700 100644 --- a/services/api/tests/test_app_real.py +++ b/services/api/tests/test_app_real.py @@ -46,7 +46,7 @@ def test_webhook_untrusted( ) -> None: payload = { "event": "add", - "repo": {"type": "dataset", "name": "glue", "gitalyUid": "123", "headSha": "revision"}, + "repo": {"type": "dataset", "name": "nyu-mll/glue", "gitalyUid": "123", "headSha": "revision"}, "scope": "repo", } response = real_client.post("/webhook", json=payload) @@ -57,7 +57,7 @@ def test_webhook_untrusted( def test_webhook_trusted(real_client: TestClient) -> None: payload = { "event": "add", - "repo": {"type": "dataset", "name": "glue", "gitalyUid": "123", "headSha": "revision"}, + "repo": {"type": "dataset", 
"name": "nyu-mll/glue", "gitalyUid": "123", "headSha": "revision"}, "scope": "repo", } response = real_client.post("/webhook", json=payload, headers={"x-webhook-secret": API_HF_WEBHOOK_SECRET}) diff --git a/services/rows/pyproject.toml b/services/rows/pyproject.toml index e5871f7949..f44936e9c3 100644 --- a/services/rows/pyproject.toml +++ b/services/rows/pyproject.toml @@ -32,10 +32,6 @@ requires = ["poetry-core>=1.0.0"] [tool.pytest.ini_options] filterwarnings = ["ignore::DeprecationWarning"] -markers = [ - "real_dataset: tests on the Hub", - "wip: tests being developed" -] [tool.mypy] strict = true diff --git a/services/search/pyproject.toml b/services/search/pyproject.toml index 5c2fbd003b..8c7146b58e 100644 --- a/services/search/pyproject.toml +++ b/services/search/pyproject.toml @@ -31,10 +31,6 @@ requires = ["poetry-core>=1.0.0"] [tool.pytest.ini_options] filterwarnings = ["ignore::DeprecationWarning"] -markers = [ - "real_dataset: tests on the Hub", - "wip: tests being developed" -] [tool.mypy] strict = true diff --git a/services/worker/pyproject.toml b/services/worker/pyproject.toml index 6f80c68cc5..6f67f13cee 100644 --- a/services/worker/pyproject.toml +++ b/services/worker/pyproject.toml @@ -72,8 +72,7 @@ requires = ["poetry-core>=1.0.0"] [tool.pytest.ini_options] filterwarnings = ["ignore::DeprecationWarning"] markers = [ - "real_dataset: tests on the Hub", - "wip: tests being developed" + "real_dataset: tests on the Hub" ] [tool.mypy] diff --git a/services/worker/src/worker/job_runners/config/parquet_and_info.py b/services/worker/src/worker/job_runners/config/parquet_and_info.py index e02e935fe9..6b1f2bf951 100644 --- a/services/worker/src/worker/job_runners/config/parquet_and_info.py +++ b/services/worker/src/worker/job_runners/config/parquet_and_info.py @@ -751,7 +751,7 @@ class limit_parquet_writes: Example of usage: ```python - builder = load_dataset_builder("squad") + builder = load_dataset_builder("rajpurkar/squad") max_dataset_size_bytes = 10_000_000 with limit_parquet_writes(builder, max_dataset_size_bytes=max_dataset_size_bytes) as limiter: builder.download_and_prepare(file_format="parquet") @@ -762,7 +762,7 @@ class limit_parquet_writes: the full dataset: ```python - builder = load_dataset_builder("squad") + builder = load_dataset_builder("rajpurkar/squad") max_dataset_size_bytes = 10_000_000 dl_manager = StreamingDownloadManager(...) 
for split_generator in builder._split_generators(dl_manager): diff --git a/services/worker/tests/job_runners/config/test_parquet_and_info.py b/services/worker/tests/job_runners/config/test_parquet_and_info.py index 02cca837c3..55c183be8f 100644 --- a/services/worker/tests/job_runners/config/test_parquet_and_info.py +++ b/services/worker/tests/job_runners/config/test_parquet_and_info.py @@ -855,12 +855,12 @@ def test_resolve_trust_remote_code() -> None: resolve_trust_remote_code("lhoestq/demo1", allow_list=["{{ALL_DATASETS_WITH_NO_NAMESPACE}}", "lhoestq/d*"]) is True ) - assert resolve_trust_remote_code("squad", allow_list=[]) is False - assert resolve_trust_remote_code("squad", allow_list=["{{ALL_DATASETS_WITH_NO_NAMESPACE}}"]) is True - assert resolve_trust_remote_code("squad", allow_list=["{{ALL_DATASETS_WITH_NO_NAMESPACE}}", "lhoestq/s*"]) is True + assert resolve_trust_remote_code("mnist", allow_list=[]) is False + assert resolve_trust_remote_code("mnist", allow_list=["{{ALL_DATASETS_WITH_NO_NAMESPACE}}"]) is True + assert resolve_trust_remote_code("mnist", allow_list=["{{ALL_DATASETS_WITH_NO_NAMESPACE}}", "lhoestq/s*"]) is True assert ( resolve_trust_remote_code( - "lhoestq/custom_squad", allow_list=["{{ALL_DATASETS_WITH_NO_NAMESPACE}}", "lhoestq/d*"] + "lhoestq/custom_mnist", allow_list=["{{ALL_DATASETS_WITH_NO_NAMESPACE}}", "lhoestq/d*"] ) is False )
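To make the allow-list semantics exercised by these assertions easier to follow, here is a small illustrative sketch. It is not the worker's actual `resolve_trust_remote_code` implementation; the handling of the `{{ALL_DATASETS_WITH_NO_NAMESPACE}}` sentinel and the glob matching via `fnmatch` are assumptions chosen only to reproduce the behaviour the test expects.

```python
from fnmatch import fnmatch

NO_NAMESPACE_SENTINEL = "{{ALL_DATASETS_WITH_NO_NAMESPACE}}"

def is_dataset_allowed(dataset: str, allow_list: list[str]) -> bool:
    """Hypothetical allow-list check: a dataset is allowed if any entry matches.

    - the sentinel entry matches every dataset without a namespace (no "/")
    - any other entry is treated as a glob pattern on the full dataset name
    """
    for entry in allow_list:
        if entry == NO_NAMESPACE_SENTINEL:
            if "/" not in dataset:
                return True
        elif fnmatch(dataset, entry):
            return True
    return False

# Mirrors the assertions in the test above (hypothetical helper, not the real API):
assert is_dataset_allowed("mnist", allow_list=[]) is False
assert is_dataset_allowed("mnist", allow_list=[NO_NAMESPACE_SENTINEL]) is True
assert is_dataset_allowed("lhoestq/demo1", allow_list=[NO_NAMESPACE_SENTINEL, "lhoestq/d*"]) is True
assert is_dataset_allowed("lhoestq/custom_mnist", allow_list=[NO_NAMESPACE_SENTINEL, "lhoestq/d*"]) is False
```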