doc: Update inaccessible datasets (#3123)
* Update inaccessible datasets

* Update croissant doc
AndreaFrancis authored Dec 19, 2024
1 parent 5e9371a commit 0d342b8
Showing 8 changed files with 178 additions and 111 deletions.
42 changes: 36 additions & 6 deletions docs/source/analyze_data.md
@@ -8,25 +8,55 @@ To demonstrate, this guide will show you an end-to-end example of how to retriev

## Get a dataset

The [Hub](https://huggingface.co/datasets) is home to more than 100,000 datasets across a wide variety of tasks, sizes, and languages. For this example, you'll use the [`codeparrot/codecomplex`](https://huggingface.co/datasets/codeparrot/codecomplex) dataset, but feel free to explore and find another dataset that interests you! The dataset contains Java code from programming competitions, and the time complexity of the code is labeled by a group of algorithm experts.
The [Hub](https://huggingface.co/datasets) is home to more than 200,000 datasets across a wide variety of tasks, sizes, and languages. For this example, you'll use the [`codeparrot/codecomplex`](https://huggingface.co/datasets/codeparrot/codecomplex) dataset, but feel free to explore and find another dataset that interests you! The dataset contains Java code from programming competitions, and the time complexity of the code is labeled by a group of algorithm experts.

Let's say you're interested in the average length of the submitted code as it relates to the time complexity. Here's how you can get started.

Use the `/parquet` endpoint to convert the dataset to a Parquet file and return the URL to it:

```py
<inferencesnippet>
<python>
```python
import requests
API_URL = "https://datasets-server.huggingface.co/parquet?dataset=codeparrot/codecomplex"
def query():
    response = requests.get(API_URL)
    return response.json()
data = query()
print(data)
{'parquet_files':
```
</python>
<js>
```js
import fetch from "node-fetch";
async function query(data) {
    const response = await fetch(
        "https://datasets-server.huggingface.co/parquet?dataset=codeparrot/codecomplex",
        {
            method: "GET"
        }
    );
    const result = await response.json();
    return result;
}
query().then((response) => {
    console.log(JSON.stringify(response));
});
```
</js>
<curl>
```curl
curl https://datasets-server.huggingface.co/parquet?dataset=codeparrot/codecomplex \
-X GET
```
</curl>
</inferencesnippet>

```json
{"parquet_files":
[
{'dataset': 'codeparrot/codecomplex', 'config': 'default', 'split': 'train', 'url': 'https://huggingface.co/datasets/codeparrot/codecomplex/resolve/refs%2Fconvert%2Fparquet/default/train/0000.parquet', 'filename': '0000.parquet', 'size': 4115908}
{"dataset": "codeparrot/codecomplex", "config": "default", "split": "train", "url": "https://huggingface.co/datasets/codeparrot/codecomplex/resolve/refs%2Fconvert%2Fparquet/default/train/0000.parquet", "filename": "0000.parquet", "size": 4115908}
],
'pending': [], 'failed': [], 'partial: false
"pending": [], "failed": [], "partial": false
}
```
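
From here, one minimal sketch of the original question, average code length per time-complexity label, could look like this (assuming the Parquet file exposes `src` and `complexity` columns, and that `pandas` and `pyarrow` are installed):

```py
import pandas as pd

# URL returned by the /parquet endpoint above.
url = "https://huggingface.co/datasets/codeparrot/codecomplex/resolve/refs%2Fconvert%2Fparquet/default/train/0000.parquet"

df = pd.read_parquet(url)
# Column names are assumptions: `src` holds the Java submission, `complexity` the expert label.
df["length"] = df["src"].str.len()
print(df.groupby("complexity")["length"].mean().sort_values(ascending=False))
```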

33 changes: 17 additions & 16 deletions docs/source/clickhouse.md
@@ -97,17 +97,17 @@ Remember to set `enable_url_encoding` to 0 and `max_http_get_redirects` to 1 to
SET max_http_get_redirects = 1, enable_url_encoding = 0
```

Let's create a function to return a list of Parquet files from the [`barilan/blog_authorship_corpus`](https://huggingface.co/datasets/barilan/blog_authorship_corpus):
Let's create a function to return a list of Parquet files from the [`tasksource/blog_authorship_corpus`](https://huggingface.co/datasets/tasksource/blog_authorship_corpus):

```bash
CREATE OR REPLACE FUNCTION hugging_paths AS dataset -> (
SELECT arrayMap(x -> (x.1), JSONExtract(json, 'parquet_files', 'Array(Tuple(url String))'))
FROM url('https://datasets-server.huggingface.co/parquet?dataset=' || dataset, 'JSONAsString')
);

SELECT hugging_paths('barilan/blog_authorship_corpus') AS paths
SELECT hugging_paths('tasksource/blog_authorship_corpus') AS paths

['https://huggingface.co/datasets/barilan/blog_authorship_corpus/resolve/refs%2Fconvert%2Fparquet/blog_authorship_corpus/train/0000.parquet','https://huggingface.co/datasets/barilan/blog_authorship_corpus/resolve/refs%2Fconvert%2Fparquet/blog_authorship_corpus/train/0001.parquet','https://huggingface.co/datasets/barilan/blog_authorship_corpus/resolve/refs%2Fconvert%2Fparquet/blog_authorship_corpus/validation/0000.parquet']
['https://huggingface.co/datasets/tasksource/blog_authorship_corpus/resolve/refs%2Fconvert%2Fparquet/default/train/0000.parquet','https://huggingface.co/datasets/tasksource/blog_authorship_corpus/resolve/refs%2Fconvert%2Fparquet/default/train/0001.parquet']
```

You can make this even easier by creating another function that calls `hugging_paths` and outputs all the files based on the dataset name:
@@ -118,26 +118,27 @@ CREATE OR REPLACE FUNCTION hf AS dataset -> (
SELECT multiIf(length(urls) = 0, '', length(urls) = 1, urls[1], 'https://huggingface.co/datasets/{' || arrayStringConcat(arrayMap(x -> replaceRegexpOne(replaceOne(x, 'https://huggingface.co/datasets/', ''), '\\.parquet$', ''), urls), ',') || '}.parquet')
);

SELECT hf('barilan/blog_authorship_corpus') AS pattern
SELECT hf('tasksource/blog_authorship_corpus') AS pattern

['https://huggingface.co/datasets/{blog_authorship_corpus/resolve/refs%2Fconvert%2Fparquet/barilan/blog_authorship_corpus/blog_authorship_corpus-train-00000-of-00002,barilan/blog_authorship_corpus/resolve/refs%2Fconvert%2Fparquet/blog_authorship_corpus/blog_authorship_corpus-train-00001-of-00002,barilan/blog_authorship_corpus/resolve/refs%2Fconvert%2Fparquet/blog_authorship_corpus/blog_authorship_corpus-validation}.parquet']
https://huggingface.co/datasets/{tasksource/blog_authorship_corpus/resolve/refs%2Fconvert%2Fparquet/default/train/0000,tasksource/blog_authorship_corpus/resolve/refs%2Fconvert%2Fparquet/default/train/0001}.parquet
```
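
If the brace-expansion pattern is hard to parse in SQL, here is a rough Python sketch of the same logic, calling the `/parquet` endpoint directly (illustrative only; the `hf_pattern` helper is not part of the ClickHouse workflow):

```py
import requests

def hf_pattern(dataset: str) -> str:
    # Same idea as the `hf` SQL function: list the Parquet files, then build a brace-expansion URL.
    # Uses str.removeprefix/removesuffix, so Python 3.9+ is assumed.
    response = requests.get("https://datasets-server.huggingface.co/parquet", params={"dataset": dataset})
    urls = [f["url"] for f in response.json()["parquet_files"]]
    if not urls:
        return ""
    if len(urls) == 1:
        return urls[0]
    prefix = "https://huggingface.co/datasets/"
    stems = [u.removeprefix(prefix).removesuffix(".parquet") for u in urls]
    return prefix + "{" + ",".join(stems) + "}.parquet"

print(hf_pattern("tasksource/blog_authorship_corpus"))
```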

Now use the `hf` function to query any dataset by passing the dataset name:

```bash
SELECT horoscope, count(*), AVG(LENGTH(text)) AS avg_blog_length
FROM url(hf('barilan/blog_authorship_corpus'))
GROUP BY horoscope
SELECT sign, count(*), AVG(LENGTH(text)) AS avg_blog_length
FROM url(hf('tasksource/blog_authorship_corpus'))
GROUP BY sign
ORDER BY avg_blog_length
DESC LIMIT(5)

┌─────────────┬───────┬────────────────────┐
│ Aquarius    │ 51747 │ 1132.487873693161  │
├─────────────┼───────┼────────────────────┤
│ Cancer      │ 66944 │ 1111.613109464627  │
│ Libra       │ 63994 │ 1060.3968184517298 │
│ Sagittarius │ 52753 │ 1055.7120732470191 │
│ Capricorn   │ 52207 │ 1055.4147719654452 │
└─────────────┴───────┴────────────────────┘
┌───────────┬────────┬────────────────────┐
│ sign      │ count  │ avg_blog_length    │
├───────────┼────────┼────────────────────┤
│ Aquarius  │ 49687  │ 1193.9523819107615 │
│ Leo       │ 53811  │ 1186.0665291483153 │
│ Cancer    │ 65048  │ 1160.8010392325666 │
│ Gemini    │ 51985  │ 1158.4132922958545 │
│ Virgo     │ 60399  │ 1142.9977648636566 │
└───────────┴────────┴────────────────────┘
```
6 changes: 3 additions & 3 deletions docs/source/cudf.md
@@ -8,8 +8,8 @@ To read from a single Parquet file, use the [`read_parquet`](https://docs.rapids
import cudf

df = (
cudf.read_parquet("https://huggingface.co/datasets/barilan/blog_authorship_corpus/resolve/refs%2Fconvert%2Fparquet/blog_authorship_corpus/train/0000.parquet")
.groupby('horoscope')['text']
cudf.read_parquet("https://huggingface.co/datasets/tasksource/blog_authorship_corpus/resolve/refs%2Fconvert%2Fparquet/default/train/0000.parquet")
.groupby('sign')['text']
.apply(lambda x: x.str.len().mean())
.sort_values(ascending=False)
.head(5)
@@ -25,6 +25,6 @@ import dask.dataframe as dd
dask.config.set({"dataframe.backend": "cudf"})

df = (
dd.read_parquet("https://huggingface.co/datasets/barilan/blog_authorship_corpus/resolve/refs%2Fconvert%2Fparquet/blog_authorship_corpus/train/*.parquet")
dd.read_parquet("https://huggingface.co/datasets/tasksource/blog_authorship_corpus/resolve/refs%2Fconvert%2Fparquet/default/train/*.parquet")
)
```
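
Dask builds the computation lazily, so nothing is read until you ask for a result. As a small sketch (assuming the same `sign` and `text` columns as in the cuDF example above), you could reproduce the top-five average blog length like this:

```py
# Average text length per sign; compute() triggers the actual GPU-backed read.
df = df.assign(length=df["text"].str.len())
avg_blog_length = df.groupby("sign")["length"].mean()
print(avg_blog_length.nlargest(5).compute())
```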
48 changes: 26 additions & 22 deletions docs/source/duckdb.md
@@ -7,7 +7,7 @@
```py
import duckdb

url = "https://huggingface.co/datasets/barilan/blog_authorship_corpus/resolve/refs%2Fconvert%2Fparquet/blog_authorship_corpus/train/0000.parquet"
url = "https://huggingface.co/datasets/tasksource/blog_authorship_corpus/resolve/refs%2Fconvert%2Fparquet/default/train/0000.parquet"

con = duckdb.connect()
con.execute("INSTALL httpfs;")
@@ -22,7 +22,7 @@ var con = db.connect();
con.exec('INSTALL httpfs');
con.exec('LOAD httpfs');

const url = "https://huggingface.co/datasets/barilan/blog_authorship_corpus/resolve/refs%2Fconvert%2Fparquet/blog_authorship_corpus/train/0000.parquet"
const url = "https://huggingface.co/datasets/tasksource/blog_authorship_corpus/resolve/refs%2Fconvert%2Fparquet/default/train/0000.parquet"
```
</js>
</inferencesnippet>
@@ -32,22 +32,22 @@ Now you can write and execute your SQL query on the Parquet file:
<inferencesnippet>
<python>
```py
con.sql(f"SELECT horoscope, count(*), AVG(LENGTH(text)) AS avg_blog_length FROM '{url}' GROUP BY horoscope ORDER BY avg_blog_length DESC LIMIT(5)")
con.sql(f"SELECT sign, count(*), AVG(LENGTH(text)) AS avg_blog_length FROM '{url}' GROUP BY sign ORDER BY avg_blog_length DESC LIMIT(5)")
┌───────────┬──────────────┬────────────────────┐
horoscope │ count_star() │ avg_blog_length │
sign │ count_star() │ avg_blog_length │
│ varchar │ int64 │ double │
├───────────┼──────────────┼────────────────────┤
Aquarius 34062 1129.218836239798
Cancer41509 1098.366812016671
Capricorn 339611073.2002002296751
Libra403021072.0718326633914
Leo 405871064.0536871412028
Cancer 389561206.5212034089743
Leo 354871180.0673767858652
Aquarius 327231152.1136815084192
Virgo361891117.1982094006466
Capricorn 31825 1102.397360565593
└───────────┴──────────────┴────────────────────┘
```
</python>
<js>
```js
con.all(`SELECT horoscope, count(*), AVG(LENGTH(text)) AS avg_blog_length FROM '${url}' GROUP BY horoscope ORDER BY avg_blog_length DESC LIMIT(5)`, function(err, res) {
con.all(`SELECT sign, count(*), AVG(LENGTH(text)) AS avg_blog_length FROM '${url}' GROUP BY sign ORDER BY avg_blog_length DESC LIMIT(5)`, function(err, res) {
    if (err) {
        throw err;
    }
@@ -62,22 +62,26 @@ To query multiple files - for example, if the dataset is sharded:
<inferencesnippet>
<python>
```py
con.sql(f"SELECT horoscope, count(*), AVG(LENGTH(text)) AS avg_blog_length FROM read_parquet({urls[:2]}) GROUP BY horoscope ORDER BY avg_blog_length DESC LIMIT(5)")
┌─────────────┬──────────────┬────────────────────┐
│ horoscope │ count_star() │ avg_blog_length │
│ varchar │ int64 │ double │
├─────────────┼──────────────┼────────────────────┤
│ Aquarius │ 495681125.8306770497095
│ Cancer │ 635121097.95608703867
│ Libra │ 603041060.6110539931017
│ Capricorn │ 494021059.5552609206104
│ Sagittarius │ 504311057.4589835616982
└─────────────┴──────────────┴────────────────────┘
urls = ["https://huggingface.co/datasets/tasksource/blog_authorship_corpus/resolve/refs%2Fconvert%2Fparquet/default/train/0000.parquet", "https://huggingface.co/datasets/tasksource/blog_authorship_corpus/resolve/refs%2Fconvert%2Fparquet/default/train/0001.parquet"]

con.sql(f"SELECT sign, count(*), AVG(LENGTH(text)) AS avg_blog_length FROM read_parquet({urls}) GROUP BY sign ORDER BY avg_blog_length DESC LIMIT(5)")
┌──────────┬──────────────┬────────────────────┐
│ sign │ count_star() │ avg_blog_length │
│ varchar │ int64 │ double │
├──────────┼──────────────┼────────────────────┤
│ Aquarius │ 496871191.417211745527
│ Leo │ 538111183.8782219248853
│ Cancer │ 650481158.9691612347804
│ Gemini │ 519851156.0693084543618
│ Virgo │ 603991140.9584430205798
└──────────┴──────────────┴────────────────────┘
```
</python>
<js>
```js
con.all(`SELECT horoscope, count(*), AVG(LENGTH(text)) AS avg_blog_length FROM read_parquet(${JSON.stringify(urls)}) GROUP BY horoscope ORDER BY avg_blog_length DESC LIMIT(5)`, function(err, res) {
const urls = ["https://huggingface.co/datasets/tasksource/blog_authorship_corpus/resolve/refs%2Fconvert%2Fparquet/default/train/0000.parquet", "https://huggingface.co/datasets/tasksource/blog_authorship_corpus/resolve/refs%2Fconvert%2Fparquet/default/train/0001.parquet"];

con.all(`SELECT sign, count(*), AVG(LENGTH(text)) AS avg_blog_length FROM read_parquet(${JSON.stringify(urls)}) GROUP BY sign ORDER BY avg_blog_length DESC LIMIT(5)`, function(err, res) {
    if (err) {
        throw err;
    }
69 changes: 48 additions & 21 deletions docs/source/first_rows.md
@@ -2,8 +2,6 @@

The dataset viewer provides a `/first-rows` endpoint for visualizing the first 100 rows of a dataset. This'll give you a good idea of the data types and example data contained in a dataset.

![dataset-viewer](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/dataset-viewer.png)

This guide shows you how to use the dataset viewer's `/first-rows` endpoint to preview a dataset. Feel free to also try it out with [Postman](https://www.postman.com/huggingface/workspace/hugging-face-apis/request/23242779-32d6a8be-b800-446a-8cee-f6b5ca1710df), [RapidAPI](https://rapidapi.com/hugging-face-hugging-face-default/api/hugging-face-datasets-api), or [ReDoc](https://redocly.github.io/redoc/?url=https://datasets-server.huggingface.co/openapi.json#operation/listFirstRows).
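
As a quick sketch of what a call looks like (reusing the `codeparrot/codecomplex` dataset from the analytics guide; any public dataset works), you could query the endpoint with `requests`:

```py
import requests

API_URL = "https://datasets-server.huggingface.co/first-rows"
# Illustrative arguments; the query parameters are documented below.
params = {"dataset": "codeparrot/codecomplex", "config": "default", "split": "train"}
data = requests.get(API_URL, params=params).json()
print(data["features"])   # column names and types
print(len(data["rows"]))  # up to 100 rows
```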

The `/first-rows` endpoint accepts three query parameters:
@@ -145,31 +143,60 @@ For some datasets, the response size from `/first-rows` may exceed 1MB, in which

In some cases, if even the first few rows generate a response that exceeds 1MB, some of the columns are truncated and converted to a string. You'll see these listed in the `truncated_cells` field.
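
Continuing the sketch above, you could flag which columns were clipped by checking `truncated_cells` in each returned row (here `data` is the decoded `/first-rows` response):

```py
# Report the truncated columns, if any, for each of the returned rows.
for row in data["rows"]:
    if row["truncated_cells"]:
        print(row["row_idx"], row["truncated_cells"])
```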

For example, the [`ETDataset/ett`](https://datasets-server.huggingface.co/first-rows?dataset=ETDataset/ett&config=m2&split=test) dataset only returns 10 rows, and the `target` and `feat_dynamic_real` columns are truncated:
For example, the [`GEM/SciDuet`](https://datasets-server.huggingface.co/first-rows?dataset=GEM/SciDuet&config=default&split=train) dataset only returns 10 rows, and the `paper_abstract`, `paper_content`, `paper_headers`, `slide_content_text` and `target` columns are truncated:

```json
...
"rows": [
{
"row_idx": 0,
"row": {
"start": "2016-07-01T00:00:00",
"target": "[38.6619987487793,38.222999572753906,37.34400177001953,37.124000549316406,37.124000549316406,36.9039",
"feat_static_cat": [0],
"feat_dynamic_real": "[[41.130001068115234,39.62200164794922,38.86800003051758,35.518001556396484,37.52799987792969,37.611",
"item_id": "OT"
},
"truncated_cells": ["target", "feat_dynamic_real"]
},
{
"row_idx": 1,
"row": {
"start": "2016-07-01T00:00:00",
"target": "[38.6619987487793,38.222999572753906,37.34400177001953,37.124000549316406,37.124000549316406,36.9039",
"feat_static_cat": [0],
"feat_dynamic_real": "[[41.130001068115234,39.62200164794922,38.86800003051758,35.518001556396484,37.52799987792969,37.611",
"item_id": "OT"
{
"row_idx":8,
"row":{
"gem_id":"GEM-SciDuet-train-1#paper-954#slide-8",
"paper_id":"954",
"paper_title":"Incremental Syntactic Language Models for Phrase-based Translation",
"paper_abstract":"\"This paper describes a novel technique for incorporating syntactic knowledge into phrasebased machi",
"paper_content":"{\"paper_content_id\":[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29",
"paper_headers":"{\"paper_header_number\":[\"1\",\"2\",\"3\",\"3.1\",\"3.3\",\"4\",\"4.1\",\"6\",\"7\"],\"paper_header_content\":[\"Introduc",
"slide_id":"GEM-SciDuet-train-1#paper-954#slide-8",
"slide_title":"Does an Incremental Syntactic LM Help Translation",
"slide_content_text":"\"but will it make my BLEU score go up?\\nMotivation Syntactic LM Decoder Integration Questions?\\nMose",
"target":"\"but will it make my BLEU score go up?\\nMotivation Syntactic LM Decoder Integration Questions?\\nMose",
"references":[]
},
"truncated_cells":[
"paper_abstract",
"paper_content",
"paper_headers",
"slide_content_text",
"target"
]
},
{
"row_idx":9,
"row":{
"gem_id":"GEM-SciDuet-train-1#paper-954#slide-9",
"paper_id":"954",
"paper_title":"Incremental Syntactic Language Models for Phrase-based Translation",
"paper_abstract":"\"This paper describes a novel technique for incorporating syntactic knowledge into phrasebased machi",
"paper_content":"{\"paper_content_id\":[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29",
"paper_headers":"{\"paper_header_number\":[\"1\",\"2\",\"3\",\"3.1\",\"3.3\",\"4\",\"4.1\",\"6\",\"7\"],\"paper_header_content\":[\"Introduc",
"slide_id":"GEM-SciDuet-train-1#paper-954#slide-9",
"slide_title":"Perplexity Results",
"slide_content_text":"\"Language models trained on WSJ Treebank corpus\\nMotivation Syntactic LM Decoder Integration Questio",
"target":"\"Language models trained on WSJ Treebank corpus\\nMotivation Syntactic LM Decoder Integration Questio",
"references":[

]
},
"truncated_cells":[
"paper_abstract",
"paper_content",
"paper_headers",
"slide_content_text",
"target"
]
}
"truncated_cells": ["target", "feat_dynamic_real"]
},
...