From 6df1f15081512f8b26e8e9c727ab2a215d4b3ad2 Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Wed, 15 May 2024 17:12:15 -0400 Subject: [PATCH 01/22] Draft duckdb cli small guides --- docs/source/_toctree.yml | 18 ++++++++++++++++++ docs/source/duckdb_cli.md | 29 +++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+) create mode 100644 docs/source/duckdb_cli.md diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index 9b59c61817..def55db2c7 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -38,6 +38,24 @@ title: ClickHouse - local: duckdb title: DuckDB + sections: + - local: duckdb_cli + title: DuckDB CLI + sections: + - local: duckdb_cli_auth + title: Authenticate + - local: duckdb_cli_select + title: Query datasets + - local: duckdb_cli_sql_operations + title: Perform SQL operations + - local: duckdb_cli_process + title: Process datasets + - local: duckdb_cli_vector_similarity_search + title: Perform vector similarity search + - local: duckdb_cli_export + title: Export to other formats + - local: duckdb_cli_fts + title: Implement full-text search - local: pandas title: Pandas - local: polars diff --git a/docs/source/duckdb_cli.md b/docs/source/duckdb_cli.md new file mode 100644 index 0000000000..afe106a26c --- /dev/null +++ b/docs/source/duckdb_cli.md @@ -0,0 +1,29 @@ +# DuckDB + +The [DuckDB CLI](https://duckdb.org/docs/api/cli/overview.html) (Command Line Interface) is a single, dependency-free executable. + + + +Starting from version `v0.10.3-dev1012`, the DuckDB CLI includes native support for accessing datasets on Hugging Face via URLs. Here are some features you can leverage with this powerful tool: + +- Query public, gated and private datasets +- Analyze datasets and perform SQL operations +- Process and transform datasets +- Conduct vector similarity search on embedding datasets +- Export datasets to other formats +- Implement full-text search on datasets +- And more! 
For a complete list of DuckDB features, visit the DuckDB documentation. + +Let's start with a quick demo to query the full rows of a dataset under the `refs/convert/parquet` revision: + +```bash +FROM 'hf://datasets/ibm/duorc@~parquet/**/*.parquet'; +``` + +#TODO: Put an image of the output? + +In the following sections, we will cover more complex operations you can perform with DuckDB on Hugging Face datasets. From f61ecd45d0d9d93c54b7370b723aa133191389d5 Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Wed, 15 May 2024 17:41:35 -0400 Subject: [PATCH 02/22] Adding duckdb_cli_auth --- docs/source/_toctree.yml | 2 +- docs/source/duckdb_cli.md | 33 ++++++++++++++++++++++++++++----- docs/source/duckdb_cli_auth.md | 22 ++++++++++++++++++++++ 3 files changed, 51 insertions(+), 6 deletions(-) create mode 100644 docs/source/duckdb_cli_auth.md diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index def55db2c7..cc60b610cd 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -43,7 +43,7 @@ title: DuckDB CLI sections: - local: duckdb_cli_auth - title: Authenticate + title: Authentication for private and gated datasets - local: duckdb_cli_select title: Query datasets - local: duckdb_cli_sql_operations diff --git a/docs/source/duckdb_cli.md b/docs/source/duckdb_cli.md index afe106a26c..8d37ce6bab 100644 --- a/docs/source/duckdb_cli.md +++ b/docs/source/duckdb_cli.md @@ -1,12 +1,12 @@ -# DuckDB +# DuckDB CLI The [DuckDB CLI](https://duckdb.org/docs/api/cli/overview.html) (Command Line Interface) is a single, dependency-free executable. - + Starting from version `v0.10.3-dev1012`, the DuckDB CLI includes native support for accessing datasets on Hugging Face via URLs. Here are some features you can leverage with this powerful tool: @@ -18,12 +18,35 @@ Starting from version `v0.10.3-dev1012`, the DuckDB CLI includes native support - Implement full-text search on datasets - And more! 
For a complete list of DuckDB features, visit the DuckDB documentation. -Let's start with a quick demo to query the full rows of a dataset under the `refs/convert/parquet` revision: +To start the CLI, execute the following command in the installation folder: ```bash +./duckdb +``` + +## Forming the Hugging Face URL + +To access Hugging Face datasets, use the following URL format: + +```plaintext +hf://datasets/{my-username}/{my-dataset}@~parquet/{path_to_parquet_file} +``` + +Where: +- **my-username** The user or organization of the dataset, e.g. `ibm` +- **my-dataset** Is the dataset name, e.g: `duorc` +- **path_to_parquet_file** Is the parquet file path, it supports glob patterns, e.g `**/*.parquet` to query all parquet files + + +Let's start with a quick demo to query the full rows of a dataset under the `refs/convert/parquet` revision: + +```sql FROM 'hf://datasets/ibm/duorc@~parquet/**/*.parquet'; ``` -#TODO: Put an image of the output? +Or using traditional SQL syntax: +```sql +SELECT * FROM 'hf://datasets/ibm/duorc@~parquet/**/*.parquet'; +``` In the following sections, we will cover more complex operations you can perform with DuckDB on Hugging Face datasets. diff --git a/docs/source/duckdb_cli_auth.md b/docs/source/duckdb_cli_auth.md new file mode 100644 index 0000000000..0b5c5d96bf --- /dev/null +++ b/docs/source/duckdb_cli_auth.md @@ -0,0 +1,22 @@ +# Authentication for private and gated datasets + +To access private or gated datasets, you need to configure your Hugging Face Token in the DuckDB Secrets Manager. + +Visit [Hugging Face Settings - Tokens](https://huggingface.co/settings/tokens) to obtain your access token. + +DuckDB supports two providers for managing secrets: + +- `CONFIG`: Requires the user to pass all configuration information into the CREATE SECRET statement. +- `CREDENTIAL_CHAIN`: Automatically tries to fetch credentials. 
For a Hugging Face token, it will try to fetch it from `~/.cache/huggingface/token`.
+
+For more information about DuckDB Secrets visit https://duckdb.org/docs/configuration/secrets_manager.html
+
+## Creating a secret with `CONFIG` provider
+
+To create a secret using the CONFIG provider, use the following command:
+
+```bash
+CREATE SECRET hf_token (TYPE HUGGINGFACE, token 'your_hf_token');
+```
+
+Replace `your_hf_token` with your actual Hugging Face token.
\ No newline at end of file
From f46e704b911b65a9e2e10e32ea0c38c06778a60e Mon Sep 17 00:00:00 2001
From: Andrea Soria
Date: Thu, 16 May 2024 10:15:36 -0400
Subject: [PATCH 03/22] Adding credential_chain doc

---
 docs/source/duckdb_cli_auth.md | 28 ++++++++++++++++++++++++++--
 1 file changed, 26 insertions(+), 2 deletions(-)

diff --git a/docs/source/duckdb_cli_auth.md b/docs/source/duckdb_cli_auth.md
index 0b5c5d96bf..0dc072f08d 100644
--- a/docs/source/duckdb_cli_auth.md
+++ b/docs/source/duckdb_cli_auth.md
@@ -16,7 +16,31 @@ For more information about DuckDB Secrets visit https://duckdb.org/docs/configur
 To create a secret using the CONFIG provider, use the following command:
 
 ```bash
-CREATE SECRET hf_token (TYPE HUGGINGFACE, token 'your_hf_token');
+CREATE SECRET hf_token (TYPE HUGGINGFACE, TOKEN 'your_hf_token');
 ```
 
-Replace `your_hf_token` with your actual Hugging Face token.
\ No newline at end of file
+Replace `your_hf_token` with your actual Hugging Face token.
+
+## Creating a secret with `CREDENTIAL_CHAIN` provider
+
+To create a secret using the CREDENTIAL_CHAIN provider, use the following command:
+
+```bash
+CREATE SECRET hf_token (TYPE HUGGINGFACE, PROVIDER credential_chain);
+```
+
+This command automatically retrieves the stored token from `~/.cache/huggingface/token`.
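+
+Once a secret exists, subsequent `hf://` reads attach the token automatically. As an end-to-end sketch (the dataset path below is a placeholder, not a real repository):
+
+```sql
+-- Register the token for this session via the stored Hugging Face credentials.
+CREATE SECRET hf_token (TYPE HUGGINGFACE, PROVIDER credential_chain);
+
+-- 'my-username/my-private-dataset' is hypothetical; substitute one of your own private repos.
+SELECT COUNT(*) FROM 'hf://datasets/my-username/my-private-dataset@~parquet/**/*.parquet';
+```
+
+DuckDB can also persist a secret across sessions with `CREATE PERSISTENT SECRET` in place of `CREATE SECRET`.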
+
+If you haven't configured your token, execute the following command in the terminal:
+
+```bash
+huggingface-cli login
+```
+
+Alternatively, you can set your Hugging Face token as an environment variable:
+
+```bash
+export HF_TOKEN="HF_XXXXXXXXXXXXX"
+```
+
+For more information on authentication, see the [Hugging Face authentication](https://huggingface.co/docs/huggingface_hub/main/en/quick-start#authentication) documentation.
From 573cb71b348139a41798956774a92a4ea217bc55 Mon Sep 17 00:00:00 2001
From: Andrea Soria
Date: Thu, 16 May 2024 17:30:53 -0400
Subject: [PATCH 04/22] Add query datasets doc

---
 docs/source/duckdb_cli_select.md | 75 ++++++++++++++++++++++++++++++++
 1 file changed, 75 insertions(+)
 create mode 100644 docs/source/duckdb_cli_select.md

diff --git a/docs/source/duckdb_cli_select.md b/docs/source/duckdb_cli_select.md
new file mode 100644
index 0000000000..5d163fb8ce
--- /dev/null
+++ b/docs/source/duckdb_cli_select.md
@@ -0,0 +1,75 @@
+# Query public, gated and private datasets
+
+Querying datasets is a fundamental step in data analysis. Here, we'll guide you through querying datasets using various methods.
+
+You can query Hugging Face [autoconverted parquet files](https://huggingface.co/docs/datasets-server/en/parquet#conversion-to-parquet) in the `refs/convert/parquet` branch by using the following syntax:
+
+
+```plaintext
+hf://datasets/{my-username}/{my-dataset}@~parquet/{path_to_parquet_file}
+```
+
+There are [different ways](https://duckdb.org/docs/data/parquet/overview.html) to select your data.
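+
+For instance, filling in the template above for the `ibm/duorc` dataset gives a concrete path such as:
+
+```plaintext
+hf://datasets/ibm/duorc@~parquet/ParaphraseRC/train/0000.parquet
+```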
+
+Using `FROM` syntax:
+```bash
+FROM 'hf://datasets/ibm/duorc@~parquet/ParaphraseRC/train/0000.parquet';
+```
+
+Using `SELECT` `FROM` syntax:
+
+```bash
+SELECT question, answers FROM 'hf://datasets/ibm/duorc@~parquet/ParaphraseRC/train/0000.parquet' LIMIT 10;
+```
+
+Count all parquet files matching a glob pattern:
+
+```bash
+SELECT COUNT(*) FROM 'hf://datasets/ibm/duorc@~parquet/**/*.parquet';
+```
+
+Select using [read_parquet](https://duckdb.org/docs/guides/file_formats/query_parquet.html) function:
+
+```bash
+SELECT * FROM read_parquet('hf://datasets/ibm/duorc@~parquet/ParaphraseRC/**/*.parquet') LIMIT 10;
+```
+
+Read all files that match a glob pattern and include a filename column specifying which file each row came from:
+
+```bash
+SELECT * FROM read_parquet('hf://datasets/ibm/duorc@~parquet/ParaphraseRC/**/*.parquet', filename = true) LIMIT 10;
+```
+
+Using `parquet_scan` function:
+
+```bash
+SELECT * FROM parquet_scan('hf://datasets/ibm/duorc@~parquet/ParaphraseRC/**/*.parquet') LIMIT 10;
+```
+
+## Get information of parquet files
+
+The [parquet_metadata](https://duckdb.org/docs/data/parquet/metadata.html) function can be used to query the metadata contained within a Parquet file.
+
+```bash
+SELECT * FROM parquet_metadata('hf://datasets/ibm/duorc@~parquet/ParaphraseRC/train/0000.parquet');
+```
+
+Fetch the column names and column types:
+
+```bash
+DESCRIBE SELECT * FROM 'hf://datasets/ibm/duorc@~parquet/ParaphraseRC/train/0000.parquet';
+```
+
+Fetch the internal schema:
+
+```bash
+SELECT * FROM parquet_schema('hf://datasets/ibm/duorc@~parquet/ParaphraseRC/train/0000.parquet');
+```
+
+## Get statistics of the parquet files
+
+The `SUMMARIZE` command can be used to get various aggregates over a query (min, max, approx_unique, avg, std, q25, q50, q75, count). It returns these along with the column name, column type, and the percentage of NULL values.
+ +```bash +SUMMARIZE SELECT * FROM 'hf://datasets/ibm/duorc@~parquet/ParaphraseRC/train/0000.parquet'; +``` From 4fe6c54fc6a9d3197f0f0491a7f6a31dcad42eb2 Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Thu, 16 May 2024 17:52:44 -0400 Subject: [PATCH 05/22] Change sample dataset --- docs/source/duckdb_cli_select.md | 106 +++++++++++++++++++++++++++---- 1 file changed, 93 insertions(+), 13 deletions(-) diff --git a/docs/source/duckdb_cli_select.md b/docs/source/duckdb_cli_select.md index 5d163fb8ce..d9ebc440b5 100644 --- a/docs/source/duckdb_cli_select.md +++ b/docs/source/duckdb_cli_select.md @@ -13,63 +13,143 @@ There are [different ways](https://duckdb.org/docs/data/parquet/overview.html) t Using `FROM` syntax: ```bash -FROM 'hf://datasets/ibm/duorc@~parquet/ParaphraseRC/train/0000.parquet'; +FROM 'hf://datasets/jamescalam/world-cities-geo@~parquet/default/train/0000.parquet' SELECT city, country, region LIMIT 3; + +┌────────────────┬─────────────┬───────────────┐ +│ city │ country │ region │ +│ varchar │ varchar │ varchar │ +├────────────────┼─────────────┼───────────────┤ +│ Kabul │ Afghanistan │ Southern Asia │ +│ Kandahar │ Afghanistan │ Southern Asia │ +│ Mazar-e Sharif │ Afghanistan │ Southern Asia │ +└────────────────┴─────────────┴───────────────┘ + ``` Using `SELECT` `FROM` sytax: ```bash -SELECT question, answers FROM 'hf://datasets/ibm/duorc@~parquet/ParaphraseRC/train/0000.parquet' LIMIT 10; +SELECT city, country, region FROM 'hf://datasets/jamescalam/world-cities-geo@~parquet/default/train/0000.parquet' USING SAMPLE 3; + +┌──────────┬─────────┬────────────────┐ +│ city │ country │ region │ +│ varchar │ varchar │ varchar │ +├──────────┼─────────┼────────────────┤ +│ Wenzhou │ China │ Eastern Asia │ +│ Valdez │ Ecuador │ South America │ +│ Aplahoue │ Benin │ Western Africa │ +└──────────┴─────────┴────────────────┘ + ``` Count all parquet files matching a glob pattern: ```bash -SELECT COUNT(*) FROM 'hf://datasets/ibm/duorc@~parquet/**/*.parquet'; 
+SELECT COUNT(*) FROM 'hf://datasets/jamescalam/world-cities-geo@~parquet/**/*.parquet'; + +┌──────────────┐ +│ count_star() │ +│ int64 │ +├──────────────┤ +│ 9083 │ +└──────────────┘ + ``` Select using [read_parquet](https://duckdb.org/docs/guides/file_formats/query_parquet.html) function: ```bash -SELECT * FROM read_parquet('hf://datasets/ibm/duorc@~parquet/ParaphraseRC/**/*.parquet') LIMIT 10; +SELECT * FROM read_parquet('hf://datasets/jamescalam/world-cities-geo@~parquet/default/**/*.parquet') LIMIT 3; ``` Read all files that match a glob pattern and include a filename column specifying which file each row came from: ```bash -SELECT * FROM read_parquet('hf://datasets/ibm/duorc@~parquet/ParaphraseRC/**/*.parquet', filename = true) LIMIT 10; +SELECT * FROM read_parquet('hf://datasets/jamescalam/world-cities-geo@~parquet/default/**/*.parquet', filename = true) LIMIT 3; ``` Using `parquet_scan` function: ```bash -SELECT * FROM parquet_scan('hf://datasets/ibm/duorc@~parquet/ParaphraseRC/**/*.parquet') LIMIT 10; +SELECT * FROM parquet_scan('hf://datasets/jamescalam/world-cities-geo@~parquet/default/**/*.parquet') LIMIT 3; ``` -## Get information of parquet files +## Get metadata and schema The [parquet_metadata]((https://duckdb.org/docs/data/parquet/metadata.html)) function can be used to query the metadata contained within a Parquet file. 
```bash -SELECT * FROM parquet_metadata('hf://datasets/ibm/duorc@~parquet/ParaphraseRC/train/0000.parquet'); +SELECT * FROM parquet_metadata('hf://datasets/jamescalam/world-cities-geo@~parquet/default/train/0000.parquet'); + +┌───────────────────────────────────────────────────────────────────────────────┬──────────────┬────────────────────┬─────────────┐ +│ file_name │ row_group_id │ row_group_num_rows │ compression │ +│ varchar │ int64 │ int64 │ varchar │ +├───────────────────────────────────────────────────────────────────────────────┼──────────────┼────────────────────┼─────────────┤ +│ hf://datasets/jamescalam/world-cities-geo@~parquet/default/train/0000.parquet │ 0 │ 1000 │ SNAPPY │ +│ hf://datasets/jamescalam/world-cities-geo@~parquet/default/train/0000.parquet │ 0 │ 1000 │ SNAPPY │ +│ hf://datasets/jamescalam/world-cities-geo@~parquet/default/train/0000.parquet │ 0 │ 1000 │ SNAPPY │ +└───────────────────────────────────────────────────────────────────────────────┴──────────────┴────────────────────┴─────────────┘ + ``` Fetch the column names and column types: ```bash -DESCRIBE SELECT * FROM 'hf://datasets/ibm/duorc@~parquet/ParaphraseRC/train/0000.parquet'; +DESCRIBE SELECT * FROM 'hf://datasets/jamescalam/world-cities-geo@~parquet/default/train/0000.parquet'; + +┌─────────────┬─────────────┬─────────┬─────────┬─────────┬─────────┐ +│ column_name │ column_type │ null │ key │ default │ extra │ +│ varchar │ varchar │ varchar │ varchar │ varchar │ varchar │ +├─────────────┼─────────────┼─────────┼─────────┼─────────┼─────────┤ +│ city │ VARCHAR │ YES │ │ │ │ +│ country │ VARCHAR │ YES │ │ │ │ +│ region │ VARCHAR │ YES │ │ │ │ +│ continent │ VARCHAR │ YES │ │ │ │ +│ latitude │ DOUBLE │ YES │ │ │ │ +│ longitude │ DOUBLE │ YES │ │ │ │ +│ x │ DOUBLE │ YES │ │ │ │ +│ y │ DOUBLE │ YES │ │ │ │ +│ z │ DOUBLE │ YES │ │ │ │ +└─────────────┴─────────────┴─────────┴─────────┴─────────┴─────────┘ + ``` -Fetch the internal schema: +Fetch the internal schema (Exclusing file 
name): ```bash -SELECT * FROM parquet_schema('hf://datasets/ibm/duorc@~parquet/ParaphraseRC/train/0000.parquet'); +SELECT * EXCLUDE (file_name) FROM parquet_schema('hf://datasets/jamescalam/world-cities-geo@~parquet/default/train/0000.parquet'); + +┌───────────┬────────────┬─────────────┬─────────────────┬──────────────┬────────────────┬───────┬───────────┬──────────┬──────────────┐ +│ name │ type │ type_length │ repetition_type │ num_children │ converted_type │ scale │ precision │ field_id │ logical_type │ +│ varchar │ varchar │ varchar │ varchar │ int64 │ varchar │ int64 │ int64 │ int64 │ varchar │ +├───────────┼────────────┼─────────────┼─────────────────┼──────────────┼────────────────┼───────┼───────────┼──────────┼──────────────┤ +│ schema │ │ │ REQUIRED │ 9 │ │ │ │ │ │ +│ city │ BYTE_ARRAY │ │ OPTIONAL │ │ UTF8 │ │ │ │ StringType() │ +│ country │ BYTE_ARRAY │ │ OPTIONAL │ │ UTF8 │ │ │ │ StringType() │ +│ region │ BYTE_ARRAY │ │ OPTIONAL │ │ UTF8 │ │ │ │ StringType() │ +│ continent │ BYTE_ARRAY │ │ OPTIONAL │ │ UTF8 │ │ │ │ StringType() │ +│ latitude │ DOUBLE │ │ OPTIONAL │ │ │ │ │ │ │ +│ longitude │ DOUBLE │ │ OPTIONAL │ │ │ │ │ │ │ +│ x │ DOUBLE │ │ OPTIONAL │ │ │ │ │ │ │ +│ y │ DOUBLE │ │ OPTIONAL │ │ │ │ │ │ │ +│ z │ DOUBLE │ │ OPTIONAL │ │ │ │ │ │ │ +├───────────┴────────────┴─────────────┴─────────────────┴──────────────┴────────────────┴───────┴───────────┴──────────┴──────────────┤ + ``` -## Get statistics of the parquet files +## Get statistics The `SUMMARIZE` command can be used to get various aggregates over a query (min, max, approx_unique, avg, std, q25, q50, q75, count). It returns these along with the column name, column type, and the percentage of NULL values. 
```bash -SUMMARIZE SELECT * FROM 'hf://datasets/ibm/duorc@~parquet/ParaphraseRC/train/0000.parquet'; +SUMMARIZE SELECT latitude, longitude FROM 'hf://datasets/jamescalam/world-cities-geo@~parquet/default/train/0000.parquet'; + +┌─────────────┬─────────────┬──────────────┬─────────────┬───────────────┬────────────────────┬───────────────────┬────────────────────┬───────────────────┬────────────────────┬───────┐ +│ column_name │ column_type │ min │ max │ approx_unique │ avg │ std │ q25 │ q50 │ q75 │ count │ +│ varchar │ varchar │ varchar │ varchar │ int64 │ varchar │ varchar │ varchar │ varchar │ varchar │ int64 │ +├─────────────┼─────────────┼──────────────┼─────────────┼───────────────┼────────────────────┼───────────────────┼────────────────────┼───────────────────┼────────────────────┼───────┤ +│ latitude │ DOUBLE │ -54.8 │ 67.8557214 │ 7324 │ 22.5004568364307 │ 26.77045468469093 │ 6.065424395863388 │ 29.33687520478191 │ 44.88357641321427 │ 9083 │ +│ longitude │ DOUBLE │ -175.2166595 │ 179.3833313 │ 7802 │ 14.699333721953098 │ 63.93672742608224 │ -7.077471714978484 │ 19.19758476462836 │ 43.782932169927165 │ 9083 │ +└─────────────┴─────────────┴──────────────┴─────────────┴───────────────┴────────────────────┴───────────────────┴────────────────────┴───────────────────┴────────────────────┴───────┘ + ``` From cd462caf22970d6f2b34244506796d7b6c92c3e9 Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Tue, 21 May 2024 18:56:29 -0400 Subject: [PATCH 06/22] sql operations --- docs/source/duckdb_cli_sql.md | 141 ++++++++++++++++++++++++++++++++++ 1 file changed, 141 insertions(+) create mode 100644 docs/source/duckdb_cli_sql.md diff --git a/docs/source/duckdb_cli_sql.md b/docs/source/duckdb_cli_sql.md new file mode 100644 index 0000000000..5404f00621 --- /dev/null +++ b/docs/source/duckdb_cli_sql.md @@ -0,0 +1,141 @@ +# Performing SQL operations + +Performing SQL operations with DuckDB opens up a world of possibilities for querying datasets efficiently. 
Let's dive into some examples showcasing the power of DuckDB functions. + +For our demonstration, we'll explore a fascinating dataset: A the multitask test containing multiple-choice questions spanning various domains of knowledge. You can access the dataset [here](https://huggingface.co/datasets/cais/mmlu). + +To preview the dataset, let's select a sample of 3 rows: + +```bash +FROM 'hf://datasets/cais/mmlu@~parquet/all/test/*.parquet' USING SAMPLE 3; + +┌──────────────────────┬──────────────────┬──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬────────┐ +│ question │ subject │ choices │ answer │ +│ varchar │ varchar │ varchar[] │ int64 │ +├──────────────────────┼──────────────────┼──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┼────────┤ +│ Which of the follo… │ miscellaneous │ [A campaign manager's introduction of a mayoral candidate, A professor's lecture on the structure of the heart, A company president's yearly sales report,… │ 0 │ +│ Are strategies of … │ security_studies │ [No, arms control does not serve any purpose outside the realm of traditional military weapons., Yes, successful policies of arms control and deterrence h… │ 2 │ +│ Find all c in Z_3 … │ abstract_algebra │ [0, 2, 1, 3] │ 1 │ +└──────────────────────┴──────────────────┴──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┴────────┘ + +``` + +This command retrieves a random sample of 3 rows from the dataset for us to examine. + +Let's start by examining the schema of our dataset. 
The following table outlines the structure of our dataset:
+
+```bash
+DESCRIBE FROM 'hf://datasets/cais/mmlu@~parquet/all/test/*.parquet' USING SAMPLE 3;
+┌─────────────┬─────────────┬─────────┬─────────┬─────────┬─────────┐
+│ column_name │ column_type │ null    │ key     │ default │ extra   │
+│ varchar     │ varchar     │ varchar │ varchar │ varchar │ varchar │
+├─────────────┼─────────────┼─────────┼─────────┼─────────┼─────────┤
+│ question    │ VARCHAR     │ YES     │         │         │         │
+│ subject     │ VARCHAR     │ YES     │         │         │         │
+│ choices     │ VARCHAR[]   │ YES     │         │         │         │
+│ answer      │ BIGINT      │ YES     │         │         │         │
+└─────────────┴─────────────┴─────────┴─────────┴─────────┴─────────┘
+
+```
+Next, let's analyze if there are any duplicated records in our dataset:
+
+```bash
+SELECT *,
+       COUNT(*) AS counts
+FROM 'hf://datasets/cais/mmlu@~parquet/all/test/*.parquet'
+GROUP BY ALL
+HAVING counts > 2;
+
+┌──────────┬─────────┬───────────┬────────┬────────┐
+│ question │ subject │ choices   │ answer │ counts │
+│ varchar  │ varchar │ varchar[] │ int64  │ int64  │
+├──────────┴─────────┴───────────┴────────┴────────┤
+│                      0 rows                      │
+└──────────────────────────────────────────────────┘
+
+```
+
+Fortunately, our dataset doesn't contain any duplicate records.
+
+Let's see the proportion of questions based on the subject in a bar representation:
+
+```bash
+SELECT
+    subject,
+    COUNT(*) AS counts,
+    BAR(COUNT(*), 0, (SELECT COUNT(*) FROM 'hf://datasets/cais/mmlu@~parquet/all/test/*.parquet')) AS percentage
+FROM
+    'hf://datasets/cais/mmlu@~parquet/all/test/*.parquet'
+GROUP BY
+    subject
+ORDER BY
+    counts DESC;
+```
+
+Now, let's prepare a subset of the dataset containing questions related to **nutrition** and create a mapping of questions to correct answers:
+Notice that we have the column **choices** from which we can get the correct answer using the **answer** column as an index.
+ +```bash +SELECT * +FROM 'hf://datasets/cais/mmlu@~parquet/all/test/*.parquet' +WHERE subject = 'nutrition' LIMIT 3; + +┌──────────────────────┬───────────┬─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬────────┐ +│ question │ subject │ choices │ answer │ +│ varchar │ varchar │ varchar[] │ int64 │ +├──────────────────────┼───────────┼─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┼────────┤ +│ Which foods tend t… │ nutrition │ [Meat, Confectionary, Fruits and vegetables, Potatoes] │ 2 │ +│ In which one of th… │ nutrition │ [If the incidence rate of the disease falls., If survival time with the disease increases., If recovery of the disease is faster., If the population in which the… │ 1 │ +│ Which of the follo… │ nutrition │ [The flavonoid class comprises flavonoids and isoflavonoids., The digestibility and bioavailability of isoflavones in soya food products are not changed by proce… │ 0 │ +└──────────────────────┴───────────┴─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┴────────┘ + +``` + +```bash +SELECT question, + choices[answer] AS correct_answer +FROM 'hf://datasets/cais/mmlu@~parquet/all/test/*.parquet' +WHERE subject = 'nutrition' LIMIT 3; + +┌─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬─────────────────────────────────────────────┐ +│ question │ correct_answer │ +│ varchar │ varchar │ +├─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┼─────────────────────────────────────────────┤ +│ Which foods tend to be consumed in lower 
quantities in Wales and Scotland (as of 2020)?\n │ Confectionary │ +│ In which one of the following circumstances will the prevalence of a disease in the population increase, all else being constant?\n │ If the incidence rate of the disease falls. │ +│ Which of the following statements is correct?\n │ │ +└─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┴─────────────────────────────────────────────┘ + +``` + +To ensure data cleanliness, let's remove any newline characters at the end of the questions and filter out any empty answers: + +```bash +SELECT regexp_replace(question, '\n', '') AS question, + choices[answer] AS correct_answer +FROM 'hf://datasets/cais/mmlu@~parquet/all/test/*.parquet' +WHERE subject = 'nutrition' AND LENGTH(correct_answer) > 0 LIMIT 3; + +┌───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬─────────────────────────────────────────────┐ +│ question │ correct_answer │ +│ varchar │ varchar │ +├───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┼─────────────────────────────────────────────┤ +│ Which foods tend to be consumed in lower quantities in Wales and Scotland (as of 2020)? │ Confectionary │ +│ In which one of the following circumstances will the prevalence of a disease in the population increase, all else being constant? │ If the incidence rate of the disease falls. │ +│ Which vitamin is a major lipid-soluble antioxidant in cell membranes? 
│ Vitamin D │ +└───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┴─────────────────────────────────────────────┘ + +``` +Finally, lets hightlight some of the DuckDB functions used in this section: +- `DESCRIBE`, returns the table schema +- `USING SAMPLE`, +- `BAR`, Draw a band whose width is proportional to (x - min) and equal to width characters when x = max. width defaults to 80. +- `string[begin:end]`, Extract a string using slice conventions. Missing begin or end arguments are interpreted as the beginning or end of the list respectively. Negative values are accepted. +- `regexp_replace`, If string contains the regexp pattern, replaces the matching part with replacement +- `LENGTH`, gets the number of characters in string + + + +There are plenty of useful functions available at https://duckdb.org/docs/sql/functions/overview. The best part is that you can now directly use them on Hugging Face datasets. + + From 4bd4425d1f19a39883005923f2dcf5f5ded7e30e Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Tue, 21 May 2024 18:59:25 -0400 Subject: [PATCH 07/22] Complete information --- docs/source/duckdb_cli_sql.md | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/docs/source/duckdb_cli_sql.md b/docs/source/duckdb_cli_sql.md index 5404f00621..ae96ef5791 100644 --- a/docs/source/duckdb_cli_sql.md +++ b/docs/source/duckdb_cli_sql.md @@ -70,6 +70,23 @@ GROUP BY subject ORDER BY counts DESC; + +┌──────────────────────────────┬────────┬────────────────────────────────────────────────────────────────────────────────┐ +│ subject │ counts │ percentage │ +│ varchar │ int64 │ varchar │ +├──────────────────────────────┼────────┼────────────────────────────────────────────────────────────────────────────────┤ +│ professional_law │ 1534 │ ████████▋ │ +│ moral_scenarios │ 895 │ █████ │ +│ miscellaneous │ 783 │ ████▍ │ +│ professional_psychology │ 612 │ ███▍ │ 
+│ high_school_psychology │ 545 │ ███ │ +│ high_school_macroeconomics │ 390 │ ██▏ │ +│ elementary_mathematics │ 378 │ ██▏ │ +│ moral_disputes │ 346 │ █▉ │ +├──────────────────────────────┴────────┴────────────────────────────────────────────────────────────────────────────────┤ +│ 57 rows (8 shown) 3 columns │ +└────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ + ``` Now, let's prepare a subset of the dataset containing questions related to **nutrition** and create a mapping of questions to correct answers: @@ -128,10 +145,10 @@ WHERE subject = 'nutrition' AND LENGTH(correct_answer) > 0 LIMIT 3; ``` Finally, lets hightlight some of the DuckDB functions used in this section: - `DESCRIBE`, returns the table schema -- `USING SAMPLE`, -- `BAR`, Draw a band whose width is proportional to (x - min) and equal to width characters when x = max. width defaults to 80. -- `string[begin:end]`, Extract a string using slice conventions. Missing begin or end arguments are interpreted as the beginning or end of the list respectively. Negative values are accepted. -- `regexp_replace`, If string contains the regexp pattern, replaces the matching part with replacement +- `USING SAMPLE`, samples are used to randomly select a subset of a dataset. +- `BAR`, draws a band whose width is proportional to (x - min) and equal to width characters when x = max. width defaults to 80. +- `string[begin:end]`, extracts a string using slice conventions. Missing begin or end arguments are interpreted as the beginning or end of the list respectively. Negative values are accepted. 
+- `regexp_replace`, if string contains the regexp pattern, replaces the matching part with replacement - `LENGTH`, gets the number of characters in string From d1311671febc2aa2eb487192038b191c8b473ccd Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Wed, 22 May 2024 16:54:00 -0400 Subject: [PATCH 08/22] Apply code review suggestions --- docs/source/duckdb_cli.md | 14 ++++++++++---- docs/source/duckdb_cli_select.md | 15 +++++---------- 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/docs/source/duckdb_cli.md b/docs/source/duckdb_cli.md index 8d37ce6bab..3d65e951b6 100644 --- a/docs/source/duckdb_cli.md +++ b/docs/source/duckdb_cli.md @@ -29,7 +29,7 @@ To start the CLI, execute the following command in the installation folder: To access Hugging Face datasets, use the following URL format: ```plaintext -hf://datasets/{my-username}/{my-dataset}@~parquet/{path_to_parquet_file} +hf://datasets/{my-username}/{my-dataset}/{path_to_parquet_file} ``` Where: @@ -38,15 +38,21 @@ Where: - **path_to_parquet_file** Is the parquet file path, it supports glob patterns, e.g `**/*.parquet` to query all parquet files -Let's start with a quick demo to query the full rows of a dataset under the `refs/convert/parquet` revision: + + +You can query auto-converted Parquet files using the @~parquet branch, which corresponds to the refs/convert/parquet revision. For more details, refer to the documentation at https://huggingface.co/docs/datasets-server/en/parquet#conversion-to-parquet. 
+

+

+Let's start with a quick demo to query the first rows of a dataset:

```sql
-FROM 'hf://datasets/ibm/duorc@~parquet/**/*.parquet';
+FROM 'hf://datasets/ibm/duorc/ParaphraseRC/*.parquet' LIMIT 3;
```

Or using traditional SQL syntax:

```sql
-SELECT * FROM 'hf://datasets/ibm/duorc@~parquet/**/*.parquet';
+SELECT * FROM 'hf://datasets/ibm/duorc/ParaphraseRC/*.parquet' LIMIT 3;
```

In the following sections, we will cover more complex operations you can perform with DuckDB on Hugging Face datasets.
diff --git a/docs/source/duckdb_cli_select.md b/docs/source/duckdb_cli_select.md
index d9ebc440b5..a59d40f283 100644
--- a/docs/source/duckdb_cli_select.md
+++ b/docs/source/duckdb_cli_select.md
@@ -2,18 +2,11 @@

 Querying datasets is a fundamental step in data analysis. Here, we'll guide you through querying datasets using various methods.

-You can query Hugging Face [autoconverted parquet files](https://huggingface.co/docs/datasets-server/en/parquet#conversion-to-parquet) in the `refs/converts/parquet` branch by using the following syntax:
-
-
-```plaintext
-hf://datasets/{my-username}/{my-dataset}@~parquet/{path_to_parquet_file}
-```
-
 There are [different ways](https://duckdb.org/docs/data/parquet/overview.html) to select your data. 
Using `FROM` syntax: ```bash -FROM 'hf://datasets/jamescalam/world-cities-geo@~parquet/default/train/0000.parquet' SELECT city, country, region LIMIT 3; +FROM 'hf://datasets/jamescalam/world-cities-geo/train.jsonl' SELECT city, country, region LIMIT 3; ┌────────────────┬─────────────┬───────────────┐ │ city │ country │ region │ @@ -29,7 +22,7 @@ FROM 'hf://datasets/jamescalam/world-cities-geo@~parquet/default/train/0000.parq Using `SELECT` `FROM` sytax: ```bash -SELECT city, country, region FROM 'hf://datasets/jamescalam/world-cities-geo@~parquet/default/train/0000.parquet' USING SAMPLE 3; +SELECT city, country, region FROM 'hf://datasets/jamescalam/world-cities-geo/train.jsonl' USING SAMPLE 3; ┌──────────┬─────────┬────────────────┐ │ city │ country │ region │ @@ -45,7 +38,7 @@ SELECT city, country, region FROM 'hf://datasets/jamescalam/world-cities-geo@~pa Count all parquet files matching a glob pattern: ```bash -SELECT COUNT(*) FROM 'hf://datasets/jamescalam/world-cities-geo@~parquet/**/*.parquet'; +SELECT COUNT(*) FROM 'hf://datasets/jamescalam/world-cities-geo/*.jsonl'; ┌──────────────┐ │ count_star() │ @@ -56,6 +49,8 @@ SELECT COUNT(*) FROM 'hf://datasets/jamescalam/world-cities-geo@~parquet/**/*.pa ``` +You can also query Parquet files using the read_parquet and parquet_scan functions. Let's explore these functions using the auto-converted Parquet files for the same dataset. 
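As an aside on the glob pattern used in the count above: a pattern like `**/*.parquet` fans out recursively over every matching file under a path, and DuckDB applies the same expansion to remote `hf://` paths. Here is a minimal sketch of that fan-out using only the Python standard library, with a throwaway local tree and invented file names standing in for the remote dataset layout:

```python
import tempfile
from pathlib import Path

# Build a tiny throwaway tree that mimics a dataset layout.
root = Path(tempfile.mkdtemp())
for rel in ["default/train/0000.parquet", "default/test/0000.parquet", "README.md"]:
    path = root / rel
    path.parent.mkdir(parents=True, exist_ok=True)
    path.touch()

# `**/*.parquet` matches every parquet file at any depth; README.md is skipped.
matches = sorted(p.relative_to(root).as_posix() for p in root.glob("**/*.parquet"))
print(matches)
```

This is why a single `COUNT(*)` over a glob, as in the query above, can aggregate across every shard of a dataset in one statement.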
+ Select using [read_parquet](https://duckdb.org/docs/guides/file_formats/query_parquet.html) function: ```bash From aef0ac848208989af2aaae488574fca46e76544d Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Wed, 22 May 2024 16:56:57 -0400 Subject: [PATCH 09/22] Change release tag --- docs/source/duckdb_cli.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/duckdb_cli.md b/docs/source/duckdb_cli.md index 3d65e951b6..d3fe6dbd5e 100644 --- a/docs/source/duckdb_cli.md +++ b/docs/source/duckdb_cli.md @@ -8,7 +8,7 @@ For installation details, visit the [installation page](https://duckdb.org/docs/ -Starting from version `v0.10.3-dev1012`, the DuckDB CLI includes native support for accessing datasets on Hugging Face via URLs. Here are some features you can leverage with this powerful tool: +Starting from version `v0.10.3`, the DuckDB CLI includes native support for accessing datasets on Hugging Face via URLs. Here are some features you can leverage with this powerful tool: - Query public, gated and private datasets - Analyze datasets and perform SQL operations From c525899641bfacfeeb7a72897ff3f8c2914d376e Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Wed, 22 May 2024 17:17:52 -0400 Subject: [PATCH 10/22] Combine and export a result dataset --- docs/source/duckdb_cli.md | 3 +- docs/source/duckdb_cli_combine_and_export.md | 105 +++++++++++++++++++ 2 files changed, 106 insertions(+), 2 deletions(-) create mode 100644 docs/source/duckdb_cli_combine_and_export.md diff --git a/docs/source/duckdb_cli.md b/docs/source/duckdb_cli.md index d3fe6dbd5e..b2d64bb148 100644 --- a/docs/source/duckdb_cli.md +++ b/docs/source/duckdb_cli.md @@ -12,9 +12,8 @@ Starting from version `v0.10.3`, the DuckDB CLI includes native support for acce - Query public, gated and private datasets - Analyze datasets and perform SQL operations -- Process and transform datasets +- Combine datasets and export it different formats - Conduct vector similarity search on embedding 
datasets -- Export datasets to other formats - Implement full-text search on datasets - And more! For a complete list of DuckDB features, visit the DuckDB documentation. diff --git a/docs/source/duckdb_cli_combine_and_export.md b/docs/source/duckdb_cli_combine_and_export.md new file mode 100644 index 0000000000..0b799e3c78 --- /dev/null +++ b/docs/source/duckdb_cli_combine_and_export.md @@ -0,0 +1,105 @@ +# Combine datasets and export + +In this section, we'll combine two datasets and export the result. Let's start with our datasets: + + +The first will be [TheFusion21/PokemonCards](https://huggingface.co/datasets/TheFusion21/PokemonCards): + +```bash +FROM 'hf://datasets/TheFusion21/PokemonCards/train.csv' LIMIT 3; +┌─────────┬──────────────────────┬─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬────────────┬───────┬─────────────────┐ +│ id │ image_url │ caption │ name │ hp │ set_name │ +│ varchar │ varchar │ varchar │ varchar │ int64 │ varchar │ +├─────────┼──────────────────────┼─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┼────────────┼───────┼─────────────────┤ +│ pl3-1 │ https://images.pok… │ A Basic, SP Pokemon Card of type Darkness with the title Absol G and 70 HP of rarity Rare Holo from the set Supreme Victors. 
It has … │ Absol G │ 70 │ Supreme Victors │ +│ ex12-1 │ https://images.pok… │ A Stage 1 Pokemon Card of type Colorless with the title Aerodactyl and 70 HP of rarity Rare Holo evolved from Mysterious Fossil from … │ Aerodactyl │ 70 │ Legend Maker │ +│ xy5-1 │ https://images.pok… │ A Basic Pokemon Card of type Grass with the title Weedle and 50 HP of rarity Common from the set Primal Clash and the flavor text: It… │ Weedle │ 50 │ Primal Clash │ +└─────────┴──────────────────────┴─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┴────────────┴───────┴─────────────────┘ +``` + +And the second one will be [wanghaofan/pokemon-wiki-captions](https://huggingface.co/datasets/wanghaofan/pokemon-wiki-captions): + +```bash +FROM 'hf://datasets/wanghaofan/pokemon-wiki-captions/data/*.parquet' LIMIT 3; + +┌──────────────────────┬───────────┬──────────┬──────────────────────────────────────────────────────────────┬────────────────────────────────────────────────────────────────────────────────────────────────────┐ +│ image │ name_en │ name_zh │ text_en │ text_zh │ +│ struct(bytes blob,… │ varchar │ varchar │ varchar │ varchar │ +├──────────────────────┼───────────┼──────────┼──────────────────────────────────────────────────────────────┼────────────────────────────────────────────────────────────────────────────────────────────────────┤ +│ {'bytes': \x89PNG\… │ abomasnow │ 暴雪王 │ Grass attributes,Blizzard King standing on two feet, with … │ 草属性,双脚站立的暴雪王,全身白色的绒毛,淡紫色的眼睛,几缕长条装的毛皮盖着它的嘴巴 │ +│ {'bytes': \x89PNG\… │ abra │ 凯西 │ Super power attributes, the whole body is yellow, the head… │ 超能力属性,通体黄色,头部外形类似狐狸,尖尖鼻子,手和脚上都有三个指头,长尾巴末端带着一个褐色圆环 │ +│ {'bytes': \x89PNG\… │ absol │ 阿勃梭鲁 │ Evil attribute, with white hair, blue-gray part without ha… │ 恶属性,有白色毛发,没毛发的部分是蓝灰色,头右边类似弓的角,红色眼睛 │ 
+└──────────────────────┴───────────┴──────────┴──────────────────────────────────────────────────────────────┴────────────────────────────────────────────────────────────────────────────────────────────────────┘ + +``` + +Now, let's try to combine these two datasets joining by the `name` column: + +```bash +SELECT a.image_url + , a.caption AS card_caption + , a.name + , a.hp + , b.text_en as wiki_caption +FROM 'hf://datasets/TheFusion21/PokemonCards/train.csv' a +JOIN 'hf://datasets/wanghaofan/pokemon-wiki-captions/data/*.parquet' b +ON LOWER(a.name) = b.name_en +LIMIT 3; + +┌──────────────────────┬──────────────────────┬────────────┬───────┬──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ +│ image_url │ card_caption │ name │ hp │ wiki_caption │ +│ varchar │ varchar │ varchar │ int64 │ varchar │ +├──────────────────────┼──────────────────────┼────────────┼───────┼──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┤ +│ https://images.pok… │ A Stage 1 Pokemon … │ Aerodactyl │ 70 │ A Pokémon with rock attributes, gray body, blue pupils, purple inner wings, two sharp claws on the wings, jagged teeth, and an arrow-like … │ +│ https://images.pok… │ A Basic Pokemon Ca… │ Weedle │ 50 │ Insect-like, caterpillar-like in appearance, with a khaki-yellow body, seven pairs of pink gastropods, a pink nose, a sharp poisonous need… │ +│ https://images.pok… │ A Basic Pokemon Ca… │ Caterpie │ 50 │ Insect attributes, caterpillar appearance, green back, white abdomen, Y-shaped red antennae on the head, yellow spindle-shaped tail, two p… │ +└──────────────────────┴──────────────────────┴────────────┴───────┴──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ + +``` + +We can export the result to a Parquet file 
using the `COPY` command:
+
+```bash
+COPY (SELECT a.image_url
+    , a.caption AS card_caption
+    , a.name
+    , a.hp
+    , b.text_en as wiki_caption
+FROM 'hf://datasets/TheFusion21/PokemonCards/train.csv' a
+JOIN 'hf://datasets/wanghaofan/pokemon-wiki-captions/data/*.parquet' b
+ON LOWER(a.name) = b.name_en)
+TO 'output.parquet' (FORMAT PARQUET);
+```
+
+Let's validate the new Parquet file:
+
+```bash
+SELECT COUNT(*) FROM 'output.parquet';
+
+┌──────────────┐
+│ count_star() │
+│    int64     │
+├──────────────┤
+│         9460 │
+└──────────────┘
+
+```
+
+
+
+You can also export to [CSV](https://duckdb.org/docs/guides/file_formats/csv_export), [Excel](https://duckdb.org/docs/guides/file_formats/excel_export) and [JSON](https://duckdb.org/docs/guides/file_formats/json_export) formats.
+
+
+
+Finally, let's push the resulting dataset to the Hub using the `datasets` library in Python:
+
+```python
+from datasets import load_dataset
+
+dataset = load_dataset("parquet", data_files="output.parquet")
+dataset.push_to_hub("asoria/duckdb_combine_demo")
+```
+
+And that's it! You've successfully combined two datasets, exported the result, and uploaded it to the Hugging Face Hub.
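Since the combine-and-export workflow above is plain SQL, the join logic can be prototyped offline with any SQL engine before running it against the Hub. Below is a minimal sketch using Python's built-in `sqlite3` module, with two invented toy tables standing in for the card and wiki datasets:

```python
import sqlite3

con = sqlite3.connect(":memory:")
con.execute("CREATE TABLE cards (name TEXT, hp INTEGER)")
con.execute("CREATE TABLE wiki (name_en TEXT, text_en TEXT)")
con.executemany("INSERT INTO cards VALUES (?, ?)",
                [("Weedle", 50), ("Absol G", 70)])
con.executemany("INSERT INTO wiki VALUES (?, ?)",
                [("weedle", "Insect-like, caterpillar-like in appearance")])

# Same shape as the DuckDB query: normalize the join key with LOWER because
# the two tables capitalize names differently.
rows = con.execute(
    """
    SELECT c.name, c.hp, w.text_en
    FROM cards c
    JOIN wiki w ON LOWER(c.name) = w.name_en
    """
).fetchall()
print(rows)  # only 'Weedle' has a matching wiki row, so one row survives
```

An inner join like this keeps only the names present in both tables, which is why the row count of the exported file can differ from either source dataset.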
From 89648fcb7c7b255a8443a326547f32ad57de8626 Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Wed, 22 May 2024 17:20:21 -0400 Subject: [PATCH 11/22] Align sections --- docs/source/_toctree.yml | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index cc60b610cd..215e0d7ae6 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -46,14 +46,12 @@ title: Authentication for private and gated datasets - local: duckdb_cli_select title: Query datasets - - local: duckdb_cli_sql_operations + - local: duckdb_cli_sql title: Perform SQL operations - - local: duckdb_cli_process - title: Process datasets + - local: duckdb_cli_combine_and_export + title: Combine datasets and export - local: duckdb_cli_vector_similarity_search title: Perform vector similarity search - - local: duckdb_cli_export - title: Export to other formats - local: duckdb_cli_fts title: Implement full-text search - local: pandas From 4565ebe0b7ad9ad3d3aeb4108cbb65f0add49451 Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Wed, 22 May 2024 17:33:44 -0400 Subject: [PATCH 12/22] Adding vector search --- .../duckdb_cli_vector_similarity_search.md | 63 +++++++++++++++++++ 1 file changed, 63 insertions(+) create mode 100644 docs/source/duckdb_cli_vector_similarity_search.md diff --git a/docs/source/duckdb_cli_vector_similarity_search.md b/docs/source/duckdb_cli_vector_similarity_search.md new file mode 100644 index 0000000000..197d1533c8 --- /dev/null +++ b/docs/source/duckdb_cli_vector_similarity_search.md @@ -0,0 +1,63 @@ +# Vector Similarity Search + +In the latest release of DuckDB (version 0.10.0), a cool feature called Fixed-Length Arrays was added. This lets you use vector embeddings in DuckDB tables, making your data analysis even more powerful. + +Additionally, the array_cosine_similarity function was introduced. This function measures the cosine of the angle between two vectors, indicating their similarity. 
A value of 1 means they’re perfectly aligned, 0 means they’re perpendicular, and -1 means they’re completely opposite. + +Let's explore how to use this function for similarity searches. In this section, we’ll show you how to perform similarity searches using DuckDB. + +We will use the dataset [asoria/awesome-chatgpt-prompts-embeddings](https://huggingface.co/datasets/asoria/awesome-chatgpt-prompts-embeddings). + +First, let's preview a few records from the dataset: + +```bash +FROM 'hf://datasets/asoria/awesome-chatgpt-prompts-embeddings/data/*.parquet' SELECT act, prompt, len(embedding) as embed_len LIMIT 3; + +┌──────────────────────┬──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬───────────┐ +│ act │ prompt │ embed_len │ +│ varchar │ varchar │ int64 │ +├──────────────────────┼──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┼───────────┤ +│ Linux Terminal │ I want you to act as a linux terminal. I will type commands and you will reply with what the terminal should show. I want you to only reply with the terminal output insid… │ 384 │ +│ English Translator… │ I want you to act as an English translator, spelling corrector and improver. I will speak to you in any language and you will detect the language, translate it and answer… │ 384 │ +│ `position` Intervi… │ I want you to act as an interviewer. I will be the candidate and you will ask me the interview questions for the `position` position. 
I want you to only reply as the inte… │ 384 │ +└──────────────────────┴──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┴───────────┘ + +``` + +Next, let's choose an embedding to use for the similarity search: + +```bash +FROM 'hf://datasets/asoria/awesome-chatgpt-prompts-embeddings/data/*.parquet' SELECT embedding WHERE act = 'Linux Terminal'; + +┌─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ +│ embedding │ +│ float[] │ +├─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┤ +│ [-0.020781303, -0.029143505, -0.0660217, -0.00932716, -0.02601602, -0.011426172, 0.06627567, 0.11941507, 0.0013917526, 0.012889079, 0.053234346, -0.07380514, 0.04871567, -0.043601237, -0.0025319182, 0.0448… │ +└─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ + +``` + +Now, let's use the selected embedding to find similar records: + + +```bash +SELECT act, + prompt, + array_cosine_similarity(embedding::float[384], (SELECT embedding FROM 'hf://datasets/asoria/awesome-chatgpt-prompts-embeddings/data/*.parquet' WHERE act = 'Linux Terminal')::float[384]) AS similarity +FROM 'hf://datasets/asoria/awesome-chatgpt-prompts-embeddings/data/*.parquet' +ORDER BY similarity DESC +LIMIT 3; + +┌──────────────────────┬─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬────────────┐ +│ 
act │ prompt │ similarity │ +│ varchar │ varchar │ float │ +├──────────────────────┼─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┼────────────┤ +│ Linux Terminal │ I want you to act as a linux terminal. I will type commands and you will reply with what the terminal should show. I want you to only reply with the terminal output insi… │ 1.0 │ +│ JavaScript Console │ I want you to act as a javascript console. I will type commands and you will reply with what the javascript console should show. I want you to only reply with the termin… │ 0.7599728 │ +│ R programming Inte… │ I want you to act as a R interpreter. I'll type commands and you'll reply with what the terminal should show. I want you to only reply with the terminal output inside on… │ 0.7303775 │ +└──────────────────────┴─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┴────────────┘ + +``` + +That's it! You have successfully performed a vector similarity search using DuckDB. 
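To make the similarity scores above concrete, here is a pure-Python sketch of the formula that `array_cosine_similarity` computes: the dot product of the two vectors divided by the product of their magnitudes. This is only an illustration of the math, not DuckDB's actual implementation:

```python
import math

def cosine_similarity(a, b):
    # cos(theta) = dot(a, b) / (|a| * |b|)
    dot = sum(x * y for x, y in zip(a, b))
    norm_a = math.sqrt(sum(x * x for x in a))
    norm_b = math.sqrt(sum(x * x for x in b))
    return dot / (norm_a * norm_b)

# Vectors pointing the same way score close to 1.0, orthogonal vectors
# score 0.0, and opposite vectors score close to -1.0.
print(cosine_similarity([1.0, 2.0], [2.0, 4.0]))    # ~1.0
print(cosine_similarity([1.0, 0.0], [0.0, 1.0]))    # 0.0
print(cosine_similarity([1.0, 2.0], [-1.0, -2.0]))  # ~-1.0
```

This also explains the result table above: the 'Linux Terminal' embedding compared against itself scores 1.0, and prompts with similar wording rank just below it.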
From 8a154b98de4b91fc53e923d32a879e778c735a9a Mon Sep 17 00:00:00 2001
From: Andrea Francis Soria Jimenez 
Date: Thu, 23 May 2024 08:33:02 -0400
Subject: [PATCH 13/22] Apply suggestions from code review

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>
---
 docs/source/duckdb_cli.md | 16 ++++++++--------
 docs/source/duckdb_cli_auth.md | 4 ++--
 docs/source/duckdb_cli_combine_and_export.md | 4 ++--
 docs/source/duckdb_cli_select.md | 16 ++++++++--------
 .../duckdb_cli_vector_similarity_search.md | 6 +++---
 5 files changed, 23 insertions(+), 23 deletions(-)

diff --git a/docs/source/duckdb_cli.md b/docs/source/duckdb_cli.md
index b2d64bb148..1ec43419a0 100644
--- a/docs/source/duckdb_cli.md
+++ b/docs/source/duckdb_cli.md
@@ -8,14 +8,15 @@ For installation details, visit the [installation page](https://duckdb.org/docs/
 
 
 
-Starting from version `v0.10.3`, the DuckDB CLI includes native support for accessing datasets on Hugging Face via URLs. Here are some features you can leverage with this powerful tool:
+Starting from version `v0.10.3`, the DuckDB CLI includes native support for accessing datasets on the Hugging Face Hub via URLs. Here are some features you can leverage with this powerful tool:
 
-- Query public, gated and private datasets
+- Query public datasets and your own gated and private datasets
 - Analyze datasets and perform SQL operations
-- Combine datasets and export it different formats
+- Combine datasets and export them to different formats
 - Conduct vector similarity search on embedding datasets
 - Implement full-text search on datasets
-- And more! For a complete list of DuckDB features, visit the DuckDB documentation.
+
+For a complete list of DuckDB features, visit the DuckDB [documentation](https://duckdb.org/docs/). 
To start the CLI, execute the following command in the installation folder: @@ -31,10 +32,9 @@ To access Hugging Face datasets, use the following URL format: hf://datasets/{my-username}/{my-dataset}/{path_to_parquet_file} ``` -Where: -- **my-username** The user or organization of the dataset, e.g. `ibm` -- **my-dataset** Is the dataset name, e.g: `duorc` -- **path_to_parquet_file** Is the parquet file path, it supports glob patterns, e.g `**/*.parquet` to query all parquet files +- **my-username**, the user or organization of the dataset, e.g. `ibm` +- **my-dataset**, the dataset name, e.g: `duorc` +- **path_to_parquet_file**, the parquet file path which supports glob patterns, e.g `**/*.parquet`, to query all parquet files diff --git a/docs/source/duckdb_cli_auth.md b/docs/source/duckdb_cli_auth.md index 0dc072f08d..32c2d37a24 100644 --- a/docs/source/duckdb_cli_auth.md +++ b/docs/source/duckdb_cli_auth.md @@ -7,9 +7,9 @@ Visit [Hugging Face Settings - Tokens](https://huggingface.co/settings/tokens) t DuckDB supports two providers for managing secrets: - `CONFIG`: Requires the user to pass all configuration information into the CREATE SECRET statement. -- `CREDENTIAL_CHAIN`: Automatically tries to fetch credentials. For Hugging Face token it will try to get it from `~/.cache/huggingface/token` +- `CREDENTIAL_CHAIN`: Automatically tries to fetch credentials. For the Hugging Face token, it will try to get it from `~/.cache/huggingface/token`. -For more information about DuckDB Secrets visit https://duckdb.org/docs/configuration/secrets_manager.html +For more information about DuckDB Secrets visit the [Secrets Manager](https://duckdb.org/docs/configuration/secrets_manager.html) guide. 
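To make the `CREDENTIAL_CHAIN` behavior above concrete, here is a small Python sketch of the same lookup: read the token cached at `~/.cache/huggingface/token` (the file written when you log in with the Hugging Face CLI) and return nothing if it is absent. The helper name is invented for illustration; DuckDB performs this resolution internally:

```python
import tempfile
from pathlib import Path
from typing import Optional

def find_hf_token(home: Optional[str] = None) -> Optional[str]:
    # Mirror of the CREDENTIAL_CHAIN lookup: return the cached token if present.
    token_file = Path(home or Path.home()) / ".cache" / "huggingface" / "token"
    if token_file.is_file():
        return token_file.read_text().strip()
    return None

# Demo against a throwaway HOME so the real cache is left untouched.
fake_home = tempfile.mkdtemp()
cache_dir = Path(fake_home) / ".cache" / "huggingface"
cache_dir.mkdir(parents=True)
(cache_dir / "token").write_text("hf_dummy_token\n")
print(find_hf_token(fake_home))  # hf_dummy_token
```

If the token is not found in the cache, you can still pass it explicitly through a `CONFIG` secret, as described in the next section of the guide.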
## Creating a secret with `CONFIG` provider diff --git a/docs/source/duckdb_cli_combine_and_export.md b/docs/source/duckdb_cli_combine_and_export.md index 0b799e3c78..c0d504b87e 100644 --- a/docs/source/duckdb_cli_combine_and_export.md +++ b/docs/source/duckdb_cli_combine_and_export.md @@ -33,7 +33,7 @@ FROM 'hf://datasets/wanghaofan/pokemon-wiki-captions/data/*.parquet' LIMIT 3; ``` -Now, let's try to combine these two datasets joining by the `name` column: +Now, let's try to combine these two datasets by joining on the `name` column: ```bash SELECT a.image_url @@ -93,7 +93,7 @@ You can also export to [CSV](https://duckdb.org/docs/guides/file_formats/csv_exp -Finally, let's push the resulting dataset to the Hub using the `datasets` library in Python: +Finally, let's push the resulting dataset to the Hub using the [Datasets](https://huggingface.co/docs/datasets/index) library in Python: ```python from datasets import load_dataset diff --git a/docs/source/duckdb_cli_select.md b/docs/source/duckdb_cli_select.md index a59d40f283..c940027e2a 100644 --- a/docs/source/duckdb_cli_select.md +++ b/docs/source/duckdb_cli_select.md @@ -1,10 +1,10 @@ -# Query public, gated and private datasets +# Query datasets Querying datasets is a fundamental step in data analysis. Here, we'll guide you through querying datasets using various methods. -There are [different ways](https://duckdb.org/docs/data/parquet/overview.html) to select your data. +There are several [different ways](https://duckdb.org/docs/data/parquet/overview.html) to select your data. 
-Using `FROM` syntax: +Using the `FROM` syntax: ```bash FROM 'hf://datasets/jamescalam/world-cities-geo/train.jsonl' SELECT city, country, region LIMIT 3; @@ -19,7 +19,7 @@ FROM 'hf://datasets/jamescalam/world-cities-geo/train.jsonl' SELECT city, countr ``` -Using `SELECT` `FROM` sytax: +Using the `SELECT` and `FROM` syntax: ```bash SELECT city, country, region FROM 'hf://datasets/jamescalam/world-cities-geo/train.jsonl' USING SAMPLE 3; @@ -49,7 +49,7 @@ SELECT COUNT(*) FROM 'hf://datasets/jamescalam/world-cities-geo/*.jsonl'; ``` -You can also query Parquet files using the read_parquet and parquet_scan functions. Let's explore these functions using the auto-converted Parquet files for the same dataset. +You can also query Parquet files using the read_parquet and parquet_scan functions. Let's explore these functions using the auto-converted Parquet files from the same dataset. Select using [read_parquet](https://duckdb.org/docs/guides/file_formats/query_parquet.html) function: @@ -71,7 +71,7 @@ SELECT * FROM parquet_scan('hf://datasets/jamescalam/world-cities-geo@~parquet/d ## Get metadata and schema -The [parquet_metadata]((https://duckdb.org/docs/data/parquet/metadata.html)) function can be used to query the metadata contained within a Parquet file. +The [parquet_metadata](https://duckdb.org/docs/data/parquet/metadata.html) function can be used to query the metadata contained within a Parquet file. 
```bash SELECT * FROM parquet_metadata('hf://datasets/jamescalam/world-cities-geo@~parquet/default/train/0000.parquet'); @@ -109,7 +109,7 @@ DESCRIBE SELECT * FROM 'hf://datasets/jamescalam/world-cities-geo@~parquet/defau ``` -Fetch the internal schema (Exclusing file name): +Fetch the internal schema (excluding the file name): ```bash SELECT * EXCLUDE (file_name) FROM parquet_schema('hf://datasets/jamescalam/world-cities-geo@~parquet/default/train/0000.parquet'); @@ -134,7 +134,7 @@ SELECT * EXCLUDE (file_name) FROM parquet_schema('hf://datasets/jamescalam/world ## Get statistics -The `SUMMARIZE` command can be used to get various aggregates over a query (min, max, approx_unique, avg, std, q25, q50, q75, count). It returns these along with the column name, column type, and the percentage of NULL values. +The `SUMMARIZE` command can be used to get various aggregates over a query (min, max, approx_unique, avg, std, q25, q50, q75, count). It returns these statistics along with the column name, column type, and the percentage of NULL values. ```bash SUMMARIZE SELECT latitude, longitude FROM 'hf://datasets/jamescalam/world-cities-geo@~parquet/default/train/0000.parquet'; diff --git a/docs/source/duckdb_cli_vector_similarity_search.md b/docs/source/duckdb_cli_vector_similarity_search.md index 197d1533c8..ef6aed3907 100644 --- a/docs/source/duckdb_cli_vector_similarity_search.md +++ b/docs/source/duckdb_cli_vector_similarity_search.md @@ -1,12 +1,12 @@ -# Vector Similarity Search +# Perform vector similarity search -In the latest release of DuckDB (version 0.10.0), a cool feature called Fixed-Length Arrays was added. This lets you use vector embeddings in DuckDB tables, making your data analysis even more powerful. +The Fixed-Length Arrays feature was added in DuckDB version 0.10.0. This lets you use vector embeddings in DuckDB tables, making your data analysis even more powerful. Additionally, the array_cosine_similarity function was introduced. 
This function measures the cosine of the angle between two vectors, indicating their similarity. A value of 1 means they’re perfectly aligned, 0 means they’re perpendicular, and -1 means they’re completely opposite. Let's explore how to use this function for similarity searches. In this section, we’ll show you how to perform similarity searches using DuckDB. -We will use the dataset [asoria/awesome-chatgpt-prompts-embeddings](https://huggingface.co/datasets/asoria/awesome-chatgpt-prompts-embeddings). +We will use the [asoria/awesome-chatgpt-prompts-embeddings](https://huggingface.co/datasets/asoria/awesome-chatgpt-prompts-embeddings) dataset. First, let's preview a few records from the dataset: From a2fd8de43fd25019a02ed222655c27e46ea35d33 Mon Sep 17 00:00:00 2001 From: Andrea Francis Soria Jimenez Date: Thu, 23 May 2024 08:40:37 -0400 Subject: [PATCH 14/22] Apply suggestions from code review Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --- docs/source/duckdb_cli_sql.md | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/docs/source/duckdb_cli_sql.md b/docs/source/duckdb_cli_sql.md index ae96ef5791..6deb4d2e1b 100644 --- a/docs/source/duckdb_cli_sql.md +++ b/docs/source/duckdb_cli_sql.md @@ -1,8 +1,8 @@ -# Performing SQL operations +# Perform SQL operations Performing SQL operations with DuckDB opens up a world of possibilities for querying datasets efficiently. Let's dive into some examples showcasing the power of DuckDB functions. -For our demonstration, we'll explore a fascinating dataset: A the multitask test containing multiple-choice questions spanning various domains of knowledge. You can access the dataset [here](https://huggingface.co/datasets/cais/mmlu). +For our demonstration, we'll explore a fascinating dataset. The [MMLU](https://huggingface.co/datasets/cais/mmlu) dataset is a multitask test containing multiple-choice questions spanning various knowledge domains. 
To preview the dataset, let's select a sample of 3 rows: @@ -57,7 +57,7 @@ HAVING counts > 2; Fortunately, our dataset doesn't contain any duplicate records. -Lets see the proportion of questions based on the subject in a bar representation: +Let's see the proportion of questions based on the subject in a bar representation: ```bash SELECT @@ -89,8 +89,8 @@ ORDER BY ``` -Now, let's prepare a subset of the dataset containing questions related to **nutrition** and create a mapping of questions to correct answers: -Notice that we have the column **choices** from wich we can get the correct aswer using the **answer** column as an index. +Now, let's prepare a subset of the dataset containing questions related to **nutrition** and create a mapping of questions to correct answers. +Notice that we have the column **choices** from which we can get the correct answer using the **answer** column as an index. ```bash SELECT * @@ -144,15 +144,15 @@ WHERE subject = 'nutrition' AND LENGTH(correct_answer) > 0 LIMIT 3; ``` Finally, lets hightlight some of the DuckDB functions used in this section: -- `DESCRIBE`, returns the table schema +- `DESCRIBE`, returns the table schema. - `USING SAMPLE`, samples are used to randomly select a subset of a dataset. -- `BAR`, draws a band whose width is proportional to (x - min) and equal to width characters when x = max. width defaults to 80. +- `BAR`, draws a band whose width is proportional to (x - min) and equal to width characters when x = max. Width defaults to 80. - `string[begin:end]`, extracts a string using slice conventions. Missing begin or end arguments are interpreted as the beginning or end of the list respectively. Negative values are accepted. -- `regexp_replace`, if string contains the regexp pattern, replaces the matching part with replacement -- `LENGTH`, gets the number of characters in string +- `regexp_replace`, if the string contains the regexp pattern, replaces the matching part with replacement. 
+- `LENGTH`, gets the number of characters in the string. -There are plenty of useful functions available at https://duckdb.org/docs/sql/functions/overview. The best part is that you can now directly use them on Hugging Face datasets. +There are plenty of useful functions available in DuckDB's [SQL functions overview](https://duckdb.org/docs/sql/functions/overview). The best part is that you can use them directly on Hugging Face datasets. From 5a6615dde6537b4806a0a9c8476fd71c942caaff Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Thu, 23 May 2024 09:21:57 -0400 Subject: [PATCH 15/22] Adding ref for parquet_scan --- docs/source/duckdb_cli_select.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/duckdb_cli_select.md b/docs/source/duckdb_cli_select.md index c940027e2a..d126737c04 100644 --- a/docs/source/duckdb_cli_select.md +++ b/docs/source/duckdb_cli_select.md @@ -63,7 +63,7 @@ Read all files that match a glob pattern and include a filename column specifyin SELECT * FROM read_parquet('hf://datasets/jamescalam/world-cities-geo@~parquet/default/**/*.parquet', filename = true) LIMIT 3; ``` -Using `parquet_scan` function: +Using [`parquet_scan`](https://duckdb.org/docs/data/parquet/overview) function: ```bash SELECT * FROM parquet_scan('hf://datasets/jamescalam/world-cities-geo@~parquet/default/**/*.parquet') LIMIT 3; From 60bd06c437881fcb388fd808946e4385592f355f Mon Sep 17 00:00:00 2001 From: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com> Date: Thu, 23 May 2024 15:52:36 +0200 Subject: [PATCH 16/22] Update docs/source/_toctree.yml --- docs/source/_toctree.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index 215e0d7ae6..0394f2e42c 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -36,8 +36,7 @@ title: Overview - local: clickhouse title: ClickHouse - - local: duckdb - title: DuckDB + - title: DuckDB sections: - local: duckdb_cli 
title: DuckDB CLI From 18c8d14f6e77dd159bdd33d022df850eaea45144 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com> Date: Thu, 23 May 2024 15:56:18 +0200 Subject: [PATCH 17/22] Update docs/source/_toctree.yml --- docs/source/_toctree.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index 0394f2e42c..215e0d7ae6 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -36,7 +36,8 @@ title: Overview - local: clickhouse title: ClickHouse - - title: DuckDB + - local: duckdb + title: DuckDB sections: - local: duckdb_cli title: DuckDB CLI From d3a7a92b29774655667f38c94fe131a9d337263e Mon Sep 17 00:00:00 2001 From: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com> Date: Thu, 23 May 2024 15:59:58 +0200 Subject: [PATCH 18/22] Update docs/source/_toctree.yml --- docs/source/_toctree.yml | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index 215e0d7ae6..a9b19814b9 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -41,19 +41,18 @@ sections: - local: duckdb_cli title: DuckDB CLI - sections: - - local: duckdb_cli_auth - title: Authentication for private and gated datasets - - local: duckdb_cli_select - title: Query datasets - - local: duckdb_cli_sql - title: Perform SQL operations - - local: duckdb_cli_combine_and_export - title: Combine datasets and export - - local: duckdb_cli_vector_similarity_search - title: Perform vector similarity search - - local: duckdb_cli_fts - title: Implement full-text search + - local: duckdb_cli_auth + title: Authentication for private and gated datasets + - local: duckdb_cli_select + title: Query datasets + - local: duckdb_cli_sql + title: Perform SQL operations + - local: duckdb_cli_combine_and_export + title: Combine datasets and export + - local: duckdb_cli_vector_similarity_search + title: 
Perform vector similarity search
+      - local: duckdb_cli_fts
+        title: Implement full-text search
   - local: pandas
     title: Pandas
   - local: polars

From d169f48d524a888d6a7ba765820f77a9a42a9801 Mon Sep 17 00:00:00 2001
From: Andrea Soria
Date: Thu, 23 May 2024 10:14:33 -0400
Subject: [PATCH 19/22] Try to fix menu

---
 docs/source/_toctree.yml | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml
index a9b19814b9..c7e06c7ade 100644
--- a/docs/source/_toctree.yml
+++ b/docs/source/_toctree.yml
@@ -36,9 +36,10 @@
     title: Overview
   - local: clickhouse
     title: ClickHouse
-  - local: duckdb
-    title: DuckDB
+  - isExpanded: false
     sections:
+      - local: duckdb
+        title: General Usage
       - local: duckdb_cli
         title: DuckDB CLI
       - local: duckdb_cli_auth
@@ -53,6 +54,7 @@
         title: Perform vector similarity search
       - local: duckdb_cli_fts
         title: Implement full-text search
+    title: DuckDB
   - local: pandas
     title: Pandas
   - local: polars

From 21d62be03992515cfb314812597fd5710da99ab4 Mon Sep 17 00:00:00 2001
From: Andrea Soria
Date: Thu, 23 May 2024 10:18:49 -0400
Subject: [PATCH 20/22] Remove missing file

---
 docs/source/_toctree.yml | 2 --
 1 file changed, 2 deletions(-)

diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml
index c7e06c7ade..4672ce0960 100644
--- a/docs/source/_toctree.yml
+++ b/docs/source/_toctree.yml
@@ -52,8 +52,6 @@
         title: Combine datasets and export
       - local: duckdb_cli_vector_similarity_search
         title: Perform vector similarity search
-      - local: duckdb_cli_fts
-        title: Implement full-text search
     title: DuckDB
   - local: pandas
     title: Pandas

From a31c7a1d356d191cc305a5a11692267d54d3f19b Mon Sep 17 00:00:00 2001
From: Andrea Francis Soria Jimenez
Date: Thu, 23 May 2024 10:22:07 -0400
Subject: [PATCH 21/22] Apply suggestions from code review

Co-authored-by: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com>
---
 docs/source/duckdb_cli_sql.md | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8
deletions(-) diff --git a/docs/source/duckdb_cli_sql.md b/docs/source/duckdb_cli_sql.md index 6deb4d2e1b..6e3d44d3d7 100644 --- a/docs/source/duckdb_cli_sql.md +++ b/docs/source/duckdb_cli_sql.md @@ -7,7 +7,7 @@ For our demonstration, we'll explore a fascinating dataset. The [MMLU](https://h To preview the dataset, let's select a sample of 3 rows: ```bash -FROM 'hf://datasets/cais/mmlu@~parquet/all/test/*.parquet' USING SAMPLE 3; +FROM 'hf://datasets/cais/mmlu/all/test-*.parquet' USING SAMPLE 3; ┌──────────────────────┬──────────────────┬──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬────────┐ │ question │ subject │ choices │ answer │ @@ -25,7 +25,7 @@ This command retrieves a random sample of 3 rows from the dataset for us to exam Let's start by examining the schema of our dataset. The following table outlines the structure of our dataset: ```bash -DESCRIBE FROM 'hf://datasets/cais/mmlu@~parquet/all/test/*.parquet' USING SAMPLE 3; +DESCRIBE FROM 'hf://datasets/cais/mmlu/all/test-*.parquet' USING SAMPLE 3; ┌─────────────┬─────────────┬─────────┬─────────┬─────────┬─────────┐ │ column_name │ column_type │ null │ key │ default │ extra │ │ varchar │ varchar │ varchar │ varchar │ varchar │ varchar │ @@ -42,7 +42,7 @@ Next, let's analyze if there are any duplicated records in our dataset: ```bash SELECT *, COUNT(*) AS counts -FROM 'hf://datasets/cais/mmlu@~parquet/all/test/*.parquet' +FROM 'hf://datasets/cais/mmlu/all/test-*.parquet' GROUP BY ALL HAVING counts > 2; @@ -63,9 +63,9 @@ Let's see the proportion of questions based on the subject in a bar representati SELECT subject, COUNT(*) AS counts, - BAR(COUNT(*), 0, (SELECT COUNT(*) FROM 'hf://datasets/cais/mmlu@~parquet/all/test/*.parquet')) AS percentage + BAR(COUNT(*), 0, (SELECT COUNT(*) FROM 'hf://datasets/cais/mmlu/all/test-*.parquet')) AS percentage FROM - 
'hf://datasets/cais/mmlu@~parquet/all/test/*.parquet'
+    'hf://datasets/cais/mmlu/all/test-*.parquet'
 GROUP BY
     subject
 ORDER BY
@@ -94,7 +94,7 @@ Notice that we have the column **choices** from which we can get the correct ans
 
 ```bash
 SELECT *
-FROM 'hf://datasets/cais/mmlu@~parquet/all/test/*.parquet'
+FROM 'hf://datasets/cais/mmlu/all/test-*.parquet'
 WHERE subject = 'nutrition' LIMIT 3;
 ┌──────────────────────┬───────────┬─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬────────┐
 │       question       │  subject  │                                                                               choices                                                                               │ answer │
@@ -111,7 +111,7 @@ WHERE subject = 'nutrition' LIMIT 3;
 
 ```bash
 SELECT question, choices[answer] AS correct_answer
-FROM 'hf://datasets/cais/mmlu@~parquet/all/test/*.parquet'
+FROM 'hf://datasets/cais/mmlu/all/test-*.parquet'
 WHERE subject = 'nutrition' LIMIT 3;
 ┌─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬─────────────────────────────────────────────┐
@@ -130,7 +130,7 @@ To ensure data cleanliness, let's remove any newline characters at the end of th
 
 ```bash
 SELECT regexp_replace(question, '\n', '') AS question, choices[answer] AS correct_answer
-FROM 'hf://datasets/cais/mmlu@~parquet/all/test/*.parquet'
+FROM 'hf://datasets/cais/mmlu/all/test-*.parquet'
 WHERE subject = 'nutrition' AND LENGTH(correct_answer) > 0 LIMIT 3;
 ┌───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬─────────────────────────────────────────────┐

From 1e6592f981724a7524875afee4f2ce6a27326507 Mon Sep 17 00:00:00 2001
From: Andrea Soria
Date: Thu, 23 May 2024 10:22:19 -0400
Subject: [PATCH 22/22] Adding new results

---
 docs/source/duckdb_cli_sql.md | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/docs/source/duckdb_cli_sql.md b/docs/source/duckdb_cli_sql.md
index 6e3d44d3d7..33714d1e49 100644
---
a/docs/source/duckdb_cli_sql.md +++ b/docs/source/duckdb_cli_sql.md @@ -9,14 +9,14 @@ To preview the dataset, let's select a sample of 3 rows: ```bash FROM 'hf://datasets/cais/mmlu/all/test-*.parquet' USING SAMPLE 3; -┌──────────────────────┬──────────────────┬──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬────────┐ -│ question │ subject │ choices │ answer │ -│ varchar │ varchar │ varchar[] │ int64 │ -├──────────────────────┼──────────────────┼──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┼────────┤ -│ Which of the follo… │ miscellaneous │ [A campaign manager's introduction of a mayoral candidate, A professor's lecture on the structure of the heart, A company president's yearly sales report,… │ 0 │ -│ Are strategies of … │ security_studies │ [No, arms control does not serve any purpose outside the realm of traditional military weapons., Yes, successful policies of arms control and deterrence h… │ 2 │ -│ Find all c in Z_3 … │ abstract_algebra │ [0, 2, 1, 3] │ 1 │ -└──────────────────────┴──────────────────┴──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┴────────┘ +┌──────────────────────┬──────────────────────┬──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬────────┐ +│ question │ subject │ choices │ answer │ +│ varchar │ varchar │ varchar[] │ int64 │ +├──────────────────────┼──────────────────────┼──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┼────────┤ +│ Dr. 
Harry Holliday… │ professional_psych… │ [discuss his vacation plans with his current clients ahead of time so that they know he’ll be unavailable during that time., give his clients a phone … │ 2 │ +│ A resident of a st… │ professional_law │ [The resident would succeed, because the logging company's selling of the timber would entitle the resident to re-enter and terminate the grant to the… │ 2 │ +│ Moderate and frequ… │ miscellaneous │ [dispersed alluvial fan soil, heavy-textured soil, such as silty clay, light-textured soil, such as loamy sand, region of low humidity] │ 2 │ +└──────────────────────┴──────────────────────┴──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┴────────┘ ``` @@ -143,6 +143,7 @@ WHERE subject = 'nutrition' AND LENGTH(correct_answer) > 0 LIMIT 3; └───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┴─────────────────────────────────────────────┘ ``` + Finally, lets hightlight some of the DuckDB functions used in this section: - `DESCRIBE`, returns the table schema. - `USING SAMPLE`, samples are used to randomly select a subset of a dataset.
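
The final hunks of this series rely on `regexp_replace` and bracket indexing (`choices[answer]`) to clean up questions and pull out the correct answer. As a rough sketch of the same transformation outside DuckDB — using hypothetical rows, not the real MMLU data — note that DuckDB list indexing is 1-based and an out-of-range index yields NULL, which is what the `LENGTH(correct_answer) > 0` filter in the patched example guards against:

```python
import re

# Hypothetical rows shaped like the MMLU split queried in the patched guide:
# a question string, a list of choices, and an integer answer index.
rows = [
    {"question": "Which vitamin is fat-soluble?\n",
     "choices": ["B12", "C", "D", "Folate"], "answer": 2},
    {"question": "What is the primary fuel for the brain?\n",
     "choices": ["Glucose", "Ketones", "Protein", "Fat"], "answer": 1},
]

def clean_and_answer(row):
    # Mirrors regexp_replace(question, '\n', '') from the SQL example.
    question = re.sub(r"\n", "", row["question"])
    # Emulate DuckDB's 1-based list indexing: choices[answer] returns the
    # answer-th element counting from 1, or NULL (here "") when out of range.
    idx = row["answer"]
    in_range = 1 <= idx <= len(row["choices"])
    correct = row["choices"][idx - 1] if in_range else ""
    return question, correct

for question, correct in map(clean_and_answer, rows):
    print(question, "->", correct)
```

Rows whose emulated lookup comes back empty correspond to the records the SQL example drops with `LENGTH(correct_answer) > 0`.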