huggingface · AndreaFrancis · Oct 19, 2023 · Oct 16, 2023 · Oct 16, 2023 · Oct 16, 2023
diff --git a/chart/templates/_env/_envCachedAssets.tpl b/chart/templates/_env/_envCachedAssets.tpl
@@ -6,14 +6,6 @@
   value: "{{ include "cachedAssets.baseUrl" . }}"
 - name: CACHED_ASSETS_STORAGE_DIRECTORY
   value: {{ .Values.cachedAssets.storageDirectory | quote }}
-- name: CACHED_ASSETS_CLEAN_CACHE_PROBA
-  value: {{ .Values.cachedAssets.cleanCacheProba | quote }}
-- name: CACHED_ASSETS_KEEP_FIRST_ROWS_NUMBER
-  value: {{ .Values.cachedAssets.keepFirstRowsNumber | quote }}
-- name: CACHED_ASSETS_KEEP_MOST_RECENT_ROWS_NUMBER
-  value: {{ .Values.cachedAssets.keepMostRecentRowsNumber | quote }}
-- name: CACHED_ASSETS_MAX_CLEANED_ROWS_NUMBER
-  value: {{ .Values.cachedAssets.maxCleanedRowsNumber | quote }}
 - name: CACHED_ASSETS_S3_FOLDER_NAME
   value: {{ .Values.cachedAssets.s3FolderName | quote }}
 {{- end -}}
diff --git a/docs/source/openapi.json b/docs/source/openapi.json
@@ -2270,12 +2270,12 @@
                           "row_idx": 0,
                           "row": {
                             "imageA": {
-                              "src": "https://datasets-server.huggingface.co/assets/huggan/horse2zebra/--/default/train/0/imageA/image.jpg",
+                              "src": "https://datasets-server.huggingface.co/assets/huggan/horse2zebra/--/main/--/default/train/0/imageA/image.jpg",
                               "height": 256,
                               "width": 256
                             },
                             "imageB": {
-                              "src": "https://datasets-server.huggingface.co/assets/huggan/horse2zebra/--/default/train/0/imageB/image.jpg",
+                              "src": "https://datasets-server.huggingface.co/assets/huggan/horse2zebra/--/main/--/default/train/0/imageB/image.jpg",
                               "height": 256,
                               "width": 256
                             }
@@ -2286,12 +2286,12 @@
                           "row_idx": 1,
                           "row": {
                             "imageA": {
-                              "src": "https://datasets-server.huggingface.co/assets/huggan/horse2zebra/--/default/train/1/imageA/image.jpg",
+                              "src": "https://datasets-server.huggingface.co/assets/huggan/horse2zebra/--/main/--/default/train/1/imageA/image.jpg",
                               "height": 256,
                               "width": 256
                             },
                             "imageB": {
-                              "src": "https://datasets-server.huggingface.co/assets/huggan/horse2zebra/--/default/train/1/imageB/image.jpg",
+                              "src": "https://datasets-server.huggingface.co/assets/huggan/horse2zebra/--/main/--/default/train/1/imageB/image.jpg",
                               "height": 256,
                               "width": 256
                             }
@@ -2302,12 +2302,12 @@
                           "row_idx": 2,
                           "row": {
                             "imageA": {
-                              "src": "https://datasets-server.huggingface.co/assets/huggan/horse2zebra/--/default/train/2/imageA/image.jpg",
+                              "src": "https://datasets-server.huggingface.co/assets/huggan/horse2zebra/--/main/--/default/train/2/imageA/image.jpg",
                               "height": 256,
                               "width": 256
                             },
                             "imageB": {
-                              "src": "https://datasets-server.huggingface.co/assets/huggan/horse2zebra/--/default/train/2/imageB/image.jpg",
+                              "src": "https://datasets-server.huggingface.co/assets/huggan/horse2zebra/--/main/--/default/train/2/imageB/image.jpg",
                               "height": 256,
                               "width": 256
                             }
@@ -2390,7 +2390,7 @@
                             "id": "id10059_229vKIGbxrI_00001",
                             "audio": [
                               {
-                                "src": "https://datasets-server.huggingface.co/assets/asapp/slue/--/voxceleb/train/0/audio/audio.wav",
+                                "src": "https://datasets-server.huggingface.co/assets/asapp/slue/--/main/--/voxceleb/train/0/audio/audio.wav",
                                 "type": "audio/wav"
                               }
                             ],
@@ -2408,7 +2408,7 @@
                             "id": "id10059_229vKIGbxrI_00002",
                             "audio": [
                               {
-                                "src": "https://datasets-server.huggingface.co/assets/asapp/slue/--/voxceleb/train/1/audio/audio.wav",
+                                "src": "https://datasets-server.huggingface.co/assets/asapp/slue/--/main/--/voxceleb/train/1/audio/audio.wav",
                                 "type": "audio/wav"
                               }
                             ],
@@ -2426,7 +2426,7 @@
                             "id": "id10059_229vKIGbxrI_00003",
                             "audio": [
                               {
-                                "src": "https://datasets-server.huggingface.co/assets/asapp/slue/--/voxceleb/train/2/audio/audio.wav",
+                                "src": "https://datasets-server.huggingface.co/assets/asapp/slue/--/main/--/voxceleb/train/2/audio/audio.wav",
                                 "type": "audio/wav"
                               }
                             ],
@@ -2711,12 +2711,12 @@
                           "row_idx": 234,
                           "row": {
                             "imageA": {
-                              "src": "https://datasets-server.huggingface.co/cached-assets/huggan/horse2zebra/--/default/train/234/imageA/image.jpg",
+                              "src": "https://datasets-server.huggingface.co/cached-assets/huggan/horse2zebra/--/main/--/default/train/234/imageA/image.jpg",
                               "height": 256,
                               "width": 256
                             },
                             "imageB": {
-                              "src": "https://datasets-server.huggingface.co/cached-assets/huggan/horse2zebra/--/default/train/234/imageB/image.jpg",
+                              "src": "https://datasets-server.huggingface.co/cached-assets/huggan/horse2zebra/--/main/--/default/train/234/imageB/image.jpg",
                               "height": 256,
                               "width": 256
                             }
@@ -2727,12 +2727,12 @@
                           "row_idx": 235,
                           "row": {
                             "imageA": {
-                              "src": "https://datasets-server.huggingface.co/cached-assets/huggan/horse2zebra/--/default/train/235/imageA/image.jpg",
+                              "src": "https://datasets-server.huggingface.co/cached-assets/huggan/horse2zebra/--/main/--/default/train/235/imageA/image.jpg",
                               "height": 256,
                               "width": 256
                             },
                             "imageB": {
-                              "src": "https://datasets-server.huggingface.co/cached-assets/huggan/horse2zebra/--/default/train/235/imageB/image.jpg",
+                              "src": "https://datasets-server.huggingface.co/cached-assets/huggan/horse2zebra/--/main/--/default/train/235/imageB/image.jpg",
                               "height": 256,
                               "width": 256
                             }
@@ -2743,12 +2743,12 @@
                           "row_idx": 236,
                           "row": {
                             "imageA": {
-                              "src": "https://datasets-server.huggingface.co/cached-assets/huggan/horse2zebra/--/default/train/236/imageA/image.jpg",
+                              "src": "https://datasets-server.huggingface.co/cached-assets/huggan/horse2zebra/--/main/--/default/train/236/imageA/image.jpg",
                               "height": 256,
                               "width": 256
                             },
                             "imageB": {
-                              "src": "https://datasets-server.huggingface.co/cached-assets/huggan/horse2zebra/--/default/train/236/imageB/image.jpg",
+                              "src": "https://datasets-server.huggingface.co/cached-assets/huggan/horse2zebra/--/main/--/default/train/236/imageB/image.jpg",
                               "height": 256,
                               "width": 256
                             }
@@ -3128,7 +3128,7 @@
                           "row_idx": 16,
                           "row": {
                             "image": {
-                              "src": "https://datasets-server.huggingface.co/cached-assets/lambdalabs/pokemon-blip-captions/--/default/train/16/image/image.jpg",
+                              "src": "https://datasets-server.huggingface.co/cached-assets/lambdalabs/pokemon-blip-captions/--/main/--/default/train/16/image/image.jpg",
                               "height": 431,
                               "width": 431
                             },
@@ -3140,7 +3140,7 @@
                           "row_idx": 54,
                           "row": {
                             "image": {
-                              "src": "https://datasets-server.huggingface.co/cached-assets/lambdalabs/pokemon-blip-captions/--/default/train/54/image/image.jpg",
+                              "src": "https://datasets-server.huggingface.co/cached-assets/lambdalabs/pokemon-blip-captions/--/main/--/default/train/54/image/image.jpg",
                               "height": 1280,
                               "width": 1280
                             },
@@ -3152,7 +3152,7 @@
                           "row_idx": 56,
                           "row": {
                             "image": {
-                              "src": "https://datasets-server.huggingface.co/cached-assets/lambdalabs/pokemon-blip-captions/--/default/train/56/image/image.jpg",
+                              "src": "https://datasets-server.huggingface.co/cached-assets/lambdalabs/pokemon-blip-captions/--/main/--/default/train/56/image/image.jpg",
                               "height": 1280,
                               "width": 1280
                             },

diff --git a/docs/source/rows.mdx b/docs/source/rows.mdx
@@ -174,7 +174,7 @@ Here is an example of image, from the first row of the cifar100 dataset:
       "row_idx": 0,
       "row": {
         "img": {
-          "src": "https://datasets-server.huggingface.co/cached-assets/cifar100/--/cifar100/train/0/img/image.jpg",
+          "src": "https://datasets-server.huggingface.co/cached-assets/cifar100/--/main/--/cifar100/train/0/img/image.jpg",
           "height": 32,
           "width": 32
         },

diff --git a/libs/libapi/src/libapi/duckdb.py b/libs/libapi/src/libapi/duckdb.py
@@ -23,9 +23,9 @@
 def get_index_file_location_and_download_if_missing(
     duckdb_index_file_directory: StrPath,
     dataset: str,
+    revision: str,
     config: str,
     split: str,
-    revision: Optional[str],
     filename: str,
     url: str,
     target_revision: str,
@@ -55,9 +55,7 @@ def get_index_file_location_and_download_if_missing(
         return index_file_location
 
 
-def get_download_folder(
-    root_directory: StrPath, dataset: str, config: str, split: str, revision: Optional[str]
-) -> str:
+def get_download_folder(root_directory: StrPath, dataset: str, revision: str, config: str, split: str) -> str:
     payload = (dataset, config, split, revision)
     hash_suffix = sha1(json.dumps(payload, sort_keys=True).encode(), usedforsecurity=False).hexdigest()[:8]
     subdirectory = "".join([c if re.match(r"[\w-]", c) else "-" for c in f"{dataset}-{hash_suffix}"])

diff --git a/libs/libapi/src/libapi/response.py b/libs/libapi/src/libapi/response.py
@@ -15,6 +15,7 @@
 
 def create_response(
     dataset: str,
+    revision: str,
     config: str,
     split: str,
     cached_assets_base_url: str,
@@ -45,6 +46,7 @@ def create_response(
         "rows": to_rows_list(
             pa_table=pa_table,
             dataset=dataset,
+            revision=revision,
             config=config,
             split=split,
             storage_options=storage_options,

diff --git a/libs/libapi/src/libapi/rows_utils.py b/libs/libapi/src/libapi/rows_utils.py
@@ -14,6 +14,7 @@
 def _transform_row(
     row_idx_and_row: tuple[int, Row],
     dataset: str,
+    revision: str,
     config: str,
     split: str,
     features: Features,
@@ -25,6 +26,7 @@ def _transform_row(
     transformed_row = {
         featureName: get_cell_value(
             dataset=dataset,
+            revision=revision,
             config=config,
             split=split,
             row_idx=offset + row_idx if row_idx_column is None else row[row_idx_column],
@@ -42,6 +44,7 @@ def _transform_row(
 
 def transform_rows(
     dataset: str,
+    revision: str,
     config: str,
     split: str,
     rows: list[Row],
@@ -53,6 +56,7 @@ def transform_rows(
     fn = partial(
         _transform_row,
         dataset=dataset,
+        revision=revision,
         config=config,
         split=split,
         features=features,

diff --git a/libs/libapi/src/libapi/utils.py b/libs/libapi/src/libapi/utils.py
@@ -2,12 +2,8 @@
 # Copyright 2022 The HuggingFace Authors.
 
 import logging
-import os
-import random
-import shutil
 from collections.abc import Callable, Coroutine
 from http import HTTPStatus
-from itertools import islice
 from typing import Any, Optional, Union
 
 import pyarrow as pa
@@ -21,10 +17,8 @@
     CacheEntry,
     get_best_response,
 )
-from libcommon.storage import StrPath
 from libcommon.storage_options import DirectoryStorageOptions, S3StorageOptions
 from libcommon.utils import Priority, RowItem, orjson_dumps
-from libcommon.viewer_utils.asset import glob_rows_in_assets_dir
 from starlette.requests import Request
 from starlette.responses import JSONResponse, Response
 
@@ -207,6 +201,7 @@ def get_cache_entry_from_steps(
 def to_rows_list(
     pa_table: pa.Table,
     dataset: str,
+    revision: str,
     config: str,
     split: str,
     offset: int,
@@ -223,6 +218,7 @@ def to_rows_list(
     try:
         transformed_rows = transform_rows(
             dataset=dataset,
+            revision=revision,
             config=config,
             split=split,
             rows=pa_table.to_pylist(),
@@ -243,108 +239,3 @@ def to_rows_list(
         }
         for idx, row in enumerate(transformed_rows)
     ]
-
-
-def _greater_or_equal(row_dir_name: str, row_idx: int, on_error: bool) -> bool:
-    try:
-        return int(row_dir_name) >= row_idx
-    except ValueError:
-        return on_error
-
-
-def clean_cached_assets(
-    dataset: str,
-    cached_assets_directory: StrPath,
-    keep_first_rows_number: int,
-    keep_most_recent_rows_number: int,
-    max_cleaned_rows_number: int,
-) -> None:
-    """
-    The cached assets directory is cleaned to save disk space using this simple (?) heuristic:
-
-    1. it takes a big sample of rows from the cache using glob (max `max_cleaned_rows_number`)
-    2. it keeps the most recent ones (max `keep_most_recent_rows_number`)
-    3. it keeps the rows below a certain index (max `keep_first_rows_number`)
-    4. it discards the rest
-
-    To check for the most recent rows, it looks at the "last modified time" of rows directories.
-    This time is updated every time a row is accessed using `update_last_modified_date_of_rows_in_assets_dir()`.
-
-    Args:
-        dataset (`str`):
-            Dataset name e.g 'squad' or 'lhoestq/demo1'.
-            Rows are cleaned in any dataset configuration or split of this dataset.
-        cached_assets_directory (`StrPath`):
-            Directory containing the cached image and audio files.
-        keep_first_rows_number (`int`):
-            Keep the rows with an index below a certain number.
-        keep_most_recent_rows_number (`int`):
-            Keep the most recently accessed rows.
-        max_cleaned_rows_number (`int`):
-            Maximum number of rows to discard.
-    """
-    if keep_first_rows_number < 0 or keep_most_recent_rows_number < 0 or max_cleaned_rows_number < 0:
-        raise ValueError(
-            "Failed to run cached assets cleaning. Make sure all of keep_first_rows_number,"
-            f" keep_most_recent_rows_number and max_cleaned_rows_number  are set (got {keep_first_rows_number},"
-            f" {keep_most_recent_rows_number} and {max_cleaned_rows_number})"
-        )
-    row_directories = glob_rows_in_assets_dir(dataset, cached_assets_directory)
-    row_directories_sample = list(
-        islice(
-            (
-                row_dir
-                for row_dir in row_directories
-                if _greater_or_equal(row_dir.name, keep_first_rows_number, on_error=True)
-            ),
-            max_cleaned_rows_number + keep_most_recent_rows_number,
-        )
-    )
-    if len(row_directories_sample) > keep_most_recent_rows_number:
-        row_dirs_to_delete = sorted(row_directories_sample, key=os.path.getmtime, reverse=True)[
-            keep_most_recent_rows_number:
-        ]
-        for row_dir_to_delete in row_dirs_to_delete:
-            shutil.rmtree(row_dir_to_delete, ignore_errors=True)
-
-
-def clean_cached_assets_randomly(
-    clean_cache_proba: float,
-    dataset: str,
-    cached_assets_directory: StrPath,
-    keep_first_rows_number: int,
-    keep_most_recent_rows_number: int,
-    max_cleaned_rows_number: int,
-) -> None:
-    """Randomly clean the cached assets' directory.
-
-    Args:
-        clean_cache_proba (`float`):
-            Probability to clean the cached assets' directory.
-        dataset (`str`):
-            Dataset name e.g 'squad' or 'lhoestq/demo1'.
-            Rows are cleaned in any dataset configuration or split of this dataset.
-        cached_assets_directory (`StrPath`):
-            Directory containing the cached image and audio files.
-        keep_first_rows_number (`int`):
-            Keep the rows with an index below a certain number.
-        keep_most_recent_rows_number (`int`):
-            Keep the most recently accessed rows.
-        max_cleaned_rows_number (`int`):
-            Maximum number of rows to discard.
-    """
-    # no need to do it every time
-    if random.random() < clean_cache_proba:  # nosec
-        if keep_first_rows_number < 0 and keep_most_recent_rows_number < 0 and max_cleaned_rows_number < 0:
-            logging.debug(
-                "Params keep_first_rows_number, keep_most_recent_rows_number and"
-                " max_cleaned_rows_number are not set. Skipping cached assets cleaning."
-            )
-        else:
-            clean_cached_assets(
-                dataset=dataset,
-                cached_assets_directory=cached_assets_directory,
-                keep_first_rows_number=keep_first_rows_number,
-                keep_most_recent_rows_number=keep_most_recent_rows_number,
-                max_cleaned_rows_number=max_cleaned_rows_number,
-            )