From 28aaea7e96e1050f94061d286033c265a1126e15 Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Mon, 16 Oct 2023 14:10:12 -0400 Subject: [PATCH 1/6] adding revision for assets creation path --- chart/templates/_env/_envCachedAssets.tpl | 8 -- docs/source/openapi.json | 36 +++--- docs/source/rows.mdx | 2 +- libs/libapi/src/libapi/response.py | 3 + libs/libapi/src/libapi/rows_utils.py | 4 + libs/libapi/src/libapi/utils.py | 113 +----------------- libs/libapi/tests/test_utils.py | 56 --------- libs/libcommon/src/libcommon/config.py | 18 --- .../src/libcommon/viewer_utils/asset.py | 53 +++----- .../src/libcommon/viewer_utils/features.py | 11 ++ .../tests/viewer_utils/test_assets.py | 12 +- .../tests/viewer_utils/test_features.py | 37 +++--- services/rows/src/rows/routes/rows.py | 36 ------ services/rows/tests/routes/test_rows.py | 30 +---- services/search/src/search/routes/filter.py | 14 --- services/search/src/search/routes/search.py | 26 +--- .../split/first_rows_from_parquet.py | 6 + .../split/first_rows_from_streaming.py | 5 + services/worker/tests/fixtures/hub.py | 8 +- tools/docker-compose-datasets-server.yml | 8 -- tools/docker-compose-dev-datasets-server.yml | 8 -- 21 files changed, 108 insertions(+), 386 deletions(-) delete mode 100644 libs/libapi/tests/test_utils.py diff --git a/chart/templates/_env/_envCachedAssets.tpl b/chart/templates/_env/_envCachedAssets.tpl index 43dc53a638..0cededa3eb 100644 --- a/chart/templates/_env/_envCachedAssets.tpl +++ b/chart/templates/_env/_envCachedAssets.tpl @@ -6,14 +6,6 @@ value: "{{ include "cachedAssets.baseUrl" . }}" - name: CACHED_ASSETS_STORAGE_DIRECTORY value: {{ .Values.cachedAssets.storageDirectory | quote }} -- name: CACHED_ASSETS_CLEAN_CACHE_PROBA - value: {{ .Values.cachedAssets.cleanCacheProba | quote }} -- name: CACHED_ASSETS_KEEP_FIRST_ROWS_NUMBER - value: {{ .Values.cachedAssets.keepFirstRowsNumber | quote }} -- name: CACHED_ASSETS_KEEP_MOST_RECENT_ROWS_NUMBER - value: {{ .Values.cachedAssets.keepMostRecentRowsNumber | quote }} -- name: CACHED_ASSETS_MAX_CLEANED_ROWS_NUMBER - value: {{ .Values.cachedAssets.maxCleanedRowsNumber | quote }} - name: CACHED_ASSETS_S3_FOLDER_NAME value: {{ .Values.cachedAssets.s3FolderName | quote }} {{- end -}} diff --git a/docs/source/openapi.json b/docs/source/openapi.json index 6d8553d4ab..63197d0a58 100644 --- a/docs/source/openapi.json +++ b/docs/source/openapi.json @@ -2270,12 +2270,12 @@ "row_idx": 0, "row": { "imageA": { - "src": "https://datasets-server.huggingface.co/assets/huggan/horse2zebra/--/default/train/0/imageA/image.jpg", + "src": "https://datasets-server.huggingface.co/assets/huggan/horse2zebra/main/default/train/0/imageA/image.jpg", "height": 256, "width": 256 }, "imageB": { - "src": "https://datasets-server.huggingface.co/assets/huggan/horse2zebra/--/default/train/0/imageB/image.jpg", + "src": "https://datasets-server.huggingface.co/assets/huggan/horse2zebra/main/default/train/0/imageB/image.jpg", "height": 256, "width": 256 } @@ -2286,12 +2286,12 @@ "row_idx": 1, "row": { "imageA": { - "src": "https://datasets-server.huggingface.co/assets/huggan/horse2zebra/--/default/train/1/imageA/image.jpg", + "src": "https://datasets-server.huggingface.co/assets/huggan/horse2zebra/main/default/train/1/imageA/image.jpg", "height": 256, "width": 256 }, "imageB": { - "src": "https://datasets-server.huggingface.co/assets/huggan/horse2zebra/--/default/train/1/imageB/image.jpg", + "src": "https://datasets-server.huggingface.co/assets/huggan/horse2zebra/main/default/train/1/imageB/image.jpg", "height": 
256, "width": 256 } @@ -2302,12 +2302,12 @@ "row_idx": 2, "row": { "imageA": { - "src": "https://datasets-server.huggingface.co/assets/huggan/horse2zebra/--/default/train/2/imageA/image.jpg", + "src": "https://datasets-server.huggingface.co/assets/huggan/horse2zebra/main/default/train/2/imageA/image.jpg", "height": 256, "width": 256 }, "imageB": { - "src": "https://datasets-server.huggingface.co/assets/huggan/horse2zebra/--/default/train/2/imageB/image.jpg", + "src": "https://datasets-server.huggingface.co/assets/huggan/horse2zebra/main/default/train/2/imageB/image.jpg", "height": 256, "width": 256 } @@ -2390,7 +2390,7 @@ "id": "id10059_229vKIGbxrI_00001", "audio": [ { - "src": "https://datasets-server.huggingface.co/assets/asapp/slue/--/voxceleb/train/0/audio/audio.wav", + "src": "https://datasets-server.huggingface.co/assets/asapp/slue/main/voxceleb/train/0/audio/audio.wav", "type": "audio/wav" } ], @@ -2408,7 +2408,7 @@ "id": "id10059_229vKIGbxrI_00002", "audio": [ { - "src": "https://datasets-server.huggingface.co/assets/asapp/slue/--/voxceleb/train/1/audio/audio.wav", + "src": "https://datasets-server.huggingface.co/assets/asapp/slue/main/voxceleb/train/1/audio/audio.wav", "type": "audio/wav" } ], @@ -2426,7 +2426,7 @@ "id": "id10059_229vKIGbxrI_00003", "audio": [ { - "src": "https://datasets-server.huggingface.co/assets/asapp/slue/--/voxceleb/train/2/audio/audio.wav", + "src": "https://datasets-server.huggingface.co/assets/asapp/slue/main/voxceleb/train/2/audio/audio.wav", "type": "audio/wav" } ], @@ -2711,12 +2711,12 @@ "row_idx": 234, "row": { "imageA": { - "src": "https://datasets-server.huggingface.co/cached-assets/huggan/horse2zebra/--/default/train/234/imageA/image.jpg", + "src": "https://datasets-server.huggingface.co/cached-assets/huggan/horse2zebra/main/default/train/234/imageA/image.jpg", "height": 256, "width": 256 }, "imageB": { - "src": "https://datasets-server.huggingface.co/cached-assets/huggan/horse2zebra/--/default/train/234/imageB/image.jpg", + "src": "https://datasets-server.huggingface.co/cached-assets/huggan/horse2zebra/main/default/train/234/imageB/image.jpg", "height": 256, "width": 256 } @@ -2727,12 +2727,12 @@ "row_idx": 235, "row": { "imageA": { - "src": "https://datasets-server.huggingface.co/cached-assets/huggan/horse2zebra/--/default/train/235/imageA/image.jpg", + "src": "https://datasets-server.huggingface.co/cached-assets/huggan/horse2zebra/main/default/train/235/imageA/image.jpg", "height": 256, "width": 256 }, "imageB": { - "src": "https://datasets-server.huggingface.co/cached-assets/huggan/horse2zebra/--/default/train/235/imageB/image.jpg", + "src": "https://datasets-server.huggingface.co/cached-assets/huggan/horse2zebra/main/default/train/235/imageB/image.jpg", "height": 256, "width": 256 } @@ -2743,12 +2743,12 @@ "row_idx": 236, "row": { "imageA": { - "src": "https://datasets-server.huggingface.co/cached-assets/huggan/horse2zebra/--/default/train/236/imageA/image.jpg", + "src": "https://datasets-server.huggingface.co/cached-assets/huggan/horse2zebra/main/default/train/236/imageA/image.jpg", "height": 256, "width": 256 }, "imageB": { - "src": "https://datasets-server.huggingface.co/cached-assets/huggan/horse2zebra/--/default/train/236/imageB/image.jpg", + "src": "https://datasets-server.huggingface.co/cached-assets/huggan/horse2zebra/main/default/train/236/imageB/image.jpg", "height": 256, "width": 256 } @@ -3128,7 +3128,7 @@ "row_idx": 16, "row": { "image": { - "src": 
"https://datasets-server.huggingface.co/cached-assets/lambdalabs/pokemon-blip-captions/--/default/train/16/image/image.jpg", + "src": "https://datasets-server.huggingface.co/cached-assets/lambdalabs/pokemon-blip-captions/main/default/train/16/image/image.jpg", "height": 431, "width": 431 }, @@ -3140,7 +3140,7 @@ "row_idx": 54, "row": { "image": { - "src": "https://datasets-server.huggingface.co/cached-assets/lambdalabs/pokemon-blip-captions/--/default/train/54/image/image.jpg", + "src": "https://datasets-server.huggingface.co/cached-assets/lambdalabs/pokemon-blip-captions/main/default/train/54/image/image.jpg", "height": 1280, "width": 1280 }, @@ -3152,7 +3152,7 @@ "row_idx": 56, "row": { "image": { - "src": "https://datasets-server.huggingface.co/cached-assets/lambdalabs/pokemon-blip-captions/--/default/train/56/image/image.jpg", + "src": "https://datasets-server.huggingface.co/cached-assets/lambdalabs/pokemon-blip-captions/main/default/train/56/image/image.jpg", "height": 1280, "width": 1280 }, diff --git a/docs/source/rows.mdx b/docs/source/rows.mdx index 21aef518a6..3ce08e12ef 100644 --- a/docs/source/rows.mdx +++ b/docs/source/rows.mdx @@ -174,7 +174,7 @@ Here is an example of image, from the first row of the cifar100 dataset: "row_idx": 0, "row": { "img": { - "src": "https://datasets-server.huggingface.co/cached-assets/cifar100/--/cifar100/train/0/img/image.jpg", + "src": "https://datasets-server.huggingface.co/cached-assets/cifar100/main/cifar100/train/0/img/image.jpg", "height": 32, "width": 32 }, diff --git a/libs/libapi/src/libapi/response.py b/libs/libapi/src/libapi/response.py index bf562e91ea..459cbccbad 100644 --- a/libs/libapi/src/libapi/response.py +++ b/libs/libapi/src/libapi/response.py @@ -1,4 +1,5 @@ import logging +from typing import Optional import pyarrow as pa from datasets import Features @@ -27,6 +28,7 @@ def create_response( unsupported_columns: list[str], num_rows_total: int, use_row_idx_column: bool = False, + revision: Optional[str] = None, ) -> PaginatedResponse: if set(pa_table.column_names).intersection(set(unsupported_columns)): raise RuntimeError( @@ -45,6 +47,7 @@ def create_response( "rows": to_rows_list( pa_table=pa_table, dataset=dataset, + revision=revision, config=config, split=split, storage_options=storage_options, diff --git a/libs/libapi/src/libapi/rows_utils.py b/libs/libapi/src/libapi/rows_utils.py index b10c7d051c..55b5125dfc 100644 --- a/libs/libapi/src/libapi/rows_utils.py +++ b/libs/libapi/src/libapi/rows_utils.py @@ -20,11 +20,13 @@ def _transform_row( storage_options: Union[DirectoryStorageOptions, S3StorageOptions], offset: int, row_idx_column: Optional[str], + revision: Optional[str] = None, ) -> Row: row_idx, row = row_idx_and_row transformed_row = { featureName: get_cell_value( dataset=dataset, + revision=revision, config=config, split=split, row_idx=offset + row_idx if row_idx_column is None else row[row_idx_column], @@ -49,10 +51,12 @@ def transform_rows( storage_options: Union[DirectoryStorageOptions, S3StorageOptions], offset: int, row_idx_column: Optional[str], + revision: Optional[str] = None, ) -> list[Row]: fn = partial( _transform_row, dataset=dataset, + revision=revision, config=config, split=split, features=features, diff --git a/libs/libapi/src/libapi/utils.py b/libs/libapi/src/libapi/utils.py index 69065e3116..1d406faf67 100644 --- a/libs/libapi/src/libapi/utils.py +++ b/libs/libapi/src/libapi/utils.py @@ -2,12 +2,8 @@ # Copyright 2022 The HuggingFace Authors. 
import logging -import os -import random -import shutil from collections.abc import Callable, Coroutine from http import HTTPStatus -from itertools import islice from typing import Any, Optional, Union import pyarrow as pa @@ -21,10 +17,8 @@ CacheEntry, get_best_response, ) -from libcommon.storage import StrPath from libcommon.storage_options import DirectoryStorageOptions, S3StorageOptions from libcommon.utils import Priority, RowItem, orjson_dumps -from libcommon.viewer_utils.asset import glob_rows_in_assets_dir from starlette.requests import Request from starlette.responses import JSONResponse, Response @@ -214,6 +208,7 @@ def to_rows_list( unsupported_columns: list[str], storage_options: Union[DirectoryStorageOptions, S3StorageOptions], row_idx_column: Optional[str] = None, + revision: Optional[str] = None, ) -> list[RowItem]: num_rows = pa_table.num_rows for idx, (column, feature) in enumerate(features.items()): @@ -223,6 +218,7 @@ def to_rows_list( try: transformed_rows = transform_rows( dataset=dataset, + revision=revision, config=config, split=split, rows=pa_table.to_pylist(), @@ -243,108 +239,3 @@ def to_rows_list( } for idx, row in enumerate(transformed_rows) ] - - -def _greater_or_equal(row_dir_name: str, row_idx: int, on_error: bool) -> bool: - try: - return int(row_dir_name) >= row_idx - except ValueError: - return on_error - - -def clean_cached_assets( - dataset: str, - cached_assets_directory: StrPath, - keep_first_rows_number: int, - keep_most_recent_rows_number: int, - max_cleaned_rows_number: int, -) -> None: - """ - The cached assets directory is cleaned to save disk space using this simple (?) heuristic: - - 1. it takes a big sample of rows from the cache using glob (max `max_cleaned_rows_number`) - 2. it keeps the most recent ones (max `keep_most_recent_rows_number`) - 3. it keeps the rows below a certain index (max `keep_first_rows_number`) - 4. it discards the rest - - To check for the most recent rows, it looks at the "last modified time" of rows directories. - This time is updated every time a row is accessed using `update_last_modified_date_of_rows_in_assets_dir()`. - - Args: - dataset (`str`): - Dataset name e.g 'squad' or 'lhoestq/demo1'. - Rows are cleaned in any dataset configuration or split of this dataset. - cached_assets_directory (`StrPath`): - Directory containing the cached image and audio files. - keep_first_rows_number (`int`): - Keep the rows with an index below a certain number. - keep_most_recent_rows_number (`int`): - Keep the most recently accessed rows. - max_cleaned_rows_number (`int`): - Maximum number of rows to discard. - """ - if keep_first_rows_number < 0 or keep_most_recent_rows_number < 0 or max_cleaned_rows_number < 0: - raise ValueError( - "Failed to run cached assets cleaning. 
Make sure all of keep_first_rows_number," - f" keep_most_recent_rows_number and max_cleaned_rows_number are set (got {keep_first_rows_number}," - f" {keep_most_recent_rows_number} and {max_cleaned_rows_number})" - ) - row_directories = glob_rows_in_assets_dir(dataset, cached_assets_directory) - row_directories_sample = list( - islice( - ( - row_dir - for row_dir in row_directories - if _greater_or_equal(row_dir.name, keep_first_rows_number, on_error=True) - ), - max_cleaned_rows_number + keep_most_recent_rows_number, - ) - ) - if len(row_directories_sample) > keep_most_recent_rows_number: - row_dirs_to_delete = sorted(row_directories_sample, key=os.path.getmtime, reverse=True)[ - keep_most_recent_rows_number: - ] - for row_dir_to_delete in row_dirs_to_delete: - shutil.rmtree(row_dir_to_delete, ignore_errors=True) - - -def clean_cached_assets_randomly( - clean_cache_proba: float, - dataset: str, - cached_assets_directory: StrPath, - keep_first_rows_number: int, - keep_most_recent_rows_number: int, - max_cleaned_rows_number: int, -) -> None: - """Randomly clean the cached assets' directory. - - Args: - clean_cache_proba (`float`): - Probability to clean the cached assets' directory. - dataset (`str`): - Dataset name e.g 'squad' or 'lhoestq/demo1'. - Rows are cleaned in any dataset configuration or split of this dataset. - cached_assets_directory (`StrPath`): - Directory containing the cached image and audio files. - keep_first_rows_number (`int`): - Keep the rows with an index below a certain number. - keep_most_recent_rows_number (`int`): - Keep the most recently accessed rows. - max_cleaned_rows_number (`int`): - Maximum number of rows to discard. - """ - # no need to do it every time - if random.random() < clean_cache_proba: # nosec - if keep_first_rows_number < 0 and keep_most_recent_rows_number < 0 and max_cleaned_rows_number < 0: - logging.debug( - "Params keep_first_rows_number, keep_most_recent_rows_number and" - " max_cleaned_rows_number are not set. Skipping cached assets cleaning." - ) - else: - clean_cached_assets( - dataset=dataset, - cached_assets_directory=cached_assets_directory, - keep_first_rows_number=keep_first_rows_number, - keep_most_recent_rows_number=keep_most_recent_rows_number, - max_cleaned_rows_number=max_cleaned_rows_number, - ) diff --git a/libs/libapi/tests/test_utils.py b/libs/libapi/tests/test_utils.py deleted file mode 100644 index aa8d05ba85..0000000000 --- a/libs/libapi/tests/test_utils.py +++ /dev/null @@ -1,56 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# Copyright 2023 The HuggingFace Authors. 
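
# For the record, the clean_cached_assets heuristic deleted above reduces to:
# sample row directories whose index is >= keep_first, keep the most recently
# touched ones, rmtree the rest. A condensed, runnable sketch of the deleted logic
# (glob pattern inlined with the old "--" separator; non-numeric directory names
# stay in the sample, matching _greater_or_equal's on_error=True):
import os
import shutil
from itertools import islice
from pathlib import Path

def clean_cached_rows(assets_dir: Path, dataset: str, keep_first: int, keep_recent: int, max_cleaned: int) -> None:
    def above_keep_first(name: str) -> bool:
        try:
            return int(name) >= keep_first
        except ValueError:
            return True

    # dataset/--/config/split/row_idx layout of the pre-patch cached assets
    candidates = (p for p in assets_dir.glob(f"{dataset}/--/*/*/*") if above_keep_first(p.name))
    sample = list(islice(candidates, max_cleaned + keep_recent))
    if len(sample) > keep_recent:
        # newest directories (by mtime) survive; the rest of the sample is removed
        for row_dir in sorted(sample, key=os.path.getmtime, reverse=True)[keep_recent:]:
            shutil.rmtree(row_dir, ignore_errors=True)
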
- -import os -import time -from pathlib import Path -from unittest.mock import patch - -import pytest -from libcommon.storage import StrPath - -from libapi.utils import clean_cached_assets - - -@pytest.mark.parametrize( - "n_rows,keep_most_recent_rows_number,keep_first_rows_number,max_cleaned_rows_number,expected_remaining_rows", - [ - (8, 1, 1, 100, [0, 7]), - (8, 2, 2, 100, [0, 1, 6, 7]), - (8, 1, 1, 3, [0, 4, 5, 6, 7]), - ], -) -def test_clean_cached_assets( - tmp_path: Path, - n_rows: int, - keep_most_recent_rows_number: int, - keep_first_rows_number: int, - max_cleaned_rows_number: int, - expected_remaining_rows: list[int], -) -> None: - cached_assets_directory = tmp_path / "cached-assets" - split_dir = cached_assets_directory / "ds/--/plain_text/train" - split_dir.mkdir(parents=True) - for i in range(n_rows): - (split_dir / str(i)).mkdir() - time.sleep(0.01) - - def deterministic_glob_rows_in_assets_dir( - dataset: str, - assets_directory: StrPath, - ) -> list[Path]: - return sorted( - list(Path(assets_directory).resolve().glob(os.path.join(dataset, "--", "*", "*", "*"))), - key=lambda p: int(p.name), - ) - - with patch("libapi.utils.glob_rows_in_assets_dir", deterministic_glob_rows_in_assets_dir): - clean_cached_assets( - "ds", - cached_assets_directory, - keep_most_recent_rows_number=keep_most_recent_rows_number, - keep_first_rows_number=keep_first_rows_number, - max_cleaned_rows_number=max_cleaned_rows_number, - ) - remaining_rows = sorted(int(row_dir.name) for row_dir in split_dir.glob("*")) - assert remaining_rows == expected_remaining_rows diff --git a/libs/libcommon/src/libcommon/config.py b/libs/libcommon/src/libcommon/config.py index d2bf188015..6c4b084797 100644 --- a/libs/libcommon/src/libcommon/config.py +++ b/libs/libcommon/src/libcommon/config.py @@ -89,10 +89,6 @@ def from_env(cls) -> "S3Config": CACHED_ASSETS_BASE_URL = "cached-assets" CACHED_ASSETS_STORAGE_DIRECTORY = None -CACHED_ASSETS_CLEAN_CACHE_PROBA = 0.05 -CACHED_ASSETS_KEEP_FIRST_ROWS_NUMBER = 100 -CACHED_ASSETS_KEEP_MOST_RECENT_ROWS_NUMBER = 200 -CACHED_ASSETS_MAX_CLEANED_ROWS_NUMBER = 10_000 CACHED_ASSETS_S3_FOLDER_NAME = "cached-assets" @@ -100,10 +96,6 @@ def from_env(cls) -> "S3Config": class CachedAssetsConfig: base_url: str = ASSETS_BASE_URL storage_directory: Optional[str] = CACHED_ASSETS_STORAGE_DIRECTORY - clean_cache_proba: float = CACHED_ASSETS_CLEAN_CACHE_PROBA - keep_first_rows_number: int = CACHED_ASSETS_KEEP_FIRST_ROWS_NUMBER - keep_most_recent_rows_number: int = CACHED_ASSETS_KEEP_MOST_RECENT_ROWS_NUMBER - max_cleaned_rows_number: int = CACHED_ASSETS_MAX_CLEANED_ROWS_NUMBER s3_folder_name: str = CACHED_ASSETS_S3_FOLDER_NAME @classmethod @@ -113,16 +105,6 @@ def from_env(cls) -> "CachedAssetsConfig": return cls( base_url=env.str(name="BASE_URL", default=CACHED_ASSETS_BASE_URL), storage_directory=env.str(name="STORAGE_DIRECTORY", default=CACHED_ASSETS_STORAGE_DIRECTORY), - clean_cache_proba=env.float(name="CLEAN_CACHE_PROBA", default=CACHED_ASSETS_CLEAN_CACHE_PROBA), - keep_first_rows_number=env.float( - name="KEEP_FIRST_ROWS_NUMBER", default=CACHED_ASSETS_KEEP_FIRST_ROWS_NUMBER - ), - keep_most_recent_rows_number=env.float( - name="KEEP_MOST_RECENT_ROWS_NUMBER", default=CACHED_ASSETS_KEEP_MOST_RECENT_ROWS_NUMBER - ), - max_cleaned_rows_number=env.float( - name="MAX_CLEAN_SAMPLE_SIZE", default=CACHED_ASSETS_MAX_CLEANED_ROWS_NUMBER - ), s3_folder_name=env.str(name="S3_FOLDER_NAME", default=CACHED_ASSETS_S3_FOLDER_NAME), ) diff --git a/libs/libcommon/src/libcommon/viewer_utils/asset.py 
b/libs/libcommon/src/libcommon/viewer_utils/asset.py index e4a7153aec..0cab661caf 100644 --- a/libs/libcommon/src/libcommon/viewer_utils/asset.py +++ b/libs/libcommon/src/libcommon/viewer_utils/asset.py @@ -1,10 +1,9 @@ # SPDX-License-Identifier: Apache-2.0 # Copyright 2022 The HuggingFace Authors. -import contextlib import logging import os -from collections.abc import Callable, Generator +from collections.abc import Callable from functools import partial from os import makedirs from pathlib import Path @@ -15,7 +14,6 @@ from PIL import Image # type: ignore from pydub import AudioSegment # type:ignore -from libcommon.constants import DATASET_SEPARATOR from libcommon.s3_client import S3Client from libcommon.storage import StrPath, remove_dir from libcommon.storage_options import DirectoryStorageOptions, S3StorageOptions @@ -31,8 +29,11 @@ def get_and_create_dir_path(assets_directory: StrPath, url_dir_path: str) -> Pat return dir_path -def get_url_dir_path(dataset: str, config: str, split: str, row_idx: int, column: str) -> str: - return f"{dataset}/{DATASET_SEPARATOR}/{config}/{split}/{str(row_idx)}/{column}" +def get_url_dir_path( + dataset: str, config: str, split: str, row_idx: int, column: str, revision: Optional[str] = None +) -> str: + revision = "main" if revision is None else revision + return f"{dataset}/{revision}/{config}/{split}/{str(row_idx)}/{column}" def get_unique_path_for_filename(assets_directory: StrPath, filename: str) -> Path: @@ -44,39 +45,6 @@ def delete_asset_dir(dataset: str, directory: StrPath) -> None: remove_dir(dir_path) -def glob_rows_in_assets_dir( - dataset: str, - assets_directory: StrPath, -) -> Generator[Path, None, None]: - return Path(assets_directory).resolve().glob(os.path.join(dataset, DATASET_SEPARATOR, "*", "*", "*")) - - -def update_directory_modification_date(path: Path) -> None: - if path.is_dir(): - # update the directory's last modified date - temporary_file = path / DATASETS_SERVER_MDATE_FILENAME - if temporary_file.is_dir(): - raise ValueError(f"Cannot create temporary file {temporary_file} in {path}") - temporary_file.touch(exist_ok=True) - if temporary_file.is_file(): - with contextlib.suppress(FileNotFoundError): - temporary_file.unlink() - - -def update_last_modified_date_of_rows_in_assets_dir( - dataset: str, - config: str, - split: str, - offset: int, - length: int, - assets_directory: StrPath, -) -> None: - update_directory_modification_date(Path(assets_directory).resolve() / dataset.split("/")[0]) - row_dirs_path = Path(assets_directory).resolve() / dataset / DATASET_SEPARATOR / config / split - for row_idx in range(offset, offset + length): - update_directory_modification_date(row_dirs_path / str(row_idx)) - - class ImageSource(TypedDict): src: str height: int @@ -114,6 +82,7 @@ def create_asset_file( filename: str, storage_options: Union[DirectoryStorageOptions, S3StorageOptions], fn: Callable[[Path, str, bool], SupportedSource], + revision: Optional[str] = None, ) -> SupportedSource: # get url dir path assets_base_url = storage_options.assets_base_url @@ -121,7 +90,9 @@ def create_asset_file( overwrite = storage_options.overwrite use_s3_storage = isinstance(storage_options, S3StorageOptions) logging.debug(f"storage options with {use_s3_storage=}") - url_dir_path = get_url_dir_path(dataset=dataset, config=config, split=split, row_idx=row_idx, column=column) + url_dir_path = get_url_dir_path( + dataset=dataset, revision=revision, config=config, split=split, row_idx=row_idx, column=column + ) src = 
f"{assets_base_url}/{url_dir_path}/{filename}" # configure file path @@ -196,12 +167,14 @@ def create_image_file( filename: str, image: Image.Image, storage_options: DirectoryStorageOptions, + revision: Optional[str] = None, ) -> ImageSource: fn = partial(save_image, image=image) return cast( ImageSource, create_asset_file( dataset=dataset, + revision=revision, config=config, split=split, row_idx=row_idx, @@ -223,6 +196,7 @@ def create_audio_file( audio_file_extension: str, filename: str, storage_options: DirectoryStorageOptions, + revision: Optional[str] = None, ) -> list[AudioSource]: fn = partial(save_audio, audio_file_bytes=audio_file_bytes, audio_file_extension=audio_file_extension) return [ @@ -230,6 +204,7 @@ def create_audio_file( AudioSource, create_asset_file( dataset=dataset, + revision=revision, config=config, split=split, row_idx=row_idx, diff --git a/libs/libcommon/src/libcommon/viewer_utils/features.py b/libs/libcommon/src/libcommon/viewer_utils/features.py index a1f59c94fb..5e080498a5 100644 --- a/libs/libcommon/src/libcommon/viewer_utils/features.py +++ b/libs/libcommon/src/libcommon/viewer_utils/features.py @@ -59,6 +59,7 @@ def image( featureName: str, storage_options: Union[DirectoryStorageOptions, S3StorageOptions], json_path: Optional[list[Union[str, int]]] = None, + revision: Optional[str] = None, ) -> Any: if value is None: return None @@ -81,6 +82,7 @@ def image( try: return create_image_file( dataset=dataset, + revision=revision, config=config, split=split, row_idx=row_idx, @@ -106,6 +108,7 @@ def audio( featureName: str, storage_options: Union[DirectoryStorageOptions, S3StorageOptions], json_path: Optional[list[Union[str, int]]] = None, + revision: Optional[str] = None, ) -> Any: if value is None: return None @@ -156,6 +159,7 @@ def audio( # this function can raise, we don't catch it return create_audio_file( dataset=dataset, + revision=revision, config=config, split=split, row_idx=row_idx, @@ -177,6 +181,7 @@ def get_cell_value( fieldType: Any, storage_options: Union[DirectoryStorageOptions, S3StorageOptions], json_path: Optional[list[Union[str, int]]] = None, + revision: Optional[str] = None, ) -> Any: # always allow None values in the cells if cell is None: @@ -184,6 +189,7 @@ def get_cell_value( if isinstance(fieldType, Image): return image( dataset=dataset, + revision=revision, config=config, split=split, row_idx=row_idx, @@ -195,6 +201,7 @@ def get_cell_value( elif isinstance(fieldType, Audio): return audio( dataset=dataset, + revision=revision, config=config, split=split, row_idx=row_idx, @@ -212,6 +219,7 @@ def get_cell_value( return [ get_cell_value( dataset=dataset, + revision=revision, config=config, split=split, row_idx=row_idx, @@ -230,6 +238,7 @@ def get_cell_value( return [ get_cell_value( dataset=dataset, + revision=revision, config=config, split=split, row_idx=row_idx, @@ -251,6 +260,7 @@ def get_cell_value( key: [ get_cell_value( dataset=dataset, + revision=revision, config=config, split=split, row_idx=row_idx, @@ -272,6 +282,7 @@ def get_cell_value( return { key: get_cell_value( dataset=dataset, + revision=revision, config=config, split=split, row_idx=row_idx, diff --git a/libs/libcommon/tests/viewer_utils/test_assets.py b/libs/libcommon/tests/viewer_utils/test_assets.py index 4f90d67141..1546e8a595 100644 --- a/libs/libcommon/tests/viewer_utils/test_assets.py +++ b/libs/libcommon/tests/viewer_utils/test_assets.py @@ -44,6 +44,7 @@ def test_create_image_file_with_s3_storage(datasets: Mapping[str, Dataset], cach value = create_image_file( 
dataset="dataset", + revision="revision", config="config", split="split", image=dataset[0]["col"], @@ -53,11 +54,11 @@ def test_create_image_file_with_s3_storage(datasets: Mapping[str, Dataset], cach storage_options=storage_options, ) assert value == { - "src": "http://localhost/assets/dataset/--/config/split/7/col/image.jpg", + "src": "http://localhost/assets/dataset/revision/config/split/7/col/image.jpg", "height": 480, "width": 640, } - body = conn.Object(bucket_name, "assets/dataset/--/config/split/7/col/image.jpg").get()["Body"].read() + body = conn.Object(bucket_name, "assets/dataset/revision/config/split/7/col/image.jpg").get()["Body"].read() assert body is not None image = PILImage.open(io.BytesIO(body)) @@ -97,6 +98,7 @@ def test_create_audio_file_with_s3_storage(datasets: Mapping[str, Dataset], cach value = create_audio_file( dataset="dataset", + revision="revision", config="config", split="split", row_idx=7, @@ -109,9 +111,11 @@ def test_create_audio_file_with_s3_storage(datasets: Mapping[str, Dataset], cach assert value == [ { - "src": "http://localhost/assets/dataset/--/config/split/7/col/audio.wav", + "src": "http://localhost/assets/dataset/revision/config/split/7/col/audio.wav", "type": "audio/wav", }, ] - audio_object = conn.Object(bucket_name, "assets/dataset/--/config/split/7/col/audio.wav").get()["Body"].read() + audio_object = ( + conn.Object(bucket_name, "assets/dataset/revision/config/split/7/col/audio.wav").get()["Body"].read() + ) assert audio_object is not None diff --git a/libs/libcommon/tests/viewer_utils/test_features.py b/libs/libcommon/tests/viewer_utils/test_features.py index b12ff10f6b..b68a0566af 100644 --- a/libs/libcommon/tests/viewer_utils/test_features.py +++ b/libs/libcommon/tests/viewer_utils/test_features.py @@ -78,6 +78,7 @@ def test_value( assert feature.dtype == output_dtype value = get_cell_value( dataset="dataset", + revision="revision", config="config", split="split", row_idx=7, @@ -161,7 +162,7 @@ def assert_output_has_valid_files(value: Any, storage_options: DirectoryStorageO "audio", [ { - "src": "http://localhost/assets/dataset/--/config/split/7/col/audio.wav", + "src": "http://localhost/assets/dataset/revision/config/split/7/col/audio.wav", "type": "audio/wav", } ], @@ -171,7 +172,7 @@ def assert_output_has_valid_files(value: Any, storage_options: DirectoryStorageO "audio_ogg", [ { - "src": "http://localhost/assets/dataset/--/config/split/7/col/audio.wav", + "src": "http://localhost/assets/dataset/revision/config/split/7/col/audio.wav", "type": "audio/wav", } ], @@ -183,7 +184,7 @@ def assert_output_has_valid_files(value: Any, storage_options: DirectoryStorageO ( "image", { - "src": "http://localhost/assets/dataset/--/config/split/7/col/image.jpg", + "src": "http://localhost/assets/dataset/revision/config/split/7/col/image.jpg", "height": 480, "width": 640, }, @@ -202,12 +203,12 @@ def assert_output_has_valid_files(value: Any, storage_options: DirectoryStorageO "images_list", [ { - "src": "http://localhost/assets/dataset/--/config/split/7/col/image-1d100e9.jpg", + "src": "http://localhost/assets/dataset/revision/config/split/7/col/image-1d100e9.jpg", "height": 480, "width": 640, }, { - "src": "http://localhost/assets/dataset/--/config/split/7/col/image-1d300ea.jpg", + "src": "http://localhost/assets/dataset/revision/config/split/7/col/image-1d300ea.jpg", "height": 480, "width": 640, }, @@ -219,13 +220,13 @@ def assert_output_has_valid_files(value: Any, storage_options: DirectoryStorageO [ [ { - "src": 
"http://localhost/assets/dataset/--/config/split/7/col/audio-1d100e9.wav", + "src": "http://localhost/assets/dataset/revision/config/split/7/col/audio-1d100e9.wav", "type": "audio/wav", }, ], [ { - "src": "http://localhost/assets/dataset/--/config/split/7/col/audio-1d300ea.wav", + "src": "http://localhost/assets/dataset/revision/config/split/7/col/audio-1d300ea.wav", "type": "audio/wav", }, ], @@ -236,12 +237,12 @@ def assert_output_has_valid_files(value: Any, storage_options: DirectoryStorageO "images_sequence", [ { - "src": "http://localhost/assets/dataset/--/config/split/7/col/image-1d100e9.jpg", + "src": "http://localhost/assets/dataset/revision/config/split/7/col/image-1d100e9.jpg", "height": 480, "width": 640, }, { - "src": "http://localhost/assets/dataset/--/config/split/7/col/image-1d300ea.jpg", + "src": "http://localhost/assets/dataset/revision/config/split/7/col/image-1d300ea.jpg", "height": 480, "width": 640, }, @@ -253,13 +254,13 @@ def assert_output_has_valid_files(value: Any, storage_options: DirectoryStorageO [ [ { - "src": "http://localhost/assets/dataset/--/config/split/7/col/audio-1d100e9.wav", + "src": "http://localhost/assets/dataset/revision/config/split/7/col/audio-1d100e9.wav", "type": "audio/wav", }, ], [ { - "src": "http://localhost/assets/dataset/--/config/split/7/col/audio-1d300ea.wav", + "src": "http://localhost/assets/dataset/revision/config/split/7/col/audio-1d300ea.wav", "type": "audio/wav", }, ], @@ -272,12 +273,12 @@ def assert_output_has_valid_files(value: Any, storage_options: DirectoryStorageO "a": 0, "b": [ { - "src": "http://localhost/assets/dataset/--/config/split/7/col/image-89101db.jpg", + "src": "http://localhost/assets/dataset/revision/config/split/7/col/image-89101db.jpg", "height": 480, "width": 640, }, { - "src": "http://localhost/assets/dataset/--/config/split/7/col/image-89301dc.jpg", + "src": "http://localhost/assets/dataset/revision/config/split/7/col/image-89301dc.jpg", "height": 480, "width": 640, }, @@ -286,13 +287,17 @@ def assert_output_has_valid_files(value: Any, storage_options: DirectoryStorageO "ca": [ [ { - "src": "http://localhost/assets/dataset/--/config/split/7/col/audio-18360330.wav", + "src": ( + "http://localhost/assets/dataset/revision/config/split/7/col/audio-18360330.wav" + ), "type": "audio/wav", }, ], [ { - "src": "http://localhost/assets/dataset/--/config/split/7/col/audio-18380331.wav", + "src": ( + "http://localhost/assets/dataset/revision/config/split/7/col/audio-18380331.wav" + ), "type": "audio/wav", }, ], @@ -321,6 +326,7 @@ def test_others( # decoded value = get_cell_value( dataset="dataset", + revision="revision", config="config", split="split", row_idx=7, @@ -334,6 +340,7 @@ def test_others( # encoded value = get_cell_value( dataset="dataset", + revision="revision", config="config", split="split", row_idx=7, diff --git a/services/rows/src/rows/routes/rows.py b/services/rows/src/rows/routes/rows.py index 06deb59d24..84d1477b1c 100644 --- a/services/rows/src/rows/routes/rows.py +++ b/services/rows/src/rows/routes/rows.py @@ -2,7 +2,6 @@ # Copyright 2022 The HuggingFace Authors. 
import logging -import random from typing import Literal, Optional, Union from fsspec.implementations.http import HTTPFileSystem @@ -16,7 +15,6 @@ from libapi.response import create_response from libapi.utils import ( Endpoint, - clean_cached_assets, get_json_api_error_response, get_json_error_response, get_json_ok_response, @@ -28,7 +26,6 @@ from libcommon.s3_client import S3Client from libcommon.simple_cache import CachedArtifactError, CachedArtifactNotFoundError from libcommon.storage import StrPath -from libcommon.viewer_utils.asset import update_last_modified_date_of_rows_in_assets_dir from libcommon.viewer_utils.features import UNSUPPORTED_FEATURES from starlette.requests import Request from starlette.responses import Response @@ -57,10 +54,6 @@ def create_rows_endpoint( hf_timeout_seconds: Optional[float] = None, max_age_long: int = 0, max_age_short: int = 0, - clean_cache_proba: float = 0.0, - keep_first_rows_number: int = -1, - keep_most_recent_rows_number: int = -1, - max_cleaned_rows_number: int = -1, ) -> Endpoint: indexer = Indexer( processing_graph=processing_graph, @@ -123,26 +116,6 @@ async def rows_endpoint(request: Request) -> Response: ) with StepProfiler(method="rows_endpoint", step="query the rows"): pa_table = rows_index.query(offset=offset, length=length) - with StepProfiler(method="rows_endpoint", step="clean cache"): - # no need to do it every time - if random.random() < clean_cache_proba: # nosec - if ( - keep_first_rows_number < 0 - and keep_most_recent_rows_number < 0 - and max_cleaned_rows_number < 0 - ): - logger.debug( - "Params keep_first_rows_number, keep_most_recent_rows_number and" - " max_cleaned_rows_number are not set. Skipping cached assets cleaning." - ) - else: - clean_cached_assets( - dataset=dataset, - cached_assets_directory=cached_assets_directory, - keep_first_rows_number=keep_first_rows_number, - keep_most_recent_rows_number=keep_most_recent_rows_number, - max_cleaned_rows_number=max_cleaned_rows_number, - ) with StepProfiler(method="rows_endpoint", step="transform to a list"): response = create_response( dataset=dataset, @@ -158,15 +131,6 @@ async def rows_endpoint(request: Request) -> Response: unsupported_columns=rows_index.parquet_index.unsupported_columns, num_rows_total=rows_index.parquet_index.num_rows_total, ) - with StepProfiler(method="rows_endpoint", step="update last modified time of rows in asset dir"): - update_last_modified_date_of_rows_in_assets_dir( - dataset=dataset, - config=config, - split=split, - offset=offset, - length=length, - assets_directory=cached_assets_directory, - ) with StepProfiler(method="rows_endpoint", step="generate the OK response"): return get_json_ok_response(content=response, max_age=max_age_long, revision=revision) except CachedArtifactError as e: diff --git a/services/rows/tests/routes/test_rows.py b/services/rows/tests/routes/test_rows.py index e94a8da770..1d9afc69ed 100644 --- a/services/rows/tests/routes/test_rows.py +++ b/services/rows/tests/routes/test_rows.py @@ -4,7 +4,6 @@ import io import os import shutil -import time from collections.abc import Generator from http import HTTPStatus from pathlib import Path @@ -29,7 +28,6 @@ from libcommon.s3_client import S3Client from libcommon.simple_cache import _clean_cache_database, upsert_response from libcommon.storage import StrPath -from libcommon.viewer_utils.asset import update_last_modified_date_of_rows_in_assets_dir from moto import mock_s3 from PIL import Image as PILImage # type: ignore @@ -467,6 +465,7 @@ def test_create_response(ds: Dataset, 
app_config: AppConfig, cached_assets_direc ) response = create_response( dataset="ds", + revision="revision", config="default", split="train", cached_assets_base_url=app_config.cached_assets.base_url, @@ -509,6 +508,7 @@ def test_create_response_with_image( response = create_response( dataset=dataset, + revision="revision", config=config, split=split, cached_assets_base_url=app_config.cached_assets.base_url, @@ -527,7 +527,7 @@ def test_create_response_with_image( "row_idx": 0, "row": { "image": { - "src": "http://localhost/cached-assets/ds_image/--/default/train/0/image/image.jpg", + "src": "http://localhost/cached-assets/ds_image/revision/default/train/0/image/image.jpg", "height": 480, "width": 640, } @@ -537,31 +537,9 @@ def test_create_response_with_image( ] body = ( - conn.Object(bucket_name, "cached-assets/ds_image/--/default/train/0/image/image.jpg").get()["Body"].read() + conn.Object(bucket_name, "cached-assets/ds_image/revision/default/train/0/image/image.jpg").get()["Body"].read() ) assert body is not None image = PILImage.open(io.BytesIO(body)) assert image is not None - - -def test_update_last_modified_date_of_rows_in_assets_dir(tmp_path: Path) -> None: - cached_assets_directory = tmp_path / "cached-assets" - split_dir = cached_assets_directory / "ds/--/default/train" - split_dir.mkdir(parents=True) - n_rows = 8 - for i in range(n_rows): - (split_dir / str(i)).mkdir() - time.sleep(0.01) - update_last_modified_date_of_rows_in_assets_dir( - dataset="ds", - config="default", - split="train", - offset=2, - length=3, - assets_directory=cached_assets_directory, - ) - most_recent_rows_dirs = sorted(list(split_dir.glob("*")), key=os.path.getmtime, reverse=True) - most_recent_rows = [int(row_dir.name) for row_dir in most_recent_rows_dirs] - assert sorted(most_recent_rows[:3]) == [2, 3, 4] - assert most_recent_rows[3:] == [7, 6, 5, 1, 0] diff --git a/services/search/src/search/routes/filter.py b/services/search/src/search/routes/filter.py index f9f8a6622c..11bb2e0033 100644 --- a/services/search/src/search/routes/filter.py +++ b/services/search/src/search/routes/filter.py @@ -24,7 +24,6 @@ from libapi.response import ROW_IDX_COLUMN, create_response from libapi.utils import ( Endpoint, - clean_cached_assets_randomly, get_json_api_error_response, get_json_error_response, get_json_ok_response, @@ -73,10 +72,6 @@ def create_filter_endpoint( hf_timeout_seconds: Optional[float] = None, max_age_long: int = 0, max_age_short: int = 0, - clean_cache_proba: float = 0.0, - keep_first_rows_number: int = -1, - keep_most_recent_rows_number: int = -1, - max_cleaned_rows_number: int = -1, ) -> Endpoint: async def filter_endpoint(request: Request) -> Response: revision: Optional[str] = None @@ -162,15 +157,6 @@ async def filter_endpoint(request: Request) -> Response: limit=length, offset=offset, ) - with StepProfiler(method="filter_endpoint", step="clean cache randomly"): - clean_cached_assets_randomly( - clean_cache_proba=clean_cache_proba, - dataset=dataset, - cached_assets_directory=cached_assets_directory, - keep_first_rows_number=keep_first_rows_number, - keep_most_recent_rows_number=keep_most_recent_rows_number, - max_cleaned_rows_number=max_cleaned_rows_number, - ) with StepProfiler(method="filter_endpoint", step="create response"): response = create_response( dataset=dataset, diff --git a/services/search/src/search/routes/search.py b/services/search/src/search/routes/search.py index d802b254f7..818d9d4690 100644 --- a/services/search/src/search/routes/search.py +++ 
b/services/search/src/search/routes/search.py @@ -3,7 +3,6 @@ import logging from http import HTTPStatus -from pathlib import Path from typing import Optional import pyarrow as pa @@ -27,7 +26,6 @@ from libapi.response import ROW_IDX_COLUMN from libapi.utils import ( Endpoint, - clean_cached_assets_randomly, get_json_api_error_response, get_json_error_response, get_json_ok_response, @@ -85,6 +83,7 @@ def create_response( offset: int, features: Features, num_rows_total: int, + revision: Optional[str] = None, ) -> PaginatedResponse: features_without_key = features.copy() features_without_key.pop(ROW_IDX_COLUMN, None) @@ -105,10 +104,11 @@ def create_response( return PaginatedResponse( features=to_features_list(features_without_key), rows=to_rows_list( - pa_table, - dataset, - config, - split, + pa_table=pa_table, + dataset=dataset, + revision=revision, + config=config, + split=split, storage_options=storage_options, offset=offset, features=features, @@ -138,10 +138,6 @@ def create_search_endpoint( hf_timeout_seconds: Optional[float] = None, max_age_long: int = 0, max_age_short: int = 0, - clean_cache_proba: float = 0.0, - keep_first_rows_number: int = -1, - keep_most_recent_rows_number: int = -1, - max_cleaned_rows_number: int = -1, ) -> Endpoint: async def search_endpoint(request: Request) -> Response: revision: Optional[str] = None @@ -211,16 +207,6 @@ async def search_endpoint(request: Request) -> Response: logging.debug(f"connect to index file {index_file_location}") (num_rows_total, pa_table) = full_text_search(index_file_location, query, offset, length) - with StepProfiler(method="search_endpoint", step="clean cache randomly"): - clean_cached_assets_randomly( - clean_cache_proba=clean_cache_proba, - dataset=dataset, - cached_assets_directory=cached_assets_directory, - keep_first_rows_number=keep_first_rows_number, - keep_most_recent_rows_number=keep_most_recent_rows_number, - max_cleaned_rows_number=max_cleaned_rows_number, - ) - with StepProfiler(method="search_endpoint", step="create response"): if "features" in duckdb_index_cache_entry["content"] and isinstance( duckdb_index_cache_entry["content"]["features"], dict diff --git a/services/worker/src/worker/job_runners/split/first_rows_from_parquet.py b/services/worker/src/worker/job_runners/split/first_rows_from_parquet.py index 44227a652a..b70d84bad1 100644 --- a/services/worker/src/worker/job_runners/split/first_rows_from_parquet.py +++ b/services/worker/src/worker/job_runners/split/first_rows_from_parquet.py @@ -2,6 +2,7 @@ # Copyright 2022 The HuggingFace Authors. 
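
# The route hunks above keep wrapping each stage in StepProfiler(method=..., step=...)
# context managers. A minimal stand-in with the same call shape, handy for reading
# these handlers outside the repo (assumed interface; the real class comes from
# libcommon and this one only prints timings):
import time

class StepProfiler:
    def __init__(self, method: str, step: str) -> None:
        self.method = method
        self.step = step

    def __enter__(self) -> "StepProfiler":
        self.start = time.perf_counter()
        return self

    def __exit__(self, exc_type: object, exc: object, tb: object) -> None:
        # report how long the wrapped stage took
        print(f"{self.method} [{self.step}]: {time.perf_counter() - self.start:.4f}s")

with StepProfiler(method="rows_endpoint", step="transform to a list"):
    pass  # placeholder for the real work
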
import logging +from typing import Optional from datasets import Audio, Features, Image from fsspec.implementations.http import HTTPFileSystem @@ -35,11 +36,13 @@ def transform_rows( rows: list[RowItem], features: Features, storage_options: S3StorageOptions, + revision: Optional[str] = None, ) -> list[Row]: return [ { featureName: get_cell_value( dataset=dataset, + revision=revision, config=config, split=split, row_idx=row_idx, @@ -65,6 +68,7 @@ def compute_first_rows_response( rows_min_number: int, columns_max_number: int, indexer: Indexer, + revision: Optional[str] = None, ) -> SplitFirstRowsResponse: logging.info(f"get first-rows for dataset={dataset} config={config} split={split}") @@ -122,6 +126,7 @@ def compute_first_rows_response( try: transformed_rows = transform_rows( dataset=dataset, + revision=revision, config=config, split=split, rows=rows, @@ -221,5 +226,6 @@ def compute(self) -> CompleteJobResult: rows_min_number=self.first_rows_config.min_number, columns_max_number=self.first_rows_config.columns_max_number, indexer=self.indexer, + revision=self.job_info["params"]["revision"], ) ) diff --git a/services/worker/src/worker/job_runners/split/first_rows_from_streaming.py b/services/worker/src/worker/job_runners/split/first_rows_from_streaming.py index 0ba296584e..c01700270f 100644 --- a/services/worker/src/worker/job_runners/split/first_rows_from_streaming.py +++ b/services/worker/src/worker/job_runners/split/first_rows_from_streaming.py @@ -44,11 +44,13 @@ def transform_rows( rows: list[Row], features: Features, storage_options: S3StorageOptions, + revision: Optional[str] = None, ) -> list[Row]: return [ { featureName: get_cell_value( dataset=dataset, + revision=revision, config=config, split=split, row_idx=row_idx, @@ -75,6 +77,7 @@ def compute_first_rows_response( rows_min_number: int, columns_max_number: int, max_size_fallback: Optional[int] = None, + revision: Optional[str] = None, ) -> SplitFirstRowsResponse: """ Get the response of /first-rows for one specific split of a dataset from huggingface.co. 
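
# In both first-rows job runners the revision is no longer optional at compute time:
# it is read from the job parameters and threaded down to transform_rows. Shape of
# that lookup, with a hand-built job_info dict mirroring the test fixtures:
job_info = {
    "type": "split-first-rows-from-streaming",
    "params": {"dataset": "ds", "revision": "main", "config": "default", "split": "train"},
}
revision = job_info["params"]["revision"]
assert revision == "main"
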
@@ -213,6 +216,7 @@ def compute_first_rows_response( try: transformed_rows = transform_rows( dataset=dataset, + revision=revision, config=config, split=split, rows=rows, @@ -301,5 +305,6 @@ def compute(self) -> CompleteJobResult: rows_max_number=self.first_rows_config.max_number, rows_min_number=self.first_rows_config.min_number, columns_max_number=self.first_rows_config.columns_max_number, + revision=self.job_info["params"]["revision"], ) ) diff --git a/services/worker/tests/fixtures/hub.py b/services/worker/tests/fixtures/hub.py index b9e8bbd9f0..8cad6ed518 100644 --- a/services/worker/tests/fixtures/hub.py +++ b/services/worker/tests/fixtures/hub.py @@ -563,7 +563,7 @@ def get_AUDIO_rows(dataset: str) -> Any: { "col": [ { - "src": f"http://localhost/assets/{dataset}/--/{config}/{split}/0/col/audio.wav", + "src": f"http://localhost/assets/{dataset}/main/{config}/{split}/0/col/audio.wav", "type": "audio/wav", }, ] @@ -581,7 +581,7 @@ def get_IMAGE_rows(dataset: str) -> Any: return [ { "col": { - "src": f"http://localhost/assets/{dataset}/--/{config}/{split}/0/col/image.jpg", + "src": f"http://localhost/assets/{dataset}/main/{config}/{split}/0/col/image.jpg", "height": 480, "width": 640, }, @@ -600,12 +600,12 @@ def get_IMAGES_LIST_rows(dataset: str) -> Any: { "col": [ { - "src": f"http://localhost/assets/{dataset}/--/{config}/{split}/0/col/image-1d100e9.jpg", + "src": f"http://localhost/assets/{dataset}/main/{config}/{split}/0/col/image-1d100e9.jpg", "height": 480, "width": 640, }, { - "src": f"http://localhost/assets/{dataset}/--/{config}/{split}/0/col/image-1d300ea.jpg", + "src": f"http://localhost/assets/{dataset}/main/{config}/{split}/0/col/image-1d300ea.jpg", "height": 480, "width": 640, }, diff --git a/tools/docker-compose-datasets-server.yml b/tools/docker-compose-datasets-server.yml index 9ce46ac870..844a692b25 100644 --- a/tools/docker-compose-datasets-server.yml +++ b/tools/docker-compose-datasets-server.yml @@ -100,10 +100,6 @@ services: environment: CACHED_ASSETS_BASE_URL: "http://localhost:${PORT_REVERSE_PROXY-8000}/cached-assets" # hard-coded to work with the reverse-proxy CACHED_ASSETS_STORAGE_DIRECTORY: ${CACHED_ASSETS_STORAGE_DIRECTORY-/cached-assets} - CACHED_ASSETS_CLEAN_CACHE_PROBA: ${CACHED_ASSETS_CLEAN_CACHE_PROBA-0.05} - CACHED_ASSETS_KEEP_FIRST_ROWS_NUMBER: ${CACHED_ASSETS_KEEP_FIRST_ROWS_NUMBER-100} - CACHED_ASSETS_KEEP_MOST_RECENT_ROWS_NUMBER: ${CACHED_ASSETS_KEEP_MOST_RECENT_ROWS_NUMBER-200} - CACHED_ASSETS_MAX_CLEANED_ROWS_NUMBER: ${CACHED_ASSETS_MAX_CLEANED_ROWS_NUMBER-10000} PARQUET_METADATA_STORAGE_DIRECTORY: ${PARQUET_METADATA_STORAGE_DIRECTORY-/parquet_metadata} ROWS_INDEX_MAX_ARROW_DATA_IN_MEMORY: ${ROWS_INDEX_MAX_ARROW_DATA_IN_MEMORY-300_000_000} # prometheus @@ -132,10 +128,6 @@ services: environment: CACHED_ASSETS_BASE_URL: "http://localhost:${PORT_REVERSE_PROXY-8000}/cached-assets" # hard-coded to work with the reverse-proxy CACHED_ASSETS_STORAGE_DIRECTORY: ${CACHED_ASSETS_STORAGE_DIRECTORY-/cached-assets} - CACHED_ASSETS_CLEAN_CACHE_PROBA: ${CACHED_ASSETS_CLEAN_CACHE_PROBA-0.05} - CACHED_ASSETS_KEEP_FIRST_ROWS_NUMBER: ${CACHED_ASSETS_KEEP_FIRST_ROWS_NUMBER-100} - CACHED_ASSETS_KEEP_MOST_RECENT_ROWS_NUMBER: ${CACHED_ASSETS_KEEP_MOST_RECENT_ROWS_NUMBER-200} - CACHED_ASSETS_MAX_CLEANED_ROWS_NUMBER: ${CACHED_ASSETS_MAX_CLEANED_ROWS_NUMBER-10000} DUCKDB_INDEX_CACHE_DIRECTORY: ${DUCKDB_INDEX_CACHE_DIRECTORY-/duckdb-index} # prometheus PROMETHEUS_MULTIPROC_DIR: ${PROMETHEUS_MULTIPROC_DIR-} diff --git a/tools/docker-compose-dev-datasets-server.yml 
b/tools/docker-compose-dev-datasets-server.yml index a6f8ca1de3..c4165cf38d 100644 --- a/tools/docker-compose-dev-datasets-server.yml +++ b/tools/docker-compose-dev-datasets-server.yml @@ -102,10 +102,6 @@ services: environment: CACHED_ASSETS_BASE_URL: "http://localhost:${PORT_REVERSE_PROXY-8000}/cached-assets" # hard-coded to work with the reverse-proxy CACHED_ASSETS_STORAGE_DIRECTORY: ${CACHED_ASSETS_STORAGE_DIRECTORY-/cached-assets} - CACHED_ASSETS_CLEAN_CACHE_PROBA: ${CACHED_ASSETS_CLEAN_CACHE_PROBA-0.05} - CACHED_ASSETS_KEEP_FIRST_ROWS_NUMBER: ${CACHED_ASSETS_KEEP_FIRST_ROWS_NUMBER-100} - CACHED_ASSETS_KEEP_MOST_RECENT_ROWS_NUMBER: ${CACHED_ASSETS_KEEP_MOST_RECENT_ROWS_NUMBER-200} - CACHED_ASSETS_MAX_CLEANED_ROWS_NUMBER: ${CACHED_ASSETS_MAX_CLEANED_ROWS_NUMBER-10000} PARQUET_METADATA_STORAGE_DIRECTORY: ${PARQUET_METADATA_STORAGE_DIRECTORY-/parquet_metadata} ROWS_INDEX_MAX_ARROW_DATA_IN_MEMORY: ${ROWS_INDEX_MAX_ARROW_DATA_IN_MEMORY-300_000_000} # prometheus @@ -135,10 +131,6 @@ services: environment: CACHED_ASSETS_BASE_URL: "http://localhost:${PORT_REVERSE_PROXY-8000}/cached-assets" # hard-coded to work with the reverse-proxy CACHED_ASSETS_STORAGE_DIRECTORY: ${CACHED_ASSETS_STORAGE_DIRECTORY-/cached-assets} - CACHED_ASSETS_CLEAN_CACHE_PROBA: ${CACHED_ASSETS_CLEAN_CACHE_PROBA-0.05} - CACHED_ASSETS_KEEP_FIRST_ROWS_NUMBER: ${CACHED_ASSETS_KEEP_FIRST_ROWS_NUMBER-100} - CACHED_ASSETS_KEEP_MOST_RECENT_ROWS_NUMBER: ${CACHED_ASSETS_KEEP_MOST_RECENT_ROWS_NUMBER-200} - CACHED_ASSETS_MAX_CLEANED_ROWS_NUMBER: ${CACHED_ASSETS_MAX_CLEANED_ROWS_NUMBER-10000} DUCKDB_INDEX_CACHE_DIRECTORY: ${DUCKDB_INDEX_CACHE_DIRECTORY-/duckdb-index} # prometheus PROMETHEUS_MULTIPROC_DIR: ${PROMETHEUS_MULTIPROC_DIR-} From b1b8d39f44e5f95abacde039e969572c1a41b063 Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Mon, 16 Oct 2023 15:19:55 -0400 Subject: [PATCH 2/6] fix style --- services/rows/tests/routes/test_rows.py | 4 +++- .../tests/job_runners/split/test_first_rows_from_streaming.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/services/rows/tests/routes/test_rows.py b/services/rows/tests/routes/test_rows.py index 1d9afc69ed..70c551e19e 100644 --- a/services/rows/tests/routes/test_rows.py +++ b/services/rows/tests/routes/test_rows.py @@ -537,7 +537,9 @@ def test_create_response_with_image( ] body = ( - conn.Object(bucket_name, "cached-assets/ds_image/revision/default/train/0/image/image.jpg").get()["Body"].read() + conn.Object(bucket_name, "cached-assets/ds_image/revision/default/train/0/image/image.jpg") + .get()["Body"] + .read() ) assert body is not None diff --git a/services/worker/tests/job_runners/split/test_first_rows_from_streaming.py b/services/worker/tests/job_runners/split/test_first_rows_from_streaming.py index b3a493b08a..b93cbd8aae 100644 --- a/services/worker/tests/job_runners/split/test_first_rows_from_streaming.py +++ b/services/worker/tests/job_runners/split/test_first_rows_from_streaming.py @@ -90,7 +90,7 @@ def _get_job_runner( "type": SplitFirstRowsFromStreamingJobRunner.get_job_type(), "params": { "dataset": dataset, - "revision": "revision", + "revision": "main", "config": config, "split": split, }, From 6efa4e8a207cf15f1e02c225c23db599f9a325c7 Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Mon, 16 Oct 2023 16:42:21 -0400 Subject: [PATCH 3/6] add missing parameters --- services/rows/src/rows/routes/rows.py | 1 + services/search/src/search/routes/filter.py | 1 + services/search/src/search/routes/search.py | 23 +++++++++++---------- 3 files changed, 14 insertions(+), 
11 deletions(-) diff --git a/services/rows/src/rows/routes/rows.py b/services/rows/src/rows/routes/rows.py index 84d1477b1c..b958c9df61 100644 --- a/services/rows/src/rows/routes/rows.py +++ b/services/rows/src/rows/routes/rows.py @@ -119,6 +119,7 @@ async def rows_endpoint(request: Request) -> Response: with StepProfiler(method="rows_endpoint", step="transform to a list"): response = create_response( dataset=dataset, + revision=revision, config=config, split=split, cached_assets_base_url=cached_assets_base_url, diff --git a/services/search/src/search/routes/filter.py b/services/search/src/search/routes/filter.py index 11bb2e0033..58507f06db 100644 --- a/services/search/src/search/routes/filter.py +++ b/services/search/src/search/routes/filter.py @@ -160,6 +160,7 @@ async def filter_endpoint(request: Request) -> Response: with StepProfiler(method="filter_endpoint", step="create response"): response = create_response( dataset=dataset, + revision=revision, config=config, split=split, cached_assets_base_url=cached_assets_base_url, diff --git a/services/search/src/search/routes/search.py b/services/search/src/search/routes/search.py index 818d9d4690..6600865953 100644 --- a/services/search/src/search/routes/search.py +++ b/services/search/src/search/routes/search.py @@ -215,17 +215,18 @@ async def search_endpoint(request: Request) -> Response: else: features = Features.from_arrow_schema(pa_table.schema) response = create_response( - pa_table, - dataset, - config, - split, - cached_assets_base_url, - cached_assets_directory, - s3_client, - cached_assets_s3_folder_name, - offset, - features, - num_rows_total, + pa_table=pa_table, + dataset=dataset, + revision=revision, + config=config, + split=split, + cached_assets_base_url=cached_assets_base_url, + cached_assets_directory=cached_assets_directory, + s3_client=s3_client, + cached_assets_s3_folder_name=cached_assets_s3_folder_name, + offset=offset, + features=features, + num_rows_total=num_rows_total, ) with StepProfiler(method="search_endpoint", step="generate the OK response"): return get_json_ok_response(response, max_age=max_age_long, revision=revision) From dce7c47e6f23edffc9721e7b0c14e1f05f81884e Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Wed, 18 Oct 2023 16:08:15 -0400 Subject: [PATCH 4/6] merge with main - revision not None --- libs/libapi/src/libapi/duckdb.py | 4 ++-- libs/libapi/src/libapi/response.py | 3 +-- libs/libapi/src/libapi/rows_utils.py | 4 ++-- libs/libapi/src/libapi/utils.py | 2 +- libs/libcommon/src/libcommon/parquet_utils.py | 1 - libs/libcommon/src/libcommon/simple_cache.py | 5 +++-- libs/libcommon/src/libcommon/state.py | 2 +- libs/libcommon/src/libcommon/viewer_utils/asset.py | 11 ++++------- libs/libcommon/src/libcommon/viewer_utils/features.py | 6 +++--- services/rows/src/rows/routes/rows.py | 2 +- services/search/src/search/routes/search.py | 2 +- services/search/tests/routes/test_filter.py | 1 + .../job_runners/split/first_rows_from_parquet.py | 7 +++---- .../job_runners/split/first_rows_from_streaming.py | 6 +++--- 14 files changed, 26 insertions(+), 30 deletions(-) diff --git a/libs/libapi/src/libapi/duckdb.py b/libs/libapi/src/libapi/duckdb.py index ce69b1f48a..d3b8d2fa54 100644 --- a/libs/libapi/src/libapi/duckdb.py +++ b/libs/libapi/src/libapi/duckdb.py @@ -23,9 +23,9 @@ def get_index_file_location_and_download_if_missing( duckdb_index_file_directory: StrPath, dataset: str, + revision: str, config: str, split: str, - revision: Optional[str], filename: str, url: str, target_revision: str, @@ -56,7 +56,7 @@ 
def get_index_file_location_and_download_if_missing( def get_download_folder( - root_directory: StrPath, dataset: str, config: str, split: str, revision: Optional[str] + root_directory: StrPath, dataset: str, revision: str, config: str, split: str ) -> str: payload = (dataset, config, split, revision) hash_suffix = sha1(json.dumps(payload, sort_keys=True).encode(), usedforsecurity=False).hexdigest()[:8] diff --git a/libs/libapi/src/libapi/response.py b/libs/libapi/src/libapi/response.py index 459cbccbad..3a4f6c39e3 100644 --- a/libs/libapi/src/libapi/response.py +++ b/libs/libapi/src/libapi/response.py @@ -1,5 +1,4 @@ import logging -from typing import Optional import pyarrow as pa from datasets import Features @@ -16,6 +15,7 @@ def create_response( dataset: str, + revision: str, config: str, split: str, cached_assets_base_url: str, @@ -28,7 +28,6 @@ def create_response( unsupported_columns: list[str], num_rows_total: int, use_row_idx_column: bool = False, - revision: Optional[str] = None, ) -> PaginatedResponse: if set(pa_table.column_names).intersection(set(unsupported_columns)): raise RuntimeError( diff --git a/libs/libapi/src/libapi/rows_utils.py b/libs/libapi/src/libapi/rows_utils.py index 55b5125dfc..7193453fbb 100644 --- a/libs/libapi/src/libapi/rows_utils.py +++ b/libs/libapi/src/libapi/rows_utils.py @@ -14,13 +14,13 @@ def _transform_row( row_idx_and_row: tuple[int, Row], dataset: str, + revision: str, config: str, split: str, features: Features, storage_options: Union[DirectoryStorageOptions, S3StorageOptions], offset: int, row_idx_column: Optional[str], - revision: Optional[str] = None, ) -> Row: row_idx, row = row_idx_and_row transformed_row = { @@ -44,6 +44,7 @@ def _transform_row( def transform_rows( dataset: str, + revision: str, config: str, split: str, rows: list[Row], @@ -51,7 +52,6 @@ def transform_rows( storage_options: Union[DirectoryStorageOptions, S3StorageOptions], offset: int, row_idx_column: Optional[str], - revision: Optional[str] = None, ) -> list[Row]: fn = partial( _transform_row, diff --git a/libs/libapi/src/libapi/utils.py b/libs/libapi/src/libapi/utils.py index 1d406faf67..06a78ccf72 100644 --- a/libs/libapi/src/libapi/utils.py +++ b/libs/libapi/src/libapi/utils.py @@ -201,6 +201,7 @@ def get_cache_entry_from_steps( def to_rows_list( pa_table: pa.Table, dataset: str, + revision: str, config: str, split: str, offset: int, @@ -208,7 +209,6 @@ def to_rows_list( unsupported_columns: list[str], storage_options: Union[DirectoryStorageOptions, S3StorageOptions], row_idx_column: Optional[str] = None, - revision: Optional[str] = None, ) -> list[RowItem]: num_rows = pa_table.num_rows for idx, (column, feature) in enumerate(features.items()): diff --git a/libs/libcommon/src/libcommon/parquet_utils.py b/libs/libcommon/src/libcommon/parquet_utils.py index d2cf955145..a94f17ce88 100644 --- a/libs/libcommon/src/libcommon/parquet_utils.py +++ b/libs/libcommon/src/libcommon/parquet_utils.py @@ -255,7 +255,6 @@ def __init__( unsupported_features: list[FeatureType] = [], ): self.dataset = dataset - self.revision: Optional[str] = None self.config = config self.split = split self.processing_graph = processing_graph diff --git a/libs/libcommon/src/libcommon/simple_cache.py b/libs/libcommon/src/libcommon/simple_cache.py index bff8be20d9..7a797e04b7 100644 --- a/libs/libcommon/src/libcommon/simple_cache.py +++ b/libs/libcommon/src/libcommon/simple_cache.py @@ -317,8 +317,8 @@ def _clean_nested_mongo_object(obj: Any) -> Any: class CacheEntryWithoutContent(TypedDict): http_status: 
HTTPStatus + dataset_git_revision: str error_code: Optional[str] - dataset_git_revision: Optional[str] progress: Optional[float] job_runner_version: Optional[int] @@ -488,6 +488,7 @@ def get_response_with_details( CACHED_RESPONSE_NOT_FOUND = "CachedResponseNotFound" +DATASET_GIT_REVISION_NOT_FOUND = "dataset-git-revision-not-found" def get_response_or_missing_error( @@ -504,7 +505,7 @@ def get_response_or_missing_error( }, http_status=HTTPStatus.NOT_FOUND, error_code=CACHED_RESPONSE_NOT_FOUND, - dataset_git_revision=None, + dataset_git_revision=DATASET_GIT_REVISION_NOT_FOUND, job_runner_version=None, progress=None, details={}, diff --git a/libs/libcommon/src/libcommon/state.py b/libs/libcommon/src/libcommon/state.py index 1a5616302a..5fbe5efa13 100644 --- a/libs/libcommon/src/libcommon/state.py +++ b/libs/libcommon/src/libcommon/state.py @@ -68,7 +68,7 @@ def __post_init__(self) -> None: http_status=entry["http_status"], error_code=None if entry["error_code"] is pd.NA else entry["error_code"], job_runner_version=None if entry["job_runner_version"] is pd.NA else entry["job_runner_version"], - dataset_git_revision=None if entry["dataset_git_revision"] is pd.NA else entry["dataset_git_revision"], + dataset_git_revision=entry["dataset_git_revision"], updated_at=entry["updated_at"], progress=None if entry["progress"] is pd.NA else entry["progress"], ) diff --git a/libs/libcommon/src/libcommon/viewer_utils/asset.py b/libs/libcommon/src/libcommon/viewer_utils/asset.py index 0cab661caf..49bc6c51e0 100644 --- a/libs/libcommon/src/libcommon/viewer_utils/asset.py +++ b/libs/libcommon/src/libcommon/viewer_utils/asset.py @@ -29,10 +29,7 @@ def get_and_create_dir_path(assets_directory: StrPath, url_dir_path: str) -> Pat return dir_path -def get_url_dir_path( - dataset: str, config: str, split: str, row_idx: int, column: str, revision: Optional[str] = None -) -> str: - revision = "main" if revision is None else revision +def get_url_dir_path(dataset: str, revision: str, config: str, split: str, row_idx: int, column: str) -> str: return f"{dataset}/{revision}/{config}/{split}/{str(row_idx)}/{column}" @@ -75,6 +72,7 @@ def upload_asset_file( def create_asset_file( dataset: str, + revision: str, config: str, split: str, row_idx: int, @@ -82,7 +80,6 @@ def create_asset_file( filename: str, storage_options: Union[DirectoryStorageOptions, S3StorageOptions], fn: Callable[[Path, str, bool], SupportedSource], - revision: Optional[str] = None, ) -> SupportedSource: # get url dir path assets_base_url = storage_options.assets_base_url @@ -160,6 +157,7 @@ def save_audio( def create_image_file( dataset: str, + revision: str, config: str, split: str, row_idx: int, @@ -167,7 +165,6 @@ def create_image_file( filename: str, image: Image.Image, storage_options: DirectoryStorageOptions, - revision: Optional[str] = None, ) -> ImageSource: fn = partial(save_image, image=image) return cast( @@ -188,6 +185,7 @@ def create_image_file( def create_audio_file( dataset: str, + revision: str, config: str, split: str, row_idx: int, @@ -196,7 +194,6 @@ def create_audio_file( audio_file_extension: str, filename: str, storage_options: DirectoryStorageOptions, - revision: Optional[str] = None, ) -> list[AudioSource]: fn = partial(save_audio, audio_file_bytes=audio_file_bytes, audio_file_extension=audio_file_extension) return [ diff --git a/libs/libcommon/src/libcommon/viewer_utils/features.py b/libs/libcommon/src/libcommon/viewer_utils/features.py index 5e080498a5..0d4c917024 100644 --- 
a/libs/libcommon/src/libcommon/viewer_utils/features.py +++ b/libs/libcommon/src/libcommon/viewer_utils/features.py @@ -52,6 +52,7 @@ def append_hash_suffix(string: str, json_path: Optional[list[Union[str, int]]] = def image( dataset: str, + revision: str, config: str, split: str, row_idx: int, @@ -59,7 +60,6 @@ def image( featureName: str, storage_options: Union[DirectoryStorageOptions, S3StorageOptions], json_path: Optional[list[Union[str, int]]] = None, - revision: Optional[str] = None, ) -> Any: if value is None: return None @@ -101,6 +101,7 @@ def image( def audio( dataset: str, + revision: str, config: str, split: str, row_idx: int, @@ -108,7 +109,6 @@ def audio( featureName: str, storage_options: Union[DirectoryStorageOptions, S3StorageOptions], json_path: Optional[list[Union[str, int]]] = None, - revision: Optional[str] = None, ) -> Any: if value is None: return None @@ -173,6 +173,7 @@ def audio( def get_cell_value( dataset: str, + revision: str, config: str, split: str, row_idx: int, @@ -181,7 +182,6 @@ def get_cell_value( fieldType: Any, storage_options: Union[DirectoryStorageOptions, S3StorageOptions], json_path: Optional[list[Union[str, int]]] = None, - revision: Optional[str] = None, ) -> Any: # always allow None values in the cells if cell is None: diff --git a/services/rows/src/rows/routes/rows.py b/services/rows/src/rows/routes/rows.py index b958c9df61..8fd728c6bd 100644 --- a/services/rows/src/rows/routes/rows.py +++ b/services/rows/src/rows/routes/rows.py @@ -119,7 +119,7 @@ async def rows_endpoint(request: Request) -> Response: with StepProfiler(method="rows_endpoint", step="transform to a list"): response = create_response( dataset=dataset, - revision=revision, + revision=revision, # type: ignore config=config, split=split, cached_assets_base_url=cached_assets_base_url, diff --git a/services/search/src/search/routes/search.py b/services/search/src/search/routes/search.py index 6600865953..65603ad624 100644 --- a/services/search/src/search/routes/search.py +++ b/services/search/src/search/routes/search.py @@ -74,6 +74,7 @@ def full_text_search(index_file_location: str, query: str, offset: int, length: def create_response( pa_table: pa.Table, dataset: str, + revision: str, config: str, split: str, cached_assets_base_url: str, @@ -83,7 +84,6 @@ def create_response( offset: int, features: Features, num_rows_total: int, - revision: Optional[str] = None, ) -> PaginatedResponse: features_without_key = features.copy() features_without_key.pop(ROW_IDX_COLUMN, None) diff --git a/services/search/tests/routes/test_filter.py b/services/search/tests/routes/test_filter.py index c3c806485a..11718d32f0 100644 --- a/services/search/tests/routes/test_filter.py +++ b/services/search/tests/routes/test_filter.py @@ -94,6 +94,7 @@ def test_create_response(ds: Dataset, app_config: AppConfig, cached_assets_direc ) response = create_response( dataset=dataset, + revision="revision", config=config, split=split, cached_assets_base_url=app_config.cached_assets.base_url, diff --git a/services/worker/src/worker/job_runners/split/first_rows_from_parquet.py b/services/worker/src/worker/job_runners/split/first_rows_from_parquet.py index b70d84bad1..78aa69bd37 100644 --- a/services/worker/src/worker/job_runners/split/first_rows_from_parquet.py +++ b/services/worker/src/worker/job_runners/split/first_rows_from_parquet.py @@ -2,7 +2,6 @@ # Copyright 2022 The HuggingFace Authors. 
import logging -from typing import Optional from datasets import Audio, Features, Image from fsspec.implementations.http import HTTPFileSystem @@ -31,12 +30,12 @@ def transform_rows( dataset: str, + revision: str, config: str, split: str, rows: list[RowItem], features: Features, storage_options: S3StorageOptions, - revision: Optional[str] = None, ) -> list[Row]: return [ { @@ -59,6 +58,7 @@ def transform_rows( def compute_first_rows_response( dataset: str, + revision: str, config: str, split: str, storage_options: S3StorageOptions, @@ -68,7 +68,6 @@ def compute_first_rows_response( rows_min_number: int, columns_max_number: int, indexer: Indexer, - revision: Optional[str] = None, ) -> SplitFirstRowsResponse: logging.info(f"get first-rows for dataset={dataset} config={config} split={split}") @@ -217,6 +216,7 @@ def compute(self) -> CompleteJobResult: return CompleteJobResult( compute_first_rows_response( dataset=self.dataset, + revision=self.dataset_git_revision, config=self.config, split=self.split, storage_options=self.storage_options, @@ -226,6 +226,5 @@ def compute(self) -> CompleteJobResult: rows_min_number=self.first_rows_config.min_number, columns_max_number=self.first_rows_config.columns_max_number, indexer=self.indexer, - revision=self.job_info["params"]["revision"], ) ) diff --git a/services/worker/src/worker/job_runners/split/first_rows_from_streaming.py b/services/worker/src/worker/job_runners/split/first_rows_from_streaming.py index c01700270f..0fd401ce06 100644 --- a/services/worker/src/worker/job_runners/split/first_rows_from_streaming.py +++ b/services/worker/src/worker/job_runners/split/first_rows_from_streaming.py @@ -39,12 +39,12 @@ def transform_rows( dataset: str, + revision: str, config: str, split: str, rows: list[Row], features: Features, storage_options: S3StorageOptions, - revision: Optional[str] = None, ) -> list[Row]: return [ { @@ -67,6 +67,7 @@ def transform_rows( def compute_first_rows_response( dataset: str, + revision: str, config: str, split: str, storage_options: S3StorageOptions, @@ -77,7 +78,6 @@ def compute_first_rows_response( rows_min_number: int, columns_max_number: int, max_size_fallback: Optional[int] = None, - revision: Optional[str] = None, ) -> SplitFirstRowsResponse: """ Get the response of /first-rows for one specific split of a dataset from huggingface.co. 
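The thrust of this commit: `revision` becomes a required parameter on every hop between the job runner and the asset writer, so a missing revision now fails at the call site instead of silently defaulting to "main" inside `get_url_dir_path`. A minimal sketch of the resulting flow, with placeholder values ("abc123" stands in for `job_info["params"]["revision"]`; the helper body matches the signature at this point in the series, before PATCH 6 changes the path format):

    # Sketch only: how the now-mandatory revision reaches an asset URL after PATCH 4.
    def get_url_dir_path(dataset: str, revision: str, config: str, split: str, row_idx: int, column: str) -> str:
        # no more `revision = "main" if revision is None else revision` fallback
        return f"{dataset}/{revision}/{config}/{split}/{row_idx}/{column}"

    revision = "abc123"  # placeholder; job runners read this from job_info["params"]["revision"]
    print(get_url_dir_path("user/ds", revision, "default", "train", 0, "audio"))
    # -> user/ds/abc123/default/train/0/audio

Since the parameter is no longer `Optional`, type checking forces every caller (rows, search, filter, and both first-rows workers) to pass it explicitly; the interim `# type: ignore` in rows.py appears to cover the case where `revision` is only bound inside the `try` block, and PATCH 6 restructures the endpoint so it can be dropped.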
@@ -296,6 +296,7 @@ def compute(self) -> CompleteJobResult: return CompleteJobResult( compute_first_rows_response( dataset=self.dataset, + revision=self.dataset_git_revision, config=self.config, split=self.split, storage_options=self.storage_options, @@ -305,6 +306,5 @@ def compute(self) -> CompleteJobResult: rows_max_number=self.first_rows_config.max_number, rows_min_number=self.first_rows_config.min_number, columns_max_number=self.first_rows_config.columns_max_number, - revision=self.job_info["params"]["revision"], ) ) From 1e40a2dc14f10b34e6a0674d20c7b7bb1444264a Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Wed, 18 Oct 2023 16:24:08 -0400 Subject: [PATCH 5/6] fix style and tests --- libs/libapi/src/libapi/duckdb.py | 4 +--- .../job_runners/dataset/dataset_job_runner.py | 2 +- services/worker/tests/fixtures/hub.py | 13 +++++++++---- 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/libs/libapi/src/libapi/duckdb.py b/libs/libapi/src/libapi/duckdb.py index d3b8d2fa54..45fb83b806 100644 --- a/libs/libapi/src/libapi/duckdb.py +++ b/libs/libapi/src/libapi/duckdb.py @@ -55,9 +55,7 @@ def get_index_file_location_and_download_if_missing( return index_file_location -def get_download_folder( - root_directory: StrPath, dataset: str, revision: str, config: str, split: str -) -> str: +def get_download_folder(root_directory: StrPath, dataset: str, revision: str, config: str, split: str) -> str: payload = (dataset, config, split, revision) hash_suffix = sha1(json.dumps(payload, sort_keys=True).encode(), usedforsecurity=False).hexdigest()[:8] subdirectory = "".join([c if re.match(r"[\w-]", c) else "-" for c in f"{dataset}-{hash_suffix}"]) diff --git a/services/worker/src/worker/job_runners/dataset/dataset_job_runner.py b/services/worker/src/worker/job_runners/dataset/dataset_job_runner.py index ac04d22dba..8e57e605a1 100644 --- a/services/worker/src/worker/job_runners/dataset/dataset_job_runner.py +++ b/services/worker/src/worker/job_runners/dataset/dataset_job_runner.py @@ -30,7 +30,7 @@ def __init__( if job_info["params"]["revision"] is None: raise ParameterMissingError("'revision' parameter is required") self.dataset = job_info["params"]["dataset"] - self.revision = job_info["params"]["revision"] + self.dataset_git_revision = job_info["params"]["revision"] class DatasetJobRunnerWithDatasetsCache(JobRunnerWithDatasetsCache, DatasetJobRunner): diff --git a/services/worker/tests/fixtures/hub.py b/services/worker/tests/fixtures/hub.py index 8cad6ed518..7e07201c3e 100644 --- a/services/worker/tests/fixtures/hub.py +++ b/services/worker/tests/fixtures/hub.py @@ -18,6 +18,7 @@ from huggingface_hub.utils._errors import hf_raise_for_status from ..constants import CI_HUB_ENDPOINT, CI_URL_TEMPLATE, CI_USER, CI_USER_TOKEN +from ..job_runners.utils import REVISION_NAME DATASET = "dataset" hf_api = HfApi(endpoint=CI_HUB_ENDPOINT) @@ -563,7 +564,7 @@ def get_AUDIO_rows(dataset: str) -> Any: { "col": [ { - "src": f"http://localhost/assets/{dataset}/main/{config}/{split}/0/col/audio.wav", + "src": f"http://localhost/assets/{dataset}/{REVISION_NAME}/{config}/{split}/0/col/audio.wav", "type": "audio/wav", }, ] @@ -581,7 +582,7 @@ def get_IMAGE_rows(dataset: str) -> Any: return [ { "col": { - "src": f"http://localhost/assets/{dataset}/main/{config}/{split}/0/col/image.jpg", + "src": f"http://localhost/assets/{dataset}/{REVISION_NAME}/{config}/{split}/0/col/image.jpg", "height": 480, "width": 640, }, @@ -600,12 +601,16 @@ def get_IMAGES_LIST_rows(dataset: str) -> Any: { "col": [ { - "src": 
f"http://localhost/assets/{dataset}/main/{config}/{split}/0/col/image-1d100e9.jpg", + "src": ( + f"http://localhost/assets/{dataset}/{REVISION_NAME}/{config}/{split}/0/col/image-1d100e9.jpg" + ), "height": 480, "width": 640, }, { - "src": f"http://localhost/assets/{dataset}/main/{config}/{split}/0/col/image-1d300ea.jpg", + "src": ( + f"http://localhost/assets/{dataset}/{REVISION_NAME}/{config}/{split}/0/col/image-1d300ea.jpg" + ), "height": 480, "width": 640, }, From 80068375d94791e177915ccf501db25df6228526 Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Thu, 19 Oct 2023 10:15:08 -0400 Subject: [PATCH 6/6] apply code review suggestions --- docs/source/openapi.json | 36 +++++++++--------- docs/source/rows.mdx | 2 +- .../src/libcommon/viewer_utils/asset.py | 3 +- .../tests/viewer_utils/test_assets.py | 10 +++-- .../tests/viewer_utils/test_features.py | 37 +++++++++---------- services/rows/src/rows/routes/rows.py | 36 +++++++++--------- services/rows/tests/routes/test_rows.py | 6 ++- services/worker/tests/fixtures/hub.py | 10 +++-- 8 files changed, 73 insertions(+), 67 deletions(-) diff --git a/docs/source/openapi.json b/docs/source/openapi.json index 63197d0a58..816cd15983 100644 --- a/docs/source/openapi.json +++ b/docs/source/openapi.json @@ -2270,12 +2270,12 @@ "row_idx": 0, "row": { "imageA": { - "src": "https://datasets-server.huggingface.co/assets/huggan/horse2zebra/main/default/train/0/imageA/image.jpg", + "src": "https://datasets-server.huggingface.co/assets/huggan/horse2zebra/--/main/--/default/train/0/imageA/image.jpg", "height": 256, "width": 256 }, "imageB": { - "src": "https://datasets-server.huggingface.co/assets/huggan/horse2zebra/main/default/train/0/imageB/image.jpg", + "src": "https://datasets-server.huggingface.co/assets/huggan/horse2zebra/--/main/--/default/train/0/imageB/image.jpg", "height": 256, "width": 256 } @@ -2286,12 +2286,12 @@ "row_idx": 1, "row": { "imageA": { - "src": "https://datasets-server.huggingface.co/assets/huggan/horse2zebra/main/default/train/1/imageA/image.jpg", + "src": "https://datasets-server.huggingface.co/assets/huggan/horse2zebra/--/main/--/default/train/1/imageA/image.jpg", "height": 256, "width": 256 }, "imageB": { - "src": "https://datasets-server.huggingface.co/assets/huggan/horse2zebra/main/default/train/1/imageB/image.jpg", + "src": "https://datasets-server.huggingface.co/assets/huggan/horse2zebra/--/main/--/default/train/1/imageB/image.jpg", "height": 256, "width": 256 } @@ -2302,12 +2302,12 @@ "row_idx": 2, "row": { "imageA": { - "src": "https://datasets-server.huggingface.co/assets/huggan/horse2zebra/main/default/train/2/imageA/image.jpg", + "src": "https://datasets-server.huggingface.co/assets/huggan/horse2zebra/--/main/--/default/train/2/imageA/image.jpg", "height": 256, "width": 256 }, "imageB": { - "src": "https://datasets-server.huggingface.co/assets/huggan/horse2zebra/main/default/train/2/imageB/image.jpg", + "src": "https://datasets-server.huggingface.co/assets/huggan/horse2zebra/--/main/--/default/train/2/imageB/image.jpg", "height": 256, "width": 256 } @@ -2390,7 +2390,7 @@ "id": "id10059_229vKIGbxrI_00001", "audio": [ { - "src": "https://datasets-server.huggingface.co/assets/asapp/slue/main/voxceleb/train/0/audio/audio.wav", + "src": "https://datasets-server.huggingface.co/assets/asapp/slue/--/main/--/voxceleb/train/0/audio/audio.wav", "type": "audio/wav" } ], @@ -2408,7 +2408,7 @@ "id": "id10059_229vKIGbxrI_00002", "audio": [ { - "src": 
"https://datasets-server.huggingface.co/assets/asapp/slue/main/voxceleb/train/1/audio/audio.wav", + "src": "https://datasets-server.huggingface.co/assets/asapp/slue/--/main/--/voxceleb/train/1/audio/audio.wav", "type": "audio/wav" } ], @@ -2426,7 +2426,7 @@ "id": "id10059_229vKIGbxrI_00003", "audio": [ { - "src": "https://datasets-server.huggingface.co/assets/asapp/slue/main/voxceleb/train/2/audio/audio.wav", + "src": "https://datasets-server.huggingface.co/assets/asapp/slue/--/main/--/voxceleb/train/2/audio/audio.wav", "type": "audio/wav" } ], @@ -2711,12 +2711,12 @@ "row_idx": 234, "row": { "imageA": { - "src": "https://datasets-server.huggingface.co/cached-assets/huggan/horse2zebra/main/default/train/234/imageA/image.jpg", + "src": "https://datasets-server.huggingface.co/cached-assets/huggan/horse2zebra/--/main/--/default/train/234/imageA/image.jpg", "height": 256, "width": 256 }, "imageB": { - "src": "https://datasets-server.huggingface.co/cached-assets/huggan/horse2zebra/main/default/train/234/imageB/image.jpg", + "src": "https://datasets-server.huggingface.co/cached-assets/huggan/horse2zebra/--/main/--/default/train/234/imageB/image.jpg", "height": 256, "width": 256 } @@ -2727,12 +2727,12 @@ "row_idx": 235, "row": { "imageA": { - "src": "https://datasets-server.huggingface.co/cached-assets/huggan/horse2zebra/main/default/train/235/imageA/image.jpg", + "src": "https://datasets-server.huggingface.co/cached-assets/huggan/horse2zebra/--/main/--/default/train/235/imageA/image.jpg", "height": 256, "width": 256 }, "imageB": { - "src": "https://datasets-server.huggingface.co/cached-assets/huggan/horse2zebra/main/default/train/235/imageB/image.jpg", + "src": "https://datasets-server.huggingface.co/cached-assets/huggan/horse2zebra/--/main/--/default/train/235/imageB/image.jpg", "height": 256, "width": 256 } @@ -2743,12 +2743,12 @@ "row_idx": 236, "row": { "imageA": { - "src": "https://datasets-server.huggingface.co/cached-assets/huggan/horse2zebra/main/default/train/236/imageA/image.jpg", + "src": "https://datasets-server.huggingface.co/cached-assets/huggan/horse2zebra/--/main/--/default/train/236/imageA/image.jpg", "height": 256, "width": 256 }, "imageB": { - "src": "https://datasets-server.huggingface.co/cached-assets/huggan/horse2zebra/main/default/train/236/imageB/image.jpg", + "src": "https://datasets-server.huggingface.co/cached-assets/huggan/horse2zebra/--/main/--/default/train/236/imageB/image.jpg", "height": 256, "width": 256 } @@ -3128,7 +3128,7 @@ "row_idx": 16, "row": { "image": { - "src": "https://datasets-server.huggingface.co/cached-assets/lambdalabs/pokemon-blip-captions/main/default/train/16/image/image.jpg", + "src": "https://datasets-server.huggingface.co/cached-assets/lambdalabs/pokemon-blip-captions/--/main/--/default/train/16/image/image.jpg", "height": 431, "width": 431 }, @@ -3140,7 +3140,7 @@ "row_idx": 54, "row": { "image": { - "src": "https://datasets-server.huggingface.co/cached-assets/lambdalabs/pokemon-blip-captions/main/default/train/54/image/image.jpg", + "src": "https://datasets-server.huggingface.co/cached-assets/lambdalabs/pokemon-blip-captions/--/main/--/default/train/54/image/image.jpg", "height": 1280, "width": 1280 }, @@ -3152,7 +3152,7 @@ "row_idx": 56, "row": { "image": { - "src": "https://datasets-server.huggingface.co/cached-assets/lambdalabs/pokemon-blip-captions/main/default/train/56/image/image.jpg", + "src": "https://datasets-server.huggingface.co/cached-assets/lambdalabs/pokemon-blip-captions/--/main/--/default/train/56/image/image.jpg", 
"height": 1280, "width": 1280 }, diff --git a/docs/source/rows.mdx b/docs/source/rows.mdx index 3ce08e12ef..c20dcccf16 100644 --- a/docs/source/rows.mdx +++ b/docs/source/rows.mdx @@ -174,7 +174,7 @@ Here is an example of image, from the first row of the cifar100 dataset: "row_idx": 0, "row": { "img": { - "src": "https://datasets-server.huggingface.co/cached-assets/cifar100/main/cifar100/train/0/img/image.jpg", + "src": "https://datasets-server.huggingface.co/cached-assets/cifar100/--/main/--/cifar100/train/0/img/image.jpg", "height": 32, "width": 32 }, diff --git a/libs/libcommon/src/libcommon/viewer_utils/asset.py b/libs/libcommon/src/libcommon/viewer_utils/asset.py index 49bc6c51e0..4b6dd725e2 100644 --- a/libs/libcommon/src/libcommon/viewer_utils/asset.py +++ b/libs/libcommon/src/libcommon/viewer_utils/asset.py @@ -14,6 +14,7 @@ from PIL import Image # type: ignore from pydub import AudioSegment # type:ignore +from libcommon.constants import DATASET_SEPARATOR from libcommon.s3_client import S3Client from libcommon.storage import StrPath, remove_dir from libcommon.storage_options import DirectoryStorageOptions, S3StorageOptions @@ -30,7 +31,7 @@ def get_and_create_dir_path(assets_directory: StrPath, url_dir_path: str) -> Pat def get_url_dir_path(dataset: str, revision: str, config: str, split: str, row_idx: int, column: str) -> str: - return f"{dataset}/{revision}/{config}/{split}/{str(row_idx)}/{column}" + return f"{dataset}/{DATASET_SEPARATOR}/{revision}/{DATASET_SEPARATOR}/{config}/{split}/{str(row_idx)}/{column}" def get_unique_path_for_filename(assets_directory: StrPath, filename: str) -> Path: diff --git a/libs/libcommon/tests/viewer_utils/test_assets.py b/libs/libcommon/tests/viewer_utils/test_assets.py index 1546e8a595..d3276a667f 100644 --- a/libs/libcommon/tests/viewer_utils/test_assets.py +++ b/libs/libcommon/tests/viewer_utils/test_assets.py @@ -54,11 +54,13 @@ def test_create_image_file_with_s3_storage(datasets: Mapping[str, Dataset], cach storage_options=storage_options, ) assert value == { - "src": "http://localhost/assets/dataset/revision/config/split/7/col/image.jpg", + "src": "http://localhost/assets/dataset/--/revision/--/config/split/7/col/image.jpg", "height": 480, "width": 640, } - body = conn.Object(bucket_name, "assets/dataset/revision/config/split/7/col/image.jpg").get()["Body"].read() + body = ( + conn.Object(bucket_name, "assets/dataset/--/revision/--/config/split/7/col/image.jpg").get()["Body"].read() + ) assert body is not None image = PILImage.open(io.BytesIO(body)) @@ -111,11 +113,11 @@ def test_create_audio_file_with_s3_storage(datasets: Mapping[str, Dataset], cach assert value == [ { - "src": "http://localhost/assets/dataset/revision/config/split/7/col/audio.wav", + "src": "http://localhost/assets/dataset/--/revision/--/config/split/7/col/audio.wav", "type": "audio/wav", }, ] audio_object = ( - conn.Object(bucket_name, "assets/dataset/revision/config/split/7/col/audio.wav").get()["Body"].read() + conn.Object(bucket_name, "assets/dataset/--/revision/--/config/split/7/col/audio.wav").get()["Body"].read() ) assert audio_object is not None diff --git a/libs/libcommon/tests/viewer_utils/test_features.py b/libs/libcommon/tests/viewer_utils/test_features.py index b68a0566af..87afe7e52b 100644 --- a/libs/libcommon/tests/viewer_utils/test_features.py +++ b/libs/libcommon/tests/viewer_utils/test_features.py @@ -108,6 +108,9 @@ def assert_output_has_valid_files(value: Any, storage_options: DirectoryStorageO assert os.path.getsize(path) > 0 +ASSETS_BASE_URL_SPLIT = 
"http://localhost/assets/dataset/--/revision/--/config/split" + + @pytest.mark.parametrize( "dataset_type,output_value,output_type", [ @@ -162,7 +165,7 @@ def assert_output_has_valid_files(value: Any, storage_options: DirectoryStorageO "audio", [ { - "src": "http://localhost/assets/dataset/revision/config/split/7/col/audio.wav", + "src": f"{ASSETS_BASE_URL_SPLIT}/7/col/audio.wav", "type": "audio/wav", } ], @@ -172,7 +175,7 @@ def assert_output_has_valid_files(value: Any, storage_options: DirectoryStorageO "audio_ogg", [ { - "src": "http://localhost/assets/dataset/revision/config/split/7/col/audio.wav", + "src": f"{ASSETS_BASE_URL_SPLIT}/7/col/audio.wav", "type": "audio/wav", } ], @@ -184,7 +187,7 @@ def assert_output_has_valid_files(value: Any, storage_options: DirectoryStorageO ( "image", { - "src": "http://localhost/assets/dataset/revision/config/split/7/col/image.jpg", + "src": f"{ASSETS_BASE_URL_SPLIT}/7/col/image.jpg", "height": 480, "width": 640, }, @@ -203,12 +206,12 @@ def assert_output_has_valid_files(value: Any, storage_options: DirectoryStorageO "images_list", [ { - "src": "http://localhost/assets/dataset/revision/config/split/7/col/image-1d100e9.jpg", + "src": f"{ASSETS_BASE_URL_SPLIT}/7/col/image-1d100e9.jpg", "height": 480, "width": 640, }, { - "src": "http://localhost/assets/dataset/revision/config/split/7/col/image-1d300ea.jpg", + "src": f"{ASSETS_BASE_URL_SPLIT}/7/col/image-1d300ea.jpg", "height": 480, "width": 640, }, @@ -220,13 +223,13 @@ def assert_output_has_valid_files(value: Any, storage_options: DirectoryStorageO [ [ { - "src": "http://localhost/assets/dataset/revision/config/split/7/col/audio-1d100e9.wav", + "src": f"{ASSETS_BASE_URL_SPLIT}/7/col/audio-1d100e9.wav", "type": "audio/wav", }, ], [ { - "src": "http://localhost/assets/dataset/revision/config/split/7/col/audio-1d300ea.wav", + "src": f"{ASSETS_BASE_URL_SPLIT}/7/col/audio-1d300ea.wav", "type": "audio/wav", }, ], @@ -237,12 +240,12 @@ def assert_output_has_valid_files(value: Any, storage_options: DirectoryStorageO "images_sequence", [ { - "src": "http://localhost/assets/dataset/revision/config/split/7/col/image-1d100e9.jpg", + "src": f"{ASSETS_BASE_URL_SPLIT}/7/col/image-1d100e9.jpg", "height": 480, "width": 640, }, { - "src": "http://localhost/assets/dataset/revision/config/split/7/col/image-1d300ea.jpg", + "src": f"{ASSETS_BASE_URL_SPLIT}/7/col/image-1d300ea.jpg", "height": 480, "width": 640, }, @@ -254,13 +257,13 @@ def assert_output_has_valid_files(value: Any, storage_options: DirectoryStorageO [ [ { - "src": "http://localhost/assets/dataset/revision/config/split/7/col/audio-1d100e9.wav", + "src": f"{ASSETS_BASE_URL_SPLIT}/7/col/audio-1d100e9.wav", "type": "audio/wav", }, ], [ { - "src": "http://localhost/assets/dataset/revision/config/split/7/col/audio-1d300ea.wav", + "src": f"{ASSETS_BASE_URL_SPLIT}/7/col/audio-1d300ea.wav", "type": "audio/wav", }, ], @@ -273,12 +276,12 @@ def assert_output_has_valid_files(value: Any, storage_options: DirectoryStorageO "a": 0, "b": [ { - "src": "http://localhost/assets/dataset/revision/config/split/7/col/image-89101db.jpg", + "src": f"{ASSETS_BASE_URL_SPLIT}/7/col/image-89101db.jpg", "height": 480, "width": 640, }, { - "src": "http://localhost/assets/dataset/revision/config/split/7/col/image-89301dc.jpg", + "src": f"{ASSETS_BASE_URL_SPLIT}/7/col/image-89301dc.jpg", "height": 480, "width": 640, }, @@ -287,17 +290,13 @@ def assert_output_has_valid_files(value: Any, storage_options: DirectoryStorageO "ca": [ [ { - "src": ( - 
"http://localhost/assets/dataset/revision/config/split/7/col/audio-18360330.wav" - ), + "src": f"{ASSETS_BASE_URL_SPLIT}/7/col/audio-18360330.wav", "type": "audio/wav", }, ], [ { - "src": ( - "http://localhost/assets/dataset/revision/config/split/7/col/audio-18380331.wav" - ), + "src": f"{ASSETS_BASE_URL_SPLIT}/7/col/audio-18380331.wav", "type": "audio/wav", }, ], diff --git a/services/rows/src/rows/routes/rows.py b/services/rows/src/rows/routes/rows.py index 8fd728c6bd..3e92cfd52f 100644 --- a/services/rows/src/rows/routes/rows.py +++ b/services/rows/src/rows/routes/rows.py @@ -97,6 +97,24 @@ async def rows_endpoint(request: Request) -> Response: split=split, ) revision = rows_index.revision + with StepProfiler(method="rows_endpoint", step="query the rows"): + pa_table = rows_index.query(offset=offset, length=length) + with StepProfiler(method="rows_endpoint", step="transform to a list"): + response = create_response( + dataset=dataset, + revision=revision, + config=config, + split=split, + cached_assets_base_url=cached_assets_base_url, + cached_assets_directory=cached_assets_directory, + s3_client=s3_client, + cached_assets_s3_folder_name=cached_assets_s3_folder_name, + pa_table=pa_table, + offset=offset, + features=rows_index.parquet_index.features, + unsupported_columns=rows_index.parquet_index.unsupported_columns, + num_rows_total=rows_index.parquet_index.num_rows_total, + ) except CachedArtifactNotFoundError: config_parquet_processing_steps = processing_graph.get_config_parquet_processing_steps() config_parquet_metadata_processing_steps = ( @@ -114,24 +132,6 @@ async def rows_endpoint(request: Request) -> Response: cache_max_days=cache_max_days, blocked_datasets=blocked_datasets, ) - with StepProfiler(method="rows_endpoint", step="query the rows"): - pa_table = rows_index.query(offset=offset, length=length) - with StepProfiler(method="rows_endpoint", step="transform to a list"): - response = create_response( - dataset=dataset, - revision=revision, # type: ignore - config=config, - split=split, - cached_assets_base_url=cached_assets_base_url, - cached_assets_directory=cached_assets_directory, - s3_client=s3_client, - cached_assets_s3_folder_name=cached_assets_s3_folder_name, - pa_table=pa_table, - offset=offset, - features=rows_index.parquet_index.features, - unsupported_columns=rows_index.parquet_index.unsupported_columns, - num_rows_total=rows_index.parquet_index.num_rows_total, - ) with StepProfiler(method="rows_endpoint", step="generate the OK response"): return get_json_ok_response(content=response, max_age=max_age_long, revision=revision) except CachedArtifactError as e: diff --git a/services/rows/tests/routes/test_rows.py b/services/rows/tests/routes/test_rows.py index 12ade7a347..3841d61840 100644 --- a/services/rows/tests/routes/test_rows.py +++ b/services/rows/tests/routes/test_rows.py @@ -536,7 +536,9 @@ def test_create_response_with_image( "row_idx": 0, "row": { "image": { - "src": "http://localhost/cached-assets/ds_image/revision/default/train/0/image/image.jpg", + "src": ( + "http://localhost/cached-assets/ds_image/--/revision/--/default/train/0/image/image.jpg" + ), "height": 480, "width": 640, } @@ -546,7 +548,7 @@ def test_create_response_with_image( ] body = ( - conn.Object(bucket_name, "cached-assets/ds_image/revision/default/train/0/image/image.jpg") + conn.Object(bucket_name, "cached-assets/ds_image/--/revision/--/default/train/0/image/image.jpg") .get()["Body"] .read() ) diff --git a/services/worker/tests/fixtures/hub.py b/services/worker/tests/fixtures/hub.py 
index 7e07201c3e..2eade41c1f 100644 --- a/services/worker/tests/fixtures/hub.py +++ b/services/worker/tests/fixtures/hub.py @@ -564,7 +564,7 @@ def get_AUDIO_rows(dataset: str) -> Any: { "col": [ { - "src": f"http://localhost/assets/{dataset}/{REVISION_NAME}/{config}/{split}/0/col/audio.wav", + "src": f"http://localhost/assets/{dataset}/--/{REVISION_NAME}/--/{config}/{split}/0/col/audio.wav", "type": "audio/wav", }, ] @@ -582,7 +582,7 @@ def get_IMAGE_rows(dataset: str) -> Any: return [ { "col": { - "src": f"http://localhost/assets/{dataset}/{REVISION_NAME}/{config}/{split}/0/col/image.jpg", + "src": f"http://localhost/assets/{dataset}/--/{REVISION_NAME}/--/{config}/{split}/0/col/image.jpg", "height": 480, "width": 640, }, @@ -594,6 +594,8 @@ def get_IMAGE_rows(dataset: str) -> Any: "col": [{"_type": "Image"}], } +ASSETS_BASE_URL = "http://localhost/assets" + def get_IMAGES_LIST_rows(dataset: str) -> Any: config, split = get_default_config_split() @@ -602,14 +604,14 @@ def get_IMAGES_LIST_rows(dataset: str) -> Any: "col": [ { "src": ( - f"http://localhost/assets/{dataset}/{REVISION_NAME}/{config}/{split}/0/col/image-1d100e9.jpg" + f"{ASSETS_BASE_URL}/{dataset}/--/{REVISION_NAME}/--/{config}/{split}/0/col/image-1d100e9.jpg" ), "height": 480, "width": 640, }, { "src": ( - f"http://localhost/assets/{dataset}/{REVISION_NAME}/{config}/{split}/0/col/image-1d300ea.jpg" + f"{ASSETS_BASE_URL}/{dataset}/--/{REVISION_NAME}/--/{config}/{split}/0/col/image-1d300ea.jpg" ), "height": 480, "width": 640,
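Taken together, the review suggestions settle the asset key layout: the revision segment is fenced on both sides by the dataset separator, so a dataset, config, or split named `main` can never be mistaken for the revision `main` when parsing or cleaning paths. A compact sketch of the final helper, assuming `DATASET_SEPARATOR = "--"` in `libcommon.constants` (the constant's value is inferred from the `/--/` segments in the updated tests and docs, not shown in this diff):

    DATASET_SEPARATOR = "--"  # assumed value, matching the "/--/" segments above

    def get_url_dir_path(dataset: str, revision: str, config: str, split: str, row_idx: int, column: str) -> str:
        # final form from PATCH 6: {dataset}/--/{revision}/--/{config}/{split}/{row_idx}/{column}
        return f"{dataset}/{DATASET_SEPARATOR}/{revision}/{DATASET_SEPARATOR}/{config}/{split}/{row_idx}/{column}"

    assert (
        get_url_dir_path("huggan/horse2zebra", "main", "default", "train", 0, "imageA")
        == "huggan/horse2zebra/--/main/--/default/train/0/imageA"
    )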