From 40bb174b1ab44dbd9f77bce41c1bf8cd2d11a431 Mon Sep 17 00:00:00 2001
From: Andrea Soria
Date: Thu, 28 Sep 2023 12:18:30 -0400
Subject: [PATCH] cached assets on s3 for all datasets

---
 services/rows/src/rows/routes/rows.py       | 25 ++++++------------
 services/rows/tests/routes/test_rows.py     | 29 ++++++++++-----------
 services/search/src/search/routes/search.py | 25 ++++++------------
 3 files changed, 30 insertions(+), 49 deletions(-)

diff --git a/services/rows/src/rows/routes/rows.py b/services/rows/src/rows/routes/rows.py
index f722b2ee5a..9766978bdf 100644
--- a/services/rows/src/rows/routes/rows.py
+++ b/services/rows/src/rows/routes/rows.py
@@ -31,7 +31,7 @@
 from libcommon.s3_client import S3Client
 from libcommon.simple_cache import CachedArtifactError, CachedArtifactNotFoundError
 from libcommon.storage import StrPath
-from libcommon.storage_options import DirectoryStorageOptions, S3StorageOptions
+from libcommon.storage_options import S3StorageOptions
 from libcommon.utils import PaginatedResponse
 from libcommon.viewer_utils.asset import update_last_modified_date_of_rows_in_assets_dir
 from libcommon.viewer_utils.features import to_features_list
@@ -49,8 +49,6 @@
 # audio still has some errors when librosa is imported
 UNSUPPORTED_FEATURES = [Value("binary")]
 
-CACHED_ASSETS_S3_SUPPORTED_DATASETS: list[str] = ["asoria/image"]  # for testing
-
 
 def create_response(
     dataset: str,
@@ -70,20 +68,13 @@ def create_response(
         raise RuntimeError(
             "The pyarrow table contains unsupported columns. They should have been ignored in the row group reader."
         )
-    use_s3_storage = dataset in CACHED_ASSETS_S3_SUPPORTED_DATASETS
-    logging.debug(f"create response for {dataset=} {config=} {split=}- {use_s3_storage}")
-    storage_options = (
-        S3StorageOptions(
-            assets_base_url=cached_assets_base_url,
-            assets_directory=cached_assets_directory,
-            overwrite=False,
-            s3_client=s3_client,
-            s3_folder_name=cached_assets_s3_folder_name,
-        )
-        if use_s3_storage
-        else DirectoryStorageOptions(
-            assets_base_url=cached_assets_base_url, assets_directory=cached_assets_directory, overwrite=True
-        )
+    logging.debug(f"create response for {dataset=} {config=} {split=}")
+    storage_options = S3StorageOptions(
+        assets_base_url=cached_assets_base_url,
+        assets_directory=cached_assets_directory,
+        overwrite=False,
+        s3_client=s3_client,
+        s3_folder_name=cached_assets_s3_folder_name,
     )
     return PaginatedResponse(
         features=to_features_list(features),
diff --git a/services/rows/tests/routes/test_rows.py b/services/rows/tests/routes/test_rows.py
index 5b1e1000f1..da7edda998 100644
--- a/services/rows/tests/routes/test_rows.py
+++ b/services/rows/tests/routes/test_rows.py
@@ -507,21 +507,20 @@ def test_create_response_with_image(
     )
     folder_name = "cached-assets"
 
-    with patch("rows.routes.rows.CACHED_ASSETS_S3_SUPPORTED_DATASETS", [dataset]):
-        response = create_response(
-            dataset=dataset,
-            config=config,
-            split=split,
-            cached_assets_base_url=app_config.cached_assets.base_url,
-            s3_client=s3_client,
-            cached_assets_directory=cached_assets_directory,
-            cached_assets_s3_folder_name=folder_name,
-            pa_table=ds_image.data,
-            offset=0,
-            features=ds_image.features,
-            unsupported_columns=[],
-            num_rows_total=10,
-        )
+    response = create_response(
+        dataset=dataset,
+        config=config,
+        split=split,
+        cached_assets_base_url=app_config.cached_assets.base_url,
+        s3_client=s3_client,
+        cached_assets_directory=cached_assets_directory,
+        cached_assets_s3_folder_name=folder_name,
+        pa_table=ds_image.data,
+        offset=0,
+        features=ds_image.features,
+        unsupported_columns=[],
+        num_rows_total=10,
+    )
     assert response["features"] == [{"feature_idx": 0, "name": "image", "type": {"_type": "Image"}}]
     assert response["rows"] == [
         {
diff --git a/services/search/src/search/routes/search.py b/services/search/src/search/routes/search.py
index c970d2032b..bf4411d8ba 100644
--- a/services/search/src/search/routes/search.py
+++ b/services/search/src/search/routes/search.py
@@ -37,7 +37,7 @@
 from libcommon.prometheus import StepProfiler
 from libcommon.s3_client import S3Client
 from libcommon.storage import StrPath, init_dir
-from libcommon.storage_options import DirectoryStorageOptions, S3StorageOptions
+from libcommon.storage_options import S3StorageOptions
 from libcommon.utils import PaginatedResponse
 from libcommon.viewer_utils.features import (
     get_supported_unsupported_columns,
@@ -65,8 +65,6 @@
 REPO_TYPE = "dataset"
 HUB_DOWNLOAD_CACHE_FOLDER = "cache"
 
-CACHED_ASSETS_S3_SUPPORTED_DATASETS: list[str] = ["asoria/image"]  # for testing
-
 
 def get_download_folder(
     root_directory: StrPath, dataset: str, config: str, split: str, revision: Optional[str]
@@ -137,20 +135,13 @@ def create_response(
         unsupported_features=UNSUPPORTED_FEATURES,
     )
     pa_table = pa_table.drop(unsupported_columns)
-    use_s3_storage = dataset in CACHED_ASSETS_S3_SUPPORTED_DATASETS
-    logging.debug(f"create response for {dataset=} {config=} {split=}- {use_s3_storage}")
-    storage_options = (
-        S3StorageOptions(
-            assets_base_url=cached_assets_base_url,
-            assets_directory=cached_assets_directory,
-            overwrite=False,
-            s3_client=s3_client,
-            s3_folder_name=cached_assets_s3_folder_name,
-        )
-        if use_s3_storage
-        else DirectoryStorageOptions(
-            assets_base_url=cached_assets_base_url, assets_directory=cached_assets_directory, overwrite=True
-        )
+    logging.debug(f"create response for {dataset=} {config=} {split=}")
+    storage_options = S3StorageOptions(
+        assets_base_url=cached_assets_base_url,
+        assets_directory=cached_assets_directory,
+        overwrite=False,
+        s3_client=s3_client,
+        s3_folder_name=cached_assets_s3_folder_name,
     )
     return PaginatedResponse(
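
Reviewer note (appended commentary, not part of the patch to apply): the change collapses the allowlist-gated choice between local-directory and S3 cached assets into a single, unconditional S3 path, which is also why the test no longer needs to patch CACHED_ASSETS_S3_SUPPORTED_DATASETS. The sketch below illustrates the before/after selection logic outside the diff context. The DirectoryStorageOptions and S3StorageOptions dataclasses and the two helper functions are simplified stand-ins invented for this sketch (the real classes live in libcommon.storage_options); only the field names, the overwrite values, and the allowlist constant are taken from the hunks above.

# before_after_storage_options.py -- self-contained sketch, not repository code
from dataclasses import dataclass
from typing import Any


@dataclass
class DirectoryStorageOptions:  # stand-in for the libcommon class of the same name
    assets_base_url: str
    assets_directory: str
    overwrite: bool


@dataclass
class S3StorageOptions:  # stand-in for the libcommon class of the same name
    assets_base_url: str
    assets_directory: str
    overwrite: bool
    s3_client: Any
    s3_folder_name: str


# Before this patch: only allowlisted datasets had their cached assets on S3.
CACHED_ASSETS_S3_SUPPORTED_DATASETS: list[str] = ["asoria/image"]


def storage_options_before(dataset: str, base_url: str, directory: str, s3_client: Any, folder: str) -> Any:
    # Old behavior: branch on the allowlist, fall back to the local directory.
    if dataset in CACHED_ASSETS_S3_SUPPORTED_DATASETS:
        return S3StorageOptions(
            assets_base_url=base_url,
            assets_directory=directory,
            overwrite=False,
            s3_client=s3_client,
            s3_folder_name=folder,
        )
    return DirectoryStorageOptions(assets_base_url=base_url, assets_directory=directory, overwrite=True)


def storage_options_after(dataset: str, base_url: str, directory: str, s3_client: Any, folder: str) -> S3StorageOptions:
    # New behavior: every dataset takes the S3 path; the allowlist and the
    # DirectoryStorageOptions fallback are gone.
    return S3StorageOptions(
        assets_base_url=base_url,
        assets_directory=directory,
        overwrite=False,
        s3_client=s3_client,
        s3_folder_name=folder,
    )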