Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Cached assets on S3 for all datasets #1882

Merged
1 commit merged on Sep 28, 2023 (source and target branch names were not captured)
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 8 additions & 17 deletions services/rows/src/rows/routes/rows.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
from libcommon.s3_client import S3Client
from libcommon.simple_cache import CachedArtifactError, CachedArtifactNotFoundError
from libcommon.storage import StrPath
from libcommon.storage_options import DirectoryStorageOptions, S3StorageOptions
from libcommon.storage_options import S3StorageOptions
from libcommon.utils import PaginatedResponse
from libcommon.viewer_utils.asset import update_last_modified_date_of_rows_in_assets_dir
from libcommon.viewer_utils.features import to_features_list
Expand All @@ -49,8 +49,6 @@
# audio still has some errors when librosa is imported
UNSUPPORTED_FEATURES = [Value("binary")]

CACHED_ASSETS_S3_SUPPORTED_DATASETS: list[str] = ["asoria/image"] # for testing


def create_response(
dataset: str,
Expand All @@ -70,20 +68,13 @@ def create_response(
raise RuntimeError(
"The pyarrow table contains unsupported columns. They should have been ignored in the row group reader."
)
use_s3_storage = dataset in CACHED_ASSETS_S3_SUPPORTED_DATASETS
logging.debug(f"create response for {dataset=} {config=} {split=}- {use_s3_storage}")
storage_options = (
S3StorageOptions(
assets_base_url=cached_assets_base_url,
assets_directory=cached_assets_directory,
overwrite=False,
s3_client=s3_client,
s3_folder_name=cached_assets_s3_folder_name,
)
if use_s3_storage
else DirectoryStorageOptions(
assets_base_url=cached_assets_base_url, assets_directory=cached_assets_directory, overwrite=True
)
logging.debug(f"create response for {dataset=} {config=} {split=}")
storage_options = S3StorageOptions(
assets_base_url=cached_assets_base_url,
assets_directory=cached_assets_directory,
overwrite=False,
s3_client=s3_client,
s3_folder_name=cached_assets_s3_folder_name,
)
return PaginatedResponse(
features=to_features_list(features),
Expand Down
29 changes: 14 additions & 15 deletions services/rows/tests/routes/test_rows.py
Original file line number Diff line number Diff line change
Expand Up @@ -507,21 +507,20 @@ def test_create_response_with_image(
)
folder_name = "cached-assets"

with patch("rows.routes.rows.CACHED_ASSETS_S3_SUPPORTED_DATASETS", [dataset]):
response = create_response(
dataset=dataset,
config=config,
split=split,
cached_assets_base_url=app_config.cached_assets.base_url,
s3_client=s3_client,
cached_assets_directory=cached_assets_directory,
cached_assets_s3_folder_name=folder_name,
pa_table=ds_image.data,
offset=0,
features=ds_image.features,
unsupported_columns=[],
num_rows_total=10,
)
response = create_response(
dataset=dataset,
config=config,
split=split,
cached_assets_base_url=app_config.cached_assets.base_url,
s3_client=s3_client,
cached_assets_directory=cached_assets_directory,
cached_assets_s3_folder_name=folder_name,
pa_table=ds_image.data,
offset=0,
features=ds_image.features,
unsupported_columns=[],
num_rows_total=10,
)
assert response["features"] == [{"feature_idx": 0, "name": "image", "type": {"_type": "Image"}}]
assert response["rows"] == [
{
Expand Down
25 changes: 8 additions & 17 deletions services/search/src/search/routes/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@
from libcommon.prometheus import StepProfiler
from libcommon.s3_client import S3Client
from libcommon.storage import StrPath, init_dir
from libcommon.storage_options import DirectoryStorageOptions, S3StorageOptions
from libcommon.storage_options import S3StorageOptions
from libcommon.utils import PaginatedResponse
from libcommon.viewer_utils.features import (
get_supported_unsupported_columns,
Expand Down Expand Up @@ -65,8 +65,6 @@
REPO_TYPE = "dataset"
HUB_DOWNLOAD_CACHE_FOLDER = "cache"

CACHED_ASSETS_S3_SUPPORTED_DATASETS: list[str] = ["asoria/image"] # for testing


def get_download_folder(
root_directory: StrPath, dataset: str, config: str, split: str, revision: Optional[str]
Expand Down Expand Up @@ -137,20 +135,13 @@ def create_response(
unsupported_features=UNSUPPORTED_FEATURES,
)
pa_table = pa_table.drop(unsupported_columns)
use_s3_storage = dataset in CACHED_ASSETS_S3_SUPPORTED_DATASETS
logging.debug(f"create response for {dataset=} {config=} {split=}- {use_s3_storage}")
storage_options = (
S3StorageOptions(
assets_base_url=cached_assets_base_url,
assets_directory=cached_assets_directory,
overwrite=False,
s3_client=s3_client,
s3_folder_name=cached_assets_s3_folder_name,
)
if use_s3_storage
else DirectoryStorageOptions(
assets_base_url=cached_assets_base_url, assets_directory=cached_assets_directory, overwrite=True
)
logging.debug(f"create response for {dataset=} {config=} {split=}")
storage_options = S3StorageOptions(
assets_base_url=cached_assets_base_url,
assets_directory=cached_assets_directory,
overwrite=False,
s3_client=s3_client,
s3_folder_name=cached_assets_s3_folder_name,
)

return PaginatedResponse(
Expand Down