Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

adding revision for assets creation path #1988

Merged
merged 9 commits into from
Oct 19, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 0 additions & 8 deletions chart/templates/_env/_envCachedAssets.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,6 @@
value: "{{ include "cachedAssets.baseUrl" . }}"
- name: CACHED_ASSETS_STORAGE_DIRECTORY
value: {{ .Values.cachedAssets.storageDirectory | quote }}
- name: CACHED_ASSETS_CLEAN_CACHE_PROBA
value: {{ .Values.cachedAssets.cleanCacheProba | quote }}
- name: CACHED_ASSETS_KEEP_FIRST_ROWS_NUMBER
value: {{ .Values.cachedAssets.keepFirstRowsNumber | quote }}
- name: CACHED_ASSETS_KEEP_MOST_RECENT_ROWS_NUMBER
value: {{ .Values.cachedAssets.keepMostRecentRowsNumber | quote }}
- name: CACHED_ASSETS_MAX_CLEANED_ROWS_NUMBER
value: {{ .Values.cachedAssets.maxCleanedRowsNumber | quote }}
- name: CACHED_ASSETS_S3_FOLDER_NAME
value: {{ .Values.cachedAssets.s3FolderName | quote }}
{{- end -}}
36 changes: 18 additions & 18 deletions docs/source/openapi.json
Original file line number Diff line number Diff line change
Expand Up @@ -2270,12 +2270,12 @@
"row_idx": 0,
"row": {
"imageA": {
"src": "https://datasets-server.huggingface.co/assets/huggan/horse2zebra/--/default/train/0/imageA/image.jpg",
"src": "https://datasets-server.huggingface.co/assets/huggan/horse2zebra/--/main/--/default/train/0/imageA/image.jpg",
"height": 256,
"width": 256
},
"imageB": {
"src": "https://datasets-server.huggingface.co/assets/huggan/horse2zebra/--/default/train/0/imageB/image.jpg",
"src": "https://datasets-server.huggingface.co/assets/huggan/horse2zebra/--/main/--/default/train/0/imageB/image.jpg",
"height": 256,
"width": 256
}
Expand All @@ -2286,12 +2286,12 @@
"row_idx": 1,
"row": {
"imageA": {
"src": "https://datasets-server.huggingface.co/assets/huggan/horse2zebra/--/default/train/1/imageA/image.jpg",
"src": "https://datasets-server.huggingface.co/assets/huggan/horse2zebra/--/main/--/default/train/1/imageA/image.jpg",
"height": 256,
"width": 256
},
"imageB": {
"src": "https://datasets-server.huggingface.co/assets/huggan/horse2zebra/--/default/train/1/imageB/image.jpg",
"src": "https://datasets-server.huggingface.co/assets/huggan/horse2zebra/--/main/--/default/train/1/imageB/image.jpg",
"height": 256,
"width": 256
}
Expand All @@ -2302,12 +2302,12 @@
"row_idx": 2,
"row": {
"imageA": {
"src": "https://datasets-server.huggingface.co/assets/huggan/horse2zebra/--/default/train/2/imageA/image.jpg",
"src": "https://datasets-server.huggingface.co/assets/huggan/horse2zebra/--/main/--/default/train/2/imageA/image.jpg",
"height": 256,
"width": 256
},
"imageB": {
"src": "https://datasets-server.huggingface.co/assets/huggan/horse2zebra/--/default/train/2/imageB/image.jpg",
"src": "https://datasets-server.huggingface.co/assets/huggan/horse2zebra/--/main/--/default/train/2/imageB/image.jpg",
"height": 256,
"width": 256
}
Expand Down Expand Up @@ -2390,7 +2390,7 @@
"id": "id10059_229vKIGbxrI_00001",
"audio": [
{
"src": "https://datasets-server.huggingface.co/assets/asapp/slue/--/voxceleb/train/0/audio/audio.wav",
"src": "https://datasets-server.huggingface.co/assets/asapp/slue/--/main/--/voxceleb/train/0/audio/audio.wav",
"type": "audio/wav"
}
],
Expand All @@ -2408,7 +2408,7 @@
"id": "id10059_229vKIGbxrI_00002",
"audio": [
{
"src": "https://datasets-server.huggingface.co/assets/asapp/slue/--/voxceleb/train/1/audio/audio.wav",
"src": "https://datasets-server.huggingface.co/assets/asapp/slue/--/main/--/voxceleb/train/1/audio/audio.wav",
"type": "audio/wav"
}
],
Expand All @@ -2426,7 +2426,7 @@
"id": "id10059_229vKIGbxrI_00003",
"audio": [
{
"src": "https://datasets-server.huggingface.co/assets/asapp/slue/--/voxceleb/train/2/audio/audio.wav",
"src": "https://datasets-server.huggingface.co/assets/asapp/slue/--/main/--/voxceleb/train/2/audio/audio.wav",
"type": "audio/wav"
}
],
Expand Down Expand Up @@ -2711,12 +2711,12 @@
"row_idx": 234,
"row": {
"imageA": {
"src": "https://datasets-server.huggingface.co/cached-assets/huggan/horse2zebra/--/default/train/234/imageA/image.jpg",
"src": "https://datasets-server.huggingface.co/cached-assets/huggan/horse2zebra/--/main/--/default/train/234/imageA/image.jpg",
"height": 256,
"width": 256
},
"imageB": {
"src": "https://datasets-server.huggingface.co/cached-assets/huggan/horse2zebra/--/default/train/234/imageB/image.jpg",
"src": "https://datasets-server.huggingface.co/cached-assets/huggan/horse2zebra/--/main/--/default/train/234/imageB/image.jpg",
"height": 256,
"width": 256
}
Expand All @@ -2727,12 +2727,12 @@
"row_idx": 235,
"row": {
"imageA": {
"src": "https://datasets-server.huggingface.co/cached-assets/huggan/horse2zebra/--/default/train/235/imageA/image.jpg",
"src": "https://datasets-server.huggingface.co/cached-assets/huggan/horse2zebra/--/main/--/default/train/235/imageA/image.jpg",
"height": 256,
"width": 256
},
"imageB": {
"src": "https://datasets-server.huggingface.co/cached-assets/huggan/horse2zebra/--/default/train/235/imageB/image.jpg",
"src": "https://datasets-server.huggingface.co/cached-assets/huggan/horse2zebra/--/main/--/default/train/235/imageB/image.jpg",
"height": 256,
"width": 256
}
Expand All @@ -2743,12 +2743,12 @@
"row_idx": 236,
"row": {
"imageA": {
"src": "https://datasets-server.huggingface.co/cached-assets/huggan/horse2zebra/--/default/train/236/imageA/image.jpg",
"src": "https://datasets-server.huggingface.co/cached-assets/huggan/horse2zebra/--/main/--/default/train/236/imageA/image.jpg",
"height": 256,
"width": 256
},
"imageB": {
"src": "https://datasets-server.huggingface.co/cached-assets/huggan/horse2zebra/--/default/train/236/imageB/image.jpg",
"src": "https://datasets-server.huggingface.co/cached-assets/huggan/horse2zebra/--/main/--/default/train/236/imageB/image.jpg",
"height": 256,
"width": 256
}
Expand Down Expand Up @@ -3128,7 +3128,7 @@
"row_idx": 16,
"row": {
"image": {
"src": "https://datasets-server.huggingface.co/cached-assets/lambdalabs/pokemon-blip-captions/--/default/train/16/image/image.jpg",
"src": "https://datasets-server.huggingface.co/cached-assets/lambdalabs/pokemon-blip-captions/--/main/--/default/train/16/image/image.jpg",
"height": 431,
"width": 431
},
Expand All @@ -3140,7 +3140,7 @@
"row_idx": 54,
"row": {
"image": {
"src": "https://datasets-server.huggingface.co/cached-assets/lambdalabs/pokemon-blip-captions/--/default/train/54/image/image.jpg",
"src": "https://datasets-server.huggingface.co/cached-assets/lambdalabs/pokemon-blip-captions/--/main/--/default/train/54/image/image.jpg",
"height": 1280,
"width": 1280
},
Expand All @@ -3152,7 +3152,7 @@
"row_idx": 56,
"row": {
"image": {
"src": "https://datasets-server.huggingface.co/cached-assets/lambdalabs/pokemon-blip-captions/--/default/train/56/image/image.jpg",
"src": "https://datasets-server.huggingface.co/cached-assets/lambdalabs/pokemon-blip-captions/--/main/--/default/train/56/image/image.jpg",
"height": 1280,
"width": 1280
},
Expand Down
2 changes: 1 addition & 1 deletion docs/source/rows.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,7 @@ Here is an example of image, from the first row of the cifar100 dataset:
"row_idx": 0,
"row": {
"img": {
"src": "https://datasets-server.huggingface.co/cached-assets/cifar100/--/cifar100/train/0/img/image.jpg",
"src": "https://datasets-server.huggingface.co/cached-assets/cifar100/--/main/--/cifar100/train/0/img/image.jpg",
"height": 32,
"width": 32
},
Expand Down
6 changes: 2 additions & 4 deletions libs/libapi/src/libapi/duckdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,9 @@
def get_index_file_location_and_download_if_missing(
duckdb_index_file_directory: StrPath,
dataset: str,
revision: str,
config: str,
split: str,
revision: Optional[str],
filename: str,
url: str,
target_revision: str,
Expand Down Expand Up @@ -55,9 +55,7 @@ def get_index_file_location_and_download_if_missing(
return index_file_location


def get_download_folder(
root_directory: StrPath, dataset: str, config: str, split: str, revision: Optional[str]
) -> str:
def get_download_folder(root_directory: StrPath, dataset: str, revision: str, config: str, split: str) -> str:
payload = (dataset, config, split, revision)
hash_suffix = sha1(json.dumps(payload, sort_keys=True).encode(), usedforsecurity=False).hexdigest()[:8]
subdirectory = "".join([c if re.match(r"[\w-]", c) else "-" for c in f"{dataset}-{hash_suffix}"])
Expand Down
2 changes: 2 additions & 0 deletions libs/libapi/src/libapi/response.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@

def create_response(
dataset: str,
revision: str,
config: str,
split: str,
cached_assets_base_url: str,
Expand Down Expand Up @@ -45,6 +46,7 @@ def create_response(
"rows": to_rows_list(
pa_table=pa_table,
dataset=dataset,
revision=revision,
config=config,
split=split,
storage_options=storage_options,
Expand Down
4 changes: 4 additions & 0 deletions libs/libapi/src/libapi/rows_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
def _transform_row(
row_idx_and_row: tuple[int, Row],
dataset: str,
revision: str,
config: str,
split: str,
features: Features,
Expand All @@ -25,6 +26,7 @@ def _transform_row(
transformed_row = {
featureName: get_cell_value(
dataset=dataset,
revision=revision,
config=config,
split=split,
row_idx=offset + row_idx if row_idx_column is None else row[row_idx_column],
Expand All @@ -42,6 +44,7 @@ def _transform_row(

def transform_rows(
dataset: str,
revision: str,
config: str,
split: str,
rows: list[Row],
Expand All @@ -53,6 +56,7 @@ def transform_rows(
fn = partial(
_transform_row,
dataset=dataset,
revision=revision,
config=config,
split=split,
features=features,
Expand Down
113 changes: 2 additions & 111 deletions libs/libapi/src/libapi/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,8 @@
# Copyright 2022 The HuggingFace Authors.

import logging
import os
import random
import shutil
from collections.abc import Callable, Coroutine
from http import HTTPStatus
from itertools import islice
from typing import Any, Optional, Union

import pyarrow as pa
Expand All @@ -21,10 +17,8 @@
CacheEntry,
get_best_response,
)
from libcommon.storage import StrPath
from libcommon.storage_options import DirectoryStorageOptions, S3StorageOptions
from libcommon.utils import Priority, RowItem, orjson_dumps
from libcommon.viewer_utils.asset import glob_rows_in_assets_dir
from starlette.requests import Request
from starlette.responses import JSONResponse, Response

Expand Down Expand Up @@ -207,6 +201,7 @@ def get_cache_entry_from_steps(
def to_rows_list(
pa_table: pa.Table,
dataset: str,
revision: str,
config: str,
split: str,
offset: int,
Expand All @@ -223,6 +218,7 @@ def to_rows_list(
try:
transformed_rows = transform_rows(
dataset=dataset,
revision=revision,
config=config,
split=split,
rows=pa_table.to_pylist(),
Expand All @@ -243,108 +239,3 @@ def to_rows_list(
}
for idx, row in enumerate(transformed_rows)
]


def _greater_or_equal(row_dir_name: str, row_idx: int, on_error: bool) -> bool:
try:
return int(row_dir_name) >= row_idx
except ValueError:
return on_error


def clean_cached_assets(
    dataset: str,
    cached_assets_directory: StrPath,
    keep_first_rows_number: int,
    keep_most_recent_rows_number: int,
    max_cleaned_rows_number: int,
) -> None:
    """
    The cached assets directory is cleaned to save disk space using this simple (?) heuristic:

    1. it takes a big sample of rows from the cache using glob (max `max_cleaned_rows_number`)
    2. it keeps the most recent ones (max `keep_most_recent_rows_number`)
    3. it keeps the rows below a certain index (max `keep_first_rows_number`)
    4. it discards the rest

    To check for the most recent rows, it looks at the "last modified time" of rows directories.
    This time is updated every time a row is accessed using `update_last_modified_date_of_rows_in_assets_dir()`.

    The cleaning is best-effort: a row directory that disappears between the listing and the
    "last modified time" lookup is simply skipped, and deletion errors are ignored.

    Args:
        dataset (`str`):
            Dataset name e.g 'squad' or 'lhoestq/demo1'.
            Rows are cleaned in any dataset configuration or split of this dataset.
        cached_assets_directory (`StrPath`):
            Directory containing the cached image and audio files.
        keep_first_rows_number (`int`):
            Keep the rows with an index below a certain number.
        keep_most_recent_rows_number (`int`):
            Keep the most recently accessed rows.
        max_cleaned_rows_number (`int`):
            Maximum number of rows to discard.

    Raises:
        ValueError: if any of `keep_first_rows_number`, `keep_most_recent_rows_number` or
            `max_cleaned_rows_number` is negative.
    """
    if keep_first_rows_number < 0 or keep_most_recent_rows_number < 0 or max_cleaned_rows_number < 0:
        raise ValueError(
            "Failed to run cached assets cleaning. Make sure all of keep_first_rows_number,"
            f" keep_most_recent_rows_number and max_cleaned_rows_number are set (got {keep_first_rows_number},"
            f" {keep_most_recent_rows_number} and {max_cleaned_rows_number})"
        )
    row_directories = glob_rows_in_assets_dir(dataset, cached_assets_directory)
    # Candidates are rows at or beyond `keep_first_rows_number`; directories whose name is not
    # an integer are candidates too (on_error=True). The sample is capped so that, once the
    # most recent rows are kept, at most `max_cleaned_rows_number` directories are deleted.
    row_directories_sample = list(
        islice(
            (
                row_dir
                for row_dir in row_directories
                if _greater_or_equal(row_dir.name, keep_first_rows_number, on_error=True)
            ),
            max_cleaned_rows_number + keep_most_recent_rows_number,
        )
    )
    if len(row_directories_sample) <= keep_most_recent_rows_number:
        # Everything sampled fits in the "most recent" budget: nothing to delete.
        return

    # Read each mtime once, tolerating directories removed in the meantime (for example by
    # another cleaning pass), so that one vanished directory does not abort the whole cleanup.
    dated_row_dirs = []
    for row_dir in row_directories_sample:
        try:
            dated_row_dirs.append((os.path.getmtime(row_dir), row_dir))
        except OSError:
            continue

    # Most recently modified first; the stable sort keeps the original order on ties.
    dated_row_dirs.sort(key=lambda dated: dated[0], reverse=True)
    for _, row_dir_to_delete in dated_row_dirs[keep_most_recent_rows_number:]:
        shutil.rmtree(row_dir_to_delete, ignore_errors=True)


def clean_cached_assets_randomly(
    clean_cache_proba: float,
    dataset: str,
    cached_assets_directory: StrPath,
    keep_first_rows_number: int,
    keep_most_recent_rows_number: int,
    max_cleaned_rows_number: int,
) -> None:
    """Clean the cached assets' directory on a random fraction of the calls.

    A random draw is compared to `clean_cache_proba`; the cleaning only runs when the draw
    is below it. When the three row-count parameters are all negative, they are considered
    unset and the cleaning is skipped with a debug log. Otherwise the call is delegated to
    `clean_cached_assets()` with the same arguments.

    Args:
        clean_cache_proba (`float`):
            Probability to clean the cached assets' directory.
        dataset (`str`):
            Dataset name e.g 'squad' or 'lhoestq/demo1'.
            Rows are cleaned in any dataset configuration or split of this dataset.
        cached_assets_directory (`StrPath`):
            Directory containing the cached image and audio files.
        keep_first_rows_number (`int`):
            Keep the rows with an index below a certain number.
        keep_most_recent_rows_number (`int`):
            Keep the most recently accessed rows.
        max_cleaned_rows_number (`int`):
            Maximum number of rows to discard.
    """
    # No need to clean on every call: only proceed when the draw falls under the probability.
    if not (random.random() < clean_cache_proba):  # nosec
        return

    row_limits = (keep_first_rows_number, keep_most_recent_rows_number, max_cleaned_rows_number)
    if all(limit < 0 for limit in row_limits):
        logging.debug(
            "Params keep_first_rows_number, keep_most_recent_rows_number and"
            " max_cleaned_rows_number are not set. Skipping cached assets cleaning."
        )
        return

    clean_cached_assets(
        dataset=dataset,
        cached_assets_directory=cached_assets_directory,
        keep_first_rows_number=keep_first_rows_number,
        keep_most_recent_rows_number=keep_most_recent_rows_number,
        max_cleaned_rows_number=max_cleaned_rows_number,
    )
Loading
Loading