Skip to content

Commit

Permalink
assets to s3 (#1928)
Browse files Browse the repository at this point in the history
* Change to S3StorageOptions

* Env variables

* refactor s3 env variables

* fix envS3

* fix filter
  • Loading branch information
AndreaFrancis authored Oct 6, 2023
1 parent 6c5e7ef commit a9612ac
Show file tree
Hide file tree
Showing 27 changed files with 480 additions and 221 deletions.
2 changes: 2 additions & 0 deletions chart/templates/_env/_envAssets.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,6 @@
value: "{{ include "assets.baseUrl" . }}"
- name: ASSETS_STORAGE_DIRECTORY
value: {{ .Values.assets.storageDirectory | quote }}
- name: ASSETS_S3_FOLDER_NAME
value: {{ .Values.assets.s3FolderName | quote }}
{{- end -}}
2 changes: 2 additions & 0 deletions chart/templates/_env/_envCachedAssets.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,6 @@
value: {{ .Values.cachedAssets.keepMostRecentRowsNumber | quote }}
- name: CACHED_ASSETS_MAX_CLEANED_ROWS_NUMBER
value: {{ .Values.cachedAssets.maxCleanedRowsNumber | quote }}
- name: CACHED_ASSETS_S3_FOLDER_NAME
value: {{ .Values.cachedAssets.s3FolderName | quote }}
{{- end -}}
Original file line number Diff line number Diff line change
@@ -1,14 +1,12 @@
# SPDX-License-Identifier: Apache-2.0
# Copyright 2023 The HuggingFace Authors.

{{- define "envCachedAssetsS3" -}}
- name: CACHED_ASSETS_S3_BUCKET
value: {{ .Values.cachedAssetsS3.bucket | quote }}
- name: CACHED_ASSETS_S3_REGION
value: {{ .Values.cachedAssetsS3.region | quote }}
- name: CACHED_ASSETS_S3_FOLDER_NAME
value: {{ .Values.cachedAssetsS3.folderName | quote }}
- name: CACHED_ASSETS_S3_ACCESS_KEY_ID
{{- define "envS3" -}}
- name: S3_BUCKET
value: {{ .Values.s3.bucket | quote }}
- name: S3_REGION
value: {{ .Values.s3.region | quote }}
- name: S3_ACCESS_KEY_ID
{{- if .Values.secrets.s3.accessKeyId.fromSecret }}
valueFrom:
secretKeyRef:
Expand All @@ -18,7 +16,7 @@
{{- else }}
value: {{ .Values.secrets.s3.accessKeyId.value | quote }}
{{- end }}
- name: CACHED_ASSETS_S3_SECRET_ACCESS_KEY
- name: S3_SECRET_ACCESS_KEY
{{- if .Values.secrets.s3.secretAccessKey.fromSecret }}
valueFrom:
secretKeyRef:
Expand Down
2 changes: 1 addition & 1 deletion chart/templates/services/rows/_container.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
imagePullPolicy: {{ .Values.images.pullPolicy }}
env:
{{ include "envCachedAssets" . | nindent 2 }}
{{ include "envCachedAssetsS3" . | nindent 2 }}
{{ include "envS3" . | nindent 2 }}
{{ include "envCache" . | nindent 2 }}
{{ include "envParquetMetadata" . | nindent 2 }}
{{ include "envQueue" . | nindent 2 }}
Expand Down
2 changes: 1 addition & 1 deletion chart/templates/services/search/_container.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
imagePullPolicy: {{ .Values.images.pullPolicy }}
env:
{{ include "envCachedAssets" . | nindent 2 }}
{{ include "envCachedAssetsS3" . | nindent 2 }}
{{ include "envS3" . | nindent 2 }}
{{ include "envCache" . | nindent 2 }}
{{ include "envQueue" . | nindent 2 }}
{{ include "envCommon" . | nindent 2 }}
Expand Down
1 change: 1 addition & 0 deletions chart/templates/worker/_container.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
imagePullPolicy: {{ .Values.images.pullPolicy }}
env:
{{ include "envAssets" . | nindent 2 }}
{{ include "envS3" . | nindent 2 }}
{{ include "envCache" . | nindent 2 }}
{{ include "envCommon" . | nindent 2 }}
{{ include "envDatasetsBased" . | nindent 2 }}
Expand Down
5 changes: 3 additions & 2 deletions chart/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -250,6 +250,7 @@ assets:
# baseUrl: "not used for now"
# Directory on the shared storage (audio files and images)
storageDirectory: "/storage/assets"
s3FolderName: "assets"

cachedAssets:
# base URL for the cached assets files. It should be set according to the datasets-server domain, e.g. https://datasets-server.huggingface.co/cached-assets
Expand All @@ -264,11 +265,11 @@ cachedAssets:
keepMostRecentRowsNumber: 200
# When cleaning the cached assets directory: maximum number of rows to discard.
maxCleanedRowsNumber: 10000
s3FolderName: "cached-assets"

cachedAssetsS3:
s3:
bucket: "hf-datasets-server-statics"
region: "us-east-1"
folderName: "cached-assets"

parquetMetadata:
# Directory on the shared storage (parquet metadata files used for random access in /rows)
Expand Down
28 changes: 0 additions & 28 deletions libs/libapi/src/libapi/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,34 +39,6 @@ def from_env(cls) -> "UvicornConfig":
API_MAX_AGE_SHORT = 10 # 10 seconds


# Defaults used when the corresponding CACHED_ASSETS_S3_* variable is not set.
CACHED_ASSETS_S3_BUCKET = "hf-datasets-server-statics"
CACHED_ASSETS_S3_ACCESS_KEY_ID = None
CACHED_ASSETS_S3_SECRET_ACCESS_KEY = None
CACHED_ASSETS_S3_REGION = "us-east-1"
CACHED_ASSETS_S3_FOLDER_NAME = "cached-assets"


@dataclass(frozen=True)
class CachedAssetsS3Config:
    """Immutable S3 settings for the cached assets.

    Every field falls back to the module-level ``CACHED_ASSETS_S3_*`` default.
    """

    bucket: str = CACHED_ASSETS_S3_BUCKET
    access_key_id: Optional[str] = CACHED_ASSETS_S3_ACCESS_KEY_ID
    secret_access_key: Optional[str] = CACHED_ASSETS_S3_SECRET_ACCESS_KEY
    region: str = CACHED_ASSETS_S3_REGION
    folder_name: str = CACHED_ASSETS_S3_FOLDER_NAME

    @classmethod
    def from_env(cls) -> "CachedAssetsS3Config":
        """Build the configuration from ``CACHED_ASSETS_S3_``-prefixed environment variables.

        Reads ``BUCKET``, ``ACCESS_KEY_ID``, ``SECRET_ACCESS_KEY``, ``REGION``
        and ``FOLDER_NAME`` under that prefix; a missing variable yields the
        matching module-level default.
        """
        env = Env(expand_vars=True)
        with env.prefixed("CACHED_ASSETS_S3_"):
            return cls(
                bucket=env.str(name="BUCKET", default=CACHED_ASSETS_S3_BUCKET),
                access_key_id=env.str(name="ACCESS_KEY_ID", default=CACHED_ASSETS_S3_ACCESS_KEY_ID),
                secret_access_key=env.str(name="SECRET_ACCESS_KEY", default=CACHED_ASSETS_S3_SECRET_ACCESS_KEY),
                region=env.str(name="REGION", default=CACHED_ASSETS_S3_REGION),
                folder_name=env.str(name="FOLDER_NAME", default=CACHED_ASSETS_S3_FOLDER_NAME),
            )


@dataclass(frozen=True)
class ApiConfig:
external_auth_url: Optional[str] = API_EXTERNAL_AUTH_URL # not documented
Expand Down
31 changes: 31 additions & 0 deletions libs/libcommon/src/libcommon/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,12 +42,14 @@

ASSETS_BASE_URL = "assets"
ASSETS_STORAGE_DIRECTORY = None
ASSETS_S3_FOLDER_NAME = "assets"


@dataclass(frozen=True)
class AssetsConfig:
base_url: str = ASSETS_BASE_URL
storage_directory: Optional[str] = ASSETS_STORAGE_DIRECTORY
s3_folder_name: str = ASSETS_S3_FOLDER_NAME

@classmethod
def from_env(cls) -> "AssetsConfig":
Expand All @@ -56,6 +58,32 @@ def from_env(cls) -> "AssetsConfig":
return cls(
base_url=env.str(name="BASE_URL", default=ASSETS_BASE_URL),
storage_directory=env.str(name="STORAGE_DIRECTORY", default=ASSETS_STORAGE_DIRECTORY),
s3_folder_name=env.str(name="S3_FOLDER_NAME", default=ASSETS_S3_FOLDER_NAME),
)


# Defaults used when the corresponding S3_* environment variable is not set.
S3_BUCKET = "hf-datasets-server-statics"
S3_ACCESS_KEY_ID = None
S3_SECRET_ACCESS_KEY = None
S3_REGION = "us-east-1"


@dataclass(frozen=True)
class S3Config:
    """Immutable S3 connection settings (bucket, credentials and region).

    Every field falls back to the module-level ``S3_*`` default. The folder
    name inside the bucket is not part of this configuration.
    """

    bucket: str = S3_BUCKET
    access_key_id: Optional[str] = S3_ACCESS_KEY_ID
    secret_access_key: Optional[str] = S3_SECRET_ACCESS_KEY
    region: str = S3_REGION

    @classmethod
    def from_env(cls) -> "S3Config":
        """Build the configuration from ``S3_``-prefixed environment variables.

        Reads ``S3_BUCKET``, ``S3_ACCESS_KEY_ID``, ``S3_SECRET_ACCESS_KEY`` and
        ``S3_REGION``; a missing variable yields the matching module-level
        default.
        """
        env = Env(expand_vars=True)
        with env.prefixed("S3_"):
            return cls(
                bucket=env.str(name="BUCKET", default=S3_BUCKET),
                access_key_id=env.str(name="ACCESS_KEY_ID", default=S3_ACCESS_KEY_ID),
                secret_access_key=env.str(name="SECRET_ACCESS_KEY", default=S3_SECRET_ACCESS_KEY),
                region=env.str(name="REGION", default=S3_REGION),
            )


Expand All @@ -65,6 +93,7 @@ def from_env(cls) -> "AssetsConfig":
CACHED_ASSETS_KEEP_FIRST_ROWS_NUMBER = 100
CACHED_ASSETS_KEEP_MOST_RECENT_ROWS_NUMBER = 200
CACHED_ASSETS_MAX_CLEANED_ROWS_NUMBER = 10_000
CACHED_ASSETS_S3_FOLDER_NAME = "cached-assets"


@dataclass(frozen=True)
Expand All @@ -75,6 +104,7 @@ class CachedAssetsConfig:
keep_first_rows_number: int = CACHED_ASSETS_KEEP_FIRST_ROWS_NUMBER
keep_most_recent_rows_number: int = CACHED_ASSETS_KEEP_MOST_RECENT_ROWS_NUMBER
max_cleaned_rows_number: int = CACHED_ASSETS_MAX_CLEANED_ROWS_NUMBER
s3_folder_name: str = CACHED_ASSETS_S3_FOLDER_NAME

@classmethod
def from_env(cls) -> "CachedAssetsConfig":
Expand All @@ -93,6 +123,7 @@ def from_env(cls) -> "CachedAssetsConfig":
max_cleaned_rows_number=env.float(
name="MAX_CLEAN_SAMPLE_SIZE", default=CACHED_ASSETS_MAX_CLEANED_ROWS_NUMBER
),
s3_folder_name=env.str(name="S3_FOLDER_NAME", default=CACHED_ASSETS_S3_FOLDER_NAME),
)


Expand Down
10 changes: 5 additions & 5 deletions services/rows/src/rows/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,10 +62,10 @@ def create_app_with_config(app_config: AppConfig) -> Starlette:
cache_resource = CacheMongoResource(database=app_config.cache.mongo_database, host=app_config.cache.mongo_url)
queue_resource = QueueMongoResource(database=app_config.queue.mongo_database, host=app_config.queue.mongo_url)
s3_client = S3Client(
aws_access_key_id=app_config.cached_assets_s3.access_key_id,
aws_secret_access_key=app_config.cached_assets_s3.secret_access_key,
region_name=app_config.cached_assets_s3.region,
bucket_name=app_config.cached_assets_s3.bucket,
aws_access_key_id=app_config.s3.access_key_id,
aws_secret_access_key=app_config.s3.secret_access_key,
region_name=app_config.s3.region,
bucket_name=app_config.s3.bucket,
)
resources: list[Resource] = [cache_resource, queue_resource]
if not cache_resource.is_available():
Expand All @@ -83,8 +83,8 @@ def create_app_with_config(app_config: AppConfig) -> Starlette:
processing_graph=processing_graph,
cached_assets_base_url=app_config.cached_assets.base_url,
cached_assets_directory=cached_assets_directory,
cached_assets_s3_folder_name=app_config.cached_assets.s3_folder_name,
s3_client=s3_client,
cached_assets_s3_folder_name=app_config.cached_assets_s3.folder_name,
parquet_metadata_directory=parquet_metadata_directory,
max_arrow_data_in_memory=app_config.rows_index.max_arrow_data_in_memory,
hf_endpoint=app_config.common.hf_endpoint,
Expand Down
13 changes: 7 additions & 6 deletions services/rows/src/rows/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

from dataclasses import dataclass, field

from libapi.config import ApiConfig, CachedAssetsS3Config
from libapi.config import ApiConfig
from libcommon.config import (
CacheConfig,
CachedAssetsConfig,
Expand All @@ -13,6 +13,7 @@
ProcessingGraphConfig,
QueueConfig,
RowsIndexConfig,
S3Config,
)


Expand All @@ -21,13 +22,13 @@ class AppConfig:
api: ApiConfig = field(default_factory=ApiConfig)
cache: CacheConfig = field(default_factory=CacheConfig)
cached_assets: CachedAssetsConfig = field(default_factory=CachedAssetsConfig)
cached_assets_s3: CachedAssetsS3Config = field(default_factory=CachedAssetsS3Config)
common: CommonConfig = field(default_factory=CommonConfig)
log: LogConfig = field(default_factory=LogConfig)
parquet_metadata: ParquetMetadataConfig = field(default_factory=ParquetMetadataConfig)
processing_graph: ProcessingGraphConfig = field(default_factory=ProcessingGraphConfig)
queue: QueueConfig = field(default_factory=QueueConfig)
rows_index: RowsIndexConfig = field(default_factory=RowsIndexConfig)
processing_graph: ProcessingGraphConfig = field(default_factory=ProcessingGraphConfig)
parquet_metadata: ParquetMetadataConfig = field(default_factory=ParquetMetadataConfig)
s3: S3Config = field(default_factory=S3Config)

@classmethod
def from_env(cls) -> "AppConfig":
Expand All @@ -36,11 +37,11 @@ def from_env(cls) -> "AppConfig":
api=ApiConfig.from_env(hf_endpoint=common_config.hf_endpoint),
cache=CacheConfig.from_env(),
cached_assets=CachedAssetsConfig.from_env(),
cached_assets_s3=CachedAssetsS3Config.from_env(),
common=common_config,
log=LogConfig.from_env(),
parquet_metadata=ParquetMetadataConfig.from_env(),
processing_graph=ProcessingGraphConfig.from_env(),
queue=QueueConfig.from_env(),
parquet_metadata=ParquetMetadataConfig.from_env(),
rows_index=RowsIndexConfig.from_env(),
s3=S3Config.from_env(),
)
10 changes: 5 additions & 5 deletions services/rows/tests/routes/test_rows.py
Original file line number Diff line number Diff line change
Expand Up @@ -460,10 +460,10 @@ def test_rows_index_query_with_empty_dataset(rows_index_with_empty_dataset: Rows

def test_create_response(ds: Dataset, app_config: AppConfig, cached_assets_directory: StrPath) -> None:
s3_client = S3Client(
region_name=app_config.cached_assets_s3.region,
aws_access_key_id=app_config.cached_assets_s3.access_key_id,
aws_secret_access_key=app_config.cached_assets_s3.secret_access_key,
bucket_name=app_config.cached_assets_s3.bucket,
region_name=app_config.s3.region,
aws_access_key_id=app_config.s3.access_key_id,
aws_secret_access_key=app_config.s3.secret_access_key,
bucket_name=app_config.s3.bucket,
)
response = create_response(
dataset="ds",
Expand All @@ -472,7 +472,7 @@ def test_create_response(ds: Dataset, app_config: AppConfig, cached_assets_direc
cached_assets_base_url=app_config.cached_assets.base_url,
cached_assets_directory=cached_assets_directory,
s3_client=s3_client,
cached_assets_s3_folder_name=app_config.cached_assets_s3.folder_name,
cached_assets_s3_folder_name=app_config.cached_assets.s3_folder_name,
pa_table=ds.data,
offset=0,
features=ds.features,
Expand Down
12 changes: 6 additions & 6 deletions services/search/src/search/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,10 +68,10 @@ def create_app_with_config(app_config: AppConfig) -> Starlette:
cache_resource = CacheMongoResource(database=app_config.cache.mongo_database, host=app_config.cache.mongo_url)
queue_resource = QueueMongoResource(database=app_config.queue.mongo_database, host=app_config.queue.mongo_url)
s3_client = S3Client(
aws_access_key_id=app_config.cached_assets_s3.access_key_id,
aws_secret_access_key=app_config.cached_assets_s3.secret_access_key,
region_name=app_config.cached_assets_s3.region,
bucket_name=app_config.cached_assets_s3.bucket,
aws_access_key_id=app_config.s3.access_key_id,
aws_secret_access_key=app_config.s3.secret_access_key,
region_name=app_config.s3.region,
bucket_name=app_config.s3.bucket,
)
resources: list[Resource] = [cache_resource, queue_resource]
if not cache_resource.is_available():
Expand All @@ -89,8 +89,8 @@ def create_app_with_config(app_config: AppConfig) -> Starlette:
duckdb_index_file_directory=duckdb_index_cache_directory,
cached_assets_base_url=app_config.cached_assets.base_url,
cached_assets_directory=cached_assets_directory,
cached_assets_s3_folder_name=app_config.cached_assets.s3_folder_name,
s3_client=s3_client,
cached_assets_s3_folder_name=app_config.cached_assets_s3.folder_name,
cache_max_days=app_config.cache.max_days,
target_revision=app_config.duckdb_index.target_revision,
hf_endpoint=app_config.common.hf_endpoint,
Expand All @@ -114,7 +114,7 @@ def create_app_with_config(app_config: AppConfig) -> Starlette:
cached_assets_base_url=app_config.cached_assets.base_url,
cached_assets_directory=cached_assets_directory,
s3_client=s3_client,
cached_assets_s3_folder_name=app_config.cached_assets_s3.folder_name,
cached_assets_s3_folder_name=app_config.cached_assets.s3_folder_name,
hf_endpoint=app_config.common.hf_endpoint,
hf_token=app_config.common.hf_token,
blocked_datasets=app_config.common.blocked_datasets,
Expand Down
7 changes: 4 additions & 3 deletions services/search/src/search/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,15 @@
from typing import Optional

from environs import Env
from libapi.config import ApiConfig, CachedAssetsS3Config
from libapi.config import ApiConfig
from libcommon.config import (
CacheConfig,
CachedAssetsConfig,
CommonConfig,
LogConfig,
ProcessingGraphConfig,
QueueConfig,
S3Config,
)

DUCKDB_INDEX_CACHE_DIRECTORY = None
Expand All @@ -39,24 +40,24 @@ class AppConfig:
api: ApiConfig = field(default_factory=ApiConfig)
cached_assets: CachedAssetsConfig = field(default_factory=CachedAssetsConfig)
cache: CacheConfig = field(default_factory=CacheConfig)
cached_assets_s3: CachedAssetsS3Config = field(default_factory=CachedAssetsS3Config)
common: CommonConfig = field(default_factory=CommonConfig)
log: LogConfig = field(default_factory=LogConfig)
queue: QueueConfig = field(default_factory=QueueConfig)
processing_graph: ProcessingGraphConfig = field(default_factory=ProcessingGraphConfig)
duckdb_index: DuckDbIndexConfig = field(default_factory=DuckDbIndexConfig)
s3: S3Config = field(default_factory=S3Config)

@classmethod
def from_env(cls) -> "AppConfig":
common_config = CommonConfig.from_env()
return cls(
common=common_config,
cached_assets=CachedAssetsConfig.from_env(),
cached_assets_s3=CachedAssetsS3Config.from_env(),
cache=CacheConfig.from_env(),
log=LogConfig.from_env(),
processing_graph=ProcessingGraphConfig.from_env(),
queue=QueueConfig.from_env(),
api=ApiConfig.from_env(hf_endpoint=common_config.hf_endpoint),
duckdb_index=DuckDbIndexConfig.from_env(),
s3=S3Config.from_env(),
)
10 changes: 5 additions & 5 deletions services/search/tests/routes/test_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,10 +87,10 @@ def test_create_response(ds: Dataset, app_config: AppConfig, cached_assets_direc
}
)
s3_client = S3Client(
region_name=app_config.cached_assets_s3.region,
aws_access_key_id=app_config.cached_assets_s3.access_key_id,
aws_secret_access_key=app_config.cached_assets_s3.secret_access_key,
bucket_name=app_config.cached_assets_s3.bucket,
region_name=app_config.s3.region,
aws_access_key_id=app_config.s3.access_key_id,
aws_secret_access_key=app_config.s3.secret_access_key,
bucket_name=app_config.s3.bucket,
)
response = create_response(
dataset=dataset,
Expand All @@ -99,7 +99,7 @@ def test_create_response(ds: Dataset, app_config: AppConfig, cached_assets_direc
cached_assets_base_url=app_config.cached_assets.base_url,
cached_assets_directory=cached_assets_directory,
s3_client=s3_client,
cached_assets_s3_folder_name=app_config.cached_assets_s3.folder_name,
cached_assets_s3_folder_name=app_config.cached_assets.s3_folder_name,
pa_table=pa_table,
offset=0,
features=ds.features,
Expand Down
Loading

0 comments on commit a9612ac

Please sign in to comment.