Skip to content

Commit

Permalink
sanitize src url (#2080)
Browse files Browse the repository at this point in the history
* add sanitize url test

* fix style

* fix style

* Fix poetry lock file
  • Loading branch information
AndreaFrancis authored Nov 21, 2023
1 parent c6cd8a9 commit f41c757
Show file tree
Hide file tree
Showing 5 changed files with 79 additions and 20 deletions.
25 changes: 24 additions & 1 deletion libs/libcommon/poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions libs/libcommon/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ pytest-cov = "^2.12.1"
ruff = "^0.1.3"
types-psutil = "^5.9.5"
types-pytz = "^2022.1.1"
validators = "^0.22.0"
boto3 = "^1.28.0"
moto = "^4.2.8"

Expand Down
40 changes: 28 additions & 12 deletions libs/libcommon/src/libcommon/viewer_utils/asset.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from pathlib import Path
from tempfile import NamedTemporaryFile
from typing import TypedDict
from urllib import parse

from PIL import Image # type: ignore
from pydub import AudioSegment # type:ignore
Expand All @@ -18,10 +19,6 @@
SUPPORTED_AUDIO_EXTENSION_TO_MEDIA_TYPE = {".wav": "audio/wav", ".mp3": "audio/mpeg"}


def get_url_dir_path(dataset: str, revision: str, config: str, split: str, row_idx: int, column: str) -> str:
    """Return the relative directory path that identifies one cell's assets.

    The path has the shape
    ``<dataset>/<SEP>/<revision>/<SEP>/<config>/<split>/<row_idx>/<column>``,
    where ``<SEP>`` is ``DATASET_SEPARATOR``. Every component is interpolated
    as-is; no URL escaping is applied.
    """
    dataset_part = f"{dataset}/{DATASET_SEPARATOR}"
    revision_part = f"{revision}/{DATASET_SEPARATOR}"
    cell_part = f"{config}/{split}/{str(row_idx)}/{column}"
    return f"{dataset_part}/{revision_part}/{cell_part}"


def delete_asset_dir(dataset: str, directory: StrPath) -> None:
    """Hand the ``dataset`` sub-directory of the resolved ``directory`` to ``remove_dir``."""
    remove_dir(Path(directory).resolve() / dataset)
Expand All @@ -38,6 +35,13 @@ class AudioSource(TypedDict):
type: str


def generate_asset_src(
    base_url: str, dataset: str, revision: str, config: str, split: str, row_idx: int, column: str, filename: str
) -> tuple[str, str]:
    """Build the percent-encoded directory path and the public URL of an asset.

    Args:
        base_url: URL prefix the directory path is appended to with a single "/".
        dataset: dataset name; percent-encoded.
        revision: dataset revision; percent-encoded.
        config: configuration name; percent-encoded.
        split: split name; percent-encoded.
        row_idx: row index; rendered with ``str``.
        column: column name; percent-encoded.
        filename: file name appended after the directory path, unchanged.

    Returns:
        A ``(dir_path, src)`` tuple where ``dir_path`` is
        ``<dataset>/<SEP>/<revision>/<SEP>/<config>/<split>/<row_idx>/<column>``
        (``<SEP>`` being ``DATASET_SEPARATOR``) and ``src`` is
        ``<base_url>/<dir_path>/<filename>``.
    """
    # ``parse.quote`` leaves "/" unescaped by default, so values that contain
    # slashes keep their path shape while characters such as "?", "#", "<" or
    # spaces are percent-encoded.
    quoted_dataset = parse.quote(dataset)
    # The revision is escaped like every other caller-supplied component so
    # that it cannot inject URL metacharacters either. For values made only of
    # unreserved characters and "/" this is a no-op.
    quoted_revision = parse.quote(revision)
    quoted_config = parse.quote(config)
    quoted_split = parse.quote(split)
    quoted_column = parse.quote(column)

    dir_path = (
        f"{quoted_dataset}/{DATASET_SEPARATOR}/{quoted_revision}/{DATASET_SEPARATOR}"
        f"/{quoted_config}/{quoted_split}/{str(row_idx)}/{quoted_column}"
    )
    # NOTE(review): ``filename`` is appended verbatim; callers visible here
    # build their storage key from the same raw filename, so it is left as-is.
    return dir_path, f"{base_url}/{dir_path}/{filename}"


def create_image_file(
dataset: str,
revision: str,
Expand All @@ -55,11 +59,17 @@ def create_image_file(
overwrite = public_assets_storage.overwrite
storage_client = public_assets_storage.storage_client

url_dir_path = get_url_dir_path(
dataset=dataset, revision=revision, config=config, split=split, row_idx=row_idx, column=column
dir_path, src = generate_asset_src(
base_url=assets_base_url,
dataset=dataset,
revision=revision,
config=config,
split=split,
row_idx=row_idx,
column=column,
filename=filename,
)
src = f"{assets_base_url}/{url_dir_path}/{filename}"
object_key = f"{url_dir_path}/{filename}"
object_key = f"{dir_path}/{filename}"
image_path = f"{storage_client.get_base_directory()}/{object_key}"

if overwrite or not storage_client.exists(object_key=object_key):
Expand All @@ -85,11 +95,17 @@ def create_audio_file(
overwrite = public_assets_storage.overwrite
storage_client = public_assets_storage.storage_client

url_dir_path = get_url_dir_path(
revision=revision, dataset=dataset, config=config, split=split, row_idx=row_idx, column=column
dir_path, src = generate_asset_src(
base_url=assets_base_url,
dataset=dataset,
revision=revision,
config=config,
split=split,
row_idx=row_idx,
column=column,
filename=filename,
)
src = f"{assets_base_url}/{url_dir_path}/{filename}"
object_key = f"{url_dir_path}/{filename}"
object_key = f"{dir_path}/{filename}"
audio_path = f"{storage_client.get_base_directory()}/{object_key}"
suffix = f".{filename.split('.')[-1]}"
if suffix not in SUPPORTED_AUDIO_EXTENSION_TO_MEDIA_TYPE:
Expand Down
7 changes: 1 addition & 6 deletions libs/libcommon/tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,7 @@
import pytest

from libcommon.exceptions import DatasetInBlockListError
from libcommon.utils import (
inputs_to_string,
is_image_url,
orjson_dumps,
raise_if_blocked,
)
from libcommon.utils import inputs_to_string, is_image_url, orjson_dumps, raise_if_blocked


@pytest.mark.parametrize(
Expand Down
26 changes: 25 additions & 1 deletion libs/libcommon/tests/viewer_utils/test_assets.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,13 @@

import pytest
import soundfile # type: ignore
import validators # type: ignore
from datasets import Dataset
from PIL import Image as PILImage # type: ignore

from libcommon.public_assets_storage import PublicAssetsStorage
from libcommon.storage_client import StorageClient
from libcommon.viewer_utils.asset import create_audio_file, create_image_file
from libcommon.viewer_utils.asset import create_audio_file, create_image_file, generate_asset_src

ASSETS_FOLDER = "assets"
ASSETS_BASE_URL = f"http://localhost/{ASSETS_FOLDER}"
Expand Down Expand Up @@ -83,3 +84,26 @@ def test_create_audio_file(datasets: Mapping[str, Dataset], public_assets_storag
]

assert public_assets_storage.storage_client.exists(audio_key)


@pytest.mark.parametrize(
    "dataset,config,split,column",
    [
        ("dataset", "config", "split", "column"),
        ("dataset", "config?<script>alert('XSS');</script>&", "split", "column?"),
    ],
)
def test_generate_asset_src(dataset: str, config: str, split: str, column: str) -> None:
    """The generated asset URL must validate as a URL, even for inputs with URL/HTML metacharacters."""
    # Only the URL (second element of the returned pair) is checked here.
    _dir_path, asset_url = generate_asset_src(
        base_url="https://datasets-server.huggingface.co/assets",
        dataset=dataset,
        revision="revision",
        config=config,
        split=split,
        row_idx=0,
        column=column,
        filename="image.jpg",
    )
    assert validators.url(asset_url)

0 comments on commit f41c757

Please sign in to comment.