Skip to content

Commit

Permalink
Mark tests that require librosa (#7044)
Browse files (browse the repository at this point in the history)
* Implement test require_librosa

* Mark tests that require librosa

* Mark tests in test_audiofolder with require_librosa

* Mark test in test_upstream_hub with require_librosa
  • Loading branch information
albertvillanova committed Aug 14, 2024
1 parent 9fd0929 commit d86ec2c
Show file tree
Hide file tree
Showing 6 changed files with 49 additions and 6 deletions.
22 changes: 22 additions & 0 deletions tests/features/test_audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from datasets.features import Audio, Features, Sequence, Value

from ..utils import (
require_librosa,
require_sndfile,
)

Expand Down Expand Up @@ -57,6 +58,7 @@ def test_audio_feature_type_to_arrow():
assert features.arrow_schema == pa.schema({"sequence_of_audios": pa.list_(Audio().pa_type)})


@require_librosa
@pytest.mark.parametrize(
"build_example",
[
Expand All @@ -81,6 +83,7 @@ def test_audio_feature_encode_example(shared_datadir, build_example):
assert decoded_example.keys() == {"path", "array", "sampling_rate"}


@require_librosa
@pytest.mark.parametrize(
"build_example",
[
Expand All @@ -101,6 +104,7 @@ def test_audio_feature_encode_example_pcm(shared_datadir, build_example):
assert decoded_example.keys() == {"path", "array", "sampling_rate"}


@require_librosa
@require_sndfile
def test_audio_decode_example(shared_datadir):
audio_path = str(shared_datadir / "test_audio_44100.wav")
Expand All @@ -115,6 +119,7 @@ def test_audio_decode_example(shared_datadir):
Audio(decode=False).decode_example(audio_path)


@require_librosa
@require_sndfile
def test_audio_resampling(shared_datadir):
audio_path = str(shared_datadir / "test_audio_44100.wav")
Expand All @@ -126,6 +131,7 @@ def test_audio_resampling(shared_datadir):
assert decoded_example["sampling_rate"] == 16000


@require_librosa
@require_sndfile
def test_audio_decode_example_mp3(shared_datadir):
audio_path = str(shared_datadir / "test_audio_44100.mp3")
Expand All @@ -137,6 +143,7 @@ def test_audio_decode_example_mp3(shared_datadir):
assert decoded_example["sampling_rate"] == 44100


@require_librosa
@require_sndfile
def test_audio_decode_example_opus(shared_datadir):
audio_path = str(shared_datadir / "test_audio_48000.opus")
Expand All @@ -148,6 +155,7 @@ def test_audio_decode_example_opus(shared_datadir):
assert decoded_example["sampling_rate"] == 48000


@require_librosa
@pytest.mark.parametrize("sampling_rate", [16_000, 48_000])
def test_audio_decode_example_pcm(shared_datadir, sampling_rate):
audio_path = str(shared_datadir / "test_audio_16000.pcm")
Expand All @@ -160,6 +168,7 @@ def test_audio_decode_example_pcm(shared_datadir, sampling_rate):
assert decoded_example["sampling_rate"] == sampling_rate


@require_librosa
@require_sndfile
def test_audio_resampling_mp3_different_sampling_rates(shared_datadir):
audio_path = str(shared_datadir / "test_audio_44100.mp3")
Expand All @@ -179,6 +188,7 @@ def test_audio_resampling_mp3_different_sampling_rates(shared_datadir):
assert decoded_example["sampling_rate"] == 48000


@require_librosa
@require_sndfile
def test_dataset_with_audio_feature(shared_datadir):
audio_path = str(shared_datadir / "test_audio_44100.wav")
Expand Down Expand Up @@ -206,6 +216,7 @@ def test_dataset_with_audio_feature(shared_datadir):
assert column[0]["sampling_rate"] == 44100


@require_librosa
@require_sndfile
def test_dataset_with_audio_feature_tar_wav(tar_wav_path):
audio_filename = "test_audio_44100.wav"
Expand Down Expand Up @@ -236,6 +247,7 @@ def test_dataset_with_audio_feature_tar_wav(tar_wav_path):
assert column[0]["sampling_rate"] == 44100


@require_librosa
@require_sndfile
def test_dataset_with_audio_feature_tar_mp3(tar_mp3_path):
audio_filename = "test_audio_44100.mp3"
Expand Down Expand Up @@ -300,6 +312,7 @@ def test_dataset_with_audio_feature_with_none():
assert item["nested"]["audio"] is None


@require_librosa
@require_sndfile
def test_resampling_at_loading_dataset_with_audio_feature(shared_datadir):
audio_path = str(shared_datadir / "test_audio_44100.wav")
Expand Down Expand Up @@ -327,6 +340,7 @@ def test_resampling_at_loading_dataset_with_audio_feature(shared_datadir):
assert column[0]["sampling_rate"] == 16000


@require_librosa
@require_sndfile
def test_resampling_at_loading_dataset_with_audio_feature_mp3(shared_datadir):
audio_path = str(shared_datadir / "test_audio_44100.mp3")
Expand Down Expand Up @@ -354,6 +368,7 @@ def test_resampling_at_loading_dataset_with_audio_feature_mp3(shared_datadir):
assert column[0]["sampling_rate"] == 16000


@require_librosa
@require_sndfile
def test_resampling_after_loading_dataset_with_audio_feature(shared_datadir):
audio_path = str(shared_datadir / "test_audio_44100.wav")
Expand Down Expand Up @@ -384,6 +399,7 @@ def test_resampling_after_loading_dataset_with_audio_feature(shared_datadir):
assert column[0]["sampling_rate"] == 16000


@require_librosa
@require_sndfile
def test_resampling_after_loading_dataset_with_audio_feature_mp3(shared_datadir):
audio_path = str(shared_datadir / "test_audio_44100.mp3")
Expand Down Expand Up @@ -414,6 +430,7 @@ def test_resampling_after_loading_dataset_with_audio_feature_mp3(shared_datadir)
assert column[0]["sampling_rate"] == 16000


@require_librosa
@pytest.mark.parametrize(
"build_data",
[
Expand All @@ -438,6 +455,7 @@ def test_dataset_cast_to_audio_features(shared_datadir, build_data):
assert item["audio"].keys() == {"path", "array", "sampling_rate"}


@require_librosa
def test_dataset_concatenate_audio_features(shared_datadir):
# we use a different data structure between 1 and 2 to make sure they are compatible with each other
audio_path = str(shared_datadir / "test_audio_44100.wav")
Expand All @@ -451,6 +469,7 @@ def test_dataset_concatenate_audio_features(shared_datadir):
assert concatenated_dataset[1]["audio"]["array"].shape == dset2[0]["audio"]["array"].shape


@require_librosa
def test_dataset_concatenate_nested_audio_features(shared_datadir):
# we use a different data structure between 1 and 2 to make sure they are compatible with each other
audio_path = str(shared_datadir / "test_audio_44100.wav")
Expand Down Expand Up @@ -493,6 +512,7 @@ def process_text(example):
assert item == {"audio": expected_audio, "text": "Hello World!"}


@require_librosa
@require_sndfile
def test_dataset_with_audio_feature_map_is_decoded(shared_datadir):
audio_path = str(shared_datadir / "test_audio_44100.wav")
Expand Down Expand Up @@ -522,6 +542,7 @@ def process_audio_sampling_rate_by_batch(batch):
assert item["double_sampling_rate"] == 88200


@require_librosa
@require_sndfile
def test_formatted_dataset_with_audio_feature(shared_datadir):
audio_path = str(shared_datadir / "test_audio_44100.wav")
Expand Down Expand Up @@ -585,6 +606,7 @@ def jsonl_audio_dataset_path(shared_datadir, tmp_path_factory):
return path


@require_librosa
@require_sndfile
@pytest.mark.parametrize("streaming", [False, True])
def test_load_dataset_with_audio_feature(streaming, jsonl_audio_dataset_path, shared_datadir):
Expand Down
10 changes: 8 additions & 2 deletions tests/packaged_modules/test_audiofolder.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import shutil
import textwrap

import librosa
import numpy as np
import pytest
import soundfile as sf
Expand All @@ -12,7 +11,7 @@
from datasets.download.streaming_download_manager import StreamingDownloadManager
from datasets.packaged_modules.audiofolder.audiofolder import AudioFolder, AudioFolderConfig

from ..utils import require_sndfile
from ..utils import require_librosa, require_sndfile


@pytest.fixture
Expand Down Expand Up @@ -195,6 +194,8 @@ def data_files_with_two_splits_and_metadata(request, tmp_path, audio_file):

@pytest.fixture
def data_files_with_zip_archives(tmp_path, audio_file):
import librosa

data_dir = tmp_path / "audiofolder_data_dir_with_zip_archives"
data_dir.mkdir(parents=True, exist_ok=True)
archive_dir = data_dir / "archive"
Expand Down Expand Up @@ -242,6 +243,7 @@ def test_config_raises_when_invalid_data_files(data_files) -> None:
_ = AudioFolderConfig(name="name", data_files=data_files)


@require_librosa
@require_sndfile
# check that labels are inferred correctly from dir names
def test_generate_examples_with_labels(data_files_with_labels_no_metadata, cache_dir):
Expand All @@ -256,6 +258,7 @@ def test_generate_examples_with_labels(data_files_with_labels_no_metadata, cache
assert dataset[1]["label"] == label_feature._str2int["uk"]


@require_librosa
@require_sndfile
@pytest.mark.parametrize("drop_metadata", [None, True, False])
@pytest.mark.parametrize("drop_labels", [None, True, False])
Expand Down Expand Up @@ -385,6 +388,7 @@ def test_generate_examples_with_metadata_that_misses_one_audio(
)


@require_librosa
@require_sndfile
@pytest.mark.parametrize("streaming", [False, True])
def test_data_files_with_metadata_and_single_split(streaming, cache_dir, data_files_with_one_split_and_metadata):
Expand All @@ -403,6 +407,7 @@ def test_data_files_with_metadata_and_single_split(streaming, cache_dir, data_fi
assert all(example["text"] is not None for example in dataset)


@require_librosa
@require_sndfile
@pytest.mark.parametrize("streaming", [False, True])
def test_data_files_with_metadata_and_multiple_splits(streaming, cache_dir, data_files_with_two_splits_and_metadata):
Expand All @@ -421,6 +426,7 @@ def test_data_files_with_metadata_and_multiple_splits(streaming, cache_dir, data
assert all(example["text"] is not None for example in dataset)


@require_librosa
@require_sndfile
@pytest.mark.parametrize("streaming", [False, True])
def test_data_files_with_metadata_and_archives(streaming, cache_dir, data_files_with_zip_archives):
Expand Down
3 changes: 2 additions & 1 deletion tests/packaged_modules/test_webdataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from datasets import Audio, DownloadManager, Features, Image, Sequence, Value
from datasets.packaged_modules.webdataset.webdataset import WebDataset

from ..utils import require_pil, require_sndfile, require_torch
from ..utils import require_librosa, require_pil, require_sndfile, require_torch


@pytest.fixture
Expand Down Expand Up @@ -159,6 +159,7 @@ def test_image_webdataset_missing_keys(image_wds_file):
assert decoded["txt"] is None


@require_librosa
@require_sndfile
def test_audio_webdataset(audio_wds_file):
data_files = {"train": [audio_wds_file]}
Expand Down
13 changes: 12 additions & 1 deletion tests/test_formatting.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,15 @@
)
from datasets.table import InMemoryTable

from .utils import require_jax, require_pil, require_polars, require_sndfile, require_tf, require_torch
from .utils import (
require_jax,
require_librosa,
require_pil,
require_polars,
require_sndfile,
require_tf,
require_torch,
)


class AnyArray:
Expand Down Expand Up @@ -300,6 +308,7 @@ def test_numpy_formatter_image(self):
self.assertEqual(batch["image"][0].dtype, np.uint8)
self.assertEqual(batch["image"][0].shape, (480, 640, 3))

@require_librosa
@require_sndfile
def test_numpy_formatter_audio(self):
pa_table = pa.table({"audio": [{"bytes": None, "path": str(AUDIO_PATH_1)}]})
Expand Down Expand Up @@ -419,6 +428,7 @@ def test_torch_formatter_image(self):
self.assertEqual(batch["image"][0].shape, (3, 480, 640))

@require_torch
@require_librosa
@require_sndfile
def test_torch_formatter_audio(self):
import torch
Expand Down Expand Up @@ -602,6 +612,7 @@ def test_jax_formatter_image(self):
self.assertEqual(batch["image"][0].shape, (480, 640, 3))

@require_jax
@require_librosa
@require_sndfile
def test_jax_formatter_audio(self):
import jax.numpy as jnp
Expand Down
6 changes: 4 additions & 2 deletions tests/test_upstream_hub.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,9 @@
)
from datasets.utils.file_utils import cached_path
from datasets.utils.hub import hf_dataset_url
from tests.fixtures.hub import CI_HUB_ENDPOINT, CI_HUB_USER, CI_HUB_USER_TOKEN
from tests.utils import for_all_test_methods, require_pil, require_sndfile, xfail_if_500_502_http_error

from .fixtures.hub import CI_HUB_ENDPOINT, CI_HUB_USER, CI_HUB_USER_TOKEN
from .utils import for_all_test_methods, require_librosa, require_pil, require_sndfile, xfail_if_500_502_http_error


pytestmark = pytest.mark.integration
Expand Down Expand Up @@ -383,6 +384,7 @@ def test_push_dataset_to_hub_custom_features(self, temporary_repo):
assert ds.features == hub_ds.features
assert ds[:] == hub_ds[:]

@require_librosa
@require_sndfile
def test_push_dataset_to_hub_custom_features_audio(self, temporary_repo):
audio_path = os.path.join(os.path.dirname(__file__), "features", "data", "test_audio_44100.wav")
Expand Down
1 change: 1 addition & 0 deletions tests/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ def parse_flag_from_env(key, default=False):
require_zstandard = pytest.mark.skipif(not config.ZSTANDARD_AVAILABLE, reason="test requires zstandard")

# Audio
require_librosa = pytest.mark.skipif(find_spec("librosa") is None, reason="test requires librosa")
require_sndfile = pytest.mark.skipif(
# On Windows and OS X, soundfile installs sndfile
find_spec("soundfile") is None or version.parse(importlib.metadata.version("soundfile")) < version.parse("0.12.0"),
Expand Down

0 comments on commit d86ec2c

Please sign in to comment.