diff --git a/noxfile.py b/noxfile.py index 6adb4f33d..188fe1281 100644 --- a/noxfile.py +++ b/noxfile.py @@ -157,7 +157,7 @@ def copy_url(url: str, path: str) -> None: @nox.session(name='test-data-tar-source') def test_data_tar_source(session) -> None: - """Make a .tar.gz file of just the 'generated' test data used to run tests on CI.""" + """Make a .tar.gz file of just the 'source' test data used to run tests.""" session.log(f"Making tarfile with source data: {SOURCE_TEST_DATA_TAR}") make_tarfile(SOURCE_TEST_DATA_TAR, SOURCE_TEST_DATA_DIRS) diff --git a/src/vak/common/constants.py b/src/vak/common/constants.py index e8aaad94e..fcf2ab7d6 100644 --- a/src/vak/common/constants.py +++ b/src/vak/common/constants.py @@ -42,3 +42,17 @@ # ---- output (default) file extensions. Using the `pathlib` name "suffix" ---- ANNOT_CSV_SUFFIX = ".annot.csv" NET_OUTPUT_SUFFIX = ".output.npz" + +# ---- the keys for loading the spectrogram matrix and time bins vector from an npz file +# TODO: replace this with vocalpy constants when we move to VocalPy +SPECT_KEY = "s" +TIMEBINS_KEY = "t" + +# TODO: replace this with vocalpy extension when we move to VocalPy +# ---- the extension used to save spectrograms in npz array files +# used by functions in :mod:`vak.prep.spectrogram_dataset.audio_helper` +SPECT_NPZ_EXTENSION = ".spect.npz" +SPECT_FORMAT_EXT_MAP = { + "npz": SPECT_NPZ_EXTENSION, + "mat": ".mat", +} diff --git a/src/vak/common/learncurve.py b/src/vak/common/learncurve.py index ee291c4f6..68ffe7b59 100644 --- a/src/vak/common/learncurve.py +++ b/src/vak/common/learncurve.py @@ -1,10 +1,10 @@ -def get_train_dur_replicate_split_name( +def get_train_dur_replicate_subset_name( train_dur: int, replicate_num: int ) -> str: - """Get name of a training set split for a learning curve, + """Get name of a training set subset for a learning curve, for a specified training duration and replicate number. 
- Used when preparing the training set splits for a learning curve, + Used when preparing the training set subsets for a learning curve, and when training models to generate the results for the curve. """ return f"train-dur-{float(train_dur)}-replicate-{int(replicate_num)}" diff --git a/src/vak/datasets/frame_classification/__init__.py b/src/vak/datasets/frame_classification/__init__.py index 98eda614b..29bc8b68c 100644 --- a/src/vak/datasets/frame_classification/__init__.py +++ b/src/vak/datasets/frame_classification/__init__.py @@ -1,6 +1,6 @@ -from . import constants +from . import constants, helper from .frames_dataset import FramesDataset from .metadata import Metadata from .window_dataset import WindowDataset -__all__ = ["constants", "Metadata", "FramesDataset", "WindowDataset"] +__all__ = ["constants", "helper", "Metadata", "FramesDataset", "WindowDataset"] diff --git a/src/vak/datasets/frame_classification/constants.py b/src/vak/datasets/frame_classification/constants.py index 0ec942562..6867b9b67 100644 --- a/src/vak/datasets/frame_classification/constants.py +++ b/src/vak/datasets/frame_classification/constants.py @@ -1,8 +1,8 @@ -FRAMES_ARRAY_EXT = ".frames.npy" -FRAMES_NPY_PATH_COL_NAME = "frames_npy_path" +FRAMES_PATH_COL_NAME = "frames_path" FRAME_LABELS_EXT = ".frame_labels.npy" FRAME_LABELS_NPY_PATH_COL_NAME = "frame_labels_npy_path" ANNOTATION_CSV_FILENAME = "y.csv" SAMPLE_IDS_ARRAY_FILENAME = "sample_ids.npy" INDS_IN_SAMPLE_ARRAY_FILENAME = "inds_in_sample.npy" WINDOW_INDS_ARRAY_FILENAME = "window_inds.npy" +FRAME_CLASSIFICATION_DATASET_AUDIO_FORMAT = "wav" diff --git a/src/vak/datasets/frame_classification/frames_dataset.py b/src/vak/datasets/frame_classification/frames_dataset.py index 6713ba4ca..ffe748b3d 100644 --- a/src/vak/datasets/frame_classification/frames_dataset.py +++ b/src/vak/datasets/frame_classification/frames_dataset.py @@ -1,3 +1,6 @@ +"""A dataset class used for neural network models with the +frame classification task, where 
the source data consists of audio signals +or spectrograms of varying lengths.""" from __future__ import annotations import pathlib @@ -7,8 +10,9 @@ import numpy.typing as npt import pandas as pd -from . import constants +from . import constants, helper from .metadata import Metadata +from ... import common class FramesDataset: @@ -20,32 +24,120 @@ class FramesDataset: Attributes ---------- - dataset_path - dataset_df - frame_dur : float - Duration of a single frame, in seconds. - duration : float - Total duration of the dataset. + dataset_path : pathlib.Path + Path to directory that represents a + frame classification dataset, + as created by + :func:`vak.prep.prep_frame_classification_dataset`. + split : str + The name of a split from the dataset, + one of {'train', 'val', 'test'}. + subset : str, optional + Name of subset to use. + If specified, this takes precedence over split. + Subsets are typically taken from the training data + for use when generating a learning curve. + dataset_df : pandas.DataFrame + A frame classification dataset, + represented as a :class:`pandas.DataFrame`. + This will be only the rows that correspond + to either ``subset`` or ``split`` from the + ``dataset_df`` that was passed in when + instantiating the class. + frames_paths : numpy.ndarray + Paths to npy files containing frames, + either spectrograms or audio signals + that are input to the model. + frame_labels_paths : numpy.ndarray + Paths to npy files containing vectors + with a label for each frame. + The targets for the outputs of the model. + input_type : str + The type of input to the neural network model. + One of {'audio', 'spect'}. + sample_ids : numpy.ndarray + Indexing vector representing which sample + from the dataset every frame belongs to. + inds_in_sample : numpy.ndarray + Indexing vector representing which index + within each sample from the dataset + that every frame belongs to. 
+ frame_dur: float + Duration of a frame, i.e., a single sample in audio + or a single timebin in a spectrogram. + item_transform : callable, optional + Transform applied to each item :math:`(x, y)` + returned by :meth:`FramesDataset.__getitem__`. """ def __init__( self, dataset_path: str | pathlib.Path, dataset_df: pd.DataFrame, + input_type: str, split: str, sample_ids: npt.NDArray, inds_in_sample: npt.NDArray, frame_dur: float, - input_type: str, + subset: str | None = None, item_transform: Callable | None = None, ): - self.dataset_path = pathlib.Path(dataset_path) + """Initialize a new instance of a FramesDataset. + Parameters + ---------- + dataset_path : pathlib.Path + Path to directory that represents a + frame classification dataset, + as created by + :func:`vak.prep.prep_frame_classification_dataset`. + dataset_df : pandas.DataFrame + A frame classification dataset, + represented as a :class:`pandas.DataFrame`. + input_type : str + The type of input to the neural network model. + One of {'audio', 'spect'}. + split : str + The name of a split from the dataset, + one of {'train', 'val', 'test'}. + sample_ids : numpy.ndarray + Indexing vector representing which sample + from the dataset every frame belongs to. + inds_in_sample : numpy.ndarray + Indexing vector representing which index + within each sample from the dataset + that every frame belongs to. + frame_dur: float + Duration of a frame, i.e., a single sample in audio + or a single timebin in a spectrogram. + subset : str, optional + Name of subset to use. + If specified, this takes precedence over split. + Subsets are typically taken from the training data + for use when generating a learning curve. + item_transform : callable, optional + Transform applied to each item :math:`(x, y)` + returned by :meth:`FramesDataset.__getitem__`. + """ + from ... 
import prep # avoid circular import, use for constants.INPUT_TYPES + if input_type not in prep.constants.INPUT_TYPES: + raise ValueError( + f"``input_type`` must be one of: {prep.constants.INPUT_TYPES}\n" + f"Value for ``input_type`` was: {input_type}" + ) + + self.dataset_path = pathlib.Path(dataset_path) self.split = split - dataset_df = dataset_df[dataset_df.split == split].copy() + self.subset = subset + # subset takes precedence over split, if specified + if subset: + dataset_df = dataset_df[dataset_df.subset == subset].copy() + else: + dataset_df = dataset_df[dataset_df.split == split].copy() self.dataset_df = dataset_df + self.input_type = input_type self.frames_paths = self.dataset_df[ - constants.FRAMES_NPY_PATH_COL_NAME + constants.FRAMES_PATH_COL_NAME ].values if split != "predict": self.frame_labels_paths = self.dataset_df[ @@ -53,16 +145,6 @@ def __init__( ].values else: self.frame_labels_paths = None - - if input_type == "audio": - self.source_paths = self.dataset_df["audio_path"].values - elif input_type == "spect": - self.source_paths = self.dataset_df["spect_path"].values - else: - raise ValueError( - f"Invalid `input_type`: {input_type}. Must be one of {{'audio', 'spect'}}." - ) - self.sample_ids = sample_ids self.inds_in_sample = inds_in_sample self.frame_dur = float(frame_dur) @@ -78,10 +160,20 @@ def shape(self): tmp_item = self.__getitem__(tmp_x_ind) return tmp_item["frames"].shape + def _load_frames(self, frames_path): + """Helper function that loads "frames", + the input to the frame classification model. + Loads audio or spectrogram, depending on + :attr:`self.input_type`. + This function assumes that audio is in wav format + and spectrograms are in npz files. 
+ """ + return helper.load_frames(frames_path, self.input_type) + def __getitem__(self, idx): - source_path = self.source_paths[idx] - frames = np.load(self.dataset_path / self.frames_paths[idx]) - item = {"frames": frames, "source_path": source_path} + frames_path = self.dataset_path / self.frames_paths[idx] + frames = self._load_frames(frames_path) + item = {"frames": frames, "frames_path": frames_path} if self.frame_labels_paths is not None: frame_labels = np.load( self.dataset_path / self.frame_labels_paths[idx] @@ -102,19 +194,34 @@ def from_dataset_path( cls, dataset_path: str | pathlib.Path, split: str = "val", + subset: str | None = None, item_transform: Callable | None = None, ): - """ + """Make a :class:`FramesDataset` instance, + given the path to a frame classification dataset. Parameters ---------- - dataset_path - split - item_transform + dataset_path : pathlib.Path + Path to directory that represents a + frame classification dataset, + as created by + :func:`vak.prep.prep_frame_classification_dataset`. + split : str + The name of a split from the dataset, + one of {'train', 'val', 'test'}. + subset : str, optional + Name of subset to use. + If specified, this takes precedence over split. + Subsets are typically taken from the training data + for use when generating a learning curve. + item_transform : callable, optional + Transform applied to each item :math:`(x, y)` + returned by :meth:`FramesDataset.__getitem__`. 
Returns ------- - + frames_dataset : FramesDataset """ dataset_path = pathlib.Path(dataset_path) metadata = Metadata.from_dataset_path(dataset_path) @@ -125,20 +232,26 @@ def from_dataset_path( dataset_df = pd.read_csv(dataset_csv_path) split_path = dataset_path / split - sample_ids_path = split_path / constants.SAMPLE_IDS_ARRAY_FILENAME + if subset: + sample_ids_path = split_path / helper.sample_ids_array_filename_for_subset(subset) + else: + sample_ids_path = split_path / constants.SAMPLE_IDS_ARRAY_FILENAME sample_ids = np.load(sample_ids_path) - inds_in_sample_path = ( - split_path / constants.INDS_IN_SAMPLE_ARRAY_FILENAME - ) + + if subset: + inds_in_sample_path = split_path / helper.inds_in_sample_array_filename_for_subset(subset) + else: + inds_in_sample_path = split_path / constants.INDS_IN_SAMPLE_ARRAY_FILENAME inds_in_sample = np.load(inds_in_sample_path) return cls( dataset_path, dataset_df, + input_type, split, sample_ids, inds_in_sample, frame_dur, - input_type, + subset, item_transform, ) diff --git a/src/vak/datasets/frame_classification/helper.py b/src/vak/datasets/frame_classification/helper.py new file mode 100644 index 000000000..2e9b1b4d2 --- /dev/null +++ b/src/vak/datasets/frame_classification/helper.py @@ -0,0 +1,37 @@ +"""Helper functions used with frame classification datasets.""" +from __future__ import annotations + +from . import constants +from ... 
import common + + +def sample_ids_array_filename_for_subset(subset: str) -> str: + """Returns name of sample IDs array file for a subset of the training data.""" + return constants.SAMPLE_IDS_ARRAY_FILENAME.replace( + '.npy', f'-{subset}.npy' + ) + + +def inds_in_sample_array_filename_for_subset(subset: str) -> str: + """Returns name of inds in sample array file for a subset of the training data.""" + return constants.INDS_IN_SAMPLE_ARRAY_FILENAME.replace( + '.npy', f'-{subset}.npy' + ) + + +def load_frames(frames_path, input_type): + """Helper function that loads "frames", + the input to the frame classification model. + Loads audio or spectrogram, depending on + ``input_type``. + This function assumes that audio is in wav format + and spectrograms are in npz files. + """ + if input_type == "audio": + frames, _ = common.constants.AUDIO_FORMAT_FUNC_MAP[ + constants.FRAME_CLASSIFICATION_DATASET_AUDIO_FORMAT + ](frames_path) + elif input_type == "spect": + spect_dict = common.files.spect.load(frames_path) + frames = spect_dict[common.constants.SPECT_KEY] + return frames diff --git a/src/vak/datasets/frame_classification/window_dataset.py b/src/vak/datasets/frame_classification/window_dataset.py index aa61d37ff..c4f309f94 100644 --- a/src/vak/datasets/frame_classification/window_dataset.py +++ b/src/vak/datasets/frame_classification/window_dataset.py @@ -1,3 +1,20 @@ +"""A dataset class used for neural network models with the +frame classification task, where the source data consists of audio signals +or spectrograms of varying lengths. + +Unlike :class:`vak.datasets.frame_classification.FramesDataset`, +this class does not return entire samples +from the source dataset. +Instead each paired sample :math:`(x_i, y_i)` +returned by this dataset class consists of +a window :math:`x_i` of fixed length +:math:`w` from the underlying data ``X`` of total length :math:`T`. 
+Each :math:`y_i` is a vector of the same size :math:`w`, containing +an integer class label for each *frame* in the window :math:`x_i`. +The entire dataset consists of some number of windows +:math:`I` determined by a ``stride`` parameter :math:`s`, +:math:`I = (T - w) / s`. +""" from __future__ import annotations import pathlib @@ -7,8 +24,9 @@ import numpy.typing as npt import pandas as pd -from . import constants +from . import constants, helper from .metadata import Metadata +from ... import common def get_window_inds(n_frames: int, window_size: int, stride: int = 1): @@ -43,7 +61,7 @@ def get_window_inds(n_frames: int, window_size: int, stride: int = 1): class WindowDataset: """Dataset used for training neural network models - on the frame classification task. + on the frame classification task, where the source data consists of audio signals or spectrograms of varying lengths. @@ -85,55 +103,167 @@ class WindowDataset: Attributes ---------- - X : numpy.ndarray - Y : numpy.ndarray + dataset_path : pathlib.Path + Path to directory that represents a + frame classification dataset, + as created by + :func:`vak.prep.prep_frame_classification_dataset`. + split : str + The name of a split from the dataset, + one of {'train', 'val', 'test'}. + subset : str, optional + Name of subset to use. + If specified, this takes precedence over split. + Subsets are typically taken from the training data + for use when generating a learning curve. + dataset_df : pandas.DataFrame + A frame classification dataset, + represented as a :class:`pandas.DataFrame`. + This will be only the rows that correspond + to either ``subset`` or ``split`` from the + ``dataset_df`` that was passed in when + instantiating the class. + input_type : str + The type of input to the neural network model. + One of {'audio', 'spect'}. + frames_paths : numpy.ndarray + Paths to npy files containing frames, + either spectrograms or audio signals + that are input to the model. 
+ frame_labels_paths : numpy.ndarray + Paths to npy files containing vectors + with a label for each frame. + The targets for the outputs of the model. + sample_ids : numpy.ndarray + Indexing vector representing which sample + from the dataset every frame belongs to. + inds_in_sample : numpy.ndarray + Indexing vector representing which index + within each sample from the dataset + that every frame belongs to. window_size : int - frame_dur : float - Duration of a single frame, in seconds. - duration : float - Total duration of the dataset. + Size of windows to return; + number of frames. + frame_dur: float + Duration of a frame, i.e., a single sample in audio + or a single timebin in a spectrogram. + stride : int + The size of the stride used to determine which windows + are included in the dataset. The default is 1. + Used to compute ``window_inds``, + with the function + :func:`vak.datasets.frame_classification.window_dataset.get_window_inds`. + window_inds : numpy.ndarray, optional + A vector of valid window indices for the dataset. + If specified, this takes precedence over ``stride``. + transform : callable + The transform applied to the frames, + the input to the neural network :math:`x`. + target_transform : callable + The transform applied to the target for the output + of the neural network :math:`y`. """ def __init__( self, dataset_path: str | pathlib.Path, dataset_df: pd.DataFrame, + input_type: str, split: str, sample_ids: npt.NDArray, inds_in_sample: npt.NDArray, window_size: int, frame_dur: float, stride: int = 1, + subset: str | None = None, window_inds: npt.NDArray | None = None, transform: Callable | None = None, target_transform: Callable | None = None, ): - self.dataset_path = pathlib.Path(dataset_path) + """Initialize a new instance of a WindowDataset. + + Parameters + ---------- + dataset_path : pathlib.Path + Path to directory that represents a + frame classification dataset, + as created by + :func:`vak.prep.prep_frame_classification_dataset`. 
+ dataset_df : pandas.DataFrame + A frame classification dataset, + represented as a :class:`pandas.DataFrame`. + input_type : str + The type of input to the neural network model. + One of {'audio', 'spect'}. + split : str + The name of a split from the dataset, + one of {'train', 'val', 'test'}. + sample_ids : numpy.ndarray + Indexing vector representing which sample + from the dataset every frame belongs to. + inds_in_sample : numpy.ndarray + Indexing vector representing which index + within each sample from the dataset + that every frame belongs to. + window_size : int + Size of windows to return; + number of frames. + frame_dur: float + Duration of a frame, i.e., a single sample in audio + or a single timebin in a spectrogram. + stride : int + The size of the stride used to determine which windows + are included in the dataset. The default is 1. + Used to compute ``window_inds``, + with the function + :func:`vak.datasets.frame_classification.window_dataset.get_window_inds`. + subset : str, optional + Name of subset to use. + If specified, this takes precedence over split. + Subsets are typically taken from the training data + for use when generating a learning curve. + window_inds : numpy.ndarray, optional + A vector of valid window indices for the dataset. + If specified, this takes precedence over ``stride``. + transform : callable + The transform applied to the input to the neural network :math:`x`. + target_transform : callable + The transform applied to the target for the output + of the neural network :math:`y`. + """ + from ... 
import prep # avoid circular import, use for constants.INPUT_TYPES + if input_type not in prep.constants.INPUT_TYPES: + raise ValueError( + f"``input_type`` must be one of: {prep.constants.INPUT_TYPES}\n" + f"Value for ``input_type`` was: {input_type}" + ) + self.dataset_path = pathlib.Path(dataset_path) self.split = split - dataset_df = dataset_df[dataset_df.split == split].copy() + self.subset = subset + # subset takes precedence over split, if specified + if subset: + dataset_df = dataset_df[dataset_df.subset == subset].copy() + else: + dataset_df = dataset_df[dataset_df.split == split].copy() self.dataset_df = dataset_df - + self.input_type = input_type self.frames_paths = self.dataset_df[ - constants.FRAMES_NPY_PATH_COL_NAME + constants.FRAMES_PATH_COL_NAME ].values self.frame_labels_paths = self.dataset_df[ constants.FRAME_LABELS_NPY_PATH_COL_NAME ].values - self.sample_ids = sample_ids self.inds_in_sample = inds_in_sample - self.window_size = window_size self.frame_dur = float(frame_dur) self.stride = stride - if window_inds is None: window_inds = get_window_inds( sample_ids.shape[-1], window_size, stride ) self.window_inds = window_inds - self.transform = transform self.target_transform = target_transform @@ -149,6 +279,16 @@ def shape(self): # e.g. when initializing a neural network model return one_x.shape + def _load_frames(self, frames_path): + """Helper function that loads "frames", + the input to the frame classification model. + Loads audio or spectrogram, depending on + :attr:`self.input_type`. + This function assumes that audio is in wav format + and spectrograms are in npz files. 
+ """ + return helper.load_frames(frames_path, self.input_type) + def __getitem__(self, idx): window_idx = self.window_inds[idx] sample_ids = self.sample_ids[ @@ -156,17 +296,21 @@ def __getitem__(self, idx): ] uniq_sample_ids = np.unique(sample_ids) if len(uniq_sample_ids) == 1: + # we repeat ourselves here to avoid running a loop on one item sample_id = uniq_sample_ids[0] - frames = np.load(self.dataset_path / self.frames_paths[sample_id]) + frames_path = self.dataset_path / self.frames_paths[sample_id] + frames = self._load_frames(frames_path) frame_labels = np.load( self.dataset_path / self.frame_labels_paths[sample_id] ) + elif len(uniq_sample_ids) > 1: frames = [] frame_labels = [] for sample_id in sorted(uniq_sample_ids): + frames_path = self.dataset_path / self.frames_paths[sample_id] frames.append( - np.load(self.dataset_path / self.frames_paths[sample_id]) + self._load_frames(frames_path) ) frame_labels.append( np.load( @@ -210,37 +354,66 @@ def from_dataset_path( window_size: int, stride: int = 1, split: str = "train", + subset: str | None = None, transform: Callable | None = None, target_transform: Callable | None = None, ): - """ + """Make a :class:`WindowDataset` instance, + given the path to a frame classification dataset. Parameters ---------- - dataset_path - window_size - stride - split - transform - target_transform + dataset_path : pathlib.Path + Path to directory that represents a + frame classification dataset, + as created by + :func:`vak.prep.prep_frame_classification_dataset`. + window_size : int + Size of windows to return; + number of frames. + stride : int + The size of the stride used to determine which windows + are included in the dataset. The default is 1. + Used to compute ``window_inds``, + with the function + :func:`vak.datasets.frame_classification.window_dataset.get_window_inds`. + split : str + The name of a split from the dataset, + one of {'train', 'val', 'test'}. + subset : str, optional + Name of subset to use. 
+ If specified, this takes precedence over split. + Subsets are typically taken from the training data + for use when generating a learning curve. + transform : callable + The transform applied to the input to the neural network :math:`x`. + target_transform : callable + The transform applied to the target for the output + of the neural network :math:`y`. Returns ------- - + dataset : vak.datasets.frame_classification.WindowDataset """ dataset_path = pathlib.Path(dataset_path) metadata = Metadata.from_dataset_path(dataset_path) frame_dur = metadata.frame_dur + input_type = metadata.input_type dataset_csv_path = dataset_path / metadata.dataset_csv_filename dataset_df = pd.read_csv(dataset_csv_path) split_path = dataset_path / split - sample_ids_path = split_path / constants.SAMPLE_IDS_ARRAY_FILENAME + if subset: + sample_ids_path = split_path / helper.sample_ids_array_filename_for_subset(subset) + else: + sample_ids_path = split_path / constants.SAMPLE_IDS_ARRAY_FILENAME sample_ids = np.load(sample_ids_path) - inds_in_sample_path = ( - split_path / constants.INDS_IN_SAMPLE_ARRAY_FILENAME - ) + + if subset: + inds_in_sample_path = split_path / helper.inds_in_sample_array_filename_for_subset(subset) + else: + inds_in_sample_path = split_path / constants.INDS_IN_SAMPLE_ARRAY_FILENAME inds_in_sample = np.load(inds_in_sample_path) window_inds_path = split_path / constants.WINDOW_INDS_ARRAY_FILENAME @@ -252,12 +425,14 @@ def from_dataset_path( return cls( dataset_path, dataset_df, + input_type, split, sample_ids, inds_in_sample, window_size, frame_dur, stride, + subset, window_inds, transform, target_transform, diff --git a/src/vak/datasets/parametric_umap/parametric_umap.py b/src/vak/datasets/parametric_umap/parametric_umap.py index 2281b4d21..8755ddacf 100644 --- a/src/vak/datasets/parametric_umap/parametric_umap.py +++ b/src/vak/datasets/parametric_umap/parametric_umap.py @@ -1,3 +1,4 @@ +"""A dataset class used to train Parametric UMAP models.""" from __future__ 
import annotations import pathlib @@ -185,16 +186,64 @@ def get_graph_elements( class ParametricUMAPDataset(Dataset): - """Dataset used for training Parametric UMAP models""" + """A dataset class used to train Parametric UMAP models.""" def __init__( self, - data: npt.NDArray, - graph, + dataset_path: str | pathlib.Path, dataset_df: pd.DataFrame, + split: str, + subset: str | None = None, n_epochs: int = 200, + n_neighbors: int = 10, + metric: str = "euclidean", + random_state: int | None = None, transform: Callable | None = None, ): + """Initialize a :class:`ParametricUMAPDataset` instance. + + Parameters + ---------- + dataset_path : pathlib.Path + Path to directory that represents a + parametric UMAP dataset, + as created by + :func:`vak.prep.prep_parametric_umap_dataset`. + dataset_df : pandas.DataFrame + A parametric UMAP dataset, + represented as a :class:`pandas.DataFrame`. + split : str + The name of a split from the dataset, + one of {'train', 'val', 'test'}. + subset : str, optional + Name of subset to use. + If specified, this takes precedence over split. + Subsets are typically taken from the training data + for use when generating a learning curve. + n_epochs : int + Number of epochs model will be trained. Default is 200. 
+ transform : callable, optional + """ + # subset takes precedence over split, if specified + if subset: + dataset_df = dataset_df[dataset_df.subset == subset].copy() + else: + dataset_df = dataset_df[dataset_df.split == split].copy() + + data = np.stack( + [ + np.load(dataset_path / spect_path) + for spect_path in dataset_df.spect_path.values + ] + ) + + graph = get_umap_graph( + data, + n_neighbors=n_neighbors, + metric=metric, + random_state=random_state, + ) + ( graph, epochs_per_sample, @@ -246,28 +295,49 @@ def from_dataset_path( cls, dataset_path: str | pathlib.Path, split: str, + subset: str | None = None, n_neighbors: int = 10, metric: str = "euclidean", random_state: int | None = None, n_epochs: int = 200, transform: Callable | None = None, ): - """ + """Make a :class:`ParametricUMAPDataset` instance, + given the path to a parametric UMAP dataset. Parameters ---------- - dataset_path : str, pathlib.Path - Path to a directory that represents a dataset. - split - n_neighbors - metric - random_state - n_epochs - transform + dataset_path : pathlib.Path + Path to directory that represents a + parametric UMAP dataset, + as created by + :func:`vak.prep.prep_parametric_umap_dataset`. + split : str + The name of a split from the dataset, + one of {'train', 'val', 'test'}. + subset : str, optional + Name of subset to use. + If specified, this takes precedence over split. + Subsets are typically taken from the training data + for use when generating a learning curve. + n_neighbors : int + Number of nearest neighbors to use + when computing approximate nearest neighbors. + Parameter passed to :class:`pynndescent.NNDescent` + and :func:`umap._umap.fuzzy_simplicial_set`. + metric : str + Distance metric. Default is "euclidean". + Parameter passed to :class:`pynndescent.NNDescent` + and :func:`umap._umap.fuzzy_simplicial_set`. + random_state : numpy.random.RandomState + Either a numpy.random.RandomState instance, + or None. 
+ transform : callable + The transform applied to the input to the neural network :math:`x`. Returns ------- - + dataset : vak.datasets.parametric_umap.ParametricUMAPDataset """ import vak.datasets # import here just to make classmethod more explicit @@ -278,27 +348,17 @@ def from_dataset_path( dataset_csv_path = dataset_path / metadata.dataset_csv_filename dataset_df = pd.read_csv(dataset_csv_path) - split_df = dataset_df[dataset_df.split == split] - - data = np.stack( - [ - np.load(dataset_path / spect_path) - for spect_path in split_df.spect_path.values - ] - ) - graph = get_umap_graph( - data, - n_neighbors=n_neighbors, - metric=metric, - random_state=random_state, - ) return cls( - data, - graph, - split_df, + dataset_path, + dataset_df, + split, + subset, n_epochs, - transform=transform, + n_neighbors, + metric, + random_state, + transform, ) diff --git a/src/vak/learncurve/frame_classification.py b/src/vak/learncurve/frame_classification.py index 265a6dc12..0eff85775 100644 --- a/src/vak/learncurve/frame_classification.py +++ b/src/vak/learncurve/frame_classification.py @@ -202,8 +202,8 @@ def learning_curve_for_frame_classification_model( f"Saving results to: {results_path_this_replicate}", ) - # `split` lets us use correct subset ("split") of training set for this duration / replicate - split = common.learncurve.get_train_dur_replicate_split_name( + # `subset` lets us use correct subset of training set for this duration / replicate + subset = common.learncurve.get_train_dur_replicate_subset_name( train_dur, replicate_num ) @@ -225,7 +225,7 @@ def learning_curve_for_frame_classification_model( ckpt_step=ckpt_step, patience=patience, device=device, - split=split, + subset=subset, ) logger.info(f"Evaluating model from replicate {replicate_num} ") diff --git a/src/vak/models/frame_classification_model.py b/src/vak/models/frame_classification_model.py index 84616d4d9..f1d36ec02 100644 --- a/src/vak/models/frame_classification_model.py +++ 
b/src/vak/models/frame_classification_model.py @@ -332,9 +332,9 @@ def predict_step(self, batch: tuple, batch_idx: int): containing the spectrogram for which a prediction was generated. """ - x, source_path = batch["frames"].to(self.device), batch["source_path"] - if isinstance(source_path, list) and len(source_path) == 1: - source_path = source_path[0] + x, frames_path = batch["frames"].to(self.device), batch["frames_path"] + if isinstance(frames_path, list) and len(frames_path) == 1: + frames_path = frames_path[0] # TODO: fix this weirdness. Diff't collate_fn? if x.ndim in (5, 4): if x.shape[0] == 1: @@ -342,7 +342,7 @@ def predict_step(self, batch: tuple, batch_idx: int): else: raise ValueError(f"invalid shape for x: {x.shape}") y_pred = self.network(x) - return {source_path: y_pred} + return {frames_path: y_pred} @classmethod def from_config( diff --git a/src/vak/predict/frame_classification.py b/src/vak/predict/frame_classification.py index c5c7e52da..b029ea809 100644 --- a/src/vak/predict/frame_classification.py +++ b/src/vak/predict/frame_classification.py @@ -239,9 +239,9 @@ def predict_with_frame_classification_model( results = trainer.predict(model, pred_loader) # TODO: figure out how to overload `on_predict_epoch_end` to return dict pred_dict = { - source_path: y_pred + frames_path: y_pred for result in results - for source_path, y_pred in result.items() + for frames_path, y_pred in result.items() } # ---------------- converting to annotations ------------------------------------------------------------------ progress_bar = tqdm(pred_loader) @@ -256,11 +256,11 @@ def predict_with_frame_classification_model( annots = [] logger.info("converting predictions to annotations") for ind, batch in enumerate(progress_bar): - padding_mask, source_path = batch["padding_mask"], batch["source_path"] + padding_mask, frames_path = batch["padding_mask"], batch["frames_path"] padding_mask = np.squeeze(padding_mask) - if isinstance(source_path, list) and len(source_path) 
== 1: - source_path = source_path[0] - y_pred = pred_dict[source_path] + if isinstance(frames_path, list) and len(frames_path) == 1: + frames_path = frames_path[0] + y_pred = pred_dict[frames_path] if save_net_outputs: # not sure if there's a better way to get outputs into right shape; @@ -271,7 +271,7 @@ def predict_with_frame_classification_model( net_output = net_output[:, padding_mask] net_output = net_output.cpu().numpy() net_output_path = output_dir.joinpath( - pathlib.Path(source_path).stem + pathlib.Path(frames_path).stem + f"{model_name}{constants.NET_OUTPUT_SUFFIX}" ) np.savez(net_output_path, net_output) @@ -281,12 +281,12 @@ def predict_with_frame_classification_model( if input_type == "audio": frames, samplefreq = constants.AUDIO_FORMAT_FUNC_MAP[audio_format]( - source_path + frames_path ) frame_times = np.arange(frames.shape[-1]) / samplefreq elif input_type == "spect": spect_dict = files.spect.load( - dataset_path / source_path, spect_format=spect_format + frames_path, spect_format=spect_format ) frame_times = spect_dict[timebins_key] @@ -311,7 +311,7 @@ def predict_with_frame_classification_model( labels=labels, onsets_s=onsets_s, offsets_s=offsets_s ) - audio_fname = files.spect.find_audio_fname(source_path) + audio_fname = files.spect.find_audio_fname(frames_path) annot = crowsetta.Annotation( seq=seq, notated_path=audio_fname, annot_path=annot_csv_path.name ) diff --git a/src/vak/prep/audio_dataset.py b/src/vak/prep/audio_dataset.py index 3f6ff192e..9e8fe2970 100644 --- a/src/vak/prep/audio_dataset.py +++ b/src/vak/prep/audio_dataset.py @@ -36,9 +36,10 @@ def prep_audio_dataset( annot_file: str | pathlib.Path | None = None, labelset: set | None = None, ) -> pd.DataFrame: - """Creates a dataset of audio files from a directory, + """Gets a set of audio files from a directory, optionally paired with an annotation file or files, - and return a Pandas DataFrame that represents the dataset. 
+ and return a Pandas DataFrame that represents the set + of files. Finds all files with ``audio_format`` in ``data_dir``, then finds any annotations with ``annot_format`` if specified, @@ -74,9 +75,14 @@ def prep_audio_dataset( Returns ------- - dataset_df : pandas.Dataframe - Dataframe that represents a dataset of audio files, - optionally with annotations. + source_files_df : pandas.DataFrame + A set of source files that will be used to prepare a + data set for use with neural network models, + represented as a :class:`pandas.DataFrame`. + Will contain paths to audio files, + possibly paired with annotation files. + The columns of the dataframe are specified by + :const:`vak.prep.audio_dataset.DF_COLUMNS`. """ # pre-conditions --------------------------------------------------------------------------------------------------- if audio_format not in constants.VALID_AUDIO_FORMATS: diff --git a/src/vak/prep/frame_classification/__init__.py b/src/vak/prep/frame_classification/__init__.py index f9ee48004..5f217779d 100644 --- a/src/vak/prep/frame_classification/__init__.py +++ b/src/vak/prep/frame_classification/__init__.py @@ -1,10 +1,14 @@ -from . import dataset_arrays, frame_classification, learncurve, validators +from . 
import frame_classification, learncurve, make_splits, validators +from .assign_samples_to_splits import assign_samples_to_splits from .frame_classification import prep_frame_classification_dataset +from .source_files import get_or_make_source_files __all__ = [ - "dataset_arrays", + "assign_samples_to_splits", "frame_classification", + "get_or_make_source_files", "learncurve", + "make_splits", "prep_frame_classification_dataset", "validators", ] diff --git a/src/vak/prep/frame_classification/assign_samples_to_splits.py b/src/vak/prep/frame_classification/assign_samples_to_splits.py new file mode 100644 index 000000000..ced7d89f9 --- /dev/null +++ b/src/vak/prep/frame_classification/assign_samples_to_splits.py @@ -0,0 +1,138 @@ +"""Assign samples in a dataset to splits. + +Given a set of source files represented by a dataframe, +assign each sample (row) to a split. + +Helper function called by :func:`vak.prep.frame_classification.prep_frame_classification_dataset`. +""" +from __future__ import annotations + +import logging +import pathlib + +import pandas as pd + +from .. import dataset_df_helper, split + + +logger = logging.getLogger(__name__) + + +def assign_samples_to_splits( + purpose: str, + dataset_df: pd.DataFrame, + dataset_path: str | pathlib.Path, + train_dur: float | None = None, + val_dur: float | None = None, + test_dur: float | None = None, + labelset: set | None = None, +) -> pd.DataFrame: + """Assign samples in a dataset to splits. + + Given a set of source files represented by a dataframe, + assign each sample (row) to a split. + + Helper function called by :func:`vak.prep.frame_classification.prep_frame_classification_dataset`. + + If no durations are specified for splits, + or the purpose is either `'eval'` or `'predict'`, + then all rows in the dataframe + will be assigned to one split named for ``purpose`` (`'test'` when the purpose is `'eval'`). + + Parameters + ---------- + purpose : str + Purpose of the dataset. + One of {'train', 'eval', 'predict', 'learncurve'}. 
+ These correspond to commands of the vak command-line interface. + train_dur : float + Total duration of training set, in seconds. + When creating a learning curve, + training subsets of shorter duration + will be drawn from this set. Default is None. + val_dur : float + Total duration of validation set, in seconds. + Default is None. + test_dur : float + Total duration of test set, in seconds. + Default is None. + dataset_df : pandas.DataFrame + That represents a dataset. + dataset_path : pathlib.Path + Path to directory that represents the dataset. + labelset : str, list, set + Set of unique labels for vocalizations. Strings or integers. + Default is ``None``. If not ``None``, then files will be skipped + where the associated annotation + contains labels not found in ``labelset``. + ``labelset`` is converted to a Python ``set`` using + :func:`vak.converters.labelset_to_set`. + See help for that function for details on how to specify ``labelset``. + + Returns + ------- + dataset_df : pandas.DataFrame + The same ``dataset_df`` with a `'split'` column added, + where each element in that column assigns the corresponding + row to one of the splits in the dataset. + """ + + # ---- (possibly) split into train / val / test sets --------------------------------------------- + # catch case where user specified duration for just training set, raise a helpful error instead of failing silently + if (purpose == "train" or purpose == "learncurve") and ( + (train_dur is not None and train_dur > 0) + and (val_dur is None or val_dur == 0) + and (test_dur is None or test_dur == 0) + ): + raise ValueError( + "A duration specified for just training set, but prep function does not currently support creating a " + "single split of a specified duration. 
Either remove the train_dur option from the prep section and " + "rerun, in which case all data will be included in the training set, or specify values greater than " + "zero for test_dur (and val_dur, if a validation set will be used)" + ) + + if all( + [dur is None for dur in (train_dur, val_dur, test_dur)] + ) or purpose in ( + "eval", + "predict", + ): + # then we're not going to split + logger.info("Will not split dataset.") + do_split = False + else: + if val_dur is not None and train_dur is None and test_dur is None: + raise ValueError( + "cannot specify only val_dur, unclear how to split dataset into training and test sets" + ) + else: + logger.info("Will split dataset.") + do_split = True + + if do_split: + dataset_df = split.frame_classification_dataframe( + dataset_df, + dataset_path, + labelset=labelset, + train_dur=train_dur, + val_dur=val_dur, + test_dur=test_dur, + ) + + elif ( + do_split is False + ): # add a split column, but assign everything to the same 'split' + # ideally we would just say split=purpose in call to add_split_col, but + # we have to special case, because "eval" looks for a 'test' split (not an "eval" split) + if purpose == "eval": + split_name = ( + "test" # 'split_name' to avoid name clash with split package + ) + else: # any other purpose (e.g. 'train', 'predict') is used as the split name, so split_name is always bound + split_name = purpose + + dataset_df = dataset_df_helper.add_split_col( + dataset_df, split=split_name + ) + + return dataset_df diff --git a/src/vak/prep/frame_classification/frame_classification.py b/src/vak/prep/frame_classification/frame_classification.py index 4d291c8bc..c84c26985 100644 --- a/src/vak/prep/frame_classification/frame_classification.py +++ b/src/vak/prep/frame_classification/frame_classification.py @@ -1,3 +1,5 @@ +"""Function that prepares datasets for neural network models +that perform the frame classification task.""" from __future__ import annotations import json @@ -6,17 +8,19 @@ import warnings 
import crowsetta.formats.seq +import pandas as pd from ... 
import datasets from ...common import labels from ...common.converters import expanded_user_path, labelset_to_set from ...common.logging import config_logging_for_cli, log_version from ...common.timenow import get_timenow_as_str -from .. import dataset_df_helper, sequence_dataset, split -from ..audio_dataset import prep_audio_dataset -from ..spectrogram_dataset.prep import prep_spectrogram_dataset -from . import dataset_arrays, validators -from .learncurve import make_learncurve_splits_from_dataset_df +from .. import dataset_df_helper, sequence_dataset +from . import validators +from .assign_samples_to_splits import assign_samples_to_splits +from .source_files import get_or_make_source_files +from .learncurve import make_subsets_from_dataset_df +from .make_splits import make_splits logger = logging.getLogger(__name__) @@ -33,13 +37,14 @@ def prep_frame_classification_dataset( annot_file: str | pathlib.Path | None = None, labelset: set | None = None, audio_dask_bag_kwargs: dict | None = None, - train_dur: int | None = None, - val_dur: int | None = None, - test_dur: int | None = None, + train_dur: float | None = None, + val_dur: float | None = None, + test_dur: float | None = None, train_set_durs: list[float] | None = None, num_replicates: int | None = None, spect_key: str = "s", timebins_key: str = "t", + freqbins_key: str = "f", ): """Prepare datasets for neural network models that perform the frame classification task. @@ -116,9 +121,11 @@ def prep_frame_classification_dataset( Each replicate uses a different randomly drawn subset of the training data (but of the same duration). spect_key : str - key for accessing spectrogram in files. Default is 's'. + Key for accessing spectrogram in files. Default is 's'. timebins_key : str - key for accessing vector of time bins in files. Default is 't'. + Key for accessing vector of time bins in files. Default is 't'. + freqbins_key : str + Key for accessing vector of frequency bins in files. Default is 'f'. 
Returns ------- @@ -264,96 +271,35 @@ def prep_frame_classification_dataset( ) logger.info(f"Will prepare dataset as directory: {dataset_path}") - # ---- actually make the dataset ----------------------------------------------------------------------------------- - if input_type == "spect": - dataset_df = prep_spectrogram_dataset( - labelset=labelset, - data_dir=data_dir, - annot_format=annot_format, - annot_file=annot_file, - audio_format=audio_format, - spect_format=spect_format, - spect_params=spect_params, - spect_output_dir=dataset_path, - audio_dask_bag_kwargs=audio_dask_bag_kwargs, - ) - elif input_type == "audio": - dataset_df = prep_audio_dataset( - audio_format=audio_format, - data_dir=data_dir, - annot_format=annot_format, - labelset=labelset, - ) - - if dataset_df.empty: - raise ValueError( - "Calling `vak.prep.spectrogram_dataset.prep_spectrogram_dataset` " - "with arguments passed to `vak.core.prep` " - "returned an empty dataframe.\n" - "Please double-check arguments to `vak.core.prep` function." 
- ) + # ---- get or make source files: either audio or spectrogram, possibly paired with annotation files ---------------- + source_files_df: pd.DataFrame = get_or_make_source_files( + data_dir, + input_type, + audio_format, + spect_format, + spect_params, + dataset_path, + annot_format, + annot_file, + labelset, + audio_dask_bag_kwargs, + ) # save before (possibly) splitting, just in case duration args are not valid # (we can't know until we make dataset) - dataset_df.to_csv(dataset_csv_path) - - # ---- (possibly) split into train / val / test sets --------------------------------------------- - # catch case where user specified duration for just training set, raise a helpful error instead of failing silently - if (purpose == "train" or purpose == "learncurve") and ( - (train_dur is not None and train_dur > 0) - and (val_dur is None or val_dur == 0) - and (test_dur is None or val_dur == 0) - ): - raise ValueError( - "A duration specified for just training set, but prep function does not currently support creating a " - "single split of a specified duration. 
Either remove the train_dur option from the prep section and " - "rerun, in which case all data will be included in the training set, or specify values greater than " - "zero for test_dur (and val_dur, if a validation set will be used)" - ) + source_files_df.to_csv(dataset_csv_path) - if all( - [dur is None for dur in (train_dur, val_dur, test_dur)] - ) or purpose in ( - "eval", - "predict", - ): - # then we're not going to split - logger.info("Will not split dataset.") - do_split = False - else: - if val_dur is not None and train_dur is None and test_dur is None: - raise ValueError( - "cannot specify only val_dur, unclear how to split dataset into training and test sets" - ) - else: - logger.info("Will split dataset.") - do_split = True - - if do_split: - dataset_df = split.frame_classification_dataframe( - dataset_df, - dataset_path, - labelset=labelset, - train_dur=train_dur, - val_dur=val_dur, - test_dur=test_dur, - ) - - elif ( - do_split is False - ): # add a split column, but assign everything to the same 'split' - # ideally we would just say split=purpose in call to add_split_col, but - # we have to special case, because "eval" looks for a 'test' split (not an "eval" split) - if purpose == "eval": - split_name = ( - "test" # 'split_name' to avoid name clash with split package - ) - elif purpose == "predict": - split_name = "predict" - - dataset_df = dataset_df_helper.add_split_col( - dataset_df, split=split_name - ) + # ---- assign samples to splits; adds a 'split' column to dataset_df, calling `vak.prep.split` if needed ----------- + # once we assign a split, we consider this the ``dataset_df`` + dataset_df: pd.DataFrame = assign_samples_to_splits( + purpose, + source_files_df, + dataset_path, + train_dur, + val_dur, + test_dur, + labelset, + ) # ---- create and save labelmap ------------------------------------------------------------------------------------ # we do this before creating array files since we need to load the labelmap to make frame label 
vectors @@ -374,8 +320,9 @@ def prep_frame_classification_dataset( else: labelmap = None - # ---- make arrays that represent final dataset -------------------------------------------------------------------- - dataset_df = dataset_arrays.make_npy_files_for_each_split( + # ---- actually move/copy/create files into directories representing splits ---------------------------------------- + # now we're *remaking* the dataset_df (actually adding additional rows with the splits) + dataset_df: pd.DataFrame = make_splits( dataset_df, dataset_path, input_type, @@ -384,20 +331,18 @@ def prep_frame_classification_dataset( audio_format, spect_key, timebins_key, + freqbins_key, ) - # ---- if purpose is learncurve, additionally prep splits for that ------------------------------------------------- + # ---- if purpose is learncurve, additionally prep training data subsets for the learning curve -------------------- if purpose == "learncurve": - dataset_df = make_learncurve_splits_from_dataset_df( + dataset_df: pd.DataFrame = make_subsets_from_dataset_df( dataset_df, input_type, train_set_durs, num_replicates, dataset_path, labelmap, - audio_format, - spect_key, - timebins_key, ) # ---- save csv file that captures provenance of source data ------------------------------------------------------- diff --git a/src/vak/prep/frame_classification/learncurve.py b/src/vak/prep/frame_classification/learncurve.py index 363675d1d..1ebef22f6 100644 --- a/src/vak/prep/frame_classification/learncurve.py +++ b/src/vak/prep/frame_classification/learncurve.py @@ -6,71 +6,233 @@ import pathlib from typing import Sequence +import attrs +import dask.bag as db +import numpy as np import pandas as pd +from dask.diagnostics import ProgressBar -from ... import common +from ... import common, datasets from .. 
import split -from .dataset_arrays import make_npy_files_for_each_split logger = logging.getLogger(__name__) -def make_learncurve_splits_from_dataset_df( +@attrs.define(frozen=True) +class Sample: + """Dataclass representing one sample + in a frame classification dataset. + + Used to add paths for arrays from the sample + to a ``dataset_df``, and to build + the ``sample_ids`` vector and ``inds_in_sample`` vector + for the entire dataset.""" + + source_id: int = attrs.field() + sample_id_vec: np.ndarray + inds_in_sample_vec: np.ndarray + + +def make_index_vectors_for_each_subset( + subsets_df: pd.DataFrame, + dataset_path: str | pathlib.Path, + input_type: str, +) -> None: + r"""Make npy files containing indexing vectors + for each subset of the training data + used to generate a learning curve + with a frame classification dataset. + + This function is basically the same as + :func:`vak.prep.frame_classification.make_splits.make_splits`, + *except* that it only makes the indexing vectors + for each subset of the training data. + These indexing vectors are needed for each subset + to properly grab windows from the npy files during training. + There is no need to remake the npy files themselves though. + + All the indexing vectors for each split are saved + in the "train" directory split inside ``dataset_path``. + + The indexing vectors are used by + :class:`vak.datasets.frame_classification.WindowDataset` + and :class:`vak.datasets.frame_classification.FramesDataset`. + These vectors make it possible to work with files, + to avoid loading the entire dataset into memory, + and to avoid working with memory-mapped arrays. + The first is the ``sample_ids`` vector, + that represents the "ID" of any sample :math:`(x, y)` in the split. + We use these IDs to load the array files corresponding to the samples. 
+ For a split with :math:`m` samples, this will be an array of length :math:`T`, + the total number of frames across all samples, + with elements :math:`i \in (0, 1, ..., m - 1)` + indicating which frames correspond to which sample :math:`m_i`: + :math:`(0, 0, 0, ..., 1, 1, ..., m - 1, m - 1)`. + The second vector is the ``inds_in_sample`` vector. + This vector is the same length as ``sample_ids``, but its values represent + the indices of frames within each sample :math:`x_t`. + For a data set with :math:`T` total frames across all samples, + where :math:`t_i` indicates the number of frames in each :math:`x_i`, + this vector will look like :math:`(0, 1, ..., t_0 - 1, 0, 1, ..., t_1 - 1, ..., 0, 1, ..., t_{m-1} - 1)`. + + Parameters + ---------- + subsets_df : pandas.DataFrame + A :class:`pandas.DataFrame` representing the training data subsets. + This DataFrame is created by + :func:`vak.prep.frame_classification.learncurve.make_subsets_from_dataset_df`, + and then passed into this function. + It is created from a ``pandas.DataFrame`` + returned by :func:`vak.prep.frame_classification.get_or_make_source_files` + with a ``'split'`` column added. + dataset_path : pathlib.Path + Path to directory that represents dataset. + input_type : str + The type of input to the neural network model. + One of {'audio', 'spect'}. + + Returns + ------- + None + """ + subsets = [ + subset + for subset in sorted(subsets_df.subset.dropna().unique()) + ] + for subset in subsets: + logger.info(f"Making indexing vectors for subset: {subset}") + subset_df = subsets_df[subsets_df.subset == subset].copy() + frames_paths = subset_df[ + datasets.frame_classification.constants.FRAMES_PATH_COL_NAME + ].values + + def _return_index_arrays( + source_id_path_tup, + ): + """Function we use with dask to parallelize. + Defined in-line so variables are in scope. 
+ """ + source_id, frames_path = source_id_path_tup + + frames_path = dataset_path / pathlib.Path(frames_path) + + frames = datasets.frame_classification.helper.load_frames( + frames_path, input_type + ) + + n_frames = frames.shape[-1] + sample_id_vec = np.ones((n_frames,)).astype(np.int32) * source_id + inds_in_sample_vec = np.arange(n_frames) + + return Sample( + source_id, + sample_id_vec, + inds_in_sample_vec, + ) + + # ---- make npy files for this split, parallelized with dask + # using nested function just defined + source_id_frames_path_tups = [ + (source_id, frames_path) + for source_id, frames_path in enumerate(frames_paths) + ] + + source_id_frames_path_bag = db.from_sequence(source_id_frames_path_tups) + with ProgressBar(): + samples = list( + source_id_frames_path_bag.map( + _return_index_arrays + ) + ) + samples = sorted(samples, key=lambda sample: sample.source_id) + + # ---- save indexing vectors in train directory + sample_id_vec = np.concatenate( + list(sample.sample_id_vec for sample in samples) + ) + np.save( + dataset_path / "train" / + datasets.frame_classification.helper.sample_ids_array_filename_for_subset(subset), + sample_id_vec, + ) + inds_in_sample_vec = np.concatenate( + list(sample.inds_in_sample_vec for sample in samples) + ) + np.save( + dataset_path / "train" / + datasets.frame_classification.helper.inds_in_sample_array_filename_for_subset(subset), + inds_in_sample_vec, + ) + + +def make_subsets_from_dataset_df( dataset_df: pd.DataFrame, input_type: str, train_set_durs: Sequence[float], num_replicates: int, dataset_path: pathlib.Path, labelmap: dict, - audio_format: str | None = None, - spect_key: str = "s", - timebins_key: str = "t", ) -> pd.DataFrame: - """Make splits for a learning curve - from a dataframe representing the entire dataset, - one split for each combination of (training set duration, - replicate number). - Each split is a randomly drawn subset of data + """Make subsets of the training data split for a learning curve. 
+ + Makes subsets given a dataframe representing the entire dataset, + with one subset for each combination of (training set duration, + replicate number). Each subset is randomly drawn from the total training split. Uses :func:`vak.prep.split.frame_classification_dataframe` to make - splits/subsets of the training data - from ``dataset_df``, and then uses - :func:`vak.prep.frame_classification.dataset_arrays.make_npy_files_for_each_split` - to make the array files for each split. - - A new directory will be made for each combination of - (training set duration, replicate number) as shown below, - for ``train_durs=[4.0, 6.0], num_replicates=2``. - - .. code-block:: console - 032312-vak-frame-classification-dataset-generated-230820_144833 - ├── 032312_prep_230820_144833.csv - ├── labelmap.json - ├── metadata.json - ├── prep_230820_144833.log - ├── spectrograms_generated_230820_144833 - ├── test - ├── train - ├── train-dur-4.0-replicate-1 - ├── train-dur-4.0-replicate-2 - ├── train-dur-6.0-replicate-1 - ├── train-dur-6.0-replicate-2 - ├── TweetyNet_learncurve_audio_cbin_annot_notmat.toml - └── val + subsets of the training data from ``dataset_df``. + + A new column will be added to the dataframe, `'subset'`, + and additional rows for each subset. + The dataframe is returned with these subsets added. + (The `'split'` for these rows will still be `'train'`.) + Additionally, a separate set of indexing vectors + will be made for each subset, using + :func:`vak.prep.frame_classification.learncurve.make_index_vectors_for_each_subset`. + .. 
code-block:: console + + 032312-vak-frame-classification-dataset-generated-231005_121809 + ├── 032312_prep_231005_121809.csv + ├── labelmap.json + ├── metadata.json + ├── prep_231005_121809.log + ├── TweetyNet_learncurve_audio_cbin_annot_notmat.toml + ├── train + ├── gy6or6_baseline_230312_0808.138.cbin.spect.frame_labels.npy + ├── gy6or6_baseline_230312_0808.138.cbin.spect.frames.npy + ├── gy6or6_baseline_230312_0809.141.cbin.spect.frame_labels.npy + ├── gy6or6_baseline_230312_0809.141.cbin.spect.frames.npy + ├── gy6or6_baseline_230312_0813.163.cbin.spect.frame_labels.npy + ├── gy6or6_baseline_230312_0813.163.cbin.spect.frames.npy + ├── gy6or6_baseline_230312_0816.179.cbin.spect.frame_labels.npy + ├── gy6or6_baseline_230312_0816.179.cbin.spect.frames.npy + ├── gy6or6_baseline_230312_0820.196.cbin.spect.frame_labels.npy + ├── gy6or6_baseline_230312_0820.196.cbin.spect.frames.npy + ├── inds_in_sample.npy + ├── inds_in_sample-train-dur-4.0-replicate-1.npy + ├── inds_in_sample-train-dur-4.0-replicate-2.npy + ├── inds_in_sample-train-dur-6.0-replicate-1.npy + ├── inds_in_sample-train-dur-6.0-replicate-2.npy + ├── sample_ids.npy + ├── sample_ids-train-dur-4.0-replicate-1.npy + ├── sample_ids-train-dur-4.0-replicate-2.npy + ├── sample_ids-train-dur-6.0-replicate-1.npy + └── sample_ids-train-dur-6.0-replicate-2.npy + ... Parameters ---------- dataset_df : pandas.DataFrame - Representing an entire dataset of vocalizations. - input_type : str - The type of input to the neural network model. - One of {'audio', 'spect'}. + Dataframe representing a dataset for frame classification models. + It is returned by + :func:`vak.prep.frame_classification.get_or_make_source_files`, + and has a ``'split'`` column added. train_set_durs : list - of int, durations in seconds of subsets taken from training data - to create a learning curve, e.g. [5, 10, 15, 20]. + Durations in seconds of subsets taken from training data + to create a learning curve, e.g., `[5., 10., 15., 20.]`. 
num_replicates : int number of times to replicate training for each training set duration to better estimate metrics for a training set of that size. @@ -78,32 +240,24 @@ def make_learncurve_splits_from_dataset_df( data (but of the same duration). dataset_path : str, pathlib.Path Directory where splits will be saved. - labelmap : dict - A :class:`dict` that maps a set of human-readable - string labels to the integer classes predicted by a neural - network model. As returned by :func:`vak.labels.to_map`. - audio_format : str - A :class:`string` representing the format of audio files. - One of :constant:`vak.common.constants.VALID_AUDIO_FORMATS`. - spect_key : str - Key for accessing spectrogram in files. Default is 's'. - timebins_key : str - Key for accessing vector of time bins in files. Default is 't'. + input_type : str + The type of input to the neural network model. + One of {'audio', 'spect'}. Returns ------- dataset_df_out : pandas.DataFrame A pandas.DataFrame that has the original splits - from ``dataset_df`` as well as the additional subsets + from ``dataset_df``, as well as the additional subsets of the training data added, along with additional - 'train_dur' and 'replicate_num' columns - that can be used during analysis. + columns, ``'subset', 'train_dur', 'replicate_num'``, + that are used by :mod:`vak`. Other functions like :func:`vak.learncurve.learncurve` specify a specific subset of the training data - by getting the split name with the function + by getting the subset name with the function :func:`vak.common.learncurve.get_train_dur_replicate_split_name`, and then filtering ``dataset_df_out`` with that name - using the 'split' column. + using the 'subset' column. 
""" dataset_path = pathlib.Path(dataset_path) @@ -114,14 +268,14 @@ def make_learncurve_splits_from_dataset_df( # will concat after loop, then use ``csv_path`` to replace # original dataset df with this one - all_train_durs_and_replicates_df = [] + subsets_df = [] for train_dur in train_set_durs: logger.info( f"Subsetting training set for training set of duration: {train_dur}", ) for replicate_num in range(1, num_replicates + 1): - train_dur_replicate_split_name = ( - common.learncurve.get_train_dur_replicate_split_name( + train_dur_replicate_subset_name = ( + common.learncurve.get_train_dur_replicate_subset_name( train_dur, replicate_num ) ) @@ -138,30 +292,27 @@ def make_learncurve_splits_from_dataset_df( train_dur_replicate_df.split == "train" ] # next line, make split name in csv match the split name used for directory in dataset dir - train_dur_replicate_df["split"] = train_dur_replicate_split_name + train_dur_replicate_df["subset"] = train_dur_replicate_subset_name train_dur_replicate_df["train_dur"] = train_dur train_dur_replicate_df["replicate_num"] = replicate_num - all_train_durs_and_replicates_df.append(train_dur_replicate_df) + subsets_df.append(train_dur_replicate_df) - all_train_durs_and_replicates_df = pd.concat( - all_train_durs_and_replicates_df + subsets_df = pd.concat( + subsets_df ) - all_train_durs_and_replicates_df = make_npy_files_for_each_split( - all_train_durs_and_replicates_df, + + make_index_vectors_for_each_subset( + subsets_df, dataset_path, input_type, - "learncurve", # purpose - labelmap, - audio_format, - spect_key, - timebins_key, ) # keep the same validation, test, and total train sets by concatenating them with the train subsets + dataset_df["subset"] = None # add column but have it be empty dataset_df = pd.concat( ( - all_train_durs_and_replicates_df, - dataset_df, + subsets_df, + dataset_df ) ) # We reset the entire index across all splits, instead of repeating indices, diff --git 
a/src/vak/prep/frame_classification/dataset_arrays.py b/src/vak/prep/frame_classification/make_splits.py similarity index 64% rename from src/vak/prep/frame_classification/dataset_arrays.py rename to src/vak/prep/frame_classification/make_splits.py index dbeb46978..a66187990 100644 --- a/src/vak/prep/frame_classification/dataset_arrays.py +++ b/src/vak/prep/frame_classification/make_splits.py @@ -5,6 +5,7 @@ import copy import logging import pathlib +import shutil import attrs import crowsetta @@ -16,6 +17,7 @@ from ... import common, datasets, transforms from .. import constants as prep_constants + logger = logging.getLogger(__name__) @@ -85,44 +87,98 @@ class Sample: Used to add paths for arrays from the sample to a ``dataset_df``, and to build the ``sample_ids`` vector and ``inds_in_sample`` vector - for the entire dataset.""" + for the entire dataset. + Attributes + ---------- + source_id : int + Integer ID number used for sorting. + frames_path : str + The path to the input to the model + :math:`x` after it has been moved, + copied, or created from a ``source_path``. + Path will be written relative to ``dataset_path``. + We preserve the original paths as metadata, + and consider the files in the split to contain + frames, regardless of the source domain + of the data. + frame_labels_npy_path : str + Path to frame labels, + relative to ``dataset_path``. + sample_id_vec : numpy.ndarray + Sample ID vector for this sample. + inds_in_sample_vec : numpy.ndarray + Indices within sample. 
+ """ source_id: int = attrs.field() - frame_npy_path: str + source_path: str frame_labels_npy_path: str sample_id_vec: np.ndarray inds_in_sample_vec: np.ndarray -def make_npy_files_for_each_split( +def make_splits( dataset_df: pd.DataFrame, dataset_path: str | pathlib.Path, input_type: str, purpose: str, labelmap: dict, - audio_format: str, + audio_format: str | None = None, spect_key: str = "s", timebins_key: str = "t", + freqbins_key: str = "f", ) -> pd.DataFrame: - r"""Make npy files containing arrays - for each split of a frame classification dataset. - - All the npy files for each split are saved - in a new directory inside ``dataset_path`` - that has the same name as the split. + r"""Make each split of a frame classification dataset. + + This function takes a :class:`pandas.Dataframe` returned by + :func:`vak.prep.spectrogram_dataset.prep_spectrogram_dataset` + or :func:`vak.prep.audio_dataset.prep_audio_dataset`, + after it has been assigned a `'split'` column, + and then copies, moves, or generates the required files + as appropriate for each split. + + For each unique `'split'` in the :class:`pandas.Dataframe`, + a directory is made inside ``dataset_path``. + At a high level, all files needed for working with that split + will be in that directory E.g., the ``train`` directory inside ``dataset_path`` would have all the files for every row in ``dataset_df`` for which ``dataset_df['split'] == 'train'``. - The function creates two npy files for each row in ``dataset_df``. - One has the extension '.frames.npy` and contains the input - to the frame classification model. The other has the extension - '.frame_labels.npy', and contains a vector + The inputs to the neural network model + are moved or copied into the split directory, + or generated if necessary. + If the ``input_type`` is `'audio'`, + then the audio files are copied from their original directory. 
+ If the ``input_type`` is `'spect'`, + and the spectrogram files are already + in ``dataset_path``, they are moved into the split directory + (under the assumption they were generated + by ``vak.prep.spectrogram_dataset.audio_helper``). + If they are npz files, but they are not in ``dataset_path``, + then they are validated to make sure they have the appropriate keys, + and then copied into the split directory. + This could be the case if the files were generated + by another program. + If they are mat files, they will be converted to npz + with the default keys for arrays, + and then saved in a new npz file in the split directory. + This step is required so that all datasets + prepared by :mod:`vak` are in a "normalized" or + "canonicalized" format. + + In addition to copying or moving the audio or spectrogram + files that are inputs to the neural network model, + other npy files are made for each split + and saved in the corresponding directory. + This function creates one npy file for each row in ``dataset_df``. + It has the extension '.frame_labels.npy', and contains a vector where each element is the target label that the network should predict for the corresponding frame. - Taken together, these two files are the data + Taken together, the audio or spectrogram file in each row + along with its corresponding frame labels are the data for each sample :math:`(x, y)` in the dataset, - where :math:`x_t` is the frames and :math:`y_t` is the frame labels. + where :math:`x_t` supplies the "frames", and :math:`y_t` is the frame labels. This function also creates two additional npy files for each split. These npy files are "indexing" vectors that @@ -132,9 +188,9 @@ def make_npy_files_for_each_split( to avoid loading the entire dataset into memory, and to avoid working with memory-mapped arrays. The first is the ``sample_ids`` vector, - that represents the "ID" of any sample :math:`(x, y)` in the dataset. 
+ that represents the "ID" of any sample :math:`(x, y)` in the split. We use these IDs to load the array files corresponding to the samples. - For a dataset with :math:`m` samples, this will be an array of length :math:`T`, + For a split with :math:`m` samples, this will be an array of length :math:`T`, the total number of frames across all samples, with elements :math:`i \in (0, 1, ..., m - 1)` indicating which frames correspond to which sample :math:`m_i`: @@ -176,6 +232,8 @@ def make_npy_files_for_each_split( Key for accessing spectrogram in files. Default is 's'. timebins_key : str Key for accessing vector of time bins in files. Default is 't'. + freqbins_key : str + key for accessing vector of frequency bins in files. Default is 'f'. Returns ------- @@ -190,6 +248,12 @@ def make_npy_files_for_each_split( f"Value for ``input_type`` was: {input_type}" ) + if input_type == "audio" and audio_format is None: + raise ValueError( + f"Value for `input_type` was 'audio' but `audio_format` is None. " + f"Please specify the audio format." 
+ ) + dataset_df_out = [] splits = [ split @@ -197,6 +261,7 @@ def make_npy_files_for_each_split( if split != "None" ] for split in splits: + logger.info(f"Making split for dataset: {split}") split_subdir = dataset_path / split split_subdir.mkdir() @@ -221,7 +286,9 @@ def make_npy_files_for_each_split( source_paths = split_df["spect_path"].values else: raise ValueError(f"Invalid ``input_type``: {input_type}") - # do this *again* after sorting the dataframe + source_paths = [pathlib.Path(source_path) for source_path in source_paths] + + # we get annots again, *after* sorting the dataframe if purpose != "predict": annots = common.annotation.from_df(split_df) else: @@ -235,32 +302,50 @@ def _save_dataset_arrays_and_return_index_arrays( Defined in-line so variables are in scope """ source_id, source_path, annot = source_id_path_annot_tup - source_path = pathlib.Path(source_path) if input_type == "audio": + # we always copy audio to the split directory, to avoid damaging source data + frames_path = shutil.copy(source_path, split_subdir) + # after copying, we load frames to compute frame labels frames, samplefreq = common.constants.AUDIO_FORMAT_FUNC_MAP[ audio_format ](source_path) if ( audio_format == "cbin" - ): # convert to ~wav, from int16 to float64 + ): # convert to ~wav, from int16 to float64 frames = frames.astype(np.float64) / 32768.0 if annot: frame_times = np.arange(frames.shape[-1]) / samplefreq elif input_type == "spect": - spect_dict = np.load(source_path) + if source_path.suffix.endswith('mat'): + spect_dict = common.files.spect.load(source_path, "mat") + # convert to .npz and save in the split directory + spect_dict_npz = { + "s": spect_dict[spect_key], + "t": spect_dict[timebins_key], + "f": spect_dict[freqbins_key], + } + frames_path = split_subdir / ( + source_path.stem + ".npz" + ) + np.savez(frames_path, **spect_dict_npz) + elif source_path.suffix.endswith('npz'): + spect_dict = common.files.spect.load(source_path, "npz") + if
source_path.is_relative_to(dataset_path): + # it's already in dataset_path, we just move it into the split + frames_path = shutil.move(source_path, split_subdir) + else: + # it's somewhere else we copy it to be safe + if not all([key in spect_dict for key in ('s', 't', 'f')]): + raise ValueError( + f"The following spectrogram file did not have valid keys: {source_path}\n." + f"All npz files should have keys 's', 't', 'f' corresponding to the spectrogram," + f"the frequencies vector, and the time vector." + ) + frames_path = shutil.copy(source_path, split_subdir) frames = spect_dict[spect_key] if annot: frame_times = spect_dict[timebins_key] - frames_npy_path = split_subdir / ( - source_path.stem - + datasets.frame_classification.constants.FRAMES_ARRAY_EXT - ) - np.save(frames_npy_path, frames) - frames_npy_path = str( - # make sure we save path in csv as relative to dataset root - frames_npy_path.relative_to(dataset_path) - ) n_frames = frames.shape[-1] sample_id_vec = np.ones((n_frames,)).astype(np.int32) * source_id @@ -288,9 +373,13 @@ def _save_dataset_arrays_and_return_index_arrays( else: frame_labels_npy_path = None + # Rewrite ``frames_path`` as relative to root + # because all functions and classes downstream expect this + frames_path = pathlib.Path(frames_path).relative_to(dataset_path) + return Sample( source_id, - frames_npy_path, + frames_path, frame_labels_npy_path, sample_id_vec, inds_in_sample_vec, @@ -338,19 +427,31 @@ def _save_dataset_arrays_and_return_index_arrays( inds_in_sample_vec, ) - frame_npy_paths = [str(sample.frame_npy_path) for sample in samples] + # We convert `frames_paths` back to string + # (just in case they are pathlib.Paths) before adding back to dataframe. + # Note that these are all in split dirs, written relative to ``dataset_path``. 
+ frames_paths = [str(sample.source_path) for sample in samples] split_df[ - datasets.frame_classification.constants.FRAMES_NPY_PATH_COL_NAME - ] = frame_npy_paths + datasets.frame_classification.constants.FRAMES_PATH_COL_NAME + ] = frames_paths frame_labels_npy_paths = [ - str(sample.frame_labels_npy_path) for sample in samples + sample.frame_labels_npy_path + if isinstance(sample.frame_labels_npy_path, str) else None + for sample in samples ] split_df[ datasets.frame_classification.constants.FRAME_LABELS_NPY_PATH_COL_NAME ] = frame_labels_npy_paths dataset_df_out.append(split_df) + # ---- clean up + # Remove any spect npz files that were *not* added to a split + spect_npz_files_not_in_split = sorted(dataset_path.glob(f'*{common.constants.SPECT_NPZ_EXTENSION}')) + if len(spect_npz_files_not_in_split) > 0: + for spect_npz_file in spect_npz_files_not_in_split: + spect_npz_file.unlink() + # we reset the entire index across all splits, instead of repeating indices, # and we set drop=False because we don't want to add a new column 'index' or 'level_0' dataset_df_out = pd.concat(dataset_df_out).reset_index(drop=True) diff --git a/src/vak/prep/frame_classification/source_files.py b/src/vak/prep/frame_classification/source_files.py new file mode 100644 index 000000000..6b9fd1333 --- /dev/null +++ b/src/vak/prep/frame_classification/source_files.py @@ -0,0 +1,179 @@ +import logging +import pathlib + +import pandas as pd + +from ...common.converters import expanded_user_path, labelset_to_set +from .. 
import constants +from ..audio_dataset import prep_audio_dataset +from ..spectrogram_dataset.prep import prep_spectrogram_dataset + + +logger = logging.getLogger(__name__) + + +def get_or_make_source_files( + data_dir: str | pathlib.Path, + input_type: str, + audio_format: str | None = None, + spect_format: str | None = None, + spect_params: dict | None = None, + spect_output_dir: str | pathlib.Path | None = None, + annot_format: str | None = None, + annot_file: str | pathlib.Path | None = None, + labelset: set | None = None, + audio_dask_bag_kwargs: dict | None = None, +) -> pd.DataFrame: + """Get source files for a dataset, or make them. + + Gets either audio or spectrogram files from ``data_dir``, + possibly paired with annotation files. + + If ``input_type`` is ``'audio'``, then this function will look + for files with the extension for ``audio_format`` in ``data_dir``. + If ``input_type`` is ``'spect'``, and ``spect_format`` is specified, + then this function will look for files with the extension for that format + in ``data_dir``. If ``input_type`` is ``'spect'``, + and ``audio_format`` is specified, + this function will look for audio files with that extension + and then generate spectrograms for them using ``spect_params``. + If an ``annot_format`` is specified, this function will additionally + look for annotation files for the audio or spectrogram files. + If all annotations are in a single file, this can be specified + with the ``annot_file`` parameter, and that will be used instead + of looking for other annotation files. + + Parameters + ---------- + data_dir : str, Path + Path to directory with files from which to make dataset. + input_type : str + The type of input to the neural network model. + One of {'audio', 'spect'}. + audio_format : str + Format of audio files. One of {'wav', 'cbin'}. + Default is ``None``, but either ``audio_format`` or ``spect_format`` + must be specified.
+ spect_format : str + Format of files containing spectrograms as 2-d matrices. One of {'mat', 'npz'}. + Default is None, but either audio_format or spect_format must be specified. + spect_params : dict, vak.config.SpectParams + Parameters for creating spectrograms. Default is ``None``. + spect_output_dir : str + Path to location where spectrogram files should be saved. + Default is None. If ``input_type`` is ``'spect'``, + then ``spect_output_dir`` defaults to ``data_dir``. + annot_format : str + Format of annotations. Any format that can be used with the + :mod:`crowsetta` library is valid. Default is ``None``. + annot_file : str + Path to a single annotation file. Default is ``None``. + Used when a single file contains annotations for multiple audio + or spectrogram files. + audio_dask_bag_kwargs : dict + Keyword arguments used when calling :func:`dask.bag.from_sequence` + inside :func:`vak.io.audio`, where it is used to parallelize + the conversion of audio files into spectrograms. + Option should be specified in config.toml file as an inline table, + e.g., ``audio_dask_bag_kwargs = { npartitions = 20 }``. + Allows for finer-grained control + when needed to process files of different sizes. + labelset : str, list, set + Set of unique labels for vocalizations. Strings or integers. + Default is ``None``. If not ``None``, then files will be skipped + where the associated annotation + contains labels not found in ``labelset``. + ``labelset`` is converted to a Python ``set`` using + :func:`vak.common.converters.labelset_to_set`. + See help for that function for details on how to specify ``labelset``. + + Returns + ------- + source_files_df : pandas.DataFrame + Source files that will become the dataset, + represented as a pandas.DataFrame. + Each row corresponds to one sample in the dataset, + either an audio file or spectrogram file, + possibly paired with annotations.
+ """ + if input_type not in constants.INPUT_TYPES: + raise ValueError( + f"``input_type`` must be one of: {constants.INPUT_TYPES}\n" + f"Value for ``input_type`` was: {input_type}" + ) + + if input_type == "audio" and spect_format is not None: + raise ValueError( + f"Input type was 'audio' but a ``spect_format`` was specified: '{spect_format}'. " + f"Please specify ``audio_format`` only." + ) + + if input_type == "audio" and audio_format is None: + raise ValueError( + "Input type was 'audio' but no ``audio_format`` was specified. " + ) + + if audio_format is None and spect_format is None: + raise ValueError( + "Must specify either ``audio_format`` or ``spect_format``" + ) + + if audio_format and spect_format: + raise ValueError( + "Cannot specify both ``audio_format`` and ``spect_format``, " + "unclear whether to create spectrograms from audio files or " + "use already-generated spectrograms from array files" + ) + + if labelset is not None: + labelset = labelset_to_set(labelset) + + data_dir = expanded_user_path(data_dir) + if not data_dir.is_dir(): + raise NotADirectoryError( + f"Path specified for ``data_dir`` not found: {data_dir}" + ) + + if annot_file is not None: + annot_file = expanded_user_path(annot_file) + if not annot_file.exists(): + raise FileNotFoundError( + f"Path specified for ``annot_file`` not found: {annot_file}" + ) + + if input_type == "spect": + source_files_df = prep_spectrogram_dataset( + data_dir, + annot_format, + labelset, + annot_file, + audio_format, + spect_format, + spect_params, + spect_output_dir, + audio_dask_bag_kwargs, + ) + if source_files_df.empty: + raise ValueError( + "Calling `vak.prep.spectrogram_dataset.prep_spectrogram_dataset` " + "with arguments passed to `vak.prep.prep_frame_classification_dataset` " + "returned an empty dataframe.\n" + "Please double-check arguments to `prep_frame_classification_dataset` function." 
+ ) + + elif input_type == "audio": + source_files_df = prep_audio_dataset( + audio_format, + data_dir, + annot_format, + labelset, + ) + if source_files_df.empty: + raise ValueError( + "Calling `vak.prep.audio_dataset.prep_audio_dataset` " + "with arguments passed to `vak.prep.prep_frame_classification_dataset` " + "returned an empty dataframe.\n" + "Please double-check arguments to `prep_frame_classification_dataset` function." + ) + + return source_files_df diff --git a/src/vak/prep/spectrogram_dataset/audio_helper.py b/src/vak/prep/spectrogram_dataset/audio_helper.py index 2c84a7f2a..f5e6a17a0 100644 --- a/src/vak/prep/spectrogram_dataset/audio_helper.py +++ b/src/vak/prep/spectrogram_dataset/audio_helper.py @@ -236,7 +236,7 @@ def _spect_file(audio_file): } basename = os.path.basename(audio_file) npz_fname = os.path.join( - os.path.normpath(output_dir), basename + ".spect.npz" + os.path.normpath(output_dir), basename + constants.SPECT_NPZ_EXTENSION ) np.savez(npz_fname, **spect_dict) return npz_fname diff --git a/src/vak/prep/spectrogram_dataset/prep.py b/src/vak/prep/spectrogram_dataset/prep.py index 323ddbd56..d36266403 100644 --- a/src/vak/prep/spectrogram_dataset/prep.py +++ b/src/vak/prep/spectrogram_dataset/prep.py @@ -2,13 +2,12 @@ import logging import pathlib -from datetime import datetime import attrs import crowsetta import pandas as pd -from ...common import annotation +from ...common import annotation, constants from ...common.converters import expanded_user_path, labelset_to_set from ...config.spect_params import SpectParamsConfig from . import audio_helper, spect_helper @@ -70,10 +69,8 @@ def prep_spectrogram_dataset( Parameters for creating spectrograms. Default is None (implying that spectrograms are already made). spect_output_dir : str - path to location where spectrogram files should be saved. + Path to location where spectrogram files should be saved. Default is None, in which case it defaults to ``data_dir``. 
- A new directory will be created in ``spect_output_dir`` with - the name 'spectrograms_generated_{time stamp}'. audio_dask_bag_kwargs : dict Keyword arguments used when calling ``dask.bag.from_sequence`` inside ``vak.io.audio``, where it is used to parallelize @@ -85,8 +82,17 @@ def prep_spectrogram_dataset( Returns ------- - dataset_df : pandas.DataFrame - The dataset prepared from the directory specified + source_files_df : pandas.DataFrame + A set of source files that will be used to prepare a + data set for use with neural network models, + represented as a :class:`pandas.DataFrame`. + Will contain paths to spectrogram files, + possibly paired with annotation files, + as well as the original audio files if the + spectrograms were generated from audio by + :func:`vak.prep.audio_helper.make_spectrogram_files_from_audio_files`. + The columns of the dataframe are specified by + :const:`vak.prep.spectrogram_dataset.spect_helper.DF_COLUMNS`. """ # ---- pre-conditions ---------------------------------------------------------------------------------------------- if labelset is not None: @@ -115,11 +121,6 @@ def prep_spectrogram_dataset( else: spect_output_dir = data_dir - timenow = datetime.now().strftime("%y%m%d_%H%M%S") - spect_dirname = f"spectrograms_generated_{timenow}" - spect_output_dir = spect_output_dir / spect_dirname - spect_output_dir.mkdir() - if annot_format is not None: if annot_file is None: annot_files = annotation.files_from_dir( @@ -158,10 +159,11 @@ def prep_spectrogram_dataset( dask_bag_kwargs=audio_dask_bag_kwargs, ) spect_format = "npz" - spect_ext = ".spect.npz" + spect_ext = constants.SPECT_NPZ_EXTENSION else: # if audio format is None spect_files = None - spect_ext = None + # make sure we use the vak extension for spectrogram files + spect_ext = constants.SPECT_FORMAT_EXT_MAP[spect_format] make_dataframe_kwargs = { "spect_format": spect_format, @@ -169,7 +171,6 @@ def prep_spectrogram_dataset( "annot_list": annot_list, "annot_format": 
annot_format, "spect_ext": spect_ext, - "spect_output_dir": spect_output_dir, } if ( @@ -196,7 +197,7 @@ def prep_spectrogram_dataset( ]: make_dataframe_kwargs[key] = spect_params[key] - dataset_df = spect_helper.make_dataframe_of_spect_files( + source_files_df = spect_helper.make_dataframe_of_spect_files( **make_dataframe_kwargs ) - return dataset_df + return source_files_df diff --git a/src/vak/prep/spectrogram_dataset/spect_helper.py b/src/vak/prep/spectrogram_dataset/spect_helper.py index a29ebe485..ab692806e 100644 --- a/src/vak/prep/spectrogram_dataset/spect_helper.py +++ b/src/vak/prep/spectrogram_dataset/spect_helper.py @@ -1,7 +1,8 @@ -"""function that converts a set of array files (.npz, .mat) containing spectrograms -into a pandas DataFrame that represents a dataset used by ``vak`` +"""Function that converts a set of array files (.npz, .mat) containing spectrograms +into a pandas DataFrame that represents a dataset used by ``vak``. -the returned DataFrame has columns as specified by vak.io.spect.DF_COLUMNS +The columns of the dataframe are specified by + :const:`vak.prep.spectrogram_dataset.spect_helper.DF_COLUMNS`. 
""" from __future__ import annotations @@ -15,7 +16,7 @@ from ...common import constants, files from ...common.annotation import map_annotated_to_annot -from ...common.converters import expanded_user_path, labelset_to_set +from ...common.converters import labelset_to_set logger = logging.getLogger(__name__) @@ -34,7 +35,6 @@ def make_dataframe_of_spect_files( spect_format: str, spect_dir: str | pathlib.Path | None = None, - spect_output_dir: str | pathlib.Path | None = None, spect_files: list | None = None, spect_ext: str | None = None, annot_list: list | None = None, @@ -46,60 +46,63 @@ def make_dataframe_of_spect_files( spect_key: str = "s", audio_path_key: str = "audio_path", ) -> pd.DataFrame: - """Creates a dataset of spectrogram files from a directory, + """Get a set of spectrogram files from a directory, optionally paired with an annotation file or files, - and returns a Pandas DataFrame that represents the dataset. + and returns a Pandas DataFrame that represents all the files. Spectrogram files are array in npz files created by numpy or in mat files created by Matlab. - If files are in mat format, they will be converted to npz - with the default keys for arrays, and saved in - ``spect_output_dir``. This step is required so that all dataset - prepared by :mod:`vak` are in a "normalized" or - "canonicalized" format. If no ``spect_output_dir`` is provided - when the ``spect_format`` is ``'mat'``, then this function - will raise an error. Parameters ---------- spect_format : str - format of files containing spectrograms. One of {'mat', 'npz'} + Format of files containing spectrograms. One of {'mat', 'npz'} spect_dir : str - path to directory of files containing spectrograms as arrays. + Path to directory of files containing spectrograms as arrays. Default is None. spect_files : list List of paths to array files. Default is None. annot_list : list - of annotations for array files. Default is None + List of annotations for array files. 
Default is None annot_format : str - name of annotation format. Added as a column to the DataFrame if specified. + Name of annotation format. Added as a column to the DataFrame if specified. Used by other functions that open annotation files via their paths from the DataFrame. Should be a format that the crowsetta library recognizes. Default is None. labelset : str, list, set - of str or int, set of unique labels for vocalizations. Default is None. + Set of unique labels for vocalizations, of str or int. Default is None. If not None, then files will be skipped where the associated annotation contains labels not found in ``labelset``. - ``labelset`` is converted to a Python ``set`` using ``vak.converters.labelset_to_set``. + ``labelset`` is converted to a Python ``set`` using + :func:`vak.common.converters.labelset_to_set`. See help for that function for details on how to specify labelset. n_decimals_trunc : int - number of decimal places to keep when truncating the timebin duration calculated from - the vector of time bins. + number of decimal places to keep when truncating the time + bin duration calculated from the vector of time bins. Default is 3, i.e. assumes milliseconds is the last significant digit. freqbins_key : str - key for accessing vector of frequency bins in files. Default is 'f'. + Key for accessing vector of frequency bins in files. Default is 'f'. timebins_key : str - key for accessing vector of time bins in files. Default is 't'. + Key for accessing vector of time bins in files. Default is 't'. spect_key : str - key for accessing spectrogram in files. Default is 's'. + Key for accessing spectrogram in files. Default is 's'. audio_path_key : str - key for accessing path to source audio file for spectrogram in files. + Key for accessing path to source audio file for spectrogram in files. Default is 'audio_path'. Returns ------- - dataset_df : pandas.Dataframe - Dataframe that represents a dataset of vocalizations. 
+ source_files_df : pandas.DataFrame + A set of source files that will be used to prepare a + data set for use with neural network models, + represented as a :class:`pandas.DataFrame`. + Will contain paths to spectrogram files, + possibly paired with annotation files, + as well as the original audio files if the + spectrograms were generated from audio by + :func:`vak.prep.audio_helper.make_spectrogram_files_from_audio_files`. + The columns of the dataframe are specified by + :const:`vak.prep.spectrogram_dataset.spect_helper.DF_COLUMNS`. Notes ----- @@ -120,14 +123,6 @@ def make_dataframe_of_spect_files( f"format '{spect_format}' not recognized." ) - if spect_format == "mat" and spect_output_dir is None: - raise ValueError( - "Must provide ``spect_output_dir`` when ``spect_format`` is '.mat'." - "so that array files can be converted to npz format. " - "This is needed so that all datasets are in a standardized, " - "canonical format that other functions in the library expect." - ) - if all([arg is None for arg in (spect_dir, spect_files)]): raise ValueError( "must specify one of: spect_dir, spect_files" @@ -151,13 +146,6 @@ def make_dataframe_of_spect_files( if labelset is not None: labelset = labelset_to_set(labelset) - if spect_output_dir: - spect_output_dir = expanded_user_path(spect_output_dir) - if not spect_output_dir.is_dir(): - raise NotADirectoryError( - f"spect_output_dir not found: {spect_output_dir}" - ) - # ---- get a list of spectrogram files + associated annotation files ----------------------------------------------- if spect_dir: # then get spect_files from that dir # note we already validated format above @@ -217,11 +205,14 @@ def make_dataframe_of_spect_files( # ---- actually make the dataframe --------------------------------------------------------------------------------- # this is defined here so all other arguments to 'to_dataframe' are in scope def _to_record(spect_annot_tuple): - """helper function that enables parallelized creation of 
"records", - i.e. rows for dataframe, from . - Accepts a two-element tuple containing (1) a dictionary that represents a spectrogram + """helper function that enables parallelized creation + of "records", i.e. rows for dataframe. + Accepts a two-element tuple containing + (1) a dictionary that represents a spectrogram and (2) annotation for that file""" spect_path, annot = spect_annot_tuple + spect_path = pathlib.Path(spect_path) + spect_dict = files.spect.load(spect_path, spect_format) spect_dur = spect_dict[spect_key].shape[-1] * timebin_dur @@ -236,18 +227,6 @@ def _to_record(spect_annot_tuple): # (or an error) audio_path = files.spect.find_audio_fname(spect_path) - if spect_format == "mat": - # convert to .npz and save in spect_output_dir - spect_dict_npz = { - "s": spect_dict[spect_key], - "t": spect_dict[timebins_key], - "f": spect_dict[freqbins_key], - } - spect_path = spect_output_dir / ( - pathlib.Path(spect_path).stem + ".npz" - ) - np.savez(spect_path, **spect_dict_npz) - if annot is not None: annot_path = annot.annot_path else: diff --git a/src/vak/train/frame_classification.py b/src/vak/train/frame_classification.py index 925b27d0e..256daaa84 100644 --- a/src/vak/train/frame_classification.py +++ b/src/vak/train/frame_classification.py @@ -45,7 +45,7 @@ def train_frame_classification_model( ckpt_step: int | None = None, patience: int | None = None, device: str | None = None, - split: str = "train", + subset: str | None = None, ) -> None: """Train a model from the frame classification family and save results. @@ -141,11 +141,11 @@ def train_frame_classification_model( number of validation steps to wait without performance on the validation set improving before stopping the training. Default is None, in which case training only stops after the specified number of epochs. - split : str - Name of split from dataset found at ``dataset_path`` to use - when training model. Default is 'train'. 
This parameter is used by - `vak.learncurve.learncurve` to specify specific subsets of the - training set to use when training models for a learning curve. + subset : str + Name of a subset from the training split of the dataset + to use when training model. This parameter is used by + :func:`vak.learncurve.learncurve` to specify subsets + when training models for a learning curve. """ for path, path_name in zip( (checkpoint_path, spect_scaler_path), @@ -221,7 +221,8 @@ def train_frame_classification_model( logger.info("will normalize spectrograms") spect_standardizer = transforms.StandardizeSpect.fit_dataset_path( dataset_path, - split=split, + split="train", + subset=subset, ) joblib.dump( spect_standardizer, results_path.joinpath("StandardizeSpect") @@ -249,7 +250,8 @@ def train_frame_classification_model( train_dataset_params = {} train_dataset = WindowDataset.from_dataset_path( dataset_path=dataset_path, - split=split, + split="train", + subset=subset, transform=transform, target_transform=target_transform, **train_dataset_params, diff --git a/src/vak/train/parametric_umap.py b/src/vak/train/parametric_umap.py index a254397ed..675dac90d 100644 --- a/src/vak/train/parametric_umap.py +++ b/src/vak/train/parametric_umap.py @@ -94,7 +94,7 @@ def train_parametric_umap_model( val_step: int | None = None, ckpt_step: int | None = None, device: str | None = None, - split: str = "train", + subset: str | None = None, ) -> None: """Train a model from the parametric UMAP family and save results. 
@@ -228,7 +228,8 @@ def train_parametric_umap_model( train_dataset_params = {} train_dataset = ParametricUMAPDataset.from_dataset_path( dataset_path=dataset_path, - split=split, + split="train", + subset=subset, transform=transform, **train_dataset_params, ) diff --git a/src/vak/train/train_.py b/src/vak/train/train_.py index c25046827..79ee2897f 100644 --- a/src/vak/train/train_.py +++ b/src/vak/train/train_.py @@ -32,7 +32,7 @@ def train( ckpt_step: int | None = None, patience: int | None = None, device: str | None = None, - split: str = "train", + subset: str | None = None, ): """Train a model and save results. @@ -185,7 +185,7 @@ def train( ckpt_step=ckpt_step, patience=patience, device=device, - split=split, + subset=subset, ) elif model_family == "ParametricUMAPModel": train_parametric_umap_model( @@ -205,7 +205,7 @@ def train( val_step=val_step, ckpt_step=ckpt_step, device=device, - split=split, + subset=subset, ) else: raise ValueError(f"Model family not recognized: {model_family}") diff --git a/src/vak/transforms/defaults/frame_classification.py b/src/vak/transforms/defaults/frame_classification.py index 2b5733b18..c4abd0f34 100644 --- a/src/vak/transforms/defaults/frame_classification.py +++ b/src/vak/transforms/defaults/frame_classification.py @@ -103,7 +103,7 @@ def __init__( self.annot_transform = vak_transforms.ToLongTensor() - def __call__(self, frames, frame_labels, source_path=None): + def __call__(self, frames, frame_labels, frames_path=None): if self.spect_standardizer: frames = self.spect_standardizer(frames) @@ -124,8 +124,9 @@ def __call__(self, frames, frame_labels, source_path=None): if padding_mask is not None: item["padding_mask"] = padding_mask - if source_path is not None: - item["source_path"] = source_path + if frames_path is not None: + # make sure frames_path is a str, not a pathlib.Path + item["frames_path"] = str(frames_path) return item @@ -171,7 +172,7 @@ def __init__( ] ) - def __call__(self, frames, source_path=None): + def 
__call__(self, frames, frames_path=None): if self.spect_standardizer: frames = self.spect_standardizer(frames) @@ -190,8 +191,9 @@ def __call__(self, frames, source_path=None): if padding_mask is not None: item["padding_mask"] = padding_mask - if source_path is not None: - item["source_path"] = source_path + if frames_path is not None: + # make sure frames_path is a str, not a pathlib.Path + item["frames_path"] = str(frames_path) return item diff --git a/src/vak/transforms/transforms.py b/src/vak/transforms/transforms.py index 9e2c66935..298c7f475 100644 --- a/src/vak/transforms/transforms.py +++ b/src/vak/transforms/transforms.py @@ -5,6 +5,7 @@ import numpy as np import pandas as pd +from ..common import constants from ..common.validators import column_or_1d from . import functional as F @@ -76,7 +77,7 @@ def __init__(self, mean_freqs=None, std_freqs=None, non_zero_std=None): self.non_zero_std = non_zero_std @classmethod - def fit_dataset_path(cls, dataset_path, split="train"): + def fit_dataset_path(cls, dataset_path, split="train", subset: str | None = None): """Returns a :class:`StandardizeSpect` instance that is fit to a split from a dataset, given the path to that dataset and the @@ -99,14 +100,18 @@ def fit_dataset_path(cls, dataset_path, split="train"): dataset_path = pathlib.Path(dataset_path) metadata = Metadata.from_dataset_path(dataset_path) + input_type = metadata.input_type dataset_csv_path = dataset_path / metadata.dataset_csv_filename dataset_path = dataset_csv_path.parent - df = pd.read_csv(dataset_csv_path) - df = df[df["split"] == split].copy() - frames_paths = df[ - frame_classification.constants.FRAMES_NPY_PATH_COL_NAME + dataset_df = pd.read_csv(dataset_csv_path) + if subset: + dataset_df = dataset_df[dataset_df.split == split].copy() + else: + dataset_df = dataset_df[dataset_df.split == split].copy() + frames_paths = dataset_df[ + frame_classification.constants.FRAMES_PATH_COL_NAME ].values - frames = np.load(dataset_path / frames_paths[0]) 
+ frames = np.load(dataset_path / frames_paths[0])[constants.SPECT_KEY] # in files, spectrograms are in orientation (freq bins, time bins) # so we take mean and std across columns, i.e. time bins, i.e. axis 1 @@ -114,7 +119,7 @@ def fit_dataset_path(cls, dataset_path, split="train"): std_freqs = np.std(frames, axis=1) for frames_path in frames_paths[1:]: - frames = np.load(dataset_path / frames_path) + frames = np.load(dataset_path / frames_path)[constants.SPECT_KEY] mean_freqs += np.mean(frames, axis=1) std_freqs += np.std(frames, axis=1) mean_freqs = mean_freqs / len(frames_paths) diff --git a/tests/data_for_tests/configs/TweetyNet_learncurve_audio_cbin_annot_notmat.toml b/tests/data_for_tests/configs/TweetyNet_learncurve_audio_cbin_annot_notmat.toml index b0c9e6662..0922283e8 100644 --- a/tests/data_for_tests/configs/TweetyNet_learncurve_audio_cbin_annot_notmat.toml +++ b/tests/data_for_tests/configs/TweetyNet_learncurve_audio_cbin_annot_notmat.toml @@ -3,7 +3,7 @@ dataset_type = "frame classification" input_type = "spect" data_dir = "./tests/data_for_tests/source/audio_cbin_annot_notmat/gy6or6/032312" output_dir = "./tests/data_for_tests/generated/prep/learncurve/audio_cbin_annot_notmat/TweetyNet" -audio_format = "cbin" +spect_format = "npz" annot_format = "notmat" labelset = "iabcdefghjk" train_dur = 50 diff --git a/tests/data_for_tests/configs/TweetyNet_predict_audio_cbin_annot_notmat.toml b/tests/data_for_tests/configs/TweetyNet_predict_audio_cbin_annot_notmat.toml index 61af4b692..da6a9175c 100644 --- a/tests/data_for_tests/configs/TweetyNet_predict_audio_cbin_annot_notmat.toml +++ b/tests/data_for_tests/configs/TweetyNet_predict_audio_cbin_annot_notmat.toml @@ -3,7 +3,7 @@ dataset_type = "frame classification" input_type = "spect" data_dir = "./tests/data_for_tests/source/audio_cbin_annot_notmat/gy6or6/032412" output_dir = "./tests/data_for_tests/generated/prep/predict/audio_cbin_annot_notmat/TweetyNet" -audio_format = "cbin" +spect_format = "npz" 
[SPECT_PARAMS] fft_size = 512 diff --git a/tests/data_for_tests/configs/TweetyNet_predict_audio_wav_annot_birdsongrec.toml b/tests/data_for_tests/configs/TweetyNet_predict_audio_wav_annot_birdsongrec.toml deleted file mode 100644 index 769dfba72..000000000 --- a/tests/data_for_tests/configs/TweetyNet_predict_audio_wav_annot_birdsongrec.toml +++ /dev/null @@ -1,41 +0,0 @@ -[PREP] -dataset_type = "frame classification" -input_type = "spect" -data_dir = "./tests/data_for_tests/source/audio_wav_annot_birdsongrec/Bird0" -output_dir = "./tests/data_for_tests/generated/prep/predict/audio_wav_annot_birdsong-recognition-dataset/TweetyNet" -audio_format = "wav" - -[SPECT_PARAMS] -fft_size = 512 -step_size = 64 -freq_cutoffs = [ 500, 10000,] -thresh = 6.25 -transform_type = "log_spect" - -[PREDICT] -spect_scaler_path = "/home/user/results_181014_194418/spect_scaler" -checkpoint_path = "~/Documents/repos/coding/birdsong/TweetyNet/results/BFSongRepository/bl26lb16/results_200620_164245/TweetyNet/checkpoints/max-val-acc-checkpoint.pt" -labelmap_path = "~/Documents/repos/coding/birdsong/TweetyNet/results/BFSongRepository/bl26lb16/results_200620_164245/labelmap.json" -model = "TweetyNet" -batch_size = 11 -num_workers = 16 -device = "cuda" -output_dir = "./tests/data_for_tests/generated/results/predict/audio_wav_annot_birdsong-recognition-dataset/TweetyNet" -annot_csv_filename = "Bird0.annot.csv" - -[PREDICT.transform_params] -window_size = 88 - -[TweetyNet.network] -conv1_filters = 8 -conv1_kernel_size = [3, 3] -conv2_filters = 16 -conv2_kernel_size = [5, 5] -pool1_size = [4, 1] -pool1_stride = [4, 1] -pool2_size = [4, 1] -pool2_stride = [4, 1] -hidden_size = 32 - -[TweetyNet.optimizer] -lr = 0.001 diff --git a/tests/data_for_tests/configs/TweetyNet_train_audio_wav_annot_birdsongrec.toml b/tests/data_for_tests/configs/TweetyNet_train_audio_wav_annot_birdsongrec.toml deleted file mode 100644 index e3988e6ab..000000000 --- 
a/tests/data_for_tests/configs/TweetyNet_train_audio_wav_annot_birdsongrec.toml +++ /dev/null @@ -1,51 +0,0 @@ -[PREP] -dataset_type = "frame classification" -input_type = "spect" -labelset = "012345678" -data_dir = "./tests/data_for_tests/source/audio_wav_annot_birdsongrec/Bird0" -output_dir = "./tests/data_for_tests/generated/prep/train/audio_wav_annot_birdsong-recognition-dataset/TweetyNet" -audio_format = "wav" -annot_format = "birdsong-recognition-dataset" -annot_file = "./tests/data_for_tests/source/audio_wav_annot_birdsongrec/Bird0/Annotation.xml" -test_dur = 50 -train_dur = 15 -val_dur = 30 - -[SPECT_PARAMS] -fft_size = 512 -step_size = 64 -freq_cutoffs = [ 500, 10000,] -thresh = 6.25 -transform_type = "log_spect" - -[TRAIN] -model = "TweetyNet" -normalize_spectrograms = true -batch_size = 11 -num_epochs = 2 -val_step = 50 -ckpt_step = 200 -patience = 4 -num_workers = 16 -device = "cuda" -root_results_dir = "./tests/data_for_tests/generated/results/train/audio_wav_annot_birdsong-recognition-dataset/TweetyNet" - -[TRAIN.train_dataset_params] -window_size = 88 - -[TRAIN.val_transform_params] -window_size = 88 - -[TweetyNet.network] -conv1_filters = 8 -conv1_kernel_size = [3, 3] -conv2_filters = 16 -conv2_kernel_size = [5, 5] -pool1_size = [4, 1] -pool1_stride = [4, 1] -pool2_size = [4, 1] -pool2_stride = [4, 1] -hidden_size = 32 - -[TweetyNet.optimizer] -lr = 0.001 diff --git a/tests/data_for_tests/configs/TweetyNet_train_continue_audio_cbin_annot_notmat.toml b/tests/data_for_tests/configs/TweetyNet_train_continue_audio_cbin_annot_notmat.toml index c53ca4766..932208616 100644 --- a/tests/data_for_tests/configs/TweetyNet_train_continue_audio_cbin_annot_notmat.toml +++ b/tests/data_for_tests/configs/TweetyNet_train_continue_audio_cbin_annot_notmat.toml @@ -3,7 +3,7 @@ dataset_type = "frame classification" input_type = "spect" data_dir = "./tests/data_for_tests/source/audio_cbin_annot_notmat/gy6or6/032312" output_dir = 
"./tests/data_for_tests/generated/prep/train/audio_cbin_annot_notmat/TweetyNet" -audio_format = "cbin" +spect_format = "npz" annot_format = "notmat" labelset = "iabcdefghjk" train_dur = 50 diff --git a/tests/data_for_tests/configs/TweetyNet_train_continue_audio_wav_annot_birdsongrec.toml b/tests/data_for_tests/configs/TweetyNet_train_continue_audio_wav_annot_birdsongrec.toml deleted file mode 100644 index d995aa4d5..000000000 --- a/tests/data_for_tests/configs/TweetyNet_train_continue_audio_wav_annot_birdsongrec.toml +++ /dev/null @@ -1,53 +0,0 @@ -[PREP] -dataset_type = "frame classification" -input_type = "spect" -labelset = "012345678" -data_dir = "./tests/data_for_tests/source/audio_wav_annot_birdsongrec/Bird0" -output_dir = "./tests/data_for_tests/generated/prep/train/audio_wav_annot_birdsong-recognition-dataset/TweetyNet" -audio_format = "wav" -annot_format = "birdsong-recognition-dataset" -annot_file = "./tests/data_for_tests/source/audio_wav_annot_birdsongrec/Bird0/Annotation.xml" -test_dur = 50 -train_dur = 15 -val_dur = 30 - -[SPECT_PARAMS] -fft_size = 512 -step_size = 64 -freq_cutoffs = [ 500, 10000,] -thresh = 6.25 -transform_type = "log_spect" - -[TRAIN] -model = "TweetyNet" -normalize_spectrograms = true -batch_size = 11 -num_epochs = 2 -val_step = 50 -ckpt_step = 200 -patience = 4 -num_workers = 16 -device = "cuda" -root_results_dir = "./tests/data_for_tests/generated/results/train_continue/audio_wav_annot_birdsong-recognition-dataset/TweetyNet" -checkpoint_path = "~/Documents/repos/coding/birdsong/TweetyNet/results/BFSongRepository/gy6or6/results_200620_165308/TweetyNet/checkpoints/max-val-acc-checkpoint.pt" -spect_scaler_path = "~/Documents/repos/coding/birdsong/TweetyNet/results/BFSongRepository/gy6or6/results_200620_165308/StandardizeSpect" - -[TRAIN.train_dataset_params] -window_size = 88 - -[TRAIN.val_transform_params] -window_size = 88 - -[TweetyNet.network] -conv1_filters = 8 -conv1_kernel_size = [3, 3] -conv2_filters = 16 -conv2_kernel_size 
= [5, 5] -pool1_size = [4, 1] -pool1_stride = [4, 1] -pool2_size = [4, 1] -pool2_stride = [4, 1] -hidden_size = 32 - -[TweetyNet.optimizer] -lr = 0.001 diff --git a/tests/data_for_tests/configs/configs.json b/tests/data_for_tests/configs/configs.json index d92c5e674..bff69985e 100644 --- a/tests/data_for_tests/configs/configs.json +++ b/tests/data_for_tests/configs/configs.json @@ -3,120 +3,117 @@ { "filename": "TweetyNet_train_audio_cbin_annot_notmat.toml", "model": "TweetyNet", + "model_family": "frame_classification", "config_type": "train", "audio_format": "cbin", "spect_format": null, "annot_format": "notmat", + "spect_output_dir": "audio_cbin_annot_notmat/gy6or6/032312", + "data_dir": null, "use_dataset_from_config": null, "use_result_from_config": null }, { "filename": "TweetyNet_learncurve_audio_cbin_annot_notmat.toml", "model": "TweetyNet", + "model_family": "frame_classification", "config_type": "learncurve", "audio_format": "cbin", "spect_format": null, "annot_format": "notmat", + "spect_output_dir": null, + "data_dir": "spect-output-dir/audio_cbin_annot_notmat/gy6or6/032312", "use_dataset_from_config": null, "use_result_from_config": null }, { "filename": "TweetyNet_eval_audio_cbin_annot_notmat.toml", "model": "TweetyNet", + "model_family": "frame_classification", "config_type": "eval", "audio_format": "cbin", "spect_format": null, "annot_format": "notmat", + "spect_output_dir": "audio_cbin_annot_notmat/gy6or6/032412", + "data_dir": null, "use_dataset_from_config": null, "use_result_from_config": "TweetyNet_train_audio_cbin_annot_notmat.toml" }, { "filename": "TweetyNet_predict_audio_cbin_annot_notmat.toml", "model": "TweetyNet", + "model_family": "frame_classification", "config_type": "predict", "audio_format": "cbin", "spect_format": null, "annot_format": "notmat", + "spect_output_dir": null, + "data_dir": "spect-output-dir/audio_cbin_annot_notmat/gy6or6/032412", "use_dataset_from_config": null, "use_result_from_config": 
"TweetyNet_train_audio_cbin_annot_notmat.toml" }, { "filename": "TweetyNet_train_continue_audio_cbin_annot_notmat.toml", "model": "TweetyNet", + "model_family": "frame_classification", "config_type": "train_continue", "audio_format": "cbin", "spect_format": null, "annot_format": "notmat", + "spect_output_dir": null, + "data_dir": "spect-output-dir/audio_cbin_annot_notmat/gy6or6/032312", "use_dataset_from_config": "TweetyNet_train_audio_cbin_annot_notmat.toml", "use_result_from_config": "TweetyNet_train_audio_cbin_annot_notmat.toml" }, - { - "filename": "TweetyNet_train_audio_wav_annot_birdsongrec.toml", - "model": "TweetyNet", - "config_type": "train", - "audio_format": "wav", - "spect_format": null, - "annot_format": "birdsong-recognition-dataset", - "use_dataset_from_config": null, - "use_result_from_config": null - }, - { - "filename": "TweetyNet_predict_audio_wav_annot_birdsongrec.toml", - "model": "TweetyNet", - "config_type": "predict", - "audio_format": "wav", - "spect_format": null, - "annot_format": "birdsong-recognition-dataset", - "use_dataset_from_config": null, - "use_result_from_config": "TweetyNet_train_audio_wav_annot_birdsongrec.toml" - }, - { - "filename": "TweetyNet_train_continue_audio_wav_annot_birdsongrec.toml", - "model": "TweetyNet", - "config_type": "train_continue", - "audio_format": "wav", - "spect_format": null, - "annot_format": "birdsong-recognition-dataset", - "use_dataset_from_config": "TweetyNet_train_audio_wav_annot_birdsongrec.toml", - "use_result_from_config": "TweetyNet_train_audio_wav_annot_birdsongrec.toml" - }, { "filename": "TweetyNet_train_spect_mat_annot_yarden.toml", "model": "TweetyNet", + "model_family": "frame_classification", "config_type": "train", "audio_format": null, "spect_format": "mat", "annot_format": "yarden", + "spect_output_dir": null, + "data_dir": null, "use_dataset_from_config": null, "use_result_from_config": null }, { "filename": "TweetyNet_train_continue_spect_mat_annot_yarden.toml", "model": 
"TweetyNet", + "model_family": "frame_classification", "config_type": "train_continue", "audio_format": null, "spect_format": "mat", "annot_format": "yarden", + "spect_output_dir": null, + "data_dir": null, "use_dataset_from_config": "TweetyNet_train_spect_mat_annot_yarden.toml", "use_result_from_config": "TweetyNet_train_spect_mat_annot_yarden.toml" }, { "filename": "ConvEncoderUMAP_train_audio_cbin_annot_notmat.toml", "model": "ConvEncoderUMAP", + "model_family": "parametric_umap", "config_type": "train", "audio_format": "cbin", "spect_format": null, "annot_format": "notmat", + "spect_output_dir": null, + "data_dir": null, "use_dataset_from_config": null, "use_result_from_config": null }, { "filename": "ConvEncoderUMAP_eval_audio_cbin_annot_notmat.toml", "model": "ConvEncoderUMAP", + "model_family": "parametric_umap", "config_type": "eval", "audio_format": "cbin", "spect_format": null, "annot_format": "notmat", + "spect_output_dir": null, + "data_dir": null, "use_dataset_from_config": "ConvEncoderUMAP_train_audio_cbin_annot_notmat.toml", "use_result_from_config": "ConvEncoderUMAP_train_audio_cbin_annot_notmat.toml" } diff --git a/tests/fixtures/__init__.py b/tests/fixtures/__init__.py index e910c8357..ac174ea00 100644 --- a/tests/fixtures/__init__.py +++ b/tests/fixtures/__init__.py @@ -8,6 +8,7 @@ from .device import * from .model import * from .path import * +from .source_files import * from .spect import * from .split import * from .test_data import * diff --git a/tests/fixtures/annot.py b/tests/fixtures/annot.py index f140abb48..40ba6dec0 100644 --- a/tests/fixtures/annot.py +++ b/tests/fixtures/annot.py @@ -87,84 +87,8 @@ def labelset_notmat(generated_test_configs_root): return LABELSET_NOTMAT -ANNOT_FILE_BIRDSONGREC = SOURCE_TEST_DATA_ROOT.joinpath( - "audio_wav_annot_birdsongrec", "Bird0", "Annotation.xml" -) - - -@pytest.fixture -def annot_file_birdsongrec(): - return ANNOT_FILE_BIRDSONGREC - - -scribe_birdsongrec = 
crowsetta.Transcriber(format="birdsong-recognition-dataset") -ANNOT_LIST_BIRDSONGREC = scribe_birdsongrec.from_file(ANNOT_FILE_BIRDSONGREC).to_annot() - - -@pytest.fixture -def annot_list_birdsongrec(): - return ANNOT_LIST_BIRDSONGREC - - -ANNOT_DIR_TEXTGRID = SOURCE_TEST_DATA_ROOT.joinpath("audio_wav_annot_textgrid", "AGBk") - - -@pytest.fixture -def annot_dir_textgrid(): - return ANNOT_DIR_TEXTGRID - - -ANNOT_FILES_TEXTGRID = sorted(ANNOT_DIR_TEXTGRID.glob("*.TextGrid")) - - -@pytest.fixture -def annot_files_textgrid(): - return ANNOT_FILES_TEXTGRID - - -scribe_textgrid = crowsetta.Transcriber(format="textgrid") -ANNOT_LIST_TEXTGRID = [scribe_textgrid.from_file(textgrid).to_annot() - for textgrid in ANNOT_FILES_TEXTGRID] - - -@pytest.fixture -def annot_list_textgrid(): - return ANNOT_LIST_TEXTGRID - - -ANNOT_DIR_SIMPLE_SEQ = SOURCE_TEST_DATA_ROOT.joinpath( - "audio_cbin_annot_simple_seq", "gy6or6", "032312" -) - - -@pytest.fixture -def annot_dir_simple_seq(): - return ANNOT_DIR_SIMPLE_SEQ - - -ANNOT_FILES_SIMPLE_SEQ = sorted(ANNOT_DIR_SIMPLE_SEQ.glob("*.cbin.csv")) - - -@pytest.fixture -def annot_files_simple_seq(): - return ANNOT_FILES_SIMPLE_SEQ - - -scribe_simple_seq = crowsetta.Transcriber(format="simple-seq") -ANNOT_LIST_SIMPLE_SEQ = [scribe_simple_seq.from_file(simpleseq) - for simpleseq in ANNOT_FILES_SIMPLE_SEQ] - - -@pytest.fixture -def annot_list_simple_seq(): - return ANNOT_LIST_SIMPLE_SEQ - - @pytest.fixture -def specific_annot_list(annot_list_birdsongrec, - annot_list_notmat, - annot_list_simple_seq, - annot_list_textgrid, +def specific_annot_list(annot_list_notmat, annot_list_yarden): """factory fixture, returns a function that returns a fixture containing a list of Annotation objects, @@ -173,10 +97,7 @@ def specific_annot_list(annot_list_birdsongrec, so that unit tests can be parameterized with annotation format names """ FORMAT_ANNOT_LIST_FIXTURE_MAP = { - "birdsong-recognition-dataset": annot_list_birdsongrec, "notmat": annot_list_notmat, - 
"simple-seq": annot_list_simple_seq, - "textgrid": annot_list_textgrid, "yarden": annot_list_yarden, } diff --git a/tests/fixtures/audio.py b/tests/fixtures/audio.py index 4753a4097..94121e793 100644 --- a/tests/fixtures/audio.py +++ b/tests/fixtures/audio.py @@ -64,42 +64,8 @@ def audio_list_cbin_labels_not_in_labelset(): return AUDIO_LIST_CBIN_LABELS_NOT_IN_LABELSET -AUDIO_DIR_WAV_BIRDSONGREC = SOURCE_TEST_DATA_ROOT.joinpath("audio_wav_annot_birdsongrec", "Bird0", "Wave") - - -@pytest.fixture -def audio_dir_wav_birdsongrec(): - return AUDIO_DIR_WAV_BIRDSONGREC - - -AUDIO_LIST_WAV_BIRDSONGREC = sorted(AUDIO_DIR_WAV_BIRDSONGREC.glob("*.wav")) - - -@pytest.fixture -def audio_list_wav_birdsongrec(): - return AUDIO_LIST_WAV_BIRDSONGREC - - -AUDIO_DIR_WAV_TEXTGRID = SOURCE_TEST_DATA_ROOT.joinpath("audio_wav_annot_textgrid", "AGBk") - - -@pytest.fixture -def audio_dir_wav_textgrid(): - return AUDIO_DIR_WAV_TEXTGRID - - -AUDIO_LIST_WAV_TEXTGRID = sorted(AUDIO_DIR_WAV_TEXTGRID.glob("*.WAV")) - - -@pytest.fixture -def audio_list_wav_textgrid(): - return AUDIO_LIST_WAV_TEXTGRID - - @pytest.fixture -def audio_list_factory(audio_list_cbin, - audio_list_wav_birdsongrec, - audio_list_wav_textgrid): +def audio_list_factory(audio_list_cbin): """factory fixture, returns a function that returns a fixture containing a list of Annotation objects, given a specified annotation format @@ -108,9 +74,6 @@ def audio_list_factory(audio_list_cbin, """ FORMAT_AUDIO_LIST_FIXTURE_MAP = { "audio_cbin_annot_notmat": audio_list_cbin, - "audio_cbin_annot_simple-seq": audio_list_cbin, - "audio_wav_annot_birdsong-recognition-dataset": audio_list_wav_birdsongrec, - "audio_wav_annot_textgrid": audio_list_wav_textgrid, } def _audio_list_factory(audio_format, annot_format): diff --git a/tests/fixtures/config.py b/tests/fixtures/config.py index ac4c74573..dae6e50f4 100644 --- a/tests/fixtures/config.py +++ b/tests/fixtures/config.py @@ -17,7 +17,7 @@ def test_configs_root(): Two types of config files in 
this directory: 1) those used by the tests/scripts/generate_data_for_tests.py script. - Will be listed in configs.json. See ``specific_config`` fixture below + Will be listed in configs.json. See ``specific_config_toml_path`` fixture below for details about types of configs. 2) those used by tests that are static, e.g., ``invalid_section_config.toml`` @@ -47,7 +47,7 @@ def list_of_schematized_configs(test_configs_root): "annot_format": "notmat" } - The ``specific_config`` factory fixture returns a function that + The ``specific_config_toml_path`` factory fixture returns a function that itself return a configuration ``filename``, when provided values for all of the other keys. """ @@ -88,7 +88,7 @@ def all_generated_configs(): @pytest.fixture -def specific_config(generated_test_configs_root, list_of_schematized_configs, tmp_path): +def specific_config_toml_path(generated_test_configs_root, list_of_schematized_configs, tmp_path): """returns a factory function that will return the path to a specific configuration file, determined by @@ -226,7 +226,7 @@ def _return_toml(toml_path): @pytest.fixture -def specific_config_toml(specific_config): +def specific_config_toml(specific_config_toml_path): """returns a function that will return a dict containing parsed toml from a specific configuration file, determined by @@ -241,7 +241,7 @@ def _specific_config_toml( audio_format=None, spect_format=None, ): - config_path = specific_config( + config_path = specific_config_toml_path( config_type, model, annot_format, audio_format, spect_format ) return _return_toml(config_path) diff --git a/tests/fixtures/source_files.py b/tests/fixtures/source_files.py new file mode 100644 index 000000000..935c80c13 --- /dev/null +++ b/tests/fixtures/source_files.py @@ -0,0 +1,108 @@ +"""Fixtures having to do with source files, i.e., +the "raw" files that go into a data set +used with neural networks +""" +import pandas as pd +import pytest + +from .test_data import GENERATED_TEST_DATA_ROOT + +# 
copied from vaktestdata.constants; could we add this to that with sys.path? or vice versa +GENERATED_SOURCE_FILES_CSV_DIR = GENERATED_TEST_DATA_ROOT / "source-files-csv" +GENERATED_SOURCE_FILES_WITH_SPLITS_CSV_DIR = GENERATED_TEST_DATA_ROOT / "source-files-with-splits-csv" + + +@pytest.fixture +def specific_source_files_csv_path(): + """Factory fixture: returns a function that returns the path to a specific source files csv""" + def _specific_source_files_csv_path( + config_type, + model_name, + annot_format, + audio_format=None, + spect_format=None, + ): + if audio_format and spect_format: + raise ValueError( + "Specify audio_format or spect_format, not both" + ) + if audio_format: + csv_filename = f'{model_name}_{config_type}_audio_{audio_format}_annot_{annot_format}.toml-source-files.csv' + elif spect_format: + csv_filename = f'{model_name}_{config_type}_spect_{spect_format}_annot_{annot_format}.toml-source-files.csv' + csv_path = GENERATED_SOURCE_FILES_CSV_DIR / csv_filename + return csv_path + + return _specific_source_files_csv_path + + +@pytest.fixture +def specific_source_files_df( + specific_source_files_csv_path +): + """Factory fixture: returns a function that loads a specific source files csv as a pandas DataFrame""" + def _specific_source_files_df( + config_type, + model_name, + annot_format, + audio_format=None, + spect_format=None, + ): + csv_path = specific_source_files_csv_path( + config_type, + model_name, + annot_format, + audio_format, + spect_format, + ) + df = pd.read_csv(csv_path) + return df + return _specific_source_files_df + + +@pytest.fixture +def specific_source_files_with_split_csv_path(): + """Factory fixture: returns a function that returns the path to a specific source files csv with a 'split' column""" + def _specific_source_files_with_split_csv_path( + config_type, + model_name, + annot_format, + audio_format=None, + spect_format=None, + ): + if audio_format and spect_format: + raise ValueError( + "Specify audio_format or spect_format, not both" + ) + if audio_format: + csv_filename =
f'{model_name}_{config_type}_audio_{audio_format}_annot_{annot_format}.toml-source-files-with-split.csv' + elif spect_format: + csv_filename = f'{model_name}_{config_type}_spect_{spect_format}_annot_{annot_format}.toml-source-files-with-split.csv' + csv_path = GENERATED_SOURCE_FILES_WITH_SPLITS_CSV_DIR / csv_filename + return csv_path + + return _specific_source_files_with_split_csv_path + + +@pytest.fixture +def specific_source_files_with_split_df( + specific_source_files_with_split_csv_path +): + """Factory fixture: returns a function that loads a specific source files csv with a 'split' column as a pandas DataFrame""" + def _specific_source_files_with_split_df( + config_type, + model_name, + annot_format, + audio_format=None, + spect_format=None, + ): + csv_path = specific_source_files_with_split_csv_path( + config_type, + model_name, + annot_format, + audio_format, + spect_format, + ) + df = pd.read_csv(csv_path) + return df + return _specific_source_files_with_split_df diff --git a/tests/fixtures/spect.py b/tests/fixtures/spect.py index 53bc10f27..9aa8ec402 100644 --- a/tests/fixtures/spect.py +++ b/tests/fixtures/spect.py @@ -22,11 +22,8 @@ def spect_dir_mat(): return SPECT_DIR_MAT -SPECT_DIR_NPZ = sorted( - GENERATED_TEST_DATA_ROOT.joinpath( - "prep", "train", "audio_cbin_annot_notmat", "TweetyNet" - ).glob("*vak-frame-classification-dataset-generated*/spectrograms_generated_*") - )[0] +# this dir is created by ./tests/scripts/generate_data_for_tests.py +SPECT_DIR_NPZ = GENERATED_TEST_DATA_ROOT / "spect-output-dir/audio_cbin_annot_notmat/gy6or6/032312" @pytest.fixture diff --git a/tests/scripts/generate_data_for_tests.py b/tests/scripts/generate_data_for_tests.py index c4dd83ccd..db54f972f 100644 --- a/tests/scripts/generate_data_for_tests.py +++ b/tests/scripts/generate_data_for_tests.py @@ -65,7 +65,7 @@ import vaktestdata -logger = logging.getLogger() # 'base' logger +logger = logging.getLogger('vaktestdata') # 'base' logger formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') stream_handler
= logging.StreamHandler(sys.stdout) stream_handler.setFormatter(formatter) @@ -101,7 +101,17 @@ def generate_test_data( """ # need to run `prep` before we run other commands if step in ('prep', 'all'): + # first we generate outputs of processing steps + # leading up to a dataset that speed up tests + vaktestdata.dirs.make_spect_output_dir_in_generated() # for any prepared spectrograms + vaktestdata.dirs.make_source_files_csv_dir_in_generated() # for csvs of source files + vaktestdata.dirs.make_source_files_with_splits_csv_dir_in_generated() # same csvs, with splits added + + # -- now actually run prep for all the configs config_paths = vaktestdata.configs.copy_config_files() + + vaktestdata.source_files.set_up_source_files_and_csv_files_for_frame_classification_models() + vaktestdata.dirs.make_subdirs_in_generated(config_paths) # run prep for some models vaktestdata.prep.run_prep() @@ -114,7 +124,7 @@ def generate_test_data( for command in commands: if command == "prep": continue # we don't run prep in this code block - print(f"running configs for command: {command}") + logger.info(f"running configs for command: {command}") command_config_metadata = [ config_metadata for config_metadata in vaktestdata.constants.CONFIG_METADATA @@ -134,7 +144,7 @@ def generate_test_data( for config_metadata in command_config_metadata: config_path = vaktestdata.constants.GENERATED_TEST_CONFIGS_ROOT / config_metadata.filename - print( + logger.info( f"n\Running 'vak {command}', using config: {config_path.name}" ) vak.cli.cli.cli(command, config_path) diff --git a/tests/scripts/vaktestdata/__init__.py b/tests/scripts/vaktestdata/__init__.py index f12f6b06c..cf29ad1f9 100644 --- a/tests/scripts/vaktestdata/__init__.py +++ b/tests/scripts/vaktestdata/__init__.py @@ -5,4 +5,5 @@ dirs, parser, prep, + source_files, ) diff --git a/tests/scripts/vaktestdata/config_metadata.py b/tests/scripts/vaktestdata/config_metadata.py index 73402c848..5154e7b32 100644 --- 
a/tests/scripts/vaktestdata/config_metadata.py +++ b/tests/scripts/vaktestdata/config_metadata.py @@ -3,11 +3,61 @@ @attrs.define class ConfigMetadata: - filename: str = attrs.field() - model: str = attrs.field() - config_type: str = attrs.field() - audio_format: str = attrs.field() - spect_format: str = attrs.field() - annot_format: str = attrs.field() - use_dataset_from_config = attrs.field(default=None) - use_result_from_config = attrs.field(default=None) + """Dataclass that represents metadata + about a configuration file + + Attributes + ---------- + filename : str + The name of the configuration file. + model : str + The name of the model in :mod:`vak` + that the configuration file is used with. + model_family : str + The name of the model family + for the model in the configuration file. + config_type : str + The type of config, one of + {'train', 'train_continue', 'eval', 'predict', 'learncurve'}. + audio_format : str + The format of the audio files. + spect_format : str + The format of the spectrogram files. + spect_output_dir : str, optional + The directory where spectrograms should be saved + when generated for this configuration file. + If not specified, then no spectrograms are generated. + This attribute is used to avoid repeatedly + generating the same set of spectrograms for multiple + configs. + data_dir : str, optional + The directory that should be used as the `data_dir` + option for this config. + The option will be changed to this value in the generated + config file. + This attribute is used to avoid repeatedly + generating the same set of spectrograms for multiple + configs. + use_dataset_from_config : str, optional + The filename of another configuration file. + The ``dataset_path`` option of that configuration file + will be used for this configuration file. + This option is used to avoid repeatedly + generating the same dataset for multiple configs. + use_result_from_config : str, optional + The filename of another configuration file.
+ The most recent results from ``results_path`` option + of that configuration file + will be used for this configuration file. + """ + filename: str = attrs.field(converter=str) + model: str = attrs.field(converter=str) + model_family: str = attrs.field(converter=str) + config_type: str = attrs.field(converter=str) + audio_format: str = attrs.field(converter=attrs.converters.optional(str), default=None) + spect_format: str = attrs.field(converter=attrs.converters.optional(str), default=None) + annot_format: str = attrs.field(converter=attrs.converters.optional(str), default=None) + spect_output_dir: str = attrs.field(converter=attrs.converters.optional(str), default=None) + data_dir: str = attrs.field(converter=attrs.converters.optional(str), default=None) + use_dataset_from_config: str = attrs.field(converter=attrs.converters.optional(str), default=None) + use_result_from_config: str = attrs.field(converter=attrs.converters.optional(str), default=None) diff --git a/tests/scripts/vaktestdata/constants.py b/tests/scripts/vaktestdata/constants.py index 867ebf36b..705c93d96 100644 --- a/tests/scripts/vaktestdata/constants.py +++ b/tests/scripts/vaktestdata/constants.py @@ -13,8 +13,13 @@ ConfigMetadata(**config_metadata_dict) for config_metadata_dict in CONFIG_METADATA_LIST ] -GENERATED_TEST_DATA = TEST_DATA_ROOT / "generated" -GENERATED_TEST_CONFIGS_ROOT = GENERATED_TEST_DATA / "configs" +GENERATED_TEST_DATA_ROOT = TEST_DATA_ROOT / "generated" + +GENERATED_SPECT_OUTPUT_DIR = GENERATED_TEST_DATA_ROOT / "spect-output-dir" +GENERATED_SOURCE_FILES_CSV_DIR = GENERATED_TEST_DATA_ROOT / "source-files-csv" +GENERATED_SOURCE_FILES_WITH_SPLITS_CSV_DIR = GENERATED_TEST_DATA_ROOT / "source-files-with-splits-csv" + +GENERATED_TEST_CONFIGS_ROOT = GENERATED_TEST_DATA_ROOT / "configs" # convention is that all the config.toml files in tests/data_for_tests/configs # that should be run when generating test data diff --git a/tests/scripts/vaktestdata/dirs.py 
b/tests/scripts/vaktestdata/dirs.py index 8debd01f1..bbb764031 100644 --- a/tests/scripts/vaktestdata/dirs.py +++ b/tests/scripts/vaktestdata/dirs.py @@ -7,6 +7,16 @@ logger = logging.getLogger(__name__) +def make_spect_output_dir_in_generated(): + constants.GENERATED_SPECT_OUTPUT_DIR.mkdir() + +def make_source_files_csv_dir_in_generated(): + constants.GENERATED_SOURCE_FILES_CSV_DIR.mkdir() + +def make_source_files_with_splits_csv_dir_in_generated(): + constants.GENERATED_SOURCE_FILES_WITH_SPLITS_CSV_DIR.mkdir() + + def make_subdirs_in_generated(config_paths): """make sub-directories inside ./tests/data_for_tests/generated @@ -26,7 +36,7 @@ def make_subdirs_in_generated(config_paths): for top_level_dir in constants.TOP_LEVEL_DIRS: # datasets / results subdir_to_make = ( - constants.GENERATED_TEST_DATA / top_level_dir + constants.GENERATED_TEST_DATA_ROOT / top_level_dir ) logger.info( f"Making sub-directory: {subdir_to_make}" @@ -47,7 +57,7 @@ def make_subdirs_in_generated(config_paths): if config_metadata.use_dataset_from_config is None: # we need to make dataset dir subdir_to_make = ( - constants.GENERATED_TEST_DATA / 'prep' / config_type / data_dir / model + constants.GENERATED_TEST_DATA_ROOT / 'prep' / config_type / data_dir / model ) logger.info( f"Making sub-directory: {subdir_to_make}" @@ -55,7 +65,7 @@ def make_subdirs_in_generated(config_paths): subdir_to_make.mkdir(parents=True) subdir_to_make = ( - constants.GENERATED_TEST_DATA / 'results' / config_type / data_dir / model + constants.GENERATED_TEST_DATA_ROOT / 'results' / config_type / data_dir / model ) logger.info( f"Making sub-directory: {subdir_to_make}" diff --git a/tests/scripts/vaktestdata/source_files.py b/tests/scripts/vaktestdata/source_files.py new file mode 100644 index 000000000..e53d0e2ee --- /dev/null +++ b/tests/scripts/vaktestdata/source_files.py @@ -0,0 +1,194 @@ +# Do this here to suppress warnings before we import vak +import logging +import shutil +import warnings + +from 
numba.core.errors import NumbaDeprecationWarning +warnings.simplefilter('ignore', category=NumbaDeprecationWarning) + +import pandas as pd +import toml + +import vak + +from . import constants + + +logger = logging.getLogger(__name__) + + +def set_up_source_files_and_csv_files_for_frame_classification_models(): + """Set up source files and csv files + used when testing functionality for frame classification models. + + This function does the following + - First, get only config files that have the model family set to "frame_classification" + - Then for all those config files: + - Generate spectrograms for all the ones that have "spect_output_dir" + - Then for all the other ones that have "data_dir", set that option in the config file + - Then for *all* the config files, run `get_or_make_source_files` (again) + - to get a source files dataframe + - and save this to csv + - and then save it again with a ``'split'`` column added + """ + # first just get configs we're going to prep later + configs_to_make_spectrograms = [ + config_metadata + for config_metadata in constants.CONFIG_METADATA + if config_metadata.model_family == "frame_classification" and config_metadata.spect_output_dir is not None + ] + + for config_metadata in configs_to_make_spectrograms: + spect_output_dir = constants.GENERATED_SPECT_OUTPUT_DIR / config_metadata.spect_output_dir + spect_output_dir.mkdir(parents=True) + + config_path = constants.GENERATED_TEST_CONFIGS_ROOT / config_metadata.filename + logger.info( + f"\nRunning :func:`vak.prep.frame_classification.get_or_make_source_files` to generate data for tests, " + f"using config:\n{config_path.name}" + ) + cfg = vak.config.parse.from_toml_path(config_path) + + source_files_df: pd.DataFrame = vak.prep.frame_classification.get_or_make_source_files( + data_dir=cfg.prep.data_dir, + input_type=cfg.prep.input_type, + audio_format=cfg.prep.audio_format, + spect_format=cfg.prep.spect_format, + spect_params=cfg.spect_params, + 
spect_output_dir=spect_output_dir, + annot_format=cfg.prep.annot_format, + annot_file=cfg.prep.annot_file, + labelset=cfg.prep.labelset, + audio_dask_bag_kwargs=cfg.prep.audio_dask_bag_kwargs, + ) + + # We copy annotation files to spect_output_dir + # so we can "prep" from that directory later. + # This means we have repeats of some files still, which is annoying; + # .not.mat files are about ~1.2K though + for annot_path in source_files_df['annot_path'].values: + shutil.copy(annot_path, spect_output_dir) + + csv_path = constants.GENERATED_SOURCE_FILES_CSV_DIR / f'{config_metadata.filename}-source-files.csv' + source_files_df.to_csv(csv_path, index=False) + + config_toml: dict = vak.config.parse._load_toml_from_path(config_path) + purpose = vak.cli.prep.purpose_from_toml(config_toml, config_path) + dataset_df: pd.DataFrame = vak.prep.frame_classification.assign_samples_to_splits( + purpose, + source_files_df, + dataset_path=spect_output_dir, + train_dur=cfg.prep.train_dur, + val_dur=cfg.prep.val_dur, + test_dur=cfg.prep.test_dur, + labelset=cfg.prep.labelset, + ) + source_files_with_split_csv_path = ( + constants.GENERATED_SOURCE_FILES_WITH_SPLITS_CSV_DIR / + f'{config_metadata.filename}-source-files-with-split.csv' + ) + dataset_df.to_csv(source_files_with_split_csv_path) + + configs_to_add_data_dir = [ + config_metadata + for config_metadata in constants.CONFIG_METADATA + if config_metadata.model_family == "frame_classification" and config_metadata.data_dir is not None + ] + + for config_metadata in configs_to_add_data_dir: + config_path = constants.GENERATED_TEST_CONFIGS_ROOT / config_metadata.filename + logger.info( + f"\nRunning :func:`vak.prep.frame_classification.get_or_make_source_files` to generate data for tests, " + f"using config:\n{config_path.name}" + ) + + with config_path.open("r") as fp: + config_toml = toml.load(fp) + data_dir = constants.GENERATED_TEST_DATA_ROOT / config_metadata.data_dir + config_toml['PREP']['data_dir'] = str(data_dir) + with 
config_path.open("w") as fp: + toml.dump(config_toml, fp) + + cfg = vak.config.parse.from_toml_path(config_path) + + source_files_df: pd.DataFrame = vak.prep.frame_classification.get_or_make_source_files( + data_dir=cfg.prep.data_dir, + input_type=cfg.prep.input_type, + audio_format=cfg.prep.audio_format, + spect_format=cfg.prep.spect_format, + spect_params=cfg.spect_params, + spect_output_dir=None, + annot_format=cfg.prep.annot_format, + annot_file=cfg.prep.annot_file, + labelset=cfg.prep.labelset, + audio_dask_bag_kwargs=cfg.prep.audio_dask_bag_kwargs, + ) + + csv_path = constants.GENERATED_SOURCE_FILES_CSV_DIR / f'{config_metadata.filename}-source-files.csv' + source_files_df.to_csv(csv_path, index=False) + + config_toml: dict = vak.config.parse._load_toml_from_path(config_path) + purpose = vak.cli.prep.purpose_from_toml(config_toml, config_path) + dataset_df: pd.DataFrame = vak.prep.frame_classification.assign_samples_to_splits( + purpose, + source_files_df, + dataset_path=data_dir, + train_dur=cfg.prep.train_dur, + val_dur=cfg.prep.val_dur, + test_dur=cfg.prep.test_dur, + labelset=cfg.prep.labelset, + ) + source_files_with_split_csv_path = ( + constants.GENERATED_SOURCE_FILES_WITH_SPLITS_CSV_DIR / + f'{config_metadata.filename}-source-files-with-split.csv' + ) + dataset_df.to_csv(source_files_with_split_csv_path) + + configs_without_spect_output_or_data_dir_to_change = [ + config_metadata + for config_metadata in constants.CONFIG_METADATA + if config_metadata.model_family == "frame_classification" and ( + config_metadata.spect_output_dir is None and config_metadata.data_dir is None + ) + ] + for config_metadata in configs_without_spect_output_or_data_dir_to_change: + config_path = constants.GENERATED_TEST_CONFIGS_ROOT / config_metadata.filename + if not config_path.exists(): + raise FileNotFoundError(f"{config_path} not found") + logger.info( + f"\nRunning :func:`vak.prep.frame_classification.get_or_make_source_files` to generate data for tests, " + f"using 
config:\n{config_path.name}" + ) + cfg = vak.config.parse.from_toml_path(config_path) + source_files_df: pd.DataFrame = vak.prep.frame_classification.get_or_make_source_files( + data_dir=cfg.prep.data_dir, + input_type=cfg.prep.input_type, + audio_format=cfg.prep.audio_format, + spect_format=cfg.prep.spect_format, + spect_params=cfg.spect_params, + spect_output_dir=None, + annot_format=cfg.prep.annot_format, + annot_file=cfg.prep.annot_file, + labelset=cfg.prep.labelset, + audio_dask_bag_kwargs=cfg.prep.audio_dask_bag_kwargs, + ) + + csv_path = constants.GENERATED_SOURCE_FILES_CSV_DIR / f'{config_metadata.filename}-source-files.csv' + source_files_df.to_csv(csv_path, index=False) + + config_toml: dict = vak.config.parse._load_toml_from_path(config_path) + purpose = vak.cli.prep.purpose_from_toml(config_toml, config_path) + dataset_df: pd.DataFrame = vak.prep.frame_classification.assign_samples_to_splits( + purpose, + source_files_df, + dataset_path=cfg.prep.data_dir, + train_dur=cfg.prep.train_dur, + val_dur=cfg.prep.val_dur, + test_dur=cfg.prep.test_dur, + labelset=cfg.prep.labelset, + ) + source_files_with_split_csv_path = ( + constants.GENERATED_SOURCE_FILES_WITH_SPLITS_CSV_DIR / + f'{config_metadata.filename}-source-files-with-split.csv' + ) + dataset_df.to_csv(source_files_with_split_csv_path) diff --git a/tests/test_cli/test_eval.py b/tests/test_cli/test_eval.py index 3e0c5ef99..f94f68f46 100644 --- a/tests/test_cli/test_eval.py +++ b/tests/test_cli/test_eval.py @@ -18,7 +18,7 @@ ], ) def test_eval( - model_name, audio_format, spect_format, annot_format, specific_config, tmp_path, device + model_name, audio_format, spect_format, annot_format, specific_config_toml_path, tmp_path, device ): output_dir = tmp_path.joinpath( f"test_eval_{audio_format}_{spect_format}_{annot_format}" @@ -30,7 +30,7 @@ def test_eval( {"section": "EVAL", "option": "device", "value": device}, ] - toml_path = specific_config( + toml_path = specific_config_toml_path( config_type="eval", 
model=model_name, audio_format=audio_format, @@ -49,7 +49,7 @@ def test_eval( def test_eval_dataset_path_none_raises( - specific_config, tmp_path, + specific_config_toml_path, tmp_path, ): """Test that cli.eval raises ValueError when dataset_path is None (presumably because `vak prep` was not run yet) @@ -58,7 +58,7 @@ def test_eval_dataset_path_none_raises( {"section": "EVAL", "option": "dataset_path", "value": "DELETE-OPTION"}, ] - toml_path = specific_config( + toml_path = specific_config_toml_path( config_type="eval", model="TweetyNet", audio_format="cbin", diff --git a/tests/test_cli/test_learncurve.py b/tests/test_cli/test_learncurve.py index abf3adaaa..8dce64302 100644 --- a/tests/test_cli/test_learncurve.py +++ b/tests/test_cli/test_learncurve.py @@ -10,7 +10,7 @@ from . import cli_asserts -def test_learncurve(specific_config, tmp_path, device): +def test_learncurve(specific_config_toml_path, tmp_path, device): root_results_dir = tmp_path.joinpath("test_learncurve_root_results_dir") root_results_dir.mkdir() @@ -23,7 +23,7 @@ def test_learncurve(specific_config, tmp_path, device): {"section": "LEARNCURVE", "option": "device", "value": device}, ] - toml_path = specific_config( + toml_path = specific_config_toml_path( config_type="learncurve", model="TweetyNet", audio_format="cbin", @@ -45,7 +45,7 @@ def test_learncurve(specific_config, tmp_path, device): def test_learning_curve_dataset_path_none_raises( - specific_config, tmp_path, + specific_config_toml_path, tmp_path, ): """Test that cli.learncurve.learning_curve raises ValueError when dataset_path is None @@ -66,7 +66,7 @@ def test_learning_curve_dataset_path_none_raises( "value": "DELETE-OPTION"}, ] - toml_path = specific_config( + toml_path = specific_config_toml_path( config_type="learncurve", model="TweetyNet", audio_format="cbin", diff --git a/tests/test_cli/test_predict.py b/tests/test_cli/test_predict.py index ff2764364..6269c01d9 100644 --- a/tests/test_cli/test_predict.py +++ 
b/tests/test_cli/test_predict.py @@ -14,11 +14,10 @@ "model_name, audio_format, spect_format, annot_format", [ ("TweetyNet", "cbin", None, "notmat"), - ("TweetyNet", "wav", None, "birdsong-recognition-dataset"), ], ) def test_predict( - model_name, audio_format, spect_format, annot_format, specific_config, tmp_path, device + model_name, audio_format, spect_format, annot_format, specific_config_toml_path, tmp_path, device ): output_dir = tmp_path.joinpath( f"test_predict_{audio_format}_{spect_format}_{annot_format}" @@ -30,7 +29,7 @@ def test_predict( {"section": "PREDICT", "option": "device", "value": device}, ] - toml_path = specific_config( + toml_path = specific_config_toml_path( config_type="predict", model=model_name, audio_format=audio_format, @@ -47,7 +46,7 @@ def test_predict( def test_predict_dataset_path_none_raises( - specific_config, tmp_path, + specific_config_toml_path, tmp_path, ): """Test that cli.predict raises ValueError when dataset_path is None (presumably because `vak prep` was not run yet) @@ -56,7 +55,7 @@ def test_predict_dataset_path_none_raises( {"section": "PREDICT", "option": "dataset_path", "value": "DELETE-OPTION"}, ] - toml_path = specific_config( + toml_path = specific_config_toml_path( config_type="predict", model="TweetyNet", audio_format="cbin", diff --git a/tests/test_cli/test_prep.py b/tests/test_cli/test_prep.py index fee7eaf7e..cfdd8453e 100644 --- a/tests/test_cli/test_prep.py +++ b/tests/test_cli/test_prep.py @@ -15,9 +15,7 @@ ("eval", "cbin", None, "notmat"), ("learncurve", "cbin", None, "notmat"), ("predict", "cbin", None, "notmat"), - ("predict", "wav", None, "birdsong-recognition-dataset"), ("train", "cbin", None, "notmat"), - ("train", "wav", None, "birdsong-recognition-dataset"), ("train", None, "mat", "yarden"), ], ) @@ -26,11 +24,11 @@ def test_purpose_from_toml( audio_format, spect_format, annot_format, - specific_config, + specific_config_toml_path, default_model, tmp_path, ): - toml_path = specific_config( + 
toml_path = specific_config_toml_path( config_type=config_type, model=default_model, audio_format=audio_format, @@ -47,9 +45,7 @@ def test_purpose_from_toml( ("eval", "cbin", None, "notmat"), ("learncurve", "cbin", None, "notmat"), ("predict", "cbin", None, "notmat"), - ("predict", "wav", None, "birdsong-recognition-dataset"), ("train", "cbin", None, "notmat"), - ("train", "wav", None, "birdsong-recognition-dataset"), ("train", None, "mat", "yarden"), ], ) @@ -58,7 +54,7 @@ def test_prep( audio_format, spect_format, annot_format, - specific_config, + specific_config_toml_path, default_model, tmp_path, dummy_tmpfile_csv, @@ -77,7 +73,7 @@ def test_prep( "value": None, }, ] - toml_path = specific_config( + toml_path = specific_config_toml_path( config_type=config_type, model=default_model, audio_format=audio_format, @@ -98,9 +94,7 @@ def test_prep( ("eval", "cbin", None, "notmat"), ("learncurve", "cbin", None, "notmat"), ("predict", "cbin", None, "notmat"), - ("predict", "wav", None, "birdsong-recognition-dataset"), ("train", "cbin", None, "notmat"), - ("train", "wav", None, "birdsong-recognition-dataset"), ("train", None, "mat", "yarden"), ], ) @@ -109,7 +103,7 @@ def test_prep_dataset_path_raises( audio_format, spect_format, annot_format, - specific_config, + specific_config_toml_path, default_model, tmp_path, @@ -122,7 +116,7 @@ def test_prep_dataset_path_raises( options_to_change = [ {"section": "PREP", "option": "output_dir", "value": str(output_dir)}, ] - toml_path = specific_config( + toml_path = specific_config_toml_path( config_type=config_type, model=default_model, audio_format=audio_format, diff --git a/tests/test_cli/test_train.py b/tests/test_cli/test_train.py index cb02736aa..c59716ff2 100644 --- a/tests/test_cli/test_train.py +++ b/tests/test_cli/test_train.py @@ -15,12 +15,11 @@ "model_name, audio_format, spect_format, annot_format", [ ("TweetyNet", "cbin", None, "notmat"), - ("TweetyNet", "wav", None, "birdsong-recognition-dataset"), ("TweetyNet", 
None, "mat", "yarden"), ], ) def test_train( - model_name, audio_format, spect_format, annot_format, specific_config, tmp_path, device + model_name, audio_format, spect_format, annot_format, specific_config_toml_path, tmp_path, device ): root_results_dir = tmp_path.joinpath("test_train_root_results_dir") root_results_dir.mkdir() @@ -34,7 +33,7 @@ def test_train( {"section": "TRAIN", "option": "device", "value": device}, ] - toml_path = specific_config( + toml_path = specific_config_toml_path( config_type="train", model=model_name, audio_format=audio_format, @@ -56,7 +55,7 @@ def test_train( def test_train_dataset_path_none_raises( - specific_config, tmp_path, + specific_config_toml_path, tmp_path, ): """Test that cli.train raises ValueError when dataset_path is None (presumably because `vak prep` was not run yet) @@ -69,7 +68,7 @@ def test_train_dataset_path_none_raises( {"section": "TRAIN", "option": "dataset_path", "value": "DELETE-OPTION"}, ] - toml_path = specific_config( + toml_path = specific_config_toml_path( config_type="train", model="TweetyNet", audio_format="cbin", diff --git a/tests/test_common/test_annotation.py b/tests/test_common/test_annotation.py index ab6e4511f..f32134421 100644 --- a/tests/test_common/test_annotation.py +++ b/tests/test_common/test_annotation.py @@ -72,9 +72,7 @@ def test_audio_stem_from_path_raises(path, audio_ext): @pytest.mark.parametrize( "source_type, source_format, annot_format, audio_ext", [ - ("audio", "wav", "birdsong-recognition-dataset", None), ("spect", "mat", "yarden", None), - ("audio", "wav", "birdsong-recognition-dataset", "wav"), ("spect", "mat", "yarden", "wav"), ], ) @@ -118,7 +116,6 @@ def test__map_using_notated_path( "source_type, source_format, annot_format, annotated_ext, method", [ ("audio", "cbin", "notmat", None, "remove"), - ("audio", "wav", "textgrid", None, "replace"), ], ) def test__map_using_ext( @@ -166,11 +163,7 @@ def test__map_using_ext( "source_type, source_format, annot_format, method", [ 
("audio", "cbin", "notmat", "remove"), - ("audio", "cbin", "simple-seq", "remove"), - ("audio", "wav", "birdsong-recognition-dataset", None), - ("audio", "wav", "textgrid", "replace"), ("spect", "mat", "yarden", None), - ("audio", "wav", "textgrid", "replace"), ], ) def test_map_annotated_to_annot( diff --git a/tests/test_common/test_files/test_files.py b/tests/test_common/test_files/test_files.py index 09f773d92..4cac95f40 100644 --- a/tests/test_common/test_files/test_files.py +++ b/tests/test_common/test_files/test_files.py @@ -64,29 +64,3 @@ def test_files_from_dir_with_cbin(audio_dir_cbin, audio_list_cbin): # files.from_dir returns str not Path, need to convert fixture audio_list_cbin = [str(audio_path) for audio_path in audio_list_cbin] assert sorted(audio_list_cbin) == sorted(files) - - -@pytest.mark.parametrize( - ("dir_path", "ext"), - [ - ("./tests/data_for_tests/source/audio_wav_annot_textgrid/AGBk/", "WAV"), - ("./tests/data_for_tests/source/audio_wav_annot_birdsongrec/Bird0/Wave", "wav"), - ], -) -def test_from_dir_is_case_insensitive(dir_path, ext): - files = vak.common.files.files.from_dir(dir_path, ext) - assert len(files) > 0 - assert all([str(file).endswith(ext) for file in files]) - - -@pytest.mark.parametrize( - ("dir_path", "ext"), - [ - ("./tests/data_for_tests/source/audio_wav_annot_textgrid/", "WAV"), - ("./tests/data_for_tests/source/audio_wav_annot_birdsongrec/Bird0", "wav"), - ], -) -def test_from_dir_searches_child_dir(dir_path, ext): - files = vak.common.files.files.from_dir(dir_path, ext) - assert len(files) > 0 - assert all([str(file).endswith(ext) for file in files]) diff --git a/tests/test_datasets/test_frame_classification/test_frames_dataset.py b/tests/test_datasets/test_frame_classification/test_frames_dataset.py index 953e9c3c1..a7674ec61 100644 --- a/tests/test_datasets/test_frame_classification/test_frames_dataset.py +++ b/tests/test_datasets/test_frame_classification/test_frames_dataset.py @@ -12,13 +12,13 @@ class 
TestWindowDataset: ] ) def test_from_dataset_path(self, config_type, model_name, audio_format, spect_format, annot_format, - split, specific_config): + split, specific_config_toml_path): """Test we can get a FramesDataset instance from the classmethod ``from_dataset_path``""" - toml_path = specific_config(config_type, - model_name, - audio_format=audio_format, - spect_format=spect_format, - annot_format=annot_format) + toml_path = specific_config_toml_path(config_type, + model_name, + audio_format=audio_format, + spect_format=spect_format, + annot_format=annot_format) cfg = vak.config.parse.from_toml_path(toml_path) cfg_command = getattr(cfg, config_type) diff --git a/tests/test_datasets/test_frame_classification/test_helper.py b/tests/test_datasets/test_frame_classification/test_helper.py new file mode 100644 index 000000000..2be8e4fdc --- /dev/null +++ b/tests/test_datasets/test_frame_classification/test_helper.py @@ -0,0 +1,46 @@ +import numpy as np +import pytest + +import vak.datasets.frame_classification.helper + +from ... 
import fixtures + + +@pytest.mark.parametrize( + 'subset', + [ + 'train-dur-4.0-replicate-1', + 'train-dur-4.0-replicate-2' + ] +) +def test_sample_ids_array_filename_for_subset(subset): + out = vak.datasets.frame_classification.helper.sample_ids_array_filename_for_subset(subset) + assert isinstance(out, str) + assert out == vak.datasets.frame_classification.constants.SAMPLE_IDS_ARRAY_FILENAME.replace( + '.npy', f'-{subset}.npy' + ) + + +@pytest.mark.parametrize( + 'subset', + [ + 'train-dur-4.0-replicate-1', + 'train-dur-4.0-replicate-2' + ] +) +def test_inds_in_sample_array_filename_for_subset(subset): + out = vak.datasets.frame_classification.helper.inds_in_sample_array_filename_for_subset(subset) + assert isinstance(out, str) + assert out == vak.datasets.frame_classification.constants.INDS_IN_SAMPLE_ARRAY_FILENAME.replace( + '.npy', f'-{subset}.npy' + ) + + +@pytest.fixture(params=fixtures.spect.SPECT_LIST_NPZ) +def frames_path(request): + return request.param + + +def test_load_frames(frames_path): + out = vak.datasets.frame_classification.helper.load_frames(frames_path, input_type="spect") + assert isinstance(out, np.ndarray) diff --git a/tests/test_datasets/test_frame_classification/test_window_dataset.py b/tests/test_datasets/test_frame_classification/test_window_dataset.py index 5bc5f6586..613fd1854 100644 --- a/tests/test_datasets/test_frame_classification/test_window_dataset.py +++ b/tests/test_datasets/test_frame_classification/test_window_dataset.py @@ -13,13 +13,13 @@ class TestWindowDataset: ] ) def test_from_dataset_path(self, config_type, model_name, audio_format, spect_format, annot_format, - split, transform_kwargs, specific_config): + split, transform_kwargs, specific_config_toml_path): """Test we can get a WindowDataset instance from the classmethod ``from_dataset_path``""" - toml_path = specific_config(config_type, - model_name, - audio_format=audio_format, - spect_format=spect_format, - annot_format=annot_format) + toml_path = 
specific_config_toml_path(config_type, + model_name, + audio_format=audio_format, + spect_format=spect_format, + annot_format=annot_format) cfg = vak.config.parse.from_toml_path(toml_path) cfg_command = getattr(cfg, config_type) diff --git a/tests/test_datasets/test_parametric_umap/test_parametric_umap.py b/tests/test_datasets/test_parametric_umap/test_parametric_umap.py index 38a2782da..15eab713f 100644 --- a/tests/test_datasets/test_parametric_umap/test_parametric_umap.py +++ b/tests/test_datasets/test_parametric_umap/test_parametric_umap.py @@ -12,13 +12,13 @@ class TestParametricUMAPDataset: ] ) def test_from_dataset_path(self, config_type, model_name, audio_format, spect_format, annot_format, - split, transform_kwargs, specific_config): + split, transform_kwargs, specific_config_toml_path): """Test we can get a WindowDataset instance from the classmethod ``from_dataset_path``""" - toml_path = specific_config(config_type, - model_name, - audio_format=audio_format, - spect_format=spect_format, - annot_format=annot_format) + toml_path = specific_config_toml_path(config_type, + model_name, + audio_format=audio_format, + spect_format=spect_format, + annot_format=annot_format) cfg = vak.config.parse.from_toml_path(toml_path) cfg_command = getattr(cfg, config_type) diff --git a/tests/test_eval/test_eval.py b/tests/test_eval/test_eval.py index 94822887f..b4e69322b 100644 --- a/tests/test_eval/test_eval.py +++ b/tests/test_eval/test_eval.py @@ -20,7 +20,7 @@ ) def test_eval( audio_format, spect_format, annot_format, model_name, eval_function_to_mock, - specific_config, tmp_path + specific_config_toml_path, tmp_path ): """Test that :func:`vak.eval.eval` dispatches to the correct model-specific training functions""" @@ -34,7 +34,7 @@ def test_eval( {"section": "EVAL", "option": "device", "value": 'cpu'}, ] - toml_path = specific_config( + toml_path = specific_config_toml_path( config_type="eval", model=model_name, audio_format=audio_format, diff --git 
a/tests/test_eval/test_frame_classification.py b/tests/test_eval/test_frame_classification.py index 5fe4c6d8d..ce299c0e6 100644 --- a/tests/test_eval/test_frame_classification.py +++ b/tests/test_eval/test_frame_classification.py @@ -44,7 +44,7 @@ def test_eval_frame_classification_model( audio_format, spect_format, annot_format, - specific_config, + specific_config_toml_path, tmp_path, device, post_tfm_kwargs @@ -59,7 +59,7 @@ def test_eval_frame_classification_model( {"section": "EVAL", "option": "device", "value": device}, ] - toml_path = specific_config( + toml_path = specific_config_toml_path( config_type="eval", model=model_name, audio_format=audio_format, @@ -98,7 +98,7 @@ def test_eval_frame_classification_model( ) def test_eval_frame_classification_model_raises_file_not_found( path_option_to_change, - specific_config, + specific_config_toml_path, tmp_path, device ): @@ -117,7 +117,7 @@ def test_eval_frame_classification_model_raises_file_not_found( path_option_to_change, ] - toml_path = specific_config( + toml_path = specific_config_toml_path( config_type="eval", model="TweetyNet", audio_format="cbin", @@ -152,7 +152,7 @@ def test_eval_frame_classification_model_raises_file_not_found( ) def test_eval_frame_classification_model_raises_not_a_directory( path_option_to_change, - specific_config, + specific_config_toml_path, device, tmp_path, ): @@ -175,7 +175,7 @@ def test_eval_frame_classification_model_raises_not_a_directory( {"section": "EVAL", "option": "output_dir", "value": str(output_dir)} ) - toml_path = specific_config( + toml_path = specific_config_toml_path( config_type="eval", model="TweetyNet", audio_format="cbin", diff --git a/tests/test_eval/test_parametric_umap.py b/tests/test_eval/test_parametric_umap.py index adc95a729..5b803a7e7 100644 --- a/tests/test_eval/test_parametric_umap.py +++ b/tests/test_eval/test_parametric_umap.py @@ -23,7 +23,7 @@ def test_eval_parametric_umap_model( audio_format, spect_format, annot_format, - specific_config, + 
specific_config_toml_path, tmp_path, device, ): @@ -37,7 +37,7 @@ def test_eval_parametric_umap_model( {"section": "EVAL", "option": "device", "value": device}, ] - toml_path = specific_config( + toml_path = specific_config_toml_path( config_type="eval", model=model_name, audio_format=audio_format, @@ -72,7 +72,7 @@ def test_eval_parametric_umap_model( ) def test_eval_frame_classification_model_raises_file_not_found( path_option_to_change, - specific_config, + specific_config_toml_path, tmp_path, device ): @@ -89,7 +89,7 @@ def test_eval_frame_classification_model_raises_file_not_found( path_option_to_change, ] - toml_path = specific_config( + toml_path = specific_config_toml_path( config_type="eval", model="ConvEncoderUMAP", audio_format="cbin", @@ -123,7 +123,7 @@ def test_eval_frame_classification_model_raises_file_not_found( ) def test_eval_frame_classification_model_raises_not_a_directory( path_option_to_change, - specific_config, + specific_config_toml_path, device, tmp_path, ): @@ -145,7 +145,7 @@ def test_eval_frame_classification_model_raises_not_a_directory( {"section": "EVAL", "option": "output_dir", "value": str(output_dir)} ) - toml_path = specific_config( + toml_path = specific_config_toml_path( config_type="eval", model="ConvEncoderUMAP", audio_format="cbin", diff --git a/tests/test_learncurve/test_frame_classification.py b/tests/test_learncurve/test_frame_classification.py index 7ca125fd6..cc3484279 100644 --- a/tests/test_learncurve/test_frame_classification.py +++ b/tests/test_learncurve/test_frame_classification.py @@ -51,10 +51,10 @@ def assert_learncurve_output_matches_expected(cfg, model_name, results_path): ] ) def test_learning_curve_for_frame_classification_model( - model_name, audio_format, annot_format, specific_config, tmp_path, device): + model_name, audio_format, annot_format, specific_config_toml_path, tmp_path, device): options_to_change = {"section": "LEARNCURVE", "option": "device", "value": device} - toml_path = specific_config( + 
toml_path = specific_config_toml_path( config_type="learncurve", model=model_name, audio_format=audio_format, @@ -99,7 +99,7 @@ def test_learning_curve_for_frame_classification_model( ] ) def test_learncurve_raises_not_a_directory(dir_option_to_change, - specific_config, + specific_config_toml_path, tmp_path, device): """Test that core.learncurve.learning_curve raises NotADirectoryError when the following directories do not exist: @@ -109,7 +109,7 @@ def test_learncurve_raises_not_a_directory(dir_option_to_change, {"section": "LEARNCURVE", "option": "device", "value": device}, dir_option_to_change ] - toml_path = specific_config( + toml_path = specific_config_toml_path( config_type="learncurve", model="TweetyNet", audio_format="cbin", diff --git a/tests/test_models/test_base.py b/tests/test_models/test_base.py index 4f8dc282c..0c3236296 100644 --- a/tests/test_models/test_base.py +++ b/tests/test_models/test_base.py @@ -180,7 +180,7 @@ def test_validate_init_raises(self, definition, kwargs, expected_exception, monk def test_load_state_dict_from_path(self, model_name, # our fixtures - specific_config, + specific_config_toml_path, # pytest fixtures monkeypatch, device @@ -190,7 +190,7 @@ def test_load_state_dict_from_path(self, We use actual model definitions here so we can test with real checkpoints. 
""" definition = self.MODEL_DEFINITION_MAP[model_name] - train_toml_path = specific_config('train', model_name, audio_format='cbin', annot_format='notmat') + train_toml_path = specific_config_toml_path('train', model_name, audio_format='cbin', annot_format='notmat') train_cfg = vak.config.parse.from_toml_path(train_toml_path) # stuff we need just to be able to instantiate network @@ -224,7 +224,7 @@ def test_load_state_dict_from_path(self, model = vak.models.base.Model(network=network) model.to(device) - eval_toml_path = specific_config('eval', model_name, audio_format='cbin', annot_format='notmat') + eval_toml_path = specific_config_toml_path('eval', model_name, audio_format='cbin', annot_format='notmat') eval_cfg = vak.config.parse.from_toml_path(eval_toml_path) checkpoint_path = eval_cfg.eval.checkpoint_path diff --git a/tests/test_models/test_frame_classification_model.py b/tests/test_models/test_frame_classification_model.py index 4694fa160..e77e84acf 100644 --- a/tests/test_models/test_frame_classification_model.py +++ b/tests/test_models/test_frame_classification_model.py @@ -79,13 +79,13 @@ def test_init(self, def test_from_config(self, definition, # our fixtures - specific_config, + specific_config_toml_path, # pytest fixtures monkeypatch, ): definition = vak.models.definition.validate(definition) model_name = definition.__name__.replace('Definition', '') - toml_path = specific_config('train', model_name, audio_format='cbin', annot_format='notmat') + toml_path = specific_config_toml_path('train', model_name, audio_format='cbin', annot_format='notmat') cfg = vak.config.parse.from_toml_path(toml_path) # stuff we need just to be able to instantiate network diff --git a/tests/test_models/test_parametric_umap_model.py b/tests/test_models/test_parametric_umap_model.py index b3b75d3e0..36087ddbc 100644 --- a/tests/test_models/test_parametric_umap_model.py +++ b/tests/test_models/test_parametric_umap_model.py @@ -81,12 +81,12 @@ def test_from_config( self, 
input_shape, definition, - specific_config, + specific_config_toml_path, monkeypatch, ): definition = vak.models.definition.validate(definition) model_name = definition.__name__.replace('Definition', '') - toml_path = specific_config('train', model_name, audio_format='cbin', annot_format='notmat') + toml_path = specific_config_toml_path('train', model_name, audio_format='cbin', annot_format='notmat') cfg = vak.config.parse.from_toml_path(toml_path) monkeypatch.setattr( diff --git a/tests/test_nn/test_loss/test_dice.py b/tests/test_nn/test_loss/test_dice.py index f993c7898..bf453a1ee 100644 --- a/tests/test_nn/test_loss/test_dice.py +++ b/tests/test_nn/test_loss/test_dice.py @@ -1,7 +1,7 @@ """test loss functions""" import torch from torch.autograd import gradcheck -from torch.testing import assert_allclose +from torch.testing import assert_close import vak.nn.loss @@ -35,7 +35,7 @@ def test_all_zeros(self, device, dtype): criterion = vak.nn.loss.DiceLoss() loss = criterion(logits, labels) - assert_allclose(loss, torch.zeros_like(loss), rtol=1e-3, atol=1e-3) + assert_close(loss, torch.zeros_like(loss), rtol=1e-3, atol=1e-3) def test_gradcheck(self, device, dtype): num_classes = 3 @@ -55,7 +55,7 @@ def test_jit(self, device, dtype): op = vak.nn.dice_loss op_script = torch.jit.script(op) - assert_allclose(op(logits, labels), op_script(logits, labels)) + assert_close(op(logits, labels), op_script(logits, labels)) def test_module(self, device, dtype): num_classes = 3 @@ -66,4 +66,4 @@ def test_module(self, device, dtype): op = vak.nn.dice_loss op_module = vak.nn.loss.DiceLoss() - assert_allclose(op(logits, labels), op_module(logits, labels)) + assert_close(op(logits, labels), op_module(logits, labels)) diff --git a/tests/test_predict/test_frame_classification.py b/tests/test_predict/test_frame_classification.py index 65f76d133..6726ec09b 100644 --- a/tests/test_predict/test_frame_classification.py +++ b/tests/test_predict/test_frame_classification.py @@ -19,9 +19,7 @@ 
def assert_predict_output_matches_expected(output_dir, annot_csv_filename): "model_name, audio_format, spect_format, annot_format, save_net_outputs", [ ("TweetyNet", "cbin", None, "notmat", False), - ("TweetyNet", "wav", None, "birdsong-recognition-dataset", False), ("TweetyNet", "cbin", None, "notmat", True), - ("TweetyNet", "wav", None, "birdsong-recognition-dataset", True), ], ) def test_predict_with_frame_classification_model( @@ -30,7 +28,7 @@ def test_predict_with_frame_classification_model( spect_format, annot_format, save_net_outputs, - specific_config, + specific_config_toml_path, tmp_path, device, ): @@ -44,7 +42,7 @@ def test_predict_with_frame_classification_model( {"section": "PREDICT", "option": "device", "value": device}, {"section": "PREDICT", "option": "save_net_outputs", "value": save_net_outputs}, ] - toml_path = specific_config( + toml_path = specific_config_toml_path( config_type="predict", model=model_name, audio_format=audio_format, @@ -103,7 +101,7 @@ def test_predict_with_frame_classification_model( ) def test_predict_with_frame_classification_model_raises_file_not_found( path_option_to_change, - specific_config, + specific_config_toml_path, tmp_path, device ): @@ -119,7 +117,7 @@ def test_predict_with_frame_classification_model_raises_file_not_found( {"section": "PREDICT", "option": "device", "value": device}, path_option_to_change, ] - toml_path = specific_config( + toml_path = specific_config_toml_path( config_type="predict", model="TweetyNet", audio_format="cbin", @@ -160,7 +158,7 @@ def test_predict_with_frame_classification_model_raises_file_not_found( ) def test_predict_with_frame_classification_model_raises_not_a_directory( path_option_to_change, - specific_config, + specific_config_toml_path, device, tmp_path, ): @@ -183,7 +181,7 @@ def test_predict_with_frame_classification_model_raises_not_a_directory( {"section": "PREDICT", "option": "output_dir", "value": str(output_dir)} ) - toml_path = specific_config( + toml_path = 
specific_config_toml_path( config_type="predict", model="TweetyNet", audio_format="cbin", diff --git a/tests/test_predict/test_predict.py b/tests/test_predict/test_predict.py index 0aa528abf..98051ca80 100644 --- a/tests/test_predict/test_predict.py +++ b/tests/test_predict/test_predict.py @@ -17,7 +17,7 @@ ) def test_predict( audio_format, spect_format, annot_format, model_name, predict_function_to_mock, - specific_config, tmp_path + specific_config_toml_path, tmp_path ): """Test that :func:`vak.predict.predict` dispatches to the correct model-specific training functions""" @@ -31,7 +31,7 @@ def test_predict( {"section": "PREDICT", "option": "device", "value": 'cpu'}, ] - toml_path = specific_config( + toml_path = specific_config_toml_path( config_type="predict", model=model_name, audio_format=audio_format, diff --git a/tests/test_prep/test_frame_classification/test_assign_samples_to_splits.py b/tests/test_prep/test_frame_classification/test_assign_samples_to_splits.py new file mode 100644 index 000000000..d354dfd6a --- /dev/null +++ b/tests/test_prep/test_frame_classification/test_assign_samples_to_splits.py @@ -0,0 +1,70 @@ +import pandas as pd +import pytest + +import vak + + +@pytest.mark.parametrize( + 'config_type, model_name, audio_format, spect_format, annot_format, input_type', + [ + ('train', 'TweetyNet', 'cbin', None, 'notmat', 'spect'), + ('predict', 'TweetyNet', 'cbin', None, 'notmat', 'spect'), + ('eval', 'TweetyNet', 'cbin', None, 'notmat', 'spect'), + ('train', 'TweetyNet', None, 'mat', 'yarden', 'spect'), + ('learncurve', 'TweetyNet', 'cbin', None, 'notmat', 'spect'), + # TODO: add audio cases + ] +) +def test_assign_samples_to_splits( + config_type, model_name, audio_format, spect_format, annot_format, + input_type, tmp_path, specific_config_toml_path, specific_source_files_df, +): + toml_path = specific_config_toml_path( + config_type, + model_name, + annot_format, + audio_format, + spect_format, + ) + + cfg = 
vak.config.parse.from_toml_path(toml_path) + + # ---- set up ---- + tmp_dataset_path = tmp_path / 'dataset_dir' + tmp_dataset_path.mkdir() + + purpose = config_type + + source_files_df = specific_source_files_df( + config_type, + model_name, + annot_format, + audio_format, + spect_format, + ) + + out = vak.prep.frame_classification.assign_samples_to_splits( + purpose, + source_files_df, + tmp_dataset_path, + cfg.prep.train_dur, + cfg.prep.val_dur, + cfg.prep.test_dur, + cfg.prep.labelset, + ) + + assert isinstance(out, pd.DataFrame) + assert 'split' in out.columns + if purpose == 'predict': + assert all(val == 'predict' for val in out['split'].values) + elif purpose == 'eval': + assert all(val == 'test' for val in out['split'].values) + else: + split_vals = out['split'].values.tolist() + assert all( + [ + split_name in split_vals + for split_name in ('train', 'val', 'test') + if hasattr(cfg.prep, f'{split_name}_dur') and getattr(cfg.prep, f'{split_name}_dur') is not None + ] + ) diff --git a/tests/test_prep/test_frame_classification/test_dataset_arrays.py b/tests/test_prep/test_frame_classification/test_dataset_arrays.py deleted file mode 100644 index 8c3c82f54..000000000 --- a/tests/test_prep/test_frame_classification/test_dataset_arrays.py +++ /dev/null @@ -1,170 +0,0 @@ -"""Unit tests for vak.prep.frame_classification.dataset_arrays""" -import json -import pathlib -import shutil - -import crowsetta -import pytest - -import vak.prep.frame_classification.dataset_arrays - - -@pytest.mark.parametrize( - 'annots, expected_sort_inds', - [ - ( - [ - crowsetta.Annotation(seq=crowsetta.Sequence.from_keyword( - onsets_s=[0.1, 0.3, 0.5], offsets_s=[0.2, 0.4, 0.6], labels=['a', 'b', 'b'] - ), annot_path='./fake'), - crowsetta.Annotation(seq=crowsetta.Sequence.from_keyword( - onsets_s=[0.1, 0.3, 0.5], offsets_s=[0.2, 0.4, 0.6], labels=['b', 'b', 'b'] - ), annot_path='./fake'), - ], - [0, 1,] - ), - ( - [ - crowsetta.Annotation(seq=crowsetta.Sequence.from_keyword( - 
onsets_s=[0.1, 0.3, 0.5], offsets_s=[0.2, 0.4, 0.6], labels=['a', 'b', 'b'] - ), annot_path='./fake'), - crowsetta.Annotation(seq=crowsetta.Sequence.from_keyword( - onsets_s=[0.1, 0.3, 0.5], offsets_s=[0.2, 0.4, 0.6], labels=['b', 'b', 'b'] - ), annot_path='./fake'), - crowsetta.Annotation(seq=crowsetta.Sequence.from_keyword( - onsets_s=[0.1, 0.3, 0.5], offsets_s=[0.2, 0.4, 0.6], labels=['b', 'b', 'b'] - ), annot_path='./fake'), - ], - [0, 1, 2], - ), - ] -) -def test_argsort_by_label_freq(annots, expected_sort_inds): - out = vak.prep.frame_classification.dataset_arrays.argsort_by_label_freq(annots) - assert isinstance(out, list) - assert out == expected_sort_inds - - -def copy_dataset_df_files_to_tmp_path_data_dir(dataset_df, dataset_path, config_type, input_type, tmp_path_data_dir): - """Copy all the files in a dataset DataFrame to a `tmp_path_data_dir`, - and change the paths in the Dataframe, so that we can then call - `vak.prep.frame_classification.helper.move_files_into_split_subdirs`.""" - paths_cols = [] - if input_type == 'spect': - paths_cols.append('spect_path') - elif input_type == 'audio': - paths_cols.append('audio_path') - if config_type != 'predict': - paths_cols.append('annot_path') - for paths_col in paths_cols: - paths = dataset_df[paths_col].values - new_paths = [] - for path in paths: - new_path = shutil.copy(src=dataset_path / path, dst=tmp_path_data_dir) - new_paths.append(new_path) - dataset_df[paths_col] = new_paths - return dataset_df - - -@pytest.mark.parametrize( - 'config_type, model_name, audio_format, spect_format, annot_format, input_type', - [ - ('train', 'TweetyNet', 'cbin', None, 'notmat', 'spect'), - ('predict', 'TweetyNet', 'cbin', None, 'notmat', 'spect'), - ('eval', 'TweetyNet', 'cbin', None, 'notmat', 'spect'), - ('train', 'TweetyNet', None, 'mat', 'yarden', 'spect'), - ('learncurve', 'TweetyNet', 'cbin', None, 'notmat', 'spect'), - # TODO: add audio cases - ] -) -def test_make_npy_files_for_each_split(config_type, 
model_name, audio_format, spect_format, annot_format, - input_type, tmp_path, specific_dataset_df, specific_dataset_path): - dataset_df = specific_dataset_df(config_type, model_name, annot_format, audio_format, spect_format) - dataset_path = specific_dataset_path(config_type, model_name, annot_format, audio_format, spect_format) - tmp_path_data_dir = tmp_path / 'data_dir' - tmp_path_data_dir.mkdir() - copy_dataset_df_files_to_tmp_path_data_dir(dataset_df, dataset_path, config_type, input_type, tmp_path_data_dir) - - tmp_dataset_path = tmp_path / 'dataset_dir' - tmp_dataset_path.mkdir() - - if config_type != 'predict': - with (dataset_path / 'labelmap.json').open('r') as fp: - labelmap = json.load(fp) - else: - labelmap = None - - purpose = config_type - - vak.prep.frame_classification.dataset_arrays.make_npy_files_for_each_split( - dataset_df, - tmp_dataset_path, - input_type, - purpose, - labelmap, - audio_format, - ) - - splits = [ - split - for split in sorted(dataset_df.split.dropna().unique()) - if split != "None" - ] - - for split in splits: - split_subdir = tmp_dataset_path / split - if split != 'None': - assert split_subdir.exists() - elif split == 'None': - assert not split_subdir.exists() - - split_df = dataset_df[dataset_df.split == split].copy() - - if purpose != "predict": - annots = vak.common.annotation.from_df(split_df) - else: - annots = None - - if input_type == "audio": - source_paths = split_df["audio_path"].values - elif input_type == "spect": - source_paths = split_df["spect_path"].values - - source_paths = [pathlib.Path(source_path) for source_path in source_paths] - - if annots: - source_path_annot_tups = [ - (source_path, annot) - for source_path, annot in zip(source_paths, annots) - ] - else: - source_path_annot_tups = [ - (source_path, None) - for source_path in source_paths - ] - - for source_path_annot_tup in source_path_annot_tups: - source_path, annot = source_path_annot_tup - frames_array_file_that_should_exist = split_subdir / ( - 
source_path.stem - + vak.datasets.frame_classification.constants.FRAMES_ARRAY_EXT - ) - assert frames_array_file_that_should_exist.exists() - if annot: - frame_labels_file_that_should_exist = split_subdir / ( - source_path.stem - + vak.datasets.frame_classification.constants.FRAME_LABELS_EXT - ) - assert frame_labels_file_that_should_exist.exists() - - sample_id_vec_path = ( - split_subdir / - vak.datasets.frame_classification.constants.SAMPLE_IDS_ARRAY_FILENAME - ) - assert sample_id_vec_path.exists() - - inds_in_sample_vec_path = ( - split_subdir / - vak.datasets.frame_classification.constants.INDS_IN_SAMPLE_ARRAY_FILENAME - ) - assert inds_in_sample_vec_path.exists() diff --git a/tests/test_prep/test_frame_classification/test_frame_classification.py b/tests/test_prep/test_frame_classification/test_frame_classification.py index b0fdf781f..31a264847 100644 --- a/tests/test_prep/test_frame_classification/test_frame_classification.py +++ b/tests/test_prep/test_frame_classification/test_frame_classification.py @@ -34,22 +34,26 @@ def assert_prep_output_matches_expected(dataset_path, df_returned_by_prep): check_exact = False else: check_exact = True - try: - assert_series_equal( - df_from_dataset_path[column], - df_returned_by_prep[column], - check_exact=check_exact, - ) - except: - breakpoint() - - for column in ('spect_path', 'annot_path'): - paths = df_from_dataset_path[column].values - if not all([isinstance(path, str) for path in paths]): - continue - for path in paths: - path = pathlib.Path(path) - assert (dataset_path / path).exists() + assert_series_equal( + df_from_dataset_path[column], + df_returned_by_prep[column], + check_exact=check_exact, + ) + + if vak.datasets.frame_classification.constants.FRAMES_PATH_COL_NAME in df_returned_by_prep.columns: + frames_paths = df_returned_by_prep[ + vak.datasets.frame_classification.constants.FRAMES_PATH_COL_NAME + ].values + for frames_path in frames_paths: + assert (dataset_path / frames_path).exists() + + if 
vak.datasets.frame_classification.constants.FRAME_LABELS_NPY_PATH_COL_NAME in df_returned_by_prep.columns: + frame_labels_paths = df_returned_by_prep[ + vak.datasets.frame_classification.constants.FRAME_LABELS_NPY_PATH_COL_NAME + ].values + if not all([frame_labels_path is None for frame_labels_path in frame_labels_paths]): + for frame_labels_path in frame_labels_paths: + assert (dataset_path / frame_labels_path).exists() @pytest.mark.parametrize( @@ -58,9 +62,7 @@ def assert_prep_output_matches_expected(dataset_path, df_returned_by_prep): ("eval", "cbin", None, "notmat"), ("learncurve", "cbin", None, "notmat"), ("predict", "cbin", None, "notmat"), - ("predict", "wav", None, "birdsong-recognition-dataset"), ("train", "cbin", None, "notmat"), - ("train", "wav", None, "birdsong-recognition-dataset"), ("train", None, "mat", "yarden"), ], ) @@ -69,7 +71,7 @@ def test_prep_frame_classification_dataset( audio_format, spect_format, annot_format, - specific_config, + specific_config_toml_path, default_model, tmp_path, ): @@ -85,7 +87,7 @@ def test_prep_frame_classification_dataset( "value": str(output_dir), }, ] - toml_path = specific_config( + toml_path = specific_config_toml_path( config_type=config_type, model=default_model, audio_format=audio_format, @@ -123,7 +125,6 @@ def test_prep_frame_classification_dataset( ("eval", "cbin", None, "notmat"), ("learncurve", "cbin", None, "notmat"), ("train", "cbin", None, "notmat"), - ("train", "wav", None, "birdsong-recognition-dataset"), ("train", None, "mat", "yarden"), ], ) @@ -132,7 +133,7 @@ def test_prep_frame_classification_dataset_raises_when_labelset_required_but_is_ audio_format, spect_format, annot_format, - specific_config, + specific_config_toml_path, default_model, tmp_path, ): @@ -158,7 +159,7 @@ def test_prep_frame_classification_dataset_raises_when_labelset_required_but_is_ "value": "DELETE-OPTION", }, ] - toml_path = specific_config( + toml_path = specific_config_toml_path( config_type=config_type, 
model=default_model, audio_format=audio_format, @@ -188,9 +189,9 @@ def test_prep_frame_classification_dataset_raises_when_labelset_required_but_is_ def test_prep_frame_classification_dataset_with_single_audio_and_annot(source_test_data_root, - specific_config, - default_model, - tmp_path): + specific_config_toml_path, + default_model, + tmp_path): """ regression test, checks that we avoid a repeat of https://github.com/vocalpy/vak/issues/467 @@ -226,7 +227,7 @@ def test_prep_frame_classification_dataset_with_single_audio_and_annot(source_te }, ] - toml_path = specific_config( + toml_path = specific_config_toml_path( config_type='eval', model=default_model, audio_format='cbin', @@ -257,7 +258,7 @@ def test_prep_frame_classification_dataset_with_single_audio_and_annot(source_te def test_prep_frame_classification_dataset_when_annot_has_single_segment(source_test_data_root, - specific_config, + specific_config_toml_path, default_model, tmp_path): """ @@ -284,7 +285,7 @@ def test_prep_frame_classification_dataset_when_annot_has_single_segment(source_ }, ] - toml_path = specific_config( + toml_path = specific_config_toml_path( config_type='eval', model=default_model, audio_format='cbin', @@ -323,7 +324,7 @@ def test_prep_frame_classification_dataset_when_annot_has_single_segment(source_ ) def test_prep_frame_classification_dataset_raises_not_a_directory( dir_option_to_change, - specific_config, + specific_config_toml_path, default_model, tmp_path, ): @@ -331,7 +332,7 @@ def test_prep_frame_classification_dataset_raises_not_a_directory( when one of the following is not a directory: data_dir, output_dir """ - toml_path = specific_config( + toml_path = specific_config_toml_path( config_type="train", model="TweetyNet", audio_format="cbin", @@ -368,7 +369,7 @@ def test_prep_frame_classification_dataset_raises_not_a_directory( ) def test_prep_frame_classification_dataset_raises_file_not_found( path_option_to_change, - specific_config, + specific_config_toml_path, 
default_model, tmp_path, ): @@ -379,7 +380,7 @@ def test_prep_frame_classification_dataset_raises_file_not_found( Structuring unit test this way in case other path parameters get added. """ - toml_path = specific_config( + toml_path = specific_config_toml_path( config_type="train", model="TweetyNet", audio_format="cbin", diff --git a/tests/test_prep/test_frame_classification/test_get_or_make_source_files.py b/tests/test_prep/test_frame_classification/test_get_or_make_source_files.py new file mode 100644 index 000000000..d43ca7380 --- /dev/null +++ b/tests/test_prep/test_frame_classification/test_get_or_make_source_files.py @@ -0,0 +1,88 @@ +from unittest import mock + +import pandas as pd +import pytest + +import vak + +FAKE_SOURCE_FILES_DF = pd.DataFrame.from_records( + [ + {'audio_path': 'bird0-2023.10.12.cbin', + 'spect_path': 'bird0-2023.10.12.cbin.spect.npz', + 'annot_path': 'bird0-2023.10.12.cbin.not.mat'} + ] +) + + +@pytest.mark.parametrize( + 'config_type, model_name, audio_format, spect_format, annot_format, input_type', + [ + ('train', 'TweetyNet', 'cbin', None, 'notmat', 'spect'), + ('predict', 'TweetyNet', 'cbin', None, 'notmat', 'spect'), + ('eval', 'TweetyNet', 'cbin', None, 'notmat', 'spect'), + ('train', 'TweetyNet', None, 'mat', 'yarden', 'spect'), + ('learncurve', 'TweetyNet', 'cbin', None, 'notmat', 'spect'), + ] +) +def test_get_or_make_source_files( + config_type, model_name, audio_format, spect_format, annot_format, + input_type, tmp_path, specific_config_toml_path +): + """Test that this `vak.prep.frame_classification.get_or_make_source_files` dispatches correctly. + + Other unit tests already test the functions that this function calls. 
+ """ + toml_path = specific_config_toml_path( + config_type, + model_name, + annot_format, + audio_format, + spect_format, + ) + + cfg = vak.config.parse.from_toml_path(toml_path) + + # ---- set up ---- + tmp_dataset_path = tmp_path / 'dataset_dir' + tmp_dataset_path.mkdir() + + if cfg.prep.input_type == 'audio': + with mock.patch('vak.prep.frame_classification.source_files.prep_audio_dataset', autospec=True) as mock_prep_audio_dataset: + mock_prep_audio_dataset.return_value = FAKE_SOURCE_FILES_DF + + out: pd.DataFrame = vak.prep.frame_classification.get_or_make_source_files( + cfg.prep.data_dir, + cfg.prep.input_type, + cfg.prep.audio_format, + cfg.prep.spect_format, + cfg.spect_params, + tmp_dataset_path, + cfg.prep.annot_format, + cfg.prep.annot_file, + cfg.prep.labelset, + cfg.prep.audio_dask_bag_kwargs, + ) + + assert mock_prep_audio_dataset.called + assert isinstance(out, pd.DataFrame) + + elif cfg.prep.input_type == 'spect': + with mock.patch( + 'vak.prep.frame_classification.source_files.prep_spectrogram_dataset', autospec=True + ) as mock_prep_spect_dataset: + mock_prep_spect_dataset.return_value = FAKE_SOURCE_FILES_DF + + out: pd.DataFrame = vak.prep.frame_classification.get_or_make_source_files( + cfg.prep.data_dir, + cfg.prep.input_type, + cfg.prep.audio_format, + cfg.prep.spect_format, + cfg.spect_params, + tmp_dataset_path, + cfg.prep.annot_format, + cfg.prep.annot_file, + cfg.prep.labelset, + cfg.prep.audio_dask_bag_kwargs, + ) + assert mock_prep_spect_dataset.called + assert isinstance(out, pd.DataFrame) diff --git a/tests/test_prep/test_frame_classification/test_learncurve.py b/tests/test_prep/test_frame_classification/test_learncurve.py index 1de0ad22a..150e6483a 100644 --- a/tests/test_prep/test_frame_classification/test_learncurve.py +++ b/tests/test_prep/test_frame_classification/test_learncurve.py @@ -1,5 +1,6 @@ import json import shutil +from unittest import mock import numpy as np import pandas as pd @@ -10,6 +11,117 @@ import 
vak.common.paths import vak.prep.frame_classification +@pytest.mark.parametrize( + 'model_name, audio_format, annot_format, input_type', + [ + ('TweetyNet', 'cbin', 'notmat', 'spect') + ] +) +def test_make_index_vectors_for_each_subsets( + model_name, audio_format, annot_format, input_type, specific_config_toml_path, device, tmp_path, +): + root_results_dir = tmp_path.joinpath("tmp_root_results_dir") + root_results_dir.mkdir() + options_to_change = [ + { + "section": "LEARNCURVE", + "option": "root_results_dir", + "value": str(root_results_dir), + }, + ] + toml_path = specific_config_toml_path( + config_type="learncurve", + model=model_name, + audio_format=audio_format, + annot_format=annot_format, + options_to_change=options_to_change, + ) + cfg = vak.config.parse.from_toml_path(toml_path) + + dataset_path = cfg.learncurve.dataset_path + metadata = vak.datasets.frame_classification.Metadata.from_dataset_path(dataset_path) + dataset_csv_path = dataset_path / metadata.dataset_csv_filename + dataset_df = pd.read_csv(dataset_csv_path) + + subsets_df = dataset_df[ + ~dataset_df['subset'].isnull() + ] + + tmp_dataset_path = tmp_path / f"test_make_learncurve_splits_from_dataset_df" + shutil.copytree(dataset_path, tmp_dataset_path) + # delete all the subset indices vectors, since we're about to test that we make them + for train_dur in cfg.prep.train_set_durs: + for replicate_num in range(1, cfg.prep.num_replicates + 1): + train_dur_replicate_subset_name = vak.common.learncurve.get_train_dur_replicate_subset_name( + train_dur, replicate_num + ) + sample_id_vec_path = (tmp_dataset_path / "train" / + vak.datasets.frame_classification.helper.sample_ids_array_filename_for_subset( + train_dur_replicate_subset_name) + ) + sample_id_vec_path.unlink() + inds_in_sample_vec_path = (tmp_dataset_path / "train" / + vak.datasets.frame_classification.helper.inds_in_sample_array_filename_for_subset( + train_dur_replicate_subset_name) + ) + inds_in_sample_vec_path.unlink() + + 
vak.prep.frame_classification.learncurve.make_index_vectors_for_each_subset( + subsets_df, + tmp_dataset_path, + input_type, + ) + + assert sorted(subsets_df['train_dur'].unique()) == cfg.prep.train_set_durs + assert sorted( + subsets_df['replicate_num'].unique() + ) == list(range(1, cfg.prep.num_replicates + 1)) + + # assert that each expected split name is in data frame + for train_dur in cfg.prep.train_set_durs: + train_dur_df = subsets_df[np.isclose(subsets_df['train_dur'], train_dur)].copy() + # assert correct number of replicates for this train duration + assert sorted( + train_dur_df['replicate_num'] + ) == list(range(1, cfg.prep.num_replicates + 1)) + + for replicate_num in range(1, cfg.prep.num_replicates + 1): + subset_name = vak.common.learncurve.get_train_dur_replicate_subset_name( + train_dur, replicate_num + ) + + # test that indexing vectors got made + sample_id_vec_path = (tmp_dataset_path / "train" / + vak.datasets.frame_classification.helper.sample_ids_array_filename_for_subset( + subset_name) + ) + assert sample_id_vec_path.exists() + + inds_in_sample_vec_path = (tmp_dataset_path / "train" / + vak.datasets.frame_classification.helper.inds_in_sample_array_filename_for_subset( + subset_name) + ) + assert inds_in_sample_vec_path.exists() + + this_subset_df = subsets_df[subsets_df['subset'] == subset_name] + frames_paths = this_subset_df[ + vak.datasets.frame_classification.constants.FRAMES_PATH_COL_NAME + ].values + sample_id_vec, inds_in_sample_vec = [], [] + for sample_id, frames_path in enumerate(frames_paths): + # make indexing vectors that we use to test + frames = vak.datasets.frame_classification.helper.load_frames(tmp_dataset_path / frames_path, + input_type) + n_frames = frames.shape[-1] + sample_id_vec.append(np.ones((n_frames,)).astype(np.int32) * sample_id) + inds_in_sample_vec.append(np.arange(n_frames)) + expected_sample_id_vec = np.concatenate(sample_id_vec) + expected_inds_in_sample_vec = np.concatenate(inds_in_sample_vec) + 
sample_id_vec = np.load(sample_id_vec_path) + assert np.array_equal(sample_id_vec, expected_sample_id_vec) + inds_in_sample_vec = np.load(inds_in_sample_vec_path) + assert np.array_equal(inds_in_sample_vec, expected_inds_in_sample_vec) + @pytest.mark.parametrize( 'model_name, audio_format, annot_format, input_type', @@ -17,8 +129,8 @@ ('TweetyNet', 'cbin', 'notmat', 'spect') ] ) -def test_make_learncurve_splits_from_dataset_df( - model_name, audio_format, annot_format, input_type, specific_config, device, tmp_path, +def test_make_subsets_from_dataset_df( + model_name, audio_format, annot_format, input_type, specific_config_toml_path, device, tmp_path, ): root_results_dir = tmp_path.joinpath("tmp_root_results_dir") root_results_dir.mkdir() @@ -29,7 +141,7 @@ def test_make_learncurve_splits_from_dataset_df( "value": str(root_results_dir), }, ] - toml_path = specific_config( + toml_path = specific_config_toml_path( config_type="learncurve", model=model_name, audio_format=audio_format, @@ -49,57 +161,70 @@ def test_make_learncurve_splits_from_dataset_df( tmp_dataset_path = tmp_path / f"test_make_learncurve_splits_from_dataset_df" shutil.copytree(dataset_path, tmp_dataset_path) - # delete all the split directories since we're about to test that we make them + # delete all the subset indices vectors, since we're about to test that we make them for train_dur in cfg.prep.train_set_durs: for replicate_num in range(1, cfg.prep.num_replicates + 1): - train_dur_replicate_split_name = vak.common.learncurve.get_train_dur_replicate_split_name( + train_dur_replicate_subset_name = vak.common.learncurve.get_train_dur_replicate_subset_name( train_dur, replicate_num ) - split_dir = tmp_dataset_path / train_dur_replicate_split_name - shutil.rmtree(split_dir) - - out = vak.prep.frame_classification.learncurve.make_learncurve_splits_from_dataset_df( - dataset_df, - "spect", - cfg.prep.train_set_durs, - cfg.prep.num_replicates, - tmp_dataset_path, - labelmap, - audio_format=audio_format, 
- ) + sample_id_vec_path = (tmp_dataset_path / "train" / + vak.datasets.frame_classification.helper.sample_ids_array_filename_for_subset( + train_dur_replicate_subset_name) + ) + sample_id_vec_path.unlink() + inds_in_sample_vec_path = (tmp_dataset_path / "train" / + vak.datasets.frame_classification.helper.inds_in_sample_array_filename_for_subset( + train_dur_replicate_subset_name) + ) + inds_in_sample_vec_path.unlink() + + # now reset the dataset df to what it would have been before we passed it into `make_splits` + dataset_df = dataset_df[ + # drop any rows where there *is* a train dur -- because these are the subsets + dataset_df['train_dur'].isnull() + # drop the columns added by ``make_splits``, then reset the index + ].drop(columns=['subset', 'train_dur', 'replicate_num']).reset_index(drop=True) + + with mock.patch('vak.prep.frame_classification.learncurve.make_index_vectors_for_each_subset') as mock_idx_vectors: + out = vak.prep.frame_classification.learncurve.make_subsets_from_dataset_df( + dataset_df, + input_type, + cfg.prep.train_set_durs, + cfg.prep.num_replicates, + tmp_dataset_path, + labelmap, + ) + assert mock_idx_vectors.called + assert isinstance(out, pd.DataFrame) - splits_df = dataset_df[ - ~dataset_df.split.isin(('train', 'val', 'test')) + for added_column in ('subset', 'train_dur', 'replicate_num'): + assert added_column in out.columns + + subsets_df = out[ + ~out['subset'].isnull() ] - assert sorted(splits_df['train_dur'].unique()) == cfg.prep.train_set_durs + assert sorted(subsets_df['train_dur'].unique()) == cfg.prep.train_set_durs assert sorted( - splits_df['replicate_num'].unique() + subsets_df['replicate_num'].unique() ) == list(range(1, cfg.prep.num_replicates + 1)) # assert that each expected split name is in data frame - all_split_names = [] for train_dur in cfg.prep.train_set_durs: - train_dur_df = splits_df[np.isclose(splits_df['train_dur'], train_dur)].copy() + train_dur_df = subsets_df[np.isclose(subsets_df['train_dur'], 
train_dur)].copy() # assert correct number of replicates for this train duration assert sorted( train_dur_df['replicate_num'] ) == list(range(1, cfg.prep.num_replicates + 1)) for replicate_num in range(1, cfg.prep.num_replicates + 1): - train_dur_replicate_split_name = vak.common.learncurve.get_train_dur_replicate_split_name( + subset_name = vak.common.learncurve.get_train_dur_replicate_subset_name( train_dur, replicate_num ) - all_split_names.append(train_dur_replicate_split_name) - - # assert directory holding split files exists - split_dir = tmp_dataset_path / train_dur_replicate_split_name - assert split_dir.exists() and split_dir.is_dir() # assert this train_dur + replicate split exists in dataframe - assert np.isin(train_dur_replicate_split_name, splits_df['split'].values) - this_split_df = splits_df[splits_df['split'] == train_dur_replicate_split_name] + assert np.isin(subset_name, subsets_df['subset'].values) + this_subset_df = subsets_df[subsets_df['subset'] == subset_name] # assert that it has the correct duration - assert this_split_df['duration'].sum() >= train_dur - + assert this_subset_df['duration'].sum() >= train_dur diff --git a/tests/test_prep/test_frame_classification/test_make_splits.py b/tests/test_prep/test_frame_classification/test_make_splits.py new file mode 100644 index 000000000..5d5ef11cf --- /dev/null +++ b/tests/test_prep/test_frame_classification/test_make_splits.py @@ -0,0 +1,236 @@ +"""Unit tests for vak.prep.frame_classification.make_splits""" +import json +import pathlib +import shutil + +import crowsetta +import numpy as np +import pandas as pd +import pytest + +import vak.prep.frame_classification.make_splits + + +@pytest.mark.parametrize( + 'annots, expected_sort_inds', + [ + ( + [ + crowsetta.Annotation(seq=crowsetta.Sequence.from_keyword( + onsets_s=[0.1, 0.3, 0.5], offsets_s=[0.2, 0.4, 0.6], labels=['a', 'b', 'b'] + ), annot_path='./fake'), + crowsetta.Annotation(seq=crowsetta.Sequence.from_keyword( + onsets_s=[0.1, 0.3, 
0.5], offsets_s=[0.2, 0.4, 0.6], labels=['b', 'b', 'b'] + ), annot_path='./fake'), + ], + [0, 1,] + ), + ( + [ + crowsetta.Annotation(seq=crowsetta.Sequence.from_keyword( + onsets_s=[0.1, 0.3, 0.5], offsets_s=[0.2, 0.4, 0.6], labels=['a', 'b', 'b'] + ), annot_path='./fake'), + crowsetta.Annotation(seq=crowsetta.Sequence.from_keyword( + onsets_s=[0.1, 0.3, 0.5], offsets_s=[0.2, 0.4, 0.6], labels=['b', 'b', 'b'] + ), annot_path='./fake'), + crowsetta.Annotation(seq=crowsetta.Sequence.from_keyword( + onsets_s=[0.1, 0.3, 0.5], offsets_s=[0.2, 0.4, 0.6], labels=['b', 'b', 'b'] + ), annot_path='./fake'), + ], + [0, 1, 2], + ), + ] +) +def test_argsort_by_label_freq(annots, expected_sort_inds): + out = vak.prep.frame_classification.make_splits.argsort_by_label_freq(annots) + assert isinstance(out, list) + assert out == expected_sort_inds + + +def copy_dataset_df_files_to_tmp_path_data_dir(dataset_df, dataset_path, config_type, input_type, tmp_path_data_dir): + """Copy all the files in a dataset DataFrame to a `tmp_path_data_dir`, + and change the paths in the Dataframe, so that we can then call + `vak.prep.frame_classification.helper.move_files_into_split_subdirs`.""" + paths_cols = [] + if input_type == 'spect': + paths_cols.append('spect_path') + elif input_type == 'audio': + paths_cols.append('audio_path') + if config_type != 'predict': + paths_cols.append('annot_path') + for paths_col in paths_cols: + paths = dataset_df[paths_col].values + new_paths = [] + for path in paths: + new_path = shutil.copy(src=dataset_path / path, dst=tmp_path_data_dir) + new_paths.append(new_path) + dataset_df[paths_col] = new_paths + return dataset_df + + +@pytest.mark.parametrize( + 'config_type, model_name, audio_format, spect_format, annot_format, input_type', + [ + ('train', 'TweetyNet', 'cbin', None, 'notmat', 'spect'), + ('predict', 'TweetyNet', 'cbin', None, 'notmat', 'spect'), + ('eval', 'TweetyNet', 'cbin', None, 'notmat', 'spect'), + ('train', 'TweetyNet', None, 'mat', 'yarden', 
'spect'), + ('learncurve', 'TweetyNet', 'cbin', None, 'notmat', 'spect'), + # TODO: add audio cases + ] +) +def test_make_splits(config_type, model_name, audio_format, spect_format, annot_format, + input_type, tmp_path, specific_config_toml_path, specific_source_files_with_split_df): + toml_path = specific_config_toml_path( + config_type, + model_name, + annot_format, + audio_format, + spect_format, + ) + cfg = vak.config.parse.from_toml_path(toml_path) + + # ---- set up ---- + tmp_dataset_path = tmp_path / 'dataset_dir' + tmp_dataset_path.mkdir() + + purpose = config_type + + dataset_df = specific_source_files_with_split_df( + config_type, + model_name, + annot_format, + audio_format, + spect_format, + ) + if purpose != "predict": + # TODO: add option to generate predict using existing dataset, so we can get labelmap from it + map_unlabeled_segments = vak.prep.sequence_dataset.has_unlabeled_segments( + dataset_df + ) + labelmap = vak.common.labels.to_map( + cfg.prep.labelset, map_unlabeled=map_unlabeled_segments + ) + else: + labelmap = None + + dataset_df_with_splits = vak.prep.frame_classification.make_splits.make_splits( + dataset_df, + tmp_dataset_path, + cfg.prep.input_type, + purpose, + labelmap, + cfg.prep.audio_format, + ) + assert isinstance(dataset_df_with_splits, pd.DataFrame) + + splits = [ + split + for split in sorted(dataset_df_with_splits.split.dropna().unique()) + if split != "None" + ] + + for split in splits: + split_subdir = tmp_dataset_path / split + if split != 'None': + assert split_subdir.exists() + elif split == 'None': + assert not split_subdir.exists() + + split_df = dataset_df_with_splits[ + dataset_df_with_splits.split == split + ].copy() + + assert vak.datasets.frame_classification.constants.FRAMES_PATH_COL_NAME in split_df.columns + + frames_paths = split_df[ + vak.datasets.frame_classification.constants.FRAMES_PATH_COL_NAME + ].values + + if purpose != "predict": + assert 
vak.datasets.frame_classification.constants.FRAME_LABELS_NPY_PATH_COL_NAME in split_df.columns + + frame_labels_paths = split_df[ + vak.datasets.frame_classification.constants.FRAME_LABELS_NPY_PATH_COL_NAME + ].values + + annots = vak.common.annotation.from_df(split_df) + + frames_tuples = [ + (frames_path, frame_labels_path, annot) + for frames_path, frame_labels_path, annot in zip( + frames_paths, frame_labels_paths, annots + ) + ] + else: + frames_tuples = [ + (frames_path, None, None) + for frames_path in frames_paths + ] + + sample_id_vecs, inds_in_sample_vecs = [], [] + for sample_id, frames_tuple in enumerate(frames_tuples): + frames_path, frame_labels_path, annot = frames_tuple + frames_file_that_should_exist = tmp_dataset_path / frames_path + assert frames_file_that_should_exist.exists() + + # NOTE we load frames to confirm we can and also to make indexing vectors we use to test, + # see next code block + frames = vak.datasets.frame_classification.helper.load_frames(tmp_dataset_path / frames_path, input_type) + assert isinstance(frames, np.ndarray) + + # make indexing vectors that we use to test + n_frames = frames.shape[-1] + sample_id_vecs.append(np.ones((n_frames,)).astype(np.int32) * sample_id) + inds_in_sample_vecs.append(np.arange(n_frames)) + + if frame_labels_path is not None and annot is not None: + frame_labels_file_that_should_exist = tmp_dataset_path / frame_labels_path + assert frame_labels_file_that_should_exist.exists() + + if input_type == "audio": + _, samplefreq = vak.common.constants.AUDIO_FORMAT_FUNC_MAP[ + audio_format + ](tmp_dataset_path / frames_path) + frame_times = np.arange(frames.shape[-1]) / samplefreq + elif input_type == "spect": + spect_dict = vak.common.files.spect.load(tmp_dataset_path / frames_path, "npz") + frame_times = spect_dict[vak.common.constants.TIMEBINS_KEY] + + lbls_int = [labelmap[lbl] for lbl in annot.seq.labels] + expected_frame_labels = vak.transforms.frame_labels.from_segments( + lbls_int, + 
annot.seq.onsets_s, + annot.seq.offsets_s, + frame_times, + unlabeled_label=labelmap["unlabeled"], + ) + frame_labels = np.load(frame_labels_file_that_should_exist) + assert np.array_equal(frame_labels, expected_frame_labels) + + # assert there are no remaining .spect.npz files in dataset path (root) + # because they were moved in to splits, and we removed any remaining that were not put into splits + spect_npz_files_not_in_split = sorted( + tmp_dataset_path.glob(f'*{vak.common.constants.SPECT_NPZ_EXTENSION}') + ) + assert len(spect_npz_files_not_in_split) == 0 + + sample_id_vec_path = ( + split_subdir / + vak.datasets.frame_classification.constants.SAMPLE_IDS_ARRAY_FILENAME + ) + assert sample_id_vec_path.exists() + + expected_sample_id_vec = np.concatenate(sample_id_vecs) + sample_id_vec = np.load(sample_id_vec_path) + assert np.array_equal(sample_id_vec, expected_sample_id_vec) + + inds_in_sample_vec_path = ( + split_subdir / + vak.datasets.frame_classification.constants.INDS_IN_SAMPLE_ARRAY_FILENAME + ) + assert inds_in_sample_vec_path.exists() + + expected_inds_in_sample_vec = np.concatenate(inds_in_sample_vecs) + inds_in_sample_vec = np.load(inds_in_sample_vec_path) + assert np.array_equal(inds_in_sample_vec, expected_inds_in_sample_vec) diff --git a/tests/test_prep/test_prep.py b/tests/test_prep/test_prep.py index 4481af81c..8e995f8bc 100644 --- a/tests/test_prep/test_prep.py +++ b/tests/test_prep/test_prep.py @@ -13,9 +13,7 @@ ("eval", "cbin", None, "notmat", "vak.prep.prep_.prep_frame_classification_dataset"), ("learncurve", "cbin", None, "notmat", "vak.prep.prep_.prep_frame_classification_dataset"), ("predict", "cbin", None, "notmat", "vak.prep.prep_.prep_frame_classification_dataset"), - ("predict", "wav", None, "birdsong-recognition-dataset", "vak.prep.prep_.prep_frame_classification_dataset"), ("train", "cbin", None, "notmat", "vak.prep.prep_.prep_frame_classification_dataset"), - ("train", "wav", None, "birdsong-recognition-dataset", 
"vak.prep.prep_.prep_frame_classification_dataset"), ("train", None, "mat", "yarden", "vak.prep.prep_.prep_frame_classification_dataset"), ], ) @@ -25,7 +23,7 @@ def test_prep( spect_format, annot_format, dataset_prep_func_to_mock, - specific_config, + specific_config_toml_path, default_model, tmp_path, ): @@ -42,7 +40,7 @@ def test_prep( "value": str(output_dir), }, ] - toml_path = specific_config( + toml_path = specific_config_toml_path( config_type=config_type, model=default_model, audio_format=audio_format, diff --git a/tests/test_prep/test_sequence_dataset.py b/tests/test_prep/test_sequence_dataset.py index 80ef3739e..66410204d 100644 --- a/tests/test_prep/test_sequence_dataset.py +++ b/tests/test_prep/test_sequence_dataset.py @@ -8,7 +8,6 @@ 'model_name, config_type, audio_format, spect_format, annot_format, expected_result', [ ("TweetyNet", "train", "cbin", None, "notmat", True), - ("TweetyNet", "train", "wav", None, "birdsong-recognition-dataset", True), ("TweetyNet", "train", None, "mat", "yarden", True), ] ) diff --git a/tests/test_prep/test_spectrogram_dataset/test_prep.py b/tests/test_prep/test_spectrogram_dataset/test_prep.py index 792ff1c85..f72244883 100644 --- a/tests/test_prep/test_spectrogram_dataset/test_prep.py +++ b/tests/test_prep/test_spectrogram_dataset/test_prep.py @@ -101,45 +101,32 @@ def assert_returned_dataframe_matches_expected( ] ) - # test that all the generated spectrogram files are in a - # newly-created directory inside spect_output_dir + # test that all the generated spectrogram files are in spect_output_dir assert all( [ - spect_path.parents[1] == spect_output_dir + spect_path.parents[0] == spect_output_dir for spect_path in spect_paths_from_df ] ) elif spect_format: # implies that --> we made the dataframe from spect files - if spect_format == 'mat': - expected_spect_file_names = [ - spect_path.name.replace('.mat', '.npz') - for spect_path in expected_spect_paths - ] - else: - expected_spect_file_names = [ - spect_path.name for 
spect_path in expected_spect_paths - ] + expected_spect_file_names = [ + spect_path.name for spect_path in expected_spect_paths + ] assert all( - [expected_spect_file_name in spect_file_names_from_df - for expected_spect_file_name in expected_spect_file_names] + [spect_file_name_from_df in expected_spect_file_names + for spect_file_name_from_df in spect_file_names_from_df] ) # test that **only** expected paths were in DataFrame if not_expected_spect_paths is not None: - if spect_format == 'mat': - not_expected_spect_file_names = [ - spect_path.name.replace('.mat', '.npz') - for spect_path in not_expected_spect_paths - ] - else: - not_expected_spect_file_names = [ - spect_path.name for spect_path in not_expected_spect_paths - ] + not_expected_spect_file_names = [ + spect_path.name for spect_path in not_expected_spect_paths + ] assert all( - [not_expected_spect_file_name not in spect_file_names_from_df - for not_expected_spect_file_name in not_expected_spect_file_names] + [spect_file_name_from_df not in not_expected_spect_file_names + for spect_file_name_from_df in spect_file_names_from_df] ) diff --git a/tests/test_prep/test_spectrogram_dataset/test_spect_helper.py b/tests/test_prep/test_spectrogram_dataset/test_spect_helper.py index 77babd444..a5864b47f 100644 --- a/tests/test_prep/test_spectrogram_dataset/test_spect_helper.py +++ b/tests/test_prep/test_spectrogram_dataset/test_spect_helper.py @@ -29,35 +29,23 @@ def assert_expected_spect_paths_in_dataframe( of paths to spectrogram files, that should **not** be in dataset_df.spect_path column """ spect_file_names_from_df = [spect_path.name for spect_path in spect_paths_from_df] - - if spect_format == 'mat': - expected_spectfile_names = [ - spect_path.name.replace('.mat', '.npz') - for spect_path in expected_spect_paths - ] - else: - expected_spectfile_names = [ - spect_path.name for spect_path in expected_spect_paths - ] + expected_spectfile_names = [ + spect_path.name for spect_path in expected_spect_paths + ] 
assert all( - [expected_spect_file in spect_file_names_from_df for expected_spect_file in expected_spectfile_names] + [spect_file_name_from_df in expected_spectfile_names + for spect_file_name_from_df in spect_file_names_from_df] ) # test that **only** expected paths were in DataFrame if not_expected_spect_paths is not None: - if spect_format == 'mat': - not_expected_spectfile_names = [ - spect_path.name.replace('.mat', '.npz') - for spect_path in not_expected_spect_paths - ] - else: - not_expected_spectfile_names = [ - spect_path.name for spect_path in not_expected_spect_paths - ] + not_expected_spectfile_names = [ + spect_path.name for spect_path in not_expected_spect_paths + ] assert all( - [not_expected_spect_file not in spect_file_names_from_df - for not_expected_spect_file in not_expected_spectfile_names] + [spect_file_name_from_df not in not_expected_spectfile_names + for spect_file_name_from_df in spect_file_names_from_df] ) @@ -119,7 +107,6 @@ def test_make_dataframe_of_spect_files( spect_format=spect_format, spect_dir=spect_dir, spect_files=spect_files, - spect_output_dir=spect_output_dir, labelset=labelset, annot_list=annot_list, annot_format=annot_format, @@ -144,13 +131,10 @@ def test_make_dataframe_of_spect_files( expected_spect_list, not_expected_spect_list ) - if spect_format == 'mat': - expected_parent = spect_output_dir - else: - expected_parent = specific_spect_dir(spect_format) - assert all( - [spect_path.parent == expected_parent for spect_path in spect_paths_from_df] - ) + if spect_dir is not None: + assert all( + [spect_path.parent == spect_dir for spect_path in spect_paths_from_df] + ) def test_make_dataframe_of_spect_files_no_spect_dir_files_or_map_raises(annot_list_yarden): diff --git a/tests/test_train/test_frame_classification.py b/tests/test_train/test_frame_classification.py index c2ef7f201..9cddd7e17 100644 --- a/tests/test_train/test_frame_classification.py +++ b/tests/test_train/test_frame_classification.py @@ -38,12 +38,11 @@ def 
assert_train_output_matches_expected(cfg: vak.config.config.Config, model_na "model_name, audio_format, spect_format, annot_format", [ ("TweetyNet", "cbin", None, "notmat"), - ("TweetyNet", "wav", None, "birdsong-recognition-dataset"), ("TweetyNet", None, "mat", "yarden"), ], ) def test_train_frame_classification_model( - model_name, audio_format, spect_format, annot_format, specific_config, tmp_path, device + model_name, audio_format, spect_format, annot_format, specific_config_toml_path, tmp_path, device ): results_path = vak.common.paths.generate_results_dir_name_as_path(tmp_path) results_path.mkdir() @@ -51,7 +50,7 @@ def test_train_frame_classification_model( {"section": "TRAIN", "option": "device", "value": device}, {"section": "TRAIN", "option": "root_results_dir", "value": results_path} ] - toml_path = specific_config( + toml_path = specific_config_toml_path( config_type="train", model=model_name, audio_format=audio_format, @@ -92,12 +91,11 @@ def test_train_frame_classification_model( "model_name, audio_format, spect_format, annot_format", [ ("TweetyNet", "cbin", None, "notmat"), - ("TweetyNet", "wav", None, "birdsong-recognition-dataset"), ("TweetyNet", None, "mat", "yarden"), ], ) def test_continue_training( - model_name, audio_format, spect_format, annot_format, specific_config, tmp_path, device + model_name, audio_format, spect_format, annot_format, specific_config_toml_path, tmp_path, device ): results_path = vak.common.paths.generate_results_dir_name_as_path(tmp_path) results_path.mkdir() @@ -105,7 +103,7 @@ def test_continue_training( {"section": "TRAIN", "option": "device", "value": device}, {"section": "TRAIN", "option": "root_results_dir", "value": results_path} ] - toml_path = specific_config( + toml_path = specific_config_toml_path( config_type="train_continue", model=model_name, audio_format=audio_format, @@ -149,7 +147,7 @@ def test_continue_training( ] ) def test_train_raises_file_not_found( - path_option_to_change, specific_config, 
tmp_path, device + path_option_to_change, specific_config_toml_path, tmp_path, device ): """Test that pre-conditions in `vak.train` raise FileNotFoundError when one of the following does not exist: @@ -159,7 +157,7 @@ def test_train_raises_file_not_found( {"section": "TRAIN", "option": "device", "value": device}, path_option_to_change ] - toml_path = specific_config( + toml_path = specific_config_toml_path( config_type="train", model="TweetyNet", audio_format="cbin", @@ -204,7 +202,7 @@ def test_train_raises_file_not_found( ] ) def test_train_raises_not_a_directory( - path_option_to_change, specific_config, device, tmp_path + path_option_to_change, specific_config_toml_path, device, tmp_path ): """Test that core.train raises NotADirectory when directory does not exist @@ -214,7 +212,7 @@ def test_train_raises_not_a_directory( {"section": "TRAIN", "option": "device", "value": device}, ] - toml_path = specific_config( + toml_path = specific_config_toml_path( config_type="train", model="TweetyNet", audio_format="cbin", diff --git a/tests/test_train/test_parametric_umap.py b/tests/test_train/test_parametric_umap.py index 11e5f709c..a64516e0a 100644 --- a/tests/test_train/test_parametric_umap.py +++ b/tests/test_train/test_parametric_umap.py @@ -35,7 +35,7 @@ def assert_train_output_matches_expected(cfg: vak.config.config.Config, model_na ) def test_train_parametric_umap_model( model_name, audio_format, spect_format, annot_format, - specific_config, tmp_path, device + specific_config_toml_path, tmp_path, device ): results_path = vak.common.paths.generate_results_dir_name_as_path(tmp_path) results_path.mkdir() @@ -43,7 +43,7 @@ def test_train_parametric_umap_model( {"section": "TRAIN", "option": "device", "value": device}, {"section": "TRAIN", "option": "root_results_dir", "value": results_path} ] - toml_path = specific_config( + toml_path = specific_config_toml_path( config_type="train", model=model_name, audio_format=audio_format, @@ -83,7 +83,7 @@ def 
test_train_parametric_umap_model( ] ) def test_train_parametric_umap_model_raises_file_not_found( - path_option_to_change, specific_config, tmp_path, device + path_option_to_change, specific_config_toml_path, tmp_path, device ): """Test that pre-conditions in :func:`vak.train.parametric_umap.train_parametric_umap_model` raise FileNotFoundError when one of the following does not exist: @@ -93,7 +93,7 @@ def test_train_parametric_umap_model_raises_file_not_found( {"section": "TRAIN", "option": "device", "value": device}, path_option_to_change ] - toml_path = specific_config( + toml_path = specific_config_toml_path( config_type="train", model="ConvEncoderUMAP", audio_format="cbin", @@ -135,7 +135,7 @@ def test_train_parametric_umap_model_raises_file_not_found( ] ) def test_train_parametric_umap_model_raises_not_a_directory( - path_option_to_change, specific_config, device, tmp_path + path_option_to_change, specific_config_toml_path, device, tmp_path ): """Test that core.train raises NotADirectory when directory does not exist @@ -145,7 +145,7 @@ def test_train_parametric_umap_model_raises_not_a_directory( {"section": "TRAIN", "option": "device", "value": device}, ] - toml_path = specific_config( + toml_path = specific_config_toml_path( config_type="train", model="ConvEncoderUMAP", audio_format="cbin", diff --git a/tests/test_train/test_train.py b/tests/test_train/test_train.py index c43e6631f..559853a24 100644 --- a/tests/test_train/test_train.py +++ b/tests/test_train/test_train.py @@ -14,8 +14,6 @@ [ ("cbin", None, "notmat", "TweetyNet", 'vak.train.train_.train_frame_classification_model'), - ("wav", None, "birdsong-recognition-dataset", "TweetyNet", - 'vak.train.train_.train_frame_classification_model'), (None, "mat", "yarden", "TweetyNet", 'vak.train.train_.train_frame_classification_model'), ("cbin", None, "notmat", "ConvEncoderUMAP", @@ -24,7 +22,7 @@ ) def test_train( audio_format, spect_format, annot_format, model_name, train_function_to_mock, - 
specific_config, tmp_path + specific_config_toml_path, tmp_path ): """Test that :func:`vak.train.train` dispatches to the correct model-specific training functions""" @@ -40,7 +38,7 @@ def test_train( {"section": "TRAIN", "option": "device", "value": 'cpu'}, ] - toml_path = specific_config( + toml_path = specific_config_toml_path( config_type="train", model=model_name, audio_format=audio_format, diff --git a/tests/test_transforms/test_transforms.py b/tests/test_transforms/test_transforms.py index a90270388..b71a42749 100644 --- a/tests/test_transforms/test_transforms.py +++ b/tests/test_transforms/test_transforms.py @@ -60,26 +60,19 @@ def test_fit_dataset_path(self, split, train_cbin_notmat_df, annot_format="notmat" ) - dataset_csv_path = specific_dataset_csv_path( - config_type="train", - model="TweetyNet", - audio_format="cbin", - annot_format="notmat" - ) - if split is None: split_to_test = 'train' else: split_to_test = split # ---- set up df_split = train_cbin_notmat_df[train_cbin_notmat_df.split == split_to_test].copy() - spect_paths = df_split['spect_path'].values - spect = vak.common.files.spect.load(dataset_path / spect_paths[0])['s'] + spect_paths = df_split['frames_path'].values + spect = vak.common.files.spect.load(dataset_path / spect_paths[0])[vak.common.constants.SPECT_KEY] mean_freqs = np.mean(spect, axis=1) std_freqs = np.std(spect, axis=1) for spect_path in spect_paths[1:]: - spect = vak.common.files.spect.load(dataset_path / spect_path)['s'] + spect = vak.common.files.spect.load(dataset_path / spect_path)[vak.common.constants.SPECT_KEY] mean_freqs += np.mean(spect, axis=1) std_freqs += np.std(spect, axis=1) expected_mean_freqs = mean_freqs / len(spect_paths)