From d0810ee4933f202e7f539cc5ca0f20ae072a0a4f Mon Sep 17 00:00:00 2001
From: David Nicholson
Date: Tue, 10 Sep 2024 14:31:48 -0400
Subject: [PATCH] Rename BioSoundSegBench dataset -> CMACBench and refactor, fix #776

- Move class into a sub-package, with separate modules for helper
  functions and default transforms
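After this change, the public import surface looks like the minimal
sketch below (the splits json filename and dataset directory are
hypothetical placeholders, for illustration only):

    from vak.datasets import CMACBench, SplitsMetadata, DATASETS

    # the registry entry is renamed along with the class
    assert DATASETS["CMACBench"] is CMACBench

    # SplitsMetadata now lives in vak.datasets.cmacbench.helper,
    # but is still re-exported from vak.datasets
    splits_metadata = SplitsMetadata.from_paths(
        json_path="CMACBench/splits/example.splits.json",  # hypothetical path
        dataset_path="CMACBench",  # hypothetical path
    )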
---
 src/vak/datasets/__init__.py             |  11 +-
 src/vak/datasets/cmacbench/__init__.py   |   0
 .../cmacbench.py}                        | 357 +-----
 src/vak/datasets/cmacbench/helper.py     | 156 ++++
 src/vak/datasets/cmacbench/transforms.py | 205 ++++
 5 files changed, 382 insertions(+), 347 deletions(-)
 create mode 100644 src/vak/datasets/cmacbench/__init__.py
 rename src/vak/datasets/{biosoundsegbench.py => cmacbench/cmacbench.py} (53%)
 create mode 100644 src/vak/datasets/cmacbench/helper.py
 create mode 100644 src/vak/datasets/cmacbench/transforms.py

diff --git a/src/vak/datasets/__init__.py b/src/vak/datasets/__init__.py
index aa2f3be75..c91cdc2f8 100644
--- a/src/vak/datasets/__init__.py
+++ b/src/vak/datasets/__init__.py
@@ -1,13 +1,14 @@
-from . import biosoundsegbench
-from .biosoundsegbench import BioSoundSegBench, SplitsMetadata
+from . import cmacbench
+from .cmacbench.cmacbench import CMACBench
+from .cmacbench.helper import SplitsMetadata
 from .get import get
 
 __all__ = [
-    "biosoundsegbench",
-    "BioSoundSegBench",
+    "cmacbench",
+    "CMACBench",
     "get",
     "SplitsMetadata",
 ]
 
 # TODO: make this a proper registry
-DATASETS = {"BioSoundSegBench": BioSoundSegBench}
+DATASETS = {"CMACBench": CMACBench}
diff --git a/src/vak/datasets/cmacbench/__init__.py b/src/vak/datasets/cmacbench/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/src/vak/datasets/biosoundsegbench.py b/src/vak/datasets/cmacbench/cmacbench.py
similarity index 53%
rename from src/vak/datasets/biosoundsegbench.py
rename to src/vak/datasets/cmacbench/cmacbench.py
index f4b5369a8..9b04aeeee 100644
--- a/src/vak/datasets/biosoundsegbench.py
+++ b/src/vak/datasets/cmacbench/cmacbench.py
@@ -1,5 +1,4 @@
-"""Class representing BioSoundSegBench dataset."""
-
+"""Class representing CMACBench dataset."""
 from __future__ import annotations
 
 import json
@@ -12,10 +11,14 @@ import torchvision.transforms
 from attrs import define
 
-from .. import common, datapipes, transforms
+from ... import common, datapipes, transforms
+
+from .helper import metadata_from_splits_json_path, SplitsMetadata
+from .transforms import TrainItemTransform, InferItemTransform
+
 
 if TYPE_CHECKING:
-    from ..transforms import FramesStandardizer
+    from ...transforms import FramesStandardizer
 
 
 VALID_TARGET_TYPES = (
@@ -33,345 +36,15 @@ BOUNDARY_FRAME_LABELS_PATH_COL_NAME = "boundary_frame_labels_path"
 
 
-@define
-class SampleIDVectorPaths:
-    train: pathlib.Path
-    val: pathlib.Path
-    test: pathlib.Path
-
-
-@define
-class IndsInSampleVectorPaths:
-    train: pathlib.Path
-    val: pathlib.Path
-    test: pathlib.Path
-
-
-@define
-class SplitsMetadata:
-    """Class that represents metadata about dataset splits
-    in the BioSoundSegBench dataset, loaded from a json file"""
-
-    splits_csv_path: pathlib.Path
-    sample_id_vector_paths: SampleIDVectorPaths
-    inds_in_sample_vector_paths: IndsInSampleVectorPaths
-
-    @classmethod
-    def from_paths(cls, json_path, dataset_path):
-        json_path = pathlib.Path(json_path)
-        with json_path.open("r") as fp:
-            splits_json = json.load(fp)
-
-        dataset_path = pathlib.Path(dataset_path)
-        if not dataset_path.exists() or not dataset_path.is_dir():
-            raise NotADirectoryError(
-                f"`dataset_path` not found or not a directory: {dataset_path}"
-            )
-
-        splits_csv_path = pathlib.Path(
-            dataset_path / splits_json["splits_csv_path"]
-        )
-        if not splits_csv_path.exists():
-            raise FileNotFoundError(
-                f"`splits_csv_path` not found: {splits_csv_path}"
-            )
-
-        sample_id_vector_paths = {
-            split: dataset_path / path
-            for split, path in splits_json["sample_id_vec_path"].items()
-        }
-        for split, vec_path in sample_id_vector_paths.items():
-            if not vec_path.exists():
-                raise FileNotFoundError(
-                    f"`sample_id_vector_path` for split '{split}' not found: {vec_path}"
-                )
-        sample_id_vector_paths = SampleIDVectorPaths(**sample_id_vector_paths)
-
-        inds_in_sample_vector_paths = {
-            split: dataset_path / path
-            for split, path in splits_json["inds_in_sample_vec_path"].items()
-        }
-        for split, vec_path in inds_in_sample_vector_paths.items():
-            if not vec_path.exists():
-                raise FileNotFoundError(
-                    f"`inds_in_sample_vec_path` for split '{split}' not found: {vec_path}"
-                )
-        inds_in_sample_vector_paths = IndsInSampleVectorPaths(
-            **inds_in_sample_vector_paths
-        )
-
-        return cls(
-            splits_csv_path,
-            sample_id_vector_paths,
-            inds_in_sample_vector_paths,
-        )
-
-
-@define
-class TrainingReplicateMetadata:
-    """Class representing metadata for a
-    pre-defined training replicate
-    in the BioSoundSegBench dataset.
+class CMACBench(torch.utils.data.Dataset):
+    """Class representing the CMACBench dataset.
+ + Notes + ----- + For more information about this dataset, please see + https://github.com/vocalpy/CMACBench """ - biosound_group: str - id: str | None - frame_dur: float - unit: str - data_source: str | None - train_dur: float - replicate_num: int - - -def metadata_from_splits_json_path( - splits_json_path: pathlib.Path, datset_path: pathlib.Path -) -> TrainingReplicateMetadata: - name = splits_json_path.name - try: - ( - biosound_group, - unit, - id_, - frame_dur_1st_half, - frame_dur_2nd_half, - data_source, - train_dur_1st_half, - train_dur_2nd_half, - replicate_num, - _, - _, - ) = name.split(".") - # Human-Speech doesn't have ID or data source in filename - # so it will raise a ValueError - except ValueError: - name = splits_json_path.name - ( - biosound_group, - unit, - frame_dur_1st_half, - frame_dur_2nd_half, - train_dur_1st_half, - train_dur_2nd_half, - replicate_num, - _, - _, - ) = name.split(".") - id_ = None - data_source = None - if id_ is not None: - id_ = id_.split("-")[-1] - frame_dur = float( - frame_dur_1st_half.split("-")[-1] - + "." - + frame_dur_2nd_half.split("-")[0] - ) - train_dur = float( - train_dur_1st_half.split("-")[-1] - + "." - + train_dur_2nd_half.split("-")[0] - ) - replicate_num = int(replicate_num.split("-")[-1]) - return TrainingReplicateMetadata( - biosound_group, - id_, - frame_dur, - unit, - data_source, - train_dur, - replicate_num, - ) - - -class TrainItemTransform: - """Default transform used when training frame classification models - with :class:`BioSoundSegBench` dataset.""" - - def __init__( - self, - frames_standardizer: FramesStandardizer | None = None, - ): - from ..transforms import FramesStandardizer # avoid circular import - - if frames_standardizer is not None: - if isinstance(frames_standardizer, FramesStandardizer): - frames_transform = [frames_standardizer] - else: - raise TypeError( - f"invalid type for frames_standardizer: {type(frames_standardizer)}. " - "Should be an instance of vak.transforms.StandardizeSpect" - ) - else: - frames_transform = [] - # add as an attribute on self so that high-level functions can save this class as needed - self.frames_standardizer = frames_standardizer - - frames_transform.extend( - [ - transforms.ToFloatTensor(), - transforms.AddChannel(), - ] - ) - self.frames_transform = torchvision.transforms.Compose( - frames_transform - ) - self.frame_labels_transform = transforms.ToLongTensor() - - def __call__( - self, - frames: torch.Tensor, - multi_frame_labels: torch.Tensor | None = None, - binary_frame_labels: torch.Tensor | None = None, - boundary_frame_labels: torch.Tensor | None = None, - ) -> dict: - frames = self.frames_transform(frames) - item = { - "frames": frames, - } - if multi_frame_labels is not None: - item["multi_frame_labels"] = self.frame_labels_transform( - multi_frame_labels - ) - - if binary_frame_labels is not None: - item["binary_frame_labels"] = self.frame_labels_transform( - binary_frame_labels - ) - - if boundary_frame_labels is not None: - item["boundary_frame_labels"] = self.frame_labels_transform( - boundary_frame_labels - ) - - return item - - -class InferItemTransform: - """Default transform used when running inference on classification models - with :class:`BioSoundSegBench` dataset, for evaluation or to generate new predictions. - - Returned item includes frames reshaped into a stack of windows, - with padded added to make reshaping possible. - Any `frame_labels` are not padded and reshaped, - but are converted to :class:`torch.LongTensor`. 
-    If return_padding_mask is True, item includes 'padding_mask' that
-    can be used to crop off any predictions made on the padding.
-
-    Attributes
-    ----------
-    frames_standardizer : vak.transforms.FramesStandardizer
-        instance that has already been fit to dataset, using fit_df method.
-        Default is None, in which case no standardization transform is applied.
-    window_size : int
-        width of window in number of elements. Argument to PadToWindow transform.
-    frames_padval : float
-        Value to pad frames with. Added to end of array, the "right side".
-        Argument to PadToWindow transform. Default is 0.0.
-    frame_labels_padval : int
-        Value to pad frame labels vector with. Added to the end of the array.
-        Argument to PadToWindow transform. Default is -1.
-        Used with ``ignore_index`` argument of :mod:`torch.nn.CrossEntropyLoss`.
-    return_padding_mask : bool
-        if True, the dictionary returned by ItemTransform classes will include
-        a boolean vector to use for cropping back down to size before padding.
-        padding_mask has size equal to width of padded array, i.e. original size
-        plus padding at the end, and has values of 1 where
-        columns in padded are from the original array,
-        and values of 0 where columns were added for padding.
-    """
-
-    def __init__(
-        self,
-        window_size,
-        frames_standardizer=None,
-        frames_padval=0.0,
-        frame_labels_padval=-1,
-        return_padding_mask=True,
-        channel_dim=1,
-    ):
-        from ..transforms import FramesStandardizer  # avoid circular import
-
-        self.window_size = window_size
-        self.frames_padval = frames_padval
-        self.frame_labels_padval = frame_labels_padval
-        self.return_padding_mask = return_padding_mask
-        self.channel_dim = channel_dim
-
-        if frames_standardizer is not None:
-            if not isinstance(frames_standardizer, FramesStandardizer):
-                raise TypeError(
-                    f"Invalid type for frames_standardizer: {type(frames_standardizer)}. "
" - "Should be an instance of vak.transforms.FramesStandardizer" - ) - # add as an attribute on self to use inside __call__ - # *and* so that high-level functions can save this class as needed - self.frames_standardizer = frames_standardizer - - self.pad_to_window = transforms.PadToWindow( - window_size, frames_padval, return_padding_mask=return_padding_mask - ) - - self.frames_transform_after_pad = torchvision.transforms.Compose( - [ - transforms.ViewAsWindowBatch(window_size), - transforms.ToFloatTensor(), - # below, add channel at first dimension because windows become batch - transforms.AddChannel(channel_dim=channel_dim), - ] - ) - - self.frame_labels_padval = frame_labels_padval - self.frame_labels_transform = transforms.ToLongTensor() - - def __call__( - self, - frames: torch.Tensor, - multi_frame_labels: torch.Tensor | None = None, - binary_frame_labels: torch.Tensor | None = None, - boundary_frame_labels: torch.Tensor | None = None, - frames_path=None, - ) -> dict: - if self.frames_standardizer: - frames = self.frames_standardizer(frames) - - if self.pad_to_window.return_padding_mask: - frames, padding_mask = self.pad_to_window(frames) - else: - frames = self.pad_to_window(frames) - padding_mask = None - frames = self.frames_transform_after_pad(frames) - - item = { - "frames": frames, - } - - if multi_frame_labels is not None: - item["multi_frame_labels"] = self.frame_labels_transform( - multi_frame_labels - ) - - if binary_frame_labels is not None: - item["binary_frame_labels"] = self.frame_labels_transform( - binary_frame_labels - ) - - if boundary_frame_labels is not None: - item["boundary_frame_labels"] = self.frame_labels_transform( - boundary_frame_labels - ) - - if padding_mask is not None: - item["padding_mask"] = padding_mask - - if frames_path is not None: - # make sure frames_path is a str, not a pathlib.Path - item["frames_path"] = str(frames_path) - - return item - - -class BioSoundSegBench: - """Class representing BioSoundSegBench dataset.""" - def __init__( self, dataset_path: str | pathlib.Path, @@ -575,7 +248,7 @@ def shape(self): elif ( self.split in ("val", "test", "predict") and len(input_shape) == 4 ): - # discard windows dimension from shape -- + # discard windows dimension from shape, # it's sample dependent and not what we want return input_shape[1:] diff --git a/src/vak/datasets/cmacbench/helper.py b/src/vak/datasets/cmacbench/helper.py new file mode 100644 index 000000000..bbcd2b71b --- /dev/null +++ b/src/vak/datasets/cmacbench/helper.py @@ -0,0 +1,156 @@ +"""Helper functions used with CMACBench dataset.""" +from __future__ import annotations + +import json +import pathlib + +from attrs import define + + +@define +class SampleIDVectorPaths: + train: pathlib.Path + val: pathlib.Path + test: pathlib.Path + + +@define +class IndsInSampleVectorPaths: + train: pathlib.Path + val: pathlib.Path + test: pathlib.Path + + +@define +class SplitsMetadata: + """Class that represents metadata about dataset splits + in the BioSoundSegBench dataset, loaded from a json file""" + + splits_csv_path: pathlib.Path + sample_id_vector_paths: SampleIDVectorPaths + inds_in_sample_vector_paths: IndsInSampleVectorPaths + + @classmethod + def from_paths(cls, json_path, dataset_path): + json_path = pathlib.Path(json_path) + with json_path.open("r") as fp: + splits_json = json.load(fp) + + dataset_path = pathlib.Path(dataset_path) + if not dataset_path.exists() or not dataset_path.is_dir(): + raise NotADirectoryError( + f"`dataset_path` not found or not a directory: {dataset_path}" + 
+
+        splits_csv_path = pathlib.Path(
+            dataset_path / splits_json["splits_csv_path"]
+        )
+        if not splits_csv_path.exists():
+            raise FileNotFoundError(
+                f"`splits_csv_path` not found: {splits_csv_path}"
+            )
+
+        sample_id_vector_paths = {
+            split: dataset_path / path
+            for split, path in splits_json["sample_id_vec_path"].items()
+        }
+        for split, vec_path in sample_id_vector_paths.items():
+            if not vec_path.exists():
+                raise FileNotFoundError(
+                    f"`sample_id_vector_path` for split '{split}' not found: {vec_path}"
+                )
+        sample_id_vector_paths = SampleIDVectorPaths(**sample_id_vector_paths)
+
+        inds_in_sample_vector_paths = {
+            split: dataset_path / path
+            for split, path in splits_json["inds_in_sample_vec_path"].items()
+        }
+        for split, vec_path in inds_in_sample_vector_paths.items():
+            if not vec_path.exists():
+                raise FileNotFoundError(
+                    f"`inds_in_sample_vec_path` for split '{split}' not found: {vec_path}"
+                )
+        inds_in_sample_vector_paths = IndsInSampleVectorPaths(
+            **inds_in_sample_vector_paths
+        )
+
+        return cls(
+            splits_csv_path,
+            sample_id_vector_paths,
+            inds_in_sample_vector_paths,
+        )
+
+
+@define
+class TrainingReplicateMetadata:
+    """Class representing metadata for a
+    pre-defined training replicate
+    in the CMACBench dataset.
+    """
+
+    biosound_group: str
+    id: str | None
+    frame_dur: float
+    unit: str
+    data_source: str | None
+    train_dur: float
+    replicate_num: int
+
+
+def metadata_from_splits_json_path(
+    splits_json_path: pathlib.Path, dataset_path: pathlib.Path
+) -> TrainingReplicateMetadata:
+    name = splits_json_path.name
+    try:
+        (
+            biosound_group,
+            unit,
+            id_,
+            frame_dur_1st_half,
+            frame_dur_2nd_half,
+            data_source,
+            train_dur_1st_half,
+            train_dur_2nd_half,
+            replicate_num,
+            _,
+            _,
+        ) = name.split(".")
+    # Human-Speech doesn't have ID or data source in filename
+    # so it will raise a ValueError
+    except ValueError:
+        name = splits_json_path.name
+        (
+            biosound_group,
+            unit,
+            frame_dur_1st_half,
+            frame_dur_2nd_half,
+            train_dur_1st_half,
+            train_dur_2nd_half,
+            replicate_num,
+            _,
+            _,
+        ) = name.split(".")
+        id_ = None
+        data_source = None
+    if id_ is not None:
+        id_ = id_.split("-")[-1]
+    frame_dur = float(
+        frame_dur_1st_half.split("-")[-1]
+        + "."
+        + frame_dur_2nd_half.split("-")[0]
+    )
+    train_dur = float(
+        train_dur_1st_half.split("-")[-1]
+        + "."
+        + train_dur_2nd_half.split("-")[0]
+    )
+    replicate_num = int(replicate_num.split("-")[-1])
+    return TrainingReplicateMetadata(
+        biosound_group,
+        id_,
+        frame_dur,
+        unit,
+        data_source,
+        train_dur,
+        replicate_num,
+    )
\ No newline at end of file
diff --git a/src/vak/datasets/cmacbench/transforms.py b/src/vak/datasets/cmacbench/transforms.py
new file mode 100644
index 000000000..96b26b0ce
--- /dev/null
+++ b/src/vak/datasets/cmacbench/transforms.py
@@ -0,0 +1,205 @@
+"""Default transforms for CMACBench dataset."""
+
+from __future__ import annotations
+
+import json
+import pathlib
+from typing import TYPE_CHECKING, Callable, Literal
+
+import numpy as np
+import pandas as pd
+import torch
+import torchvision.transforms
+from attrs import define
+
+from ... import common, datapipes, transforms
+
+if TYPE_CHECKING:
+    from ...transforms import FramesStandardizer
+
+
+class TrainItemTransform:
+    """Default transform used when training frame classification models
+    with :class:`CMACBench` dataset."""
+
+    def __init__(
+        self,
+        frames_standardizer: FramesStandardizer | None = None,
+    ):
+        from ...transforms import FramesStandardizer  # avoid circular import
+
+        if frames_standardizer is not None:
+            if isinstance(frames_standardizer, FramesStandardizer):
+                frames_transform = [frames_standardizer]
+            else:
+                raise TypeError(
+                    f"Invalid type for frames_standardizer: {type(frames_standardizer)}. "
+                    "Should be an instance of vak.transforms.FramesStandardizer"
+                )
+        else:
+            frames_transform = []
+        # add as an attribute on self so that high-level functions can save this class as needed
+        self.frames_standardizer = frames_standardizer
+
+        frames_transform.extend(
+            [
+                transforms.ToFloatTensor(),
+                transforms.AddChannel(),
+            ]
+        )
+        self.frames_transform = torchvision.transforms.Compose(
+            frames_transform
+        )
+        self.frame_labels_transform = transforms.ToLongTensor()
+
+    def __call__(
+        self,
+        frames: torch.Tensor,
+        multi_frame_labels: torch.Tensor | None = None,
+        binary_frame_labels: torch.Tensor | None = None,
+        boundary_frame_labels: torch.Tensor | None = None,
+    ) -> dict:
+        frames = self.frames_transform(frames)
+        item = {
+            "frames": frames,
+        }
+        if multi_frame_labels is not None:
+            item["multi_frame_labels"] = self.frame_labels_transform(
+                multi_frame_labels
+            )
+
+        if binary_frame_labels is not None:
+            item["binary_frame_labels"] = self.frame_labels_transform(
+                binary_frame_labels
+            )
+
+        if boundary_frame_labels is not None:
+            item["boundary_frame_labels"] = self.frame_labels_transform(
+                boundary_frame_labels
+            )
+
+        return item
+
+
+class InferItemTransform:
+    """Default transform used when running inference on classification models
+    with :class:`CMACBench` dataset, for evaluation or to generate new predictions.
+
+    Returned item includes frames reshaped into a stack of windows,
+    with padding added to make reshaping possible.
+    Any `frame_labels` are not padded and reshaped,
+    but are converted to :class:`torch.LongTensor`.
+    If return_padding_mask is True, item includes 'padding_mask' that
+    can be used to crop off any predictions made on the padding.
+
+    Attributes
+    ----------
+    frames_standardizer : vak.transforms.FramesStandardizer
+        Instance that has already been fit to dataset, using its fit_df method.
+        Default is None, in which case no standardization transform is applied.
+    window_size : int
+        Width of window in number of elements. Argument to PadToWindow transform.
+    frames_padval : float
+        Value to pad frames with. Added to end of array, the "right side".
+        Argument to PadToWindow transform. Default is 0.0.
+    frame_labels_padval : int
+        Value to pad frame labels vector with. Added to the end of the array.
+        Argument to PadToWindow transform. Default is -1.
+        Used with ``ignore_index`` argument of :class:`torch.nn.CrossEntropyLoss`.
+    return_padding_mask : bool
+        If True, the dictionary returned by ItemTransform classes will include
+        a boolean vector to use for cropping back down to size before padding.
+        padding_mask has size equal to width of padded array, i.e. original size
+        plus padding at the end, and has values of 1 where
+        columns in padded are from the original array,
+        and values of 0 where columns were added for padding.
+ """ + + def __init__( + self, + window_size, + frames_standardizer=None, + frames_padval=0.0, + frame_labels_padval=-1, + return_padding_mask=True, + channel_dim=1, + ): + from ...transforms import FramesStandardizer # avoid circular import + + self.window_size = window_size + self.frames_padval = frames_padval + self.frame_labels_padval = frame_labels_padval + self.return_padding_mask = return_padding_mask + self.channel_dim = channel_dim + + if frames_standardizer is not None: + if not isinstance(frames_standardizer, FramesStandardizer): + raise TypeError( + f"Invalid type for frames_standardizer: {type(frames_standardizer)}. " + "Should be an instance of vak.transforms.FramesStandardizer" + ) + # add as an attribute on self to use inside __call__ + # *and* so that high-level functions can save this class as needed + self.frames_standardizer = frames_standardizer + + self.pad_to_window = transforms.PadToWindow( + window_size, frames_padval, return_padding_mask=return_padding_mask + ) + + self.frames_transform_after_pad = torchvision.transforms.Compose( + [ + transforms.ViewAsWindowBatch(window_size), + transforms.ToFloatTensor(), + # below, add channel at first dimension because windows become batch + transforms.AddChannel(channel_dim=channel_dim), + ] + ) + + self.frame_labels_padval = frame_labels_padval + self.frame_labels_transform = transforms.ToLongTensor() + + def __call__( + self, + frames: torch.Tensor, + multi_frame_labels: torch.Tensor | None = None, + binary_frame_labels: torch.Tensor | None = None, + boundary_frame_labels: torch.Tensor | None = None, + frames_path=None, + ) -> dict: + if self.frames_standardizer: + frames = self.frames_standardizer(frames) + + if self.pad_to_window.return_padding_mask: + frames, padding_mask = self.pad_to_window(frames) + else: + frames = self.pad_to_window(frames) + padding_mask = None + frames = self.frames_transform_after_pad(frames) + + item = { + "frames": frames, + } + + if multi_frame_labels is not None: + item["multi_frame_labels"] = self.frame_labels_transform( + multi_frame_labels + ) + + if binary_frame_labels is not None: + item["binary_frame_labels"] = self.frame_labels_transform( + binary_frame_labels + ) + + if boundary_frame_labels is not None: + item["boundary_frame_labels"] = self.frame_labels_transform( + boundary_frame_labels + ) + + if padding_mask is not None: + item["padding_mask"] = padding_mask + + if frames_path is not None: + # make sure frames_path is a str, not a pathlib.Path + item["frames_path"] = str(frames_path) + + return item \ No newline at end of file
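
Usage sketch for the default transforms split out into transforms.py
(array shapes, the window size, and the random data below are
hypothetical, chosen only for illustration):

    import numpy as np

    from vak.datasets.cmacbench.transforms import (
        InferItemTransform,
        TrainItemTransform,
    )

    # a spectrogram-like array: (frequency bins, time frames)
    frames = np.random.rand(128, 1000)
    boundary_frame_labels = np.zeros(1000, dtype=int)

    # training: frames -> float tensor with channel dim; labels -> long tensor
    train_item = TrainItemTransform()(
        frames, boundary_frame_labels=boundary_frame_labels
    )

    # inference: frames are padded, then reshaped to a batch of windows;
    # "padding_mask" lets callers crop off predictions made on the padding
    infer_item = InferItemTransform(window_size=176)(
        frames, boundary_frame_labels=boundary_frame_labels
    )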