From cbc3f828b066a98c8a2c20077fb5249feb230c81 Mon Sep 17 00:00:00 2001 From: David Nicholson Date: Tue, 10 Oct 2023 17:35:47 -0400 Subject: [PATCH] ENH: Minimize frame classification dataset size, fix #717 (#718) * Change function vak.prep.frame_classification.dataset_arrays.make_npy_files_for_each_split to remove spectrogram/audio files from dataset path after making the npy files * Modify prep_spectrogram_dataset so that it no longer makes a directory 'spectrogram_generated_{timenow} -- that way we don't have to delete the directory when we remove the spectrograms after converting to npy files later * Rename get_train_dur_replicate_split_name -> get_train_dur_replicate_subset_name in src/vak/common/learncurve.py * Modify src/vak/prep/frame_classification/learncurve.py to no longer make duplicate npy files for each subset names, and to add subset names in a separate column from split so that we can specify subsets directly in learncurve * Add subset parameter to src/vak/datasets/frame_classification/frames_dataset.py, that takes precedence over split parameter when selecting part of dataframe to use for grabbing samples * Add subset parameter to src/vak/datasets/frame_classification/window_dataset.py, that takes precedence over split parameter when selecting part of dataframe to use for grabbing samples * Rename split parameter of vak.train.frame_classification to subset, and use when making training dataset instance * Use subset inside of src/vak/learncurve/frame_classification.py * Have StandardizeSpect.fit_dataset_path take subset argument and have it take precedence over split when fitting, as with dataset classes * Use split + subset when calling StandardizeSpect.fit_dataset_path in src/vak/train/frame_classification.py * Use subset not split argument when calling training functions for model families in src/vak/train/train_.py * WIP: Use subset with ParametricUMAPDataset (haven't added argument to dataset class yet) * Add function 
`make_index_vectors_for_each_subset` to src/vak/prep/frame_classification/learncurve.py, rename `make_learncurve_splits` to `make_subsets_from_dataset_df` and have it call `make_index_vectors` * Revise a couple things in docstring in src/vak/prep/frame_classification/dataset_arrays.py * Have audio_format default to None in src/vak/prep/frame_classification/dataset_arrays.py and raise ValueError if input_type is audio but audio_format is None * Fix parameter order of function in src/vak/prep/frame_classification/learncurve.py to match order of dataset_arrays so it's not confusing, and set default of audio_format to None, raise a ValueError if input_type is audio but audio_format is None * In src/vak/prep/frame_classification/frame_classification.py, call make_subsets_from_dataset_df with correct arguments (now renamed from make_learncurve_splits_from_dataset_df) * Add src/vak/datasets/frame_classification/helper.py with helper functions that return filenames of indexing vectors for subsets of (training) data * Import helper in src/vak/datasets/frame_classification/__init__.py * Use helper functions to load indexing vectors for subsets in classmethod of src/vak/datasets/frame_classification/window_dataset.py * Use helper functions to load indexing vectors for subsets in classmethod of src/vak/datasets/frame_classification/frames_dataset.py * Rewrite functions in src/vak/prep/frame_classification/frame_classification.py -- realize I can just use frame npy files to make indexing vectors, so I don't need input type, audio format, etc. 
* Fix args to make_index_vectors_for_each_subset and fix how we concatenate dataset_df in src/vak/prep/frame_classification/learncurve.py * Fix how we use subset in FramesDataset.__init__ * Fix how we use subset in WindowDataset.__init__ * Change word 'split' -> 'subset' in src/vak/learncurve/frame_classification.py * Fix docstrings in src/vak/datasets/frame_classification/window_dataset.py * Fix docstrings in src/vak/datasets/frame_classification/frames_dataset.py * Fix a typo in a docstring in src/vak/datasets/frame_classification/window_dataset.py * Fix subset parameter of classmethod for ParametricUMAPDataset class; move logic from classmethod into __init__ although I'm not sure this is a good idea * Rename frame_classification/dataset_arrays.py to frame_classification/make_splits.py and rewrite 'make_npy_paths' as 'make_splits', have it move/copy/create audio or spectrogram files in split dirs, in addition to making npy files, and update the 'audio_path' or 'spect_path' columns with the files in the split dirs * Remove constants from src/vak/datasets/frame_classification/constants.py that are no longer used for 'frames' files * Use make_splits function in src/vak/prep/frame_classification/frame_classification.py * Modify make_dataframe_of_spect_files function in src/vak/prep/spectrogram_dataset/spect_helper.py so it no longer converts mat files into npz files, instead it just finds/collates all the spect files and returns them in the dataframe; any converting is done by frame_classification.make_splits with the output of this function * Fix typo in list comprehension and add info to docstring in src/vak/prep/frame_classification/make_splits.py * Fix imports in src/vak/prep/frame_classification/__init__.py after renaming module to 'make_splits' * Remove other occurrences of 'spect_output_dir' from src/vak/prep/spectrogram_dataset/spect_helper.py, no longer is a parameter and not used * No longer pass 'spect_output_dir' into 'prep_spectrogram_dataset' in 
src/vak/prep/spectrogram_dataset/prep.py * Remove unused import in src/vak/prep/spectrogram_dataset/spect_helper.py * Add logger statement in src/vak/prep/frame_classification/make_splits.py * Fix src/vak/prep/frame_classification/learncurve.py so functions use either spect or audio to get frames and make indexing vectors * Fix src/vak/prep/frame_classification/frame_classification.py so we pass needed parameters into make_subsets_from_dataset_df * Make x_path relative to dataset_path in src/vak/prep/frame_classification/frame_classification.py, since that's what downstream functions/classes expect * Rename x_path -> source_path in src/vak/prep/frame_classification/make_splits.py * Rename x_path -> source_path in src/vak/prep/frame_classification/learncurve.py * Rewrite frame_classification.WindowDataset to load audio/spectrograms directly from 'frame_paths' * Add FRAMES_PATH_COL_NAME to src/vak/datasets/frame_classification/constants.py * Rewrite make_splits.py to add frames_path column to dataframe, and have frame_classification models use that column always; this way we keep the original 'audio_path' and 'spect_path' columns as metadata, and avoid if/else logic everywhere in dataset classes * Fix WindowDataset to use constant to load frame paths column, and to validate input type, revise docstring * Fix FramesDataset the same way as WindowDataset: load frame paths with constant, load inside __getitem__ with helper function _load_frames, validate input type, fix order of attributes in docstring * Use self.dataset_path to build frames_path in WindowDataset * Use self.dataset_path to build frames_path in FramesDataset, and pass into transform as 'frames_path', not 'source_path' * Rename 'source_path' -> 'frames_path' inside src/vak/transforms/defaults/frame_classification.py * Rename 'source_path' -> 'frames_path' in FrameClassificationModel methods, in src/vak/models/frame_classification_model.py * Rename 'source_path' -> 'frames_path' in 
src/vak/predict/frame_classification.py * Add SPECT_KEY to common.constants * Fix how StandardizeSpect.from_dataset_path builds frames_path paths, and use constants.SPECT_KEY when loading from frames path * Use common.constants.SPECT_KEY inside _load_frames method of WindowDataset * Use common.constants.SPECT_KEY inside _load_frames method of FramesDataset * Add newline at end of src/vak/common/constants.py * Add FRAME_CLASSIFICATION_DATASET_AUDIO_FORMAT to src/vak/datasets/frame_classification/constants.py * Add function load_frames to src/vak/datasets/frame_classification/helper.py * Have WindowDataset._load_frames use helper.load_frames * Have FramesDataset._load_frames use helper.load_frames * Rename GENERATED_TEST_DATA -> GENERATED_TEST_DATA_ROOT in tests/scripts/vaktestdata/constants.py * Rename GENERATED_TEST_DATA -> GENERATED_TEST_DATA_ROOT in tests/scripts/vaktestdata/dirs.py * Add tests/scripts/vaktestdata/spect.py * import spect module in tests/scripts/vaktestdata/__init__.py * Call vaktestdata.spect.prep_spects in prep section of script tests/scripts/generate_data_for_tests.py * Fix spect_dir_npz fixture in tests/fixtures/spect.py to use directory of just .spect.npz files that is now generated by the generate_test_data script * Add SPECT_NPZ_EXTENSION to src/vak/common/constants.py * Use common.SPECT_NPZ_EXTENSION in src/vak/prep/spectrogram_dataset/audio_helper.py * Fix prep.frame_classification.make_splits to remove any .spect.npz files remaining in dataset_path, that were not moved into splits * Fix vak.prep.frame_classification.learncurve.make_index_vectors_for_subsets to use frame_paths column instead of 'source' paths (audio_path or spect_path) -- so we are using files that definitely exist and are already assigned to splits * WIP: Rewriting unit tests in tests/test_prep/test_frame_classification/test_learncurve.py * WIP: Rewriting unit tests in tests/test_prep/test_frame_classification/test_make_splits.py * WIP: Add 
tests/test_datasets/test_frame_classification/test_helper.py * Rename specific_config -> specific_config_toml_path * WIP: Rewriting tests/test_prep/test_frame_classification/test_make_splits.py * Add src/vak/prep/frame_classification/get_or_make_source_files.py * Add src/vak/prep/frame_classification/assign_samples_to_splits.py * Rewrite 'prep_frame_classification_dataset' to use helper functions factored out into other modules: get_or_make_source_files and assign_samples_to_splits * Capitalize in docstring in src/vak/prep/spectrogram_dataset/prep.py * Add TIMEBINS_KEY to src/vak/common/constants.py * Finish fixing unit test for vak.prep.frame_classification.make_splits * Add imports in src/vak/prep/frame_classification/__init__.py * Revise docstring of src/vak/prep/audio_dataset.py to refer to 'source_files_df' * Revise docstring of src/vak/prep/spectrogram_dataset/spect_helper.py to refer to 'source_files_df' * Revise docstring of src/vak/prep/spectrogram_dataset/prep.py to refer to 'source_files_df' * Revise src/vak/prep/frame_classification/get_or_make_source_files.py to refer to 'source_files_df', in docstring and inside function * In 'prep_frame_classification_dataset', differentiate between 'source_files_df' and 'dataset_df' * Delete birdsong-recognition-dataset configs from tests/data_for_tests/configs * Fix a docstring in noxfile.py * Remove tests/scripts/vaktestdata/spect.py * Add model_family field in tests/data_for_tests/configs/configs.json, remove configs for birdsong-recognition-dataset * Add model_family field to ConfigMetadata dataclass in tests/scripts/vaktestdata/config_metadata.py * Remove call to vaktestdata.spect.prep_spects() since we are going to call other functions that will make spectrograms * Change parameters order of frame_classification.get_or_make_source_files, add pre-conditions/validators * Fix order of args to get_or_make_source_files in src/vak/prep/frame_classification/frame_classification.py * Add more to docstring of 
src/vak/prep/frame_classification/get_or_make_source_files.py * Add 'spect_output_dir' and 'data_dir' fields to tests/data_for_tests/configs/configs.json * Rewrite ConfigMetadata dataclass, add docstring and converters, add spect_output_dir and data_dir attributes * Add functions to make more directories in tests/data_for_tests/generated in tests/scripts/vaktestdata/dirs.py * Import get_or_make_source_files in tests/scripts/vaktestdata/__init__.py * Add more constants with names of directories to make in tests/data_for_tests/generated in tests/scripts/vaktestdata/constants.py * Add tests/scripts/vaktestdata/get_or_make_source_files.py * Add 'spect-output-dir/' to data_dir paths in tests/data_for_tests/configs/configs.json * Rename tests/scripts/vaktestdata/get_or_make_source_files.py -> tests/scripts/vaktestdata/source_files.py, rewrite function that makes source files + csv files we use with tests * Fix tests/scripts/vaktestdata/__init__.py to import source_files module, remove import of get_or_make_source_files module that was renamed to source_files * Import missing module constants and fix order of arguments to prep_spectrogram_dataset in src/vak/prep/frame_classification/get_or_make_source_files.py * Change 3 configs to have spect_format option set to npz * Remove import of module spect in tests/scripts/vaktestdata/__init__.py * Flesh out function in tests/scripts/vaktestdata/source_files.py * Add log statements in tests/scripts/generate_data_for_tests.py * Fix typo in src/vak/prep/frame_classification/get_or_make_source_files.py * Add SPECT_FORMAT_EXT_MAP to src/vak/common/constants.py * Use vak.common.constants.SPECT_FORMAT_EXT_MAP in src/vak/prep/spectrogram_dataset/prep.py so that we correctly remove source file extension to pair with annotation file * Fix attributes of ConfigMetadata so we don't convert None to 'None' * Copy annotation files to spect_output_dir so we can prep from that dir, in tests/scripts/vaktestdata/source_files.py * Change name of 
logger in tests/scripts/generate_data_for_tests.py * Fix attributes in ConfigMetadata so we don't convert strings to bool * Remove fixtures from tests/fixtures/annot.py after removing corresponding source data * Fix import in src/vak/prep/frame_classification/__init__.py * Fix import in src/vak/prep/frame_classification/frame_classification.py * Add tests/fixtures/source_files with fixtures to get csv files * Add fixtures that return dataframes directly in tests/fixtures/source_files.py * Add tests/test_prep/test_frame_classification/test_get_or_make_source_files.py * Add tests/test_prep/test_frame_classification/test_assign_samples_to_splits.py * Fix factory functions in tests/fixtures/source_files.py * Fix assembled path in tests/fixtures/source_files.py * Fix unit test in tests/test_prep/test_frame_classification/test_make_splits.py to use fixture so it's faster and less verbose * Remove fixtures that no longer exist from specific_annot_list fixture in tests/fixtures/annot.py * Remove fixtures for data that doesn't exist in tests/fixtures/audio.py * Remove birdsong-rec from parametrize in tests/test_cli/test_predict.py * Remove birdsongrec from parametrize in tests/test_cli/test_prep.py * Remove birdsongrec from parametrize in tests/test_cli/test_train.py * Remove birdsongrec and other data no longer in source from parametrizes in tests/test_common/test_annotation.py * Remove birdsongrec from parametrize in tests/test_predict/test_frame_classification.py * Remove birdsongrec from parametrize in tests/test_prep/test_frame_classification/test_frame_classification.py * Remove birdsongrec from parametrize in tests/test_prep/test_prep.py * Remove birdsongrec from parametrize in tests/test_prep/test_sequence_dataset.py * Remove birdsongrec from parametrize in tests/test_train/test_frame_classification.py * Remove birdsongrec from parametrize in tests/test_train/test_train.py * Remove unit tests from tests/test_common/test_files/test_files.py that test on data removed 
from source data * Remove parametrize that uses wav/textgrid data removed from source data * Fix fixture in tests/fixtures/spect.py * Actually write unit tests in tests/test_datasets/test_frame_classification/test_helper.py * Fix prep.frame_classification.make_splits to not convert frame labels npy paths to 'None' when they are None * Fix assert helper in tests/test_prep/test_frame_classification/test_frame_classification.py * Remove spect_key and audio_format parameters from functions in src/vak/prep/frame_classification/learncurve.py, no longer used * Change order of params for make_subsets_from_dataset_df * Change order of args in call to make_subsets_from_dataset_df inside prep_frame_classification_dataset * Rename some variables to 'subset_df' in src/vak/prep/frame_classification/learncurve.py and revise docstrings * Finish adding/fixing unit tests in tests/test_prep/test_frame_classification/test_learncurve.py * Fix bug in unit test in tests/test_prep/test_frame_classification/test_make_splits.py * Fix unit tests in tests/test_prep/test_spectrogram_dataset/test_prep.py * Fix unit test in tests/test_prep/test_spectrogram_dataset/test_spect_helper.py * Fix unit test in tests/test_transforms/test_transforms.py * Use torch.testing.assert_close instead of assert_allclose in tests/test_nn/test_loss/test_dice.py --- noxfile.py | 2 +- src/vak/common/constants.py | 14 + src/vak/common/learncurve.py | 6 +- .../datasets/frame_classification/__init__.py | 4 +- .../frame_classification/constants.py | 4 +- .../frame_classification/frames_dataset.py | 181 +++++++++-- .../datasets/frame_classification/helper.py | 37 +++ .../frame_classification/window_dataset.py | 235 ++++++++++++-- .../parametric_umap/parametric_umap.py | 122 +++++-- src/vak/learncurve/frame_classification.py | 6 +- src/vak/models/frame_classification_model.py | 8 +- src/vak/predict/frame_classification.py | 20 +- src/vak/prep/audio_dataset.py | 16 +- src/vak/prep/frame_classification/__init__.py | 8 +- 
.../assign_samples_to_splits.py | 138 ++++++++ .../frame_classification.py | 151 +++------ .../prep/frame_classification/learncurve.py | 297 +++++++++++++----- .../{dataset_arrays.py => make_splits.py} | 173 +++++++--- .../prep/frame_classification/source_files.py | 179 +++++++++++ .../prep/spectrogram_dataset/audio_helper.py | 2 +- src/vak/prep/spectrogram_dataset/prep.py | 35 ++- .../prep/spectrogram_dataset/spect_helper.py | 95 +++--- src/vak/train/frame_classification.py | 18 +- src/vak/train/parametric_umap.py | 5 +- src/vak/train/train_.py | 6 +- .../defaults/frame_classification.py | 14 +- src/vak/transforms/transforms.py | 19 +- ...et_learncurve_audio_cbin_annot_notmat.toml | 2 +- ...tyNet_predict_audio_cbin_annot_notmat.toml | 2 +- ...t_predict_audio_wav_annot_birdsongrec.toml | 41 --- ...Net_train_audio_wav_annot_birdsongrec.toml | 51 --- ...rain_continue_audio_cbin_annot_notmat.toml | 2 +- ..._continue_audio_wav_annot_birdsongrec.toml | 53 ---- tests/data_for_tests/configs/configs.json | 57 ++-- tests/fixtures/__init__.py | 1 + tests/fixtures/annot.py | 81 +---- tests/fixtures/audio.py | 39 +-- tests/fixtures/config.py | 10 +- tests/fixtures/source_files.py | 108 +++++++ tests/fixtures/spect.py | 7 +- tests/scripts/generate_data_for_tests.py | 16 +- tests/scripts/vaktestdata/__init__.py | 1 + tests/scripts/vaktestdata/config_metadata.py | 66 +++- tests/scripts/vaktestdata/constants.py | 9 +- tests/scripts/vaktestdata/dirs.py | 16 +- tests/scripts/vaktestdata/source_files.py | 194 ++++++++++++ tests/test_cli/test_eval.py | 8 +- tests/test_cli/test_learncurve.py | 8 +- tests/test_cli/test_predict.py | 9 +- tests/test_cli/test_prep.py | 18 +- tests/test_cli/test_train.py | 9 +- tests/test_common/test_annotation.py | 7 - tests/test_common/test_files/test_files.py | 26 -- .../test_frames_dataset.py | 12 +- .../test_frame_classification/test_helper.py | 46 +++ .../test_window_dataset.py | 12 +- .../test_parametric_umap.py | 12 +- tests/test_eval/test_eval.py | 
4 +- tests/test_eval/test_frame_classification.py | 12 +- tests/test_eval/test_parametric_umap.py | 12 +- .../test_frame_classification.py | 8 +- tests/test_models/test_base.py | 6 +- .../test_frame_classification_model.py | 4 +- .../test_models/test_parametric_umap_model.py | 4 +- tests/test_nn/test_loss/test_dice.py | 8 +- .../test_predict/test_frame_classification.py | 14 +- tests/test_predict/test_predict.py | 4 +- .../test_assign_samples_to_splits.py | 70 +++++ .../test_dataset_arrays.py | 170 ---------- .../test_frame_classification.py | 67 ++-- .../test_get_or_make_source_files.py | 88 ++++++ .../test_learncurve.py | 191 +++++++++-- .../test_make_splits.py | 236 ++++++++++++++ tests/test_prep/test_prep.py | 6 +- tests/test_prep/test_sequence_dataset.py | 1 - .../test_spectrogram_dataset/test_prep.py | 37 +-- .../test_spect_helper.py | 44 +-- tests/test_train/test_frame_classification.py | 18 +- tests/test_train/test_parametric_umap.py | 12 +- tests/test_train/test_train.py | 6 +- tests/test_transforms/test_transforms.py | 13 +- 81 files changed, 2544 insertions(+), 1209 deletions(-) create mode 100644 src/vak/datasets/frame_classification/helper.py create mode 100644 src/vak/prep/frame_classification/assign_samples_to_splits.py rename src/vak/prep/frame_classification/{dataset_arrays.py => make_splits.py} (64%) create mode 100644 src/vak/prep/frame_classification/source_files.py delete mode 100644 tests/data_for_tests/configs/TweetyNet_predict_audio_wav_annot_birdsongrec.toml delete mode 100644 tests/data_for_tests/configs/TweetyNet_train_audio_wav_annot_birdsongrec.toml delete mode 100644 tests/data_for_tests/configs/TweetyNet_train_continue_audio_wav_annot_birdsongrec.toml create mode 100644 tests/fixtures/source_files.py create mode 100644 tests/scripts/vaktestdata/source_files.py create mode 100644 tests/test_datasets/test_frame_classification/test_helper.py create mode 100644 tests/test_prep/test_frame_classification/test_assign_samples_to_splits.py 
delete mode 100644 tests/test_prep/test_frame_classification/test_dataset_arrays.py create mode 100644 tests/test_prep/test_frame_classification/test_get_or_make_source_files.py create mode 100644 tests/test_prep/test_frame_classification/test_make_splits.py diff --git a/noxfile.py b/noxfile.py index 6adb4f33d..188fe1281 100644 --- a/noxfile.py +++ b/noxfile.py @@ -157,7 +157,7 @@ def copy_url(url: str, path: str) -> None: @nox.session(name='test-data-tar-source') def test_data_tar_source(session) -> None: - """Make a .tar.gz file of just the 'generated' test data used to run tests on CI.""" + """Make a .tar.gz file of just the 'source' test data used to run tests.""" session.log(f"Making tarfile with source data: {SOURCE_TEST_DATA_TAR}") make_tarfile(SOURCE_TEST_DATA_TAR, SOURCE_TEST_DATA_DIRS) diff --git a/src/vak/common/constants.py b/src/vak/common/constants.py index e8aaad94e..fcf2ab7d6 100644 --- a/src/vak/common/constants.py +++ b/src/vak/common/constants.py @@ -42,3 +42,17 @@ # ---- output (default) file extensions. 
Using the `pathlib` name "suffix" ---- ANNOT_CSV_SUFFIX = ".annot.csv" NET_OUTPUT_SUFFIX = ".output.npz" + +# ---- the key for loading the spectrogram matrix from an npz file +# TODO: replace this with vocalpy constants when we move to VocalPy +SPECT_KEY = "s" +TIMEBINS_KEY = "t" + +# TODO: replace this with vocalpy extension when we move to VocalPy +# ---- the extension used to save spectrograms in npz array files +# used by :func:`vak.prep.spectrogram_dataset.audio_helper.make +SPECT_NPZ_EXTENSION = ".spect.npz" +SPECT_FORMAT_EXT_MAP = { + "npz": SPECT_NPZ_EXTENSION, + "mat": ".mat", +} diff --git a/src/vak/common/learncurve.py b/src/vak/common/learncurve.py index ee291c4f6..68ffe7b59 100644 --- a/src/vak/common/learncurve.py +++ b/src/vak/common/learncurve.py @@ -1,10 +1,10 @@ -def get_train_dur_replicate_split_name( +def get_train_dur_replicate_subset_name( train_dur: int, replicate_num: int ) -> str: - """Get name of a training set split for a learning curve, + """Get name of a training set subset for a learning curve, for a specified training duration and replicate number. - Used when preparing the training set splits for a learning curve, + Used when preparing the training set subsets for a learning curve, and when training models to generate the results for the curve. """ return f"train-dur-{float(train_dur)}-replicate-{int(replicate_num)}" diff --git a/src/vak/datasets/frame_classification/__init__.py b/src/vak/datasets/frame_classification/__init__.py index 98eda614b..29bc8b68c 100644 --- a/src/vak/datasets/frame_classification/__init__.py +++ b/src/vak/datasets/frame_classification/__init__.py @@ -1,6 +1,6 @@ -from . import constants +from . 
import constants, helper from .frames_dataset import FramesDataset from .metadata import Metadata from .window_dataset import WindowDataset -__all__ = ["constants", "Metadata", "FramesDataset", "WindowDataset"] +__all__ = ["constants", "helper", "Metadata", "FramesDataset", "WindowDataset"] diff --git a/src/vak/datasets/frame_classification/constants.py b/src/vak/datasets/frame_classification/constants.py index 0ec942562..6867b9b67 100644 --- a/src/vak/datasets/frame_classification/constants.py +++ b/src/vak/datasets/frame_classification/constants.py @@ -1,8 +1,8 @@ -FRAMES_ARRAY_EXT = ".frames.npy" -FRAMES_NPY_PATH_COL_NAME = "frames_npy_path" +FRAMES_PATH_COL_NAME = "frames_path" FRAME_LABELS_EXT = ".frame_labels.npy" FRAME_LABELS_NPY_PATH_COL_NAME = "frame_labels_npy_path" ANNOTATION_CSV_FILENAME = "y.csv" SAMPLE_IDS_ARRAY_FILENAME = "sample_ids.npy" INDS_IN_SAMPLE_ARRAY_FILENAME = "inds_in_sample.npy" WINDOW_INDS_ARRAY_FILENAME = "window_inds.npy" +FRAME_CLASSIFICATION_DATASET_AUDIO_FORMAT = "wav" diff --git a/src/vak/datasets/frame_classification/frames_dataset.py b/src/vak/datasets/frame_classification/frames_dataset.py index 6713ba4ca..ffe748b3d 100644 --- a/src/vak/datasets/frame_classification/frames_dataset.py +++ b/src/vak/datasets/frame_classification/frames_dataset.py @@ -1,3 +1,6 @@ +"""A dataset class used for neural network models with the +frame classification task, where the source data consists of audio signals +or spectrograms of varying lengths.""" from __future__ import annotations import pathlib @@ -7,8 +10,9 @@ import numpy.typing as npt import pandas as pd -from . import constants +from . import constants, helper from .metadata import Metadata +from ... import common class FramesDataset: @@ -20,32 +24,120 @@ class FramesDataset: Attributes ---------- - dataset_path - dataset_df - frame_dur : float - Duration of a single frame, in seconds. - duration : float - Total duration of the dataset. 
+ dataset_path : pathlib.Path + Path to directory that represents a + frame classification dataset, + as created by + :func:`vak.prep.prep_frame_classification_dataset`. + split : str + The name of a split from the dataset, + one of {'train', 'val', 'test'}. + subset : str, optional + Name of subset to use. + If specified, this takes precedence over split. + Subsets are typically taken from the training data + for use when generating a learning curve. + dataset_df : pandas.DataFrame + A frame classification dataset, + represented as a :class:`pandas.DataFrame`. + This will be only the rows that correspond + to either ``subset`` or ``split`` from the + ``dataset_df`` that was passed in when + instantiating the class. + frames_paths : numpy.ndarray + Paths to npy files containing frames, + either spectrograms or audio signals + that are input to the model. + frame_labels_paths : numpy.ndarray + Paths to npy files containing vectors + with a label for each frame. + The targets for the outputs of the model. + input_type : str + The type of input to the neural network model. + One of {'audio', 'spect'}. + sample_ids : numpy.ndarray + Indexing vector representing which sample + from the dataset every frame belongs to. + inds_in_sample : numpy.ndarray + Indexing vector representing which index + within each sample from the dataset + that every frame belongs to. + frame_dur: float + Duration of a frame, i.e., a single sample in audio + or a single timebin in a spectrogram. + item_transform : callable, optional + Transform applied to each item :math:`(x, y)` + returned by :meth:`FramesDataset.__getitem__`. 
""" def __init__( self, dataset_path: str | pathlib.Path, dataset_df: pd.DataFrame, + input_type: str, split: str, sample_ids: npt.NDArray, inds_in_sample: npt.NDArray, frame_dur: float, - input_type: str, + subset: str | None = None, item_transform: Callable | None = None, ): - self.dataset_path = pathlib.Path(dataset_path) + """Initialize a new instance of a FramesDataset. + Parameters + ---------- + dataset_path : pathlib.Path + Path to directory that represents a + frame classification dataset, + as created by + :func:`vak.prep.prep_frame_classification_dataset`. + dataset_df : pandas.DataFrame + A frame classification dataset, + represented as a :class:`pandas.DataFrame`. + input_type : str + The type of input to the neural network model. + One of {'audio', 'spect'}. + split : str + The name of a split from the dataset, + one of {'train', 'val', 'test'}. + sample_ids : numpy.ndarray + Indexing vector representing which sample + from the dataset every frame belongs to. + inds_in_sample : numpy.ndarray + Indexing vector representing which index + within each sample from the dataset + that every frame belongs to. + frame_dur: float + Duration of a frame, i.e., a single sample in audio + or a single timebin in a spectrogram. + subset : str, optional + Name of subset to use. + If specified, this takes precedence over split. + Subsets are typically taken from the training data + for use when generating a learning curve. + item_transform : callable, optional + Transform applied to each item :math:`(x, y)` + returned by :meth:`FramesDataset.__getitem__`. + """ + from ... 
import prep # avoid circular import, use for constants.INPUT_TYPES + if input_type not in prep.constants.INPUT_TYPES: + raise ValueError( + f"``input_type`` must be one of: {prep.constants.INPUT_TYPES}\n" + f"Value for ``input_type`` was: {input_type}" + ) + + self.dataset_path = pathlib.Path(dataset_path) self.split = split - dataset_df = dataset_df[dataset_df.split == split].copy() + self.subset = subset + # subset takes precedence over split, if specified + if subset: + dataset_df = dataset_df[dataset_df.subset == subset].copy() + else: + dataset_df = dataset_df[dataset_df.split == split].copy() self.dataset_df = dataset_df + self.input_type = input_type self.frames_paths = self.dataset_df[ - constants.FRAMES_NPY_PATH_COL_NAME + constants.FRAMES_PATH_COL_NAME ].values if split != "predict": self.frame_labels_paths = self.dataset_df[ @@ -53,16 +145,6 @@ def __init__( ].values else: self.frame_labels_paths = None - - if input_type == "audio": - self.source_paths = self.dataset_df["audio_path"].values - elif input_type == "spect": - self.source_paths = self.dataset_df["spect_path"].values - else: - raise ValueError( - f"Invalid `input_type`: {input_type}. Must be one of {{'audio', 'spect'}}." - ) - self.sample_ids = sample_ids self.inds_in_sample = inds_in_sample self.frame_dur = float(frame_dur) @@ -78,10 +160,20 @@ def shape(self): tmp_item = self.__getitem__(tmp_x_ind) return tmp_item["frames"].shape + def _load_frames(self, frames_path): + """Helper function that loads "frames", + the input to the frame classification model. + Loads audio or spectrogram, depending on + :attr:`self.input_type`. + This function assumes that audio is in wav format + and spectrograms are in npz files. 
+ """ + return helper.load_frames(frames_path, self.input_type) + def __getitem__(self, idx): - source_path = self.source_paths[idx] - frames = np.load(self.dataset_path / self.frames_paths[idx]) - item = {"frames": frames, "source_path": source_path} + frames_path = self.dataset_path / self.frames_paths[idx] + frames = self._load_frames(frames_path) + item = {"frames": frames, "frames_path": frames_path} if self.frame_labels_paths is not None: frame_labels = np.load( self.dataset_path / self.frame_labels_paths[idx] @@ -102,19 +194,34 @@ def from_dataset_path( cls, dataset_path: str | pathlib.Path, split: str = "val", + subset: str | None = None, item_transform: Callable | None = None, ): - """ + """Make a :class:`FramesDataset` instance, + given the path to a frame classification dataset. Parameters ---------- - dataset_path - split - item_transform + dataset_path : pathlib.Path + Path to directory that represents a + frame classification dataset, + as created by + :func:`vak.prep.prep_frame_classification_dataset`. + split : str + The name of a split from the dataset, + one of {'train', 'val', 'test'}. + subset : str, optional + Name of subset to use. + If specified, this takes precedence over split. + Subsets are typically taken from the training data + for use when generating a learning curve. + item_transform : callable, optional + Transform applied to each item :math:`(x, y)` + returned by :meth:`FramesDataset.__getitem__`. 
Returns ------- - + frames_dataset : FramesDataset """ dataset_path = pathlib.Path(dataset_path) metadata = Metadata.from_dataset_path(dataset_path) @@ -125,20 +232,26 @@ def from_dataset_path( dataset_df = pd.read_csv(dataset_csv_path) split_path = dataset_path / split - sample_ids_path = split_path / constants.SAMPLE_IDS_ARRAY_FILENAME + if subset: + sample_ids_path = split_path / helper.sample_ids_array_filename_for_subset(subset) + else: + sample_ids_path = split_path / constants.SAMPLE_IDS_ARRAY_FILENAME sample_ids = np.load(sample_ids_path) - inds_in_sample_path = ( - split_path / constants.INDS_IN_SAMPLE_ARRAY_FILENAME - ) + + if subset: + inds_in_sample_path = split_path / helper.inds_in_sample_array_filename_for_subset(subset) + else: + inds_in_sample_path = split_path / constants.INDS_IN_SAMPLE_ARRAY_FILENAME inds_in_sample = np.load(inds_in_sample_path) return cls( dataset_path, dataset_df, + input_type, split, sample_ids, inds_in_sample, frame_dur, - input_type, + subset, item_transform, ) diff --git a/src/vak/datasets/frame_classification/helper.py b/src/vak/datasets/frame_classification/helper.py new file mode 100644 index 000000000..2e9b1b4d2 --- /dev/null +++ b/src/vak/datasets/frame_classification/helper.py @@ -0,0 +1,37 @@ +"""Helper functions used with frame classification datasets.""" +from __future__ import annotations + +from . import constants +from ... 
import common + + +def sample_ids_array_filename_for_subset(subset: str) -> str: + """Returns name of sample IDs array file for a subset of the training data.""" + return constants.SAMPLE_IDS_ARRAY_FILENAME.replace( + '.npy', f'-{subset}.npy' + ) + + +def inds_in_sample_array_filename_for_subset(subset: str) -> str: + """Returns name of inds in sample array file for a subset of the training data.""" + return constants.INDS_IN_SAMPLE_ARRAY_FILENAME.replace( + '.npy', f'-{subset}.npy' + ) + + +def load_frames(frames_path, input_type): + """Helper function that loads "frames", + the input to the frame classification model. + Loads audio or spectrogram, depending on + :attr:`self.input_type`. + This function assumes that audio is in wav format + and spectrograms are in npz files. + """ + if input_type == "audio": + frames, _ = common.constants.AUDIO_FORMAT_FUNC_MAP[ + constants.FRAME_CLASSIFICATION_DATASET_AUDIO_FORMAT + ](frames_path) + elif input_type == "spect": + spect_dict = common.files.spect.load(frames_path) + frames = spect_dict[common.constants.SPECT_KEY] + return frames diff --git a/src/vak/datasets/frame_classification/window_dataset.py b/src/vak/datasets/frame_classification/window_dataset.py index aa61d37ff..c4f309f94 100644 --- a/src/vak/datasets/frame_classification/window_dataset.py +++ b/src/vak/datasets/frame_classification/window_dataset.py @@ -1,3 +1,20 @@ +"""A dataset class used for neural network models with the +frame classification task, where the source data consists of audio signals +or spectrograms of varying lengths. + +Unlike :class:`vak.datasets.frame_classification.FramesDataset`, +this class does not return entire samples +from the source dataset. +Instead each paired samples :math:`(x_i, y_i)` +returned by this dataset class consists of +a window :math:`x_i` of fixed length +:math:`w` from the underlying data ``X`` of total length :math:`T`. 
+Each :math:`y_i` is a vector of the same size :math:`w`, containing +an integer class label for each *frame* in the window :math:`x_i`. +The entire dataset consists of some number of windows +:math:`I` determined by a ``stride`` parameter :math:`s`, +:math:`I = (T - w) / s`. +""" from __future__ import annotations import pathlib @@ -7,8 +24,9 @@ import numpy.typing as npt import pandas as pd -from . import constants +from . import constants, helper from .metadata import Metadata +from ... import common def get_window_inds(n_frames: int, window_size: int, stride: int = 1): @@ -43,7 +61,7 @@ def get_window_inds(n_frames: int, window_size: int, stride: int = 1): class WindowDataset: """Dataset used for training neural network models - on the frame classification task. + on the frame classification task, where the source data consists of audio signals or spectrograms of varying lengths. @@ -85,55 +103,167 @@ class WindowDataset: Attributes ---------- - X : numpy.ndarray - Y : numpy.ndarray + dataset_path : pathlib.Path + Path to directory that represents a + frame classification dataset, + as created by + :func:`vak.prep.prep_frame_classification_dataset`. + split : str + The name of a split from the dataset, + one of {'train', 'val', 'test'}. + subset : str, optional + Name of subset to use. + If specified, this takes precedence over split. + Subsets are typically taken from the training data + for use when generating a learning curve. + dataset_df : pandas.DataFrame + A frame classification dataset, + represented as a :class:`pandas.DataFrame`. + This will be only the rows that correspond + to either ``subset`` or ``split`` from the + ``dataset_df`` that was passed in when + instantiating the class. + input_type : str + The type of input to the neural network model. + One of {'audio', 'spect'}. + frame_paths : numpy.ndarray + Paths to npy files containing frames, + either spectrograms or audio signals + that are input to the model. 
+ frame_labels_paths : numpy.ndarray + Paths to npy files containing vectors + with a label for each frame. + The targets for the outputs of the model. + sample_ids : numpy.ndarray + Indexing vector representing which sample + from the dataset every frame belongs to. + inds_in_sample : numpy.ndarray + Indexing vector representing which index + within each sample from the dataset + that every frame belongs to. window_size : int - frame_dur : float - Duration of a single frame, in seconds. - duration : float - Total duration of the dataset. + Size of windows to return; + number of frames. + frame_dur: float + Duration of a frame, i.e., a single sample in audio + or a single timebin in a spectrogram. + stride : int + The size of the stride used to determine which windows + are included in the dataset. The default is 1. + Used to compute ``window_inds``, + with the function + :func:`vak.datasets.frame_classification.window_dataset.get_window_inds`. + window_inds : numpy.ndarray, optional + A vector of valid window indices for the dataset. + If specified, this takes precedence over ``stride``. + transform : callable + The transform applied to the frames, + the input to the neural network :math:`x`. + target_transform : callable + The transform applied to the target for the output + of the neural network :math:`y`. """ def __init__( self, dataset_path: str | pathlib.Path, dataset_df: pd.DataFrame, + input_type: str, split: str, sample_ids: npt.NDArray, inds_in_sample: npt.NDArray, window_size: int, frame_dur: float, stride: int = 1, + subset: str | None = None, window_inds: npt.NDArray | None = None, transform: Callable | None = None, target_transform: Callable | None = None, ): - self.dataset_path = pathlib.Path(dataset_path) + """Initialize a new instance of a WindowDataset. + + Parameters + ---------- + dataset_path : pathlib.Path + Path to directory that represents a + frame classification dataset, + as created by + :func:`vak.prep.prep_frame_classification_dataset`. 
+ dataset_df : pandas.DataFrame + A frame classification dataset, + represented as a :class:`pandas.DataFrame`. + input_type : str + The type of input to the neural network model. + One of {'audio', 'spect'}. + split : str + The name of a split from the dataset, + one of {'train', 'val', 'test'}. + sample_ids : numpy.ndarray + Indexing vector representing which sample + from the dataset every frame belongs to. + inds_in_sample : numpy.ndarray + Indexing vector representing which index + within each sample from the dataset + that every frame belongs to. + window_size : int + Size of windows to return; + number of frames. + frame_dur: float + Duration of a frame, i.e., a single sample in audio + or a single timebin in a spectrogram. + stride : int + The size of the stride used to determine which windows + are included in the dataset. The default is 1. + Used to compute ``window_inds``, + with the function + :func:`vak.datasets.frame_classification.window_dataset.get_window_inds`. + subset : str, optional + Name of subset to use. + If specified, this takes precedence over split. + Subsets are typically taken from the training data + for use when generating a learning curve. + window_inds : numpy.ndarray, optional + A vector of valid window indices for the dataset. + If specified, this takes precedence over ``stride``. + transform : callable + The transform applied to the input to the neural network :math:`x`. + target_transform : callable + The transform applied to the target for the output + of the neural network :math:`y`. + """ + from ... 
import prep # avoid circular import, use for constants.INPUT_TYPES + if input_type not in prep.constants.INPUT_TYPES: + raise ValueError( + f"``input_type`` must be one of: {prep.constants.INPUT_TYPES}\n" + f"Value for ``input_type`` was: {input_type}" + ) + self.dataset_path = pathlib.Path(dataset_path) self.split = split - dataset_df = dataset_df[dataset_df.split == split].copy() + self.subset = subset + # subset takes precedence over split, if specified + if subset: + dataset_df = dataset_df[dataset_df.subset == subset].copy() + else: + dataset_df = dataset_df[dataset_df.split == split].copy() self.dataset_df = dataset_df - + self.input_type = input_type self.frames_paths = self.dataset_df[ - constants.FRAMES_NPY_PATH_COL_NAME + constants.FRAMES_PATH_COL_NAME ].values self.frame_labels_paths = self.dataset_df[ constants.FRAME_LABELS_NPY_PATH_COL_NAME ].values - self.sample_ids = sample_ids self.inds_in_sample = inds_in_sample - self.window_size = window_size self.frame_dur = float(frame_dur) self.stride = stride - if window_inds is None: window_inds = get_window_inds( sample_ids.shape[-1], window_size, stride ) self.window_inds = window_inds - self.transform = transform self.target_transform = target_transform @@ -149,6 +279,16 @@ def shape(self): # e.g. when initializing a neural network model return one_x.shape + def _load_frames(self, frames_path): + """Helper function that loads "frames", + the input to the frame classification model. + Loads audio or spectrogram, depending on + :attr:`self.input_type`. + This function assumes that audio is in wav format + and spectrograms are in npz files. 
+ """ + return helper.load_frames(frames_path, self.input_type) + def __getitem__(self, idx): window_idx = self.window_inds[idx] sample_ids = self.sample_ids[ @@ -156,17 +296,21 @@ def __getitem__(self, idx): ] uniq_sample_ids = np.unique(sample_ids) if len(uniq_sample_ids) == 1: + # we repeat ourselves here to avoid running a loop on one item sample_id = uniq_sample_ids[0] - frames = np.load(self.dataset_path / self.frames_paths[sample_id]) + frames_path = self.dataset_path / self.frames_paths[sample_id] + frames = self._load_frames(frames_path) frame_labels = np.load( self.dataset_path / self.frame_labels_paths[sample_id] ) + elif len(uniq_sample_ids) > 1: frames = [] frame_labels = [] for sample_id in sorted(uniq_sample_ids): + frames_path = self.dataset_path / self.frames_paths[sample_id] frames.append( - np.load(self.dataset_path / self.frames_paths[sample_id]) + self._load_frames(frames_path) ) frame_labels.append( np.load( @@ -210,37 +354,66 @@ def from_dataset_path( window_size: int, stride: int = 1, split: str = "train", + subset: str | None = None, transform: Callable | None = None, target_transform: Callable | None = None, ): - """ + """Make a :class:`WindowDataset` instance, + given the path to a frame classification dataset. Parameters ---------- - dataset_path - window_size - stride - split - transform - target_transform + dataset_path : pathlib.Path + Path to directory that represents a + frame classification dataset, + as created by + :func:`vak.prep.prep_frame_classification_dataset`. + window_size : int + Size of windows to return; + number of frames. + stride : int + The size of the stride used to determine which windows + are included in the dataset. The default is 1. + Used to compute ``window_inds``, + with the function + :func:`vak.datasets.frame_classification.window_dataset.get_window_inds`. + split : str + The name of a split from the dataset, + one of {'train', 'val', 'test'}. + subset : str, optional + Name of subset to use. 
+ If specified, this takes precedence over split. + Subsets are typically taken from the training data + for use when generating a learning curve. + transform : callable + The transform applied to the input to the neural network :math:`x`. + target_transform : callable + The transform applied to the target for the output + of the neural network :math:`y`. Returns ------- - + dataset : vak.datasets.frame_classification.WindowDataset """ dataset_path = pathlib.Path(dataset_path) metadata = Metadata.from_dataset_path(dataset_path) frame_dur = metadata.frame_dur + input_type = metadata.input_type dataset_csv_path = dataset_path / metadata.dataset_csv_filename dataset_df = pd.read_csv(dataset_csv_path) split_path = dataset_path / split - sample_ids_path = split_path / constants.SAMPLE_IDS_ARRAY_FILENAME + if subset: + sample_ids_path = split_path / helper.sample_ids_array_filename_for_subset(subset) + else: + sample_ids_path = split_path / constants.SAMPLE_IDS_ARRAY_FILENAME sample_ids = np.load(sample_ids_path) - inds_in_sample_path = ( - split_path / constants.INDS_IN_SAMPLE_ARRAY_FILENAME - ) + + if subset: + inds_in_sample_path = split_path / helper.inds_in_sample_array_filename_for_subset(subset) + else: + inds_in_sample_path = split_path / constants.INDS_IN_SAMPLE_ARRAY_FILENAME inds_in_sample = np.load(inds_in_sample_path) window_inds_path = split_path / constants.WINDOW_INDS_ARRAY_FILENAME @@ -252,12 +425,14 @@ def from_dataset_path( return cls( dataset_path, dataset_df, + input_type, split, sample_ids, inds_in_sample, window_size, frame_dur, stride, + subset, window_inds, transform, target_transform, diff --git a/src/vak/datasets/parametric_umap/parametric_umap.py b/src/vak/datasets/parametric_umap/parametric_umap.py index 2281b4d21..8755ddacf 100644 --- a/src/vak/datasets/parametric_umap/parametric_umap.py +++ b/src/vak/datasets/parametric_umap/parametric_umap.py @@ -1,3 +1,4 @@ +"""A dataset class used to train Parametric UMAP models.""" from __future__ 
import annotations import pathlib @@ -185,16 +186,64 @@ def get_graph_elements( class ParametricUMAPDataset(Dataset): - """Dataset used for training Parametric UMAP models""" + """A dataset class used to train Parametric UMAP models.""" def __init__( self, - data: npt.NDArray, - graph, + dataset_path: str | pathlib.Path, dataset_df: pd.DataFrame, + split: str, + subset: str | None = None, n_epochs: int = 200, + n_neighbors: int = 10, + metric: str = "euclidean", + random_state: int | None = None, transform: Callable | None = None, ): + """Initialize a :class:`ParametricUMAPDataset` instance. + + Parameters + ---------- + dataset_path : pathlib.Path + Path to directory that represents a + parametric UMAP dataset, + as created by + :func:`vak.prep.prep_parametric_umap_dataset`. + dataset_df : pandas.DataFrame + A parametric UMAP dataset, + represented as a :class:`pandas.DataFrame`. + split : str + The name of a split from the dataset, + one of {'train', 'val', 'test'}. + subset : str, optional + Name of subset to use. + If specified, this takes precedence over split. + Subsets are typically taken from the training data + for use when generating a learning curve. + n_epochs : int + Number of epochs model will be trained. Default is 200. 
+ transform : callable, optional + """ + # subset takes precedence over split, if specified + if subset: + dataset_df = dataset_df[dataset_df.subset == subset].copy() + else: + dataset_df = dataset_df[dataset_df.split == split].copy() + + data = np.stack( + [ + np.load(dataset_path / spect_path) + for spect_path in dataset_df.spect_path.values + ] + ) + + graph = get_umap_graph( + data, + n_neighbors=n_neighbors, + metric=metric, + random_state=random_state, + ) + ( graph, epochs_per_sample, @@ -246,28 +295,49 @@ def from_dataset_path( cls, dataset_path: str | pathlib.Path, split: str, + subset: str | None = None, n_neighbors: int = 10, metric: str = "euclidean", random_state: int | None = None, n_epochs: int = 200, transform: Callable | None = None, ): - """ + """Make a :class:`ParametricUMAPDataset` instance, + given the path to parametric UMAP dataset. Parameters ---------- - dataset_path : str, pathlib.Path - Path to a directory that represents a dataset. - split - n_neighbors - metric - random_state - n_epochs - transform + dataset_path : pathlib.Path + Path to directory that represents a + parametric UMAP dataset, + as created by + :func:`vak.prep.prep_parametric_umap_dataset`. + split : str + The name of a split from the dataset, + one of {'train', 'val', 'test'}. + subset : str, optional + Name of subset to use. + If specified, this takes precedence over split. + Subsets are typically taken from the training data + for use when generating a learning curve. + n_neighbors : int + Number of nearest neighbors to use + when computing approximate nearest neighbors. + Parameter passed to :class:`pynndescent.NNDescent` + and :func:`umap._umap.fuzzy_simplicial_set`. + metric : str + Distance metric. Default is "cosine". + Parameter passed to :class:`pynndescent.NNDescent` + and :func:`umap._umap.fuzzy_simplicial_set`. + random_state : numpy.random.RandomState + Either a numpy.random.RandomState instance, + or None. 
+ transform : callable + The transform applied to the input to the neural network :math:`x`. Returns ------- - + dataset : vak.datasets.parametric_umap.ParametricUMAPDataset """ import vak.datasets # import here just to make classmethod more explicit @@ -278,27 +348,17 @@ def from_dataset_path( dataset_csv_path = dataset_path / metadata.dataset_csv_filename dataset_df = pd.read_csv(dataset_csv_path) - split_df = dataset_df[dataset_df.split == split] - - data = np.stack( - [ - np.load(dataset_path / spect_path) - for spect_path in split_df.spect_path.values - ] - ) - graph = get_umap_graph( - data, - n_neighbors=n_neighbors, - metric=metric, - random_state=random_state, - ) return cls( - data, - graph, - split_df, + dataset_path, + dataset_df, + split, + subset, n_epochs, - transform=transform, + n_neighbors, + metric, + random_state, + transform, ) diff --git a/src/vak/learncurve/frame_classification.py b/src/vak/learncurve/frame_classification.py index 265a6dc12..0eff85775 100644 --- a/src/vak/learncurve/frame_classification.py +++ b/src/vak/learncurve/frame_classification.py @@ -202,8 +202,8 @@ def learning_curve_for_frame_classification_model( f"Saving results to: {results_path_this_replicate}", ) - # `split` lets us use correct subset ("split") of training set for this duration / replicate - split = common.learncurve.get_train_dur_replicate_split_name( + # `subset` lets us use correct subset of training set for this duration / replicate + subset = common.learncurve.get_train_dur_replicate_subset_name( train_dur, replicate_num ) @@ -225,7 +225,7 @@ def learning_curve_for_frame_classification_model( ckpt_step=ckpt_step, patience=patience, device=device, - split=split, + subset=subset, ) logger.info(f"Evaluating model from replicate {replicate_num} ") diff --git a/src/vak/models/frame_classification_model.py b/src/vak/models/frame_classification_model.py index 84616d4d9..f1d36ec02 100644 --- a/src/vak/models/frame_classification_model.py +++ 
b/src/vak/models/frame_classification_model.py @@ -332,9 +332,9 @@ def predict_step(self, batch: tuple, batch_idx: int): containing the spectrogram for which a prediction was generated. """ - x, source_path = batch["frames"].to(self.device), batch["source_path"] - if isinstance(source_path, list) and len(source_path) == 1: - source_path = source_path[0] + x, frames_path = batch["frames"].to(self.device), batch["frames_path"] + if isinstance(frames_path, list) and len(frames_path) == 1: + frames_path = frames_path[0] # TODO: fix this weirdness. Diff't collate_fn? if x.ndim in (5, 4): if x.shape[0] == 1: @@ -342,7 +342,7 @@ def predict_step(self, batch: tuple, batch_idx: int): else: raise ValueError(f"invalid shape for x: {x.shape}") y_pred = self.network(x) - return {source_path: y_pred} + return {frames_path: y_pred} @classmethod def from_config( diff --git a/src/vak/predict/frame_classification.py b/src/vak/predict/frame_classification.py index c5c7e52da..b029ea809 100644 --- a/src/vak/predict/frame_classification.py +++ b/src/vak/predict/frame_classification.py @@ -239,9 +239,9 @@ def predict_with_frame_classification_model( results = trainer.predict(model, pred_loader) # TODO: figure out how to overload `on_predict_epoch_end` to return dict pred_dict = { - source_path: y_pred + frames_path: y_pred for result in results - for source_path, y_pred in result.items() + for frames_path, y_pred in result.items() } # ---------------- converting to annotations ------------------------------------------------------------------ progress_bar = tqdm(pred_loader) @@ -256,11 +256,11 @@ def predict_with_frame_classification_model( annots = [] logger.info("converting predictions to annotations") for ind, batch in enumerate(progress_bar): - padding_mask, source_path = batch["padding_mask"], batch["source_path"] + padding_mask, frames_path = batch["padding_mask"], batch["frames_path"] padding_mask = np.squeeze(padding_mask) - if isinstance(source_path, list) and len(source_path) 
== 1: - source_path = source_path[0] - y_pred = pred_dict[source_path] + if isinstance(frames_path, list) and len(frames_path) == 1: + frames_path = frames_path[0] + y_pred = pred_dict[frames_path] if save_net_outputs: # not sure if there's a better way to get outputs into right shape; @@ -271,7 +271,7 @@ def predict_with_frame_classification_model( net_output = net_output[:, padding_mask] net_output = net_output.cpu().numpy() net_output_path = output_dir.joinpath( - pathlib.Path(source_path).stem + pathlib.Path(frames_path).stem + f"{model_name}{constants.NET_OUTPUT_SUFFIX}" ) np.savez(net_output_path, net_output) @@ -281,12 +281,12 @@ def predict_with_frame_classification_model( if input_type == "audio": frames, samplefreq = constants.AUDIO_FORMAT_FUNC_MAP[audio_format]( - source_path + frames_path ) frame_times = np.arange(frames.shape[-1]) / samplefreq elif input_type == "spect": spect_dict = files.spect.load( - dataset_path / source_path, spect_format=spect_format + frames_path, spect_format=spect_format ) frame_times = spect_dict[timebins_key] @@ -311,7 +311,7 @@ def predict_with_frame_classification_model( labels=labels, onsets_s=onsets_s, offsets_s=offsets_s ) - audio_fname = files.spect.find_audio_fname(source_path) + audio_fname = files.spect.find_audio_fname(frames_path) annot = crowsetta.Annotation( seq=seq, notated_path=audio_fname, annot_path=annot_csv_path.name ) diff --git a/src/vak/prep/audio_dataset.py b/src/vak/prep/audio_dataset.py index 3f6ff192e..9e8fe2970 100644 --- a/src/vak/prep/audio_dataset.py +++ b/src/vak/prep/audio_dataset.py @@ -36,9 +36,10 @@ def prep_audio_dataset( annot_file: str | pathlib.Path | None = None, labelset: set | None = None, ) -> pd.DataFrame: - """Creates a dataset of audio files from a directory, + """Gets a set of audio files from a directory, optionally paired with an annotation file or files, - and return a Pandas DataFrame that represents the dataset. 
+ and return a Pandas DataFrame that represents the set + of files. Finds all files with ``audio_format`` in ``data_dir``, then finds any annotations with ``annot_format`` if specified, @@ -74,9 +75,14 @@ def prep_audio_dataset( Returns ------- - dataset_df : pandas.Dataframe - Dataframe that represents a dataset of audio files, - optionally with annotations. + source_files_df : pandas.Dataframe + A set of source files that will be used to prepare a + data set for use with neural network models, + represented as a :class:`pandas.DataFrame`. + Will contain paths to audio files, + possibly paired with annotation files. + The columns of the dataframe are specified by + :const:`vak.prep.audio_dataset.DF_COLUMNS`. """ # pre-conditions --------------------------------------------------------------------------------------------------- if audio_format not in constants.VALID_AUDIO_FORMATS: diff --git a/src/vak/prep/frame_classification/__init__.py b/src/vak/prep/frame_classification/__init__.py index f9ee48004..5f217779d 100644 --- a/src/vak/prep/frame_classification/__init__.py +++ b/src/vak/prep/frame_classification/__init__.py @@ -1,10 +1,14 @@ -from . import dataset_arrays, frame_classification, learncurve, validators +from . 
import frame_classification, learncurve, make_splits, validators +from .assign_samples_to_splits import assign_samples_to_splits from .frame_classification import prep_frame_classification_dataset +from .source_files import get_or_make_source_files __all__ = [ - "dataset_arrays", + "assign_samples_to_splits", "frame_classification", + "get_or_make_source_files", "learncurve", + "make_splits", "prep_frame_classification_dataset", "validators", ] diff --git a/src/vak/prep/frame_classification/assign_samples_to_splits.py b/src/vak/prep/frame_classification/assign_samples_to_splits.py new file mode 100644 index 000000000..ced7d89f9 --- /dev/null +++ b/src/vak/prep/frame_classification/assign_samples_to_splits.py @@ -0,0 +1,138 @@ +"""Assign samples in a dataset to splits. + +Given a set of source files represented by a dataframe, +assign each sample (row) to a split. + +Helper function called by :func:`vak.prep.frame_classification.prep_frame_classification_dataset`. +""" +from __future__ import annotations + +import logging +import pathlib + +import pandas as pd + +from .. import dataset_df_helper, split + + +logger = logging.getLogger(__name__) + + +def assign_samples_to_splits( + purpose: str, + dataset_df: pd.DataFrame, + dataset_path: str | pathlib.Path, + train_dur: float | None = None, + val_dur: float | None = None, + test_dur: float | None = None, + labelset: set | None = None, +) -> pd.DataFrame: + """Assign samples in a dataset to splits. + + Given a set of source files represented by a dataframe, + assign each sample (row) to a split. + + Helper function called by :func:`vak.prep.frame_classification.prep_frame_classification_dataset`. + + If no durations are specified for splits, + or the purpose is either `'eval'` or `'predict'`, + then all rows in the dataframe + will be assigned to ``purpose``. + + Parameters + ---------- + purpose : str + Purpose of the dataset. + One of {'train', 'eval', 'predict', 'learncurve'}. 
+ These correspond to commands of the vak command-line interface. + train_dur : float + Total duration of training set, in seconds. + When creating a learning curve, + training subsets of shorter duration + will be drawn from this set. Default is None. + val_dur : float + Total duration of validation set, in seconds. + Default is None. + test_dur : float + Total duration of test set, in seconds. + Default is None. + dataset_df : pandas.DataFrame + That represents a dataset. + dataset_path : pathlib.Path + Path to csv saved from ``dataset_df``. + labelset : str, list, set + Set of unique labels for vocalizations. Strings or integers. + Default is ``None``. If not ``None``, then files will be skipped + where the associated annotation + contains labels not found in ``labelset``. + ``labelset`` is converted to a Python ``set`` using + :func:`vak.converters.labelset_to_set`. + See help for that function for details on how to specify ``labelset``. + + Returns + ------- + dataset_df : pandas.DataFrame + The same ``dataset_df`` with a `'split'` column added, + where each element in that column assigns the corresponding + row to one of the splits in the dataset. + """ + + # ---- (possibly) split into train / val / test sets --------------------------------------------- + # catch case where user specified duration for just training set, raise a helpful error instead of failing silently + if (purpose == "train" or purpose == "learncurve") and ( + (train_dur is not None and train_dur > 0) + and (val_dur is None or val_dur == 0) + and (test_dur is None or val_dur == 0) + ): + raise ValueError( + "A duration specified for just training set, but prep function does not currently support creating a " + "single split of a specified duration. 
Either remove the train_dur option from the prep section and " + "rerun, in which case all data will be included in the training set, or specify values greater than " + "zero for test_dur (and val_dur, if a validation set will be used)" + ) + + if all( + [dur is None for dur in (train_dur, val_dur, test_dur)] + ) or purpose in ( + "eval", + "predict", + ): + # then we're not going to split + logger.info("Will not split dataset.") + do_split = False + else: + if val_dur is not None and train_dur is None and test_dur is None: + raise ValueError( + "cannot specify only val_dur, unclear how to split dataset into training and test sets" + ) + else: + logger.info("Will split dataset.") + do_split = True + + if do_split: + dataset_df = split.frame_classification_dataframe( + dataset_df, + dataset_path, + labelset=labelset, + train_dur=train_dur, + val_dur=val_dur, + test_dur=test_dur, + ) + + elif ( + do_split is False + ): # add a split column, but assign everything to the same 'split' + # ideally we would just say split=purpose in call to add_split_col, but + # we have to special case, because "eval" looks for a 'test' split (not an "eval" split) + if purpose == "eval": + split_name = ( + "test" # 'split_name' to avoid name clash with split package + ) + elif purpose == "predict": + split_name = "predict" + + dataset_df = dataset_df_helper.add_split_col( + dataset_df, split=split_name + ) + + return dataset_df diff --git a/src/vak/prep/frame_classification/frame_classification.py b/src/vak/prep/frame_classification/frame_classification.py index 4d291c8bc..c84c26985 100644 --- a/src/vak/prep/frame_classification/frame_classification.py +++ b/src/vak/prep/frame_classification/frame_classification.py @@ -1,3 +1,5 @@ +"""Function that prepares datasets for neural network models +that perform the frame classification task.""" from __future__ import annotations import json @@ -6,17 +8,19 @@ import warnings import crowsetta.formats.seq +import pandas as pd from ... 
import datasets from ...common import labels from ...common.converters import expanded_user_path, labelset_to_set from ...common.logging import config_logging_for_cli, log_version from ...common.timenow import get_timenow_as_str -from .. import dataset_df_helper, sequence_dataset, split -from ..audio_dataset import prep_audio_dataset -from ..spectrogram_dataset.prep import prep_spectrogram_dataset -from . import dataset_arrays, validators -from .learncurve import make_learncurve_splits_from_dataset_df +from .. import dataset_df_helper, sequence_dataset +from . import validators +from .assign_samples_to_splits import assign_samples_to_splits +from .source_files import get_or_make_source_files +from .learncurve import make_subsets_from_dataset_df +from .make_splits import make_splits logger = logging.getLogger(__name__) @@ -33,13 +37,14 @@ def prep_frame_classification_dataset( annot_file: str | pathlib.Path | None = None, labelset: set | None = None, audio_dask_bag_kwargs: dict | None = None, - train_dur: int | None = None, - val_dur: int | None = None, - test_dur: int | None = None, + train_dur: float | None = None, + val_dur: float | None = None, + test_dur: float | None = None, train_set_durs: list[float] | None = None, num_replicates: int | None = None, spect_key: str = "s", timebins_key: str = "t", + freqbins_key: str = "f", ): """Prepare datasets for neural network models that perform the frame classification task. @@ -116,9 +121,11 @@ def prep_frame_classification_dataset( Each replicate uses a different randomly drawn subset of the training data (but of the same duration). spect_key : str - key for accessing spectrogram in files. Default is 's'. + Key for accessing spectrogram in files. Default is 's'. timebins_key : str - key for accessing vector of time bins in files. Default is 't'. + Key for accessing vector of time bins in files. Default is 't'. + freqbins_key : str + Key for accessing vector of frequency bins in files. Default is 'f'. 
Returns ------- @@ -264,96 +271,35 @@ def prep_frame_classification_dataset( ) logger.info(f"Will prepare dataset as directory: {dataset_path}") - # ---- actually make the dataset ----------------------------------------------------------------------------------- - if input_type == "spect": - dataset_df = prep_spectrogram_dataset( - labelset=labelset, - data_dir=data_dir, - annot_format=annot_format, - annot_file=annot_file, - audio_format=audio_format, - spect_format=spect_format, - spect_params=spect_params, - spect_output_dir=dataset_path, - audio_dask_bag_kwargs=audio_dask_bag_kwargs, - ) - elif input_type == "audio": - dataset_df = prep_audio_dataset( - audio_format=audio_format, - data_dir=data_dir, - annot_format=annot_format, - labelset=labelset, - ) - - if dataset_df.empty: - raise ValueError( - "Calling `vak.prep.spectrogram_dataset.prep_spectrogram_dataset` " - "with arguments passed to `vak.core.prep` " - "returned an empty dataframe.\n" - "Please double-check arguments to `vak.core.prep` function." 
- ) + # ---- get or make source files: either audio or spectrogram, possible paired with annotation files ---------------- + source_files_df: pd.DataFrame = get_or_make_source_files( + data_dir, + input_type, + audio_format, + spect_format, + spect_params, + dataset_path, + annot_format, + annot_file, + labelset, + audio_dask_bag_kwargs, + ) # save before (possibly) splitting, just in case duration args are not valid # (we can't know until we make dataset) - dataset_df.to_csv(dataset_csv_path) - - # ---- (possibly) split into train / val / test sets --------------------------------------------- - # catch case where user specified duration for just training set, raise a helpful error instead of failing silently - if (purpose == "train" or purpose == "learncurve") and ( - (train_dur is not None and train_dur > 0) - and (val_dur is None or val_dur == 0) - and (test_dur is None or val_dur == 0) - ): - raise ValueError( - "A duration specified for just training set, but prep function does not currently support creating a " - "single split of a specified duration. 
Either remove the train_dur option from the prep section and " - "rerun, in which case all data will be included in the training set, or specify values greater than " - "zero for test_dur (and val_dur, if a validation set will be used)" - ) + source_files_df.to_csv(dataset_csv_path) - if all( - [dur is None for dur in (train_dur, val_dur, test_dur)] - ) or purpose in ( - "eval", - "predict", - ): - # then we're not going to split - logger.info("Will not split dataset.") - do_split = False - else: - if val_dur is not None and train_dur is None and test_dur is None: - raise ValueError( - "cannot specify only val_dur, unclear how to split dataset into training and test sets" - ) - else: - logger.info("Will split dataset.") - do_split = True - - if do_split: - dataset_df = split.frame_classification_dataframe( - dataset_df, - dataset_path, - labelset=labelset, - train_dur=train_dur, - val_dur=val_dur, - test_dur=test_dur, - ) - - elif ( - do_split is False - ): # add a split column, but assign everything to the same 'split' - # ideally we would just say split=purpose in call to add_split_col, but - # we have to special case, because "eval" looks for a 'test' split (not an "eval" split) - if purpose == "eval": - split_name = ( - "test" # 'split_name' to avoid name clash with split package - ) - elif purpose == "predict": - split_name = "predict" - - dataset_df = dataset_df_helper.add_split_col( - dataset_df, split=split_name - ) + # ---- assign samples to splits; adds a 'split' column to dataset_df, calling `vak.prep.split` if needed ----------- + # once we assign a split, we consider this the ``dataset_df`` + dataset_df: pd.DataFrame = assign_samples_to_splits( + purpose, + source_files_df, + dataset_path, + train_dur, + val_dur, + test_dur, + labelset, + ) # ---- create and save labelmap ------------------------------------------------------------------------------------ # we do this before creating array files since we need to load the labelmap to make frame label 
vectors @@ -374,8 +320,9 @@ def prep_frame_classification_dataset( else: labelmap = None - # ---- make arrays that represent final dataset -------------------------------------------------------------------- - dataset_df = dataset_arrays.make_npy_files_for_each_split( + # ---- actually move/copy/create files into directories representing splits ---------------------------------------- + # now we're *remaking* the dataset_df (actually adding additional rows with the splits) + dataset_df: pd.DataFrame = make_splits( dataset_df, dataset_path, input_type, @@ -384,20 +331,18 @@ def prep_frame_classification_dataset( audio_format, spect_key, timebins_key, + freqbins_key, ) - # ---- if purpose is learncurve, additionally prep splits for that ------------------------------------------------- + # ---- if purpose is learncurve, additionally prep training data subsets for the learning curve -------------------- if purpose == "learncurve": - dataset_df = make_learncurve_splits_from_dataset_df( + dataset_df: pd.DataFrame = make_subsets_from_dataset_df( dataset_df, input_type, train_set_durs, num_replicates, dataset_path, labelmap, - audio_format, - spect_key, - timebins_key, ) # ---- save csv file that captures provenance of source data ------------------------------------------------------- diff --git a/src/vak/prep/frame_classification/learncurve.py b/src/vak/prep/frame_classification/learncurve.py index 363675d1d..1ebef22f6 100644 --- a/src/vak/prep/frame_classification/learncurve.py +++ b/src/vak/prep/frame_classification/learncurve.py @@ -6,71 +6,233 @@ import pathlib from typing import Sequence +import attrs +import dask.bag as db +import numpy as np import pandas as pd +from dask.diagnostics import ProgressBar -from ... import common +from ... import common, datasets from .. 
import split -from .dataset_arrays import make_npy_files_for_each_split logger = logging.getLogger(__name__) -def make_learncurve_splits_from_dataset_df( +@attrs.define(frozen=True) +class Sample: + """Dataclass representing one sample + in a frame classification dataset. + + Used to add paths for arrays from the sample + to a ``dataset_df``, and to build + the ``sample_ids`` vector and ``inds_in_sample`` vector + for the entire dataset.""" + + source_id: int = attrs.field() + sample_id_vec: np.ndarray + inds_in_sample_vec: np.ndarray + + +def make_index_vectors_for_each_subset( + subsets_df: pd.DataFrame, + dataset_path: str | pathlib.Path, + input_type: str, +) -> pd.DataFrame: + r"""Make npy files containing indexing vectors + for each subset of the training data + used to generate a learning curve + with a frame classification dataset. + + This function is basically the same as + :func:`vak.prep.frame_classification.make_splits.make_splits`, + *except* that it only makes the indexing vectors + for each subset of the training data. + These indexing vectors are needed for each subset + to properly grab windows from the npy files during training. + There is no need to remake the npy files themselves though. + + All the indexing vectors for each split are saved + in the "train" directory split inside ``dataset_path``. + + The indexing vectors are used by + :class:`vak.datasets.frame_classification.WindowDataset` + and :class:`vak.datasets.frame_classification.FramesDataset`. + These vectors make it possible to work with files, + to avoid loading the entire dataset into memory, + and to avoid working with memory-mapped arrays. + The first is the ``sample_ids`` vector, + that represents the "ID" of any sample :math:`(x, y)` in the split. + We use these IDs to load the array files corresponding to the samples. 
+ For a split with :math:`m` samples, this will be an array of length :math:`T`,
+ the total number of frames across all samples,
+ with elements :math:`i \in (0, 1, ..., m - 1)`
+ indicating which frames correspond to which sample :math:`m_i`:
+ :math:`(0, 0, 0, ..., 1, 1, ..., m - 1, m - 1)`.
+ The second vector is the ``inds_in_sample`` vector.
+ This vector is the same length as ``sample_ids``, but its values represent
+ the indices of frames within each sample :math:`x_t`.
+ For a data set with :math:`T` total frames across all samples,
+ where :math:`t_i` indicates the number of frames in each :math:`x_i`,
+ this vector will look like :math:`(0, 1, ..., t_0, 0, 1, ..., t_1, ... t_m)`.
+
+ Parameters
+ ----------
+ subsets_df : pandas.DataFrame
+ A :class:`pandas.DataFrame` representing the training data subsets.
+ This DataFrame is created by
+ :func:`vak.prep.frame_classification.learncurve.make_subsets_from_dataset_df`,
+ and then passed into this function.
+ It is created from a ``pandas.DataFrame``
+ returned by :func:`vak.prep.frame_classification.get_or_make_source_files`
+ with a ``'split'`` column added.
+ dataset_path : pathlib.Path
+ Path to directory that represents dataset.
+ input_type : str
+ The type of input to the neural network model.
+ One of {'audio', 'spect'}.
+
+ Returns
+ -------
+ None
+ """
+ subsets = [
+ subset
+ for subset in sorted(subsets_df.subset.dropna().unique())
+ ]
+ for subset in subsets:
+ logger.info(f"Making indexing vectors for subset: {subset}")
+ subset_df = subsets_df[subsets_df.subset == subset].copy()
+ frames_paths = subset_df[
+ datasets.frame_classification.constants.FRAMES_PATH_COL_NAME
+ ].values
+
+ def _return_index_arrays(
+ source_id_path_tup,
+ ):
+ """Function we use with dask to parallelize.
+ Defined in-line so variables are in scope. 
+ """ + source_id, frames_path = source_id_path_tup + + frames_path = dataset_path / pathlib.Path(frames_path) + + frames = datasets.frame_classification.helper.load_frames( + frames_path, input_type + ) + + n_frames = frames.shape[-1] + sample_id_vec = np.ones((n_frames,)).astype(np.int32) * source_id + inds_in_sample_vec = np.arange(n_frames) + + return Sample( + source_id, + sample_id_vec, + inds_in_sample_vec, + ) + + # ---- make npy files for this split, parallelized with dask + # using nested function just defined + source_id_frames_path_tups = [ + (source_id, frames_path) + for source_id, frames_path in enumerate(frames_paths) + ] + + source_id_frames_path_bag = db.from_sequence(source_id_frames_path_tups) + with ProgressBar(): + samples = list( + source_id_frames_path_bag.map( + _return_index_arrays + ) + ) + samples = sorted(samples, key=lambda sample: sample.source_id) + + # ---- save indexing vectors in train directory + sample_id_vec = np.concatenate( + list(sample.sample_id_vec for sample in samples) + ) + np.save( + dataset_path / "train" / + datasets.frame_classification.helper.sample_ids_array_filename_for_subset(subset), + sample_id_vec, + ) + inds_in_sample_vec = np.concatenate( + list(sample.inds_in_sample_vec for sample in samples) + ) + np.save( + dataset_path / "train" / + datasets.frame_classification.helper.inds_in_sample_array_filename_for_subset(subset), + inds_in_sample_vec, + ) + + +def make_subsets_from_dataset_df( dataset_df: pd.DataFrame, input_type: str, train_set_durs: Sequence[float], num_replicates: int, dataset_path: pathlib.Path, labelmap: dict, - audio_format: str | None = None, - spect_key: str = "s", - timebins_key: str = "t", ) -> pd.DataFrame: - """Make splits for a learning curve - from a dataframe representing the entire dataset, - one split for each combination of (training set duration, - replicate number). - Each split is a randomly drawn subset of data + """Make subsets of the training data split for a learning curve. 
+ + Makes subsets given a dataframe representing the entire dataset, + with one subset for each combination of (training set duration, + replicate number). Each subset is randomly drawn from the total training split. Uses :func:`vak.prep.split.frame_classification_dataframe` to make - splits/subsets of the training data - from ``dataset_df``, and then uses - :func:`vak.prep.frame_classification.dataset_arrays.make_npy_files_for_each_split` - to make the array files for each split. - - A new directory will be made for each combination of - (training set duration, replicate number) as shown below, - for ``train_durs=[4.0, 6.0], num_replicates=2``. - - .. code-block:: console - 032312-vak-frame-classification-dataset-generated-230820_144833 - ├── 032312_prep_230820_144833.csv - ├── labelmap.json - ├── metadata.json - ├── prep_230820_144833.log - ├── spectrograms_generated_230820_144833 - ├── test - ├── train - ├── train-dur-4.0-replicate-1 - ├── train-dur-4.0-replicate-2 - ├── train-dur-6.0-replicate-1 - ├── train-dur-6.0-replicate-2 - ├── TweetyNet_learncurve_audio_cbin_annot_notmat.toml - └── val + subsets of the training data from ``dataset_df``. + + A new column will be added to the dataframe, `'subset'`, + and additional rows for each subset. + The dataframe is returned with these subsets added. + (The `'split'` for these rows will still be `'train'`.) + Additionally, a separate set of indexing vectors + will be made for each subset, using + :func:`vak.prep.frame_classification.learncurve.make_index_vectors_for_each_subset`. + .. 
code-block:: console + + 032312-vak-frame-classification-dataset-generated-231005_121809 + ├── 032312_prep_231005_121809.csv + ├── labelmap.json + ├── metadata.json + ├── prep_231005_121809.log + ├── TweetyNet_learncurve_audio_cbin_annot_notmat.toml + ├── train + ├── gy6or6_baseline_230312_0808.138.cbin.spect.frame_labels.npy + ├── gy6or6_baseline_230312_0808.138.cbin.spect.frames.npy + ├── gy6or6_baseline_230312_0809.141.cbin.spect.frame_labels.npy + ├── gy6or6_baseline_230312_0809.141.cbin.spect.frames.npy + ├── gy6or6_baseline_230312_0813.163.cbin.spect.frame_labels.npy + ├── gy6or6_baseline_230312_0813.163.cbin.spect.frames.npy + ├── gy6or6_baseline_230312_0816.179.cbin.spect.frame_labels.npy + ├── gy6or6_baseline_230312_0816.179.cbin.spect.frames.npy + ├── gy6or6_baseline_230312_0820.196.cbin.spect.frame_labels.npy + ├── gy6or6_baseline_230312_0820.196.cbin.spect.frames.npy + ├── inds_in_sample.npy + ├── inds_in_sample-train-dur-4.0-replicate-1.npy + ├── inds_in_sample-train-dur-4.0-replicate-2.npy + ├── inds_in_sample-train-dur-6.0-replicate-1.npy + ├── inds_in_sample-train-dur-6.0-replicate-2.npy + ├── sample_ids.npy + ├── sample_ids-train-dur-4.0-replicate-1.npy + ├── sample_ids-train-dur-4.0-replicate-2.npy + ├── sample_ids-train-dur-6.0-replicate-1.npy + └── sample_ids-train-dur-6.0-replicate-2.npy + ... Parameters ---------- dataset_df : pandas.DataFrame - Representing an entire dataset of vocalizations. - input_type : str - The type of input to the neural network model. - One of {'audio', 'spect'}. + Dataframe representing a dataset for frame classification models. + It is returned by + :func:`vak.prep.frame_classification.get_or_make_source_files`, + and has a ``'split'`` column added. train_set_durs : list - of int, durations in seconds of subsets taken from training data - to create a learning curve, e.g. [5, 10, 15, 20]. + Durations in seconds of subsets taken from training data + to create a learning curve, e.g., `[5., 10., 15., 20.]`. 
num_replicates : int number of times to replicate training for each training set duration to better estimate metrics for a training set of that size. @@ -78,32 +240,24 @@ def make_learncurve_splits_from_dataset_df( data (but of the same duration). dataset_path : str, pathlib.Path Directory where splits will be saved. - labelmap : dict - A :class:`dict` that maps a set of human-readable - string labels to the integer classes predicted by a neural - network model. As returned by :func:`vak.labels.to_map`. - audio_format : str - A :class:`string` representing the format of audio files. - One of :constant:`vak.common.constants.VALID_AUDIO_FORMATS`. - spect_key : str - Key for accessing spectrogram in files. Default is 's'. - timebins_key : str - Key for accessing vector of time bins in files. Default is 't'. + input_type : str + The type of input to the neural network model. + One of {'audio', 'spect'}. Returns ------- dataset_df_out : pandas.DataFrame A pandas.DataFrame that has the original splits - from ``dataset_df`` as well as the additional subsets + from ``dataset_df``, as well as the additional subsets of the training data added, along with additional - 'train_dur' and 'replicate_num' columns - that can be used during analysis. + columns, ``'subset', 'train_dur', 'replicate_num'``, + that are used by :mod:`vak`. Other functions like :func:`vak.learncurve.learncurve` specify a specific subset of the training data - by getting the split name with the function + by getting the subset name with the function :func:`vak.common.learncurve.get_train_dur_replicate_split_name`, and then filtering ``dataset_df_out`` with that name - using the 'split' column. + using the 'subset' column. 
""" dataset_path = pathlib.Path(dataset_path) @@ -114,14 +268,14 @@ def make_learncurve_splits_from_dataset_df( # will concat after loop, then use ``csv_path`` to replace # original dataset df with this one - all_train_durs_and_replicates_df = [] + subsets_df = [] for train_dur in train_set_durs: logger.info( f"Subsetting training set for training set of duration: {train_dur}", ) for replicate_num in range(1, num_replicates + 1): - train_dur_replicate_split_name = ( - common.learncurve.get_train_dur_replicate_split_name( + train_dur_replicate_subset_name = ( + common.learncurve.get_train_dur_replicate_subset_name( train_dur, replicate_num ) ) @@ -138,30 +292,27 @@ def make_learncurve_splits_from_dataset_df( train_dur_replicate_df.split == "train" ] # next line, make split name in csv match the split name used for directory in dataset dir - train_dur_replicate_df["split"] = train_dur_replicate_split_name + train_dur_replicate_df["subset"] = train_dur_replicate_subset_name train_dur_replicate_df["train_dur"] = train_dur train_dur_replicate_df["replicate_num"] = replicate_num - all_train_durs_and_replicates_df.append(train_dur_replicate_df) + subsets_df.append(train_dur_replicate_df) - all_train_durs_and_replicates_df = pd.concat( - all_train_durs_and_replicates_df + subsets_df = pd.concat( + subsets_df ) - all_train_durs_and_replicates_df = make_npy_files_for_each_split( - all_train_durs_and_replicates_df, + + make_index_vectors_for_each_subset( + subsets_df, dataset_path, input_type, - "learncurve", # purpose - labelmap, - audio_format, - spect_key, - timebins_key, ) # keep the same validation, test, and total train sets by concatenating them with the train subsets + dataset_df["subset"] = None # add column but have it be empty dataset_df = pd.concat( ( - all_train_durs_and_replicates_df, - dataset_df, + subsets_df, + dataset_df ) ) # We reset the entire index across all splits, instead of repeating indices, diff --git 
a/src/vak/prep/frame_classification/dataset_arrays.py b/src/vak/prep/frame_classification/make_splits.py similarity index 64% rename from src/vak/prep/frame_classification/dataset_arrays.py rename to src/vak/prep/frame_classification/make_splits.py index dbeb46978..a66187990 100644 --- a/src/vak/prep/frame_classification/dataset_arrays.py +++ b/src/vak/prep/frame_classification/make_splits.py @@ -5,6 +5,7 @@ import copy import logging import pathlib +import shutil import attrs import crowsetta @@ -16,6 +17,7 @@ from ... import common, datasets, transforms from .. import constants as prep_constants + logger = logging.getLogger(__name__) @@ -85,44 +87,98 @@ class Sample: Used to add paths for arrays from the sample to a ``dataset_df``, and to build the ``sample_ids`` vector and ``inds_in_sample`` vector - for the entire dataset.""" + for the entire dataset. + Attributes + ---------- + source_id : int + Integer ID number used for sorting. + frames_path : str + The path to the input to the model + :math:`x` after it has been moved, + copied, or created from a ``source_path``. + Path will be written relative to ``dataset_path``. + We preserve the original paths as metadata, + and consider the files in the split to contain + frames, regardless of the source domain + of the data. + frame_labels_npy_path : str + Path to frame labels, + relative to ``dataset_path``. + sample_id_vec : numpy.ndarray + Sample ID vector for this sample. + inds_in_sample_vec : numpy.ndarray + Indices within sample. 
+ """ source_id: int = attrs.field() - frame_npy_path: str + source_path: str frame_labels_npy_path: str sample_id_vec: np.ndarray inds_in_sample_vec: np.ndarray -def make_npy_files_for_each_split( +def make_splits( dataset_df: pd.DataFrame, dataset_path: str | pathlib.Path, input_type: str, purpose: str, labelmap: dict, - audio_format: str, + audio_format: str | None = None, spect_key: str = "s", timebins_key: str = "t", + freqbins_key: str = "f", ) -> pd.DataFrame: - r"""Make npy files containing arrays - for each split of a frame classification dataset. - - All the npy files for each split are saved - in a new directory inside ``dataset_path`` - that has the same name as the split. + r"""Make each split of a frame classification dataset. + + This function takes a :class:`pandas.Dataframe` returned by + :func:`vak.prep.spectrogram_dataset.prep_spectrogram_dataset` + or :func:`vak.prep.audio_dataset.prep_audio_dataset`, + after it has been assigned a `'split'` column, + and then copies, moves, or generates the required files + as appropriate for each split. + + For each unique `'split'` in the :class:`pandas.Dataframe`, + a directory is made inside ``dataset_path``. + At a high level, all files needed for working with that split + will be in that directory E.g., the ``train`` directory inside ``dataset_path`` would have all the files for every row in ``dataset_df`` for which ``dataset_df['split'] == 'train'``. - The function creates two npy files for each row in ``dataset_df``. - One has the extension '.frames.npy` and contains the input - to the frame classification model. The other has the extension - '.frame_labels.npy', and contains a vector + The inputs to the neural network model + are moved or copied into the split directory, + or generated if necessary. + If the ``input_type`` is `'audio'`, + then the audio files are copied from their original directory. 
+ If the ``input_type`` is `'spect'`, + and the spectrogram files are already + in ``dataset_path``, they are moved into the split directory + (under the assumption they were generated + by ``vak.prep.spectrogram_dataset.audio_helper``). + If they are npz files, but they are not in ``dataset_path``, + then they are validated to make sure they have the appropriate keys, + and then copied into the split directory. + This could be the case if the files were generated + by another program. + If they are mat files, they will be converted to npz + with the default keys for arrays, + and then saved in a new npz file in the split directory. + This step is required so that all dataset + prepared by :mod:`vak` are in a "normalized" or + "canonicalized" format. + + In addition to copying or moving the audio or spectrogram + files that are inputs to the neural network model, + other npy files are made for each split + and saved in the corresponding directory. + This function creates one npy file for each row in ``dataset_df``. + It has the extension '.frame_labels.npy', and contains a vector where each element is the target label that the network should predict for the corresponding frame. - Taken together, these two files are the data + Taken together, the audio or spectrogram file in each row + along with its corresponding frame labels are the data for each sample :math:`(x, y)` in the dataset, - where :math:`x_t` is the frames and :math:`y_t` is the frame labels. + where :math:`x_t` supplies the "frames", and :math:`y_t` is the frame labels. This function also creates two additional npy files for each split. These npy files are "indexing" vectors that @@ -132,9 +188,9 @@ def make_npy_files_for_each_split( to avoid loading the entire dataset into memory, and to avoid working with memory-mapped arrays. The first is the ``sample_ids`` vector, - that represents the "ID" of any sample :math:`(x, y)` in the dataset. 
+ that represents the "ID" of any sample :math:`(x, y)` in the split. We use these IDs to load the array files corresponding to the samples. - For a dataset with :math:`m` samples, this will be an array of length :math:`T`, + For a split with :math:`m` samples, this will be an array of length :math:`T`, the total number of frames across all samples, with elements :math:`i \in (0, 1, ..., m - 1)` indicating which frames correspond to which sample :math:`m_i`: @@ -176,6 +232,8 @@ def make_npy_files_for_each_split( Key for accessing spectrogram in files. Default is 's'. timebins_key : str Key for accessing vector of time bins in files. Default is 't'. + freqbins_key : str + key for accessing vector of frequency bins in files. Default is 'f'. Returns ------- @@ -190,6 +248,12 @@ def make_npy_files_for_each_split( f"Value for ``input_type`` was: {input_type}" ) + if input_type == "audio" and audio_format is None: + raise ValueError( + f"Value for `input_type` was 'audio' but `audio_format` is None. " + f"Please specify the audio format." 
+ )
+
 dataset_df_out = []
 splits = [
 split
@@ -197,6 +261,7 @@
 if split != "None"
 ]
 for split in splits:
+ logger.info(f"Making split for dataset: {split}")
 split_subdir = dataset_path / split
 split_subdir.mkdir()
@@ -221,7 +286,9 @@
 source_paths = split_df["spect_path"].values
 else:
 raise ValueError(f"Invalid ``input_type``: {input_type}")
- # do this *again* after sorting the dataframe
+ source_paths = [pathlib.Path(source_path) for source_path in source_paths]
+
+ # we get annots again, *after* sorting the dataframe
 if purpose != "predict":
 annots = common.annotation.from_df(split_df)
 else:
@@ -235,32 +302,50 @@ def _save_dataset_arrays_and_return_index_arrays(
 Defined in-line so variables are in scope
 """
 source_id, source_path, annot = source_id_path_annot_tup
- source_path = pathlib.Path(source_path)
 if input_type == "audio":
+ # we always copy audio to the split directory, to avoid damaging source data
+ frames_path = shutil.copy(source_path, split_subdir)
+ # after copying, we load frames to compute frame labels
 frames, samplefreq = common.constants.AUDIO_FORMAT_FUNC_MAP[
 audio_format
 ](source_path)
 if (
 audio_format == "cbin"
- ): # convert to ~wav, from int16 to float64
 frames = frames.astype(np.float64) / 32768.0
 if annot:
 frame_times = np.arange(frames.shape[-1]) / samplefreq
 elif input_type == "spect":
- spect_dict = np.load(source_path)
+ if source_path.suffix.endswith('mat'):
+ spect_dict = common.files.spect.load(source_path, "mat")
+ # convert to .npz and save in spect_output_dir
+ spect_dict_npz = {
+ "s": spect_dict[spect_key],
+ "t": spect_dict[timebins_key],
+ "f": spect_dict[freqbins_key],
+ }
+ frames_path = split_subdir / (
+ source_path.stem + ".npz"
+ )
+ np.savez(frames_path, **spect_dict_npz)
+ elif source_path.suffix.endswith('npz'):
+ spect_dict = common.files.spect.load(source_path, "npz")
+ if 
source_path.is_relative_to(dataset_path): + # it's already in dataset_path, we just move it into the split + frames_path = shutil.move(source_path, split_subdir) + else: + # it's somewhere else we copy it to be safe + if not all([key in spect_dict for key in ('s', 't', 'f')]): + raise ValueError( + f"The following spectrogram file did not have valid keys: {source_path}\n." + f"All npz files should have keys 's', 't', 'f' corresponding to the spectrogram," + f"the frequencies vector, and the time vector." + ) + frames_path = shutil.copy(source_path, split_subdir) frames = spect_dict[spect_key] if annot: frame_times = spect_dict[timebins_key] - frames_npy_path = split_subdir / ( - source_path.stem - + datasets.frame_classification.constants.FRAMES_ARRAY_EXT - ) - np.save(frames_npy_path, frames) - frames_npy_path = str( - # make sure we save path in csv as relative to dataset root - frames_npy_path.relative_to(dataset_path) - ) n_frames = frames.shape[-1] sample_id_vec = np.ones((n_frames,)).astype(np.int32) * source_id @@ -288,9 +373,13 @@ def _save_dataset_arrays_and_return_index_arrays( else: frame_labels_npy_path = None + # Rewrite ``frames_path`` as relative to root + # because all functions and classes downstream expect this + frames_path = pathlib.Path(frames_path).relative_to(dataset_path) + return Sample( source_id, - frames_npy_path, + frames_path, frame_labels_npy_path, sample_id_vec, inds_in_sample_vec, @@ -338,19 +427,31 @@ def _save_dataset_arrays_and_return_index_arrays( inds_in_sample_vec, ) - frame_npy_paths = [str(sample.frame_npy_path) for sample in samples] + # We convert `frames_paths` back to string + # (just in case they are pathlib.Paths) before adding back to dataframe. + # Note that these are all in split dirs, written relative to ``dataset_path``. 
+ frames_paths = [str(sample.source_path) for sample in samples] split_df[ - datasets.frame_classification.constants.FRAMES_NPY_PATH_COL_NAME - ] = frame_npy_paths + datasets.frame_classification.constants.FRAMES_PATH_COL_NAME + ] = frames_paths frame_labels_npy_paths = [ - str(sample.frame_labels_npy_path) for sample in samples + sample.frame_labels_npy_path + if isinstance(sample.frame_labels_npy_path, str) else None + for sample in samples ] split_df[ datasets.frame_classification.constants.FRAME_LABELS_NPY_PATH_COL_NAME ] = frame_labels_npy_paths dataset_df_out.append(split_df) + # ---- clean up + # Remove any spect npz files that were *not* added to a split + spect_npz_files_not_in_split = sorted(dataset_path.glob(f'*{common.constants.SPECT_NPZ_EXTENSION}')) + if len(spect_npz_files_not_in_split) > 0: + for spect_npz_file in spect_npz_files_not_in_split: + spect_npz_file.unlink() + # we reset the entire index across all splits, instead of repeating indices, # and we set drop=False because we don't want to add a new column 'index' or 'level_0' dataset_df_out = pd.concat(dataset_df_out).reset_index(drop=True) diff --git a/src/vak/prep/frame_classification/source_files.py b/src/vak/prep/frame_classification/source_files.py new file mode 100644 index 000000000..6b9fd1333 --- /dev/null +++ b/src/vak/prep/frame_classification/source_files.py @@ -0,0 +1,179 @@ +import logging +import pathlib + +import pandas as pd + +from ...common.converters import expanded_user_path, labelset_to_set +from .. 
import constants +from ..audio_dataset import prep_audio_dataset +from ..spectrogram_dataset.prep import prep_spectrogram_dataset + + +logger = logging.getLogger(__name__) + + +def get_or_make_source_files( + data_dir: str | pathlib.Path, + input_type: str, + audio_format: str | None = None, + spect_format: str | None = None, + spect_params: dict | None = None, + spect_output_dir: str | pathlib.Path | None = None, + annot_format: str | None = None, + annot_file: str | pathlib.Path | None = None, + labelset: set | None = None, + audio_dask_bag_kwargs: dict | None = None, +) -> pd.DataFrame: + """Get source files for a dataset, or make them. + + Gets either audio or spectrogram files from ``data dir``, + possibly paired with annotation files. + + If ``input_type`` is ``'audio'``, then this function will look + for files with the extension for ``audio_format`` in ``data_dir``. + If ``input_type`` is ``'spectrogram'``, and ``spect_format`` is specified, + then this function will look for files with the extension for that format + in ``data_dir``. If ``input_type`` is spectrogram, + and ``audio_format`` is specified, + this function will look for audio files with that extension + and then generate spectrograms for them using ``spect_params``. + If an ``annot_format`` is specified, this function will additionally + look for annotation files for the audio or spectrogram files. + If all annotations are in a single file, this can be specified + with the ``annot_file`` parameter, and that will be used instead + of looking for other annotation files. + + Parameters + ---------- + data_dir : str, Path + Path to directory with files from which to make dataset. + input_type : str + The type of input to the neural network model. + One of {'audio', 'spect'}. + audio_format : str + Format of audio files. One of {'wav', 'cbin'}. + Default is ``None``, but either ``audio_format`` or ``spect_format`` + must be specified. 
+ spect_format : str + Format of files containing spectrograms as 2-d matrices. One of {'mat', 'npz'}. + Default is None, but either audio_format or spect_format must be specified. + spect_params : dict, vak.config.SpectParams + Parameters for creating spectrograms. Default is ``None``. + spect_output_dir : str + Path to location where spectrogram files should be saved. + Default is None. If ``input_type`` is ``'spect'``, + then ``spect_output_dir`` defaults to ``data_dir``. + annot_format : str + Format of annotations. Any format that can be used with the + :module:`crowsetta` library is valid. Default is ``None``. + annot_file : str + Path to a single annotation file. Default is ``None``. + Used when a single file contains annotates multiple audio + or spectrogram files. + audio_dask_bag_kwargs : dict + Keyword arguments used when calling :func:`dask.bag.from_sequence` + inside :func:`vak.io.audio`, where it is used to parallelize + the conversion of audio files into spectrograms. + Option should be specified in config.toml file as an inline table, + e.g., ``audio_dask_bag_kwargs = { npartitions = 20 }``. + Allows for finer-grained control + when needed to process files of different sizes. + labelset : str, list, set + Set of unique labels for vocalizations. Strings or integers. + Default is ``None``. If not ``None``, then files will be skipped + where the associated annotation + contains labels not found in ``labelset``. + ``labelset`` is converted to a Python ``set`` using + :func:`vak.converters.labelset_to_set`. + See help for that function for details on how to specify ``labelset``. + + Returns + ------- + source_files_df : pandas.DataFrame + Source files that will become the dataset, + represented as a pandas.DataFrame. + Each row corresponds to one sample in the dataset, + either an audio file or spectrogram file, + possibly paired with annotations. 
+ """ + if input_type not in constants.INPUT_TYPES: + raise ValueError( + f"``input_type`` must be one of: {constants.INPUT_TYPES}\n" + f"Value for ``input_type`` was: {input_type}" + ) + + if input_type == "audio" and spect_format is not None: + raise ValueError( + f"Input type was 'audio' but a ``spect_format`` was specified: '{spect_format}'. " + f"Please specify ``audio_format`` only." + ) + + if input_type == "audio" and audio_format is None: + raise ValueError( + "Input type was 'audio' but no ``audio_format`` was specified. " + ) + + if audio_format is None and spect_format is None: + raise ValueError( + "Must specify either ``audio_format`` or ``spect_format``" + ) + + if audio_format and spect_format: + raise ValueError( + "Cannot specify both ``audio_format`` and ``spect_format``, " + "unclear whether to create spectrograms from audio files or " + "use already-generated spectrograms from array files" + ) + + if labelset is not None: + labelset = labelset_to_set(labelset) + + data_dir = expanded_user_path(data_dir) + if not data_dir.is_dir(): + raise NotADirectoryError( + f"Path specified for ``data_dir`` not found: {data_dir}" + ) + + if annot_file is not None: + annot_file = expanded_user_path(annot_file) + if not annot_file.exists(): + raise FileNotFoundError( + f"Path specified for ``annot_file`` not found: {annot_file}" + ) + + if input_type == "spect": + source_files_df = prep_spectrogram_dataset( + data_dir, + annot_format, + labelset, + annot_file, + audio_format, + spect_format, + spect_params, + spect_output_dir, + audio_dask_bag_kwargs, + ) + if source_files_df.empty: + raise ValueError( + "Calling `vak.prep.spectrogram_dataset.prep_spectrogram_dataset` " + "with arguments passed to `vak.prep.prep_frame_classification_dataset` " + "returned an empty dataframe.\n" + "Please double-check arguments to `prep_frame_classification_dataset` function." 
+ ) + + elif input_type == "audio": + source_files_df = prep_audio_dataset( + audio_format, + data_dir, + annot_format, + labelset, + ) + if source_files_df.empty: + raise ValueError( + "Calling `vak.prep.audio_dataset.prep_audio_dataset` " + "with arguments passed to `vak.prep.prep_frame_classification_dataset` " + "returned an empty dataframe.\n" + "Please double-check arguments to `prep_frame_classification_dataset` function." + ) + + return source_files_df diff --git a/src/vak/prep/spectrogram_dataset/audio_helper.py b/src/vak/prep/spectrogram_dataset/audio_helper.py index 2c84a7f2a..f5e6a17a0 100644 --- a/src/vak/prep/spectrogram_dataset/audio_helper.py +++ b/src/vak/prep/spectrogram_dataset/audio_helper.py @@ -236,7 +236,7 @@ def _spect_file(audio_file): } basename = os.path.basename(audio_file) npz_fname = os.path.join( - os.path.normpath(output_dir), basename + ".spect.npz" + os.path.normpath(output_dir), basename + constants.SPECT_NPZ_EXTENSION ) np.savez(npz_fname, **spect_dict) return npz_fname diff --git a/src/vak/prep/spectrogram_dataset/prep.py b/src/vak/prep/spectrogram_dataset/prep.py index 323ddbd56..d36266403 100644 --- a/src/vak/prep/spectrogram_dataset/prep.py +++ b/src/vak/prep/spectrogram_dataset/prep.py @@ -2,13 +2,12 @@ import logging import pathlib -from datetime import datetime import attrs import crowsetta import pandas as pd -from ...common import annotation +from ...common import annotation, constants from ...common.converters import expanded_user_path, labelset_to_set from ...config.spect_params import SpectParamsConfig from . import audio_helper, spect_helper @@ -70,10 +69,8 @@ def prep_spectrogram_dataset( Parameters for creating spectrograms. Default is None (implying that spectrograms are already made). spect_output_dir : str - path to location where spectrogram files should be saved. + Path to location where spectrogram files should be saved. Default is None, in which case it defaults to ``data_dir``. 
- A new directory will be created in ``spect_output_dir`` with - the name 'spectrograms_generated_{time stamp}'. audio_dask_bag_kwargs : dict Keyword arguments used when calling ``dask.bag.from_sequence`` inside ``vak.io.audio``, where it is used to parallelize @@ -85,8 +82,17 @@ def prep_spectrogram_dataset( Returns ------- - dataset_df : pandas.DataFrame - The dataset prepared from the directory specified + source_files_df : pandas.DataFrame + A set of source files that will be used to prepare a + data set for use with neural network models, + represented as a :class:`pandas.DataFrame`. + Will contain paths to spectrogram files, + possibly paired with annotation files, + as well as the original audio files if the + spectrograms were generated from audio by + :func:`vak.prep.audio_helper.make_spectrogram_files_from_audio_files`. + The columns of the dataframe are specified by + :const:`vak.prep.spectrogram_dataset.spect_helper.DF_COLUMNS`. """ # ---- pre-conditions ---------------------------------------------------------------------------------------------- if labelset is not None: @@ -115,11 +121,6 @@ def prep_spectrogram_dataset( else: spect_output_dir = data_dir - timenow = datetime.now().strftime("%y%m%d_%H%M%S") - spect_dirname = f"spectrograms_generated_{timenow}" - spect_output_dir = spect_output_dir / spect_dirname - spect_output_dir.mkdir() - if annot_format is not None: if annot_file is None: annot_files = annotation.files_from_dir( @@ -158,10 +159,11 @@ def prep_spectrogram_dataset( dask_bag_kwargs=audio_dask_bag_kwargs, ) spect_format = "npz" - spect_ext = ".spect.npz" + spect_ext = constants.SPECT_NPZ_EXTENSION else: # if audio format is None spect_files = None - spect_ext = None + # make sure we use the vak extension for spectrogram files + spect_ext = constants.SPECT_FORMAT_EXT_MAP[spect_format] make_dataframe_kwargs = { "spect_format": spect_format, @@ -169,7 +171,6 @@ def prep_spectrogram_dataset( "annot_list": annot_list, "annot_format": 
annot_format, "spect_ext": spect_ext, - "spect_output_dir": spect_output_dir, } if ( @@ -196,7 +197,7 @@ def prep_spectrogram_dataset( ]: make_dataframe_kwargs[key] = spect_params[key] - dataset_df = spect_helper.make_dataframe_of_spect_files( + source_files_df = spect_helper.make_dataframe_of_spect_files( **make_dataframe_kwargs ) - return dataset_df + return source_files_df diff --git a/src/vak/prep/spectrogram_dataset/spect_helper.py b/src/vak/prep/spectrogram_dataset/spect_helper.py index a29ebe485..ab692806e 100644 --- a/src/vak/prep/spectrogram_dataset/spect_helper.py +++ b/src/vak/prep/spectrogram_dataset/spect_helper.py @@ -1,7 +1,8 @@ -"""function that converts a set of array files (.npz, .mat) containing spectrograms -into a pandas DataFrame that represents a dataset used by ``vak`` +"""Function that converts a set of array files (.npz, .mat) containing spectrograms +into a pandas DataFrame that represents a dataset used by ``vak``. -the returned DataFrame has columns as specified by vak.io.spect.DF_COLUMNS +The columns of the dataframe are specified by + :const:`vak.prep.spectrogram_dataset.spect_helper.DF_COLUMNS`. 
""" from __future__ import annotations @@ -15,7 +16,7 @@ from ...common import constants, files from ...common.annotation import map_annotated_to_annot -from ...common.converters import expanded_user_path, labelset_to_set +from ...common.converters import labelset_to_set logger = logging.getLogger(__name__) @@ -34,7 +35,6 @@ def make_dataframe_of_spect_files( spect_format: str, spect_dir: str | pathlib.Path | None = None, - spect_output_dir: str | pathlib.Path | None = None, spect_files: list | None = None, spect_ext: str | None = None, annot_list: list | None = None, @@ -46,60 +46,63 @@ def make_dataframe_of_spect_files( spect_key: str = "s", audio_path_key: str = "audio_path", ) -> pd.DataFrame: - """Creates a dataset of spectrogram files from a directory, + """Get a set of spectrogram files from a directory, optionally paired with an annotation file or files, - and returns a Pandas DataFrame that represents the dataset. + and returns a Pandas DataFrame that represents all the files. Spectrogram files are array in npz files created by numpy or in mat files created by Matlab. - If files are in mat format, they will be converted to npz - with the default keys for arrays, and saved in - ``spect_output_dir``. This step is required so that all dataset - prepared by :mod:`vak` are in a "normalized" or - "canonicalized" format. If no ``spect_output_dir`` is provided - when the ``spect_format`` is ``'mat'``, then this function - will raise an error. Parameters ---------- spect_format : str - format of files containing spectrograms. One of {'mat', 'npz'} + Format of files containing spectrograms. One of {'mat', 'npz'} spect_dir : str - path to directory of files containing spectrograms as arrays. + Path to directory of files containing spectrograms as arrays. Default is None. spect_files : list List of paths to array files. Default is None. annot_list : list - of annotations for array files. Default is None + List of annotations for array files. 
Default is None annot_format : str - name of annotation format. Added as a column to the DataFrame if specified. + Name of annotation format. Added as a column to the DataFrame if specified. Used by other functions that open annotation files via their paths from the DataFrame. Should be a format that the crowsetta library recognizes. Default is None. labelset : str, list, set - of str or int, set of unique labels for vocalizations. Default is None. + Set of unique labels for vocalizations, of str or int. Default is None. If not None, then files will be skipped where the associated annotation contains labels not found in ``labelset``. - ``labelset`` is converted to a Python ``set`` using ``vak.converters.labelset_to_set``. + ``labelset`` is converted to a Python ``set`` using + :func:`vak.common.converters.labelset_to_set`. See help for that function for details on how to specify labelset. n_decimals_trunc : int - number of decimal places to keep when truncating the timebin duration calculated from - the vector of time bins. + number of decimal places to keep when truncating the time + bin duration calculated from the vector of time bins. Default is 3, i.e. assumes milliseconds is the last significant digit. freqbins_key : str - key for accessing vector of frequency bins in files. Default is 'f'. + Key for accessing vector of frequency bins in files. Default is 'f'. timebins_key : str - key for accessing vector of time bins in files. Default is 't'. + Key for accessing vector of time bins in files. Default is 't'. spect_key : str - key for accessing spectrogram in files. Default is 's'. + Key for accessing spectrogram in files. Default is 's'. audio_path_key : str - key for accessing path to source audio file for spectrogram in files. + Key for accessing path to source audio file for spectrogram in files. Default is 'audio_path'. Returns ------- - dataset_df : pandas.Dataframe - Dataframe that represents a dataset of vocalizations. 
+ source_files_df : pandas.DataFrame + A set of source files that will be used to prepare a + data set for use with neural network models, + represented as a :class:`pandas.DataFrame`. + Will contain paths to spectrogram files, + possibly paired with annotation files, + as well as the original audio files if the + spectrograms were generated from audio by + :func:`vak.prep.audio_helper.make_spectrogram_files_from_audio_files`. + The columns of the dataframe are specified by + :const:`vak.prep.spectrogram_dataset.spect_helper.DF_COLUMNS`. Notes ----- @@ -120,14 +123,6 @@ def make_dataframe_of_spect_files( f"format '{spect_format}' not recognized." ) - if spect_format == "mat" and spect_output_dir is None: - raise ValueError( - "Must provide ``spect_output_dir`` when ``spect_format`` is '.mat'." - "so that array files can be converted to npz format. " - "This is needed so that all datasets are in a standardized, " - "canonical format that other functions in the library expect." - ) - if all([arg is None for arg in (spect_dir, spect_files)]): raise ValueError( "must specify one of: spect_dir, spect_files" @@ -151,13 +146,6 @@ def make_dataframe_of_spect_files( if labelset is not None: labelset = labelset_to_set(labelset) - if spect_output_dir: - spect_output_dir = expanded_user_path(spect_output_dir) - if not spect_output_dir.is_dir(): - raise NotADirectoryError( - f"spect_output_dir not found: {spect_output_dir}" - ) - # ---- get a list of spectrogram files + associated annotation files ----------------------------------------------- if spect_dir: # then get spect_files from that dir # note we already validated format above @@ -217,11 +205,14 @@ def make_dataframe_of_spect_files( # ---- actually make the dataframe --------------------------------------------------------------------------------- # this is defined here so all other arguments to 'to_dataframe' are in scope def _to_record(spect_annot_tuple): - """helper function that enables parallelized creation of 
"records", - i.e. rows for dataframe, from . - Accepts a two-element tuple containing (1) a dictionary that represents a spectrogram + """helper function that enables parallelized creation + of "records", i.e. rows for dataframe. + Accepts a two-element tuple containing + (1) a dictionary that represents a spectrogram and (2) annotation for that file""" spect_path, annot = spect_annot_tuple + spect_path = pathlib.Path(spect_path) + spect_dict = files.spect.load(spect_path, spect_format) spect_dur = spect_dict[spect_key].shape[-1] * timebin_dur @@ -236,18 +227,6 @@ def _to_record(spect_annot_tuple): # (or an error) audio_path = files.spect.find_audio_fname(spect_path) - if spect_format == "mat": - # convert to .npz and save in spect_output_dir - spect_dict_npz = { - "s": spect_dict[spect_key], - "t": spect_dict[timebins_key], - "f": spect_dict[freqbins_key], - } - spect_path = spect_output_dir / ( - pathlib.Path(spect_path).stem + ".npz" - ) - np.savez(spect_path, **spect_dict_npz) - if annot is not None: annot_path = annot.annot_path else: diff --git a/src/vak/train/frame_classification.py b/src/vak/train/frame_classification.py index 925b27d0e..256daaa84 100644 --- a/src/vak/train/frame_classification.py +++ b/src/vak/train/frame_classification.py @@ -45,7 +45,7 @@ def train_frame_classification_model( ckpt_step: int | None = None, patience: int | None = None, device: str | None = None, - split: str = "train", + subset: str | None = None, ) -> None: """Train a model from the frame classification family and save results. @@ -141,11 +141,11 @@ def train_frame_classification_model( number of validation steps to wait without performance on the validation set improving before stopping the training. Default is None, in which case training only stops after the specified number of epochs. - split : str - Name of split from dataset found at ``dataset_path`` to use - when training model. Default is 'train'. 
This parameter is used by - `vak.learncurve.learncurve` to specify specific subsets of the - training set to use when training models for a learning curve. + subset : str + Name of a subset from the training split of the dataset + to use when training model. This parameter is used by + :func:`vak.learncurve.learncurve` to specify subsets + when training models for a learning curve. """ for path, path_name in zip( (checkpoint_path, spect_scaler_path), @@ -221,7 +221,8 @@ def train_frame_classification_model( logger.info("will normalize spectrograms") spect_standardizer = transforms.StandardizeSpect.fit_dataset_path( dataset_path, - split=split, + split="train", + subset=subset, ) joblib.dump( spect_standardizer, results_path.joinpath("StandardizeSpect") @@ -249,7 +250,8 @@ def train_frame_classification_model( train_dataset_params = {} train_dataset = WindowDataset.from_dataset_path( dataset_path=dataset_path, - split=split, + split="train", + subset=subset, transform=transform, target_transform=target_transform, **train_dataset_params, diff --git a/src/vak/train/parametric_umap.py b/src/vak/train/parametric_umap.py index a254397ed..675dac90d 100644 --- a/src/vak/train/parametric_umap.py +++ b/src/vak/train/parametric_umap.py @@ -94,7 +94,7 @@ def train_parametric_umap_model( val_step: int | None = None, ckpt_step: int | None = None, device: str | None = None, - split: str = "train", + subset: str | None = None, ) -> None: """Train a model from the parametric UMAP family and save results. 
@@ -228,7 +228,8 @@ def train_parametric_umap_model( train_dataset_params = {} train_dataset = ParametricUMAPDataset.from_dataset_path( dataset_path=dataset_path, - split=split, + split="train", + subset=subset, transform=transform, **train_dataset_params, ) diff --git a/src/vak/train/train_.py b/src/vak/train/train_.py index c25046827..79ee2897f 100644 --- a/src/vak/train/train_.py +++ b/src/vak/train/train_.py @@ -32,7 +32,7 @@ def train( ckpt_step: int | None = None, patience: int | None = None, device: str | None = None, - split: str = "train", + subset: str | None = None, ): """Train a model and save results. @@ -185,7 +185,7 @@ def train( ckpt_step=ckpt_step, patience=patience, device=device, - split=split, + subset=subset, ) elif model_family == "ParametricUMAPModel": train_parametric_umap_model( @@ -205,7 +205,7 @@ def train( val_step=val_step, ckpt_step=ckpt_step, device=device, - split=split, + subset=subset, ) else: raise ValueError(f"Model family not recognized: {model_family}") diff --git a/src/vak/transforms/defaults/frame_classification.py b/src/vak/transforms/defaults/frame_classification.py index 2b5733b18..c4abd0f34 100644 --- a/src/vak/transforms/defaults/frame_classification.py +++ b/src/vak/transforms/defaults/frame_classification.py @@ -103,7 +103,7 @@ def __init__( self.annot_transform = vak_transforms.ToLongTensor() - def __call__(self, frames, frame_labels, source_path=None): + def __call__(self, frames, frame_labels, frames_path=None): if self.spect_standardizer: frames = self.spect_standardizer(frames) @@ -124,8 +124,9 @@ def __call__(self, frames, frame_labels, source_path=None): if padding_mask is not None: item["padding_mask"] = padding_mask - if source_path is not None: - item["source_path"] = source_path + if frames_path is not None: + # make sure frames_path is a str, not a pathlib.Path + item["frames_path"] = str(frames_path) return item @@ -171,7 +172,7 @@ def __init__( ] ) - def __call__(self, frames, source_path=None): + def 
__call__(self, frames, frames_path=None): if self.spect_standardizer: frames = self.spect_standardizer(frames) @@ -190,8 +191,9 @@ def __call__(self, frames, source_path=None): if padding_mask is not None: item["padding_mask"] = padding_mask - if source_path is not None: - item["source_path"] = source_path + if frames_path is not None: + # make sure frames_path is a str, not a pathlib.Path + item["frames_path"] = str(frames_path) return item diff --git a/src/vak/transforms/transforms.py b/src/vak/transforms/transforms.py index 9e2c66935..298c7f475 100644 --- a/src/vak/transforms/transforms.py +++ b/src/vak/transforms/transforms.py @@ -5,6 +5,7 @@ import numpy as np import pandas as pd +from ..common import constants from ..common.validators import column_or_1d from . import functional as F @@ -76,7 +77,7 @@ def __init__(self, mean_freqs=None, std_freqs=None, non_zero_std=None): self.non_zero_std = non_zero_std @classmethod - def fit_dataset_path(cls, dataset_path, split="train"): + def fit_dataset_path(cls, dataset_path, split="train", subset: str | None = None): """Returns a :class:`StandardizeSpect` instance that is fit to a split from a dataset, given the path to that dataset and the @@ -99,14 +100,18 @@ def fit_dataset_path(cls, dataset_path, split="train"): dataset_path = pathlib.Path(dataset_path) metadata = Metadata.from_dataset_path(dataset_path) + input_type = metadata.input_type dataset_csv_path = dataset_path / metadata.dataset_csv_filename dataset_path = dataset_csv_path.parent - df = pd.read_csv(dataset_csv_path) - df = df[df["split"] == split].copy() - frames_paths = df[ - frame_classification.constants.FRAMES_NPY_PATH_COL_NAME + dataset_df = pd.read_csv(dataset_csv_path) + if subset: + dataset_df = dataset_df[dataset_df.split == split].copy() + else: + dataset_df = dataset_df[dataset_df.split == split].copy() + frames_paths = dataset_df[ + frame_classification.constants.FRAMES_PATH_COL_NAME ].values - frames = np.load(dataset_path / frames_paths[0]) 
+ frames = np.load(dataset_path / frames_paths[0])[constants.SPECT_KEY] # in files, spectrograms are in orientation (freq bins, time bins) # so we take mean and std across columns, i.e. time bins, i.e. axis 1 @@ -114,7 +119,7 @@ def fit_dataset_path(cls, dataset_path, split="train"): std_freqs = np.std(frames, axis=1) for frames_path in frames_paths[1:]: - frames = np.load(dataset_path / frames_path) + frames = np.load(dataset_path / frames_path)[constants.SPECT_KEY] mean_freqs += np.mean(frames, axis=1) std_freqs += np.std(frames, axis=1) mean_freqs = mean_freqs / len(frames_paths) diff --git a/tests/data_for_tests/configs/TweetyNet_learncurve_audio_cbin_annot_notmat.toml b/tests/data_for_tests/configs/TweetyNet_learncurve_audio_cbin_annot_notmat.toml index b0c9e6662..0922283e8 100644 --- a/tests/data_for_tests/configs/TweetyNet_learncurve_audio_cbin_annot_notmat.toml +++ b/tests/data_for_tests/configs/TweetyNet_learncurve_audio_cbin_annot_notmat.toml @@ -3,7 +3,7 @@ dataset_type = "frame classification" input_type = "spect" data_dir = "./tests/data_for_tests/source/audio_cbin_annot_notmat/gy6or6/032312" output_dir = "./tests/data_for_tests/generated/prep/learncurve/audio_cbin_annot_notmat/TweetyNet" -audio_format = "cbin" +spect_format = "npz" annot_format = "notmat" labelset = "iabcdefghjk" train_dur = 50 diff --git a/tests/data_for_tests/configs/TweetyNet_predict_audio_cbin_annot_notmat.toml b/tests/data_for_tests/configs/TweetyNet_predict_audio_cbin_annot_notmat.toml index 61af4b692..da6a9175c 100644 --- a/tests/data_for_tests/configs/TweetyNet_predict_audio_cbin_annot_notmat.toml +++ b/tests/data_for_tests/configs/TweetyNet_predict_audio_cbin_annot_notmat.toml @@ -3,7 +3,7 @@ dataset_type = "frame classification" input_type = "spect" data_dir = "./tests/data_for_tests/source/audio_cbin_annot_notmat/gy6or6/032412" output_dir = "./tests/data_for_tests/generated/prep/predict/audio_cbin_annot_notmat/TweetyNet" -audio_format = "cbin" +spect_format = "npz" 
[SPECT_PARAMS] fft_size = 512 diff --git a/tests/data_for_tests/configs/TweetyNet_predict_audio_wav_annot_birdsongrec.toml b/tests/data_for_tests/configs/TweetyNet_predict_audio_wav_annot_birdsongrec.toml deleted file mode 100644 index 769dfba72..000000000 --- a/tests/data_for_tests/configs/TweetyNet_predict_audio_wav_annot_birdsongrec.toml +++ /dev/null @@ -1,41 +0,0 @@ -[PREP] -dataset_type = "frame classification" -input_type = "spect" -data_dir = "./tests/data_for_tests/source/audio_wav_annot_birdsongrec/Bird0" -output_dir = "./tests/data_for_tests/generated/prep/predict/audio_wav_annot_birdsong-recognition-dataset/TweetyNet" -audio_format = "wav" - -[SPECT_PARAMS] -fft_size = 512 -step_size = 64 -freq_cutoffs = [ 500, 10000,] -thresh = 6.25 -transform_type = "log_spect" - -[PREDICT] -spect_scaler_path = "/home/user/results_181014_194418/spect_scaler" -checkpoint_path = "~/Documents/repos/coding/birdsong/TweetyNet/results/BFSongRepository/bl26lb16/results_200620_164245/TweetyNet/checkpoints/max-val-acc-checkpoint.pt" -labelmap_path = "~/Documents/repos/coding/birdsong/TweetyNet/results/BFSongRepository/bl26lb16/results_200620_164245/labelmap.json" -model = "TweetyNet" -batch_size = 11 -num_workers = 16 -device = "cuda" -output_dir = "./tests/data_for_tests/generated/results/predict/audio_wav_annot_birdsong-recognition-dataset/TweetyNet" -annot_csv_filename = "Bird0.annot.csv" - -[PREDICT.transform_params] -window_size = 88 - -[TweetyNet.network] -conv1_filters = 8 -conv1_kernel_size = [3, 3] -conv2_filters = 16 -conv2_kernel_size = [5, 5] -pool1_size = [4, 1] -pool1_stride = [4, 1] -pool2_size = [4, 1] -pool2_stride = [4, 1] -hidden_size = 32 - -[TweetyNet.optimizer] -lr = 0.001 diff --git a/tests/data_for_tests/configs/TweetyNet_train_audio_wav_annot_birdsongrec.toml b/tests/data_for_tests/configs/TweetyNet_train_audio_wav_annot_birdsongrec.toml deleted file mode 100644 index e3988e6ab..000000000 --- 
a/tests/data_for_tests/configs/TweetyNet_train_audio_wav_annot_birdsongrec.toml +++ /dev/null @@ -1,51 +0,0 @@ -[PREP] -dataset_type = "frame classification" -input_type = "spect" -labelset = "012345678" -data_dir = "./tests/data_for_tests/source/audio_wav_annot_birdsongrec/Bird0" -output_dir = "./tests/data_for_tests/generated/prep/train/audio_wav_annot_birdsong-recognition-dataset/TweetyNet" -audio_format = "wav" -annot_format = "birdsong-recognition-dataset" -annot_file = "./tests/data_for_tests/source/audio_wav_annot_birdsongrec/Bird0/Annotation.xml" -test_dur = 50 -train_dur = 15 -val_dur = 30 - -[SPECT_PARAMS] -fft_size = 512 -step_size = 64 -freq_cutoffs = [ 500, 10000,] -thresh = 6.25 -transform_type = "log_spect" - -[TRAIN] -model = "TweetyNet" -normalize_spectrograms = true -batch_size = 11 -num_epochs = 2 -val_step = 50 -ckpt_step = 200 -patience = 4 -num_workers = 16 -device = "cuda" -root_results_dir = "./tests/data_for_tests/generated/results/train/audio_wav_annot_birdsong-recognition-dataset/TweetyNet" - -[TRAIN.train_dataset_params] -window_size = 88 - -[TRAIN.val_transform_params] -window_size = 88 - -[TweetyNet.network] -conv1_filters = 8 -conv1_kernel_size = [3, 3] -conv2_filters = 16 -conv2_kernel_size = [5, 5] -pool1_size = [4, 1] -pool1_stride = [4, 1] -pool2_size = [4, 1] -pool2_stride = [4, 1] -hidden_size = 32 - -[TweetyNet.optimizer] -lr = 0.001 diff --git a/tests/data_for_tests/configs/TweetyNet_train_continue_audio_cbin_annot_notmat.toml b/tests/data_for_tests/configs/TweetyNet_train_continue_audio_cbin_annot_notmat.toml index c53ca4766..932208616 100644 --- a/tests/data_for_tests/configs/TweetyNet_train_continue_audio_cbin_annot_notmat.toml +++ b/tests/data_for_tests/configs/TweetyNet_train_continue_audio_cbin_annot_notmat.toml @@ -3,7 +3,7 @@ dataset_type = "frame classification" input_type = "spect" data_dir = "./tests/data_for_tests/source/audio_cbin_annot_notmat/gy6or6/032312" output_dir = 
"./tests/data_for_tests/generated/prep/train/audio_cbin_annot_notmat/TweetyNet" -audio_format = "cbin" +spect_format = "npz" annot_format = "notmat" labelset = "iabcdefghjk" train_dur = 50 diff --git a/tests/data_for_tests/configs/TweetyNet_train_continue_audio_wav_annot_birdsongrec.toml b/tests/data_for_tests/configs/TweetyNet_train_continue_audio_wav_annot_birdsongrec.toml deleted file mode 100644 index d995aa4d5..000000000 --- a/tests/data_for_tests/configs/TweetyNet_train_continue_audio_wav_annot_birdsongrec.toml +++ /dev/null @@ -1,53 +0,0 @@ -[PREP] -dataset_type = "frame classification" -input_type = "spect" -labelset = "012345678" -data_dir = "./tests/data_for_tests/source/audio_wav_annot_birdsongrec/Bird0" -output_dir = "./tests/data_for_tests/generated/prep/train/audio_wav_annot_birdsong-recognition-dataset/TweetyNet" -audio_format = "wav" -annot_format = "birdsong-recognition-dataset" -annot_file = "./tests/data_for_tests/source/audio_wav_annot_birdsongrec/Bird0/Annotation.xml" -test_dur = 50 -train_dur = 15 -val_dur = 30 - -[SPECT_PARAMS] -fft_size = 512 -step_size = 64 -freq_cutoffs = [ 500, 10000,] -thresh = 6.25 -transform_type = "log_spect" - -[TRAIN] -model = "TweetyNet" -normalize_spectrograms = true -batch_size = 11 -num_epochs = 2 -val_step = 50 -ckpt_step = 200 -patience = 4 -num_workers = 16 -device = "cuda" -root_results_dir = "./tests/data_for_tests/generated/results/train_continue/audio_wav_annot_birdsong-recognition-dataset/TweetyNet" -checkpoint_path = "~/Documents/repos/coding/birdsong/TweetyNet/results/BFSongRepository/gy6or6/results_200620_165308/TweetyNet/checkpoints/max-val-acc-checkpoint.pt" -spect_scaler_path = "~/Documents/repos/coding/birdsong/TweetyNet/results/BFSongRepository/gy6or6/results_200620_165308/StandardizeSpect" - -[TRAIN.train_dataset_params] -window_size = 88 - -[TRAIN.val_transform_params] -window_size = 88 - -[TweetyNet.network] -conv1_filters = 8 -conv1_kernel_size = [3, 3] -conv2_filters = 16 -conv2_kernel_size 
= [5, 5] -pool1_size = [4, 1] -pool1_stride = [4, 1] -pool2_size = [4, 1] -pool2_stride = [4, 1] -hidden_size = 32 - -[TweetyNet.optimizer] -lr = 0.001 diff --git a/tests/data_for_tests/configs/configs.json b/tests/data_for_tests/configs/configs.json index d92c5e674..bff69985e 100644 --- a/tests/data_for_tests/configs/configs.json +++ b/tests/data_for_tests/configs/configs.json @@ -3,120 +3,117 @@ { "filename": "TweetyNet_train_audio_cbin_annot_notmat.toml", "model": "TweetyNet", + "model_family": "frame_classification", "config_type": "train", "audio_format": "cbin", "spect_format": null, "annot_format": "notmat", + "spect_output_dir": "audio_cbin_annot_notmat/gy6or6/032312", + "data_dir": null, "use_dataset_from_config": null, "use_result_from_config": null }, { "filename": "TweetyNet_learncurve_audio_cbin_annot_notmat.toml", "model": "TweetyNet", + "model_family": "frame_classification", "config_type": "learncurve", "audio_format": "cbin", "spect_format": null, "annot_format": "notmat", + "spect_output_dir": null, + "data_dir": "spect-output-dir/audio_cbin_annot_notmat/gy6or6/032312", "use_dataset_from_config": null, "use_result_from_config": null }, { "filename": "TweetyNet_eval_audio_cbin_annot_notmat.toml", "model": "TweetyNet", + "model_family": "frame_classification", "config_type": "eval", "audio_format": "cbin", "spect_format": null, "annot_format": "notmat", + "spect_output_dir": "audio_cbin_annot_notmat/gy6or6/032412", + "data_dir": null, "use_dataset_from_config": null, "use_result_from_config": "TweetyNet_train_audio_cbin_annot_notmat.toml" }, { "filename": "TweetyNet_predict_audio_cbin_annot_notmat.toml", "model": "TweetyNet", + "model_family": "frame_classification", "config_type": "predict", "audio_format": "cbin", "spect_format": null, "annot_format": "notmat", + "spect_output_dir": null, + "data_dir": "spect-output-dir/audio_cbin_annot_notmat/gy6or6/032412", "use_dataset_from_config": null, "use_result_from_config": 
"TweetyNet_train_audio_cbin_annot_notmat.toml" }, { "filename": "TweetyNet_train_continue_audio_cbin_annot_notmat.toml", "model": "TweetyNet", + "model_family": "frame_classification", "config_type": "train_continue", "audio_format": "cbin", "spect_format": null, "annot_format": "notmat", + "spect_output_dir": null, + "data_dir": "spect-output-dir/audio_cbin_annot_notmat/gy6or6/032312", "use_dataset_from_config": "TweetyNet_train_audio_cbin_annot_notmat.toml", "use_result_from_config": "TweetyNet_train_audio_cbin_annot_notmat.toml" }, - { - "filename": "TweetyNet_train_audio_wav_annot_birdsongrec.toml", - "model": "TweetyNet", - "config_type": "train", - "audio_format": "wav", - "spect_format": null, - "annot_format": "birdsong-recognition-dataset", - "use_dataset_from_config": null, - "use_result_from_config": null - }, - { - "filename": "TweetyNet_predict_audio_wav_annot_birdsongrec.toml", - "model": "TweetyNet", - "config_type": "predict", - "audio_format": "wav", - "spect_format": null, - "annot_format": "birdsong-recognition-dataset", - "use_dataset_from_config": null, - "use_result_from_config": "TweetyNet_train_audio_wav_annot_birdsongrec.toml" - }, - { - "filename": "TweetyNet_train_continue_audio_wav_annot_birdsongrec.toml", - "model": "TweetyNet", - "config_type": "train_continue", - "audio_format": "wav", - "spect_format": null, - "annot_format": "birdsong-recognition-dataset", - "use_dataset_from_config": "TweetyNet_train_audio_wav_annot_birdsongrec.toml", - "use_result_from_config": "TweetyNet_train_audio_wav_annot_birdsongrec.toml" - }, { "filename": "TweetyNet_train_spect_mat_annot_yarden.toml", "model": "TweetyNet", + "model_family": "frame_classification", "config_type": "train", "audio_format": null, "spect_format": "mat", "annot_format": "yarden", + "spect_output_dir": null, + "data_dir": null, "use_dataset_from_config": null, "use_result_from_config": null }, { "filename": "TweetyNet_train_continue_spect_mat_annot_yarden.toml", "model": 
"TweetyNet", + "model_family": "frame_classification", "config_type": "train_continue", "audio_format": null, "spect_format": "mat", "annot_format": "yarden", + "spect_output_dir": null, + "data_dir": null, "use_dataset_from_config": "TweetyNet_train_spect_mat_annot_yarden.toml", "use_result_from_config": "TweetyNet_train_spect_mat_annot_yarden.toml" }, { "filename": "ConvEncoderUMAP_train_audio_cbin_annot_notmat.toml", "model": "ConvEncoderUMAP", + "model_family": "parametric_umap", "config_type": "train", "audio_format": "cbin", "spect_format": null, "annot_format": "notmat", + "spect_output_dir": null, + "data_dir": null, "use_dataset_from_config": null, "use_result_from_config": null }, { "filename": "ConvEncoderUMAP_eval_audio_cbin_annot_notmat.toml", "model": "ConvEncoderUMAP", + "model_family": "parametric_umap", "config_type": "eval", "audio_format": "cbin", "spect_format": null, "annot_format": "notmat", + "spect_output_dir": null, + "data_dir": null, "use_dataset_from_config": "ConvEncoderUMAP_train_audio_cbin_annot_notmat.toml", "use_result_from_config": "ConvEncoderUMAP_train_audio_cbin_annot_notmat.toml" } diff --git a/tests/fixtures/__init__.py b/tests/fixtures/__init__.py index e910c8357..ac174ea00 100644 --- a/tests/fixtures/__init__.py +++ b/tests/fixtures/__init__.py @@ -8,6 +8,7 @@ from .device import * from .model import * from .path import * +from .source_files import * from .spect import * from .split import * from .test_data import * diff --git a/tests/fixtures/annot.py b/tests/fixtures/annot.py index f140abb48..40ba6dec0 100644 --- a/tests/fixtures/annot.py +++ b/tests/fixtures/annot.py @@ -87,84 +87,8 @@ def labelset_notmat(generated_test_configs_root): return LABELSET_NOTMAT -ANNOT_FILE_BIRDSONGREC = SOURCE_TEST_DATA_ROOT.joinpath( - "audio_wav_annot_birdsongrec", "Bird0", "Annotation.xml" -) - - -@pytest.fixture -def annot_file_birdsongrec(): - return ANNOT_FILE_BIRDSONGREC - - -scribe_birdsongrec = 
crowsetta.Transcriber(format="birdsong-recognition-dataset") -ANNOT_LIST_BIRDSONGREC = scribe_birdsongrec.from_file(ANNOT_FILE_BIRDSONGREC).to_annot() - - -@pytest.fixture -def annot_list_birdsongrec(): - return ANNOT_LIST_BIRDSONGREC - - -ANNOT_DIR_TEXTGRID = SOURCE_TEST_DATA_ROOT.joinpath("audio_wav_annot_textgrid", "AGBk") - - -@pytest.fixture -def annot_dir_textgrid(): - return ANNOT_DIR_TEXTGRID - - -ANNOT_FILES_TEXTGRID = sorted(ANNOT_DIR_TEXTGRID.glob("*.TextGrid")) - - -@pytest.fixture -def annot_files_textgrid(): - return ANNOT_FILES_TEXTGRID - - -scribe_textgrid = crowsetta.Transcriber(format="textgrid") -ANNOT_LIST_TEXTGRID = [scribe_textgrid.from_file(textgrid).to_annot() - for textgrid in ANNOT_FILES_TEXTGRID] - - -@pytest.fixture -def annot_list_textgrid(): - return ANNOT_LIST_TEXTGRID - - -ANNOT_DIR_SIMPLE_SEQ = SOURCE_TEST_DATA_ROOT.joinpath( - "audio_cbin_annot_simple_seq", "gy6or6", "032312" -) - - -@pytest.fixture -def annot_dir_simple_seq(): - return ANNOT_DIR_SIMPLE_SEQ - - -ANNOT_FILES_SIMPLE_SEQ = sorted(ANNOT_DIR_SIMPLE_SEQ.glob("*.cbin.csv")) - - -@pytest.fixture -def annot_files_simple_seq(): - return ANNOT_FILES_SIMPLE_SEQ - - -scribe_simple_seq = crowsetta.Transcriber(format="simple-seq") -ANNOT_LIST_SIMPLE_SEQ = [scribe_simple_seq.from_file(simpleseq) - for simpleseq in ANNOT_FILES_SIMPLE_SEQ] - - -@pytest.fixture -def annot_list_simple_seq(): - return ANNOT_LIST_SIMPLE_SEQ - - @pytest.fixture -def specific_annot_list(annot_list_birdsongrec, - annot_list_notmat, - annot_list_simple_seq, - annot_list_textgrid, +def specific_annot_list(annot_list_notmat, annot_list_yarden): """factory fixture, returns a function that returns a fixture containing a list of Annotation objects, @@ -173,10 +97,7 @@ def specific_annot_list(annot_list_birdsongrec, so that unit tests can be parameterized with annotation format names """ FORMAT_ANNOT_LIST_FIXTURE_MAP = { - "birdsong-recognition-dataset": annot_list_birdsongrec, "notmat": annot_list_notmat, - 
"simple-seq": annot_list_simple_seq, - "textgrid": annot_list_textgrid, "yarden": annot_list_yarden, } diff --git a/tests/fixtures/audio.py b/tests/fixtures/audio.py index 4753a4097..94121e793 100644 --- a/tests/fixtures/audio.py +++ b/tests/fixtures/audio.py @@ -64,42 +64,8 @@ def audio_list_cbin_labels_not_in_labelset(): return AUDIO_LIST_CBIN_LABELS_NOT_IN_LABELSET -AUDIO_DIR_WAV_BIRDSONGREC = SOURCE_TEST_DATA_ROOT.joinpath("audio_wav_annot_birdsongrec", "Bird0", "Wave") - - -@pytest.fixture -def audio_dir_wav_birdsongrec(): - return AUDIO_DIR_WAV_BIRDSONGREC - - -AUDIO_LIST_WAV_BIRDSONGREC = sorted(AUDIO_DIR_WAV_BIRDSONGREC.glob("*.wav")) - - -@pytest.fixture -def audio_list_wav_birdsongrec(): - return AUDIO_LIST_WAV_BIRDSONGREC - - -AUDIO_DIR_WAV_TEXTGRID = SOURCE_TEST_DATA_ROOT.joinpath("audio_wav_annot_textgrid", "AGBk") - - -@pytest.fixture -def audio_dir_wav_textgrid(): - return AUDIO_DIR_WAV_TEXTGRID - - -AUDIO_LIST_WAV_TEXTGRID = sorted(AUDIO_DIR_WAV_TEXTGRID.glob("*.WAV")) - - -@pytest.fixture -def audio_list_wav_textgrid(): - return AUDIO_LIST_WAV_TEXTGRID - - @pytest.fixture -def audio_list_factory(audio_list_cbin, - audio_list_wav_birdsongrec, - audio_list_wav_textgrid): +def audio_list_factory(audio_list_cbin): """factory fixture, returns a function that returns a fixture containing a list of Annotation objects, given a specified annotation format @@ -108,9 +74,6 @@ def audio_list_factory(audio_list_cbin, """ FORMAT_AUDIO_LIST_FIXTURE_MAP = { "audio_cbin_annot_notmat": audio_list_cbin, - "audio_cbin_annot_simple-seq": audio_list_cbin, - "audio_wav_annot_birdsong-recognition-dataset": audio_list_wav_birdsongrec, - "audio_wav_annot_textgrid": audio_list_wav_textgrid, } def _audio_list_factory(audio_format, annot_format): diff --git a/tests/fixtures/config.py b/tests/fixtures/config.py index ac4c74573..dae6e50f4 100644 --- a/tests/fixtures/config.py +++ b/tests/fixtures/config.py @@ -17,7 +17,7 @@ def test_configs_root(): Two types of config files in 
this directory: 1) those used by the tests/scripts/generate_data_for_tests.py script. - Will be listed in configs.json. See ``specific_config`` fixture below + Will be listed in configs.json. See ``specific_config_toml_path`` fixture below for details about types of configs. 2) those used by tests that are static, e.g., ``invalid_section_config.toml`` @@ -47,7 +47,7 @@ def list_of_schematized_configs(test_configs_root): "annot_format": "notmat" } - The ``specific_config`` factory fixture returns a function that + The ``specific_config_toml_path`` factory fixture returns a function that itself return a configuration ``filename``, when provided values for all of the other keys. """ @@ -88,7 +88,7 @@ def all_generated_configs(): @pytest.fixture -def specific_config(generated_test_configs_root, list_of_schematized_configs, tmp_path): +def specific_config_toml_path(generated_test_configs_root, list_of_schematized_configs, tmp_path): """returns a factory function that will return the path to a specific configuration file, determined by @@ -226,7 +226,7 @@ def _return_toml(toml_path): @pytest.fixture -def specific_config_toml(specific_config): +def specific_config_toml(specific_config_toml_path): """returns a function that will return a dict containing parsed toml from a specific configuration file, determined by @@ -241,7 +241,7 @@ def _specific_config_toml( audio_format=None, spect_format=None, ): - config_path = specific_config( + config_path = specific_config_toml_path( config_type, model, annot_format, audio_format, spect_format ) return _return_toml(config_path) diff --git a/tests/fixtures/source_files.py b/tests/fixtures/source_files.py new file mode 100644 index 000000000..935c80c13 --- /dev/null +++ b/tests/fixtures/source_files.py @@ -0,0 +1,108 @@ +"""Fixtures having to do with source files, i.e., +the "raw" files that go into a data set +used with neural networks +""" +import pandas as pd +import pytest + +from .test_data import GENERATED_TEST_DATA_ROOT + +# 
copied from vaktestdata.constants; could we add this to that with sys.path? or vice versa +GENERATED_SOURCE_FILES_CSV_DIR = GENERATED_TEST_DATA_ROOT / "source-files-csv" +GENERATED_SOURCE_FILES_WITH_SPLITS_CSV_DIR = GENERATED_TEST_DATA_ROOT / "source-files-with-splits-csv" + + +@pytest.fixture +def specific_source_files_csv_path(): + """Factory fixture that returns a specific source file csv""" + def _specific_source_files_csv_path( + config_type, + model_name, + annot_format, + audio_format=None, + spect_format=None, + ): + if audio_format and spect_format: + raise ValueError( + "Specify audio_format or spect_format, not both" + ) + if audio_format: + csv_filename = f'{model_name}_{config_type}_audio_{audio_format}_annot_{annot_format}.toml-source-files.csv' + elif spect_format: + csv_filename = f'{model_name}_{config_type}_spect_{spect_format}_annot_{annot_format}.toml-source-files.csv' + csv_path = GENERATED_SOURCE_FILES_CSV_DIR / csv_filename + return csv_path + + return _specific_source_files_csv_path + + +@pytest.fixture +def specific_source_files_df( + specific_source_files_csv_path +): + """Factory fixture that returns a specific source file csv""" + def _specific_source_files_df( + config_type, + model_name, + annot_format, + audio_format=None, + spect_format=None, + ): + csv_path = specific_source_files_csv_path( + config_type, + model_name, + annot_format, + audio_format, + spect_format, + ) + df = pd.read_csv(csv_path) + return df + return _specific_source_files_df + + +@pytest.fixture +def specific_source_files_with_split_csv_path(): + """Factory fixture that returns a specific source file csv""" + def _specific_source_files_with_split_csv_path( + config_type, + model_name, + annot_format, + audio_format=None, + spect_format=None, + ): + if audio_format and spect_format: + raise ValueError( + "Specify audio_format or spect_format, not both" + ) + if audio_format: + csv_filename = 
f'{model_name}_{config_type}_audio_{audio_format}_annot_{annot_format}.toml-source-files-with-split.csv' + elif spect_format: + csv_filename = f'{model_name}_{config_type}_spect_{spect_format}_annot_{annot_format}.toml-source-files-with-split.csv' + csv_path = GENERATED_SOURCE_FILES_WITH_SPLITS_CSV_DIR / csv_filename + return csv_path + + return _specific_source_files_with_split_csv_path + + +@pytest.fixture +def specific_source_files_with_split_df( + specific_source_files_with_split_csv_path +): + """Factory fixture that returns a specific source file csv""" + def _specific_source_files_with_split_df( + config_type, + model_name, + annot_format, + audio_format=None, + spect_format=None, + ): + csv_path = specific_source_files_with_split_csv_path( + config_type, + model_name, + annot_format, + audio_format, + spect_format, + ) + df = pd.read_csv(csv_path) + return df + return _specific_source_files_with_split_df diff --git a/tests/fixtures/spect.py b/tests/fixtures/spect.py index 53bc10f27..9aa8ec402 100644 --- a/tests/fixtures/spect.py +++ b/tests/fixtures/spect.py @@ -22,11 +22,8 @@ def spect_dir_mat(): return SPECT_DIR_MAT -SPECT_DIR_NPZ = sorted( - GENERATED_TEST_DATA_ROOT.joinpath( - "prep", "train", "audio_cbin_annot_notmat", "TweetyNet" - ).glob("*vak-frame-classification-dataset-generated*/spectrograms_generated_*") - )[0] +# this dir is created by ./tests/scripts/generate_test_data +SPECT_DIR_NPZ = GENERATED_TEST_DATA_ROOT / "spect-output-dir/audio_cbin_annot_notmat/gy6or6/032312" @pytest.fixture diff --git a/tests/scripts/generate_data_for_tests.py b/tests/scripts/generate_data_for_tests.py index c4dd83ccd..db54f972f 100644 --- a/tests/scripts/generate_data_for_tests.py +++ b/tests/scripts/generate_data_for_tests.py @@ -65,7 +65,7 @@ import vaktestdata -logger = logging.getLogger() # 'base' logger +logger = logging.getLogger('vaktestdata') # 'base' logger formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') stream_handler 
= logging.StreamHandler(sys.stdout) stream_handler.setFormatter(formatter) @@ -101,7 +101,17 @@ def generate_test_data( """ # need to run `prep` before we run other commands if step in ('prep', 'all'): + # first we generate outputs of processing steps + # leading up to a dataset that speed up tests + vaktestdata.dirs.make_spect_output_dir_in_generated() # for any prepared spectrograms + vaktestdata.dirs.make_source_files_csv_dir_in_generated() # for csvs of source files + vaktestdata.dirs.make_source_files_with_splits_csv_dir_in_generated() # same csvs, with splits added + + # -- now actually run prep for all the configs config_paths = vaktestdata.configs.copy_config_files() + + vaktestdata.source_files.set_up_source_files_and_csv_files_for_frame_classification_models() + vaktestdata.dirs.make_subdirs_in_generated(config_paths) # run prep for some models vaktestdata.prep.run_prep() @@ -114,7 +124,7 @@ def generate_test_data( for command in commands: if command == "prep": continue # we don't run prep in this code block - print(f"running configs for command: {command}") + logger.info(f"running configs for command: {command}") command_config_metadata = [ config_metadata for config_metadata in vaktestdata.constants.CONFIG_METADATA @@ -134,7 +144,7 @@ def generate_test_data( for config_metadata in command_config_metadata: config_path = vaktestdata.constants.GENERATED_TEST_CONFIGS_ROOT / config_metadata.filename - print( + logger.info( f"\nRunning 'vak {command}', using config: {config_path.name}" ) vak.cli.cli.cli(command, config_path) diff --git a/tests/scripts/vaktestdata/__init__.py b/tests/scripts/vaktestdata/__init__.py index f12f6b06c..cf29ad1f9 100644 --- a/tests/scripts/vaktestdata/__init__.py +++ b/tests/scripts/vaktestdata/__init__.py @@ -5,4 +5,5 @@ dirs, parser, prep, + source_files, ) diff --git a/tests/scripts/vaktestdata/config_metadata.py b/tests/scripts/vaktestdata/config_metadata.py index 73402c848..5154e7b32 100644 ---
a/tests/scripts/vaktestdata/config_metadata.py +++ b/tests/scripts/vaktestdata/config_metadata.py @@ -3,11 +3,61 @@ @attrs.define class ConfigMetadata: - filename: str = attrs.field() - model: str = attrs.field() - config_type: str = attrs.field() - audio_format: str = attrs.field() - spect_format: str = attrs.field() - annot_format: str = attrs.field() - use_dataset_from_config = attrs.field(default=None) - use_result_from_config = attrs.field(default=None) + """Dataclass that represents metadata + about a configuration file + + Attributes + ---------- + filename : str + The name of the configuration file. + model : str + The name of the model in :mod:`vak` + that the configuration file is used with. + model_family : str + The name of the model family + for the model in the configuration file. + config_type : str + The type of config, one of + {'train', 'eval', 'predict', 'learncurve'}. + audio_format : str + The format of the audio files. + spect_format : str + The format of the spectrogram files. + spect_output_dir : str, optional + The directory where spectrograms should be saved + when generated for this configuration file. + If not specified, then no spectrograms are generated. + This attribute is used to avoid repeatedly + generating the same set of spectrograms for multiple + configs. + data_dir : str, optional + The directory that should be used as the `data_dir` + option for this config. + The option will be changed to this value in the generated + config file. + This attribute is used to avoid repeatedly + generating the same set of spectrograms for multiple + configs. + use_dataset_from_config : str, optional + The filename of another configuration file. + The ``dataset_path`` option of that configuration file + will be used for this configuration file. + This option is used to avoid repeatedly + generating the same dataset for multiple configs. + use_result_from_config : str, optional + The filename of another configuration file.
+ The most recent results from ``results_path`` option + of that configuration file + will be used for this configuration file. + """ + filename: str = attrs.field(converter=str) + model: str = attrs.field(converter=str) + model_family: str = attrs.field(converter=str) + config_type: str = attrs.field(converter=str) + audio_format: str = attrs.field(converter=attrs.converters.optional(str), default=None) + spect_format: str = attrs.field(converter=attrs.converters.optional(str), default=None) + annot_format: str = attrs.field(converter=attrs.converters.optional(str), default=None) + spect_output_dir: str = attrs.field(converter=attrs.converters.optional(str), default=None) + data_dir: str = attrs.field(converter=attrs.converters.optional(str), default=None) + use_dataset_from_config: str = attrs.field(converter=attrs.converters.optional(str), default=None) + use_result_from_config: str = attrs.field(converter=attrs.converters.optional(str), default=None) diff --git a/tests/scripts/vaktestdata/constants.py b/tests/scripts/vaktestdata/constants.py index 867ebf36b..705c93d96 100644 --- a/tests/scripts/vaktestdata/constants.py +++ b/tests/scripts/vaktestdata/constants.py @@ -13,8 +13,13 @@ ConfigMetadata(**config_metadata_dict) for config_metadata_dict in CONFIG_METADATA_LIST ] -GENERATED_TEST_DATA = TEST_DATA_ROOT / "generated" -GENERATED_TEST_CONFIGS_ROOT = GENERATED_TEST_DATA / "configs" +GENERATED_TEST_DATA_ROOT = TEST_DATA_ROOT / "generated" + +GENERATED_SPECT_OUTPUT_DIR = GENERATED_TEST_DATA_ROOT / "spect-output-dir" +GENERATED_SOURCE_FILES_CSV_DIR = GENERATED_TEST_DATA_ROOT / "source-files-csv" +GENERATED_SOURCE_FILES_WITH_SPLITS_CSV_DIR = GENERATED_TEST_DATA_ROOT / "source-files-with-splits-csv" + +GENERATED_TEST_CONFIGS_ROOT = GENERATED_TEST_DATA_ROOT / "configs" # convention is that all the config.toml files in tests/data_for_tests/configs # that should be run when generating test data diff --git a/tests/scripts/vaktestdata/dirs.py 
b/tests/scripts/vaktestdata/dirs.py index 8debd01f1..bbb764031 100644 --- a/tests/scripts/vaktestdata/dirs.py +++ b/tests/scripts/vaktestdata/dirs.py @@ -7,6 +7,16 @@ logger = logging.getLogger(__name__) +def make_spect_output_dir_in_generated(): + constants.GENERATED_SPECT_OUTPUT_DIR.mkdir() + +def make_source_files_csv_dir_in_generated(): + constants.GENERATED_SOURCE_FILES_CSV_DIR.mkdir() + +def make_source_files_with_splits_csv_dir_in_generated(): + constants.GENERATED_SOURCE_FILES_WITH_SPLITS_CSV_DIR.mkdir() + + def make_subdirs_in_generated(config_paths): """make sub-directories inside ./tests/data_for_tests/generated @@ -26,7 +36,7 @@ def make_subdirs_in_generated(config_paths): for top_level_dir in constants.TOP_LEVEL_DIRS: # datasets / results subdir_to_make = ( - constants.GENERATED_TEST_DATA / top_level_dir + constants.GENERATED_TEST_DATA_ROOT / top_level_dir ) logger.info( f"Making sub-directory: {subdir_to_make}" @@ -47,7 +57,7 @@ def make_subdirs_in_generated(config_paths): if config_metadata.use_dataset_from_config is None: # we need to make dataset dir subdir_to_make = ( - constants.GENERATED_TEST_DATA / 'prep' / config_type / data_dir / model + constants.GENERATED_TEST_DATA_ROOT / 'prep' / config_type / data_dir / model ) logger.info( f"Making sub-directory: {subdir_to_make}" @@ -55,7 +65,7 @@ def make_subdirs_in_generated(config_paths): subdir_to_make.mkdir(parents=True) subdir_to_make = ( - constants.GENERATED_TEST_DATA / 'results' / config_type / data_dir / model + constants.GENERATED_TEST_DATA_ROOT / 'results' / config_type / data_dir / model ) logger.info( f"Making sub-directory: {subdir_to_make}" diff --git a/tests/scripts/vaktestdata/source_files.py b/tests/scripts/vaktestdata/source_files.py new file mode 100644 index 000000000..e53d0e2ee --- /dev/null +++ b/tests/scripts/vaktestdata/source_files.py @@ -0,0 +1,194 @@ +# Do this here to suppress warnings before we import vak +import logging +import shutil +import warnings + +from 
numba.core.errors import NumbaDeprecationWarning +warnings.simplefilter('ignore', category=NumbaDeprecationWarning) + +import pandas as pd +import toml + +import vak + +from . import constants + + +logger = logging.getLogger(__name__) + + +def set_up_source_files_and_csv_files_for_frame_classification_models(): + """Set up source files and csv files + used when testing functionality for frame classification models. + + This function does the following + - First, get only config files that have the model family set to "frame_classification" + - Then for all those config files: + - Generate spectrograms for all the ones that have "spect_output_dir" + - Then for all the other ones that have "data_dir", set that option in the config file + - Then for *all* the config files, run `get_or_make_source_files` (again) + - to get a source files dataframe + - and save this to csv + - and then save it again with a ``'split'`` column added + """ + # first just get configs we're going to prep later + configs_to_make_spectrograms = [ + config_metadata + for config_metadata in constants.CONFIG_METADATA + if config_metadata.model_family == "frame_classification" and config_metadata.spect_output_dir is not None + ] + + for config_metadata in configs_to_make_spectrograms: + spect_output_dir = constants.GENERATED_SPECT_OUTPUT_DIR / config_metadata.spect_output_dir + spect_output_dir.mkdir(parents=True) + + config_path = constants.GENERATED_TEST_CONFIGS_ROOT / config_metadata.filename + logger.info( + f"\nRunning :func:`vak.prep.frame_classification.get_or_make_source_files` to generate data for tests, " + f"using config:\n{config_path.name}" + ) + cfg = vak.config.parse.from_toml_path(config_path) + + source_files_df: pd.DataFrame = vak.prep.frame_classification.get_or_make_source_files( + data_dir=cfg.prep.data_dir, + input_type=cfg.prep.input_type, + audio_format=cfg.prep.audio_format, + spect_format=cfg.prep.spect_format, + spect_params=cfg.spect_params, + 
spect_output_dir=spect_output_dir, + annot_format=cfg.prep.annot_format, + annot_file=cfg.prep.annot_file, + labelset=cfg.prep.labelset, + audio_dask_bag_kwargs=cfg.prep.audio_dask_bag_kwargs, + ) + + # We copy annotation files to spect_output_dir + # so we can "prep" from that directory later. + # This means we have repeats of some files still, which is annoying; + # .not.mat files are about ~1.2K though + for annot_path in source_files_df['annot_path'].values: + shutil.copy(annot_path, spect_output_dir) + + csv_path = constants.GENERATED_SOURCE_FILES_CSV_DIR / f'{config_metadata.filename}-source-files.csv' + source_files_df.to_csv(csv_path, index=False) + + config_toml: dict = vak.config.parse._load_toml_from_path(config_path) + purpose = vak.cli.prep.purpose_from_toml(config_toml, config_path) + dataset_df: pd.DataFrame = vak.prep.frame_classification.assign_samples_to_splits( + purpose, + source_files_df, + dataset_path=spect_output_dir, + train_dur=cfg.prep.train_dur, + val_dur=cfg.prep.val_dur, + test_dur=cfg.prep.test_dur, + labelset=cfg.prep.labelset, + ) + source_files_with_split_csv_path = ( + constants.GENERATED_SOURCE_FILES_WITH_SPLITS_CSV_DIR / + f'{config_metadata.filename}-source-files-with-split.csv' + ) + dataset_df.to_csv(source_files_with_split_csv_path) + + configs_to_add_data_dir = [ + config_metadata + for config_metadata in constants.CONFIG_METADATA + if config_metadata.model_family == "frame_classification" and config_metadata.data_dir is not None + ] + + for config_metadata in configs_to_add_data_dir: + config_path = constants.GENERATED_TEST_CONFIGS_ROOT / config_metadata.filename + logger.info( + f"\nRunning :func:`vak.prep.frame_classification.get_or_make_source_files` to generate data for tests, " + f"using config:\n{config_path.name}" + ) + + with config_path.open("r") as fp: + config_toml = toml.load(fp) + data_dir = constants.GENERATED_TEST_DATA_ROOT / config_metadata.data_dir + config_toml['PREP']['data_dir'] = str(data_dir) + with 
config_path.open("w") as fp: + toml.dump(config_toml, fp) + + cfg = vak.config.parse.from_toml_path(config_path) + + source_files_df: pd.DataFrame = vak.prep.frame_classification.get_or_make_source_files( + data_dir=cfg.prep.data_dir, + input_type=cfg.prep.input_type, + audio_format=cfg.prep.audio_format, + spect_format=cfg.prep.spect_format, + spect_params=cfg.spect_params, + spect_output_dir=None, + annot_format=cfg.prep.annot_format, + annot_file=cfg.prep.annot_file, + labelset=cfg.prep.labelset, + audio_dask_bag_kwargs=cfg.prep.audio_dask_bag_kwargs, + ) + + csv_path = constants.GENERATED_SOURCE_FILES_CSV_DIR / f'{config_metadata.filename}-source-files.csv' + source_files_df.to_csv(csv_path, index=False) + + config_toml: dict = vak.config.parse._load_toml_from_path(config_path) + purpose = vak.cli.prep.purpose_from_toml(config_toml, config_path) + dataset_df: pd.DataFrame = vak.prep.frame_classification.assign_samples_to_splits( + purpose, + source_files_df, + dataset_path=data_dir, + train_dur=cfg.prep.train_dur, + val_dur=cfg.prep.val_dur, + test_dur=cfg.prep.test_dur, + labelset=cfg.prep.labelset, + ) + source_files_with_split_csv_path = ( + constants.GENERATED_SOURCE_FILES_WITH_SPLITS_CSV_DIR / + f'{config_metadata.filename}-source-files-with-split.csv' + ) + dataset_df.to_csv(source_files_with_split_csv_path) + + configs_without_spect_output_or_data_dir_to_change = [ + config_metadata + for config_metadata in constants.CONFIG_METADATA + if config_metadata.model_family == "frame_classification" and ( + config_metadata.spect_output_dir is None and config_metadata.data_dir is None + ) + ] + for config_metadata in configs_without_spect_output_or_data_dir_to_change: + config_path = constants.GENERATED_TEST_CONFIGS_ROOT / config_metadata.filename + if not config_path.exists(): + raise FileNotFoundError(f"{config_path} not found") + logger.info( + f"\nRunning :func:`vak.prep.frame_classification.get_or_make_source_files` to generate data for tests, " + f"using 
config:\n{config_path.name}" + ) + cfg = vak.config.parse.from_toml_path(config_path) + source_files_df: pd.DataFrame = vak.prep.frame_classification.get_or_make_source_files( + data_dir=cfg.prep.data_dir, + input_type=cfg.prep.input_type, + audio_format=cfg.prep.audio_format, + spect_format=cfg.prep.spect_format, + spect_params=cfg.spect_params, + spect_output_dir=None, + annot_format=cfg.prep.annot_format, + annot_file=cfg.prep.annot_file, + labelset=cfg.prep.labelset, + audio_dask_bag_kwargs=cfg.prep.audio_dask_bag_kwargs, + ) + + csv_path = constants.GENERATED_SOURCE_FILES_CSV_DIR / f'{config_metadata.filename}-source-files.csv' + source_files_df.to_csv(csv_path, index=False) + + config_toml: dict = vak.config.parse._load_toml_from_path(config_path) + purpose = vak.cli.prep.purpose_from_toml(config_toml, config_path) + dataset_df: pd.DataFrame = vak.prep.frame_classification.assign_samples_to_splits( + purpose, + source_files_df, + dataset_path=cfg.prep.data_dir, + train_dur=cfg.prep.train_dur, + val_dur=cfg.prep.val_dur, + test_dur=cfg.prep.test_dur, + labelset=cfg.prep.labelset, + ) + source_files_with_split_csv_path = ( + constants.GENERATED_SOURCE_FILES_WITH_SPLITS_CSV_DIR / + f'{config_metadata.filename}-source-files-with-split.csv' + ) + dataset_df.to_csv(source_files_with_split_csv_path) diff --git a/tests/test_cli/test_eval.py b/tests/test_cli/test_eval.py index 3e0c5ef99..f94f68f46 100644 --- a/tests/test_cli/test_eval.py +++ b/tests/test_cli/test_eval.py @@ -18,7 +18,7 @@ ], ) def test_eval( - model_name, audio_format, spect_format, annot_format, specific_config, tmp_path, device + model_name, audio_format, spect_format, annot_format, specific_config_toml_path, tmp_path, device ): output_dir = tmp_path.joinpath( f"test_eval_{audio_format}_{spect_format}_{annot_format}" @@ -30,7 +30,7 @@ def test_eval( {"section": "EVAL", "option": "device", "value": device}, ] - toml_path = specific_config( + toml_path = specific_config_toml_path( config_type="eval", 
model=model_name, audio_format=audio_format, @@ -49,7 +49,7 @@ def test_eval( def test_eval_dataset_path_none_raises( - specific_config, tmp_path, + specific_config_toml_path, tmp_path, ): """Test that cli.eval raises ValueError when dataset_path is None (presumably because `vak prep` was not run yet) @@ -58,7 +58,7 @@ def test_eval_dataset_path_none_raises( {"section": "EVAL", "option": "dataset_path", "value": "DELETE-OPTION"}, ] - toml_path = specific_config( + toml_path = specific_config_toml_path( config_type="eval", model="TweetyNet", audio_format="cbin", diff --git a/tests/test_cli/test_learncurve.py b/tests/test_cli/test_learncurve.py index abf3adaaa..8dce64302 100644 --- a/tests/test_cli/test_learncurve.py +++ b/tests/test_cli/test_learncurve.py @@ -10,7 +10,7 @@ from . import cli_asserts -def test_learncurve(specific_config, tmp_path, device): +def test_learncurve(specific_config_toml_path, tmp_path, device): root_results_dir = tmp_path.joinpath("test_learncurve_root_results_dir") root_results_dir.mkdir() @@ -23,7 +23,7 @@ def test_learncurve(specific_config, tmp_path, device): {"section": "LEARNCURVE", "option": "device", "value": device}, ] - toml_path = specific_config( + toml_path = specific_config_toml_path( config_type="learncurve", model="TweetyNet", audio_format="cbin", @@ -45,7 +45,7 @@ def test_learncurve(specific_config, tmp_path, device): def test_learning_curve_dataset_path_none_raises( - specific_config, tmp_path, + specific_config_toml_path, tmp_path, ): """Test that cli.learncurve.learning_curve raises ValueError when dataset_path is None @@ -66,7 +66,7 @@ def test_learning_curve_dataset_path_none_raises( "value": "DELETE-OPTION"}, ] - toml_path = specific_config( + toml_path = specific_config_toml_path( config_type="learncurve", model="TweetyNet", audio_format="cbin", diff --git a/tests/test_cli/test_predict.py b/tests/test_cli/test_predict.py index ff2764364..6269c01d9 100644 --- a/tests/test_cli/test_predict.py +++ 
b/tests/test_cli/test_predict.py @@ -14,11 +14,10 @@ "model_name, audio_format, spect_format, annot_format", [ ("TweetyNet", "cbin", None, "notmat"), - ("TweetyNet", "wav", None, "birdsong-recognition-dataset"), ], ) def test_predict( - model_name, audio_format, spect_format, annot_format, specific_config, tmp_path, device + model_name, audio_format, spect_format, annot_format, specific_config_toml_path, tmp_path, device ): output_dir = tmp_path.joinpath( f"test_predict_{audio_format}_{spect_format}_{annot_format}" @@ -30,7 +29,7 @@ def test_predict( {"section": "PREDICT", "option": "device", "value": device}, ] - toml_path = specific_config( + toml_path = specific_config_toml_path( config_type="predict", model=model_name, audio_format=audio_format, @@ -47,7 +46,7 @@ def test_predict( def test_predict_dataset_path_none_raises( - specific_config, tmp_path, + specific_config_toml_path, tmp_path, ): """Test that cli.predict raises ValueError when dataset_path is None (presumably because `vak prep` was not run yet) @@ -56,7 +55,7 @@ def test_predict_dataset_path_none_raises( {"section": "PREDICT", "option": "dataset_path", "value": "DELETE-OPTION"}, ] - toml_path = specific_config( + toml_path = specific_config_toml_path( config_type="predict", model="TweetyNet", audio_format="cbin", diff --git a/tests/test_cli/test_prep.py b/tests/test_cli/test_prep.py index fee7eaf7e..cfdd8453e 100644 --- a/tests/test_cli/test_prep.py +++ b/tests/test_cli/test_prep.py @@ -15,9 +15,7 @@ ("eval", "cbin", None, "notmat"), ("learncurve", "cbin", None, "notmat"), ("predict", "cbin", None, "notmat"), - ("predict", "wav", None, "birdsong-recognition-dataset"), ("train", "cbin", None, "notmat"), - ("train", "wav", None, "birdsong-recognition-dataset"), ("train", None, "mat", "yarden"), ], ) @@ -26,11 +24,11 @@ def test_purpose_from_toml( audio_format, spect_format, annot_format, - specific_config, + specific_config_toml_path, default_model, tmp_path, ): - toml_path = specific_config( + 
toml_path = specific_config_toml_path( config_type=config_type, model=default_model, audio_format=audio_format, @@ -47,9 +45,7 @@ def test_purpose_from_toml( ("eval", "cbin", None, "notmat"), ("learncurve", "cbin", None, "notmat"), ("predict", "cbin", None, "notmat"), - ("predict", "wav", None, "birdsong-recognition-dataset"), ("train", "cbin", None, "notmat"), - ("train", "wav", None, "birdsong-recognition-dataset"), ("train", None, "mat", "yarden"), ], ) @@ -58,7 +54,7 @@ def test_prep( audio_format, spect_format, annot_format, - specific_config, + specific_config_toml_path, default_model, tmp_path, dummy_tmpfile_csv, @@ -77,7 +73,7 @@ def test_prep( "value": None, }, ] - toml_path = specific_config( + toml_path = specific_config_toml_path( config_type=config_type, model=default_model, audio_format=audio_format, @@ -98,9 +94,7 @@ def test_prep( ("eval", "cbin", None, "notmat"), ("learncurve", "cbin", None, "notmat"), ("predict", "cbin", None, "notmat"), - ("predict", "wav", None, "birdsong-recognition-dataset"), ("train", "cbin", None, "notmat"), - ("train", "wav", None, "birdsong-recognition-dataset"), ("train", None, "mat", "yarden"), ], ) @@ -109,7 +103,7 @@ def test_prep_dataset_path_raises( audio_format, spect_format, annot_format, - specific_config, + specific_config_toml_path, default_model, tmp_path, @@ -122,7 +116,7 @@ def test_prep_dataset_path_raises( options_to_change = [ {"section": "PREP", "option": "output_dir", "value": str(output_dir)}, ] - toml_path = specific_config( + toml_path = specific_config_toml_path( config_type=config_type, model=default_model, audio_format=audio_format, diff --git a/tests/test_cli/test_train.py b/tests/test_cli/test_train.py index cb02736aa..c59716ff2 100644 --- a/tests/test_cli/test_train.py +++ b/tests/test_cli/test_train.py @@ -15,12 +15,11 @@ "model_name, audio_format, spect_format, annot_format", [ ("TweetyNet", "cbin", None, "notmat"), - ("TweetyNet", "wav", None, "birdsong-recognition-dataset"), ("TweetyNet", 
None, "mat", "yarden"), ], ) def test_train( - model_name, audio_format, spect_format, annot_format, specific_config, tmp_path, device + model_name, audio_format, spect_format, annot_format, specific_config_toml_path, tmp_path, device ): root_results_dir = tmp_path.joinpath("test_train_root_results_dir") root_results_dir.mkdir() @@ -34,7 +33,7 @@ def test_train( {"section": "TRAIN", "option": "device", "value": device}, ] - toml_path = specific_config( + toml_path = specific_config_toml_path( config_type="train", model=model_name, audio_format=audio_format, @@ -56,7 +55,7 @@ def test_train( def test_train_dataset_path_none_raises( - specific_config, tmp_path, + specific_config_toml_path, tmp_path, ): """Test that cli.train raises ValueError when dataset_path is None (presumably because `vak prep` was not run yet) @@ -69,7 +68,7 @@ def test_train_dataset_path_none_raises( {"section": "TRAIN", "option": "dataset_path", "value": "DELETE-OPTION"}, ] - toml_path = specific_config( + toml_path = specific_config_toml_path( config_type="train", model="TweetyNet", audio_format="cbin", diff --git a/tests/test_common/test_annotation.py b/tests/test_common/test_annotation.py index ab6e4511f..f32134421 100644 --- a/tests/test_common/test_annotation.py +++ b/tests/test_common/test_annotation.py @@ -72,9 +72,7 @@ def test_audio_stem_from_path_raises(path, audio_ext): @pytest.mark.parametrize( "source_type, source_format, annot_format, audio_ext", [ - ("audio", "wav", "birdsong-recognition-dataset", None), ("spect", "mat", "yarden", None), - ("audio", "wav", "birdsong-recognition-dataset", "wav"), ("spect", "mat", "yarden", "wav"), ], ) @@ -118,7 +116,6 @@ def test__map_using_notated_path( "source_type, source_format, annot_format, annotated_ext, method", [ ("audio", "cbin", "notmat", None, "remove"), - ("audio", "wav", "textgrid", None, "replace"), ], ) def test__map_using_ext( @@ -166,11 +163,7 @@ def test__map_using_ext( "source_type, source_format, annot_format, method", [ 
("audio", "cbin", "notmat", "remove"), - ("audio", "cbin", "simple-seq", "remove"), - ("audio", "wav", "birdsong-recognition-dataset", None), - ("audio", "wav", "textgrid", "replace"), ("spect", "mat", "yarden", None), - ("audio", "wav", "textgrid", "replace"), ], ) def test_map_annotated_to_annot( diff --git a/tests/test_common/test_files/test_files.py b/tests/test_common/test_files/test_files.py index 09f773d92..4cac95f40 100644 --- a/tests/test_common/test_files/test_files.py +++ b/tests/test_common/test_files/test_files.py @@ -64,29 +64,3 @@ def test_files_from_dir_with_cbin(audio_dir_cbin, audio_list_cbin): # files.from_dir returns str not Path, need to convert fixture audio_list_cbin = [str(audio_path) for audio_path in audio_list_cbin] assert sorted(audio_list_cbin) == sorted(files) - - -@pytest.mark.parametrize( - ("dir_path", "ext"), - [ - ("./tests/data_for_tests/source/audio_wav_annot_textgrid/AGBk/", "WAV"), - ("./tests/data_for_tests/source/audio_wav_annot_birdsongrec/Bird0/Wave", "wav"), - ], -) -def test_from_dir_is_case_insensitive(dir_path, ext): - files = vak.common.files.files.from_dir(dir_path, ext) - assert len(files) > 0 - assert all([str(file).endswith(ext) for file in files]) - - -@pytest.mark.parametrize( - ("dir_path", "ext"), - [ - ("./tests/data_for_tests/source/audio_wav_annot_textgrid/", "WAV"), - ("./tests/data_for_tests/source/audio_wav_annot_birdsongrec/Bird0", "wav"), - ], -) -def test_from_dir_searches_child_dir(dir_path, ext): - files = vak.common.files.files.from_dir(dir_path, ext) - assert len(files) > 0 - assert all([str(file).endswith(ext) for file in files]) diff --git a/tests/test_datasets/test_frame_classification/test_frames_dataset.py b/tests/test_datasets/test_frame_classification/test_frames_dataset.py index 953e9c3c1..a7674ec61 100644 --- a/tests/test_datasets/test_frame_classification/test_frames_dataset.py +++ b/tests/test_datasets/test_frame_classification/test_frames_dataset.py @@ -12,13 +12,13 @@ class 
TestWindowDataset: ] ) def test_from_dataset_path(self, config_type, model_name, audio_format, spect_format, annot_format, - split, specific_config): + split, specific_config_toml_path): """Test we can get a FramesDataset instance from the classmethod ``from_dataset_path``""" - toml_path = specific_config(config_type, - model_name, - audio_format=audio_format, - spect_format=spect_format, - annot_format=annot_format) + toml_path = specific_config_toml_path(config_type, + model_name, + audio_format=audio_format, + spect_format=spect_format, + annot_format=annot_format) cfg = vak.config.parse.from_toml_path(toml_path) cfg_command = getattr(cfg, config_type) diff --git a/tests/test_datasets/test_frame_classification/test_helper.py b/tests/test_datasets/test_frame_classification/test_helper.py new file mode 100644 index 000000000..2be8e4fdc --- /dev/null +++ b/tests/test_datasets/test_frame_classification/test_helper.py @@ -0,0 +1,46 @@ +import numpy as np +import pytest + +import vak.datasets.frame_classification.helper + +from ... 
import fixtures + + +@pytest.mark.parametrize( + 'subset', + [ + 'train-dur-4.0-replicate-1', + 'train-dur-4.0-replicate-2' + ] +) +def test_sample_ids_array_filename_for_subset(subset): + out = vak.datasets.frame_classification.helper.sample_ids_array_filename_for_subset(subset) + assert isinstance(out, str) + assert out == vak.datasets.frame_classification.constants.SAMPLE_IDS_ARRAY_FILENAME.replace( + '.npy', f'-{subset}.npy' + ) + + +@pytest.mark.parametrize( + 'subset', + [ + 'train-dur-4.0-replicate-1', + 'train-dur-4.0-replicate-2' + ] +) +def test_inds_in_sample_array_filename_for_subset(subset): + out = vak.datasets.frame_classification.helper.inds_in_sample_array_filename_for_subset(subset) + assert isinstance(out, str) + assert out == vak.datasets.frame_classification.constants.INDS_IN_SAMPLE_ARRAY_FILENAME.replace( + '.npy', f'-{subset}.npy' + ) + + +@pytest.fixture(params=fixtures.spect.SPECT_LIST_NPZ) +def frames_path(request): + return request.param + + +def test_load_frames(frames_path): + out = vak.datasets.frame_classification.helper.load_frames(frames_path, input_type="spect") + assert isinstance(out, np.ndarray) diff --git a/tests/test_datasets/test_frame_classification/test_window_dataset.py b/tests/test_datasets/test_frame_classification/test_window_dataset.py index 5bc5f6586..613fd1854 100644 --- a/tests/test_datasets/test_frame_classification/test_window_dataset.py +++ b/tests/test_datasets/test_frame_classification/test_window_dataset.py @@ -13,13 +13,13 @@ class TestWindowDataset: ] ) def test_from_dataset_path(self, config_type, model_name, audio_format, spect_format, annot_format, - split, transform_kwargs, specific_config): + split, transform_kwargs, specific_config_toml_path): """Test we can get a WindowDataset instance from the classmethod ``from_dataset_path``""" - toml_path = specific_config(config_type, - model_name, - audio_format=audio_format, - spect_format=spect_format, - annot_format=annot_format) + toml_path = 
specific_config_toml_path(config_type, + model_name, + audio_format=audio_format, + spect_format=spect_format, + annot_format=annot_format) cfg = vak.config.parse.from_toml_path(toml_path) cfg_command = getattr(cfg, config_type) diff --git a/tests/test_datasets/test_parametric_umap/test_parametric_umap.py b/tests/test_datasets/test_parametric_umap/test_parametric_umap.py index 38a2782da..15eab713f 100644 --- a/tests/test_datasets/test_parametric_umap/test_parametric_umap.py +++ b/tests/test_datasets/test_parametric_umap/test_parametric_umap.py @@ -12,13 +12,13 @@ class TestParametricUMAPDataset: ] ) def test_from_dataset_path(self, config_type, model_name, audio_format, spect_format, annot_format, - split, transform_kwargs, specific_config): + split, transform_kwargs, specific_config_toml_path): """Test we can get a WindowDataset instance from the classmethod ``from_dataset_path``""" - toml_path = specific_config(config_type, - model_name, - audio_format=audio_format, - spect_format=spect_format, - annot_format=annot_format) + toml_path = specific_config_toml_path(config_type, + model_name, + audio_format=audio_format, + spect_format=spect_format, + annot_format=annot_format) cfg = vak.config.parse.from_toml_path(toml_path) cfg_command = getattr(cfg, config_type) diff --git a/tests/test_eval/test_eval.py b/tests/test_eval/test_eval.py index 94822887f..b4e69322b 100644 --- a/tests/test_eval/test_eval.py +++ b/tests/test_eval/test_eval.py @@ -20,7 +20,7 @@ ) def test_eval( audio_format, spect_format, annot_format, model_name, eval_function_to_mock, - specific_config, tmp_path + specific_config_toml_path, tmp_path ): """Test that :func:`vak.eval.eval` dispatches to the correct model-specific training functions""" @@ -34,7 +34,7 @@ def test_eval( {"section": "EVAL", "option": "device", "value": 'cpu'}, ] - toml_path = specific_config( + toml_path = specific_config_toml_path( config_type="eval", model=model_name, audio_format=audio_format, diff --git 
a/tests/test_eval/test_frame_classification.py b/tests/test_eval/test_frame_classification.py index 5fe4c6d8d..ce299c0e6 100644 --- a/tests/test_eval/test_frame_classification.py +++ b/tests/test_eval/test_frame_classification.py @@ -44,7 +44,7 @@ def test_eval_frame_classification_model( audio_format, spect_format, annot_format, - specific_config, + specific_config_toml_path, tmp_path, device, post_tfm_kwargs @@ -59,7 +59,7 @@ def test_eval_frame_classification_model( {"section": "EVAL", "option": "device", "value": device}, ] - toml_path = specific_config( + toml_path = specific_config_toml_path( config_type="eval", model=model_name, audio_format=audio_format, @@ -98,7 +98,7 @@ def test_eval_frame_classification_model( ) def test_eval_frame_classification_model_raises_file_not_found( path_option_to_change, - specific_config, + specific_config_toml_path, tmp_path, device ): @@ -117,7 +117,7 @@ def test_eval_frame_classification_model_raises_file_not_found( path_option_to_change, ] - toml_path = specific_config( + toml_path = specific_config_toml_path( config_type="eval", model="TweetyNet", audio_format="cbin", @@ -152,7 +152,7 @@ def test_eval_frame_classification_model_raises_file_not_found( ) def test_eval_frame_classification_model_raises_not_a_directory( path_option_to_change, - specific_config, + specific_config_toml_path, device, tmp_path, ): @@ -175,7 +175,7 @@ def test_eval_frame_classification_model_raises_not_a_directory( {"section": "EVAL", "option": "output_dir", "value": str(output_dir)} ) - toml_path = specific_config( + toml_path = specific_config_toml_path( config_type="eval", model="TweetyNet", audio_format="cbin", diff --git a/tests/test_eval/test_parametric_umap.py b/tests/test_eval/test_parametric_umap.py index adc95a729..5b803a7e7 100644 --- a/tests/test_eval/test_parametric_umap.py +++ b/tests/test_eval/test_parametric_umap.py @@ -23,7 +23,7 @@ def test_eval_parametric_umap_model( audio_format, spect_format, annot_format, - specific_config, + 
specific_config_toml_path, tmp_path, device, ): @@ -37,7 +37,7 @@ def test_eval_parametric_umap_model( {"section": "EVAL", "option": "device", "value": device}, ] - toml_path = specific_config( + toml_path = specific_config_toml_path( config_type="eval", model=model_name, audio_format=audio_format, @@ -72,7 +72,7 @@ def test_eval_parametric_umap_model( ) def test_eval_frame_classification_model_raises_file_not_found( path_option_to_change, - specific_config, + specific_config_toml_path, tmp_path, device ): @@ -89,7 +89,7 @@ def test_eval_frame_classification_model_raises_file_not_found( path_option_to_change, ] - toml_path = specific_config( + toml_path = specific_config_toml_path( config_type="eval", model="ConvEncoderUMAP", audio_format="cbin", @@ -123,7 +123,7 @@ def test_eval_frame_classification_model_raises_file_not_found( ) def test_eval_frame_classification_model_raises_not_a_directory( path_option_to_change, - specific_config, + specific_config_toml_path, device, tmp_path, ): @@ -145,7 +145,7 @@ def test_eval_frame_classification_model_raises_not_a_directory( {"section": "EVAL", "option": "output_dir", "value": str(output_dir)} ) - toml_path = specific_config( + toml_path = specific_config_toml_path( config_type="eval", model="ConvEncoderUMAP", audio_format="cbin", diff --git a/tests/test_learncurve/test_frame_classification.py b/tests/test_learncurve/test_frame_classification.py index 7ca125fd6..cc3484279 100644 --- a/tests/test_learncurve/test_frame_classification.py +++ b/tests/test_learncurve/test_frame_classification.py @@ -51,10 +51,10 @@ def assert_learncurve_output_matches_expected(cfg, model_name, results_path): ] ) def test_learning_curve_for_frame_classification_model( - model_name, audio_format, annot_format, specific_config, tmp_path, device): + model_name, audio_format, annot_format, specific_config_toml_path, tmp_path, device): options_to_change = {"section": "LEARNCURVE", "option": "device", "value": device} - toml_path = specific_config( + 
toml_path = specific_config_toml_path( config_type="learncurve", model=model_name, audio_format=audio_format, @@ -99,7 +99,7 @@ def test_learning_curve_for_frame_classification_model( ] ) def test_learncurve_raises_not_a_directory(dir_option_to_change, - specific_config, + specific_config_toml_path, tmp_path, device): """Test that core.learncurve.learning_curve raises NotADirectoryError when the following directories do not exist: @@ -109,7 +109,7 @@ def test_learncurve_raises_not_a_directory(dir_option_to_change, {"section": "LEARNCURVE", "option": "device", "value": device}, dir_option_to_change ] - toml_path = specific_config( + toml_path = specific_config_toml_path( config_type="learncurve", model="TweetyNet", audio_format="cbin", diff --git a/tests/test_models/test_base.py b/tests/test_models/test_base.py index 4f8dc282c..0c3236296 100644 --- a/tests/test_models/test_base.py +++ b/tests/test_models/test_base.py @@ -180,7 +180,7 @@ def test_validate_init_raises(self, definition, kwargs, expected_exception, monk def test_load_state_dict_from_path(self, model_name, # our fixtures - specific_config, + specific_config_toml_path, # pytest fixtures monkeypatch, device @@ -190,7 +190,7 @@ def test_load_state_dict_from_path(self, We use actual model definitions here so we can test with real checkpoints. 
""" definition = self.MODEL_DEFINITION_MAP[model_name] - train_toml_path = specific_config('train', model_name, audio_format='cbin', annot_format='notmat') + train_toml_path = specific_config_toml_path('train', model_name, audio_format='cbin', annot_format='notmat') train_cfg = vak.config.parse.from_toml_path(train_toml_path) # stuff we need just to be able to instantiate network @@ -224,7 +224,7 @@ def test_load_state_dict_from_path(self, model = vak.models.base.Model(network=network) model.to(device) - eval_toml_path = specific_config('eval', model_name, audio_format='cbin', annot_format='notmat') + eval_toml_path = specific_config_toml_path('eval', model_name, audio_format='cbin', annot_format='notmat') eval_cfg = vak.config.parse.from_toml_path(eval_toml_path) checkpoint_path = eval_cfg.eval.checkpoint_path diff --git a/tests/test_models/test_frame_classification_model.py b/tests/test_models/test_frame_classification_model.py index 4694fa160..e77e84acf 100644 --- a/tests/test_models/test_frame_classification_model.py +++ b/tests/test_models/test_frame_classification_model.py @@ -79,13 +79,13 @@ def test_init(self, def test_from_config(self, definition, # our fixtures - specific_config, + specific_config_toml_path, # pytest fixtures monkeypatch, ): definition = vak.models.definition.validate(definition) model_name = definition.__name__.replace('Definition', '') - toml_path = specific_config('train', model_name, audio_format='cbin', annot_format='notmat') + toml_path = specific_config_toml_path('train', model_name, audio_format='cbin', annot_format='notmat') cfg = vak.config.parse.from_toml_path(toml_path) # stuff we need just to be able to instantiate network diff --git a/tests/test_models/test_parametric_umap_model.py b/tests/test_models/test_parametric_umap_model.py index b3b75d3e0..36087ddbc 100644 --- a/tests/test_models/test_parametric_umap_model.py +++ b/tests/test_models/test_parametric_umap_model.py @@ -81,12 +81,12 @@ def test_from_config( self, 
input_shape, definition, - specific_config, + specific_config_toml_path, monkeypatch, ): definition = vak.models.definition.validate(definition) model_name = definition.__name__.replace('Definition', '') - toml_path = specific_config('train', model_name, audio_format='cbin', annot_format='notmat') + toml_path = specific_config_toml_path('train', model_name, audio_format='cbin', annot_format='notmat') cfg = vak.config.parse.from_toml_path(toml_path) monkeypatch.setattr( diff --git a/tests/test_nn/test_loss/test_dice.py b/tests/test_nn/test_loss/test_dice.py index f993c7898..bf453a1ee 100644 --- a/tests/test_nn/test_loss/test_dice.py +++ b/tests/test_nn/test_loss/test_dice.py @@ -1,7 +1,7 @@ """test loss functions""" import torch from torch.autograd import gradcheck -from torch.testing import assert_allclose +from torch.testing import assert_close import vak.nn.loss @@ -35,7 +35,7 @@ def test_all_zeros(self, device, dtype): criterion = vak.nn.loss.DiceLoss() loss = criterion(logits, labels) - assert_allclose(loss, torch.zeros_like(loss), rtol=1e-3, atol=1e-3) + assert_close(loss, torch.zeros_like(loss), rtol=1e-3, atol=1e-3) def test_gradcheck(self, device, dtype): num_classes = 3 @@ -55,7 +55,7 @@ def test_jit(self, device, dtype): op = vak.nn.dice_loss op_script = torch.jit.script(op) - assert_allclose(op(logits, labels), op_script(logits, labels)) + assert_close(op(logits, labels), op_script(logits, labels)) def test_module(self, device, dtype): num_classes = 3 @@ -66,4 +66,4 @@ def test_module(self, device, dtype): op = vak.nn.dice_loss op_module = vak.nn.loss.DiceLoss() - assert_allclose(op(logits, labels), op_module(logits, labels)) + assert_close(op(logits, labels), op_module(logits, labels)) diff --git a/tests/test_predict/test_frame_classification.py b/tests/test_predict/test_frame_classification.py index 65f76d133..6726ec09b 100644 --- a/tests/test_predict/test_frame_classification.py +++ b/tests/test_predict/test_frame_classification.py @@ -19,9 +19,7 @@ 
def assert_predict_output_matches_expected(output_dir, annot_csv_filename): "model_name, audio_format, spect_format, annot_format, save_net_outputs", [ ("TweetyNet", "cbin", None, "notmat", False), - ("TweetyNet", "wav", None, "birdsong-recognition-dataset", False), ("TweetyNet", "cbin", None, "notmat", True), - ("TweetyNet", "wav", None, "birdsong-recognition-dataset", True), ], ) def test_predict_with_frame_classification_model( @@ -30,7 +28,7 @@ def test_predict_with_frame_classification_model( spect_format, annot_format, save_net_outputs, - specific_config, + specific_config_toml_path, tmp_path, device, ): @@ -44,7 +42,7 @@ def test_predict_with_frame_classification_model( {"section": "PREDICT", "option": "device", "value": device}, {"section": "PREDICT", "option": "save_net_outputs", "value": save_net_outputs}, ] - toml_path = specific_config( + toml_path = specific_config_toml_path( config_type="predict", model=model_name, audio_format=audio_format, @@ -103,7 +101,7 @@ def test_predict_with_frame_classification_model( ) def test_predict_with_frame_classification_model_raises_file_not_found( path_option_to_change, - specific_config, + specific_config_toml_path, tmp_path, device ): @@ -119,7 +117,7 @@ def test_predict_with_frame_classification_model_raises_file_not_found( {"section": "PREDICT", "option": "device", "value": device}, path_option_to_change, ] - toml_path = specific_config( + toml_path = specific_config_toml_path( config_type="predict", model="TweetyNet", audio_format="cbin", @@ -160,7 +158,7 @@ def test_predict_with_frame_classification_model_raises_file_not_found( ) def test_predict_with_frame_classification_model_raises_not_a_directory( path_option_to_change, - specific_config, + specific_config_toml_path, device, tmp_path, ): @@ -183,7 +181,7 @@ def test_predict_with_frame_classification_model_raises_not_a_directory( {"section": "PREDICT", "option": "output_dir", "value": str(output_dir)} ) - toml_path = specific_config( + toml_path = 
specific_config_toml_path( config_type="predict", model="TweetyNet", audio_format="cbin", diff --git a/tests/test_predict/test_predict.py b/tests/test_predict/test_predict.py index 0aa528abf..98051ca80 100644 --- a/tests/test_predict/test_predict.py +++ b/tests/test_predict/test_predict.py @@ -17,7 +17,7 @@ ) def test_predict( audio_format, spect_format, annot_format, model_name, predict_function_to_mock, - specific_config, tmp_path + specific_config_toml_path, tmp_path ): """Test that :func:`vak.predict.predict` dispatches to the correct model-specific training functions""" @@ -31,7 +31,7 @@ def test_predict( {"section": "PREDICT", "option": "device", "value": 'cpu'}, ] - toml_path = specific_config( + toml_path = specific_config_toml_path( config_type="predict", model=model_name, audio_format=audio_format, diff --git a/tests/test_prep/test_frame_classification/test_assign_samples_to_splits.py b/tests/test_prep/test_frame_classification/test_assign_samples_to_splits.py new file mode 100644 index 000000000..d354dfd6a --- /dev/null +++ b/tests/test_prep/test_frame_classification/test_assign_samples_to_splits.py @@ -0,0 +1,70 @@ +import pandas as pd +import pytest + +import vak + + +@pytest.mark.parametrize( + 'config_type, model_name, audio_format, spect_format, annot_format, input_type', + [ + ('train', 'TweetyNet', 'cbin', None, 'notmat', 'spect'), + ('predict', 'TweetyNet', 'cbin', None, 'notmat', 'spect'), + ('eval', 'TweetyNet', 'cbin', None, 'notmat', 'spect'), + ('train', 'TweetyNet', None, 'mat', 'yarden', 'spect'), + ('learncurve', 'TweetyNet', 'cbin', None, 'notmat', 'spect'), + # TODO: add audio cases + ] +) +def test_assign_samples_to_splits( + config_type, model_name, audio_format, spect_format, annot_format, + input_type, tmp_path, specific_config_toml_path, specific_source_files_df, +): + toml_path = specific_config_toml_path( + config_type, + model_name, + annot_format, + audio_format, + spect_format, + ) + + cfg = 
vak.config.parse.from_toml_path(toml_path) + + # ---- set up ---- + tmp_dataset_path = tmp_path / 'dataset_dir' + tmp_dataset_path.mkdir() + + purpose = config_type + + source_files_df = specific_source_files_df( + config_type, + model_name, + annot_format, + audio_format, + spect_format, + ) + + out = vak.prep.frame_classification.assign_samples_to_splits( + purpose, + source_files_df, + tmp_dataset_path, + cfg.prep.train_dur, + cfg.prep.val_dur, + cfg.prep.test_dur, + cfg.prep.labelset, + ) + + assert isinstance(out, pd.DataFrame) + assert 'split' in out.columns + if purpose == 'predict': + assert all(val == 'predict' for val in out['split'].values) + elif purpose == 'eval': + assert all(val == 'test' for val in out['split'].values) + else: + split_vals = out['split'].values.tolist() + assert all( + [ + split_name in split_vals + for split_name in ('train', 'val', 'test') + if hasattr(cfg.prep, f'{split_name}_dur') and getattr(cfg.prep, f'{split_name}_dur') is not None + ] + ) diff --git a/tests/test_prep/test_frame_classification/test_dataset_arrays.py b/tests/test_prep/test_frame_classification/test_dataset_arrays.py deleted file mode 100644 index 8c3c82f54..000000000 --- a/tests/test_prep/test_frame_classification/test_dataset_arrays.py +++ /dev/null @@ -1,170 +0,0 @@ -"""Unit tests for vak.prep.frame_classification.dataset_arrays""" -import json -import pathlib -import shutil - -import crowsetta -import pytest - -import vak.prep.frame_classification.dataset_arrays - - -@pytest.mark.parametrize( - 'annots, expected_sort_inds', - [ - ( - [ - crowsetta.Annotation(seq=crowsetta.Sequence.from_keyword( - onsets_s=[0.1, 0.3, 0.5], offsets_s=[0.2, 0.4, 0.6], labels=['a', 'b', 'b'] - ), annot_path='./fake'), - crowsetta.Annotation(seq=crowsetta.Sequence.from_keyword( - onsets_s=[0.1, 0.3, 0.5], offsets_s=[0.2, 0.4, 0.6], labels=['b', 'b', 'b'] - ), annot_path='./fake'), - ], - [0, 1,] - ), - ( - [ - crowsetta.Annotation(seq=crowsetta.Sequence.from_keyword( - 
onsets_s=[0.1, 0.3, 0.5], offsets_s=[0.2, 0.4, 0.6], labels=['a', 'b', 'b'] - ), annot_path='./fake'), - crowsetta.Annotation(seq=crowsetta.Sequence.from_keyword( - onsets_s=[0.1, 0.3, 0.5], offsets_s=[0.2, 0.4, 0.6], labels=['b', 'b', 'b'] - ), annot_path='./fake'), - crowsetta.Annotation(seq=crowsetta.Sequence.from_keyword( - onsets_s=[0.1, 0.3, 0.5], offsets_s=[0.2, 0.4, 0.6], labels=['b', 'b', 'b'] - ), annot_path='./fake'), - ], - [0, 1, 2], - ), - ] -) -def test_argsort_by_label_freq(annots, expected_sort_inds): - out = vak.prep.frame_classification.dataset_arrays.argsort_by_label_freq(annots) - assert isinstance(out, list) - assert out == expected_sort_inds - - -def copy_dataset_df_files_to_tmp_path_data_dir(dataset_df, dataset_path, config_type, input_type, tmp_path_data_dir): - """Copy all the files in a dataset DataFrame to a `tmp_path_data_dir`, - and change the paths in the Dataframe, so that we can then call - `vak.prep.frame_classification.helper.move_files_into_split_subdirs`.""" - paths_cols = [] - if input_type == 'spect': - paths_cols.append('spect_path') - elif input_type == 'audio': - paths_cols.append('audio_path') - if config_type != 'predict': - paths_cols.append('annot_path') - for paths_col in paths_cols: - paths = dataset_df[paths_col].values - new_paths = [] - for path in paths: - new_path = shutil.copy(src=dataset_path / path, dst=tmp_path_data_dir) - new_paths.append(new_path) - dataset_df[paths_col] = new_paths - return dataset_df - - -@pytest.mark.parametrize( - 'config_type, model_name, audio_format, spect_format, annot_format, input_type', - [ - ('train', 'TweetyNet', 'cbin', None, 'notmat', 'spect'), - ('predict', 'TweetyNet', 'cbin', None, 'notmat', 'spect'), - ('eval', 'TweetyNet', 'cbin', None, 'notmat', 'spect'), - ('train', 'TweetyNet', None, 'mat', 'yarden', 'spect'), - ('learncurve', 'TweetyNet', 'cbin', None, 'notmat', 'spect'), - # TODO: add audio cases - ] -) -def test_make_npy_files_for_each_split(config_type, 
model_name, audio_format, spect_format, annot_format, - input_type, tmp_path, specific_dataset_df, specific_dataset_path): - dataset_df = specific_dataset_df(config_type, model_name, annot_format, audio_format, spect_format) - dataset_path = specific_dataset_path(config_type, model_name, annot_format, audio_format, spect_format) - tmp_path_data_dir = tmp_path / 'data_dir' - tmp_path_data_dir.mkdir() - copy_dataset_df_files_to_tmp_path_data_dir(dataset_df, dataset_path, config_type, input_type, tmp_path_data_dir) - - tmp_dataset_path = tmp_path / 'dataset_dir' - tmp_dataset_path.mkdir() - - if config_type != 'predict': - with (dataset_path / 'labelmap.json').open('r') as fp: - labelmap = json.load(fp) - else: - labelmap = None - - purpose = config_type - - vak.prep.frame_classification.dataset_arrays.make_npy_files_for_each_split( - dataset_df, - tmp_dataset_path, - input_type, - purpose, - labelmap, - audio_format, - ) - - splits = [ - split - for split in sorted(dataset_df.split.dropna().unique()) - if split != "None" - ] - - for split in splits: - split_subdir = tmp_dataset_path / split - if split != 'None': - assert split_subdir.exists() - elif split == 'None': - assert not split_subdir.exists() - - split_df = dataset_df[dataset_df.split == split].copy() - - if purpose != "predict": - annots = vak.common.annotation.from_df(split_df) - else: - annots = None - - if input_type == "audio": - source_paths = split_df["audio_path"].values - elif input_type == "spect": - source_paths = split_df["spect_path"].values - - source_paths = [pathlib.Path(source_path) for source_path in source_paths] - - if annots: - source_path_annot_tups = [ - (source_path, annot) - for source_path, annot in zip(source_paths, annots) - ] - else: - source_path_annot_tups = [ - (source_path, None) - for source_path in source_paths - ] - - for source_path_annot_tup in source_path_annot_tups: - source_path, annot = source_path_annot_tup - frames_array_file_that_should_exist = split_subdir / ( - 
source_path.stem - + vak.datasets.frame_classification.constants.FRAMES_ARRAY_EXT - ) - assert frames_array_file_that_should_exist.exists() - if annot: - frame_labels_file_that_should_exist = split_subdir / ( - source_path.stem - + vak.datasets.frame_classification.constants.FRAME_LABELS_EXT - ) - assert frame_labels_file_that_should_exist.exists() - - sample_id_vec_path = ( - split_subdir / - vak.datasets.frame_classification.constants.SAMPLE_IDS_ARRAY_FILENAME - ) - assert sample_id_vec_path.exists() - - inds_in_sample_vec_path = ( - split_subdir / - vak.datasets.frame_classification.constants.INDS_IN_SAMPLE_ARRAY_FILENAME - ) - assert inds_in_sample_vec_path.exists() diff --git a/tests/test_prep/test_frame_classification/test_frame_classification.py b/tests/test_prep/test_frame_classification/test_frame_classification.py index b0fdf781f..31a264847 100644 --- a/tests/test_prep/test_frame_classification/test_frame_classification.py +++ b/tests/test_prep/test_frame_classification/test_frame_classification.py @@ -34,22 +34,26 @@ def assert_prep_output_matches_expected(dataset_path, df_returned_by_prep): check_exact = False else: check_exact = True - try: - assert_series_equal( - df_from_dataset_path[column], - df_returned_by_prep[column], - check_exact=check_exact, - ) - except: - breakpoint() - - for column in ('spect_path', 'annot_path'): - paths = df_from_dataset_path[column].values - if not all([isinstance(path, str) for path in paths]): - continue - for path in paths: - path = pathlib.Path(path) - assert (dataset_path / path).exists() + assert_series_equal( + df_from_dataset_path[column], + df_returned_by_prep[column], + check_exact=check_exact, + ) + + if vak.datasets.frame_classification.constants.FRAMES_PATH_COL_NAME in df_returned_by_prep.columns: + frames_paths = df_returned_by_prep[ + vak.datasets.frame_classification.constants.FRAMES_PATH_COL_NAME + ].values + for frames_path in frames_paths: + assert (dataset_path / frames_path).exists() + + if 
vak.datasets.frame_classification.constants.FRAME_LABELS_NPY_PATH_COL_NAME in df_returned_by_prep.columns: + frame_labels_paths = df_returned_by_prep[ + vak.datasets.frame_classification.constants.FRAME_LABELS_NPY_PATH_COL_NAME + ].values + if not all([frame_labels_path is None for frame_labels_path in frame_labels_paths]): + for frame_labels_path in frame_labels_paths: + assert (dataset_path / frame_labels_path).exists() @pytest.mark.parametrize( @@ -58,9 +62,7 @@ def assert_prep_output_matches_expected(dataset_path, df_returned_by_prep): ("eval", "cbin", None, "notmat"), ("learncurve", "cbin", None, "notmat"), ("predict", "cbin", None, "notmat"), - ("predict", "wav", None, "birdsong-recognition-dataset"), ("train", "cbin", None, "notmat"), - ("train", "wav", None, "birdsong-recognition-dataset"), ("train", None, "mat", "yarden"), ], ) @@ -69,7 +71,7 @@ def test_prep_frame_classification_dataset( audio_format, spect_format, annot_format, - specific_config, + specific_config_toml_path, default_model, tmp_path, ): @@ -85,7 +87,7 @@ def test_prep_frame_classification_dataset( "value": str(output_dir), }, ] - toml_path = specific_config( + toml_path = specific_config_toml_path( config_type=config_type, model=default_model, audio_format=audio_format, @@ -123,7 +125,6 @@ def test_prep_frame_classification_dataset( ("eval", "cbin", None, "notmat"), ("learncurve", "cbin", None, "notmat"), ("train", "cbin", None, "notmat"), - ("train", "wav", None, "birdsong-recognition-dataset"), ("train", None, "mat", "yarden"), ], ) @@ -132,7 +133,7 @@ def test_prep_frame_classification_dataset_raises_when_labelset_required_but_is_ audio_format, spect_format, annot_format, - specific_config, + specific_config_toml_path, default_model, tmp_path, ): @@ -158,7 +159,7 @@ def test_prep_frame_classification_dataset_raises_when_labelset_required_but_is_ "value": "DELETE-OPTION", }, ] - toml_path = specific_config( + toml_path = specific_config_toml_path( config_type=config_type, 
model=default_model, audio_format=audio_format, @@ -188,9 +189,9 @@ def test_prep_frame_classification_dataset_raises_when_labelset_required_but_is_ def test_prep_frame_classification_dataset_with_single_audio_and_annot(source_test_data_root, - specific_config, - default_model, - tmp_path): + specific_config_toml_path, + default_model, + tmp_path): """ regression test, checks that we avoid a repeat of https://github.com/vocalpy/vak/issues/467 @@ -226,7 +227,7 @@ def test_prep_frame_classification_dataset_with_single_audio_and_annot(source_te }, ] - toml_path = specific_config( + toml_path = specific_config_toml_path( config_type='eval', model=default_model, audio_format='cbin', @@ -257,7 +258,7 @@ def test_prep_frame_classification_dataset_with_single_audio_and_annot(source_te def test_prep_frame_classification_dataset_when_annot_has_single_segment(source_test_data_root, - specific_config, + specific_config_toml_path, default_model, tmp_path): """ @@ -284,7 +285,7 @@ def test_prep_frame_classification_dataset_when_annot_has_single_segment(source_ }, ] - toml_path = specific_config( + toml_path = specific_config_toml_path( config_type='eval', model=default_model, audio_format='cbin', @@ -323,7 +324,7 @@ def test_prep_frame_classification_dataset_when_annot_has_single_segment(source_ ) def test_prep_frame_classification_dataset_raises_not_a_directory( dir_option_to_change, - specific_config, + specific_config_toml_path, default_model, tmp_path, ): @@ -331,7 +332,7 @@ def test_prep_frame_classification_dataset_raises_not_a_directory( when one of the following is not a directory: data_dir, output_dir """ - toml_path = specific_config( + toml_path = specific_config_toml_path( config_type="train", model="TweetyNet", audio_format="cbin", @@ -368,7 +369,7 @@ def test_prep_frame_classification_dataset_raises_not_a_directory( ) def test_prep_frame_classification_dataset_raises_file_not_found( path_option_to_change, - specific_config, + specific_config_toml_path, 
default_model, tmp_path, ): @@ -379,7 +380,7 @@ def test_prep_frame_classification_dataset_raises_file_not_found( Structuring unit test this way in case other path parameters get added. """ - toml_path = specific_config( + toml_path = specific_config_toml_path( config_type="train", model="TweetyNet", audio_format="cbin", diff --git a/tests/test_prep/test_frame_classification/test_get_or_make_source_files.py b/tests/test_prep/test_frame_classification/test_get_or_make_source_files.py new file mode 100644 index 000000000..d43ca7380 --- /dev/null +++ b/tests/test_prep/test_frame_classification/test_get_or_make_source_files.py @@ -0,0 +1,88 @@ +from unittest import mock + +import pandas as pd +import pytest + +import vak + +FAKE_SOURCE_FILES_DF = pd.DataFrame.from_records( + [ + {'audio_path': 'bird0-2023.10.12.cbin', + 'spect_path': 'bird0-2023.10.12.cbin.spect.npz', + 'annot_path': 'bird0-2023.10.12.cbin.not.mat'} + ] +) + + +@pytest.mark.parametrize( + 'config_type, model_name, audio_format, spect_format, annot_format, input_type', + [ + ('train', 'TweetyNet', 'cbin', None, 'notmat', 'spect'), + ('predict', 'TweetyNet', 'cbin', None, 'notmat', 'spect'), + ('eval', 'TweetyNet', 'cbin', None, 'notmat', 'spect'), + ('train', 'TweetyNet', None, 'mat', 'yarden', 'spect'), + ('learncurve', 'TweetyNet', 'cbin', None, 'notmat', 'spect'), + ] +) +def test_get_or_make_source_files( + config_type, model_name, audio_format, spect_format, annot_format, + input_type, tmp_path, specific_config_toml_path +): + """Test that this `vak.prep.frame_classification.get_or_make_source_files` dispatches correctly. + + Other unit tests already test the functions that this function calls. 
+ """ + toml_path = specific_config_toml_path( + config_type, + model_name, + annot_format, + audio_format, + spect_format, + ) + + cfg = vak.config.parse.from_toml_path(toml_path) + + # ---- set up ---- + tmp_dataset_path = tmp_path / 'dataset_dir' + tmp_dataset_path.mkdir() + + if cfg.prep.input_type == 'audio': + with mock.patch('vak.prep.frame_classification.source_files.prep_audio_dataset', autospec=True) as mock_prep_audio_dataset: + mock_prep_audio_dataset.return_value = FAKE_SOURCE_FILES_DF + + out: pd.DataFrame = vak.prep.frame_classification.get_or_make_source_files( + cfg.prep.data_dir, + cfg.prep.input_type, + cfg.prep.audio_format, + cfg.prep.spect_format, + cfg.spect_params, + tmp_dataset_path, + cfg.prep.annot_format, + cfg.prep.annot_file, + cfg.prep.labelset, + cfg.prep.audio_dask_bag_kwargs, + ) + + assert mock_prep_audio_dataset.called + assert isinstance(out, pd.DataFrame) + + elif cfg.prep.input_type == 'spect': + with mock.patch( + 'vak.prep.frame_classification.source_files.prep_spectrogram_dataset', autospec=True + ) as mock_prep_spect_dataset: + mock_prep_spect_dataset.return_value = FAKE_SOURCE_FILES_DF + + out: pd.DataFrame = vak.prep.frame_classification.get_or_make_source_files( + cfg.prep.data_dir, + cfg.prep.input_type, + cfg.prep.audio_format, + cfg.prep.spect_format, + cfg.spect_params, + tmp_dataset_path, + cfg.prep.annot_format, + cfg.prep.annot_file, + cfg.prep.labelset, + cfg.prep.audio_dask_bag_kwargs, + ) + assert mock_prep_spect_dataset.called + assert isinstance(out, pd.DataFrame) diff --git a/tests/test_prep/test_frame_classification/test_learncurve.py b/tests/test_prep/test_frame_classification/test_learncurve.py index 1de0ad22a..150e6483a 100644 --- a/tests/test_prep/test_frame_classification/test_learncurve.py +++ b/tests/test_prep/test_frame_classification/test_learncurve.py @@ -1,5 +1,6 @@ import json import shutil +from unittest import mock import numpy as np import pandas as pd @@ -10,6 +11,117 @@ import 
vak.common.paths import vak.prep.frame_classification +@pytest.mark.parametrize( + 'model_name, audio_format, annot_format, input_type', + [ + ('TweetyNet', 'cbin', 'notmat', 'spect') + ] +) +def test_make_index_vectors_for_each_subsets( + model_name, audio_format, annot_format, input_type, specific_config_toml_path, device, tmp_path, +): + root_results_dir = tmp_path.joinpath("tmp_root_results_dir") + root_results_dir.mkdir() + options_to_change = [ + { + "section": "LEARNCURVE", + "option": "root_results_dir", + "value": str(root_results_dir), + }, + ] + toml_path = specific_config_toml_path( + config_type="learncurve", + model=model_name, + audio_format=audio_format, + annot_format=annot_format, + options_to_change=options_to_change, + ) + cfg = vak.config.parse.from_toml_path(toml_path) + + dataset_path = cfg.learncurve.dataset_path + metadata = vak.datasets.frame_classification.Metadata.from_dataset_path(dataset_path) + dataset_csv_path = dataset_path / metadata.dataset_csv_filename + dataset_df = pd.read_csv(dataset_csv_path) + + subsets_df = dataset_df[ + ~dataset_df['subset'].isnull() + ] + + tmp_dataset_path = tmp_path / f"test_make_learncurve_splits_from_dataset_df" + shutil.copytree(dataset_path, tmp_dataset_path) + # delete all the subset indices vectors, since we're about to test that we make them + for train_dur in cfg.prep.train_set_durs: + for replicate_num in range(1, cfg.prep.num_replicates + 1): + train_dur_replicate_subset_name = vak.common.learncurve.get_train_dur_replicate_subset_name( + train_dur, replicate_num + ) + sample_id_vec_path = (tmp_dataset_path / "train" / + vak.datasets.frame_classification.helper.sample_ids_array_filename_for_subset( + train_dur_replicate_subset_name) + ) + sample_id_vec_path.unlink() + inds_in_sample_vec_path = (tmp_dataset_path / "train" / + vak.datasets.frame_classification.helper.inds_in_sample_array_filename_for_subset( + train_dur_replicate_subset_name) + ) + inds_in_sample_vec_path.unlink() + + 
vak.prep.frame_classification.learncurve.make_index_vectors_for_each_subset( + subsets_df, + tmp_dataset_path, + input_type, + ) + + assert sorted(subsets_df['train_dur'].unique()) == cfg.prep.train_set_durs + assert sorted( + subsets_df['replicate_num'].unique() + ) == list(range(1, cfg.prep.num_replicates + 1)) + + # assert that each expected split name is in data frame + for train_dur in cfg.prep.train_set_durs: + train_dur_df = subsets_df[np.isclose(subsets_df['train_dur'], train_dur)].copy() + # assert correct number of replicates for this train duration + assert sorted( + train_dur_df['replicate_num'] + ) == list(range(1, cfg.prep.num_replicates + 1)) + + for replicate_num in range(1, cfg.prep.num_replicates + 1): + subset_name = vak.common.learncurve.get_train_dur_replicate_subset_name( + train_dur, replicate_num + ) + + # test that indexing vectors got made + sample_id_vec_path = (tmp_dataset_path / "train" / + vak.datasets.frame_classification.helper.sample_ids_array_filename_for_subset( + subset_name) + ) + assert sample_id_vec_path.exists() + + inds_in_sample_vec_path = (tmp_dataset_path / "train" / + vak.datasets.frame_classification.helper.inds_in_sample_array_filename_for_subset( + subset_name) + ) + assert inds_in_sample_vec_path.exists() + + this_subset_df = subsets_df[subsets_df['subset'] == subset_name] + frames_paths = this_subset_df[ + vak.datasets.frame_classification.constants.FRAMES_PATH_COL_NAME + ].values + sample_id_vec, inds_in_sample_vec = [], [] + for sample_id, frames_path in enumerate(frames_paths): + # make indexing vectors that we use to test + frames = vak.datasets.frame_classification.helper.load_frames(tmp_dataset_path / frames_path, + input_type) + n_frames = frames.shape[-1] + sample_id_vec.append(np.ones((n_frames,)).astype(np.int32) * sample_id) + inds_in_sample_vec.append(np.arange(n_frames)) + expected_sample_id_vec = np.concatenate(sample_id_vec) + expected_inds_in_sample_vec = np.concatenate(inds_in_sample_vec) + 
sample_id_vec = np.load(sample_id_vec_path) + assert np.array_equal(sample_id_vec, expected_sample_id_vec) + inds_in_sample_vec = np.load(inds_in_sample_vec_path) + assert np.array_equal(inds_in_sample_vec, expected_inds_in_sample_vec) + @pytest.mark.parametrize( 'model_name, audio_format, annot_format, input_type', @@ -17,8 +129,8 @@ ('TweetyNet', 'cbin', 'notmat', 'spect') ] ) -def test_make_learncurve_splits_from_dataset_df( - model_name, audio_format, annot_format, input_type, specific_config, device, tmp_path, +def test_make_subsets_from_dataset_df( + model_name, audio_format, annot_format, input_type, specific_config_toml_path, device, tmp_path, ): root_results_dir = tmp_path.joinpath("tmp_root_results_dir") root_results_dir.mkdir() @@ -29,7 +141,7 @@ def test_make_learncurve_splits_from_dataset_df( "value": str(root_results_dir), }, ] - toml_path = specific_config( + toml_path = specific_config_toml_path( config_type="learncurve", model=model_name, audio_format=audio_format, @@ -49,57 +161,70 @@ def test_make_learncurve_splits_from_dataset_df( tmp_dataset_path = tmp_path / f"test_make_learncurve_splits_from_dataset_df" shutil.copytree(dataset_path, tmp_dataset_path) - # delete all the split directories since we're about to test that we make them + # delete all the subset indices vectors, since we're about to test that we make them for train_dur in cfg.prep.train_set_durs: for replicate_num in range(1, cfg.prep.num_replicates + 1): - train_dur_replicate_split_name = vak.common.learncurve.get_train_dur_replicate_split_name( + train_dur_replicate_subset_name = vak.common.learncurve.get_train_dur_replicate_subset_name( train_dur, replicate_num ) - split_dir = tmp_dataset_path / train_dur_replicate_split_name - shutil.rmtree(split_dir) - - out = vak.prep.frame_classification.learncurve.make_learncurve_splits_from_dataset_df( - dataset_df, - "spect", - cfg.prep.train_set_durs, - cfg.prep.num_replicates, - tmp_dataset_path, - labelmap, - audio_format=audio_format, 
- ) + sample_id_vec_path = (tmp_dataset_path / "train" / + vak.datasets.frame_classification.helper.sample_ids_array_filename_for_subset( + train_dur_replicate_subset_name) + ) + sample_id_vec_path.unlink() + inds_in_sample_vec_path = (tmp_dataset_path / "train" / + vak.datasets.frame_classification.helper.inds_in_sample_array_filename_for_subset( + train_dur_replicate_subset_name) + ) + inds_in_sample_vec_path.unlink() + + # now reset the dataset df to what it would have been before we passed it into `make_splits` + dataset_df = dataset_df[ + # drop any rows where there *is* a train dur -- because these are the subsets + dataset_df['train_dur'].isnull() + # drop the columns added by ``make_splits``, then reset the index + ].drop(columns=['subset', 'train_dur', 'replicate_num']).reset_index(drop=True) + + with mock.patch('vak.prep.frame_classification.learncurve.make_index_vectors_for_each_subset') as mock_idx_vectors: + out = vak.prep.frame_classification.learncurve.make_subsets_from_dataset_df( + dataset_df, + input_type, + cfg.prep.train_set_durs, + cfg.prep.num_replicates, + tmp_dataset_path, + labelmap, + ) + assert mock_idx_vectors.called + assert isinstance(out, pd.DataFrame) - splits_df = dataset_df[ - ~dataset_df.split.isin(('train', 'val', 'test')) + for added_column in ('subset', 'train_dur', 'replicate_num'): + assert added_column in out.columns + + subsets_df = out[ + ~out['subset'].isnull() ] - assert sorted(splits_df['train_dur'].unique()) == cfg.prep.train_set_durs + assert sorted(subsets_df['train_dur'].unique()) == cfg.prep.train_set_durs assert sorted( - splits_df['replicate_num'].unique() + subsets_df['replicate_num'].unique() ) == list(range(1, cfg.prep.num_replicates + 1)) # assert that each expected split name is in data frame - all_split_names = [] for train_dur in cfg.prep.train_set_durs: - train_dur_df = splits_df[np.isclose(splits_df['train_dur'], train_dur)].copy() + train_dur_df = subsets_df[np.isclose(subsets_df['train_dur'], 
train_dur)].copy() # assert correct number of replicates for this train duration assert sorted( train_dur_df['replicate_num'] ) == list(range(1, cfg.prep.num_replicates + 1)) for replicate_num in range(1, cfg.prep.num_replicates + 1): - train_dur_replicate_split_name = vak.common.learncurve.get_train_dur_replicate_split_name( + subset_name = vak.common.learncurve.get_train_dur_replicate_subset_name( train_dur, replicate_num ) - all_split_names.append(train_dur_replicate_split_name) - - # assert directory holding split files exists - split_dir = tmp_dataset_path / train_dur_replicate_split_name - assert split_dir.exists() and split_dir.is_dir() # assert this train_dur + replicate split exists in dataframe - assert np.isin(train_dur_replicate_split_name, splits_df['split'].values) - this_split_df = splits_df[splits_df['split'] == train_dur_replicate_split_name] + assert np.isin(subset_name, subsets_df['subset'].values) + this_subset_df = subsets_df[subsets_df['subset'] == subset_name] # assert that it has the correct duration - assert this_split_df['duration'].sum() >= train_dur - + assert this_subset_df['duration'].sum() >= train_dur diff --git a/tests/test_prep/test_frame_classification/test_make_splits.py b/tests/test_prep/test_frame_classification/test_make_splits.py new file mode 100644 index 000000000..5d5ef11cf --- /dev/null +++ b/tests/test_prep/test_frame_classification/test_make_splits.py @@ -0,0 +1,236 @@ +"""Unit tests for vak.prep.frame_classification.make_splits""" +import json +import pathlib +import shutil + +import crowsetta +import numpy as np +import pandas as pd +import pytest + +import vak.prep.frame_classification.make_splits + + +@pytest.mark.parametrize( + 'annots, expected_sort_inds', + [ + ( + [ + crowsetta.Annotation(seq=crowsetta.Sequence.from_keyword( + onsets_s=[0.1, 0.3, 0.5], offsets_s=[0.2, 0.4, 0.6], labels=['a', 'b', 'b'] + ), annot_path='./fake'), + crowsetta.Annotation(seq=crowsetta.Sequence.from_keyword( + onsets_s=[0.1, 0.3, 
0.5], offsets_s=[0.2, 0.4, 0.6], labels=['b', 'b', 'b'] + ), annot_path='./fake'), + ], + [0, 1,] + ), + ( + [ + crowsetta.Annotation(seq=crowsetta.Sequence.from_keyword( + onsets_s=[0.1, 0.3, 0.5], offsets_s=[0.2, 0.4, 0.6], labels=['a', 'b', 'b'] + ), annot_path='./fake'), + crowsetta.Annotation(seq=crowsetta.Sequence.from_keyword( + onsets_s=[0.1, 0.3, 0.5], offsets_s=[0.2, 0.4, 0.6], labels=['b', 'b', 'b'] + ), annot_path='./fake'), + crowsetta.Annotation(seq=crowsetta.Sequence.from_keyword( + onsets_s=[0.1, 0.3, 0.5], offsets_s=[0.2, 0.4, 0.6], labels=['b', 'b', 'b'] + ), annot_path='./fake'), + ], + [0, 1, 2], + ), + ] +) +def test_argsort_by_label_freq(annots, expected_sort_inds): + out = vak.prep.frame_classification.make_splits.argsort_by_label_freq(annots) + assert isinstance(out, list) + assert out == expected_sort_inds + + +def copy_dataset_df_files_to_tmp_path_data_dir(dataset_df, dataset_path, config_type, input_type, tmp_path_data_dir): + """Copy all the files in a dataset DataFrame to a `tmp_path_data_dir`, + and change the paths in the Dataframe, so that we can then call + `vak.prep.frame_classification.helper.move_files_into_split_subdirs`.""" + paths_cols = [] + if input_type == 'spect': + paths_cols.append('spect_path') + elif input_type == 'audio': + paths_cols.append('audio_path') + if config_type != 'predict': + paths_cols.append('annot_path') + for paths_col in paths_cols: + paths = dataset_df[paths_col].values + new_paths = [] + for path in paths: + new_path = shutil.copy(src=dataset_path / path, dst=tmp_path_data_dir) + new_paths.append(new_path) + dataset_df[paths_col] = new_paths + return dataset_df + + +@pytest.mark.parametrize( + 'config_type, model_name, audio_format, spect_format, annot_format, input_type', + [ + ('train', 'TweetyNet', 'cbin', None, 'notmat', 'spect'), + ('predict', 'TweetyNet', 'cbin', None, 'notmat', 'spect'), + ('eval', 'TweetyNet', 'cbin', None, 'notmat', 'spect'), + ('train', 'TweetyNet', None, 'mat', 'yarden', 
'spect'), + ('learncurve', 'TweetyNet', 'cbin', None, 'notmat', 'spect'), + # TODO: add audio cases + ] +) +def test_make_splits(config_type, model_name, audio_format, spect_format, annot_format, + input_type, tmp_path, specific_config_toml_path, specific_source_files_with_split_df): + toml_path = specific_config_toml_path( + config_type, + model_name, + annot_format, + audio_format, + spect_format, + ) + cfg = vak.config.parse.from_toml_path(toml_path) + + # ---- set up ---- + tmp_dataset_path = tmp_path / 'dataset_dir' + tmp_dataset_path.mkdir() + + purpose = config_type + + dataset_df = specific_source_files_with_split_df( + config_type, + model_name, + annot_format, + audio_format, + spect_format, + ) + if purpose != "predict": + # TODO: add option to generate predict using existing dataset, so we can get labelmap from it + map_unlabeled_segments = vak.prep.sequence_dataset.has_unlabeled_segments( + dataset_df + ) + labelmap = vak.common.labels.to_map( + cfg.prep.labelset, map_unlabeled=map_unlabeled_segments + ) + else: + labelmap = None + + dataset_df_with_splits = vak.prep.frame_classification.make_splits.make_splits( + dataset_df, + tmp_dataset_path, + cfg.prep.input_type, + purpose, + labelmap, + cfg.prep.audio_format, + ) + assert isinstance(dataset_df_with_splits, pd.DataFrame) + + splits = [ + split + for split in sorted(dataset_df_with_splits.split.dropna().unique()) + if split != "None" + ] + + for split in splits: + split_subdir = tmp_dataset_path / split + if split != 'None': + assert split_subdir.exists() + elif split == 'None': + assert not split_subdir.exists() + + split_df = dataset_df_with_splits[ + dataset_df_with_splits.split == split + ].copy() + + assert vak.datasets.frame_classification.constants.FRAMES_PATH_COL_NAME in split_df.columns + + frames_paths = split_df[ + vak.datasets.frame_classification.constants.FRAMES_PATH_COL_NAME + ].values + + if purpose != "predict": + assert 
vak.datasets.frame_classification.constants.FRAME_LABELS_NPY_PATH_COL_NAME in split_df.columns + + frame_labels_paths = split_df[ + vak.datasets.frame_classification.constants.FRAME_LABELS_NPY_PATH_COL_NAME + ].values + + annots = vak.common.annotation.from_df(split_df) + + frames_tuples = [ + (frames_path, frame_labels_path, annot) + for frames_path, frame_labels_path, annot in zip( + frames_paths, frame_labels_paths, annots + ) + ] + else: + frames_tuples = [ + (frames_path, None, None) + for frames_path in frames_paths + ] + + sample_id_vecs, inds_in_sample_vecs = [], [] + for sample_id, frames_tuple in enumerate(frames_tuples): + frames_path, frame_labels_path, annot = frames_tuple + frames_file_that_should_exist = tmp_dataset_path / frames_path + assert frames_file_that_should_exist.exists() + + # NOTE we load frames to confirm we can and also to make indexing vectors we use to test, + # see next code block + frames = vak.datasets.frame_classification.helper.load_frames(tmp_dataset_path / frames_path, input_type) + assert isinstance(frames, np.ndarray) + + # make indexing vectors that we use to test + n_frames = frames.shape[-1] + sample_id_vecs.append(np.ones((n_frames,)).astype(np.int32) * sample_id) + inds_in_sample_vecs.append(np.arange(n_frames)) + + if frame_labels_path is not None and annot is not None: + frame_labels_file_that_should_exist = tmp_dataset_path / frame_labels_path + assert frame_labels_file_that_should_exist.exists() + + if input_type == "audio": + _, samplefreq = vak.common.constants.AUDIO_FORMAT_FUNC_MAP[ + audio_format + ](tmp_dataset_path / frames_path) + frame_times = np.arange(frames.shape[-1]) / samplefreq + elif input_type == "spect": + spect_dict = vak.common.files.spect.load(tmp_dataset_path / frames_path, "npz") + frame_times = spect_dict[vak.common.constants.TIMEBINS_KEY] + + lbls_int = [labelmap[lbl] for lbl in annot.seq.labels] + expected_frame_labels = vak.transforms.frame_labels.from_segments( + lbls_int, + 
annot.seq.onsets_s, + annot.seq.offsets_s, + frame_times, + unlabeled_label=labelmap["unlabeled"], + ) + frame_labels = np.load(frame_labels_file_that_should_exist) + assert np.array_equal(frame_labels, expected_frame_labels) + + # assert there are no remaining .spect.npz files in dataset path (root) + # because they were moved in to splits, and we removed any remaining that were not put into splits + spect_npz_files_not_in_split = sorted( + tmp_dataset_path.glob(f'*{vak.common.constants.SPECT_NPZ_EXTENSION}') + ) + assert len(spect_npz_files_not_in_split) == 0 + + sample_id_vec_path = ( + split_subdir / + vak.datasets.frame_classification.constants.SAMPLE_IDS_ARRAY_FILENAME + ) + assert sample_id_vec_path.exists() + + expected_sample_id_vec = np.concatenate(sample_id_vecs) + sample_id_vec = np.load(sample_id_vec_path) + assert np.array_equal(sample_id_vec, expected_sample_id_vec) + + inds_in_sample_vec_path = ( + split_subdir / + vak.datasets.frame_classification.constants.INDS_IN_SAMPLE_ARRAY_FILENAME + ) + assert inds_in_sample_vec_path.exists() + + expected_inds_in_sample_vec = np.concatenate(inds_in_sample_vecs) + inds_in_sample_vec = np.load(inds_in_sample_vec_path) + assert np.array_equal(inds_in_sample_vec, expected_inds_in_sample_vec) diff --git a/tests/test_prep/test_prep.py b/tests/test_prep/test_prep.py index 4481af81c..8e995f8bc 100644 --- a/tests/test_prep/test_prep.py +++ b/tests/test_prep/test_prep.py @@ -13,9 +13,7 @@ ("eval", "cbin", None, "notmat", "vak.prep.prep_.prep_frame_classification_dataset"), ("learncurve", "cbin", None, "notmat", "vak.prep.prep_.prep_frame_classification_dataset"), ("predict", "cbin", None, "notmat", "vak.prep.prep_.prep_frame_classification_dataset"), - ("predict", "wav", None, "birdsong-recognition-dataset", "vak.prep.prep_.prep_frame_classification_dataset"), ("train", "cbin", None, "notmat", "vak.prep.prep_.prep_frame_classification_dataset"), - ("train", "wav", None, "birdsong-recognition-dataset", 
"vak.prep.prep_.prep_frame_classification_dataset"), ("train", None, "mat", "yarden", "vak.prep.prep_.prep_frame_classification_dataset"), ], ) @@ -25,7 +23,7 @@ def test_prep( spect_format, annot_format, dataset_prep_func_to_mock, - specific_config, + specific_config_toml_path, default_model, tmp_path, ): @@ -42,7 +40,7 @@ def test_prep( "value": str(output_dir), }, ] - toml_path = specific_config( + toml_path = specific_config_toml_path( config_type=config_type, model=default_model, audio_format=audio_format, diff --git a/tests/test_prep/test_sequence_dataset.py b/tests/test_prep/test_sequence_dataset.py index 80ef3739e..66410204d 100644 --- a/tests/test_prep/test_sequence_dataset.py +++ b/tests/test_prep/test_sequence_dataset.py @@ -8,7 +8,6 @@ 'model_name, config_type, audio_format, spect_format, annot_format, expected_result', [ ("TweetyNet", "train", "cbin", None, "notmat", True), - ("TweetyNet", "train", "wav", None, "birdsong-recognition-dataset", True), ("TweetyNet", "train", None, "mat", "yarden", True), ] ) diff --git a/tests/test_prep/test_spectrogram_dataset/test_prep.py b/tests/test_prep/test_spectrogram_dataset/test_prep.py index 792ff1c85..f72244883 100644 --- a/tests/test_prep/test_spectrogram_dataset/test_prep.py +++ b/tests/test_prep/test_spectrogram_dataset/test_prep.py @@ -101,45 +101,32 @@ def assert_returned_dataframe_matches_expected( ] ) - # test that all the generated spectrogram files are in a - # newly-created directory inside spect_output_dir + # test that all the generated spectrogram files are in spect_output_dir assert all( [ - spect_path.parents[1] == spect_output_dir + spect_path.parents[0] == spect_output_dir for spect_path in spect_paths_from_df ] ) elif spect_format: # implies that --> we made the dataframe from spect files - if spect_format == 'mat': - expected_spect_file_names = [ - spect_path.name.replace('.mat', '.npz') - for spect_path in expected_spect_paths - ] - else: - expected_spect_file_names = [ - spect_path.name for 
spect_path in expected_spect_paths - ] + expected_spect_file_names = [ + spect_path.name for spect_path in expected_spect_paths + ] assert all( - [expected_spect_file_name in spect_file_names_from_df - for expected_spect_file_name in expected_spect_file_names] + [spect_file_name_from_df in expected_spect_file_names + for spect_file_name_from_df in spect_file_names_from_df] ) # test that **only** expected paths were in DataFrame if not_expected_spect_paths is not None: - if spect_format == 'mat': - not_expected_spect_file_names = [ - spect_path.name.replace('.mat', '.npz') - for spect_path in not_expected_spect_paths - ] - else: - not_expected_spect_file_names = [ - spect_path.name for spect_path in not_expected_spect_paths - ] + not_expected_spect_file_names = [ + spect_path.name for spect_path in not_expected_spect_paths + ] assert all( - [not_expected_spect_file_name not in spect_file_names_from_df - for not_expected_spect_file_name in not_expected_spect_file_names] + [spect_file_name_from_df not in not_expected_spect_file_names + for spect_file_name_from_df in spect_file_names_from_df] ) diff --git a/tests/test_prep/test_spectrogram_dataset/test_spect_helper.py b/tests/test_prep/test_spectrogram_dataset/test_spect_helper.py index 77babd444..a5864b47f 100644 --- a/tests/test_prep/test_spectrogram_dataset/test_spect_helper.py +++ b/tests/test_prep/test_spectrogram_dataset/test_spect_helper.py @@ -29,35 +29,23 @@ def assert_expected_spect_paths_in_dataframe( of paths to spectrogram files, that should **not** be in dataset_df.spect_path column """ spect_file_names_from_df = [spect_path.name for spect_path in spect_paths_from_df] - - if spect_format == 'mat': - expected_spectfile_names = [ - spect_path.name.replace('.mat', '.npz') - for spect_path in expected_spect_paths - ] - else: - expected_spectfile_names = [ - spect_path.name for spect_path in expected_spect_paths - ] + expected_spectfile_names = [ + spect_path.name for spect_path in expected_spect_paths + ] 
assert all( - [expected_spect_file in spect_file_names_from_df for expected_spect_file in expected_spectfile_names] + [spect_file_name_from_df in expected_spectfile_names + for spect_file_name_from_df in spect_file_names_from_df] ) # test that **only** expected paths were in DataFrame if not_expected_spect_paths is not None: - if spect_format == 'mat': - not_expected_spectfile_names = [ - spect_path.name.replace('.mat', '.npz') - for spect_path in not_expected_spect_paths - ] - else: - not_expected_spectfile_names = [ - spect_path.name for spect_path in not_expected_spect_paths - ] + not_expected_spectfile_names = [ + spect_path.name for spect_path in not_expected_spect_paths + ] assert all( - [not_expected_spect_file not in spect_file_names_from_df - for not_expected_spect_file in not_expected_spectfile_names] + [spect_file_name_from_df not in not_expected_spectfile_names + for spect_file_name_from_df in spect_file_names_from_df] ) @@ -119,7 +107,6 @@ def test_make_dataframe_of_spect_files( spect_format=spect_format, spect_dir=spect_dir, spect_files=spect_files, - spect_output_dir=spect_output_dir, labelset=labelset, annot_list=annot_list, annot_format=annot_format, @@ -144,13 +131,10 @@ def test_make_dataframe_of_spect_files( expected_spect_list, not_expected_spect_list ) - if spect_format == 'mat': - expected_parent = spect_output_dir - else: - expected_parent = specific_spect_dir(spect_format) - assert all( - [spect_path.parent == expected_parent for spect_path in spect_paths_from_df] - ) + if spect_dir is not None: + assert all( + [spect_path.parent == spect_dir for spect_path in spect_paths_from_df] + ) def test_make_dataframe_of_spect_files_no_spect_dir_files_or_map_raises(annot_list_yarden): diff --git a/tests/test_train/test_frame_classification.py b/tests/test_train/test_frame_classification.py index c2ef7f201..9cddd7e17 100644 --- a/tests/test_train/test_frame_classification.py +++ b/tests/test_train/test_frame_classification.py @@ -38,12 +38,11 @@ def 
assert_train_output_matches_expected(cfg: vak.config.config.Config, model_na "model_name, audio_format, spect_format, annot_format", [ ("TweetyNet", "cbin", None, "notmat"), - ("TweetyNet", "wav", None, "birdsong-recognition-dataset"), ("TweetyNet", None, "mat", "yarden"), ], ) def test_train_frame_classification_model( - model_name, audio_format, spect_format, annot_format, specific_config, tmp_path, device + model_name, audio_format, spect_format, annot_format, specific_config_toml_path, tmp_path, device ): results_path = vak.common.paths.generate_results_dir_name_as_path(tmp_path) results_path.mkdir() @@ -51,7 +50,7 @@ def test_train_frame_classification_model( {"section": "TRAIN", "option": "device", "value": device}, {"section": "TRAIN", "option": "root_results_dir", "value": results_path} ] - toml_path = specific_config( + toml_path = specific_config_toml_path( config_type="train", model=model_name, audio_format=audio_format, @@ -92,12 +91,11 @@ def test_train_frame_classification_model( "model_name, audio_format, spect_format, annot_format", [ ("TweetyNet", "cbin", None, "notmat"), - ("TweetyNet", "wav", None, "birdsong-recognition-dataset"), ("TweetyNet", None, "mat", "yarden"), ], ) def test_continue_training( - model_name, audio_format, spect_format, annot_format, specific_config, tmp_path, device + model_name, audio_format, spect_format, annot_format, specific_config_toml_path, tmp_path, device ): results_path = vak.common.paths.generate_results_dir_name_as_path(tmp_path) results_path.mkdir() @@ -105,7 +103,7 @@ def test_continue_training( {"section": "TRAIN", "option": "device", "value": device}, {"section": "TRAIN", "option": "root_results_dir", "value": results_path} ] - toml_path = specific_config( + toml_path = specific_config_toml_path( config_type="train_continue", model=model_name, audio_format=audio_format, @@ -149,7 +147,7 @@ def test_continue_training( ] ) def test_train_raises_file_not_found( - path_option_to_change, specific_config, 
tmp_path, device + path_option_to_change, specific_config_toml_path, tmp_path, device ): """Test that pre-conditions in `vak.train` raise FileNotFoundError when one of the following does not exist: @@ -159,7 +157,7 @@ def test_train_raises_file_not_found( {"section": "TRAIN", "option": "device", "value": device}, path_option_to_change ] - toml_path = specific_config( + toml_path = specific_config_toml_path( config_type="train", model="TweetyNet", audio_format="cbin", @@ -204,7 +202,7 @@ def test_train_raises_file_not_found( ] ) def test_train_raises_not_a_directory( - path_option_to_change, specific_config, device, tmp_path + path_option_to_change, specific_config_toml_path, device, tmp_path ): """Test that core.train raises NotADirectory when directory does not exist @@ -214,7 +212,7 @@ def test_train_raises_not_a_directory( {"section": "TRAIN", "option": "device", "value": device}, ] - toml_path = specific_config( + toml_path = specific_config_toml_path( config_type="train", model="TweetyNet", audio_format="cbin", diff --git a/tests/test_train/test_parametric_umap.py b/tests/test_train/test_parametric_umap.py index 11e5f709c..a64516e0a 100644 --- a/tests/test_train/test_parametric_umap.py +++ b/tests/test_train/test_parametric_umap.py @@ -35,7 +35,7 @@ def assert_train_output_matches_expected(cfg: vak.config.config.Config, model_na ) def test_train_parametric_umap_model( model_name, audio_format, spect_format, annot_format, - specific_config, tmp_path, device + specific_config_toml_path, tmp_path, device ): results_path = vak.common.paths.generate_results_dir_name_as_path(tmp_path) results_path.mkdir() @@ -43,7 +43,7 @@ def test_train_parametric_umap_model( {"section": "TRAIN", "option": "device", "value": device}, {"section": "TRAIN", "option": "root_results_dir", "value": results_path} ] - toml_path = specific_config( + toml_path = specific_config_toml_path( config_type="train", model=model_name, audio_format=audio_format, @@ -83,7 +83,7 @@ def 
test_train_parametric_umap_model( ] ) def test_train_parametric_umap_model_raises_file_not_found( - path_option_to_change, specific_config, tmp_path, device + path_option_to_change, specific_config_toml_path, tmp_path, device ): """Test that pre-conditions in :func:`vak.train.parametric_umap.train_parametric_umap_model` raise FileNotFoundError when one of the following does not exist: @@ -93,7 +93,7 @@ def test_train_parametric_umap_model_raises_file_not_found( {"section": "TRAIN", "option": "device", "value": device}, path_option_to_change ] - toml_path = specific_config( + toml_path = specific_config_toml_path( config_type="train", model="ConvEncoderUMAP", audio_format="cbin", @@ -135,7 +135,7 @@ def test_train_parametric_umap_model_raises_file_not_found( ] ) def test_train_parametric_umap_model_raises_not_a_directory( - path_option_to_change, specific_config, device, tmp_path + path_option_to_change, specific_config_toml_path, device, tmp_path ): """Test that core.train raises NotADirectory when directory does not exist @@ -145,7 +145,7 @@ def test_train_parametric_umap_model_raises_not_a_directory( {"section": "TRAIN", "option": "device", "value": device}, ] - toml_path = specific_config( + toml_path = specific_config_toml_path( config_type="train", model="ConvEncoderUMAP", audio_format="cbin", diff --git a/tests/test_train/test_train.py b/tests/test_train/test_train.py index c43e6631f..559853a24 100644 --- a/tests/test_train/test_train.py +++ b/tests/test_train/test_train.py @@ -14,8 +14,6 @@ [ ("cbin", None, "notmat", "TweetyNet", 'vak.train.train_.train_frame_classification_model'), - ("wav", None, "birdsong-recognition-dataset", "TweetyNet", - 'vak.train.train_.train_frame_classification_model'), (None, "mat", "yarden", "TweetyNet", 'vak.train.train_.train_frame_classification_model'), ("cbin", None, "notmat", "ConvEncoderUMAP", @@ -24,7 +22,7 @@ ) def test_train( audio_format, spect_format, annot_format, model_name, train_function_to_mock, - 
specific_config, tmp_path + specific_config_toml_path, tmp_path ): """Test that :func:`vak.train.train` dispatches to the correct model-specific training functions""" @@ -40,7 +38,7 @@ def test_train( {"section": "TRAIN", "option": "device", "value": 'cpu'}, ] - toml_path = specific_config( + toml_path = specific_config_toml_path( config_type="train", model=model_name, audio_format=audio_format, diff --git a/tests/test_transforms/test_transforms.py b/tests/test_transforms/test_transforms.py index a90270388..b71a42749 100644 --- a/tests/test_transforms/test_transforms.py +++ b/tests/test_transforms/test_transforms.py @@ -60,26 +60,19 @@ def test_fit_dataset_path(self, split, train_cbin_notmat_df, annot_format="notmat" ) - dataset_csv_path = specific_dataset_csv_path( - config_type="train", - model="TweetyNet", - audio_format="cbin", - annot_format="notmat" - ) - if split is None: split_to_test = 'train' else: split_to_test = split # ---- set up df_split = train_cbin_notmat_df[train_cbin_notmat_df.split == split_to_test].copy() - spect_paths = df_split['spect_path'].values - spect = vak.common.files.spect.load(dataset_path / spect_paths[0])['s'] + spect_paths = df_split['frames_path'].values + spect = vak.common.files.spect.load(dataset_path / spect_paths[0])[vak.common.constants.SPECT_KEY] mean_freqs = np.mean(spect, axis=1) std_freqs = np.std(spect, axis=1) for spect_path in spect_paths[1:]: - spect = vak.common.files.spect.load(dataset_path / spect_path)['s'] + spect = vak.common.files.spect.load(dataset_path / spect_path)[vak.common.constants.SPECT_KEY] mean_freqs += np.mean(spect, axis=1) std_freqs += np.std(spect, axis=1) expected_mean_freqs = mean_freqs / len(spect_paths)