Merge pull request #3396 from samuelgarcia/refactor_GTStudy

Start a new module for benchmark. Implement SorterStudy that replace GroundTruthStudy
SpikeInterface · Oct 7, 2024 · 3522a07 · 3522a07
2 parents fbbb89f + 56fff23
commit 3522a07
Show file tree

Hide file tree

Showing 32 changed files with 835 additions and 1,107 deletions.
diff --git a/doc/images/overview.png b/doc/images/overview.png
diff --git a/doc/modules/benchmark.rst b/doc/modules/benchmark.rst
@@ -0,0 +1,141 @@
+Benchmark module
+================
+
+This module contains machinery to compare sorters against ground truth in many different situations.
+
+
+.. note::
+
+    In version 0.102.0, the previous :py:func:`~spikeinterface.comparison.GroundTruthStudy()` has been replaced by
+    :py:func:`~spikeinterface.benchmark.SorterStudy()`
+
+
+This module also aims to benchmark sorting components (detection, clustering, motion, template matching) using the
+same base class :py:func:`~spikeinterface.benchmark.BenchmarkStudy()` but specialized to a targeted component.
+
+By design, the main class handles the concept of "levels": this allows comparing several complexities at the same time.
+For instance, compare kilosort4 vs kilosort2.5 (level 0) for different noise amplitudes (level 1) combined with
+several motion vectors (level 2).
+
+**Example: compare many sorters : a ground truth study**
+
+We have a high level class to compare many sorters against ground truth: :py:func:`~spikeinterface.benchmark.SorterStudy()`
+
+
+A study is a systematic performance comparison of several ground truth recordings with several sorters or several cases
+like the different parameter sets.
+
+The study class proposes high-level tool functions to run many ground truth comparisons with many "cases"
+on many recordings and then collect and aggregate results in an easy way.
+
+The whole mechanism is based on an intrinsic organization into a "study_folder" with several subfolders:
+
+  * datasets: contains ground truth datasets
+  * sorters : contains outputs of sorters
+  * sortings: contains light copies of all sortings
+  * metrics: contains metrics
+  * ...
+
+
+.. code-block:: python
+
+    import matplotlib.pyplot as plt
+    import seaborn as sns
+
+    import spikeinterface.extractors as se
+    import spikeinterface.widgets as sw
+    from spikeinterface.benchmark import SorterStudy
+
+
+    # generate 2 simulated datasets (could be also mearec files)
+    rec0, gt_sorting0 = generate_ground_truth_recording(num_channels=4, durations=[30.], seed=42)
+    rec1, gt_sorting1 = generate_ground_truth_recording(num_channels=4, durations=[30.], seed=91)
+
+    datasets = {
+        "toy0": (rec0, gt_sorting0),
+        "toy1": (rec1, gt_sorting1),
+    }
+
+    # define some "cases" here we want to test tridesclous2 on 2 datasets and spykingcircus2 on one dataset
+    # so it is a two level study (sorter_name, dataset)
+    # this could be more complicated like (sorter_name, dataset, params)
+    cases = {
+        ("tdc2", "toy0"): {
+            "label": "tridesclous2 on tetrode0",
+            "dataset": "toy0",
+            "params": {"sorter_name": "tridesclous2"}
+        },
+        ("tdc2", "toy1"): {
+            "label": "tridesclous2 on tetrode1",
+            "dataset": "toy1",
+            "params": {"sorter_name": "tridesclous2"}
+        },
+        ("sc", "toy0"): {
+            "label": "spykingcircus2 on tetrode0",
+            "dataset": "toy0",
+            "params": {
+                "sorter_name": "spykingcircus",
+                "docker_image": True
+            },
+        },
+    }
+    # this initializes a folder
+    study = SorterStudy.create(study_folder=study_folder, datasets=datasets, cases=cases,
+                                    levels=["sorter_name", "dataset"])
+
+
+    # This internally calls run_sorter() for all cases in one function
+    study.run()
+
+    # Run the benchmark: this internally calls compare_sorter_to_ground_truth() for all cases
+    study.compute_results()
+
+    # Collect comparisons one by one
+    for case_key in study.cases:
+        print('*' * 10)
+        print(case_key)
+        # raw counting of tp/fp/...
+        comp = study.get_result(case_key)["gt_comparison"]
+        # summary
+        comp.print_summary()
+        perf_unit = comp.get_performance(method='by_unit')
+        perf_avg = comp.get_performance(method='pooled_with_average')
+        # some plots
+        m = comp.get_confusion_matrix()
+        w_comp = sw.plot_agreement_matrix(sorting_comparison=comp)
+
+    # Collect summary dataframes and display
+    # As shown previously, the performance is returned as a pandas dataframe.
+    # The study.get_performance_by_unit() method
+    # gathers all the outputs in the study folder and merges them into a single dataframe.
+    # Same idea for study.get_count_units()
+
+    # this is a dataframe
+    perfs = study.get_performance_by_unit()
+
+    # this is a dataframe
+    unit_counts = study.get_count_units()
+
+    # The study also has several plotting methods to display the results
+    study.plot_agreement_matrix()
+    study.plot_unit_counts()
+    study.plot_performances(mode="ordered")
+    study.plot_performances(mode="snr")
+
+
+
+
+Benchmark spike collisions
+--------------------------
+
+SpikeInterface also has a specific toolset to benchmark how good sorters are at recovering spikes in "collision".
+
+We have two classes to handle collision-specific comparisons, and also to quantify the effects on correlogram
+estimation:
+
+  * :py:class:`~spikeinterface.comparison.CollisionGTComparison`
+  * :py:class:`~spikeinterface.comparison.CorrelogramGTComparison`
+
+For more details, check out the following paper:
+
+`Samuel Garcia, Alessio P. Buccino and Pierre Yger. "How Do Spike Collisions Affect Spike Sorting Performance?" <https://doi.org/10.1523/ENEURO.0105-22.2022>`_
diff --git a/doc/modules/comparison.rst b/doc/modules/comparison.rst
@@ -5,6 +5,10 @@ Comparison module
 SpikeInterface has a :py:mod:`~spikeinterface.comparison` module, which contains functions and tools to compare
 spike trains and templates (useful for tracking units over multiple sessions).
 
+.. note::
+
+    In version 0.102.0 the benchmark part of comparison has moved to the new :py:mod:`~spikeinterface.benchmark` module.
+
 In addition, the :py:mod:`~spikeinterface.comparison` module contains advanced benchmarking tools to evaluate
 the effects of spike collisions on spike sorting results, and to construct hybrid recordings for comparison.
 
@@ -242,135 +246,6 @@ An **over-merged** unit has a relatively high agreement (>= 0.2 by default) for
 
     cmp_gt_HS.get_redundant_units(redundant_score=0.2)
 
-
-**Example: compare many sorters with a Ground Truth Study**
-
-We also have a high level class to compare many sorters against ground truth:
-:py:func:`~spikeinterface.comparison.GroundTruthStudy()`
-
-A study is a systematic performance comparison of several ground truth recordings with several sorters or several cases
-like the different parameter sets.
-
-The study class proposes high-level tool functions to run many ground truth comparisons with many "cases"
-on many recordings and then collect and aggregate results in an easy way.
-
-The all mechanism is based on an intrinsic organization into a "study_folder" with several subfolders:
-
-  * datasets: contains ground truth datasets
-  * sorters : contains outputs of sorters
-  * sortings: contains light copy of all sorting
-  * metrics: contains metrics
-  * ...
-
-
-.. code-block:: python
-
-    import matplotlib.pyplot as plt
-    import seaborn as sns
-
-    import spikeinterface.extractors as se
-    import spikeinterface.widgets as sw
-    from spikeinterface.comparison import GroundTruthStudy
-
-
-    # generate 2 simulated datasets (could be also mearec files)
-    rec0, gt_sorting0 = generate_ground_truth_recording(num_channels=4, durations=[30.], seed=42)
-    rec1, gt_sorting1 = generate_ground_truth_recording(num_channels=4, durations=[30.], seed=91)
-
-    datasets = {
-        "toy0": (rec0, gt_sorting0),
-        "toy1": (rec1, gt_sorting1),
-    }
-
-    # define some "cases" here we want to test tridesclous2 on 2 datasets and spykingcircus2 on one dataset
-    # so it is a two level study (sorter_name, dataset)
-    # this could be more complicated like (sorter_name, dataset, params)
-    cases = {
-        ("tdc2", "toy0"): {
-            "label": "tridesclous2 on tetrode0",
-            "dataset": "toy0",
-            "run_sorter_params": {
-                "sorter_name": "tridesclous2",
-            },
-        },
-        ("tdc2", "toy1"): {
-            "label": "tridesclous2 on tetrode1",
-            "dataset": "toy1",
-            "run_sorter_params": {
-                "sorter_name": "tridesclous2",
-            },
-        },
-
-        ("sc", "toy0"): {
-            "label": "spykingcircus2 on tetrode0",
-            "dataset": "toy0",
-            "run_sorter_params": {
-                "sorter_name": "spykingcircus",
-                "docker_image": True
-            },
-        },
-    }
-    # this initilizes a folder
-    study = GroundTruthStudy.create(study_folder=study_folder, datasets=datasets, cases=cases,
-                                    levels=["sorter_name", "dataset"])
-
-
-    # all cases in one function
-    study.run_sorters()
-
-    # Collect comparisons
-    #
-    # You can collect in one shot all results and run the
-    # GroundTruthComparison on it.
-    # So you can have fine access to all individual results.
-    #
-    # Note: use exhaustive_gt=True when you know exactly how many
-    # units in the ground truth (for synthetic datasets)
-
-    # run all comparisons and loop over the results
-    study.run_comparisons(exhaustive_gt=True)
-    for key, comp in study.comparisons.items():
-        print('*' * 10)
-        print(key)
-        # raw counting of tp/fp/...
-        print(comp.count_score)
-        # summary
-        comp.print_summary()
-        perf_unit = comp.get_performance(method='by_unit')
-        perf_avg = comp.get_performance(method='pooled_with_average')
-        # some plots
-        m = comp.get_confusion_matrix()
-        w_comp = sw.plot_agreement_matrix(sorting_comparison=comp)
-
-    # Collect synthetic dataframes and display
-    # As shown previously, the performance is returned as a pandas dataframe.
-    # The spikeinterface.comparison.get_performance_by_unit() function,
-    # gathers all the outputs in the study folder and merges them into a single dataframe.
-    # Same idea for spikeinterface.comparison.get_count_units()
-
-    # this is a dataframe
-    perfs = study.get_performance_by_unit()
-
-    # this is a dataframe
-    unit_counts = study.get_count_units()
-
-    # we can also access run times
-    run_times = study.get_run_times()
-    print(run_times)
-
-    # Easy plotting with seaborn
-    fig1, ax1 = plt.subplots()
-    sns.barplot(data=run_times, x='rec_name', y='run_time', hue='sorter_name', ax=ax1)
-    ax1.set_title('Run times')
-
-    ##############################################################################
-
-    fig2, ax2 = plt.subplots()
-    sns.swarmplot(data=perfs, x='sorter_name', y='recall', hue='rec_name', ax=ax2)
-    ax2.set_title('Recall')
-    ax2.set_ylim(-0.1, 1.1)
-
-
 .. _symmetric:
 
 2. Compare the output of two spike sorters (symmetric comparison)
@@ -537,35 +412,3 @@ sorting analyzers from day 1 (:code:`analyzer_day1`) to day 5 (:code:`analyzer_d
     # match all
     m_tcmp = sc.compare_multiple_templates(waveform_list=analyzer_list,
                                            name_list=["D1", "D2", "D3", "D4", "D5"])
-
-
-
-Benchmark spike collisions
---------------------------
-
-SpikeInterface also has a specific toolset to benchmark how well sorters are at recovering spikes in "collision".
-
-We have three classes to handle collision-specific comparisons, and also to quantify the effects on correlogram
-estimation:
-
-  * :py:class:`~spikeinterface.comparison.CollisionGTComparison`
-  * :py:class:`~spikeinterface.comparison.CorrelogramGTComparison`
-  * :py:class:`~spikeinterface.comparison.CollisionGTStudy`
-  * :py:class:`~spikeinterface.comparison.CorrelogramGTStudy`
-
-For more details, checkout the following paper:
-
-`Samuel Garcia, Alessio P. Buccino and Pierre Yger. "How Do Spike Collisions Affect Spike Sorting Performance?" <https://doi.org/10.1523/ENEURO.0105-22.2022>`_
-
-
-Hybrid recording
-----------------
-
-To benchmark spike sorting results, we need ground-truth spiking activity.
-This can be generated with artificial simulations, e.g., using `MEArec <https://mearec.readthedocs.io/>`_, or
-alternatively by generating so-called "hybrid" recordings.
-
-The :py:mod:`~spikeinterface.comparison` module includes functions to generate such "hybrid" recordings:
-
-  * :py:func:`~spikeinterface.comparison.create_hybrid_units_recording`: add new units to an existing recording
-  * :py:func:`~spikeinterface.comparison.create_hybrid_spikes_recording`: add new spikes to existing units in a recording
diff --git a/src/spikeinterface/benchmark/__init__.py b/src/spikeinterface/benchmark/__init__.py
@@ -0,0 +1,7 @@
+"""
+Module to benchmark:
+  * sorters
+  * some sorting components (clustering, motion, template matching)
+"""
+
+from .benchmark_sorter import SorterStudy
diff --git a/...ngcomponents/benchmark/benchmark_tools.py → ...pikeinterface/benchmark/benchmark_base.py b/...ngcomponents/benchmark/benchmark_tools.py → ...pikeinterface/benchmark/benchmark_base.py
@@ -131,7 +131,7 @@ def create(cls, study_folder, datasets={}, cases={}, levels=None):
 
         return cls(study_folder)
 
-    def create_benchmark(self):
+    def create_benchmark(self, key):
         raise NotImplementedError
 
     def scan_folder(self):
@@ -258,25 +258,9 @@ def get_run_times(self, case_keys=None):
         return df
 
     def plot_run_times(self, case_keys=None):
-        if case_keys is None:
-            case_keys = list(self.cases.keys())
-        run_times = self.get_run_times(case_keys=case_keys)
-
-        colors = self.get_colors()
-        import matplotlib.pyplot as plt
+        from .benchmark_plot_tools import plot_run_times
 
-        fig, ax = plt.subplots()
-        labels = []
-        for i, key in enumerate(case_keys):
-            labels.append(self.cases[key]["label"])
-            rt = run_times.at[key, "run_times"]
-            ax.bar(i, rt, width=0.8, color=colors[key])
-        ax.set_xticks(np.arange(len(case_keys)))
-        ax.set_xticklabels(labels, rotation=45.0)
-        return fig
-
-        # ax = run_times.plot(kind="bar")
-        # return ax.figure
+        return plot_run_times(self, case_keys=case_keys)
 
     def compute_results(self, case_keys=None, verbose=False, **result_params):
         if case_keys is None:
@@ -462,10 +446,3 @@ def run(self):
     def compute_result(self):
         # run becnhmark result
         raise NotImplementedError
-
-
-def _simpleaxis(ax):
-    ax.spines["top"].set_visible(False)
-    ax.spines["right"].set_visible(False)
-    ax.get_xaxis().tick_bottom()
-    ax.get_yaxis().tick_left()