From 52ac6a0d1028e7b0e6401022f8e02d6154ef3d68 Mon Sep 17 00:00:00 2001
From: chrishalcrow <57948917+chrishalcrow@users.noreply.github.com>
Date: Wed, 11 Dec 2024 11:44:21 +0000
Subject: [PATCH] Update docs

---
 .../curation/plot_1_automated_curation.py | 129 ++++++++++--------
 .../curation/plot_2_train_a_model.py      |  14 +-
 .../curation/plot_3_upload_a_model.py     |   7 +-
 3 files changed, 79 insertions(+), 71 deletions(-)

diff --git a/examples/tutorials/curation/plot_1_automated_curation.py b/examples/tutorials/curation/plot_1_automated_curation.py
index b408ee0e53..9391a8736d 100644
--- a/examples/tutorials/curation/plot_1_automated_curation.py
+++ b/examples/tutorials/curation/plot_1_automated_curation.py
@@ -29,7 +29,6 @@
 import spikeinterface.curation as sc
 import spikeinterface.widgets as sw
 
-
 # note: you can use more cores using e.g.
 # si.set_global_jobs_kwargs(n_jobs = 8)
 
@@ -50,15 +49,16 @@
 ##############################################################################
-# This model was trained on artifically generated tetrode data. The model object has a nice html
-# representation, which will appear if you're using a Jupyter notebook.
+# This model was trained on artificially generated tetrode data. There are also models trained
+# on real data, like the one discussed `below <#A-model-trained-on-real-Neuropixels-data>`_.
+# Each model object has a nice html representation, which will appear if you're using a Jupyter notebook.
 
 model
 
 ##############################################################################
 # This tells us more information about the model. The one we've just downloaded was trained using
-# a `RandomForestClassifier`. You can also discover this information by running
-# `model.get_params()`. The model object (an `sklearn Pipeline `_) also contains information
+# a ``RandomForestClassifier``. You can also discover this information by running
+# ``model.get_params()``. The model object (an `sklearn Pipeline <https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html>`_) also contains information
 # about which metrics were used to train the model. We can access them from the model (or from the model_info)
 
 print(model.feature_names_in_)
 
@@ -81,10 +81,9 @@
 print(set(all_metric_names) == set(model.feature_names_in_))
 
 ##############################################################################
-# Great! We can now use the model to predict labels. You can either pass a HuggingFace repo, or a
-# local folder containing the model file and the ``model_info.json`` file. Here, we'll pass
-# a repo. The function returns a dictionary containing a label and a confidence for each unit
-# contained in the ``sorting_analyzer``.
+# Great! We can now use the model to predict labels. Here, we pass the HF repo id directly
+# to the ``auto_label_units`` function. This returns a dataframe containing a label and
+# a confidence for each unit contained in the ``sorting_analyzer``.
 
 labels = sc.auto_label_units(
     sorting_analyzer = sorting_analyzer,
@@ -96,8 +95,8 @@
 ##############################################################################
-# The model has labelled one unit as bad. Let's look at that one, and the 'good' unit with the highest
-# confidence of being 'good'.
+# The model has labelled one unit as bad. Let's look at that one, and also the 'good' unit
+# with the highest confidence of being 'good'.
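+#
+# As a quick sketch, you could also pick these two units out programmatically rather than
+# by eye. This assumes ``labels`` is a dataframe indexed by unit id with a ``prediction``
+# column (as used later in this tutorial); the ``probability`` column name is an assumption:
+#
+# .. code-block::
+#
+#    bad_unit = labels[labels['prediction'] == 'bad'].index[0]
+#    best_good_unit = labels[labels['prediction'] == 'good']['probability'].idxmax()
+#    sw.plot_unit_templates(sorting_analyzer, unit_ids=[bad_unit, best_good_unit])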
 sw.plot_unit_templates(sorting_analyzer, unit_ids=[7,9])
 
@@ -202,73 +201,83 @@ def calculate_moving_avg(label_df, confidence_label, window_size):
 # In this case, you might decide to only trust labels which had confidence above 0.88,
 # and manually label the ones the model isn't so confident about.
 #
-# Using other models, e.g. one trained on Neuropixels data
-# ------------------------------------------
+# A model trained on real Neuropixels data
+# ----------------------------------------
 #
 # Above, we used a toy model trained on generated data. There are also models on HuggingFace
 # trained on real data.
 #
-# For example, the following classifier is trained on Neuropixels data from 11 mice recorded in
-# V1,SC and ALM: https://huggingface.co/AnoushkaJain3/curation_machine_learning_models
-#
-# Go to that page, and take a look at the ``Files``. The models are contained in the
-# `skops files `_ and there are *two* in this repo.
-# We can choose which to load in the ``load_model`` function as follows:
+# For example, the following classifiers are trained on Neuropixels data from 11 mice recorded in
+# V1, SC and ALM: https://huggingface.co/AnoushkaJain3/noise_neural_classifier/ and
+# https://huggingface.co/AnoushkaJain3/sua_mua_classifier/ . One will classify units into
+# ``noise`` or ``not-noise``, and the other will classify the ``not-noise`` units into single
+# unit activity (sua) units and multi-unit activity (mua) units.
 #
-# .. code-block::
-#
-#    import spikeinterface.curation as sc
-#    model, model_info = sc.load_model(
-#        repo_id = "AnoushkaJain3/curation_machine_learning_models",
-#        model_name= 'noise_neuron_model.skops',
-#    )
+# There is more information about the models on their HuggingFace pages. Take a look!
+# The idea here is to first apply the noise/not-noise classifier, then the sua/mua one.
+# We can do so as follows:
 #
-# If you run this locally you will receive an error:
+
+import pandas as pd
+
+# Apply the noise/not-noise model
+noise_neuron_labels = sc.auto_label_units(
+    sorting_analyzer = sorting_analyzer,
+    repo_id = "AnoushkaJain3/noise_neural_classifier",
+    trust_model=True,
+)
+
+# Remove the units labelled as noise, keeping only the neural units
+noise_units = noise_neuron_labels[noise_neuron_labels['prediction']=='noise']
+analyzer_neural = sorting_analyzer.remove_units(noise_units.index)
+
+# Apply the sua/mua model to the remaining, non-noise units
+sua_mua_labels = sc.auto_label_units(
+    sorting_analyzer = analyzer_neural,
+    repo_id = "AnoushkaJain3/sua_mua_classifier",
+    trust_model=True,
+)
+
+all_labels = pd.concat([sua_mua_labels, noise_units]).sort_index()
+print(all_labels)
+
+##############################################################################
+# If you run this without the ``trust_model=True`` parameter, you will receive an error:
 #
 # .. code-block::
 #
 #    UntrustedTypesFoundException: Untrusted types found in the file: ['sklearn.metrics._classification.balanced_accuracy_score', 'sklearn.metrics._scorer._Scorer', 'sklearn.model_selection._search_successive_halving.HalvingGridSearchCV', 'sklearn.model_selection._split.StratifiedKFold']
 #
-# This is a security warning. Sharing models, with are Python objects, is complicated.
-# We have chosen to use the `skops format `_, instead
-# of the common but insecure ``.pkl`` format (read about ``pickle`` security issues
-# `here `_). While unpacking the ``.skops`` file, each function
-# is checked. Ideally, skops should recognise all `sklearn`, `numpy` and `scipy` functions and
-# allow the object to be loaded if it only contains these (and no unkown malicious code). But
-# when ``skops`` it's not sure, it raises an error.
-# Here, it doesn't recognise
-# ``['sklearn.metrics._classification.balanced_accuracy_score', 'sklearn.metrics._scorer._Scorer',
-# 'sklearn.model_selection._search_successive_halving.HalvingGridSearchCV',
-# 'sklearn.model_selection._split.StratifiedKFold']``. Taking a look, these are all functions
-# from `sklearn`, and we can happily add them to the ``trusted`` functions to load:
+# This is a security warning, which can be overcome by passing the trusted types list
+# ``trusted = ['sklearn.metrics._classification.balanced_accuracy_score', 'sklearn.metrics._scorer._Scorer', 'sklearn.model_selection._search_successive_halving.HalvingGridSearchCV', 'sklearn.model_selection._split.StratifiedKFold']``
+# or by passing the ``trust_model=True`` keyword.
 #
-# .. code-block::
-#
-#    model, model_info = sc.load_model(
-#        model_name = 'noise_neuron_model.skops',
-#        repo_id = "AnoushkaJain3/curation_machine_learning_models",
-#        trusted = ['sklearn.metrics._classification.balanced_accuracy_score', 'sklearn.metrics._scorer._Scorer', 'sklearn.model_selection._search_successive_halving.HalvingGridSearchCV', 'sklearn.model_selection._split.StratifiedKFold']
-#    )
-#
-# As ``skops`` continues to be developed, we hope more of these functions will be :code:`trusted`
-# by default.
+# .. dropdown:: More about security
 #
-# If you unequivocally trust the model (e.g. if you have created it), you can bypass this security
-# step by passing ``trust_model = True`` to the ``load_model`` function.
-#
-# In general, you should be cautious when downloading ``.skops`` files and ``.pkl`` files from repos,
-# especially from unknown sources.
+#    Sharing models, which are Python objects, is complicated.
+#    We have chosen to use the `skops format <https://skops.readthedocs.io/>`_, instead
+#    of the common but insecure ``.pkl`` format (read about ``pickle`` security issues
+#    `here <https://docs.python.org/3/library/pickle.html>`_). While unpacking the ``.skops`` file, each function
+#    is checked. Ideally, skops should recognise all ``sklearn``, ``numpy`` and ``scipy`` functions and
+#    allow the object to be loaded if it only contains these (and no unknown malicious code). But
+#    when ``skops`` isn't sure, it raises an error. Here, it doesn't recognise
+#    ``['sklearn.metrics._classification.balanced_accuracy_score', 'sklearn.metrics._scorer._Scorer',
+#    'sklearn.model_selection._search_successive_halving.HalvingGridSearchCV',
+#    'sklearn.model_selection._split.StratifiedKFold']``. Taking a look, these are all functions
+#    from ``sklearn``, and we can happily add them to the ``trusted`` functions to load.
 #
+#    In general, you should be cautious when downloading ``.skops`` files and ``.pkl`` files from repos,
+#    especially from unknown sources.
 #
 # Directly applying a sklearn Pipeline
 # ------------------------------------
 #
-# Instead of using ``HuggingFace`` and ``skops``, you might have another way of receiving a sklearn
-# pipeline, and want to apply it to your sorted data.
-
-from spikeinterface.curation.model_based_curation import ModelBasedClassification
-
-model_based_classification = ModelBasedClassification(sorting_analyzer, model)
-labels = model_based_classification.predict_labels()
-labels
+# Instead of using ``HuggingFace`` and ``skops``, someone might have given you a model
+# in a different way: perhaps by e-mail or a download. If you have the model in a
+# folder, you can apply it in a very similar way:
+#
+# .. code-block::
+#
+#    labels = sc.auto_label_units(
+#        sorting_analyzer = sorting_analyzer,
+#        model_folder = "path/to/model/folder",
+#    )
 
 ##############################################################################
 # Using this, you lose the advantages of the model metadata: the quality metric parameters
diff --git a/examples/tutorials/curation/plot_2_train_a_model.py b/examples/tutorials/curation/plot_2_train_a_model.py
index 9f7189b8c0..3521c8b8c5 100644
--- a/examples/tutorials/curation/plot_2_train_a_model.py
+++ b/examples/tutorials/curation/plot_2_train_a_model.py
@@ -58,7 +58,7 @@
 ##############################################################################
 # This is as expected: great! (Find out more about plotting using widgets `here `_.)
-# We've set out system up so that the first five units are 'good' and the next five are 'bad'.
+# We've set up our system so that the first five units are 'good' and the next five are 'bad'.
 # So we can make a list of labels which contain this information. For real data, you could
 # use a manual curation tool to make your own list.
 
@@ -72,13 +72,13 @@
 # of the units, and then be applied to units from other sortings. The properties we use are the
 # `quality metrics `_
 # and `template metrics `_.
-# Hence we need to compute these, using some `sorting_analyzer` extensions.
+# Hence we need to compute these, using some ``sorting_analyzer`` extensions.
 
 analyzer.compute(['spike_locations','spike_amplitudes','correlograms','principal_components','quality_metrics','template_metrics'])
 
 ##############################################################################
 # Now that we have metrics and labels, we're ready to train the model using the
-# `train_model` function. The trainer will try several classifiers, imputation strategies and
+# ``train_model`` function. The trainer will try several classifiers, imputation strategies and
 # scaling techniques then save the most accurate. To save time in this tutorial,
 # we'll only try one classifier (Random Forest), imputation strategy (median) and scaling
 # technique (standard scaler).
@@ -109,7 +109,7 @@
 # `imputation strategies `_ and
 # `scalers `_, although the
 # documentation is quite overwhelming. You can find the classifiers we've tried out
-# using the `sc.get_default_classifier_search_spaces` function.
+# using the ``sc.get_default_classifier_search_spaces`` function.
 #
 # The above code saves the model in ``model.skops``, some metadata in
 # ``model_info.json`` and the model accuracies in ``model_accuracies.csv``
@@ -156,10 +156,10 @@
 ##############################################################################
 # Roughly, this means the model is using metrics such as "nn_hit_rate" and "l_ratio"
-# but is using "sync_spike_4" and "rp_contanimation". This is a toy model, so don't
-# take these results seriously! But using this information, you could retrain another,
+# but is not using "sync_spike_4" and "rp_contamination". This is a toy model, so don't
+# take these results seriously. But using this information, you could retrain another,
 # simpler model using a subset of the metrics, by passing, e.g.,
-# `metric_names = ['nn_hit_rate', 'l_ratio',...]` to the `train_model` function.
+# ``metric_names = ['nn_hit_rate', 'l_ratio',...]`` to the ``train_model`` function.
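+#
+# For instance, here is a minimal sketch of that retraining call. It assumes the ``labels``
+# list and ``analyzer`` from earlier in this tutorial, and that ``train_model`` takes
+# ``labels``, ``analyzers`` and ``folder`` arguments as in the call above (the argument
+# names and the folder name here are assumptions):
+#
+# .. code-block::
+#
+#    simpler_trainer = sc.train_model(
+#        labels = [labels],
+#        analyzers = [analyzer],
+#        folder = "my_simpler_model_folder",
+#        metric_names = ['nn_hit_rate', 'l_ratio'],
+#    )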
 #
 # Now that you have a model, you can `apply it to another sorting
 # `_
diff --git a/examples/tutorials/curation/plot_3_upload_a_model.py b/examples/tutorials/curation/plot_3_upload_a_model.py
index 739e87d7c1..0a9ea402db 100644
--- a/examples/tutorials/curation/plot_3_upload_a_model.py
+++ b/examples/tutorials/curation/plot_3_upload_a_model.py
@@ -34,7 +34,7 @@
 # SpikeInterface and HFH don't require you to keep this folder structure, we just advise it as
 # best practice.
 #
-# If you've used SpikeInterface to train your model, the `train_model` function auto-generates
+# If you've used SpikeInterface to train your model, the ``train_model`` function auto-generates
 # most of this data. The only thing missing is the ``metadata.json`` file. The purpose of this
 # file is to detail how the model was trained, which can help prospective users decide if it
 # is relevant for them. For example, taking
@@ -65,8 +65,7 @@
 # Upload to HuggingFaceHub
 # ------------------------
 #
-# We'll now upload this folder to HFH using the web interface. (If you don't want to
-# use HFH, you could just share this folder with a colleague.)
+# We'll now upload this folder to HFH using the web interface.
 #
 # First, go to https://huggingface.co/ and make an account. Once you've logged in, press
 # ``+`` then ``New model`` or find ``+ New Model`` in the user menu. You will be asked
@@ -128,7 +127,7 @@
 #
 #    labels = auto_label_units(
 #        sorting_analyzer = sorting_analyzer,
-#        model_folder = "SpikeInterface/a_folder_for_a_model",
+#        model_folder = "path/to/a_folder_for_a_model",
 #        trusted = ['numpy.dtype']
 #    )
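+#
+# Once the model is up on HFH, anyone (including you, from another machine) can apply it
+# straight from the repo, just as in the first tutorial. A sketch, with a hypothetical
+# repo id (substitute your own username and model name):
+#
+# .. code-block::
+#
+#    labels = auto_label_units(
+#        sorting_analyzer = sorting_analyzer,
+#        repo_id = "my_hf_username/my_model",
+#        trusted = ['numpy.dtype']
+#    )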