From 0778208d111a3fe9ad57bd8fc5f5b20bba53e9ad Mon Sep 17 00:00:00 2001
From: Alessio Buccino
Date: Sat, 28 Sep 2024 13:01:23 +0200
Subject: [PATCH 1/4] Add script to upload np-ultra templates and fix ibl script to skip units with too few spikes

---
 python/delete_templates.py                    |  80 ++++++++++++-
 ...d_templates.py => upload_ibl_templates.py} |  28 ++++-
 python/upload_npultra_templates.py            | 107 ++++++++++++++++++
 3 files changed, 205 insertions(+), 10 deletions(-)
 rename python/{upload_templates.py => upload_ibl_templates.py} (93%)
 create mode 100644 python/upload_npultra_templates.py

diff --git a/python/delete_templates.py b/python/delete_templates.py
index 78c6a07..a609fc3 100644
--- a/python/delete_templates.py
+++ b/python/delete_templates.py
@@ -1,4 +1,7 @@
 import boto3
+import numpy as np
+import zarr
+
 from consolidate_datasets import list_zarr_directories
@@ -31,7 +34,72 @@ def delete_templates_from_s3(
         delete_template_from_s3(bucket_name, key, boto_client=boto_client)
 
 
-if __name__ == "__main__":
+def delete_templates_too_few_spikes(min_spikes=50, dry_run=False, verbose=True):
+    """
+    This function will delete templates associated with spike trains with too few spikes.
+
+    The initial database was in fact created without a minimum number of spikes per unit,
+    so some units have very few spikes and possibly a noisy template.
+    """
+    import spikeinterface.generation as sgen
+
+    templates_info = sgen.fetch_templates_database_info()
+
+    templates_to_remove = templates_info.query(f"spikes_per_unit < {min_spikes}")
+
+    if len(templates_to_remove) > 0:
+        if verbose:
+            print(f"Removing {len(templates_to_remove)}/{len(templates_info)} templates with less than {min_spikes} spikes")
+        datasets = np.unique(templates_to_remove["dataset"])
+
+        for d_i, dataset in enumerate(datasets):
+            if verbose:
+                print(f"\tCleaning dataset {d_i + 1}/{len(datasets)}")
+            templates_in_dataset = templates_to_remove.query(f"dataset == '{dataset}'")
+            template_indices_to_remove = templates_in_dataset.template_index.values
+            s3_path = templates_in_dataset.dataset_path.values[0]
+
+            # to filter from the zarr dataset:
+            datasets_to_filter = [
+                "templates_array",
+                "best_channel_index",
+                "spikes_per_unit",
+                "brain_area",
+                "peak_to_peak",
+                "channel_noise_levels",
+                "unit_ids",
+            ]
+
+            # open zarr in append mode
+            if dry_run:
+                mode = "r"
+            else:
+                mode = "r+"
+            zarr_root = zarr.open(s3_path, mode=mode)
+            all_unit_indices = np.arange(len(zarr_root["unit_ids"]))
+            n_original_units = len(all_unit_indices)
+            unit_indices_to_keep = np.delete(all_unit_indices, template_indices_to_remove)
+            n_units_to_keep = len(unit_indices_to_keep)
+            if verbose:
+                print(f"\tRemoving {n_original_units - n_units_to_keep} templates from {n_original_units}")
+            for dset in datasets_to_filter:
+                dataset_original = zarr_root[dset]
+                dataset_filtered = dataset_original[unit_indices_to_keep]
+                if not dry_run:
+                    if verbose:
+                        print(f"\t\tUpdating: {dset} - shape: {dataset_filtered.shape}")
+                    zarr_root[dset] = dataset_filtered
+                else:
+                    if verbose:
+                        print(f"\t\tDry run: {dset} - shape: {dataset_filtered.shape}")
+            if not dry_run:
+                zarr.consolidate_metadata(zarr_root.store)
+
+
+def delete_templates_with_num_samples(dry_run=False):
+    """
+    This function will delete templates with a wrong number of samples,
+    which were not corrected for in the initial database.
+    """
     bucket = "spikeinterface-template-database"
     boto_client = boto3.client("s3")
     verbose = True
@@ -51,6 +119,10 @@ def delete_templates_from_s3(
     ]
     existing_templates = list_zarr_directories(bucket, boto_client=boto_client)
     templates_to_erase_from_bucket = [template for template in templates_to_erase_from_bucket if template in existing_templates]
-    if verbose:
-        print(f"Erasing {len(templates_to_erase_from_bucket)} templates from bucket: {bucket}")
-    delete_templates_from_s3(bucket, templates_to_erase_from_bucket, boto_client=boto_client)
+    if dry_run:
+        if verbose:
+            print(f"Would erase {len(templates_to_erase_from_bucket)} templates from bucket: {bucket}")
+    else:
+        if verbose:
+            print(f"Erasing {len(templates_to_erase_from_bucket)} templates from bucket: {bucket}")
+        delete_templates_from_s3(bucket, templates_to_erase_from_bucket, boto_client=boto_client)
diff --git a/python/upload_templates.py b/python/upload_ibl_templates.py
similarity index 93%
rename from python/upload_templates.py
rename to python/upload_ibl_templates.py
index 5f4f5d7..34083d5 100644
--- a/python/upload_templates.py
+++ b/python/upload_ibl_templates.py
@@ -1,6 +1,21 @@
+"""
+This script constructs and uploads the templates from the International Brain Laboratory (IBL) datasets
+available from DANDI (https://dandiarchive.org/dandiset/000409?search=IBL&pos=3).
+The templates are constructed using the spikeinterface package and are saved to a
+Zarr file. The Zarr file is then uploaded to an S3 bucket hosted by CatalystNeuro for storage and sharing.
+
+The s3 bucket "spikeinterface-template-database" is used by the SpikeInterface hybrid framework to construct hybrid
+recordings.
+"""
 from pathlib import Path
 
 import numpy as np
+import s3fs
+import zarr
+import time
+import os
+import numcodecs
+
 from dandi.dandiapi import DandiAPIClient
 
 from spikeinterface.extractors import (
@@ -16,13 +31,7 @@
     highpass_filter,
 )
 
-import s3fs
-import zarr
-import numcodecs
-
 from one.api import ONE
-import time
-import os
 
 from consolidate_datasets import list_zarr_directories
@@ -55,6 +64,7 @@ def find_channels_with_max_peak_to_peak_vectorized(templates):
 
 # Parameters
 minutes_by_the_end = 30  # How many minutes in the end of the recording to use for templates
+min_spikes_per_unit = 50
 upload_data = True
 overwite = False
 verbose = True
@@ -174,6 +184,10 @@ def find_channels_with_max_peak_to_peak_vectorized(templates):
 
     sorting_end = sorting.frame_slice(start_frame=start_frame_sorting, end_frame=end_frame_sorting)
 
+    spikes_per_unit = sorting_end.count_num_spikes_per_unit(outputs="array")
+    unit_indices_to_keep = np.where(spikes_per_unit >= min_spikes_per_unit)[0]
+    sorting_end = sorting_end.select_units(sorting_end.unit_ids[unit_indices_to_keep])
+
     # NWB Streaming is not working well with parallel pre-processing, so we save a local copy
     folder_path = Path.cwd() / "build" / "local_copy"
     folder_path.mkdir(exist_ok=True, parents=True)
@@ -263,6 +277,8 @@ def find_channels_with_max_peak_to_peak_vectorized(templates):
     expected_shape = (number_of_units, number_of_temporal_samples, number_of_channels)
     assert templates_extension_data.templates_array.shape == expected_shape
 
+    # TODO: skip templates with 0 amplitude!
+    # TODO: check for weird shapes
     templates_extension = analyzer.get_extension("templates")
     templates_object = templates_extension.get_data(outputs="Templates")
     unit_ids = templates_object.unit_ids
diff --git a/python/upload_npultra_templates.py b/python/upload_npultra_templates.py
new file mode 100644
index 0000000..5b19597
--- /dev/null
+++ b/python/upload_npultra_templates.py
@@ -0,0 +1,107 @@
+"""
+This script constructs and uploads the templates from the Neuropixels datasets
+available from DANDI. The templates are constructed using the spikeinterface package and are saved to a
+Zarr file. The Zarr file is then uploaded to an S3 bucket hosted by CatalystNeuro for storage and sharing.
+
+The s3 bucket "spikeinterface-template-database" is used by the SpikeInterface hybrid framework to construct hybrid
+recordings.
+"""
+from pathlib import Path
+
+import numpy as np
+import s3fs
+import zarr
+import pandas as pd
+import os
+import numcodecs
+import probeinterface as pi
+import spikeinterface as si
+
+min_spikes_per_unit = 50
+upload_data = False
+
+npultra_templates_path = Path("/home/alessio/Documents/Data/Templates/NPUltraWaveforms/")
+dataset_name = "steinmetz_ye_np_ultra_2022_figshare19493588v2.zarr"
+
+# AWS credentials
+aws_access_key_id = os.environ.get("AWS_ACCESS_KEY_ID")
+aws_secret_access_key = os.environ.get("AWS_SECRET_ACCESS_KEY")
+bucket_name = "spikeinterface-template-database"
+client_kwargs = {"region_name": "us-east-2"}
+
+# Load the templates and the required metadata
+xpos = np.load(npultra_templates_path / "channels.xcoords.npy")
+ypos = np.load(npultra_templates_path / "channels.ycoords.npy")
+
+channel_locations = np.squeeze([xpos, ypos]).T
+
+templates_array = np.load(npultra_templates_path / "clusters.waveforms.npy")
+spike_clusters = np.load(npultra_templates_path / "spikes.clusters.npy")
+
+brain_area = pd.read_csv(npultra_templates_path / "clusters.acronym.tsv", sep="\t")
+brain_area_acronym = brain_area["acronym"].values
+
+# Instantiate Probe
+probe = pi.Probe(ndim=2)
+probe.set_contacts(positions=channel_locations, shapes="square", shape_params={"width": 5})
+probe.model_name = "Neuropixels Ultra"
+probe.manufacturer = "IMEC"
+
+# Unit ids and properties
+unit_ids, spikes_per_unit = np.unique(spike_clusters, return_counts=True)
+unit_ids_enough_spikes = spikes_per_unit >= min_spikes_per_unit
+unit_ids = unit_ids[unit_ids_enough_spikes]
+spikes_per_unit = spikes_per_unit[unit_ids_enough_spikes]
+
+# sort the units by unit_id
+sort_unit_indices = np.argsort(unit_ids)
+unit_itd = unit_ids[sort_unit_indices]
+spikes_per_unit = spikes_per_unit[sort_unit_indices]
+brain_area_acronym = brain_area_acronym[sort_unit_indices]
+
+# Create Templates object
+nbefore = 40
+sampling_frequency = 30000
+
+templates_ultra = si.Templates(
+    templates_array=templates_array,
+    sampling_frequency=sampling_frequency,
+    nbefore=nbefore,
+    unit_ids=unit_ids,
+    probe=probe,
+    is_scaled=True
+)
+
+best_channel_index = si.get_template_extremum_channel(templates_ultra, mode="peak_to_peak", outputs="index")
+best_channel_index = list(best_channel_index.values())
+
+if upload_data:
+    # Create a S3 file system object with explicit credentials
+    s3_kwargs = dict(anon=False, key=aws_access_key_id, secret=aws_secret_access_key, client_kwargs=client_kwargs)
+    s3 = s3fs.S3FileSystem(**s3_kwargs)
+
+    # Specify the S3 bucket and path
+    s3_path = f"{bucket_name}/{dataset_name}"
+    store = s3fs.S3Map(root=s3_path, s3=s3)
+else:
+    folder_path = Path.cwd() / "build" / f"{dataset_name}"
+    folder_path.mkdir(exist_ok=True, parents=True)
+    store = zarr.DirectoryStore(str(folder_path))
+
+# Save results to Zarr
+zarr_group = zarr.group(store=store, overwrite=True)
+zarr_group.create_dataset(name="brain_area", data=brain_area, object_codec=numcodecs.VLenUTF8())
+zarr_group.create_dataset(name="spikes_per_unit", data=spikes_per_unit, chunks=None, dtype="uint32")
+zarr_group.create_dataset(
+    name="best_channel_index",
+    data=best_channel_index,
+    chunks=None,
+    dtype="uint32",
+)
+peak_to_peak = np.ptp(templates_array, axis=1)
+zarr_group.create_dataset(name="peak_to_peak", data=peak_to_peak)
+
+# Now you can create a Zarr array using this store
+templates_ultra.add_templates_to_zarr_group(zarr_group=zarr_group)
+zarr_group_s3 = zarr_group
+zarr.consolidate_metadata(zarr_group_s3.store)

From 670cc2ab6ee4157e111a71140356b990dcd4f308 Mon Sep 17 00:00:00 2001
From: Alessio Buccino
Date: Sat, 28 Sep 2024 13:03:55 +0200
Subject: [PATCH 2/4] Update np-ultra readme

---
 python/upload_npultra_templates.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/python/upload_npultra_templates.py b/python/upload_npultra_templates.py
index 5b19597..89ab041 100644
--- a/python/upload_npultra_templates.py
+++ b/python/upload_npultra_templates.py
@@ -1,6 +1,7 @@
 """
-This script constructs and uploads the templates from the Neuropixels datasets
-available from DANDI. The templates are constructed using the spikeinterface package and are saved to a
+This script constructs and uploads the templates from the Neuropixels Ultra dataset
+from Steinmetz and Ye, 2022. The dataset is hosted on Figshare at https://doi.org/10.6084/m9.figshare.19493588.v2
+The templates and relevant metadata are packaged into a `spikeinterface.Templates` object and saved to a
 Zarr file. The Zarr file is then uploaded to an S3 bucket hosted by CatalystNeuro for storage and sharing.
 
 The s3 bucket "spikeinterface-template-database" is used by the SpikeInterface hybrid framework to construct hybrid
 recordings.

From e629808fb7d6ef78868a98fb528aebdd23722c42 Mon Sep 17 00:00:00 2001
From: Alessio Buccino
Date: Sat, 28 Sep 2024 16:13:33 +0200
Subject: [PATCH 3/4] Upload README and added smoothing for NP-Ultra templates

---
 README.md                          | 51 +++++++++++++++++++---
 pyproject.toml                     |  1 +
 python/consolidate_datasets.py     |  8 +++-
 python/delete_templates.py         | 26 ++++++++----
 python/upload_ibl_templates.py     | 17 +++++---
 python/upload_npultra_templates.py | 68 ++++++++++++++++++++++++++----
 6 files changed, 141 insertions(+), 30 deletions(-)

diff --git a/README.md b/README.md
index e51a05c..1a9b1db 100644
--- a/README.md
+++ b/README.md
@@ -1,10 +1,49 @@
-# hybrid_template_library
-Library of templates to create hybrid data for spike sorting benchmarks
+# SpikeInterface Hybrid Template Library
 
-[click here for access to the library](https://spikeinterface.github.io/hybrid_template_library/)
+This repo contains a set of tools to construct and interact with a library of hybrid templates for spike sorting benchmarks.
 
-## Testing locally
+The library is made of several datasets stored as zarr files and can be accessed through the spikeinterface library.
+The library is also accessible through a web-app that allows users to browse the templates and download them for
+use in their spike sorting benchmarks.
+
+
+## Template sources
+
+The following datasets are available in the library:
+
+- [IBL](https://dandiarchive.org/dandiset/000409?search=IBL&pos=3): Neuropixels 1.0 templates from the IBL Brain Wide Map dataset
+- [Steinmetz and Ye, 2022](https://doi.org/10.6084/m9.figshare.19493588.v2): Neuropixels Ultra templates from Steinmetz and Ye, 2022
+
+The templates have been processed with the `python` scripts in the `python/scripts` folder and are stored in `zarr`
+format in the `s3://spikeinterface-template-database` bucket hosted on `AWS S3` by [CatalystNeuro](https://www.catalystneuro.com/).
+
+
+## Accessing the data through `SpikeInterface`
+
+The library can be accessed through the `spikeinterface` library using the `generation` module.
+The following code shows how to access the library to fetch a dataframe with the available templates
+and download the templates corresponding to a specific user query:
+
+```python
+import spikeinterface.generation as sgen
+
+templates_info = sgen.fetch_templates_database_info()
+
+# select templates with amplitude between 200 and 250uV
+templates_info_selected = templates_info.query('amplitude_uv > 200 and amplitude_uv < 250')
+templates_selected = sgen.query_templates_from_database(templates_info_selected)
+```
+
+For a more comprehensive example on how to construct hybrid recordings from the template library and run spike sorting
+benchmarks, please refer to the SpikeInterface tutorial on [Hybrid recordings](https://spikeinterface.readthedocs.io/en/latest/how_to/benchmark_with_hybrid_recordings.html).
+
+## Live Web-App
+
+The template library can be browsed through a web-app (source code included in this repo). The web-app is hosted on github pages and can be accessed through the following link: [https://spikeinterface.github.io/hybrid_template_library/](https://spikeinterface.github.io/hybrid_template_library/)
+
+
+### Testing locally
 
 How to run a python server for testing zarr access
@@ -38,10 +77,10 @@ python -c "from http.server import HTTPServer, SimpleHTTPRequestHandler; import
 ```
 
-Then you run the npm script to start the server and open the browser
+Then you run the `npm` script to start the server and open the browser
 
 ```bash
-export TEST_URL="http://localhost:8000/zarr_store.zarr"
+export TEST_URL="http://localhost:8000/test_zarr.zarr"
 npm run start
 ```
diff --git a/pyproject.toml b/pyproject.toml
index f127e31..4194441 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -21,6 +21,7 @@ classifiers = [
 
 dependencies = [
     "spikeinterface >= 0.101",
+    "MEArec",
     "tqdm",
     "pynwb>=2.8",
     "remfile==0.1",
diff --git a/python/consolidate_datasets.py b/python/consolidate_datasets.py
index 54a30fa..098bfb1 100644
--- a/python/consolidate_datasets.py
+++ b/python/consolidate_datasets.py
@@ -101,8 +101,12 @@ def consolidate_datasets(dry_run: bool = False, verbose: bool = False):
         depth_best_channel = channel_depths[best_channel_indices]
         peak_to_peak_best_channel = zarr_group["peak_to_peak"][template_indices, best_channel_indices]
-        noise_best_channel = zarr_group["channel_noise_levels"][best_channel_indices]
-        signal_to_noise_ratio_best_channel = peak_to_peak_best_channel / noise_best_channel
+        if "channel_noise_levels" not in zarr_group:
+            noise_best_channel = np.nan * np.zeros(num_units)
+            signal_to_noise_ratio_best_channel = np.nan * np.zeros(num_units)
+        else:
+            noise_best_channel = zarr_group["channel_noise_levels"][best_channel_indices]
+            signal_to_noise_ratio_best_channel = peak_to_peak_best_channel / noise_best_channel
 
         new_entry = pd.DataFrame(
             {
diff --git a/python/delete_templates.py b/python/delete_templates.py
index a609fc3..1fb31ee 100644
--- a/python/delete_templates.py
+++ b/python/delete_templates.py
@@ -38,7 +38,7 @@ def delete_templates_too_few_spikes(min_spikes=50, dry_run=False, verbose=True):
     """
     This function will delete templates associated with spike trains with too few spikes.
 
-    The initial database was in fact created without a minimum number of spikes per unit, 
+    The initial database was in fact created without a minimum number of spikes per unit,
     so some units have very few spikes and possibly a noisy template.
     """
     import spikeinterface.generation as sgen
@@ -49,7 +49,9 @@ def delete_templates_too_few_spikes(min_spikes=50, dry_run=False, verbose=True):
 
     if len(templates_to_remove) > 0:
         if verbose:
-            print(f"Removing {len(templates_to_remove)}/{len(templates_info)} templates with less than {min_spikes} spikes")
+            print(
+                f"Removing {len(templates_to_remove)}/{len(templates_info)} templates with less than {min_spikes} spikes"
+            )
         datasets = np.unique(templates_to_remove["dataset"])
 
         for d_i, dataset in enumerate(datasets):
@@ -59,7 +61,7 @@ def delete_templates_too_few_spikes(min_spikes=50, dry_run=False, verbose=True):
             templates_in_dataset = templates_to_remove.query(f"dataset == '{dataset}'")
             template_indices_to_remove = templates_in_dataset.template_index.values
             s3_path = templates_in_dataset.dataset_path.values[0]
 
-            # to filter from the zarr dataset: 
+            # to filter from the zarr dataset:
             datasets_to_filter = [
                 "templates_array",
                 "best_channel_index",
@@ -84,26 +86,32 @@ def delete_templates_too_few_spikes(min_spikes=50, dry_run=False, verbose=True):
                 print(f"\tRemoving {n_original_units - n_units_to_keep} templates from {n_original_units}")
             for dset in datasets_to_filter:
                 dataset_original = zarr_root[dset]
+                if len(dataset_original) == n_units_to_keep:
+                    print(f"\t\tDataset: {dset} - shape: {dataset_original.shape} - already updated")
+                    continue
                 dataset_filtered = dataset_original[unit_indices_to_keep]
                 if not dry_run:
                     if verbose:
                         print(f"\t\tUpdating: {dset} - shape: {dataset_filtered.shape}")
+                    if dataset_filtered.dtype.kind == "O":
+                        dataset_filtered = dataset_filtered.astype(str)
                     zarr_root[dset] = dataset_filtered
                 else:
                     if verbose:
                         print(f"\t\tDry run: {dset} - shape: {dataset_filtered.shape}")
             if not dry_run:
                 zarr.consolidate_metadata(zarr_root.store)
-
+
+
 def delete_templates_with_num_samples(dry_run=False):
     """
-    This function will delete templates with a wrong number of samples, 
+    This function will delete templates with a wrong number of samples,
     which were not corrected for in the initial database.
     """
     bucket = "spikeinterface-template-database"
     boto_client = boto3.client("s3")
-    verbose = True 
-    
+    verbose = True
+
     templates_to_erase_from_bucket = [
         "000409_sub-KS084_ses-1b715600-0cbc-442c-bd00-5b0ac2865de1_behavior+ecephys+image_bbe6ebc1-d32f-42dd-a89c-211226737deb.zarr",
         "000409_sub-KS086_ses-e45481fa-be22-4365-972c-e7404ed8ab5a_behavior+ecephys+image_f2a098e7-a67e-4125-92d8-36fc6b606c45.zarr",
@@ -118,7 +126,9 @@ def delete_templates_too_few_spikes(min_spikes=50, dry_run=False, verbose=True):
         "000409_sub-KS096_ses-f819d499-8bf7-4da0-a431-15377a8319d5_behavior+ecephys+image_4ea45238-55b1-4d54-ba92-efa47feb9f57.zarr",
     ]
     existing_templates = list_zarr_directories(bucket, boto_client=boto_client)
-    templates_to_erase_from_bucket = [template for template in templates_to_erase_from_bucket if template in existing_templates]
+    templates_to_erase_from_bucket = [
+        template for template in templates_to_erase_from_bucket if template in existing_templates
+    ]
     if dry_run:
         if verbose:
             print(f"Would erase {len(templates_to_erase_from_bucket)} templates from bucket: {bucket}")
diff --git a/python/upload_ibl_templates.py b/python/upload_ibl_templates.py
index 34083d5..a3477a4 100644
--- a/python/upload_ibl_templates.py
+++ b/python/upload_ibl_templates.py
@@ -1,12 +1,17 @@
 """
 This script constructs and uploads the templates from the International Brain Laboratory (IBL) datasets
 available from DANDI (https://dandiarchive.org/dandiset/000409?search=IBL&pos=3).
-The templates are constructed using the spikeinterface package and are saved to a
-Zarr file. The Zarr file is then uploaded to an S3 bucket hosted by CatalystNeuro for storage and sharing.
-
-The s3 bucket "spikeinterface-template-database" is used by the SpikeInterface hybrid framework to construct hybrid
-recordings.
+Templates are extracted by combining the raw data from the NWB files on DANDI with the spike trains from
+the Alyx ONE database. Only the units that passed the IBL quality control are used.
+To minimize the amount of drift in the templates, only the last 30 minutes of the recording are used.
+The raw recordings are pre-processed with a high-pass filter and a common median reference prior to
+template extraction. Units with less than 50 spikes are excluded from the template database.
+
+Once the templates are constructed they are saved to a Zarr file which is then uploaded to
+the "spikeinterface-template-database" bucket (hosted by CatalystNeuro).
 """
+
 from pathlib import Path
 
 import numpy as np
@@ -294,7 +299,9 @@ def find_channels_with_max_peak_to_peak_vectorized(templates):
 
     if upload_data:
         # Create a S3 file system object with explicit credentials
-        s3_kwargs = dict(anon=False, key=aws_access_key_id, secret=aws_secret_access_key, client_kwargs=client_kwargs)
+        s3_kwargs = dict(
+            anon=False, key=aws_access_key_id, secret=aws_secret_access_key, client_kwargs=client_kwargs
+        )
         s3 = s3fs.S3FileSystem(**s3_kwargs)
 
         # Specify the S3 bucket and path
diff --git a/python/upload_npultra_templates.py b/python/upload_npultra_templates.py
index 89ab041..e642e09 100644
--- a/python/upload_npultra_templates.py
+++ b/python/upload_npultra_templates.py
@@ -1,12 +1,15 @@
 """
 This script constructs and uploads the templates from the Neuropixels Ultra dataset
 from Steinmetz and Ye, 2022. The dataset is hosted on Figshare at https://doi.org/10.6084/m9.figshare.19493588.v2
-The templates and relevant metadata are packaged into a `spikeinterface.Templates` object and saved to a
-Zarr file. The Zarr file is then uploaded to an S3 bucket hosted by CatalystNeuro for storage and sharing.
-The s3 bucket "spikeinterface-template-database" is used by the SpikeInterface hybrid framework to construct hybrid
-recordings.
+Since the templates in the dataset have rather short cutouts, which might negatively interfere with
+hybrid spike injections, the templates are padded and smoothed using the `MEArec` package so that they
+end up having 240 samples (90 before, 150 after the peak).
+
+Once the templates are constructed they are saved to a Zarr file which is then uploaded to
+the "spikeinterface-template-database" bucket (hosted by CatalystNeuro).
 """
+
 from pathlib import Path
 
 import numpy as np
@@ -18,7 +21,27 @@
 import probeinterface as pi
 import spikeinterface as si
 
+from MEArec.tools import pad_templates, sigmoid
+
+
+def smooth_edges(templates, pad_samples, smooth_percent=0.5, smooth_strength=1):
+    # smooth edges
+    sigmoid_samples = int(smooth_percent * pad_samples[0]) // 2 * 2
+    sigmoid_x = np.arange(-sigmoid_samples // 2, sigmoid_samples // 2)
+    b = smooth_strength
+    sig = sigmoid(sigmoid_x, b) + 0.5
+    window = np.ones(templates.shape[-1])
+    window[:sigmoid_samples] = sig
+    window[-sigmoid_samples:] = sig[::-1]
+
+    templates *= window
+    return templates
+
+
+# parameters
 min_spikes_per_unit = 50
+target_nbefore = 90
+target_nafter = 150
 upload_data = False
 
 npultra_templates_path = Path("/home/alessio/Documents/Data/Templates/NPUltraWaveforms/")
 dataset_name = "steinmetz_ye_np_ultra_2022_figshare19493588v2.zarr"
@@ -54,23 +77,50 @@
 unit_ids = unit_ids[unit_ids_enough_spikes]
 spikes_per_unit = spikes_per_unit[unit_ids_enough_spikes]
 
-# sort the units by unit_id
+# Sort the units by unit_id
 sort_unit_indices = np.argsort(unit_ids)
-unit_itd = unit_ids[sort_unit_indices]
+unit_ids = unit_ids[sort_unit_indices].astype(int)
 spikes_per_unit = spikes_per_unit[sort_unit_indices]
 brain_area_acronym = brain_area_acronym[sort_unit_indices]
 
-# Create Templates object
+# Process the templates to make them smooth
 nbefore = 40
 sampling_frequency = 30000
+num_samples = templates_array.shape[1]
+nafter = num_samples - nbefore
+
+pad_samples = [target_nbefore - nbefore, target_nafter - nafter]
+
+# MEArec needs swap axes
+print("Padding and smoothing templates")
+templates_array_swap = templates_array.swapaxes(1, 2)
+tmp_templates_file = "templates_padded.raw"
+templates_padded_swap = pad_templates(
+    templates_array_swap,
+    pad_samples,
+    drifting=False,
+    dtype="float",
+    verbose=False,
+    n_jobs=-1,
+    tmp_file=tmp_templates_file,
+    parallel=True,
+)
+templates_padded = templates_padded_swap.swapaxes(1, 2)
+Path(tmp_templates_file).unlink()
 
+# smooth edges
+templates_smoothed_swap = smooth_edges(templates_padded_swap, pad_samples)
+templates_smoothed = templates_smoothed_swap.swapaxes(1, 2)
+
+# Create Templates object
+print("Creating Templates object")
 templates_ultra = si.Templates(
-    templates_array=templates_array,
+    templates_array=templates_smoothed,
     sampling_frequency=sampling_frequency,
     nbefore=nbefore,
     unit_ids=unit_ids,
     probe=probe,
-    is_scaled=True
+    is_scaled=True,
 )
 
 best_channel_index = si.get_template_extremum_channel(templates_ultra, mode="peak_to_peak", outputs="index")

From 0b2a45676a2078a6a650a48242f66e0d10b1de5e Mon Sep 17 00:00:00 2001
From: Alessio Buccino
Date: Sat, 28 Sep 2024 16:51:01 +0200
Subject: [PATCH 4/4] Add restore ibl noise levels function

---
 python/delete_templates.py | 38 ++++++++++++++++++++++++++++++++++++--
 1 file changed, 36 insertions(+), 2 deletions(-)

diff --git a/python/delete_templates.py b/python/delete_templates.py
index 1fb31ee..f6890dd 100644
--- a/python/delete_templates.py
+++ b/python/delete_templates.py
@@ -44,7 +44,6 @@ def delete_templates_too_few_spikes(min_spikes=50, dry_run=False, verbose=True):
     import spikeinterface.generation as sgen
 
     templates_info = sgen.fetch_templates_database_info()
-
     templates_to_remove = templates_info.query(f"spikes_per_unit < {min_spikes}")
 
     if len(templates_to_remove) > 0:
@@ -68,7 +67,6 @@ def delete_templates_too_few_spikes(min_spikes=50, dry_run=False, verbose=True):
                 "spikes_per_unit",
                 "brain_area",
                 "peak_to_peak",
-                "channel_noise_levels",
                 "unit_ids",
             ]
 
@@ -103,6 +101,42 @@ def delete_templates_too_few_spikes(min_spikes=50, dry_run=False, verbose=True):
             zarr.consolidate_metadata(zarr_root.store)
 
 
+def restore_noise_levels_ibl(datasets, one=None, dry_run=False, verbose=True):
+    """
+    This function will restore noise levels for IBL datasets.
+    """
+    import spikeinterface as si
+    import spikeinterface.extractors as se
+    import spikeinterface.generation as sgen
+
+    for dataset in datasets:
+        if verbose:
+            print(f"Processing dataset: {dataset}")
+        pid = dataset.split("_")[-1][:-5]
+        dataset_path = f"s3://spikeinterface-template-database/{dataset}"
+        recording = se.read_ibl_recording(pid=pid, load_sync_channel=False, stream_type="ap", one=one)
+
+        default_params = si.get_default_analyzer_extension_params("noise_levels")
+        if verbose:
+            print(f"\tComputing noise levels")
+        noise_levels = si.get_noise_levels(recording, return_scaled=True, **default_params)
+
+        if dry_run:
+            mode = "r"
+        else:
+            mode = "r+"
+        zarr_root = zarr.open(dataset_path, mode=mode)
+        if not dry_run:
+            if verbose:
+                print(f"\tRestoring noise levels")
+            zarr_root["channel_noise_levels"] = noise_levels
+            zarr.consolidate_metadata(zarr_root.store)
+        else:
+            if verbose:
+                print(f"\tCurrent shape: {zarr_root['channel_noise_levels'].shape}")
+                print(f"\tDry run: would restore noise levels: {noise_levels.shape}")
+
+
 def delete_templates_with_num_samples(dry_run=False):
     """
     This function will delete templates with a wrong number of samples,