Commit

Merge branch 'main' into merging_units
yger authored Jun 26, 2024
2 parents 4338fe3 + 99cc04e commit 3bf14bc
Showing 10 changed files with 222 additions and 79 deletions.
7 changes: 6 additions & 1 deletion .github/run_tests.sh
@@ -1,8 +1,13 @@
#!/bin/bash

MARKER=$1
NOVIRTUALENV=$2

# Activate the cached virtual environment unless the second argument is --no-virtual-env
if [ -z "$NOVIRTUALENV" ] || [ "$NOVIRTUALENV" != "--no-virtual-env" ]; then
source $GITHUB_WORKSPACE/test_env/bin/activate
fi

source $GITHUB_WORKSPACE/test_env/bin/activate
pytest -m "$MARKER" -vv -ra --durations=0 --durations-min=0.001 | tee report.txt; test ${PIPESTATUS[0]} -eq 0 || exit 1
echo "# Timing profile of ${MARKER}" >> $GITHUB_STEP_SUMMARY
python $GITHUB_WORKSPACE/.github/build_job_summary.py report.txt >> $GITHUB_STEP_SUMMARY
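With this change the script activates the cached virtual environment by default and skips activation when its second argument is --no-virtual-env, so the same script serves both the venv-based CI jobs and the all-tests.yml workflow below, which installs its packages directly on the runner. Note also that piping pytest through tee would normally mask its exit status; the test ${PIPESTATUS[0]} check preserves the real pytest result.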
129 changes: 129 additions & 0 deletions .github/workflows/all-tests.yml
@@ -0,0 +1,129 @@
name: Complete tests

on:
workflow_dispatch:
schedule:
- cron: "0 12 * * 0" # Weekly on Sunday at noon UTC
pull_request:
types: [synchronize, opened, reopened]
branches:
- main

env:
KACHERY_CLOUD_CLIENT_ID: ${{ secrets.KACHERY_CLOUD_CLIENT_ID }}
KACHERY_CLOUD_PRIVATE_KEY: ${{ secrets.KACHERY_CLOUD_PRIVATE_KEY }}

concurrency: # Cancel previous workflows on the same pull request
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

jobs:
run:
name: ${{ matrix.os }} Python ${{ matrix.python-version }}
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
python-version: ["3.9", "3.12"] # Lower and higher versions we support
os: [macos-13, windows-latest, ubuntu-latest]
steps:
- uses: actions/checkout@v4
- name: Setup Python ${{ matrix.python-version }}
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
# cache: 'pip' # caching pip dependencies

- name: Get current hash (SHA) of the ephy_testing_data repo
id: repo_hash
run: |
echo "dataset_hash=$(git ls-remote https://gin.g-node.org/NeuralEnsemble/ephy_testing_data.git HEAD | cut -f1)"
echo "dataset_hash=$(git ls-remote https://gin.g-node.org/NeuralEnsemble/ephy_testing_data.git HEAD | cut -f1)" >> $GITHUB_OUTPUT
shell: bash
- name: Cache datasets
id: cache-datasets
uses: actions/cache/restore@v4
with:
path: ~/spikeinterface_datasets
key: ${{ runner.os }}-datasets-${{ steps.repo_hash.outputs.dataset_hash }}
restore-keys: ${{ runner.os }}-datasets

- name: Install packages
run: |
git config --global user.email "[email protected]"
git config --global user.name "CI Almighty"
pip install -e .[test,extractors,streaming_extractors,full]
pip install tabulate
shell: bash

- name: Install datalad
run: |
pip install datalad-installer
if [ ${{ runner.os }} = 'Linux' ]; then
datalad-installer --sudo ok git-annex --method datalad/packages
elif [ ${{ runner.os }} = 'macOS' ]; then
datalad-installer --sudo ok git-annex --method brew
elif [ ${{ runner.os }} = 'Windows' ]; then
datalad-installer --sudo ok git-annex --method datalad/git-annex:release
fi
pip install datalad
git config --global filter.annex.process "git-annex filter-process" # recommended for efficiency
shell: bash

- name: Set execute permissions on run_tests.sh
run: chmod +x .github/run_tests.sh
shell: bash

- name: Test core
run: pytest -m "core"
shell: bash

- name: Test extractors
env:
HDF5_PLUGIN_PATH: ${{ github.workspace }}/hdf5_plugin_path_maxwell
run: pytest -m "extractors"
shell: bash

- name: Test preprocessing
run: ./.github/run_tests.sh "preprocessing and not deepinterpolation" --no-virtual-env
shell: bash

- name: Test postprocessing
run: ./.github/run_tests.sh postprocessing --no-virtual-env
shell: bash

- name: Test quality metrics
run: ./.github/run_tests.sh qualitymetrics --no-virtual-env
shell: bash

- name: Test comparison
run: ./.github/run_tests.sh comparison --no-virtual-env
shell: bash

- name: Test core sorters
run: ./.github/run_tests.sh sorters --no-virtual-env
shell: bash

- name: Test internal sorters
run: ./.github/run_tests.sh sorters_internal --no-virtual-env
shell: bash

- name: Test curation
run: ./.github/run_tests.sh curation --no-virtual-env
shell: bash

- name: Test widgets
run: ./.github/run_tests.sh widgets --no-virtual-env
shell: bash

- name: Test exporters
run: ./.github/run_tests.sh exporters --no-virtual-env
shell: bash

- name: Test sortingcomponents
run: ./.github/run_tests.sh sortingcomponents --no-virtual-env
shell: bash

- name: Test generation
run: ./.github/run_tests.sh generation --no-virtual-env
shell: bash
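
Two details here are easy to miss: the repo-hash step echoes the dataset SHA twice, once to the job log for visibility and once into GITHUB_OUTPUT so the cache step can key on it, and the workflow only restores the dataset cache (actions/cache/restore@v4, with restore-keys as a stale fallback); creating and saving the caches is handled by the cron job in the next file.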
68 changes: 25 additions & 43 deletions .github/workflows/caches_cron_job.yml
@@ -2,64 +2,35 @@ name: Create caches for gin ecephys data and virtual env

on:
workflow_dispatch:
push: # When someting is pushed into main this checks if caches need to re-created
push: # When something is pushed into main this checks if caches need to be re-created
branches:
- main
schedule:
- cron: "0 12 * * *" # Daily at noon UTC

jobs:



create-virtual-env-cache-if-missing:
name: Caching virtual env
runs-on: "ubuntu-latest"
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: '3.10'
- name: Get current year-month
id: date
run: |
echo "date=$(date +'%Y-%m')" >> $GITHUB_OUTPUT
- name: Get current dependencies hash
id: dependencies
run: |
echo "hash=${{hashFiles('**/pyproject.toml')}}" >> $GITHUB_OUTPUT
- uses: actions/cache@v4
id: cache-venv
with:
path: ${{ github.workspace }}/test_env
key: ${{ runner.os }}-venv-${{ steps.dependencies.outputs.hash }}-${{ steps.date.outputs.date }}
lookup-only: 'true' # Avoids downloading the data, saving behavior is not affected.
- name: Cache found?
run: echo "Cache-hit == ${{steps.cache-venv.outputs.cache-hit == 'true'}}"
- name: Create the virtual environment to be cached
if: steps.cache-venv.outputs.cache-hit != 'true'
uses: ./.github/actions/build-test-environment




create-gin-data-cache-if-missing:
name: Caching data env
runs-on: "ubuntu-latest"
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
os: [ubuntu-latest, macos-latest, windows-latest]
steps:
- uses: actions/setup-python@v5
with:
python-version: '3.10'
python-version: '3.11'
- name: Create the directory to store the data
run: |
mkdir --parents --verbose $HOME/spikeinterface_datasets/ephy_testing_data/
chmod -R 777 $HOME/spikeinterface_datasets
ls -l $HOME/spikeinterface_datasets
mkdir -p ~/spikeinterface_datasets/ephy_testing_data/
ls -l ~/spikeinterface_datasets
shell: bash
- name: Get current hash (SHA) of the ephy_testing_data repo
id: repo_hash
run: |
echo "dataset_hash=$(git ls-remote https://gin.g-node.org/NeuralEnsemble/ephy_testing_data.git HEAD | cut -f1)"
echo "dataset_hash=$(git ls-remote https://gin.g-node.org/NeuralEnsemble/ephy_testing_data.git HEAD | cut -f1)" >> $GITHUB_OUTPUT
shell: bash
- uses: actions/cache@v4
id: cache-datasets
with:
@@ -68,31 +39,42 @@ jobs:
lookup-only: 'true' # Avoids downloading the data, saving behavior is not affected.
- name: Cache found?
run: echo "Cache-hit == ${{steps.cache-datasets.outputs.cache-hit == 'true'}}"
shell: bash
- name: Installing datalad and git-annex
if: steps.cache-datasets.outputs.cache-hit != 'true'
run: |
git config --global user.email "[email protected]"
git config --global user.name "CI Almighty"
python -m pip install -U pip # Official recommended way
pip install datalad-installer
datalad-installer --sudo ok git-annex --method datalad/packages
if [ ${{ runner.os }} == 'Linux' ]; then
datalad-installer --sudo ok git-annex --method datalad/packages
elif [ ${{ runner.os }} == 'macOS' ]; then
datalad-installer --sudo ok git-annex --method brew
elif [ ${{ runner.os }} == 'Windows' ]; then
datalad-installer --sudo ok git-annex --method datalad/git-annex:release
fi
pip install datalad
git config --global filter.annex.process "git-annex filter-process" # recommended for efficiency
shell: bash
- name: Download dataset
if: steps.cache-datasets.outputs.cache-hit != 'true'
run: |
datalad install --recursive --get-data https://gin.g-node.org/NeuralEnsemble/ephy_testing_data
shell: bash
- name: Move the downloaded data to the right directory
if: steps.cache-datasets.outputs.cache-hit != 'true'
run: |
mv --force ./ephy_testing_data $HOME/spikeinterface_datasets/
mv ./ephy_testing_data ~/spikeinterface_datasets/
shell: bash
- name: Show size of the cache to assert data is downloaded
run: |
cd $HOME
cd ~
pwd
du -hs spikeinterface_datasets # Should show the size of ephy_testing_data
cd spikeinterface_datasets
pwd
ls -lh # Should show ephy_testing_data
cd ephy_testing_data
ls -lh
shell: bash
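
This cron job is the writing half of that scheme: it uses actions/cache@v4, which saves the cache when the lookup misses, keyed on the same gin dataset SHA, and it now runs across all three operating systems so each entry in the all-tests.yml matrix finds a per-OS cache to restore.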
7 changes: 3 additions & 4 deletions pyproject.toml
@@ -137,10 +137,9 @@ test = [

# for sortingview backend
"sortingview",

# recent datalad need a too recent version for git-annex
# so we use an old one here
"datalad==0.16.2",
# Download data
"pooch>=1.8.2",
"datalad>=1.0.2",

## install tridesclous for testing ##
"tridesclous>=1.6.8",
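The dependency change mirrors the new download strategy in datasets.py below: pooch (>=1.8.2) now performs the actual file downloads, so the datalad==0.16.2 pin, kept only for git-annex compatibility, can be relaxed to datalad>=1.0.2 for repository listing.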
56 changes: 40 additions & 16 deletions src/spikeinterface/core/datasets.py
@@ -14,56 +14,80 @@ def download_dataset(
remote_path: str = "mearec/mearec_test_10s.h5",
local_folder: Path | None = None,
update_if_exists: bool = False,
unlock: bool = False,
) -> Path:
"""
Function to download dataset from a remote repository using datalad.
Function to download dataset from a remote repository using a combination of datalad and pooch.
Pooch is designed to download single files from a remote repository.
Because our datasets in gin sometimes point just to a folder, we still use datalad to obtain
the list of files in the folder and then use pooch to download them one by one.
Parameters
----------
repo : str, default: "https://gin.g-node.org/NeuralEnsemble/ephy_testing_data"
The repository to download the dataset from
remote_path : str, default: "mearec/mearec_test_10s.h5"
A specific subdirectory in the repository to download (e.g. Mearec, SpikeGLX, etc)
local_folder : str, default: None
local_folder : str, optional
The destination folder / directory to download the dataset to.
defaults to the path "get_global_dataset_folder()" / f{repo_name} (see `spikeinterface.core.globals`)
if None, then the path "get_global_dataset_folder()" / f{repo_name} is used (see `spikeinterface.core.globals`)
update_if_exists : bool, default: False
Forces re-download of the dataset if it already exists, default: False
unlock : bool, default: False
Use to enable the edition of the downloaded file content, default: False
Returns
-------
Path
The local path to the downloaded dataset
Notes
-----
The reason we use pooch is that we have had problems with datalad failing to download
data on Windows machines, especially in CI.
See https://handbook.datalad.org/en/latest/intro/windows.html
"""
import pooch
import datalad.api
from datalad.support.gitrepo import GitRepo

if local_folder is None:
base_local_folder = get_global_dataset_folder()
base_local_folder.mkdir(exist_ok=True, parents=True)
local_folder = base_local_folder / repo.split("/")[-1]
local_folder.mkdir(exist_ok=True, parents=True)
else:
if not local_folder.is_dir():
local_folder.mkdir(exist_ok=True, parents=True)

local_folder = Path(local_folder)
if local_folder.exists() and GitRepo.is_valid_repo(local_folder):
dataset = datalad.api.Dataset(path=local_folder)
# make sure git repo is in clean state
repo = dataset.repo
if update_if_exists:
repo.call_git(["checkout", "--force", "master"])
dataset.update(merge=True)
else:
dataset = datalad.api.install(path=local_folder, source=repo)

local_path = local_folder / remote_path
dataset_status = dataset.status(path=remote_path, annex="simple")

# Download only files that also have a git-annex key
dataset_status_files = [status for status in dataset_status if status["type"] == "file"]
dataset_status_files = [status for status in dataset_status_files if "key" in status]

# This downloads the data set content
dataset.get(remote_path)
git_annex_hashing_algorithm = {"MD5E": "md5"}
for status in dataset_status_files:
hash_algorithm = git_annex_hashing_algorithm[status["backend"]]
hash = status["keyname"].split(".")[0]
known_hash = f"{hash_algorithm}:{hash}"
fname = Path(status["path"]).relative_to(local_folder)
url = f"{repo}/raw/master/{fname.as_posix()}"
expected_full_path = local_folder / fname

# Unlock files of a dataset in order to be able to edit the actual content
if unlock:
dataset.unlock(remote_path, recursive=True)
full_path = pooch.retrieve(
url=url,
fname=str(fname),
path=local_folder,
known_hash=known_hash,
progressbar=True,
)
assert full_path == str(expected_full_path)

return local_path
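
For illustration only (not part of the commit), a minimal sketch of calling the reworked function with its defaults; it assumes download_dataset is importable from spikeinterface.core:

from spikeinterface.core import download_dataset

# Uses the defaults from the signature above: the NeuralEnsemble gin repo and
# remote_path "mearec/mearec_test_10s.h5". Each file is fetched by pooch.retrieve
# and verified against the md5 embedded in its git-annex key: for the MD5E
# backend the keyname is "<md5>.<ext>", which the loop above turns into
# known_hash = "md5:<md5>".
local_path = download_dataset(update_if_exists=False)
print(local_path)  # <global dataset folder>/ephy_testing_data/mearec/mearec_test_10s.h5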
5 changes: 3 additions & 2 deletions src/spikeinterface/extractors/tests/common_tests.py
@@ -18,8 +18,9 @@ class CommonTestSuite:
downloads = []
entities = []

def setUp(self):
for remote_path in self.downloads:
@classmethod
def setUpClass(cls):
for remote_path in cls.downloads:
download_dataset(repo=gin_repo, remote_path=remote_path, local_folder=local_folder, update_if_exists=True)


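Moving the download from setUp to the setUpClass classmethod means the datasets listed in downloads are fetched once per test class rather than before every test method, avoiding a repeated hash check for every test now that each call re-validates files through pooch.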