Dev/main (#117)
rhoadesScholar authored Feb 21, 2024
2 parents c97d8cf + 415a7b3 commit 874a4a0
Showing 116 changed files with 2,622 additions and 869 deletions.
26 changes: 26 additions & 0 deletions .github/ISSUE_TEMPLATE.md
@@ -0,0 +1,26 @@
+* {{ cookiecutter.project_name }} version:
+* Python version:
+* Operating System:
+
+### Bug or feature?
+
+Are you reporting a bug or adding a new feature? Perhaps you want more documentation?
+Please tell us here and tag the issue appropriately.
+
+### Description
+
+Describe the bug you want fixed, the feature you want to see, etc.
+If you are requesting a new feature, please say why you think that feature is important and belongs in this library, and any guesses on how to implement it.
+
+### What I Did/What I Would Like to Do
+
+For a bug:
+```
+Paste the command(s) you ran and the output.
+If there was a crash, please include the traceback here.
+```
+
+For a feature:
+```
+Provide sample code that you would hope to run and what it would output.
+```
3 changes: 1 addition & 2 deletions .github/workflows/docs.yaml
@@ -16,12 +16,11 @@ jobs:
         with:
           fetch-depth: 0 # otherwise, you will failed to push refs to dest repo
       - name: install dacapo
-        run: pip install .
+        run: pip install .[docs]
       - name: Build and Commit
        uses: sphinx-notes/pages@v2
         with:
           documentation_path: ./docs/source
-          requirements_path: ./docs/requirements.txt
      - name: Push changes
        uses: ad-m/github-push-action@master
        with:
1 change: 1 addition & 0 deletions .gitignore
@@ -17,4 +17,5 @@ __pycache__
 # vscode stuff
 .vscode
 .mypy_cache
+daisy_logs/

9 changes: 0 additions & 9 deletions .pre-commit-config.yaml
@@ -22,12 +22,3 @@ repos:
     rev: v0.16
     hooks:
       - id: validate-pyproject
-
-  - repo: https://github.com/pre-commit/mirrors-mypy
-    rev: v1.8.0
-    hooks:
-      - id: mypy
-        files: "^dacapo/"
-        # # you have to add the things you want to type check against here
-        # additional_dependencies:
-        #   - numpy
3 changes: 3 additions & 0 deletions CONTRIBUTOR.md → CONTRIBUTING.md
@@ -20,6 +20,9 @@ To run tests with coverage locally:
 `pytest tests --color=yes --cov --cov-report=term-missing`
 This will also be run automatically when a PR is made to master, and a codecov report will be generated telling you whether your PR increased or decreased coverage.

+## Doc Generation
+Docstrings are generated by a GitHub action, but you can also build the docs locally with
+`sphinx-build -M html docs/source/ docs/build/`

 ## Branching and PRs
 - Users that have been added to the CellMap organization and the DaCapo project should be able to develop directly into the CellMap fork of DaCapo. Other users will need to create a fork.
18 changes: 18 additions & 0 deletions dacapo/__init__.py
@@ -1,6 +1,24 @@
"""
dacapo module
==============
This module contains several useful methods for performing common tasks in dacapo library.
Modules:
-----------
Options - Deals with configuring aspects of the program's operations.
experiments - This module is responsible for conducting experiments.
apply - Applies the results of the training process to the given dataset.
train - Trains the model using given data set.
validate - This module is for validating the model.
predict - This module is used to generate predictions based on the model.
"""

from .options import Options # noqa
from . import experiments # noqa
from .apply import apply # noqa
from .train import train # noqa
from .validate import validate # noqa
from .predict import predict # noqa

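The new docstring doubles as a map of the package's public API. A minimal workflow sketch, assuming a run configuration named "my_run" already exists in the configured store (the run name and iteration number here are invented):

```python
import dacapo

# Hypothetical workflow; "my_run" and the iteration number are invented.
dacapo.train("my_run")                     # train the model for the stored run config
dacapo.validate("my_run", iteration=5000)  # score a saved checkpoint on validation data
```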
70 changes: 38 additions & 32 deletions dacapo/apply.py
Expand Up @@ -26,15 +26,15 @@

def apply(
run_name: str,
input_container: Path or str,
input_container: Path | str,
input_dataset: str,
output_path: Path or str,
validation_dataset: Optional[Dataset or str] = None,
criterion: Optional[str] = "voi",
output_path: Path | str,
validation_dataset: Optional[Dataset | str] = None,
criterion: str = "voi",
iteration: Optional[int] = None,
parameters: Optional[PostProcessorParameters or str] = None,
roi: Optional[Roi or str] = None,
num_cpu_workers: int = 30,
parameters: Optional[PostProcessorParameters | str] = None,
roi: Optional[Roi | str] = None,
num_workers: int = 30,
output_dtype: Optional[np.dtype | str] = np.uint8, # type: ignore
compute_context: ComputeContext = LocalTorch(),
overwrite: bool = True,
@@ -75,24 +75,27 @@ def apply(
     logger.info("Loading weights for iteration %i", iteration)
     weights_store.retrieve_weights(run_name, iteration)

-    # find the best parameters
-    if isinstance(validation_dataset, str) and run.datasplit.validate is not None:
-        val_ds_name = validation_dataset
-        validation_dataset = [
-            dataset for dataset in run.datasplit.validate if dataset.name == val_ds_name
-        ][0]
-    elif isinstance(validation_dataset, Dataset) or parameters is not None:
-        pass
-    else:
-        raise ValueError(
-            "validation_dataset must be a dataset name or a Dataset object, or parameters must be provided explicitly."
-        )
     if parameters is None:
+        # find the best parameters
+        _validation_dataset: Dataset
+        if isinstance(validation_dataset, str) and run.datasplit.validate is not None:
+            val_ds_name = validation_dataset
+            _validation_dataset = [
+                dataset
+                for dataset in run.datasplit.validate
+                if dataset.name == val_ds_name
+            ][0]
+        elif isinstance(validation_dataset, Dataset):
+            _validation_dataset = validation_dataset
+        else:
+            raise ValueError(
+                "validation_dataset must be a dataset name or a Dataset object, or parameters must be provided explicitly."
+            )
         logger.info(
-            "Finding best parameters for validation dataset %s", validation_dataset
+            "Finding best parameters for validation dataset %s", _validation_dataset
         )
         parameters = run.task.evaluator.get_overall_best_parameters(  # TODO
-            validation_dataset, criterion
+            _validation_dataset, criterion
         )
     assert (
         parameters is not None
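The refactor makes the contract explicit: the validation lookup runs only when no parameters are given. A hypothetical call sketch (the run, container, and dataset names are invented):

```python
# With parameters=None, apply() finds the post-processing parameters that
# scored best on the named validation dataset under the given criterion.
apply(
    run_name="my_run",
    input_container="input.zarr",
    input_dataset="volumes/raw",
    output_path="output.zarr",
    validation_dataset="example_val",
    criterion="voi",
)
```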
@@ -157,42 +160,45 @@ def apply(
         Path(input_container, input_dataset),
     )
     return apply_run(
-        run,
+        run.name,
         iteration,
         parameters,
-        input_array,
+        input_array_identifier,
         prediction_array_identifier,
         output_array_identifier,
         roi,
-        num_cpu_workers,
+        num_workers,
         output_dtype,
         compute_context,
         overwrite,
     )


 def apply_run(
-    run: Run,
+    run_name: str,
     iteration: int,
     parameters: PostProcessorParameters,
-    input_array: Array,
+    input_array_identifier: "LocalArrayIdentifier",
     prediction_array_identifier: "LocalArrayIdentifier",
     output_array_identifier: "LocalArrayIdentifier",
     roi: Optional[Roi] = None,
-    num_cpu_workers: int = 30,
+    num_workers: int = 30,
     output_dtype: Optional[np.dtype] = np.uint8,  # type: ignore
     compute_context: ComputeContext = LocalTorch(),
     overwrite: bool = True,
 ):
     """Apply the model to a dataset. If roi is None, the whole input dataset is used. Assumes model is already loaded."""
-    run.model.eval()

     # render prediction dataset
     logger.info("Predicting on dataset %s", prediction_array_identifier)
     predict(
-        run.model,
-        input_array,
-        prediction_array_identifier,
+        run_name,
+        iteration,
+        input_container=input_array_identifier.container,
+        input_dataset=input_array_identifier.dataset,
+        output_path=prediction_array_identifier.container,
         output_roi=roi,
-        num_workers=num_cpu_workers,
+        num_workers=num_workers,
         output_dtype=output_dtype,
         compute_context=compute_context,
         overwrite=overwrite,
2 changes: 1 addition & 1 deletion dacapo/blockwise/__init__.py
@@ -1,2 +1,2 @@
 from .blockwise_task import DaCapoBlockwiseTask
-from .scheduler import run_blockwise
+from .scheduler import run_blockwise, segment_blockwise
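With the scheduler re-export, both blockwise entry points are importable from the package root (names exactly as in the diff above):

```python
from dacapo.blockwise import DaCapoBlockwiseTask, run_blockwise, segment_blockwise
```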
6 changes: 3 additions & 3 deletions dacapo/blockwise/argmax_worker.py
@@ -1,7 +1,7 @@
 from pathlib import Path
 from dacapo.experiments.datasplits.datasets.arrays.zarr_array import ZarrArray
 from dacapo.store.array_store import LocalArrayIdentifier
-from dacapo.compute_context import ComputeContext, LocalTorch, Bsub
+from dacapo.compute_context import ComputeContext, LocalTorch

 import daisy

@@ -41,9 +41,9 @@ def cli(log_level):
 )
 @click.option("-od", "--output_dataset", required=True, type=str)
 def start_worker(
-    input_container: Path or str,
+    input_container: Path | str,
     input_dataset: str,
-    output_container: Path or str,
+    output_container: Path | str,
     output_dataset: str,
 ):
     # get arrays
15 changes: 7 additions & 8 deletions dacapo/blockwise/blockwise_task.py
@@ -1,17 +1,16 @@
 from datetime import datetime
 from importlib.machinery import SourceFileLoader
 from pathlib import Path
-from typing import Callable, Optional
 from daisy import Task, Roi
-from dacapo.compute_context import ComputeContext, LocalTorch, Bsub
+from dacapo.compute_context import ComputeContext
 import dacapo.compute_context


 class DaCapoBlockwiseTask(Task):
     def __init__(
         self,
-        worker_file: str or Path,
-        compute_context: ComputeContext or str,
+        worker_file: str | Path,
+        compute_context: ComputeContext | str,
         total_roi: Roi,
         read_roi: Roi,
         write_roi: Roi,
@@ -25,14 +24,14 @@ def __init__(
         if isinstance(compute_context, str):
             compute_context = getattr(dacapo.compute_context, compute_context)()

-        # Make the task_id unique
-        timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
-        task_id = worker_file + timestamp
-
         # Load worker functions
         worker_name = Path(worker_file).stem
         worker = SourceFileLoader(worker_name, str(worker_file)).load_module()

+        # Make the task_id unique
+        timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+        task_id = worker_name + timestamp
+
         process_function = worker.spawn_worker(
             *args, **kwargs, compute_context=compute_context
         )
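Moving the task-id code below the loader also changes what it is built from: the old version concatenated the full `worker_file` path (separators and all) with the timestamp, while the new version uses the loaded worker's stem. A small sketch (the path is an invented example):

```python
from datetime import datetime
from pathlib import Path

worker_file = "/repo/dacapo/blockwise/predict_worker.py"  # invented example path
worker_name = Path(worker_file).stem                      # "predict_worker"
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# old: worker_file + timestamp -> "/repo/dacapo/blockwise/predict_worker.py2024-02-21_12-00-00"
# new: worker_name + timestamp -> "predict_worker2024-02-21_12-00-00"
print(worker_name + timestamp)
```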
9 changes: 7 additions & 2 deletions dacapo/blockwise/predict_worker.py
@@ -1,4 +1,6 @@
 from pathlib import Path
+
+import torch
 from dacapo.experiments.datasplits.datasets.arrays.zarr_array import ZarrArray
 from dacapo.gp.dacapo_array_source import DaCapoArraySource
 from dacapo.store.array_store import LocalArrayIdentifier
@@ -60,9 +62,9 @@ def cli(log_level):
 def start_worker(
     run_name: str,
     iteration: int,
-    input_container: Path or str,
+    input_container: Path | str,
     input_dataset: str,
-    output_container: Path or str,
+    output_container: Path | str,
     output_dataset: str,
     device: str = "cuda",
 ):
@@ -86,6 +88,9 @@ def start_worker(
     )
     output_array = ZarrArray.open_from_array_identifier(output_array_identifier)

+    # set benchmark flag to True for performance
+    torch.backends.cudnn.benchmark = True
+
     # get the model's input and output size
     model = run.model.eval()
     input_voxel_size = Coordinate(raw_array.voxel_size)
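`torch.backends.cudnn.benchmark = True` tells cuDNN to time several convolution algorithms the first time it sees an input shape and cache the fastest one, which pays off here because blockwise prediction feeds the model uniformly sized blocks; with frequently changing shapes the repeated autotuning can instead slow things down, and it trades away exact reproducibility. A minimal sketch of the flag and its companion settings:

```python
import torch

# Autotune conv algorithms per input shape; a good fit for fixed-size blocks.
torch.backends.cudnn.benchmark = True

# For bitwise run-to-run reproducibility one would instead use:
# torch.backends.cudnn.benchmark = False
# torch.backends.cudnn.deterministic = True
```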