Fix/gh actions and docs improvements (#30)
* docs changes

* updates to pypi workflow

* formatting docs and badges

* py3.8 for docs

* updates to docs formatting and theme

* removed old theme ref in conf.py

* updated gh actions workflows

* added pre-commit hooks, updated makefile

* updated gitignore

* refactored test path

* updated rst files and sphinx requirement

* formatting and import sorting

* refactored tests for datasets

* docstrings fixes

* bump up version
omsh authored Feb 5, 2024
1 parent 5ef83fc commit 510e522
Showing 55 changed files with 946 additions and 466 deletions.
3 changes: 2 additions & 1 deletion .github/workflows/build.yaml
@@ -5,14 +5,15 @@ on:
   push:
     branches:
       - main
+      - develop

 jobs:
   build:

     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ["3.7"]
+        python-version: ["3.8"]

     steps:
     - uses: actions/checkout@v2
14 changes: 10 additions & 4 deletions .github/workflows/pypi.yaml
@@ -1,35 +1,41 @@
 name: PyPI

 on:
   workflow_dispatch:
+  workflow_run:
+    workflows: ["Build"]
+    types:
+      - completed
   push:
     branches:
       - main

 jobs:
-  deploy:
+  release:
     runs-on: ubuntu-latest
+    if: ${{ github.event.workflow_run.conclusion == 'success' }}
     steps:
     - uses: actions/checkout@v2
     - name: Set up Python
       uses: actions/setup-python@v2
       with:
-        python-version: '3.7'
+        python-version: '3.8'
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip
         pip install build
     - name: Build package
       run: python -m build
-    - name: Publish package
+    - name: publish_pypi
       uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29
       with:
         user: __token__
         password: ${{ secrets.SECRET_PYPI }}
     - name: Create Github release
+      if: steps.publish_pypi.outcome == 'success'
       env:
         GITHUB_TOKEN: ${{ secrets.CLI_GH_TOKEN }}
       run: |
-        VERSION=$(python -c "from dlomix import __version__; print(__version__);")
+        VERSION=$(python setup.py --version)
         gh release create v$VERSION --title $VERSION --generate-notes
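
Note on the release step: the tag version is now read with `python setup.py --version` instead of importing the package. A hedged sketch of the difference (it assumes a `setup.py` at the repository root, as the workflow above uses):

```python
# Sketch: two ways a release script can resolve the package version.
# Importing dlomix pulls in its runtime dependencies (e.g. TensorFlow),
# which may not be installed in a minimal CI job; asking setuptools
# for the version avoids importing the package entirely.
import subprocess

# Old approach (what the workflow used before this commit):
# subprocess.run(["python", "-c", "from dlomix import __version__; print(__version__)"])

# New approach: setuptools reports the version without importing dlomix.
version = subprocess.check_output(
    ["python", "setup.py", "--version"], text=True
).strip()
print(f"v{version}")  # tag passed to `gh release create`, e.g. v0.0.7
```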
8 changes: 8 additions & 0 deletions .gitignore
@@ -54,6 +54,9 @@ coverage.xml
 .hypothesis/
 .pytest_cache/

+# put all coverage in one place
+cov/
+
 # Translations
 *.mo
 *.pot
@@ -150,13 +153,18 @@ notebooks/wandb

 # local to do file if exists :)
 todo.txt
+todo.md

 .DS_Store

 # model checkpoints in the run scripts directory
 run_scripts/checkpoint*
 run_scripts/*.index
 run_scripts/*.data-*
 run_scripts/*.csv

+# testing metadata
+metadata.parquet

+# test assets (will be downloaded the first time tests are run and then ignored by git)
+assets/
16 changes: 16 additions & 0 deletions .pre-commit-config.yaml
@@ -0,0 +1,16 @@
+repos:
+-   repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v2.3.0
+    hooks:
+    -   id: end-of-file-fixer
+    -   id: trailing-whitespace
+-   repo: https://github.com/pycqa/isort
+    rev: 5.12.0
+    hooks:
+    -   id: isort
+        name: isort (python)
+        args: ["--profile", "black"]
+-   repo: https://github.com/psf/black
+    rev: 23.1.0
+    hooks:
+    -   id: black
2 changes: 1 addition & 1 deletion .readthedocs.yaml
@@ -9,7 +9,7 @@ version: 2
 build:
   os: ubuntu-20.04
   tools:
-    python: "3.7"
+    python: "3.8"

 # Build documentation in the docs/ directory with Sphinx
 sphinx:
29 changes: 28 additions & 1 deletion Makefile
@@ -1,17 +1,44 @@
 install:
 	pip install --upgrade pip && pip install -e .

+uninstall:
+	pip uninstall dlomix -y
+
+install-nodeps:
+	pip install --upgrade pip && pip install -e . --no-deps
+
+
 install-dev:
 	pip install --upgrade pip && pip install -e .[dev]

 .PHONY: test
 test:
-	python -m pytest ./test/*.py --doctest-modules --junitxml=junit/test-results.xml --cov=dlomix --cov-report=xml --cov-report=html
+	make uninstall
+	make install
+	mkdir -p cov/
+
+	python -m pytest tests/ --junitxml=junit/test-results.xml --cov=dlomix --cov-report html:cov/cov_html --cov-report xml:cov/cov.xml --cov-report lcov:cov/cov.info --cov-report annotate:cov/cov_annotate
+
+test-local:
+	make uninstall
+	make install-nodeps
+	mkdir -p cov/
+
+	python -m pytest tests/ --junitxml=junit/test-results.xml --cov=dlomix --cov-report html:cov/cov_html --cov-report xml:cov/cov.xml --cov-report lcov:cov/cov.info --cov-report annotate:cov/cov_annotate
+
+
 format:
 	black ./dlomix/*
+	isort --profile black .
+	black ./dlomix/*.py
+	black ./run_scripts/*.py
+	black ./tests/*.py

 lint:
 	pylint --disable=R,C ./dlomix/*

+build-docs:
+	cd docs && make clean html
+	cd docs/_build/html/ && open index.html
+
 all: install format test
4 changes: 2 additions & 2 deletions dlomix/__init__.py
@@ -1,10 +1,10 @@
-__version__ = "0.0.6"
+__version__ = "0.0.7"

 META_DATA = {
     "author": "Omar Shouman",
     "author_email": "[email protected]",
     "description": "Deep Learning for Proteomics",
     "package_name": "DLOmix",
-    "copyright_text": "2023, Wilhelm Lab, TU Munich.",
+    "copyright_text": "2024, Wilhelm Lab, TU Munich.",
     "github_url": "https://github.com/wilhelm-lab/dlomix",
 }
29 changes: 15 additions & 14 deletions dlomix/data/RetentionTimeDataset.py
@@ -1,7 +1,9 @@
 import json
-import pandas as pd
+
 import numpy as np
+import pandas as pd
 import tensorflow as tf
+
 from dlomix.constants import DEFAULT_PARQUET_ENGINE

 """
@@ -22,29 +24,29 @@ class RetentionTimeDataset:
     data_source : str, tuple of two numpy.ndarray, numpy.ndarray, optional
         source can be a tuple of two arrays (sequences, targets), single array (sequences), useful for test data, or a str with a file path to a csv file. Defaults to None.
     sep : str, optional
-        separator to be used if the data source is a CSV file. Defaults to ",".
+        separator to be used if the data source is a CSV file. Defaults to ``,``.
     sequence_col : str, optional
-        name of the column containing the sequences in the provided CSV. Defaults to "sequence".
+        name of the column containing the sequences in the provided CSV. Defaults to ``sequence``.
     target_col : str, optional
-        name of the column containing the targets (indexed retention time). Defaults to "irt".
+        name of the column containing the targets (indexed retention time). Defaults to ``irt``.
     feature_cols : list, optional
-        a list of columns containing other features that can be used later as inputs to a model. Defaults to None.
+        a list of columns containing other features that can be used later as inputs to a model. Defaults to ``None``.
     normalize_targets : bool, optional
-        a boolean whether to normalize the targets or not (subtract mean and divide by standard deviation). Defaults to False.
+        a boolean whether to normalize the targets or not (subtract mean and divide by standard deviation). Defaults to ``False``.
     seq_length : int, optional
-        the sequence length to be used, where all sequences will be padded to this length, longer sequences will be removed and not truncated. Defaults to 0.
+        the sequence length to be used, where all sequences will be padded to this length, longer sequences will be removed and not truncated. Defaults to ``0``.
     batch_size : int, optional
-        the batch size to be used for consuming the dataset in training a model. Defaults to 32.
+        the batch size to be used for consuming the dataset in training a model. Defaults to ``32``.
     val_ratio : int, optional
-        a fraction to determine the size of the validation data (0.2 = 20%). Defaults to 0.
+        a fraction to determine the size of the validation data (``0.2 = 20%``). Defaults to ``0``.
     seed : int, optional
-        a seed to use for splitting the data to allow for a reproducible split. Defaults to 21.
+        a seed to use for splitting the data to allow for a reproducible split. Defaults to ``21``.
     test : bool, optional
-        a boolean whether the dataset is a test dataset or not. Defaults to False.
+        a boolean whether the dataset is a test dataset or not. Defaults to ``False``.
     path_aminoacid_atomcounts : str, optional
-        a string with a path to a CSV table with the atom counts of the different amino acids (can be used for feature extraction). Defaults to None.
+        a string with a path to a CSV table with the atom counts of the different amino acids (can be used for feature extraction). Defaults to ``None``.
     sample_run : bool, optional
-        a boolean to limit the number of examples to a small number, SAMPLE_RUN_N, for testing and debugging purposes. Defaults to False.
+        a boolean to limit the number of examples to a small number, SAMPLE_RUN_N, for testing and debugging purposes. Defaults to ``False``.
     """
     ATOM_TABLE = None
     SPLIT_NAMES = ["train", "val", "test"]
@@ -376,7 +378,6 @@ def _pad_sequences(self, seq, target):
         return seq, target

     def _normalize_target(self, seq, target):
-
         target = tf.math.divide(
             tf.math.subtract(target, self._data_mean), self._data_std
         )
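
Taken together, the documented parameters make a typical construction look like this hedged sketch (the CSV path is a placeholder; the column names are the documented defaults):

```python
# Hypothetical usage sketch of RetentionTimeDataset; "rt_data.csv"
# is a placeholder file using the documented default column names.
from dlomix.data import RetentionTimeDataset

rtdata = RetentionTimeDataset(
    data_source="rt_data.csv",  # CSV with "sequence" and "irt" columns
    sequence_col="sequence",
    target_col="irt",
    seq_length=30,           # pad to 30; longer sequences are dropped, not truncated
    batch_size=32,
    val_ratio=0.2,           # 20% of examples go to the validation split
    normalize_targets=True,  # subtract mean and divide by standard deviation
    seed=21,
)
```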
2 changes: 1 addition & 1 deletion dlomix/data/__init__.py
@@ -1,4 +1,4 @@
-from .RetentionTimeDataset import *
 from .IntensityDataset import *
+from .RetentionTimeDataset import *

 __all__ = ["RetentionTimeDataset", "IntensityDataset"]
26 changes: 16 additions & 10 deletions dlomix/eval/rt_eval.py
@@ -3,17 +3,24 @@


 class TimeDeltaMetric(tf.keras.metrics.Metric):
-    """Implementation of the time delta metric as a Keras Metric.
+    r"""Implementation of the time delta metric as a Keras Metric.

     Parameters
     ----------
-    mean (int, optional): Mean value of the targets in case normalization was performed. Defaults to 0.
-    std (int, optional): Standard deviation value of the targets in case normalization was performed. Defaults to 1.
-    percentage (float, optional): What percentage of the data points to consider, this is specific to the conmputation of the metric. Defaults to 0.95 which corresponds to 95% of the datapoints and is the mostly used value in papers.
-    name (str, optional): Name of the metric so that it can be reported and used later in Keras History objects. Defaults to 'timedelta'.
-    rescale_targets (bool, optional): Whether to rescale (denormalize) targets or not. Defaults to False.
-    rescale_predictions (bool, optional): Whether to rescale (denormalize) predictions or not. Defaults to False.
-    double_delta (bool, optional): Whether to multiple the computed delta by 2 in order to make it two-sided or not. Defaults to False.
+    mean : int, optional
+        Mean value of the targets in case normalization was performed. Defaults to 0.
+    std : int, optional
+        Standard deviation value of the targets in case normalization was performed. Defaults to 1.
+    percentage : float, optional
+        What percentage of the data points to consider; this is specific to the computation of the metric. Defaults to 0.95, which corresponds to 95% of the data points and is the most commonly used value in papers.
+    name : str, optional
+        Name of the metric so that it can be reported and used later in Keras History objects. Defaults to 'timedelta'.
+    rescale_targets : bool, optional
+        Whether to rescale (denormalize) targets or not. Defaults to False.
+    rescale_predictions : bool, optional
+        Whether to rescale (denormalize) predictions or not. Defaults to False.
+    double_delta : bool, optional
+        Whether to multiply the computed delta by 2 in order to make it two-sided or not. Defaults to False.
     """

     def __init__(
@@ -27,7 +34,6 @@ def __init__(
         double_delta=False,
         **kwargs
     ):
-
         super(TimeDeltaMetric, self).__init__(name=name, **kwargs)
         self.delta = self.add_weight(name="delta", initializer="zeros")
         self.batch_count = self.add_weight(name="batch-count", initializer="zeros")
@@ -66,7 +72,7 @@ def update_state(self, y_true, y_pred, sample_weight=None):
         self.delta.assign_add(tf.math.reduce_sum(d))

     def result(self):
-        # this is simple averaging over the batches, more complex reduction can be added based on domain expertises
+        # this is simple averaging over the batches, more complex reduction can be added based on domain expertise
         # Examples are: take max or min of both deltas (translates to a strict or a relaxed metric)
         return tf.math.divide(self.delta, self.batch_count)
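
Because TimeDeltaMetric is a regular Keras metric, it plugs directly into `model.compile`; a minimal sketch (the model and the normalization statistics below are placeholders):

```python
# Hedged usage sketch for TimeDeltaMetric; the Dense model is a stand-in.
import tensorflow as tf
from dlomix.eval.rt_eval import TimeDeltaMetric

model = tf.keras.Sequential([tf.keras.layers.Dense(1)])
model.compile(
    optimizer="adam",
    loss="mse",
    # mean/std should be the statistics used when the targets were normalized
    metrics=[TimeDeltaMetric(mean=0.0, std=1.0, percentage=0.95)],
)
```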
3 changes: 1 addition & 2 deletions dlomix/layers/attention.py
@@ -1,6 +1,6 @@
 import tensorflow as tf
 import tensorflow.keras.backend as K
-from tensorflow.keras import regularizers, constraints, initializers, activations
+from tensorflow.keras import activations, constraints, initializers, regularizers


 class DecoderAttentionLayer(tf.keras.layers.Layer):
@@ -34,7 +34,6 @@ def __init__(
         bias=True,
         **kwargs
     ):
-
         self.supports_masking = True
         self.init = initializers.get("glorot_uniform")
         self.W_regularizer = regularizers.get(W_regularizer)
2 changes: 1 addition & 1 deletion dlomix/losses/__init__.py
@@ -1,3 +1,3 @@
-from .intensity import masked_spectral_distance, masked_pearson_correlation_distance
+from .intensity import masked_pearson_correlation_distance, masked_spectral_distance

 __all__ = [masked_spectral_distance, masked_pearson_correlation_distance]
4 changes: 2 additions & 2 deletions dlomix/losses/intensity.py
@@ -40,7 +40,7 @@ def masked_pearson_correlation_distance(y_true, y_pred):

     mx = tf.math.reduce_mean(true_masked)
     my = tf.math.reduce_mean(pred_masked)
-    xm, ym = true_masked-mx, pred_masked-my
+    xm, ym = true_masked - mx, pred_masked - my
     r_num = tf.math.reduce_mean(tf.multiply(xm, ym))
     r_den = tf.math.reduce_std(xm) * tf.math.reduce_std(ym)
-    return 1 - (r_num/r_den)
+    return 1 - (r_num / r_den)
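
For context, this loss is one minus the Pearson correlation of the masked intensity vectors, so perfectly correlated inputs give a distance near zero. A small sketch (the tensor values are made up, and it assumes the masking convention leaves these entries untouched):

```python
# Sketch: the loss returns 1 - r, where r is the Pearson correlation
# between the masked true and predicted intensity vectors.
import tensorflow as tf
from dlomix.losses import masked_pearson_correlation_distance

y_true = tf.constant([[0.1, 0.5, 0.4, 0.0]])
y_pred = tf.constant([[0.1, 0.5, 0.4, 0.0]])  # identical, so r = 1
print(masked_pearson_correlation_distance(y_true, y_pred))  # ~0.0
```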
15 changes: 10 additions & 5 deletions dlomix/models/base.py
@@ -1,17 +1,22 @@
 import tensorflow as tf
 from tensorflow.keras.layers.experimental import preprocessing
+
 from dlomix.constants import ALPHABET_UNMOD


 class RetentionTimePredictor(tf.keras.Model):
-    """A simple class for Retention Time prediction models.
+    r"""A simple class for Retention Time prediction models.

     Parameters
     ----------
-    embedding_dim (int, optional): Dimensionality of the embeddings to be used for representing the Amino Acids. Defaults to 16.
-    seq_length (int, optional): Sequence length of the peptide sequences. Defaults to 30.
-    encoder (str, optional): String for specifying the decoder to use, either based on 1D conv-layers or LSTMs. Defaults to "conv1d".
-    vocab_dict (dict, optional): Dictionary mapping for the vocabulary (the amino acids in this case). Defaults to ALPHABET_UNMOD.
+    embedding_dim: int, optional
+        Dimensionality of the embeddings to be used for representing the Amino Acids. Defaults to ``16``.
+    seq_length: int, optional
+        Sequence length of the peptide sequences. Defaults to ``30``.
+    encoder: str, optional
+        String for specifying the encoder to use, either based on 1D conv-layers or LSTMs. Defaults to ``conv1d``.
+    vocab_dict: dict, optional
+        Dictionary mapping for the vocabulary (the amino acids in this case). Defaults to ``ALPHABET_UNMOD``.
     """

     def __init__(
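
A construction sketch using the documented defaults (treat the argument values as assumptions taken from the docstring, not a verified API call):

```python
# Hedged sketch: constructing the predictor with its documented defaults.
from dlomix.models.base import RetentionTimePredictor

model = RetentionTimePredictor(
    embedding_dim=16,  # documented default
    seq_length=30,     # documented default
    encoder="conv1d",  # documented default; the docstring also mentions LSTMs
)
```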
5 changes: 2 additions & 3 deletions dlomix/models/deepLC.py
@@ -1,7 +1,8 @@
 import tensorflow as tf
-from dlomix.constants import ALPHABET_UNMOD
 from tensorflow.keras.layers.experimental import preprocessing

+from dlomix.constants import ALPHABET_UNMOD
+

 class DeepLCRetentionTimePredictor(tf.keras.Model):
     def __init__(
@@ -38,7 +39,6 @@ def _build_aminoacid_branch(self):
         )

     def _build_diaminoacid_branch(self):
-
         self.diaminoacid_branch = tf.keras.Sequential(
             [
                 self._build_conv_pool_block(n_filters=128, kernel=2, padding="same"),
@@ -87,7 +87,6 @@ def _build_conv_pool_block(
         pool_strides=2,
         pool_size=2,
     ):
-
         # leaky relu by default
         activation_fn = self.leaky_relu
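
The branches above are assembled from a shared conv + pool building block with a leaky ReLU activation; a hedged sketch of that pattern (layer arguments beyond those visible in the diff are assumptions):

```python
# Sketch of the conv + pool block pattern used by the DeepLC branches;
# parameter names mirror _build_conv_pool_block from the diff above.
import tensorflow as tf

def conv_pool_block(n_filters=128, kernel=2, padding="same",
                    pool_size=2, pool_strides=2):
    return tf.keras.Sequential([
        tf.keras.layers.Conv1D(
            n_filters, kernel, padding=padding,
            activation=tf.keras.layers.LeakyReLU(),  # leaky relu by default
        ),
        tf.keras.layers.MaxPool1D(pool_size=pool_size, strides=pool_strides),
    ])
```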
