fixed docstrings

wilhelm-lab · Oct 27, 2024 · 389b9cd · 389b9cd
1 parent 80acbc1
commit 389b9cd
Show file tree

Hide file tree

Showing 10 changed files with 38 additions and 23 deletions.
diff --git a/oktoberfest/data/spectra.py b/oktoberfest/data/spectra.py
@@ -66,6 +66,7 @@ def _gen_vars_df(ion_types: list[str] = c.FRAGMENTATION_TO_IONS_BY_PAIRS["HCD"],
         Create annotation dataframe for vars in AnnData object.
 
         :param ion_types: ion types that are expected to be in the spectra
+        :param xl: crosslinked or linear peptide
         :return: pd.Dataframe of fragment annotations
         """
         df = pd.DataFrame(
@@ -85,6 +86,7 @@ def _gen_column_names(fragment_type: FragmentType, xl: bool = False) -> list[str
         Get column names of the spectra data.
 
         :param fragment_type: choose predicted, raw, or mz
+        :param xl: whether the peptides are crosslinked (True) or linear (False)
         :return: A list of column names
         """
         prefix = Spectra._resolve_prefix(fragment_type)
@@ -101,7 +103,10 @@ def _gen_column_names(fragment_type: FragmentType, xl: bool = False) -> list[str
     @staticmethod
     def _resolve_prefix(fragment_type: FragmentType) -> str:
         """
-        Resolve prefix given fragment type (1 for pred, 2 for xl_pred_a, 3 for xl_pred_a, 4 for raw, 5 for xl_raw_a, 6 for xl_raw_b, 7 for mz, 8 for xl_mz_a, 9 for xl_mz_b).
+        Resolve prefix given fragment type.
+
+        (1 for pred, 2 for xl_pred_a, 3 for xl_pred_b, 4 for raw, 5 for xl_raw_a,
+        6 for xl_raw_b, 7 for mz, 8 for xl_mz_a, 9 for xl_mz_b).
 
         :param fragment_type: choose predicted, raw, or mz
         :return: prefix as string
@@ -348,7 +353,6 @@ def from_hdf5(cls: type[SpectraT], input_file: str | Path) -> SpectraT:
         :param input_file: path to input file
         :return: a spectra instance
         """
-
         return cls(anndata.read_h5ad(str(input_file)))
 
     def remove_decoys(self) -> None:

diff --git a/oktoberfest/predict/alignment.py b/oktoberfest/predict/alignment.py
@@ -59,6 +59,7 @@ def _alignment(alignment_library: Spectra, xl: bool = False):
     adds it as a column to the alignment library.
 
     :param alignment_library: the library to perform the alignment on
+    :param xl: whether the peptides are crosslinked (True) or linear (False)
     """
     if xl:
         pred_intensity_a = alignment_library.get_matrix(FragmentType.PRED_A)

diff --git a/oktoberfest/predict/koina.py b/oktoberfest/predict/koina.py
@@ -1,6 +1,5 @@
 from __future__ import annotations
 
-import copy
 import logging
 from typing import TYPE_CHECKING
 
@@ -12,8 +11,6 @@
 logger = logging.getLogger(__name__)
 
 if TYPE_CHECKING:
-    from typing import Dict, Tuple
-
     import numpy as np
 
 

diff --git a/oktoberfest/predict/predictor.py b/oktoberfest/predict/predictor.py
@@ -122,6 +122,7 @@ def predict_intensities(
 
         :param data: Spectra object containing the required data for prediction and to store the
             predictions in after retrieval from the server.
+        :param xl: crosslinked or linear peptide
         :param chunk_idx: The chunked indices of the provided dataframe. This is required in some cases,
             e.g. if padding should be avoided when predicting peptides of different length.
             For alphapept, this is required as padding is only performed within one batch, leading to
@@ -223,6 +224,7 @@ def predict_at_once(self, data: Spectra, xl: bool = False, **kwargs) -> dict[str
         See the Koina or DLomix predict functions for details. TODO, link this properly.
 
         :param data: Spectra containing the data for the prediction.
+        :param xl: whether the peptides are crosslinked (True) or linear (False)
         :param kwargs: Additional parameters that are forwarded to Koina/DLomix::predict
 
         :return: a dictionary with targets (keys) and predictions (values)
@@ -304,6 +306,7 @@ def predict_in_chunks(
             e.g. if padding should be avoided when predicting peptides of different length.
             For alphapept, this is required as padding is only performed within one batch, leading to
             different sizes of arrays between individual prediction batches that cannot be concatenated.
+        :param xl: crosslinked or linear peptide
         :param kwargs: Additional parameters that are forwarded to Koina/DLomix::predict
 
         :return: a dictionary with targets (keys) and list of predictions (values) with a length equal
@@ -354,6 +357,7 @@ def ce_calibration(
         :param library: spectral library to perform CE calibration on
         :param ce_range: the min and max CE to be tested during calibration
         :param group_by_charge: if true, select the top 1000 spectra independently for each precursor charge
+        :param xl: crosslinked or linear peptide
         :param kwargs: Additional parameters that are forwarded to Koina/DLomix::predict
         :return: a spectra object containing the spectral angle for each tested CE
 
@@ -387,7 +391,6 @@ def ce_calibration(
             >>> alignment_library = intensity_predictor.ce_calibration(library=library, ce_range=(15,30), group_by_charge=False)
             >>> print(alignment_library)
         """
-
         alignment_library = _prepare_alignment_df(library, ce_range=ce_range, group_by_charge=group_by_charge, xl=xl)
 
         if "alphapept" in self.model_name.lower():

diff --git a/oktoberfest/preprocessing/preprocessing.py b/oktoberfest/preprocessing/preprocessing.py
@@ -984,6 +984,7 @@ def annotate_spectral_library_xl(psms: Spectra, mass_tol: Optional[float] = None
     :param psms: Spectral library to be annotated.
     :param mass_tol: The mass tolerance allowed for retaining peaks
     :param unit_mass_tol: The unit in which the mass tolerance is given
+    :return: Spectra object containing the annotated b and y ion peaks including metadata
     """
     logger.info("Annotating spectra...")
     df_annotated_spectra = annotate_spectra(psms, mass_tol, unit_mass_tol)

diff --git a/oktoberfest/rescore/rescore.py b/oktoberfest/rescore/rescore.py
@@ -41,6 +41,7 @@ def generate_features(
     :param output_file: the location to the generated tab file to be used for percolator / mokapot
     :param additional_columns: additional columns supplied in the search results to be used as features (either a list or "all")
     :param all_features: whether to use all features or only the standard set TODO
+    :param xl: crosslinked or linear peptide
     :param regression_method: The regression method to use for iRT alignment
     :param add_neutral_loss_features: Flag to indicate whether to add neutral loss features to percolator or not
     :param remove_miss_cleavage_features: Flag to indicate whether to remove miss cleavage features from percolator or not
@@ -233,6 +234,7 @@ def rescore_with_percolator(
     :param num_threads: The number of threads used in parallel for percolator
     :param test_fdr: the fdr cutoff for the test set
     :param train_fdr: the fdr cutoff for the train set
+    :param xl: crosslinked or linear peptide
     :raises FileNotFoundError: if the input file does not exist
     """
     if isinstance(input_file, str):

diff --git a/oktoberfest/runner.py b/oktoberfest/runner.py
@@ -2,6 +2,7 @@
 import json
 import logging
 import pickle
+import shutil
 import sys
 import time
 from functools import partial
@@ -10,8 +11,6 @@
 from pathlib import Path
 from typing import Optional, Union
 
-import shutil
-
 import numpy as np
 import pandas as pd
 from sklearn.linear_model import LinearRegression, RANSACRegressor
@@ -710,6 +709,7 @@ def _rescore(fdr_dir: Path, config: Config, xl: bool = False):
             'f{config.fdr_estimation_method} is not a valid rescoring tool, use either "percolator" or "mokapot"'
         )
 
+
 def xl_fdr(df: pd.DataFrame, score: str) -> pd.DataFrame:
     """
     "calculate and add fdr_xl to the DataFrame : (TD-DD)/(TT)".
@@ -1305,7 +1305,7 @@ def run_rescoring(config_path: Union[str, Path]):
             shutil.copy(fdr_dir / "rescore.tab", rescore_features_path)
             input_psm_rescore = prepare_rescore_xl_psm_level(str(fdr_dir), "rescore")
             input_psm_rescore.to_csv(str(fdr_dir) + "/rescore.tab", sep="\t", index=None)
-        
+
         original_features_path = fdr_dir / "original_features_csm.tab"
         if not original_features_path.exists():
             shutil.copy(fdr_dir / "original.tab", original_features_path)
@@ -1326,7 +1326,6 @@ def run_rescoring(config_path: Union[str, Path]):
             logger.info("Finished Generating xiFDR input.")
             generate_xifdr_input_step.mark_done()
 
-
     else:
         _rescore(fdr_dir, config)
         # plotting

diff --git a/tests/unit_tests/configs/rescoring_cleavable_xl.json b/tests/unit_tests/configs/rescoring_cleavable_xl.json
@@ -10,7 +10,7 @@
     },
     "models": {
         "intensity": "Prosit_2023_intensity_XL_CMS2",
-	"irt": ""
+        "irt": ""
     },
     "prediction_server": "koina.wilhelmlab.org:443",
     "numThreads": 1,
@@ -21,10 +21,7 @@
     "massTolerance": 20,
     "unitMassTolerance": "ppm",
     "ce_alignment_options": {
-        "ce_range": [
-            19,
-            50
-        ],
+        "ce_range": [19, 50],
         "use_ransac_model": false
     }
 }
diff --git a/tests/unit_tests/data/xl/cleavable/inputs/L1_20211028_BB_HCMV_b4_5mM_repl2_B9.mzML b/tests/unit_tests/data/xl/cleavable/inputs/L1_20211028_BB_HCMV_b4_5mM_repl2_B9.mzML
@@ -450394,4 +450394,4 @@
   </indexList>
   <indexListOffset>71226581</indexListOffset>
   <fileChecksum>4e6964d4dfa469ae3bf071e9ad81e1983db536a7</fileChecksum>
-</indexedmzML>
+</indexedmzML>
diff --git a/tests/unit_tests/test_runner.py b/tests/unit_tests/test_runner.py
@@ -3,10 +3,11 @@
 from pathlib import Path
 from unittest.mock import patch
 
+import pandas as pd
+from pandas.testing import assert_frame_equal
+
 from oktoberfest.__main__ import main
 from oktoberfest.utils import Config
-import numpy as np
-import pandas as pd
 
 
 class TestRunner(unittest.TestCase):
@@ -28,12 +29,22 @@ def test_rescoring_xl(self):
         with patch("sys.argv", ["oktoberfest", f"--config_path={config_path}"]):
             main()
 
-        #expected_perc_tab_file = pd.read_csv(Path(__file__).parent / "data" / "xl" / "cleavable" / "expected_outputs" / "expected_rescore.tab", sep="\t")
-        #created_perc_tab_file = pd.read_csv(Path(__file__).parent / "data" / "xl" / "cleavable" / "out" / "results" / "percolator" / "rescore.tab", sep="\t")
-        #np.testing.assert_almost_equal(expected_perc_tab_file.values, created_perc_tab_file.values)
-        #pd.testing.assert_frame_equal(expected_perc_tab_file, created_perc_tab_file)
+        expected_perc_tab_file = pd.read_csv(
+            Path(__file__).parent / "data" / "xl" / "cleavable" / "expected_outputs" / "expected_rescore.tab", sep="\t"
+        )
+
+        created_perc_tab_file = pd.read_csv(
+            Path(__file__).parent / "data" / "xl" / "cleavable" / "out" / "results" / "percolator" / "rescore.tab",
+            sep="\t",
+        )
+
+        try:
+            assert_frame_equal(expected_perc_tab_file, created_perc_tab_file, check_dtype=True, check_exact=False, rtol = 1e-2)
+        except AssertionError as e:
+            print("DataFrames are not equal:", e)
+            raise  # Re-raise the assertion error for the test framework to catch
 
         config = Config()
         config.read(config_path)
         shutil.rmtree(Path(__file__).parent / "data" / "xl" / "cleavable" / "out")
-
+