Skip to content

Commit

Permalink
fixed docstrings
Browse files Browse the repository at this point in the history
  • Loading branch information
mostafakalhor committed Oct 27, 2024
1 parent 80acbc1 commit 389b9cd
Show file tree
Hide file tree
Showing 10 changed files with 38 additions and 23 deletions.
8 changes: 6 additions & 2 deletions oktoberfest/data/spectra.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ def _gen_vars_df(ion_types: list[str] = c.FRAGMENTATION_TO_IONS_BY_PAIRS["HCD"],
Create annotation dataframe for vars in AnnData object.
:param ion_types: ion types that are expected to be in the spectra
:param xl: crosslinked or linear peptide
:return: pd.DataFrame of fragment annotations
"""
df = pd.DataFrame(
Expand All @@ -85,6 +86,7 @@ def _gen_column_names(fragment_type: FragmentType, xl: bool = False) -> list[str
Get column names of the spectra data.
:param fragment_type: choose predicted, raw, or mz
:param xl: crosslinked or linear peptide
:return: A list of column names
"""
prefix = Spectra._resolve_prefix(fragment_type)
Expand All @@ -101,7 +103,10 @@ def _gen_column_names(fragment_type: FragmentType, xl: bool = False) -> list[str
@staticmethod
def _resolve_prefix(fragment_type: FragmentType) -> str:
"""
Resolve prefix given fragment type (1 for pred, 2 for xl_pred_a, 3 for xl_pred_b, 4 for raw, 5 for xl_raw_a, 6 for xl_raw_b, 7 for mz, 8 for xl_mz_a, 9 for xl_mz_b).
Resolve prefix given fragment type.
(1 for pred, 2 for xl_pred_a, 3 for xl_pred_b, 4 for raw, 5 for xl_raw_a,
6 for xl_raw_b, 7 for mz, 8 for xl_mz_a, 9 for xl_mz_b).
:param fragment_type: choose predicted, raw, or mz
:return: prefix as string
Expand Down Expand Up @@ -348,7 +353,6 @@ def from_hdf5(cls: type[SpectraT], input_file: str | Path) -> SpectraT:
:param input_file: path to input file
:return: a spectra instance
"""

return cls(anndata.read_h5ad(str(input_file)))

def remove_decoys(self) -> None:
Expand Down
1 change: 1 addition & 0 deletions oktoberfest/predict/alignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ def _alignment(alignment_library: Spectra, xl: bool = False):
adds it as a column to the alignment library.
:param alignment_library: the library to perform the alignment on
:param xl: crosslinked or linear peptide
"""
if xl:
pred_intensity_a = alignment_library.get_matrix(FragmentType.PRED_A)
Expand Down
3 changes: 0 additions & 3 deletions oktoberfest/predict/koina.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
from __future__ import annotations

import copy
import logging
from typing import TYPE_CHECKING

Expand All @@ -12,8 +11,6 @@
logger = logging.getLogger(__name__)

if TYPE_CHECKING:
from typing import Dict, Tuple

import numpy as np


Expand Down
5 changes: 4 additions & 1 deletion oktoberfest/predict/predictor.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,7 @@ def predict_intensities(
:param data: Spectra object containing the required data for prediction and to store the
predictions in after retrieval from the server.
:param xl: crosslinked or linear peptide
:param chunk_idx: The chunked indices of the provided dataframe. This is required in some cases,
e.g. if padding should be avoided when predicting peptides of different length.
For alphapept, this is required as padding is only performed within one batch, leading to
Expand Down Expand Up @@ -223,6 +224,7 @@ def predict_at_once(self, data: Spectra, xl: bool = False, **kwargs) -> dict[str
See the Koina or DLomix predict functions for details. TODO, link this properly.
:param data: Spectra containing the data for the prediction.
:param xl: crosslinked or linear peptide
:param kwargs: Additional parameters that are forwarded to Koina/DLomix::predict
:return: a dictionary with targets (keys) and predictions (values)
Expand Down Expand Up @@ -304,6 +306,7 @@ def predict_in_chunks(
e.g. if padding should be avoided when predicting peptides of different length.
For alphapept, this is required as padding is only performed within one batch, leading to
different sizes of arrays between individual prediction batches that cannot be concatenated.
:param xl: crosslinked or linear peptide
:param kwargs: Additional parameters that are forwarded to Koina/DLomix::predict
:return: a dictionary with targets (keys) and list of predictions (values) with a length equal
Expand Down Expand Up @@ -354,6 +357,7 @@ def ce_calibration(
:param library: spectral library to perform CE calibration on
:param ce_range: the min and max CE to be tested during calibration
:param group_by_charge: if true, select the top 1000 spectra independently for each precursor charge
:param xl: crosslinked or linear peptide
:param kwargs: Additional parameters that are forwarded to Koina/DLomix::predict
:return: a spectra object containing the spectral angle for each tested CE
Expand Down Expand Up @@ -387,7 +391,6 @@ def ce_calibration(
>>> alignment_library = intensity_predictor.ce_calibration(library=library, ce_range=(15,30), group_by_charge=False)
>>> print(alignment_library)
"""

alignment_library = _prepare_alignment_df(library, ce_range=ce_range, group_by_charge=group_by_charge, xl=xl)

if "alphapept" in self.model_name.lower():
Expand Down
1 change: 1 addition & 0 deletions oktoberfest/preprocessing/preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -984,6 +984,7 @@ def annotate_spectral_library_xl(psms: Spectra, mass_tol: Optional[float] = None
:param psms: Spectral library to be annotated.
:param mass_tol: The mass tolerance allowed for retaining peaks
:param unit_mass_tol: The unit in which the mass tolerance is given
:return: Spectra object containing the annotated b and y ion peaks including metadata
"""
logger.info("Annotating spectra...")
df_annotated_spectra = annotate_spectra(psms, mass_tol, unit_mass_tol)
Expand Down
2 changes: 2 additions & 0 deletions oktoberfest/rescore/rescore.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ def generate_features(
:param output_file: the location to the generated tab file to be used for percolator / mokapot
:param additional_columns: additional columns supplied in the search results to be used as features (either a list or "all")
:param all_features: whether to use all features or only the standard set TODO
:param xl: crosslinked or linear peptide
:param regression_method: The regression method to use for iRT alignment
:param add_neutral_loss_features: Flag to indicate whether to add neutral loss features to percolator or not
:param remove_miss_cleavage_features: Flag to indicate whether to remove miss cleavage features from percolator or not
Expand Down Expand Up @@ -233,6 +234,7 @@ def rescore_with_percolator(
:param num_threads: The number of threads used in parallel for percolator
:param test_fdr: the fdr cutoff for the test set
:param train_fdr: the fdr cutoff for the train set
:param xl: crosslinked or linear peptide
:raises FileNotFoundError: if the input file does not exist
"""
if isinstance(input_file, str):
Expand Down
7 changes: 3 additions & 4 deletions oktoberfest/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import json
import logging
import pickle
import shutil
import sys
import time
from functools import partial
Expand All @@ -10,8 +11,6 @@
from pathlib import Path
from typing import Optional, Union

import shutil

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression, RANSACRegressor
Expand Down Expand Up @@ -710,6 +709,7 @@ def _rescore(fdr_dir: Path, config: Config, xl: bool = False):
'f{config.fdr_estimation_method} is not a valid rescoring tool, use either "percolator" or "mokapot"'
)


def xl_fdr(df: pd.DataFrame, score: str) -> pd.DataFrame:
"""
"calculate and add fdr_xl to the DataFrame : (TD-DD)/(TT)".
Expand Down Expand Up @@ -1305,7 +1305,7 @@ def run_rescoring(config_path: Union[str, Path]):
shutil.copy(fdr_dir / "rescore.tab", rescore_features_path)
input_psm_rescore = prepare_rescore_xl_psm_level(str(fdr_dir), "rescore")
input_psm_rescore.to_csv(str(fdr_dir) + "/rescore.tab", sep="\t", index=None)

original_features_path = fdr_dir / "original_features_csm.tab"
if not original_features_path.exists():
shutil.copy(fdr_dir / "original.tab", original_features_path)
Expand All @@ -1326,7 +1326,6 @@ def run_rescoring(config_path: Union[str, Path]):
logger.info("Finished Generating xiFDR input.")
generate_xifdr_input_step.mark_done()


else:
_rescore(fdr_dir, config)
# plotting
Expand Down
7 changes: 2 additions & 5 deletions tests/unit_tests/configs/rescoring_cleavable_xl.json
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
},
"models": {
"intensity": "Prosit_2023_intensity_XL_CMS2",
"irt": ""
"irt": ""
},
"prediction_server": "koina.wilhelmlab.org:443",
"numThreads": 1,
Expand All @@ -21,10 +21,7 @@
"massTolerance": 20,
"unitMassTolerance": "ppm",
"ce_alignment_options": {
"ce_range": [
19,
50
],
"ce_range": [19, 50],
"use_ransac_model": false
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -450394,4 +450394,4 @@
</indexList>
<indexListOffset>71226581</indexListOffset>
<fileChecksum>4e6964d4dfa469ae3bf071e9ad81e1983db536a7</fileChecksum>
</indexedmzML>
</indexedmzML>
25 changes: 18 additions & 7 deletions tests/unit_tests/test_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,11 @@
from pathlib import Path
from unittest.mock import patch

import pandas as pd
from pandas.testing import assert_frame_equal

from oktoberfest.__main__ import main
from oktoberfest.utils import Config
import numpy as np
import pandas as pd


class TestRunner(unittest.TestCase):
Expand All @@ -28,12 +29,22 @@ def test_rescoring_xl(self):
with patch("sys.argv", ["oktoberfest", f"--config_path={config_path}"]):
main()

#expected_perc_tab_file = pd.read_csv(Path(__file__).parent / "data" / "xl" / "cleavable" / "expected_outputs" / "expected_rescore.tab", sep="\t")
#created_perc_tab_file = pd.read_csv(Path(__file__).parent / "data" / "xl" / "cleavable" / "out" / "results" / "percolator" / "rescore.tab", sep="\t")
#np.testing.assert_almost_equal(expected_perc_tab_file.values, created_perc_tab_file.values)
#pd.testing.assert_frame_equal(expected_perc_tab_file, created_perc_tab_file)
expected_perc_tab_file = pd.read_csv(
Path(__file__).parent / "data" / "xl" / "cleavable" / "expected_outputs" / "expected_rescore.tab", sep="\t"
)

created_perc_tab_file = pd.read_csv(
Path(__file__).parent / "data" / "xl" / "cleavable" / "out" / "results" / "percolator" / "rescore.tab",
sep="\t",
)

try:
assert_frame_equal(expected_perc_tab_file, created_perc_tab_file, check_dtype=True, check_exact=False, rtol = 1e-2)
except AssertionError as e:
print("DataFrames are not equal:", e)
raise # Re-raise the assertion error for the test framework to catch

config = Config()
config.read(config_path)
shutil.rmtree(Path(__file__).parent / "data" / "xl" / "cleavable" / "out")


0 comments on commit 389b9cd

Please sign in to comment.