Merge pull request #77 from wilhelm-lab/release/0.4.3

Release/0.4.3
wilhelm-lab · Oct 31, 2023 · 9b90193
2 parents 57135ee + 4739dbc
commit 9b90193
Show file tree

Hide file tree

Showing 16 changed files with 622 additions and 679 deletions.
diff --git a/.cookietemple.yml b/.cookietemple.yml
@@ -15,5 +15,5 @@ full_name: Victor Giurcoiu
 email: [email protected]
 project_name: spectrum_fundamentals
 project_short_description: Fundamentals public repo
-version: 0.4.2
+version: 0.4.3
 license: MIT
diff --git a/.github/release-drafter.yml b/.github/release-drafter.yml
@@ -1,5 +1,5 @@
-name-template: "0.4.2 🌈" # <<COOKIETEMPLE_FORCE_BUMP>>
-tag-template: 0.4.2 # <<COOKIETEMPLE_FORCE_BUMP>>
+name-template: "0.4.3 🌈" # <<COOKIETEMPLE_FORCE_BUMP>>
+tag-template: 0.4.3 # <<COOKIETEMPLE_FORCE_BUMP>>
 exclude-labels:
     - "skip-changelog"
 

diff --git a/cookietemple.cfg b/cookietemple.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.4.2
+current_version = 0.4.3
 
 [bumpversion_files_whitelisted]
 init_file = spectrum_fundamentals/__init__.py

diff --git a/docs/conf.py b/docs/conf.py
@@ -54,9 +54,9 @@
 # the built documents.
 #
 # The short X.Y version.
-version = "0.4.2"
+version = "0.4.3"
 # The full version, including alpha/beta/rc tags.
-release = "0.4.2"
+release = "0.4.3"
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.

diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "spectrum_fundamentals"
-version = "0.4.2"  # <<COOKIETEMPLE_FORCE_BUMP>>
+version = "0.4.3"  # <<COOKIETEMPLE_FORCE_BUMP>>
 description = "Fundamentals public repo"
 authors = ["WassimG <[email protected]>"]
 license = "MIT"
@@ -20,7 +20,7 @@ classifiers = [
 
 
 [tool.poetry.dependencies]
-python = "^3.8.0"
+python = ">=3.8.0,<3.11.0"
 click = ">=8.0.0"
 rich = ">=10.3.0"
 PyYAML = ">=5.4.1"

diff --git a/spectrum_fundamentals/__init__.py b/spectrum_fundamentals/__init__.py
@@ -1,7 +1,7 @@
 """Initialize fundamentals."""
 __author__ = "Victor Giurcoiu"
 __email__ = "[email protected]"
-__version__ = "0.4.2"
+__version__ = "0.4.3"
 
 import logging
 import logging.handlers

diff --git a/spectrum_fundamentals/__main__.py b/spectrum_fundamentals/__main__.py
@@ -5,7 +5,7 @@
 
 
 @click.command()
-@click.version_option(version="0.4.2", message=click.style("spectrum_fundamentals Version: 0.4.2"))
+@click.version_option(version="0.4.3", message=click.style("spectrum_fundamentals Version: 0.4.3"))
 def main() -> None:
     """spectrum_fundamentals."""
 

diff --git a/spectrum_fundamentals/annotation/annotation.py b/spectrum_fundamentals/annotation/annotation.py
@@ -149,7 +149,6 @@ def annotate_spectra(
         raw_file_annotations.append(results)
     results_df = pd.DataFrame(raw_file_annotations)
     results_df.columns = ["INTENSITIES", "MZ", "CALCULATED_MASS", "removed_peaks"]
-    logger.info(f"Removed {results_df['removed_peaks'].describe()} redundant peaks")
 
     return results_df
 

diff --git a/spectrum_fundamentals/constants.py b/spectrum_fundamentals/constants.py
@@ -49,7 +49,6 @@
     "C[UNIMOD:4]": 2,
     "K[UNIMOD:737]": 22,
     "K[UNIMOD:2016]": 22,
-    "K[UNIMOD:2016]": 22,
     "K[UNIMOD:214]": 22,
     "K[UNIMOD:730]": 22,
     "S[UNIMOD:21]": 25,
@@ -73,16 +72,16 @@
     "(ox)": "[UNIMOD:35]",
     "(Oxidation (M))": "[UNIMOD:35]",
     "(tm)": "[UNIMOD:737]",
-    "_(tm)": "_[UNIMOD:737]",
+    "_(tm)": "_[UNIMOD:737]-",
     "K(tm)": "K[UNIMOD:737]",
     "(i4)": "[UNIMOD:214]",
-    "_(i4)": "_[UNIMOD:214]",
+    "_(i4)": "_[UNIMOD:214]-",
     "K(i4)": "K[UNIMOD:214]",
     "(i8)": "[UNIMOD:730]",
-    "_(i8)": "_[UNIMOD:730]",
+    "_(i8)": "_[UNIMOD:730]-",
     "K(i8)": "K[UNIMOD:730]",
     "(tmp)": "[UNIMOD:2016]",
-    "_(tmp)": "_[UNIMOD:2016]",
+    "_(tmp)": "_[UNIMOD:2016]-",
     "K(tmp)": "K[UNIMOD:2016]",
     "(ph)": "[UNIMOD:21]",
     "(Phospho (STY))": "[UNIMOD:21]",

diff --git a/spectrum_fundamentals/fragments.py b/spectrum_fundamentals/fragments.py
@@ -24,10 +24,11 @@ def _get_modifications(peptide_sequence: str) -> Optional[Tuple[Dict[int, float]
     modification_mass = constants.MOD_MASSES
     # Handle terminal modifications here
     for possible_tmt_mod in constants.TMT_MODS.values():
-        if peptide_sequence.startswith(possible_tmt_mod):  # TMT_6
+        n_term_tmt = possible_tmt_mod + "-"
+        if peptide_sequence.startswith(n_term_tmt):
             tmt_n_term = 2
             modification_deltas.update({0: constants.MOD_MASSES[possible_tmt_mod]})
-            peptide_sequence = peptide_sequence[len(possible_tmt_mod) :]
+            peptide_sequence = peptide_sequence[len(n_term_tmt) :]
             break
 
     if "(" in peptide_sequence:
@@ -183,7 +184,7 @@ def initialize_peaks(
             for ion_type in range(0, number_of_ion_types):  # generate all ion types
                 # Check for neutral loss here
                 mass = (ion_type_masses[ion_type] + charge_delta) / charge
-                min_mass, max_mass = get_min_max_mass(mass_analyzer, mass)
+                min_mass, max_mass = get_min_max_mass(mass_analyzer, mass, mass_tolerance, unit_mass_tolerance)
                 fragments_meta_data.append(
                     {
                         "ion_type": ion_types[ion_type],  # ion type

diff --git a/spectrum_fundamentals/metrics/percolator.py b/spectrum_fundamentals/metrics/percolator.py
@@ -1,8 +1,5 @@
 import enum
-import hashlib
 import logging
-import re
-import subprocess
 from typing import Optional, Tuple, Union
 
 import numpy as np
@@ -60,26 +57,15 @@ def __init__(
         all_features_flag: bool = False,
         regression_method: str = "lowess",
         fdr_cutoff: float = 0.01,
-        percolator_version: Optional[float] = 3.05,
     ):
         """Initialize a Percolator obj."""
         self.metadata = metadata
         self.input_type = input_type
         self.all_features_flag = all_features_flag
         self.regression_method = regression_method
         self.fdr_cutoff = fdr_cutoff
-
-        self._resolve_percolator_compatibility(percolator_version)
         super().__init__(pred_intensities, true_intensities, mz)
 
-    def _resolve_percolator_compatibility(self, percolator_version: Optional[float] = None):
-        if percolator_version is None:
-            result = subprocess.run(["percolator", "-h"], capture_output=True, text=True)
-            version_line = result.stderr.splitlines()[0].strip()
-            version = version_line.split("version ")[1]
-            percolator_version = float(re.sub(r"\.[^.]+$", "", version))
-        self.prot_col_name = "Proteins" if percolator_version >= 3.06 else "Protein"
-
     @staticmethod
     def sample_balanced_over_bins(retention_time_df: pd.DataFrame, sample_size: int = 5000) -> pd.Index:
         """
@@ -170,18 +156,6 @@ def get_aligned_predicted_retention_times(
 
         return aligned_rts_predicted
 
-    @staticmethod
-    def get_scannr(metadata_subset: Union[pd.Series, Tuple[str, int]]) -> int:
-        """
-        Creates a hash of the raw_file and scan number to use as a unique scan number in percolator.
-
-        :param metadata_subset: tuple of (raw_file, scan_number)
-        :return: hashed unique id
-        """
-        raw_file, scan_number = metadata_subset
-        s = f"{raw_file}{scan_number}".encode()
-        return int(hashlib.sha224(s).hexdigest()[:6], 16)
-
     @staticmethod
     def get_delta_score(scores_df: pd.DataFrame, scoring_feature: str) -> np.ndarray:
         """
@@ -296,11 +270,11 @@ def add_percolator_metadata_columns(self):
             spec_id_cols.append("SCAN_EVENT_NUMBER")
         self.metrics_val["SpecId"] = self.metadata[spec_id_cols].apply(Percolator.get_specid, axis=1)
         self.metrics_val["Label"] = self.target_decoy_labels
-        self.metrics_val["ScanNr"] = self.metadata[["RAW_FILE", "SCAN_NUMBER"]].apply(Percolator.get_scannr, axis=1)
-
+        self.metrics_val["ScanNr"] = self.metadata["SCAN_NUMBER"]
+        self.metrics_val["filename"] = self.metadata["RAW_FILE"]
         self.metrics_val["Peptide"] = self.metadata["MODIFIED_SEQUENCE"].apply(lambda x: "_." + x + "._")
 
-        self.metrics_val[self.prot_col_name] = self.metadata[
+        self.metrics_val["Proteins"] = self.metadata[
             "MODIFIED_SEQUENCE"
         ]  # we don't need the protein ID to get PSM / peptide results, fill with peptide sequence
 
@@ -397,8 +371,8 @@ def fdrs_to_qvals(fdrs: np.ndarray) -> np.ndarray:
 
     def _reorder_columns_for_percolator(self):
         all_columns = self.metrics_val.columns
-        first_columns = ["SpecId", "Label", "ScanNr"]
-        last_columns = ["Peptide", "Protein"] if "Protein" in all_columns else ["Peptide", "Proteins"]
+        first_columns = ["SpecId", "Label", "ScanNr", "filename"]
+        last_columns = ["Peptide", "Proteins"]
         mid_columns = list(set(all_columns) - set(first_columns) - set(last_columns))
         new_columns = first_columns + sorted(mid_columns) + last_columns
         self.metrics_val = self.metrics_val[new_columns]

diff --git a/tests/unit_tests/data/perc_output.csv b/tests/unit_tests/data/perc_output.csv
@@ -1,4 +1,4 @@
-,SpecId,Label,ScanNr,CID,Charge1,Charge2,Charge3,Charge4,Charge5,Charge6,HCD,KR,Mass,RT,UnknownFragmentationMethod,abs_rt_diff,collision_energy_aligned,count_not_observed_and_not_predicted,count_not_observed_and_not_predicted_b,count_not_observed_and_not_predicted_y,count_not_observed_but_predicted,count_not_observed_but_predicted_b,count_not_observed_but_predicted_y,count_observed,count_observed_and_predicted,count_observed_and_predicted_b,count_observed_and_predicted_y,count_observed_b,count_observed_but_not_predicted,count_observed_but_not_predicted_b,count_observed_but_not_predicted_y,count_observed_y,count_predicted,count_predicted_b,count_predicted_y,fraction_not_observed_and_not_predicted,fraction_not_observed_and_not_predicted_b,fraction_not_observed_and_not_predicted_b_vs_predicted_b,fraction_not_observed_and_not_predicted_vs_predicted,fraction_not_observed_and_not_predicted_y,fraction_not_observed_and_not_predicted_y_vs_predicted_y,fraction_not_observed_but_predicted,fraction_not_observed_but_predicted_b,fraction_not_observed_but_predicted_b_vs_predicted,fraction_not_observed_but_predicted_vs_predicted,fraction_not_observed_but_predicted_y,fraction_not_observed_but_predicted_y_vs_predicted,fraction_observed,fraction_observed_and_predicted,fraction_observed_and_predicted_b,fraction_observed_and_predicted_b_vs_predicted_b,fraction_observed_and_predicted_vs_predicted,fraction_observed_and_predicted_y,fraction_observed_and_predicted_y_vs_predicted_y,fraction_observed_b,fraction_observed_but_not_predicted,fraction_observed_but_not_predicted_b,fraction_observed_but_not_predicted_b_vs_predicted_b,fraction_observed_but_not_predicted_vs_predicted,fraction_observed_but_not_predicted_y,fraction_observed_but_not_predicted_y_vs_predicted_y,fraction_observed_y,fraction_predicted,fraction_predicted_b,fraction_predicted_y,iRT,lda_scores,missedCleavages,modified_cosine,pearson_corr,pred_RT,sequence_length,spectral_angle,Peptide,Protein
+,SpecId,Label,ScanNr,CID,Charge1,Charge2,Charge3,Charge4,Charge5,Charge6,HCD,KR,Mass,RT,UnknownFragmentationMethod,abs_rt_diff,collision_energy_aligned,count_not_observed_and_not_predicted,count_not_observed_and_not_predicted_b,count_not_observed_and_not_predicted_y,count_not_observed_but_predicted,count_not_observed_but_predicted_b,count_not_observed_but_predicted_y,count_observed,count_observed_and_predicted,count_observed_and_predicted_b,count_observed_and_predicted_y,count_observed_b,count_observed_but_not_predicted,count_observed_but_not_predicted_b,count_observed_but_not_predicted_y,count_observed_y,count_predicted,count_predicted_b,count_predicted_y,fraction_not_observed_and_not_predicted,fraction_not_observed_and_not_predicted_b,fraction_not_observed_and_not_predicted_b_vs_predicted_b,fraction_not_observed_and_not_predicted_vs_predicted,fraction_not_observed_and_not_predicted_y,fraction_not_observed_and_not_predicted_y_vs_predicted_y,fraction_not_observed_but_predicted,fraction_not_observed_but_predicted_b,fraction_not_observed_but_predicted_b_vs_predicted,fraction_not_observed_but_predicted_vs_predicted,fraction_not_observed_but_predicted_y,fraction_not_observed_but_predicted_y_vs_predicted,fraction_observed,fraction_observed_and_predicted,fraction_observed_and_predicted_b,fraction_observed_and_predicted_b_vs_predicted_b,fraction_observed_and_predicted_vs_predicted,fraction_observed_and_predicted_y,fraction_observed_and_predicted_y_vs_predicted_y,fraction_observed_b,fraction_observed_but_not_predicted,fraction_observed_but_not_predicted_b,fraction_observed_but_not_predicted_b_vs_predicted_b,fraction_observed_but_not_predicted_vs_predicted,fraction_observed_but_not_predicted_y,fraction_observed_but_not_predicted_y_vs_predicted_y,fraction_observed_y,fraction_predicted,fraction_predicted_b,fraction_predicted_y,iRT,lda_scores,missedCleavages,modified_cosine,pearson_corr,pred_RT,sequence_length,spectral_angle,Peptide,Proteins
 0,20210122_0263_TMUCLHan_Peiru_DDA_IP_C797S_02-7978-AAIGEATRL-2-1,1,10203379,0,0,1,0,0,0,0,1,1,900.50288029264,0.5000000183883155,0,0.0,0.3,2.0,1,1,2.0,1,1,4.0,2.0,1,1,1,2.0,0,2,3,4.0,2,2,0.25,0.3333333333333333,0.5,0.5,0.2,0.5,0.25,0.3333333333333333,0.5,0.5,0.2,0.5,0.5,0.25,0.3333333333333333,0.5,0.5,0.2,0.5,0.3333333333333333,0.25,0.0,0.0,0.5,0.4,1.0,0.6,0.5,0.6666666666666666,0.4,0.5000000183883155,1.7027402478448157e+30,1,0.8959462788501606,0.8831116463997931,0.5000000323590892,9,0.7449534694193893,_.AAIGEATRL._,AAIGEATRL
 1,20210122_0263_TMUCLHan_Peiru_DDA_IP_C797S_02-12304-AAVPRAAFL-2-2,-1,5831230,0,0,1,0,0,0,0,1,1,914.53379,1.000000038995633,0,2.533209730870567e-08,0.3,1.0,0,1,1.0,0,1,2.0,1.0,0,1,1,1.0,1,0,1,2.0,0,2,0.25,0.0,0.0,0.5,0.3333333333333333,0.5,0.25,0.0,0.0,0.5,0.3333333333333333,0.5,0.5,0.25,0.0,0.0,0.5,0.3333333333333333,0.5,1.0,0.25,1.0,1.0,0.5,0.0,0.0,0.3333333333333333,0.5,0.0,0.6666666666666666,1.0000000136635356,-1.7027402478448157e+30,1,1.0,0.0,1.5000000246189773,9,0.5903344918223923,_.AAVPRAAFL._,AAVPRAAFL
 2,20210122_0263_TMUCLHan_Peiru_DDA_IP_C797S_02-12398-AAYFGVYDTAK-2-3,-1,16059765,0,0,1,0,0,0,0,1,1,1204.5764,1.5000000993570155,0,1.0185683851915428e-07,0.3,1.0,0,1,1.0,0,1,2.0,1.0,0,1,1,1.0,1,0,1,2.0,0,2,0.25,0.0,0.0,0.5,0.3333333333333333,0.5,0.25,0.0,0.0,0.5,0.3333333333333333,0.5,0.5,0.25,0.0,0.0,0.5,0.3333333333333333,0.5,1.0,0.25,1.0,1.0,0.5,0.0,0.0,0.3333333333333333,0.5,0.0,0.6666666666666666,1.499999997500177,-1.7027402478448157e+30,0,1.0,0.0,2.500000056694346,11,0.5903344918223923,_.AAYFGVYDTAK._,AAYFGVYDTAK

diff --git a/tests/unit_tests/test_fragments.py b/tests/unit_tests/test_fragments.py
@@ -18,11 +18,11 @@ def test_get_modifications_carbamidomethylation(self):
 
     def test_get_modifications_tmt_tag(self):
         """Test get_modifications."""
-        assert fragments._get_modifications("[UNIMOD:737]ABC[UNIMOD:4]") == ({0: 229.162932, 2: 57.02146}, 2, "ABC")
+        assert fragments._get_modifications("[UNIMOD:737]-ABC[UNIMOD:4]") == ({0: 229.162932, 2: 57.02146}, 2, "ABC")
 
     def test_get_modifications_tmtpro_tag(self):
         """Test get_modifications."""
-        assert fragments._get_modifications("[UNIMOD:2016]ABC[UNIMOD:4]") == ({0: 304.207146, 2: 57.02146}, 2, "ABC")
+        assert fragments._get_modifications("[UNIMOD:2016]-ABC[UNIMOD:4]") == ({0: 304.207146, 2: 57.02146}, 2, "ABC")
 
 
 class TestComputeMasses(unittest.TestCase):
@@ -63,6 +63,11 @@ def test_compute_peptide_masses(self):
         seq = "SEQUENC[UNIMOD:4]E"
         self.assertEqual(fragments.compute_peptide_mass(seq), 1045.2561516699998)
 
+    def test_compute_peptide_masses_tmtpro(self):
+        """Test computation of peptide masses with valid input and tmt tag."""
+        seq = "[UNIMOD:737]-SEQUENC[UNIMOD:4]E"
+        self.assertEqual(fragments.compute_peptide_mass(seq), 1274.41908367)
+
     def test_compute_peptide_masses_with_invalid_syntax(self):
         """Negative testing of comuptation of peptide mass with unsupported syntax of mod string."""
         seq = "SEQUEM(Ox.)CE"

diff --git a/tests/unit_tests/test_mod_string.py b/tests/unit_tests/test_mod_string.py
@@ -39,9 +39,9 @@ def test_maxquant_to_internal_variable_dehydration_long(self):
 
     def test_maxquant_to_internal_tmt(self):
         """Test maxquant_to_internal_tmt."""
-        fixed_mods = {"C": "C[UNIMOD:4]", "^_": "_[UNIMOD:737]", "K": "K[UNIMOD:737]"}
+        fixed_mods = {"C": "C[UNIMOD:4]", "^_": "_[UNIMOD:737]-", "K": "K[UNIMOD:737]"}
         self.assertEqual(
-            mod.maxquant_to_internal(["_ABCDEFGHK_"], fixed_mods), ["[UNIMOD:737]ABC[UNIMOD:4]DEFGHK[UNIMOD:737]"]
+            mod.maxquant_to_internal(["_ABCDEFGHK_"], fixed_mods), ["[UNIMOD:737]-ABC[UNIMOD:4]DEFGHK[UNIMOD:737]"]
         )
 
     def test_maxquant_to_internal_silac(self):
@@ -79,13 +79,13 @@ class TestInternalTransformations(unittest.TestCase):
 
     def test_internal_without_mods(self):
         """Test internal with mods to internal without_mods."""
-        self.assertEqual(mod.internal_without_mods(["[UNIMOD:737]ABC[UNIMOD:4]DEFGHK[UNIMOD:737]"]), ["ABCDEFGHK"])
+        self.assertEqual(mod.internal_without_mods(["[UNIMOD:737]-ABC[UNIMOD:4]DEFGHK[UNIMOD:737]"]), ["ABCDEFGHK"])
 
     def test_internal_to_mod_masses(self):
         """Test internal with mods to internal without_mods."""
         self.assertEqual(
-            mod.internal_to_mod_mass(["[UNIMOD:737]ABC[UNIMOD:4]DEFGHK[UNIMOD:737]"]),
-            ["[+229.162932]ABC[+57.02146]DEFGHK[+229.162932]"],
+            mod.internal_to_mod_mass(["[UNIMOD:737]-ABC[UNIMOD:4]DEFGHK[UNIMOD:737]"]),
+            ["[+229.162932]-ABC[+57.02146]DEFGHK[+229.162932]"],
         )
 
     def test_proteomicsdb_to_internal(self):

diff --git a/tests/unit_tests/test_percolator.py b/tests/unit_tests/test_percolator.py
@@ -274,12 +274,6 @@ def _create_noisy_logistic_data():
 class TestPercolator:
     """Class to test percolator."""
 
-    def test_get_scannr(self):
-        """Test get_scannr."""
-        np.testing.assert_equal(
-            perc.Percolator.get_scannr(("20210122_0263_TMUCLHan_Peiru_DDA_IP_C797S_02", 7978)), 10203379
-        )
-
     def test_get_specid(self):
         """Test get_specid."""
         np.testing.assert_string_equal(
@@ -361,11 +355,12 @@ def test_calc(self):
             percolator.metrics_val["SpecId"][0], "20210122_0263_TMUCLHan_Peiru_DDA_IP_C797S_02-7978-AAIGEATRL-2-1"
         )
         np.testing.assert_equal(percolator.metrics_val["Label"][0], 1)
-        np.testing.assert_equal(percolator.metrics_val["ScanNr"][0], 10203379)
+        np.testing.assert_equal(percolator.metrics_val["ScanNr"][0], 7978)
+        np.testing.assert_equal(percolator.metrics_val["filename"][0], "20210122_0263_TMUCLHan_Peiru_DDA_IP_C797S_02")
         # np.testing.assert_almost_equal(percolator.metrics_val['ExpMass'][0], 900.50345678)
         np.testing.assert_string_equal(percolator.metrics_val["Peptide"][0], "_.AAIGEATRL._")
         np.testing.assert_string_equal(
-            percolator.metrics_val["Protein"][0], "AAIGEATRL"
+            percolator.metrics_val["Proteins"][0], "AAIGEATRL"
         )  # we don't need the protein ID to get PSM / peptide results
 
         # features