Merge pull request #112 from wilhelm-lab/feature/add_new_iontypes

Feature/add new iontypes
wilhelm-lab · Jul 27, 2024 · 5fb9de2 · 5fb9de2
2 parents 664285a + 7139560
commit 5fb9de2
Show file tree

Hide file tree

Showing 21 changed files with 67,477 additions and 68,539 deletions.
diff --git a/.github/workflows/build_package.yml b/.github/workflows/build_package.yml
@@ -9,7 +9,7 @@ jobs:
         strategy:
             matrix:
                 os: [macos-latest, ubuntu-latest, windows-latest]
-                python: [3.8, 3.9]
+                python: ["3.9", "3.10"]
 
         steps:
             - uses: actions/checkout@v4

diff --git a/.github/workflows/main_master_branch_protection.yml b/.github/workflows/main_master_branch_protection.yml
@@ -42,7 +42,7 @@ jobs:
             - name: Set up Python
               uses: actions/[email protected]
               with:
-                  python-version: "3.8"
+                  python-version: "3.9"
             # PRs to the repository master branch are only ok if coming from any patch or release branch
             - name: Install mlf-core
               run: pip install mlf-core

diff --git a/.github/workflows/publish_docs.yml b/.github/workflows/publish_docs.yml
@@ -13,7 +13,7 @@ jobs:
             - name: Setup Python
               uses: actions/[email protected]
               with:
-                  python-version: 3.8
+                  python-version: 3.9
 
             - name: Install pip
               run: |

diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml
@@ -19,47 +19,47 @@ jobs:
             matrix:
                 include:
                     - {
-                          python-version: 3.8,
+                          python-version: 3.9,
                           os: ubuntu-latest,
                           session: "pre-commit",
                       }
                     - {
-                          python-version: 3.8,
+                          python-version: 3.9,
                           os: ubuntu-latest,
                           session: "safety",
                       }
                     - {
-                          python-version: 3.8,
+                          python-version: 3.9,
                           os: ubuntu-latest,
                           session: "mypy",
                       }
                     - {
-                          python-version: 3.8,
+                          python-version: 3.9,
                           os: ubuntu-latest,
                           session: "tests",
                       }
                     - {
-                          python-version: 3.8,
+                          python-version: 3.9,
                           os: windows-latest,
                           session: "tests",
                       }
                     - {
-                          python-version: 3.8,
+                          python-version: 3.9,
                           os: macos-latest,
                           session: "tests",
                       }
                     - {
-                          python-version: 3.8,
+                          python-version: 3.9,
                           os: ubuntu-latest,
                           session: "typeguard",
                       }
                     - {
-                          python-version: 3.8,
+                          python-version: 3.9,
                           os: ubuntu-latest,
                           session: "xdoctest",
                       }
                     - {
-                          python-version: 3.8,
+                          python-version: 3.9,
                           os: ubuntu-latest,
                           session: "docs-build",
                       }
@@ -136,10 +136,10 @@ jobs:
             - name: Check out the repository
               uses: actions/checkout@v4
 
-            - name: Set up Python 3.8
+            - name: Set up Python 3.9
               uses: actions/[email protected]
               with:
-                  python-version: 3.8
+                  python-version: 3.9
 
             - name: Install Poetry
               run: |

diff --git a/.github/workflows/sync_project.yml b/.github/workflows/sync_project.yml
@@ -12,7 +12,7 @@ jobs:
             - name: Setup Python
               uses: actions/[email protected]
               with:
-                  python-version: 3.8
+                  python-version: 3.9
 
             - name: Install cookietemple
               run: pip install cookietemple

diff --git a/docs/modules.rst b/docs/modules.rst
@@ -1,5 +1,5 @@
 spectrum_fundamentals
-=================
+=====================
 
 .. toctree::
    :maxdepth: 4

diff --git a/noxfile.py b/noxfile.py
@@ -18,12 +18,13 @@
     sys.exit(1)
 
 package = "spectrum_fundamentals"
-python_versions = ["3.8", "3.9"]
+python_versions = ["3.9", "3.10"]
 nox.options.sessions = (
     "pre-commit",
     "safety",
     "mypy",
     "tests",
+    "typeguard",
     "xdoctest",
     "docs-build",
 )

diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -12,15 +12,13 @@ packages = [
     { include = "spectrum_fundamentals" },
 ]
 classifiers = [
-    "Programming Language :: Python :: 3.7",
-    "Programming Language :: Python :: 3.8",
     "Programming Language :: Python :: 3.9",
     "Programming Language :: Python :: 3.10",
 ]
 
 
 [tool.poetry.dependencies]
-python = ">=3.8.0,<3.11.0"
+python = ">=3.9.0,<3.11.0"
 click = ">=8.0.0"
 rich = ">=10.3.0"
 PyYAML = ">=5.4.1"

diff --git a/spectrum_fundamentals/annotation/annotation.py b/spectrum_fundamentals/annotation/annotation.py
@@ -5,7 +5,7 @@
 import pandas as pd
 
 from spectrum_fundamentals import constants
-from spectrum_fundamentals.fragments import initialize_peaks, initialize_peaks_xl
+from spectrum_fundamentals.fragments import initialize_peaks, initialize_peaks_xl, retrieve_ion_types
 
 logger = logging.getLogger(__name__)
 
@@ -31,11 +31,10 @@ def match_peaks(
     """
     start_peak = 0
     no_of_peaks = len(peaks_intensity)
-    max_intensity = 1.0
+    max_intensity = 0.0
     row_list = []
     temp_list = []
     next_start_peak = 0
-    seq_len = len(unmod_sequence)
     matched_peak = False
     fragment_no: float
     for fragment in fragments_meta_data:
@@ -55,7 +54,7 @@ def match_peaks(
                 start_peak += 1
                 continue
             if (
-                not (fragment["ion_type"] == "b" and fragment_no == 1)
+                not (fragment["ion_type"][0] == "b" and fragment_no == 1)
                 or (unmod_sequence[0] == "R" or unmod_sequence[0] == "H" or unmod_sequence[0] == "K")
                 and (tmt_n_term == 1)
             ):
@@ -69,7 +68,7 @@ def match_peaks(
                         "intensity": peak_intensity,
                     }
                 )
-                if peak_intensity > max_intensity and fragment_no < seq_len:
+                if peak_intensity > max_intensity:
                     max_intensity = float(peak_intensity)
             matched_peak = True
             next_start_peak = start_peak
@@ -119,7 +118,10 @@ def handle_multiple_matches(
 
 
 def annotate_spectra(
-    un_annot_spectra: pd.DataFrame, mass_tolerance: Optional[float] = None, unit_mass_tolerance: Optional[str] = None
+    un_annot_spectra: pd.DataFrame,
+    mass_tolerance: Optional[float] = None,
+    unit_mass_tolerance: Optional[str] = None,
+    fragmentation_method: str = "HCD",
 ) -> pd.DataFrame:
     """
     Annotate a set of spectra.
@@ -138,12 +140,13 @@ def annotate_spectra(
     :param un_annot_spectra: a Pandas DataFrame containing the raw peaks and metadata to be annotated
     :param mass_tolerance: mass tolerance to calculate min and max mass
     :param unit_mass_tolerance: unit for the mass tolerance (da or ppm)
+    :param fragmentation_method: fragmentation method that was used
     :return: a Pandas DataFrame containing the annotated spectra with meta data
     """
     raw_file_annotations = []
     index_columns = {col: un_annot_spectra.columns.get_loc(col) for col in un_annot_spectra.columns}
     for row in un_annot_spectra.values:
-        results = parallel_annotate(row, index_columns, mass_tolerance, unit_mass_tolerance)
+        results = parallel_annotate(row, index_columns, mass_tolerance, unit_mass_tolerance, fragmentation_method)
         if not results:
             continue
         raw_file_annotations.append(results)
@@ -269,24 +272,29 @@ def generate_annotation_matrix_xl(
 
 
 def generate_annotation_matrix(
-    matched_peaks: pd.DataFrame, unmod_seq: str, charge: int
+    matched_peaks: pd.DataFrame, unmod_seq: str, charge: int, fragmentation_method: str = "HCD"
 ) -> Tuple[np.ndarray, np.ndarray]:
     """
     Generate the annotation matrix in the prosit format from matched peaks.
 
     :param matched_peaks: matched peaks needed to be converted
     :param unmod_seq: Unmodified peptide sequence
     :param charge: Precursor charge
+    :param fragmentation_method: fragmentation method that was used
     :return: numpy array of intensities and numpy array of masses
     """
-    intensity = np.full(constants.VEC_LENGTH, -1.0)
-    mass = np.full(constants.VEC_LENGTH, -1.0)
+    ion_types = retrieve_ion_types(fragmentation_method)
+    charge_const = 3
+    vec_length = (constants.SEQ_LEN - 1) * charge_const * len(ion_types)
+
+    intensity = np.full(vec_length, -1.0)
+    mass = np.full(vec_length, -1.0)
 
     # change values to zeros
     if len(unmod_seq) < constants.SEQ_LEN:
-        peaks_range = range(0, ((len(unmod_seq) - 1) * 6))
+        peaks_range = range(0, ((len(unmod_seq) - 1) * charge_const * len(ion_types)))
     else:
-        peaks_range = range(0, ((constants.SEQ_LEN - 1) * 6))
+        peaks_range = range(0, ((constants.SEQ_LEN - 1) * charge_const * len(ion_types)))
 
     if charge == 1:
         available_peaks = [index for index in peaks_range if (index % 3 == 0)]
@@ -305,18 +313,18 @@ def generate_annotation_matrix(
     exp_mass_col = matched_peaks.columns.get_loc("exp_mass")
 
     for peak in matched_peaks.values:
-        if peak[ion_type].startswith("y"):
-            peak_pos = ((peak[no_col] - 1) * 6) + (peak[charge_col] - 1)
-        else:
-            peak_pos = ((peak[no_col] - 1) * 6) + (peak[charge_col] - 1) + 3
+        ion_type_index = ion_types.index(peak[ion_type][0])
+        peak_pos = ((peak[no_col] - 1) * charge_const * len(ion_types)) + (peak[charge_col] - 1) + 3 * ion_type_index
 
         if peak_pos >= constants.VEC_LENGTH:
             continue
         intensity[peak_pos] = peak[intensity_col]
         mass[peak_pos] = peak[exp_mass_col]
 
     if len(unmod_seq) < constants.SEQ_LEN:
-        mask_peaks = range((len(unmod_seq) - 1) * 6, ((len(unmod_seq) - 1) * 6) + 6)
+        mask_peaks = range(
+            (len(unmod_seq) - 1) * charge_const * len(ion_types), ((len(unmod_seq)) * charge_const * len(ion_types))
+        )
         intensity[mask_peaks] = -1.0
         mass[mask_peaks] = -1.0
 
@@ -328,6 +336,7 @@ def parallel_annotate(
     index_columns: Dict[str, int],
     mass_tolerance: Optional[float] = None,
     unit_mass_tolerance: Optional[str] = None,
+    fragmentation_method: str = "HCD",
 ) -> Optional[
     Union[
         Tuple[np.ndarray, np.ndarray, float, int],
@@ -348,14 +357,17 @@ def parallel_annotate(
     :param index_columns: a dictionary that contains the index columns of the spectrum
     :param mass_tolerance: mass tolerance to calculate min and max mass
     :param unit_mass_tolerance: unit for the mass tolerance (da or ppm)
+    :param fragmentation_method: fragmentation method that was used
     :return: a tuple containing intensity values (np.ndarray), masses (np.ndarray), calculated mass (float),
              and the number of removed peaks (int)
     """
     xl_type_col = index_columns.get("CROSSLINKER_TYPE")
     if xl_type_col is None:
         if spectrum[index_columns["PEPTIDE_LENGTH"]] > 30:  # this was in initialize peaks but can be checked prior
             return None
-        return _annotate_linear_spectrum(spectrum, index_columns, mass_tolerance, unit_mass_tolerance)
+        return _annotate_linear_spectrum(
+            spectrum, index_columns, mass_tolerance, unit_mass_tolerance, fragmentation_method
+        )
 
     if (spectrum[index_columns["PEPTIDE_LENGTH_A"]] > 30) or (spectrum[index_columns["PEPTIDE_LENGTH_B"]] > 30):
         return None
@@ -369,6 +381,7 @@ def _annotate_linear_spectrum(
     index_columns: Dict[str, int],
     mass_tolerance: Optional[float],
     unit_mass_tolerance: Optional[str],
+    fragmentation_method: str = "HCD",
 ):
     """
     Annotate a linear peptide spectrum.
@@ -377,17 +390,19 @@ def _annotate_linear_spectrum(
     :param index_columns: Index columns of the spectrum
     :param mass_tolerance: Mass tolerance for calculating min and max mass
     :param unit_mass_tolerance: Unit for the mass tolerance (da or ppm)
+    :param fragmentation_method: fragmentation method that was used
     :return: Annotated spectrum
     """
     mod_seq_column = "MODIFIED_SEQUENCE"
     if "MODIFIED_SEQUENCE_MSA" in index_columns:
         mod_seq_column = "MODIFIED_SEQUENCE_MSA"
     fragments_meta_data, tmt_n_term, unmod_sequence, calc_mass = initialize_peaks(
-        spectrum[index_columns[mod_seq_column]],
-        spectrum[index_columns["MASS_ANALYZER"]],
-        spectrum[index_columns["PRECURSOR_CHARGE"]],
-        mass_tolerance,
-        unit_mass_tolerance,
+        sequence=spectrum[index_columns[mod_seq_column]],
+        mass_analyzer=spectrum[index_columns["MASS_ANALYZER"]],
+        charge=spectrum[index_columns["PRECURSOR_CHARGE"]],
+        mass_tolerance=mass_tolerance,
+        unit_mass_tolerance=unit_mass_tolerance,
+        fragmentation_method=fragmentation_method,
     )
     matched_peaks = match_peaks(
         fragments_meta_data,
@@ -398,14 +413,18 @@ def _annotate_linear_spectrum(
         spectrum[index_columns["PRECURSOR_CHARGE"]],
     )
 
+    ion_types = retrieve_ion_types(fragmentation_method)
+    charge_const = 3
+    vec_length = (constants.SEQ_LEN - 1) * charge_const * len(ion_types)
+
     if len(matched_peaks) == 0:
-        intensity = np.full(174, 0.0)
-        mass = np.full(174, 0.0)
+        intensity = np.full(vec_length, 0.0)
+        mass = np.full(vec_length, 0.0)
         return intensity, mass, calc_mass, 0
 
     matched_peaks, removed_peaks = handle_multiple_matches(matched_peaks)
     intensities, mass = generate_annotation_matrix(
-        matched_peaks, unmod_sequence, spectrum[index_columns["PRECURSOR_CHARGE"]]
+        matched_peaks, unmod_sequence, spectrum[index_columns["PRECURSOR_CHARGE"]], fragmentation_method
     )
     return intensities, mass, calc_mass, removed_peaks