Merge pull request #79 from wilhelm-lab/release/0.3.4
Release/0.3.4
picciama authored Nov 13, 2023
2 parents 42ecdfb + e544c1b commit 6d28b8e
Showing 21 changed files with 1,160 additions and 792 deletions.
2 changes: 1 addition & 1 deletion .cookietemple.yml
@@ -15,5 +15,5 @@ full_name: Mario Picciani
email: [email protected]
project_name: spectrum_io
project_short_description: IO related functionalities for oktoberfest.
version: 0.3.3
version: 0.3.4
license: MIT
2 changes: 1 addition & 1 deletion .flake8
@@ -5,7 +5,7 @@ max-line-length = 120
max-complexity = 10
docstring-convention = google
per-file-ignores =
tests/*:S101
tests/*:S101,S301,S403
noxfile.py:DAR101
spectrum_io/raw/thermo_raw.py:S603,S404
spectrum_io/raw/msraw.py:S405,S314
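Note: S301 and S403 are the flake8-bandit codes for pickle deserialization and pickle imports. A hedged illustration (not part of this diff) of the kind of test code the new per-file ignores permit under tests/:

```python
# Hypothetical test, for illustration only: pickle use like this is what the
# added per-file ignores S301/S403 allow inside tests/.
import pickle  # would normally flag S403


def test_roundtrip_pickled_fixture(tmp_path):
    fixture = tmp_path / "expected.pkl"
    fixture.write_bytes(pickle.dumps({"scan": 1, "mz": [100.0, 200.0]}))
    loaded = pickle.loads(fixture.read_bytes())  # would normally flag S301
    assert loaded["scan"] == 1
```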
4 changes: 2 additions & 2 deletions .github/release-drafter.yml
@@ -1,5 +1,5 @@
name-template: "0.3.3 🌈" # <<COOKIETEMPLE_FORCE_BUMP>>
tag-template: 0.3.3 # <<COOKIETEMPLE_FORCE_BUMP>>
name-template: "0.3.4 🌈" # <<COOKIETEMPLE_FORCE_BUMP>>
tag-template: 0.3.4 # <<COOKIETEMPLE_FORCE_BUMP>>
exclude-labels:
- "skip-changelog"

2 changes: 1 addition & 1 deletion cookietemple.cfg
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 0.3.3
current_version = 0.3.4

[bumpversion_files_whitelisted]
init_file = spectrum_io/__init__.py
4 changes: 2 additions & 2 deletions docs/conf.py
@@ -53,9 +53,9 @@
# the built documents.
#
# The short X.Y version.
version = "0.3.3"
version = "0.3.4"
# The full version, including alpha/beta/rc tags.
release = "0.3.3"
release = "0.3.4"

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
1,347 changes: 684 additions & 663 deletions poetry.lock

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "spectrum_io"
version = "0.3.3" # <<COOKIETEMPLE_FORCE_BUMP>>
version = "0.3.4" # <<COOKIETEMPLE_FORCE_BUMP>>
description = "IO related functionalities for oktoberfest."
authors = ["Mario Picciani <[email protected]>"]
license = "MIT"
@@ -30,7 +30,7 @@ pymzml = "^2.5.0"
pyteomics = "^4.3.3"
lxml= '^4.5.2'
tables = "^3.6.1"
spectrum-fundamentals = ">=0.4.3,<0.5.0"
spectrum-fundamentals = ">=0.4.4,<0.5.0"

[tool.poetry.dev-dependencies]
pytest = ">=6.2.3"
2 changes: 1 addition & 1 deletion spectrum_io/__init__.py
@@ -2,7 +2,7 @@

__author__ = "Mario Picciani"
__email__ = "[email protected]"
__version__ = "0.3.3"
__version__ = "0.3.4"

import logging
import logging.handlers
2 changes: 1 addition & 1 deletion spectrum_io/__main__.py
@@ -5,7 +5,7 @@


@click.command()
@click.version_option(version="0.3.3", message=click.style("spectrum_io Version: 0.3.3"))
@click.version_option(version="0.3.4", message=click.style("spectrum_io Version: 0.3.4"))
def main() -> None:
"""spectrum_io."""

202 changes: 112 additions & 90 deletions spectrum_io/raw/msraw.py
@@ -25,7 +25,7 @@ def check_analyzer(mass_analyzers: Dict[str, str]) -> Dict[str, str]:
accession = mass_analyzers[elem]
if accession in ["MS:1000079", "MS:1000484"]: # fourier transform ion cyclotron, orbitrap
mass_analyzers[elem] = "FTMS"
elif accession in ["MS:1000082", "MS:1000264" "MS:1000078"]: # quadrupole ion-trap, ion-trap, linear ion-trap
elif accession in ["MS:1000082", "MS:1000264", "MS:1000078"]: # quadrupole ion-trap, ion-trap, linear ion-trap
mass_analyzers[elem] = "ITMS"
elif accession in ["MS:1000084"]: # TOF
mass_analyzers[elem] = "TOF"
@@ -135,43 +135,124 @@ def read_mzml(
:return: pd.DataFrame with intensities and m/z values
"""
file_list = MSRaw.get_file_list(source, ext)
data = {} # type: Dict[str, Any]

if package == "pymzml":
with warnings.catch_warnings():
warnings.filterwarnings("ignore", category=ImportWarning)
for file_path in file_list:
logger.info(f"Reading mzML file: {file_path}")
MSRaw._get_scans_pymzml(file_path, data, scanidx, *args, **kwargs)
data = MSRaw._read_mzml_pymzml(file_list, scanidx, *args, **kwargs)
elif package == "pyteomics":
data = MSRaw._read_mzml_pyteomics(file_list, *args, **kwargs)
else:
raise AssertionError("Choose either 'pymzml' or 'pyteomics'")

data["SCAN_NUMBER"] = pd.to_numeric(data["SCAN_NUMBER"])
return data

@staticmethod
def _read_mzml_pymzml(file_list: List[Path], scanidx: Optional[List] = None, *args, **kwargs) -> pd.DataFrame:
data_dict = {}
with warnings.catch_warnings():
warnings.filterwarnings("ignore", category=ImportWarning)
for file_path in file_list:
mass_analyzer = get_mass_analyzer(file_path)
logger.info(f"Reading mzML file: {file_path}")
data_iter = mzml.read(source=str(file_path), *args, **kwargs)
data_iter = pymzml.run.Reader(file_path, args=args, kwargs=kwargs)
file_name = file_path.stem
for spec in data_iter:
if spec["ms level"] != 1: # filter out ms1 spectra if there are any
spec_id = spec["id"].split("scan=")[-1]
instrument_configuration_ref = spec["scanList"]["scan"][0]["instrumentConfigurationRef"]
fragmentation = spec["scanList"]["scan"][0]["filter string"].split("@")[1][:3].upper()
mz_range = spec["scanList"]["scan"][0]["filter string"].split("[")[1][:-1]
rt = spec["scanList"]["scan"][0]["scan start time"]
key = f"{file_name}_{spec_id}"
data[key] = [
file_name,
spec_id,
spec["intensity array"],
spec["m/z array"],
mz_range,
rt,
mass_analyzer[instrument_configuration_ref],
fragmentation,
]
mass_analyzer = get_mass_analyzer(file_path)
namespace = "{http://psi.hupo.org/ms/mzml}"

if scanidx is None:
spectra = data_iter
else:
# this does not work if some spectra are filtered out, e.g. mzML files with only MS2 spectra, see:
# https://github.com/pymzml/pymzML/blob/a883ff0e61fd97465b0a74667233ff594238e335/pymzml/file_classes
# /standardMzml.py#L81-L84
spectra = (data_iter[idx] for idx in scanidx)

for spec in spectra:
if spec.ms_level != 2:
continue # filter out ms1 spectra if there are any
key = f"{file_name}_{spec.ID}"
scan = spec.get_element_by_path(["scanList", "scan"])[0]
instrument_configuration_ref = scan.get("instrumentConfigurationRef", "")
activation = spec.get_element_by_path(["precursorList", "precursor", "activation"])[0]
fragmentation = "unknown"
collision_energy = 0.0
for cv_param in activation:
name = cv_param.get("name")
if name == "collision energy":
collision_energy = float(cv_param.get("value"))
continue
if "beam-type" in name:
fragmentation = "HCD"
elif "collision-induced dissociation" in name:
fragmentation = "CID"
else:
fragmentation = name
scan_window = scan.find(f".//{namespace}scanWindow")
scan_lower_limit = float(
scan_window.find(f'./{namespace}cvParam[@accession="MS:1000501"]').get("value")
)
scan_upper_limit = float(
scan_window.find(f'./{namespace}cvParam[@accession="MS:1000500"]').get("value")
)
mz_range = f"{scan_lower_limit}-{scan_upper_limit}"
data_dict[key] = [
file_name,
spec.ID,
spec.i,
spec.mz,
mz_range,
spec.scan_time_in_minutes(),
mass_analyzer.get(instrument_configuration_ref, "unknown"),
fragmentation,
collision_energy,
]
data_iter.close()
else:
raise AssertionError("Choose either 'pymzml' or 'pyteomics'")
data = pd.DataFrame.from_dict(data_dict, orient="index", columns=MZML_DATA_COLUMNS)
return data

data = pd.DataFrame.from_dict(data, orient="index", columns=MZML_DATA_COLUMNS)
data["SCAN_NUMBER"] = pd.to_numeric(data["SCAN_NUMBER"])
@staticmethod
def _read_mzml_pyteomics(file_list: List[Path], *args, **kwargs) -> pd.DataFrame:
data_dict = {}
for file_path in file_list:
mass_analyzer = get_mass_analyzer(file_path)
logger.info(f"Reading mzML file: {file_path}")
data_iter = mzml.read(source=str(file_path), *args, **kwargs)
file_name = file_path.stem
for spec in data_iter:
if spec["ms level"] != 2:
continue # filter out ms1 spectra if there are any
spec_id = spec["id"].split("scan=")[-1]
scan = spec["scanList"]["scan"][0]
instrument_configuration_ref = scan.get("instrumentConfigurationRef", "")
activation = spec["precursorList"]["precursor"][0]["activation"]
fragmentation = "unknown"
collision_energy = 0.0
for key, value in activation.items():
if key == "collision energy":
collision_energy = value
elif "beam-type" in key:
fragmentation = "HCD"
elif "collision-induced dissociation" in key:
fragmentation = "CID"
else:
fragmentation = key
scan_lower_limit = scan["scanWindowList"]["scanWindow"][0]["scan window lower limit"]
scan_upper_limit = scan["scanWindowList"]["scanWindow"][0]["scan window upper limit"]
mz_range = f"{scan_lower_limit}-{scan_upper_limit}"
rt = spec["scanList"]["scan"][0]["scan start time"]
key = f"{file_name}_{spec_id}"
data_dict[key] = [
file_name,
spec_id,
spec["intensity array"],
spec["m/z array"],
mz_range,
rt,
mass_analyzer.get(instrument_configuration_ref, "unknown"),
fragmentation,
collision_energy,
]
data_iter.close()
data = pd.DataFrame.from_dict(data_dict, orient="index", columns=MZML_DATA_COLUMNS)
return data

@staticmethod
@@ -206,62 +287,3 @@ def get_file_list(source: Union[str, Path, List[Union[str, Path]]], ext: str = "
else:
raise TypeError("source can only be a single str or Path or a list of files.")
return file_list

@staticmethod
def _get_scans_pymzml(
file_path: Union[str, Path], data: Dict, scanidx: Optional[List] = None, *args, **kwargs
) -> None:
"""
Reads mzml and generates a dataframe containing intensities and m/z values.
:param file_path: path to a single mzml file.
:param data: dictionary to be added to by this function
:param scanidx: optional list of scan numbers to extract. if not specified, all scans will be extracted
:param args: additional positional arguments
:param kwargs: additional keyword arguments
"""
if isinstance(file_path, str):
file_path = Path(file_path)
data_iter = pymzml.run.Reader(file_path, args=args, kwargs=kwargs)
file_name = file_path.stem
mass_analyzer = get_mass_analyzer(file_path)
if scanidx is None:
for spec in data_iter:
if spec.ms_level != 1: # filter out ms1 spectra if there are any
key = f"{file_name}_{spec.ID}"
instrument_configuration_ref = spec["scanList"]["scan"][0]["instrumentConfigurationRef"]
filter_string = str(spec.element.find(".//*[@accession='MS:1000512']").get("value"))
fragmentation = filter_string.split("@")[1][:3].upper()
mz_range = filter_string.split("[")[1][:-1]
data[key] = [
file_name,
spec.ID,
spec.i,
spec.mz,
mz_range,
spec.scan_time_in_minutes(),
mass_analyzer[instrument_configuration_ref],
fragmentation,
]
else:
for idx in scanidx:
spec = data_iter[idx]
# this does not work if some spectra are filtered out, e.g. mzML files with only MS2 spectra, see:
# https://github.com/pymzml/pymzML/blob/a883ff0e61fd97465b0a74667233ff594238e335/pymzml/file_classes
# /standardMzml.py#L81-L84
key = f"{file_name}_{spec.ID}"
instrument_configuration_ref = spec["scanList"]["scan"][0]["instrumentConfigurationRef"]
filter_string = str(spec.element.find(".//*[@accession='MS:1000512']").get("value"))
fragmentation = filter_string.split("@")[1][:3].upper()
mz_range = filter_string.split("[")[1][:-1]
data[key] = [
file_name,
spec.ID,
spec.i,
spec.mz,
mz_range,
spec.scan_time_in_minutes(),
mass_analyzer[instrument_configuration_ref],
fragmentation,
]
data_iter.close()
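With this change, read_mzml only dispatches to backend-specific helpers (_read_mzml_pymzml / _read_mzml_pyteomics), both of which now take the fragmentation method and collision energy from the activation cvParams and the m/z range from the scan window, instead of parsing the Thermo filter string. A hedged usage sketch, assuming read_mzml is callable as shown with the argument names from the hunks above:

```python
# Hedged sketch, not taken from the repository's docs: argument names follow the
# hunks above; the exact signature and defaults may differ.
from spectrum_io.raw.msraw import MSRaw

# Reads MS2 spectra from the given file(s) with the chosen backend and returns
# a DataFrame with one row per spectrum (columns defined by MZML_DATA_COLUMNS).
df = MSRaw.read_mzml(source="run01.mzML", package="pyteomics")
print(df["SCAN_NUMBER"].head())
```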
1 change: 1 addition & 0 deletions spectrum_io/search_result/__init__.py
@@ -2,3 +2,4 @@
from .mascot import Mascot
from .maxquant import MaxQuant
from .msfragger import MSFragger
from .sage import Sage
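The package now also exports the new Sage search-result reader. A hedged import sketch (its interface is assumed to follow the other readers in this module, e.g. MSFragger below):

```python
# Assumption: Sage follows the same SearchResults pattern as MSFragger shown
# below (a read_result(path, tmt_labeled) entry point); not verified here.
from spectrum_io.search_result import Sage
```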
33 changes: 13 additions & 20 deletions spectrum_io/search_result/msfragger.py
@@ -5,7 +5,7 @@
import pandas as pd
import spectrum_fundamentals.constants as c
from pyteomics import pepxml
from spectrum_fundamentals.mod_string import internal_without_mods
from spectrum_fundamentals.mod_string import internal_without_mods, msfragger_to_internal
from tqdm import tqdm

from .search_results import SearchResults, filter_valid_prosit_sequences
@@ -42,7 +42,7 @@ def read_result(path: Union[str, Path], tmt_labeled: str) -> pd.DataFrame:

df = pd.concat(ms_frag_results)

df = update_columns_for_prosit(df, "")
df = update_columns_for_prosit(df, tmt_labeled)
return filter_valid_prosit_sequences(df)


@@ -58,7 +58,17 @@ def update_columns_for_prosit(df, tmt_labeled: str) -> pd.DataFrame:
df["RAW_FILE"] = df["spectrum"].apply(lambda x: x.split(".")[0])
df["MASS"] = df["precursor_neutral_mass"]
df["PEPTIDE_LENGTH"] = df["peptide"].apply(lambda x: len(x))
df["MODIFIED_SEQUENCE"] = msfragger_to_internal(df["modified_peptide"])

if tmt_labeled != "":
unimod_tag = c.TMT_MODS[tmt_labeled]
logger.info("Adding TMT fixed modifications")
df["MODIFIED_SEQUENCE"] = msfragger_to_internal(
df["modified_peptide"].to_list(),
fixed_mods={"C": "C[UNIMOD:4]", r"n[\d+]": f"{unimod_tag}-", "K": f"K{unimod_tag}"},
)
else:
df["MODIFIED_SEQUENCE"] = msfragger_to_internal(df["modified_peptide"].to_list())

df.rename(
columns={
"assumed_charge": "PRECURSOR_CHARGE",
@@ -84,20 +94,3 @@ def update_columns_for_prosit(df, tmt_labeled: str) -> pd.DataFrame:
"PEPTIDE_LENGTH",
]
]


def msfragger_to_internal(modstrings: pd.Series):
"""
Transform modstring from msfragger format to internal format.
This function takes a modstrings column from a pandas dataframe and converts each
supported modification (M[147] and C[160]) to the internal representation that is
M[UNIMOD:35] and C[UNIMOD:4], respectively. Since C is considered a fixed modification,
every occurence of a C is transformed to C[UNIMOD:4] as well.
:param modstrings: pd.Series containing the msfragger modstrings
:return: pd.Series with internal modstrings
"""
modstrings = modstrings.str.replace("M[147]", "M[UNIMOD:35]", regex=False)
modstrings = modstrings.str.replace(r"C\[160\]|C", "C[UNIMOD:4]", regex=True)
return modstrings
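The local msfragger_to_internal helper above is dropped in favour of the shared spectrum_fundamentals implementation, which accepts a fixed_mods mapping so TMT tags can be injected when tmt_labeled is set. A hedged sketch of the documented substitutions, re-implemented only for illustration (the real spectrum_fundamentals function may behave differently):

```python
import re

# Illustration only: reproduces the substitutions described in the removed
# docstring above (M[147] -> M[UNIMOD:35]; C[160] and bare C -> C[UNIMOD:4]).
# It is NOT the spectrum_fundamentals implementation and ignores fixed_mods.
def msfragger_to_internal_sketch(modstring: str) -> str:
    modstring = modstring.replace("M[147]", "M[UNIMOD:35]")
    return re.sub(r"C\[160\]|C", "C[UNIMOD:4]", modstring)


print(msfragger_to_internal_sketch("AM[147]CDEK"))  # AM[UNIMOD:35]C[UNIMOD:4]DEK
```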