Merge pull request #84 from wilhelm-lab/patch/0.4.1

Patch/0.4.1
wilhelm-lab · Jan 12, 2024 · 0ae5078 · 0ae5078
2 parents 238b811 + 69e81c5
commit 0ae5078
Show file tree

Hide file tree

Showing 16 changed files with 954 additions and 1,022 deletions.
diff --git a/.cookietemple.yml b/.cookietemple.yml
@@ -15,5 +15,5 @@ full_name: Mario Picciani
 email: [email protected]
 project_name: spectrum_io
 project_short_description: IO related functionalities for oktoberfest.
-version: 0.4.0
+version: 0.4.1
 license: MIT
diff --git a/.github/release-drafter.yml b/.github/release-drafter.yml
@@ -1,5 +1,5 @@
-name-template: "0.4.0 🌈" # <<COOKIETEMPLE_FORCE_BUMP>>
-tag-template: 0.4.0 # <<COOKIETEMPLE_FORCE_BUMP>>
+name-template: "0.4.1 🌈" # <<COOKIETEMPLE_FORCE_BUMP>>
+tag-template: 0.4.1 # <<COOKIETEMPLE_FORCE_BUMP>>
 exclude-labels:
     - "skip-changelog"
 

diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml
@@ -1,8 +1,14 @@
 name: Run spectrum_io Tests
 
 on:
-    - push
-    - pull_request
+    push:
+        branches:
+            - development
+            - main
+            - "release/*"
+    pull_request:
+        branches:
+            - "*"
 
 jobs:
     tests:

diff --git a/cookietemple.cfg b/cookietemple.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.4.0
+current_version = 0.4.1
 
 [bumpversion_files_whitelisted]
 init_file = spectrum_io/__init__.py

diff --git a/docs/conf.py b/docs/conf.py
@@ -53,9 +53,9 @@
 # the built documents.
 #
 # The short X.Y version.
-version = "0.4.0"
+version = "0.4.1"
 # The full version, including alpha/beta/rc tags.
-release = "0.4.0"
+release = "0.4.1"
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.

diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "spectrum_io"
-version = "0.4.0"  # <<COOKIETEMPLE_FORCE_BUMP>>
+version = "0.4.1"  # <<COOKIETEMPLE_FORCE_BUMP>>
 description = "IO related functionalities for oktoberfest."
 authors = ["Wilhelmlab at Technical University of Munich"]
 license = "MIT"

diff --git a/spectrum_io/__init__.py b/spectrum_io/__init__.py
@@ -2,7 +2,7 @@
 
 __author__ = "Mario Picciani"
 __email__ = "[email protected]"
-__version__ = "0.4.0"
+__version__ = "0.4.1"
 
 import logging
 import logging.handlers

diff --git a/spectrum_io/__main__.py b/spectrum_io/__main__.py
@@ -5,7 +5,7 @@
 
 
 @click.command()
-@click.version_option(version="0.4.0", message=click.style("spectrum_io Version: 0.4.0"))
+@click.version_option(version="0.4.1", message=click.style("spectrum_io Version: 0.4.1"))
 def main() -> None:
     """spectrum_io."""
 

diff --git a/spectrum_io/raw/msraw.py b/spectrum_io/raw/msraw.py
@@ -219,7 +219,11 @@ def _read_mzml_pyteomics(file_list: List[Path], *args, **kwargs) -> pd.DataFrame
             logger.info(f"Reading mzML file: {file_path}")
             data_iter = mzml.read(source=str(file_path), *args, **kwargs)
             file_name = file_path.stem
-            instrument_name = list(data_iter.get_by_id("commonInstrumentParams").keys())[1]
+            try:
+                instrument_params = data_iter.get_by_id("commonInstrumentParams")
+            except KeyError:
+                instrument_params = data_iter.get_by_id("CommonInstrumentParams")
+            instrument_name = list(instrument_params.keys())[1]
             for spec in data_iter:
                 if spec["ms level"] != 2:
                     continue  # filter out ms1 spectra if there are any

diff --git a/spectrum_io/spectral_library/digest.py b/spectrum_io/spectral_library/digest.py
@@ -2,11 +2,14 @@
 import collections
 import csv
 import itertools
+import logging
 import sys
-from typing import List
+from typing import Dict, List
 
 import numpy as np
 
+logger = logging.getLogger(__name__)
+
 cleavage_sites = {
     "trypsinp": (["K", "R"], []),
     "trypsin": (["K", "R"], ["P"]),
@@ -35,16 +38,13 @@ def main(args):
             "modified_sequence,collision_energy,precursor_charge,protein,fragmentation".split(",")
         )
 
-        pre, not_post = cleavage_sites[args.enzyme]
         for peptide, proteins in get_peptide_to_protein_map(
             args.fasta,
-            # db = 'concat',
             db=args.db,
             digestion=args.digestion,
             min_len=args.min_length,
             max_len=args.max_length,
-            pre=pre,
-            not_post=not_post,
+            enzyme=args.enzyme,
             miscleavages=args.cleavages,
             methionine_cleavage=True,
             special_aas=list(args.special_aas),
@@ -55,23 +55,18 @@ def main(args):
                     writer.writerow([peptide, 30, charge, args.fragmentation])
                     writer_with_proteins.writerow([peptide, 30, charge, proteins[0], args.fragmentation])
 
-    # python digest.py --fasta /media/kusterlab/internal_projects/active/Mouse_proteome/stuff/
-    # 10090_UP000000589_UniProtKB_Mouse_CanIso_2018_03_27.fasta
-    # --peptide_protein_map ../data/fasta/10090_UP000000589_UniProtKB_Mouse_CanIso_2018_03_27.peptide_to_protein_map.txt
     if args.peptide_protein_map:
         with open(args.peptide_protein_map + ".params.txt", "w") as f:
             f.write(" ".join(sys.argv))
         writer = get_tsv_writer(args.peptide_protein_map, delimiter="\t")
 
-        pre, not_post = cleavage_sites[args.enzyme]
         for peptide, proteins in get_peptide_to_protein_map(
             args.fasta,
             db="concat",
             digestion=args.digestion,
             min_len=args.min_length,
             max_len=args.max_length,
-            pre=pre,
-            not_post=not_post,
+            enzyme=args.enzyme,
             miscleavages=args.cleavages,
             methionine_cleavage=True,
             special_aas=list(args.special_aas),
@@ -454,8 +449,7 @@ def get_peptide_to_protein_map(
     db="concat",
     min_len=6,
     max_len=52,
-    pre=None,
-    not_post=None,
+    enzyme: str = "trypsin",
     digestion="full",
     miscleavages=2,
     methionine_cleavage=True,
@@ -464,6 +458,8 @@ def get_peptide_to_protein_map(
     parse_id=parse_until_first_space,
 ):
     """Get peptide to protein map."""
+    pre, not_post = cleavage_sites[enzyme]
+
     if pre is None:
         pre = ["K", "R"]
     if not_post is None:
@@ -474,15 +470,14 @@ def get_peptide_to_protein_map(
     protein_to_seq_map = dict()
     for protein_idx, (protein, seq) in enumerate(read_fasta(fasta_file, db, parse_id, special_aas=special_aas)):
         if (protein_idx + 1) % 10000 == 0:
-            print("Digesting protein", protein_idx + 1)
+            logger.info(f"Digesting protein {protein_idx + 1}")
         seen_peptides = set()
         protein_to_seq_map[protein] = seq
         # for peptide in digestfast.get_digested_peptides(seq, min_len, max_len, pre, not_post, digestion,
         # miscleavages, methionine_cleavage):
         for peptide in get_digested_peptides(
             seq, min_len, max_len, pre, not_post, digestion, miscleavages, methionine_cleavage
         ):
-            peptide = peptide
             if use_hash_key:
                 hash_key = peptide[:6]
             else:
@@ -500,13 +495,13 @@ def get_peptide_to_protein_map(
 def get_peptide_to_protein_map_from_file(peptide_to_protein_map_file, use_hash_key=False):
     """Get peptide to protein map from file."""
     if use_hash_key:
-        print("Hash key not supported yet, continuing without hash key...")
+        logger.warning("Hash key not supported yet, continuing without hash key...")
         use_hash_key = False
     peptide_to_protein_map = collections.defaultdict(list)
     reader = get_tsv_reader(peptide_to_protein_map_file)
     for i, row in enumerate(reader):
         if (i + 1) % 1000000 == 0:
-            print("Processing peptide", i + 1)
+            logger.info(f"Processing peptide  {i + 1}")
 
         peptide, proteins = row[0], row[1].split(";")
         if use_hash_key:
@@ -562,8 +557,7 @@ def get_ibaq_peptide_to_protein_map(args):
         digestion="full",
         min_len=max([6, args.min_length]),
         max_len=min([30, args.max_length]),
-        pre=pre,
-        not_post=not_post,
+        enzyme=args.enzyme,
         miscleavages=0,
         methionine_cleavage=False,
         special_aas=list(args.special_aas),

diff --git a/spectrum_io/spectral_library/msp.py b/spectrum_io/spectral_library/msp.py
@@ -1,3 +1,6 @@
+from typing import IO, Dict
+
+import numpy as np
 import pandas as pd
 from spectrum_fundamentals.constants import PARTICLE_MASSES
 from spectrum_fundamentals.mod_string import internal_to_mod_names, internal_without_mods
@@ -8,78 +11,44 @@
 class MSP(SpectralLibrary):
     """Main to initialze a MSP obj."""
 
-    # Check msp folder for output format.
-    def write(self):
-        """Writing method; writes intermediate dataframe as msp format spectra."""
-        out = open(self.out_path, "a")
-
-        for _, spectrum in self.spectra_output.iterrows():
-            spectrum = spectrum.to_dict()
-            out.write(f"Name: {spectrum['StrippedPeptide']}/{spectrum['PrecursorCharge']}\n")
-            out.write(f"MW: {spectrum['PrecursorMz']}\n")
-            out.write(
-                f"Comment: Parent={spectrum['PrecursorMz']} "
-                f"Collision_energy={spectrum['CollisionEnergy']} "
-                f"Mods={spectrum['Modifications'][0]} "
-                f"ModString={spectrum['Modifications'][1]}/{spectrum['PrecursorCharge']} "
-                f"iRT={spectrum['iRT']} "
+    @staticmethod
+    def _assemble_fragment_string(f_mz: float, f_int: float, f_a: bytes):
+        annot = f_a[:-2].decode() if f_a.endswith(b"1") else f_a.replace(b"+", b"^").decode()
+        return f'{f_mz:.8f}\t{f_int:.4f}\t"{annot}/0.0ppm"\n'
+
+    def _write(self, out: IO, data: Dict[str, np.ndarray], metadata: pd.DataFrame):
+        # prepare metadata
+        stripped_peptides = metadata["SEQUENCE"]
+        modss = internal_to_mod_names(metadata["MODIFIED_SEQUENCE"])
+        p_charges = metadata["PRECURSOR_CHARGE"]
+        p_mzs = (metadata["MASS"] + (p_charges * PARTICLE_MASSES["PROTON"])) / p_charges
+        ces = metadata["COLLISION_ENERGY"]
+
+        # prepare spectra
+        irts = data["irt"][:, 0]  # should create a 1D view of the (n_peptides, 1) shaped array
+        f_mzss = data["mz"]
+        f_intss = data["intensities"]
+        f_annotss = data["annotation"]
+
+        lines = []
+        vec_assemble = np.vectorize(MSP._assemble_fragment_string)
+
+        for stripped_peptide, p_charge, p_mz, ce, mods, irt, f_mzs, f_ints, f_annots in zip(
+            stripped_peptides, p_charges, p_mzs, ces, modss, irts, f_mzss, f_intss, f_annotss
+        ):
+            lines.append(f"Name: {stripped_peptide}/{p_charge}\nMW: {p_mz}\n")
+            lines.append(
+                f"Comment: Parent={p_mz:.8f} Collision_energy={ce} Mods={mods[0]} "
+                f"ModString={mods[1]}/{p_charge} iRT={irt:.2f}\n"
             )
-            if len(list(self.grpc_output)) > 2:
-                out.write(f"proteotypicity={spectrum['proteotypicity']}\n")
-            else:
-                out.write("\n")
-            out.write(f"Num peaks: {sum(elem!='N' for elem in spectrum['fragment_types'])}\n")
-            for fmz, fintensity, ftype, fcharge, fnumber in zip(
-                spectrum["fragment_mz"],
-                spectrum["intensities"],
-                spectrum["fragment_types"],
-                spectrum["fragment_charges"],
-                spectrum["fragment_numbers"],
-            ):
-                if ftype != "N":
-                    fcharge = f"^{fcharge}" if fcharge != 1 else ""
-                    out.write(f"{fmz}\t{fintensity}\t" f'"{ftype}{fnumber}{fcharge}/0.0ppm"\n')
-        out.close()
 
-    def prepare_spectrum(self):
-        """Converts grpc output and metadata dataframe into msp format."""
-        intensities = self.grpc_output[list(self.grpc_output)[0]]["intensity"]
-        fragment_mz = self.grpc_output[list(self.grpc_output)[0]]["fragmentmz"]
-        annotation = self.grpc_output[list(self.grpc_output)[0]]["annotation"]
-        fragment_types = annotation["type"]
-        fragment_numbers = annotation["number"]
-        fragment_charges = annotation["charge"]
-        irt = self.grpc_output[list(self.grpc_output)[1]]
-        irt = irt.flatten()
-        if len(list(self.grpc_output)) > 2:
-            proteotypicity = self.grpc_output[list(self.grpc_output)[2]]
-            proteotypicity = proteotypicity.flatten()
-        modified_sequences = self.spectra_input["MODIFIED_SEQUENCE"]
-        collision_energies = self.spectra_input["COLLISION_ENERGY"]
+            cond = self._fragment_filter_passed(f_mzs, f_ints)
+            fragment_list = vec_assemble(f_mzs[cond], f_ints[cond], f_annots[cond])
 
-        stripped_peptide = internal_without_mods(modified_sequences)
-        msp_mod_strings = internal_to_mod_names(modified_sequences)
-        charges = self.spectra_input["PRECURSOR_CHARGE"]
-        precursor_masses = self.spectra_input["MASS"]
-        precursor_mz = (precursor_masses + (charges * PARTICLE_MASSES["PROTON"])) / charges
+            lines.append(f"Num peaks: {len(fragment_list)}\n")
 
-        inter_df = pd.DataFrame(
-            data={
-                "ModifiedPeptide": modified_sequences,
-                "StrippedPeptide": stripped_peptide,
-                "PrecursorCharge": charges,
-                "PrecursorMz": precursor_mz,
-                "PrecursorMass": precursor_masses,
-                "CollisionEnergy": collision_energies,
-                "Modifications": msp_mod_strings,
-            }
-        )
-        inter_df["iRT"] = irt.tolist()
-        if len(list(self.grpc_output)) > 2:
-            inter_df["proteotypicity"] = proteotypicity.tolist()
-        inter_df["intensities"], inter_df["fragment_mz"] = intensities.tolist(), fragment_mz.tolist()
-        inter_df["fragment_types"] = fragment_types.tolist()
-        inter_df["fragment_numbers"] = fragment_numbers.tolist()
-        inter_df["fragment_charges"] = fragment_charges.tolist()
+            lines.extend(fragment_list)
+        out.writelines(lines)
 
-        self.spectra_output = inter_df
+    def _write_header(self, out: IO):
+        pass