Skip to content

Commit

Permalink
Merge pull request #84 from wilhelm-lab/patch/0.4.1
Browse files Browse the repository at this point in the history
Patch/0.4.1
  • Loading branch information
picciama authored Jan 12, 2024
2 parents 238b811 + 69e81c5 commit 0ae5078
Show file tree
Hide file tree
Showing 16 changed files with 954 additions and 1,022 deletions.
2 changes: 1 addition & 1 deletion .cookietemple.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,5 +15,5 @@ full_name: Mario Picciani
email: [email protected]
project_name: spectrum_io
project_short_description: IO related functionalities for oktoberfest.
version: 0.4.0
version: 0.4.1
license: MIT
4 changes: 2 additions & 2 deletions .github/release-drafter.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
name-template: "0.4.0 🌈" # <<COOKIETEMPLE_FORCE_BUMP>>
tag-template: 0.4.0 # <<COOKIETEMPLE_FORCE_BUMP>>
name-template: "0.4.1 🌈" # <<COOKIETEMPLE_FORCE_BUMP>>
tag-template: 0.4.1 # <<COOKIETEMPLE_FORCE_BUMP>>
exclude-labels:
- "skip-changelog"

Expand Down
10 changes: 8 additions & 2 deletions .github/workflows/run_tests.yml
Original file line number Diff line number Diff line change
@@ -1,8 +1,14 @@
name: Run spectrum_io Tests

on:
- push
- pull_request
push:
branches:
- development
- main
- "release/*"
pull_request:
branches:
- "*"

jobs:
tests:
Expand Down
2 changes: 1 addition & 1 deletion cookietemple.cfg
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 0.4.0
current_version = 0.4.1

[bumpversion_files_whitelisted]
init_file = spectrum_io/__init__.py
Expand Down
4 changes: 2 additions & 2 deletions docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,9 @@
# the built documents.
#
# The short X.Y version.
version = "0.4.0"
version = "0.4.1"
# The full version, including alpha/beta/rc tags.
release = "0.4.0"
release = "0.4.1"

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
Expand Down
1,402 changes: 667 additions & 735 deletions poetry.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "spectrum_io"
version = "0.4.0" # <<COOKIETEMPLE_FORCE_BUMP>>
version = "0.4.1" # <<COOKIETEMPLE_FORCE_BUMP>>
description = "IO related functionalities for oktoberfest."
authors = ["Wilhelmlab at Technical University of Munich"]
license = "MIT"
Expand Down
2 changes: 1 addition & 1 deletion spectrum_io/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

__author__ = "Mario Picciani"
__email__ = "[email protected]"
__version__ = "0.4.0"
__version__ = "0.4.1"

import logging
import logging.handlers
Expand Down
2 changes: 1 addition & 1 deletion spectrum_io/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@


@click.command()
@click.version_option(version="0.4.0", message=click.style("spectrum_io Version: 0.4.0"))
@click.version_option(version="0.4.1", message=click.style("spectrum_io Version: 0.4.1"))
def main() -> None:
"""spectrum_io."""

Expand Down
6 changes: 5 additions & 1 deletion spectrum_io/raw/msraw.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,7 +219,11 @@ def _read_mzml_pyteomics(file_list: List[Path], *args, **kwargs) -> pd.DataFrame
logger.info(f"Reading mzML file: {file_path}")
data_iter = mzml.read(source=str(file_path), *args, **kwargs)
file_name = file_path.stem
instrument_name = list(data_iter.get_by_id("commonInstrumentParams").keys())[1]
try:
instrument_params = data_iter.get_by_id("commonInstrumentParams")
except KeyError:
instrument_params = data_iter.get_by_id("CommonInstrumentParams")
instrument_name = list(instrument_params.keys())[1]
for spec in data_iter:
if spec["ms level"] != 2:
continue # filter out ms1 spectra if there are any
Expand Down
32 changes: 13 additions & 19 deletions spectrum_io/spectral_library/digest.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,14 @@
import collections
import csv
import itertools
import logging
import sys
from typing import List
from typing import Dict, List

import numpy as np

logger = logging.getLogger(__name__)

cleavage_sites = {
"trypsinp": (["K", "R"], []),
"trypsin": (["K", "R"], ["P"]),
Expand Down Expand Up @@ -35,16 +38,13 @@ def main(args):
"modified_sequence,collision_energy,precursor_charge,protein,fragmentation".split(",")
)

pre, not_post = cleavage_sites[args.enzyme]
for peptide, proteins in get_peptide_to_protein_map(
args.fasta,
# db = 'concat',
db=args.db,
digestion=args.digestion,
min_len=args.min_length,
max_len=args.max_length,
pre=pre,
not_post=not_post,
enzyme=args.enzyme,
miscleavages=args.cleavages,
methionine_cleavage=True,
special_aas=list(args.special_aas),
Expand All @@ -55,23 +55,18 @@ def main(args):
writer.writerow([peptide, 30, charge, args.fragmentation])
writer_with_proteins.writerow([peptide, 30, charge, proteins[0], args.fragmentation])

# python digest.py --fasta /media/kusterlab/internal_projects/active/Mouse_proteome/stuff/
# 10090_UP000000589_UniProtKB_Mouse_CanIso_2018_03_27.fasta
# --peptide_protein_map ../data/fasta/10090_UP000000589_UniProtKB_Mouse_CanIso_2018_03_27.peptide_to_protein_map.txt
if args.peptide_protein_map:
with open(args.peptide_protein_map + ".params.txt", "w") as f:
f.write(" ".join(sys.argv))
writer = get_tsv_writer(args.peptide_protein_map, delimiter="\t")

pre, not_post = cleavage_sites[args.enzyme]
for peptide, proteins in get_peptide_to_protein_map(
args.fasta,
db="concat",
digestion=args.digestion,
min_len=args.min_length,
max_len=args.max_length,
pre=pre,
not_post=not_post,
enzyme=args.enzyme,
miscleavages=args.cleavages,
methionine_cleavage=True,
special_aas=list(args.special_aas),
Expand Down Expand Up @@ -454,8 +449,7 @@ def get_peptide_to_protein_map(
db="concat",
min_len=6,
max_len=52,
pre=None,
not_post=None,
enzyme: str = "trypsin",
digestion="full",
miscleavages=2,
methionine_cleavage=True,
Expand All @@ -464,6 +458,8 @@ def get_peptide_to_protein_map(
parse_id=parse_until_first_space,
):
"""Get peptide to protein map."""
pre, not_post = cleavage_sites[enzyme]

if pre is None:
pre = ["K", "R"]
if not_post is None:
Expand All @@ -474,15 +470,14 @@ def get_peptide_to_protein_map(
protein_to_seq_map = dict()
for protein_idx, (protein, seq) in enumerate(read_fasta(fasta_file, db, parse_id, special_aas=special_aas)):
if (protein_idx + 1) % 10000 == 0:
print("Digesting protein", protein_idx + 1)
logger.info(f"Digesting protein {protein_idx + 1}")
seen_peptides = set()
protein_to_seq_map[protein] = seq
# for peptide in digestfast.get_digested_peptides(seq, min_len, max_len, pre, not_post, digestion,
# miscleavages, methionine_cleavage):
for peptide in get_digested_peptides(
seq, min_len, max_len, pre, not_post, digestion, miscleavages, methionine_cleavage
):
peptide = peptide
if use_hash_key:
hash_key = peptide[:6]
else:
Expand All @@ -500,13 +495,13 @@ def get_peptide_to_protein_map(
def get_peptide_to_protein_map_from_file(peptide_to_protein_map_file, use_hash_key=False):
"""Get peptide to protein map from file."""
if use_hash_key:
print("Hash key not supported yet, continuing without hash key...")
logger.warning("Hash key not supported yet, continuing without hash key...")
use_hash_key = False
peptide_to_protein_map = collections.defaultdict(list)
reader = get_tsv_reader(peptide_to_protein_map_file)
for i, row in enumerate(reader):
if (i + 1) % 1000000 == 0:
print("Processing peptide", i + 1)
logger.info(f"Processing peptide {i + 1}")

peptide, proteins = row[0], row[1].split(";")
if use_hash_key:
Expand Down Expand Up @@ -562,8 +557,7 @@ def get_ibaq_peptide_to_protein_map(args):
digestion="full",
min_len=max([6, args.min_length]),
max_len=min([30, args.max_length]),
pre=pre,
not_post=not_post,
enzyme=args.enzyme,
miscleavages=0,
methionine_cleavage=False,
special_aas=list(args.special_aas),
Expand Down
109 changes: 39 additions & 70 deletions spectrum_io/spectral_library/msp.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
from typing import IO, Dict

import numpy as np
import pandas as pd
from spectrum_fundamentals.constants import PARTICLE_MASSES
from spectrum_fundamentals.mod_string import internal_to_mod_names, internal_without_mods
Expand All @@ -8,78 +11,44 @@
class MSP(SpectralLibrary):
"""Main to initialize an MSP obj."""

# Check msp folder for output format.
def write(self):
"""Writing method; writes intermediate dataframe as msp format spectra."""
out = open(self.out_path, "a")

for _, spectrum in self.spectra_output.iterrows():
spectrum = spectrum.to_dict()
out.write(f"Name: {spectrum['StrippedPeptide']}/{spectrum['PrecursorCharge']}\n")
out.write(f"MW: {spectrum['PrecursorMz']}\n")
out.write(
f"Comment: Parent={spectrum['PrecursorMz']} "
f"Collision_energy={spectrum['CollisionEnergy']} "
f"Mods={spectrum['Modifications'][0]} "
f"ModString={spectrum['Modifications'][1]}/{spectrum['PrecursorCharge']} "
f"iRT={spectrum['iRT']} "
@staticmethod
def _assemble_fragment_string(f_mz: float, f_int: float, f_a: bytes):
annot = f_a[:-2].decode() if f_a.endswith(b"1") else f_a.replace(b"+", b"^").decode()
return f'{f_mz:.8f}\t{f_int:.4f}\t"{annot}/0.0ppm"\n'

def _write(self, out: IO, data: Dict[str, np.ndarray], metadata: pd.DataFrame):
# prepare metadata
stripped_peptides = metadata["SEQUENCE"]
modss = internal_to_mod_names(metadata["MODIFIED_SEQUENCE"])
p_charges = metadata["PRECURSOR_CHARGE"]
p_mzs = (metadata["MASS"] + (p_charges * PARTICLE_MASSES["PROTON"])) / p_charges
ces = metadata["COLLISION_ENERGY"]

# prepare spectra
irts = data["irt"][:, 0] # should create a 1D view of the (n_peptides, 1) shaped array
f_mzss = data["mz"]
f_intss = data["intensities"]
f_annotss = data["annotation"]

lines = []
vec_assemble = np.vectorize(MSP._assemble_fragment_string)

for stripped_peptide, p_charge, p_mz, ce, mods, irt, f_mzs, f_ints, f_annots in zip(
stripped_peptides, p_charges, p_mzs, ces, modss, irts, f_mzss, f_intss, f_annotss
):
lines.append(f"Name: {stripped_peptide}/{p_charge}\nMW: {p_mz}\n")
lines.append(
f"Comment: Parent={p_mz:.8f} Collision_energy={ce} Mods={mods[0]} "
f"ModString={mods[1]}/{p_charge} iRT={irt:.2f}\n"
)
if len(list(self.grpc_output)) > 2:
out.write(f"proteotypicity={spectrum['proteotypicity']}\n")
else:
out.write("\n")
out.write(f"Num peaks: {sum(elem!='N' for elem in spectrum['fragment_types'])}\n")
for fmz, fintensity, ftype, fcharge, fnumber in zip(
spectrum["fragment_mz"],
spectrum["intensities"],
spectrum["fragment_types"],
spectrum["fragment_charges"],
spectrum["fragment_numbers"],
):
if ftype != "N":
fcharge = f"^{fcharge}" if fcharge != 1 else ""
out.write(f"{fmz}\t{fintensity}\t" f'"{ftype}{fnumber}{fcharge}/0.0ppm"\n')
out.close()

def prepare_spectrum(self):
"""Converts grpc output and metadata dataframe into msp format."""
intensities = self.grpc_output[list(self.grpc_output)[0]]["intensity"]
fragment_mz = self.grpc_output[list(self.grpc_output)[0]]["fragmentmz"]
annotation = self.grpc_output[list(self.grpc_output)[0]]["annotation"]
fragment_types = annotation["type"]
fragment_numbers = annotation["number"]
fragment_charges = annotation["charge"]
irt = self.grpc_output[list(self.grpc_output)[1]]
irt = irt.flatten()
if len(list(self.grpc_output)) > 2:
proteotypicity = self.grpc_output[list(self.grpc_output)[2]]
proteotypicity = proteotypicity.flatten()
modified_sequences = self.spectra_input["MODIFIED_SEQUENCE"]
collision_energies = self.spectra_input["COLLISION_ENERGY"]
cond = self._fragment_filter_passed(f_mzs, f_ints)
fragment_list = vec_assemble(f_mzs[cond], f_ints[cond], f_annots[cond])

stripped_peptide = internal_without_mods(modified_sequences)
msp_mod_strings = internal_to_mod_names(modified_sequences)
charges = self.spectra_input["PRECURSOR_CHARGE"]
precursor_masses = self.spectra_input["MASS"]
precursor_mz = (precursor_masses + (charges * PARTICLE_MASSES["PROTON"])) / charges
lines.append(f"Num peaks: {len(fragment_list)}\n")

inter_df = pd.DataFrame(
data={
"ModifiedPeptide": modified_sequences,
"StrippedPeptide": stripped_peptide,
"PrecursorCharge": charges,
"PrecursorMz": precursor_mz,
"PrecursorMass": precursor_masses,
"CollisionEnergy": collision_energies,
"Modifications": msp_mod_strings,
}
)
inter_df["iRT"] = irt.tolist()
if len(list(self.grpc_output)) > 2:
inter_df["proteotypicity"] = proteotypicity.tolist()
inter_df["intensities"], inter_df["fragment_mz"] = intensities.tolist(), fragment_mz.tolist()
inter_df["fragment_types"] = fragment_types.tolist()
inter_df["fragment_numbers"] = fragment_numbers.tolist()
inter_df["fragment_charges"] = fragment_charges.tolist()
lines.extend(fragment_list)
out.writelines(lines)

self.spectra_output = inter_df
def _write_header(self, out: IO):
pass
Loading

0 comments on commit 0ae5078

Please sign in to comment.