Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Patch/0.4.1 #84

Merged
merged 14 commits into from
Jan 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .cookietemple.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,5 +15,5 @@ full_name: Mario Picciani
email: [email protected]
project_name: spectrum_io
project_short_description: IO related functionalities for oktoberfest.
version: 0.4.0
version: 0.4.1
license: MIT
4 changes: 2 additions & 2 deletions .github/release-drafter.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
name-template: "0.4.0 🌈" # <<COOKIETEMPLE_FORCE_BUMP>>
tag-template: 0.4.0 # <<COOKIETEMPLE_FORCE_BUMP>>
name-template: "0.4.1 🌈" # <<COOKIETEMPLE_FORCE_BUMP>>
tag-template: 0.4.1 # <<COOKIETEMPLE_FORCE_BUMP>>
exclude-labels:
- "skip-changelog"

Expand Down
10 changes: 8 additions & 2 deletions .github/workflows/run_tests.yml
Original file line number Diff line number Diff line change
@@ -1,8 +1,14 @@
name: Run spectrum_io Tests

on:
- push
- pull_request
push:
branches:
- development
- main
- "release/*"
pull_request:
branches:
- "*"

jobs:
tests:
Expand Down
2 changes: 1 addition & 1 deletion cookietemple.cfg
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 0.4.0
current_version = 0.4.1

[bumpversion_files_whitelisted]
init_file = spectrum_io/__init__.py
Expand Down
4 changes: 2 additions & 2 deletions docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,9 @@
# the built documents.
#
# The short X.Y version.
version = "0.4.0"
version = "0.4.1"
# The full version, including alpha/beta/rc tags.
release = "0.4.0"
release = "0.4.1"

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
Expand Down
1,402 changes: 667 additions & 735 deletions poetry.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "spectrum_io"
version = "0.4.0" # <<COOKIETEMPLE_FORCE_BUMP>>
version = "0.4.1" # <<COOKIETEMPLE_FORCE_BUMP>>
description = "IO related functionalities for oktoberfest."
authors = ["Wilhelmlab at Technical University of Munich"]
license = "MIT"
Expand Down
2 changes: 1 addition & 1 deletion spectrum_io/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

__author__ = "Mario Picciani"
__email__ = "[email protected]"
__version__ = "0.4.0"
__version__ = "0.4.1"

import logging
import logging.handlers
Expand Down
2 changes: 1 addition & 1 deletion spectrum_io/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@


@click.command()
@click.version_option(version="0.4.0", message=click.style("spectrum_io Version: 0.4.0"))
@click.version_option(version="0.4.1", message=click.style("spectrum_io Version: 0.4.1"))
def main() -> None:
"""spectrum_io."""

Expand Down
6 changes: 5 additions & 1 deletion spectrum_io/raw/msraw.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,7 +219,11 @@
logger.info(f"Reading mzML file: {file_path}")
data_iter = mzml.read(source=str(file_path), *args, **kwargs)
file_name = file_path.stem
instrument_name = list(data_iter.get_by_id("commonInstrumentParams").keys())[1]
try:
instrument_params = data_iter.get_by_id("commonInstrumentParams")
except KeyError:
instrument_params = data_iter.get_by_id("CommonInstrumentParams")

Check warning on line 225 in spectrum_io/raw/msraw.py

View check run for this annotation

Codecov / codecov/patch

spectrum_io/raw/msraw.py#L224-L225

Added lines #L224 - L225 were not covered by tests
instrument_name = list(instrument_params.keys())[1]
for spec in data_iter:
if spec["ms level"] != 2:
continue # filter out ms1 spectra if there are any
Expand Down
32 changes: 13 additions & 19 deletions spectrum_io/spectral_library/digest.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,14 @@
import collections
import csv
import itertools
import logging
import sys
from typing import List
from typing import Dict, List

import numpy as np

logger = logging.getLogger(__name__)

cleavage_sites = {
"trypsinp": (["K", "R"], []),
"trypsin": (["K", "R"], ["P"]),
Expand Down Expand Up @@ -35,16 +38,13 @@
"modified_sequence,collision_energy,precursor_charge,protein,fragmentation".split(",")
)

pre, not_post = cleavage_sites[args.enzyme]
for peptide, proteins in get_peptide_to_protein_map(
args.fasta,
# db = 'concat',
db=args.db,
digestion=args.digestion,
min_len=args.min_length,
max_len=args.max_length,
pre=pre,
not_post=not_post,
enzyme=args.enzyme,
miscleavages=args.cleavages,
methionine_cleavage=True,
special_aas=list(args.special_aas),
Expand All @@ -55,23 +55,18 @@
writer.writerow([peptide, 30, charge, args.fragmentation])
writer_with_proteins.writerow([peptide, 30, charge, proteins[0], args.fragmentation])

# python digest.py --fasta /media/kusterlab/internal_projects/active/Mouse_proteome/stuff/
# 10090_UP000000589_UniProtKB_Mouse_CanIso_2018_03_27.fasta
# --peptide_protein_map ../data/fasta/10090_UP000000589_UniProtKB_Mouse_CanIso_2018_03_27.peptide_to_protein_map.txt
if args.peptide_protein_map:
with open(args.peptide_protein_map + ".params.txt", "w") as f:
f.write(" ".join(sys.argv))
writer = get_tsv_writer(args.peptide_protein_map, delimiter="\t")

pre, not_post = cleavage_sites[args.enzyme]
for peptide, proteins in get_peptide_to_protein_map(
args.fasta,
db="concat",
digestion=args.digestion,
min_len=args.min_length,
max_len=args.max_length,
pre=pre,
not_post=not_post,
enzyme=args.enzyme,
miscleavages=args.cleavages,
methionine_cleavage=True,
special_aas=list(args.special_aas),
Expand Down Expand Up @@ -454,8 +449,7 @@
db="concat",
min_len=6,
max_len=52,
pre=None,
not_post=None,
enzyme: str = "trypsin",
digestion="full",
miscleavages=2,
methionine_cleavage=True,
Expand All @@ -464,6 +458,8 @@
parse_id=parse_until_first_space,
):
"""Get peptide to protein map."""
pre, not_post = cleavage_sites[enzyme]

if pre is None:
pre = ["K", "R"]
if not_post is None:
Expand All @@ -474,15 +470,14 @@
protein_to_seq_map = dict()
for protein_idx, (protein, seq) in enumerate(read_fasta(fasta_file, db, parse_id, special_aas=special_aas)):
if (protein_idx + 1) % 10000 == 0:
print("Digesting protein", protein_idx + 1)
logger.info(f"Digesting protein {protein_idx + 1}")

Check warning on line 473 in spectrum_io/spectral_library/digest.py

View check run for this annotation

Codecov / codecov/patch

spectrum_io/spectral_library/digest.py#L473

Added line #L473 was not covered by tests
seen_peptides = set()
protein_to_seq_map[protein] = seq
# for peptide in digestfast.get_digested_peptides(seq, min_len, max_len, pre, not_post, digestion,
# miscleavages, methionine_cleavage):
for peptide in get_digested_peptides(
seq, min_len, max_len, pre, not_post, digestion, miscleavages, methionine_cleavage
):
peptide = peptide
if use_hash_key:
hash_key = peptide[:6]
else:
Expand All @@ -500,13 +495,13 @@
def get_peptide_to_protein_map_from_file(peptide_to_protein_map_file, use_hash_key=False):
"""Get peptide to protein map from file."""
if use_hash_key:
print("Hash key not supported yet, continuing without hash key...")
logger.warning("Hash key not supported yet, continuing without hash key...")

Check warning on line 498 in spectrum_io/spectral_library/digest.py

View check run for this annotation

Codecov / codecov/patch

spectrum_io/spectral_library/digest.py#L498

Added line #L498 was not covered by tests
use_hash_key = False
peptide_to_protein_map = collections.defaultdict(list)
reader = get_tsv_reader(peptide_to_protein_map_file)
for i, row in enumerate(reader):
if (i + 1) % 1000000 == 0:
print("Processing peptide", i + 1)
logger.info(f"Processing peptide {i + 1}")

Check warning on line 504 in spectrum_io/spectral_library/digest.py

View check run for this annotation

Codecov / codecov/patch

spectrum_io/spectral_library/digest.py#L504

Added line #L504 was not covered by tests

peptide, proteins = row[0], row[1].split(";")
if use_hash_key:
Expand Down Expand Up @@ -562,8 +557,7 @@
digestion="full",
min_len=max([6, args.min_length]),
max_len=min([30, args.max_length]),
pre=pre,
not_post=not_post,
enzyme=args.enzyme,
miscleavages=0,
methionine_cleavage=False,
special_aas=list(args.special_aas),
Expand Down
109 changes: 39 additions & 70 deletions spectrum_io/spectral_library/msp.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
from typing import IO, Dict

import numpy as np
import pandas as pd
from spectrum_fundamentals.constants import PARTICLE_MASSES
from spectrum_fundamentals.mod_string import internal_to_mod_names, internal_without_mods
Expand All @@ -8,78 +11,44 @@
class MSP(SpectralLibrary):
"""Main to initialze a MSP obj."""

# Check msp folder for output format.
def write(self):
"""Writing method; writes intermediate dataframe as msp format spectra."""
out = open(self.out_path, "a")

for _, spectrum in self.spectra_output.iterrows():
spectrum = spectrum.to_dict()
out.write(f"Name: {spectrum['StrippedPeptide']}/{spectrum['PrecursorCharge']}\n")
out.write(f"MW: {spectrum['PrecursorMz']}\n")
out.write(
f"Comment: Parent={spectrum['PrecursorMz']} "
f"Collision_energy={spectrum['CollisionEnergy']} "
f"Mods={spectrum['Modifications'][0]} "
f"ModString={spectrum['Modifications'][1]}/{spectrum['PrecursorCharge']} "
f"iRT={spectrum['iRT']} "
@staticmethod
def _assemble_fragment_string(f_mz: float, f_int: float, f_a: bytes):
annot = f_a[:-2].decode() if f_a.endswith(b"1") else f_a.replace(b"+", b"^").decode()
return f'{f_mz:.8f}\t{f_int:.4f}\t"{annot}/0.0ppm"\n'

def _write(self, out: IO, data: Dict[str, np.ndarray], metadata: pd.DataFrame):
# prepare metadata
stripped_peptides = metadata["SEQUENCE"]
modss = internal_to_mod_names(metadata["MODIFIED_SEQUENCE"])
p_charges = metadata["PRECURSOR_CHARGE"]
p_mzs = (metadata["MASS"] + (p_charges * PARTICLE_MASSES["PROTON"])) / p_charges
ces = metadata["COLLISION_ENERGY"]

# prepare spectra
irts = data["irt"][:, 0] # should create a 1D view of the (n_peptides, 1) shaped array
f_mzss = data["mz"]
f_intss = data["intensities"]
f_annotss = data["annotation"]

lines = []
vec_assemble = np.vectorize(MSP._assemble_fragment_string)

for stripped_peptide, p_charge, p_mz, ce, mods, irt, f_mzs, f_ints, f_annots in zip(
stripped_peptides, p_charges, p_mzs, ces, modss, irts, f_mzss, f_intss, f_annotss
):
lines.append(f"Name: {stripped_peptide}/{p_charge}\nMW: {p_mz}\n")
lines.append(
f"Comment: Parent={p_mz:.8f} Collision_energy={ce} Mods={mods[0]} "
f"ModString={mods[1]}/{p_charge} iRT={irt:.2f}\n"
)
if len(list(self.grpc_output)) > 2:
out.write(f"proteotypicity={spectrum['proteotypicity']}\n")
else:
out.write("\n")
out.write(f"Num peaks: {sum(elem!='N' for elem in spectrum['fragment_types'])}\n")
for fmz, fintensity, ftype, fcharge, fnumber in zip(
spectrum["fragment_mz"],
spectrum["intensities"],
spectrum["fragment_types"],
spectrum["fragment_charges"],
spectrum["fragment_numbers"],
):
if ftype != "N":
fcharge = f"^{fcharge}" if fcharge != 1 else ""
out.write(f"{fmz}\t{fintensity}\t" f'"{ftype}{fnumber}{fcharge}/0.0ppm"\n')
out.close()

def prepare_spectrum(self):
"""Converts grpc output and metadata dataframe into msp format."""
intensities = self.grpc_output[list(self.grpc_output)[0]]["intensity"]
fragment_mz = self.grpc_output[list(self.grpc_output)[0]]["fragmentmz"]
annotation = self.grpc_output[list(self.grpc_output)[0]]["annotation"]
fragment_types = annotation["type"]
fragment_numbers = annotation["number"]
fragment_charges = annotation["charge"]
irt = self.grpc_output[list(self.grpc_output)[1]]
irt = irt.flatten()
if len(list(self.grpc_output)) > 2:
proteotypicity = self.grpc_output[list(self.grpc_output)[2]]
proteotypicity = proteotypicity.flatten()
modified_sequences = self.spectra_input["MODIFIED_SEQUENCE"]
collision_energies = self.spectra_input["COLLISION_ENERGY"]
cond = self._fragment_filter_passed(f_mzs, f_ints)
fragment_list = vec_assemble(f_mzs[cond], f_ints[cond], f_annots[cond])

stripped_peptide = internal_without_mods(modified_sequences)
msp_mod_strings = internal_to_mod_names(modified_sequences)
charges = self.spectra_input["PRECURSOR_CHARGE"]
precursor_masses = self.spectra_input["MASS"]
precursor_mz = (precursor_masses + (charges * PARTICLE_MASSES["PROTON"])) / charges
lines.append(f"Num peaks: {len(fragment_list)}\n")

inter_df = pd.DataFrame(
data={
"ModifiedPeptide": modified_sequences,
"StrippedPeptide": stripped_peptide,
"PrecursorCharge": charges,
"PrecursorMz": precursor_mz,
"PrecursorMass": precursor_masses,
"CollisionEnergy": collision_energies,
"Modifications": msp_mod_strings,
}
)
inter_df["iRT"] = irt.tolist()
if len(list(self.grpc_output)) > 2:
inter_df["proteotypicity"] = proteotypicity.tolist()
inter_df["intensities"], inter_df["fragment_mz"] = intensities.tolist(), fragment_mz.tolist()
inter_df["fragment_types"] = fragment_types.tolist()
inter_df["fragment_numbers"] = fragment_numbers.tolist()
inter_df["fragment_charges"] = fragment_charges.tolist()
lines.extend(fragment_list)
out.writelines(lines)

self.spectra_output = inter_df
def _write_header(self, out: IO):
pass
Loading
Loading