Skip to content

Commit

Permalink
Merge pull request #151 from wilhelm-lab/release/0.6.3
Browse files Browse the repository at this point in the history
Release/0.6.3
  • Loading branch information
WassimG authored Oct 1, 2024
2 parents 236425a + 93d93bd commit d1eeab7
Show file tree
Hide file tree
Showing 17 changed files with 223 additions and 104 deletions.
2 changes: 1 addition & 1 deletion .cookietemple.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,5 +15,5 @@ full_name: Mario Picciani
email: [email protected]
project_name: spectrum_io
project_short_description: IO related functionalities for oktoberfest.
version: 0.6.2
version: 0.6.3
license: MIT
4 changes: 2 additions & 2 deletions .github/release-drafter.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
name-template: "0.6.2 🌈" # <<COOKIETEMPLE_FORCE_BUMP>>
tag-template: 0.6.2 # <<COOKIETEMPLE_FORCE_BUMP>>
name-template: "0.6.3 🌈" # <<COOKIETEMPLE_FORCE_BUMP>>
tag-template: 0.6.3 # <<COOKIETEMPLE_FORCE_BUMP>>
exclude-labels:
- "skip-changelog"

Expand Down
2 changes: 1 addition & 1 deletion cookietemple.cfg
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 0.6.2
current_version = 0.6.3

[bumpversion_files_whitelisted]
init_file = spectrum_io/__init__.py
Expand Down
4 changes: 2 additions & 2 deletions docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,9 +52,9 @@
# the built documents.
#
# The short X.Y version.
version = "0.6.2"
version = "0.6.3"
# The full version, including alpha/beta/rc tags.
release = "0.6.2"
release = "0.6.3"

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
Expand Down
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "spectrum_io"
version = "0.6.2" # <<COOKIETEMPLE_FORCE_BUMP>>
version = "0.6.3" # <<COOKIETEMPLE_FORCE_BUMP>>
description = "IO related functionalities for oktoberfest."
authors = ["Wilhelmlab at Technical University of Munich"]
license = "MIT"
Expand Down Expand Up @@ -29,7 +29,7 @@ pyarrow = ">=16.0.0"
pymzml = "^2.5.0"
pyteomics = "^4.3.3"
lxml= '>=4.5.2,<6.0.0'
spectrum-fundamentals = ">=0.7.1,<0.8.0"
spectrum-fundamentals = ">=0.7.4,<0.8.0"
alphatims = "^1.0.8"
sortedcontainers = "^2.4.0"

Expand Down
2 changes: 1 addition & 1 deletion spectrum_io/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
__author__ = """The Oktoberfest development team (Wilhelmlab at Technical University of Munich)"""
__copyright__ = f"Copyright {datetime.now():%Y}, Wilhelmlab at Technical University of Munich"
__license__ = "MIT"
__version__ = "0.6.2"
__version__ = "0.6.3"

import logging
import logging.handlers
Expand Down
2 changes: 1 addition & 1 deletion spectrum_io/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@


@click.command()
@click.version_option(version="0.6.2", message=click.style("spectrum_io Version: 0.6.2"))
@click.version_option(version="0.6.3", message=click.style("spectrum_io Version: 0.6.3"))
def main() -> None:
"""spectrum_io."""

Expand Down
12 changes: 7 additions & 5 deletions spectrum_io/search_result/mascot.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,12 @@
import logging
import re
import sqlite3
from pathlib import Path
from typing import Dict, Optional, Tuple, Union
from typing import Dict, Optional

import pandas as pd
import spectrum_fundamentals.constants as c
from spectrum_fundamentals.mod_string import internal_without_mods

from .search_results import SearchResults, filter_valid_prosit_sequences
from .search_results import SearchResults

logger = logging.getLogger(__name__)

Expand All @@ -25,12 +23,16 @@ def read_result(
self,
tmt_label: str = "",
custom_mods: Optional[Dict[str, int]] = None,
ptm_unimod_id: Optional[int] = 0,
ptm_sites: Optional[list[str]] = None,
) -> pd.DataFrame:
"""
Function to read a mascot msf file and perform some basic formatting.
:param tmt_label: tmt label as str
:param custom_mods: dict with custom variable and static identifier and respective internal equivalent and mass
:param ptm_unimod_id: unimod id used for site localization
:param ptm_sites: possible sites that the ptm can exist on
:raises NotImplementedError: always
:return: pd.DataFrame with the formatted data
"""
Expand Down Expand Up @@ -119,4 +121,4 @@ def read_result(
df["SEQUENCE"] = internal_without_mods(df["MODIFIED_SEQUENCE"])
df["PEPTIDE_LENGTH"] = df["SEQUENCE"].apply(lambda x: len(x))

return filter_valid_prosit_sequences(df)
return self.filter_valid_prosit_sequences()
56 changes: 48 additions & 8 deletions spectrum_io/search_result/maxquant.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,22 @@
from __future__ import annotations

import logging
from pathlib import Path
from typing import Dict, Optional, Tuple, Union
from typing import Dict, Optional, Union

import pandas as pd
import spectrum_fundamentals.constants as c
from spectrum_fundamentals.mod_string import internal_without_mods
from spectrum_fundamentals.mod_string import add_permutations, internal_without_mods

from .search_results import SearchResults, filter_valid_prosit_sequences, parse_mods
from .search_results import SearchResults, parse_mods

logger = logging.getLogger(__name__)


class MaxQuant(SearchResults):
"""Handle search results from MaxQuant."""

def __init__(self, path: Union[str, Path]):
def __init__(self, path: str | Path):
"""
Init Searchresults object.
Expand All @@ -34,6 +36,9 @@ def standard_mods(self):
"C": 4,
"M(ox)": 35,
"M(Oxidation (M))": 35,
"R(Citrullination)": 7,
"Q(Deamidation (NQ))": 7,
"N(Deamidation (NQ))": 7,
}

@staticmethod
Expand All @@ -50,10 +55,25 @@ def add_tmt_mod(mass: float, seq: str, unimod_tag: str) -> float:
mass += num_of_tmt * c.MOD_MASSES[f"{unimod_tag}"]
return mass

def filter_valid_prosit_sequences(self):
    """
    Restrict the stored results to entries that Prosit can handle.

    Rows are kept only if the peptide length lies within [7, 30], the modified sequence
    contains no opening parenthesis and the precursor charge is at most 6. The number of
    rows is logged before and after filtering.

    :return: the filtered results, which are also stored in self.results
    """
    n_before = len(self.results.index)
    logger.info(f"#sequences before filtering for valid prosit sequences: {n_before}")

    # Prosit only supports peptide lengths within [7, 30]
    length_ok = self.results["PEPTIDE_LENGTH"].between(7, 30)
    self.results = self.results[length_ok]

    # an opening parenthesis left in the sequence is treated as an unsupported modification
    has_unsupported_mod = self.results["MODIFIED_SEQUENCE"].str.contains(r"\(", regex=True)
    self.results = self.results[~has_unsupported_mod]

    # precursor charges above 6 are dropped
    charge_ok = self.results["PRECURSOR_CHARGE"].le(6)
    self.results = self.results[charge_ok]

    n_after = len(self.results.index)
    logger.info(f"#sequences after filtering for valid prosit sequences: {n_after}")

    return self.results

def read_result(
self,
tmt_label: str = "",
custom_mods: Optional[Dict[str, int]] = None,
custom_mods: dict[str, int] | None = None,
ptm_unimod_id: int | None = 0,
ptm_sites: list[str] | None = None,
) -> pd.DataFrame:
"""
Function to read a msms txt and perform some basic formatting.
Expand All @@ -62,6 +82,8 @@ def read_result(
:param custom_mods: optional dictionary mapping MaxQuant-specific mod pattern to UNIMOD IDs.
If None, static carbamidomethylation of cysteine and variable oxidation of methionine
are mapped automatically. To avoid this, explicitly provide an empty dictionary.
:param ptm_unimod_id: unimod id used for site localization
:param ptm_sites: possible sites that the ptm can exist on
:return: pd.DataFrame with the formatted data
"""
parsed_mods = parse_mods(self.standard_mods | (custom_mods or {}))
Expand Down Expand Up @@ -89,14 +111,16 @@ def read_result(

logger.info("Finished reading msms.txt file")

self.convert_to_internal(mods=parsed_mods)
return filter_valid_prosit_sequences(self.results)
self.convert_to_internal(mods=parsed_mods, ptm_unimod_id=ptm_unimod_id, ptm_sites=ptm_sites)
return self.filter_valid_prosit_sequences()

def convert_to_internal(self, mods: Dict[str, str]):
def convert_to_internal(self, mods: dict[str, str], ptm_unimod_id: int | None, ptm_sites: list[str] | None):
"""
Convert all columns in the MaxQuant output to the internal format used by Oktoberfest.
:param mods: dictionary mapping MaxQuant-specific mod patterns (keys) to ProForma standard (values)
:param ptm_unimod_id: unimod id used for site localization
:param ptm_sites: possible sites that the ptm can exist on
"""
df = self.results
# Standardize column names
Expand All @@ -112,6 +136,21 @@ def convert_to_internal(self, mods: Dict[str, str]):

df["Sequence"] = internal_without_mods(df["Modified sequence"])
df["PEPTIDE_LENGTH"] = df["Sequence"].str.len()
if ptm_unimod_id != 0:

# PTM permutation generation
if ptm_unimod_id == 7:
allow_one_less_modification = True
else:
allow_one_less_modification = False

df["Modified sequence"] = df["Modified sequence"].apply(
add_permutations,
unimod_id=ptm_unimod_id,
residues=ptm_sites,
allow_one_less_modification=allow_one_less_modification,
)
df = df.explode("Modified sequence", ignore_index=True)

df.rename(
columns={
Expand All @@ -128,6 +167,7 @@ def convert_to_internal(self, mods: Dict[str, str]):
},
inplace=True,
)
self.results = df

def generate_internal_timstof_metadata(self):
"""
Expand Down
34 changes: 29 additions & 5 deletions spectrum_io/search_result/msamanda.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
from __future__ import annotations

import logging
from typing import Dict, Optional

import pandas as pd
from spectrum_fundamentals.constants import PARTICLE_MASSES

from .search_results import SearchResults, filter_valid_prosit_sequences, parse_mods
from .search_results import SearchResults, parse_mods

logger = logging.getLogger(__name__)

Expand All @@ -18,7 +20,12 @@ def standard_mods(self):
return {"m": 35, "c": 4}

def read_result(
self, tmt_label: str = "", custom_mods: Optional[Dict[str, int]] = None, suffix: str = "output.csv"
self,
tmt_label: str = "",
custom_mods: dict[str, int] | None = None,
ptm_unimod_id: int | None = 0,
ptm_sites: list[str] | None = None,
suffix: str = "output.csv",
) -> pd.DataFrame:
"""
Function to read a msms txt and perform some basic formatting.
Expand All @@ -28,6 +35,8 @@ def read_result(
If None, static carbamidomethylation of cysteine and variable oxidation of methionine
are mapped automatically. To avoid this, explicitly provide an empty dictionary.
:param suffix: Optional suffix to determine which fileresult files should be taken from the supplied path
:param ptm_unimod_id: unimod id used for site localization
:param ptm_sites: possible sites that the ptm can exist on
:raises FileNotFoundError: If the supplied path is not found
:raises AssertionError: If the supplied path does not contain any files matching the provided suffix.
:raises NotImplementedError: If tmt label was supplied.
Expand Down Expand Up @@ -72,14 +81,29 @@ def read_result(

self.results = pd.concat(df_list)

self.convert_to_internal(mods=parsed_mods)
return filter_valid_prosit_sequences(self.results)
self.convert_to_internal(mods=parsed_mods, ptm_unimod_id=ptm_unimod_id, ptm_sites=ptm_sites)
return self.filter_valid_prosit_sequences()

def filter_valid_prosit_sequences(self):
    """
    Restrict the stored results to entries that Prosit can handle.

    Rows are kept only if the peptide length lies within [7, 30], the modified sequence
    contains no lowercase letters and the precursor charge is at most 6. The number of
    rows is logged before and after filtering.

    :return: the filtered results, which are also stored in self.results
    """
    n_before = len(self.results.index)
    logger.info(f"#sequences before filtering for valid prosit sequences: {n_before}")

    # Prosit only supports peptide lengths within [7, 30]
    length_ok = self.results["PEPTIDE_LENGTH"].between(7, 30)
    self.results = self.results[length_ok]

    # lowercase letters left in the sequence are treated as unsupported modifications
    has_unsupported_mod = self.results["MODIFIED_SEQUENCE"].str.contains(r"[a-z]+", regex=True)
    self.results = self.results[~has_unsupported_mod]

    # precursor charges above 6 are dropped
    charge_ok = self.results["PRECURSOR_CHARGE"].le(6)
    self.results = self.results[charge_ok]

    n_after = len(self.results.index)
    logger.info(f"#sequences after filtering for valid prosit sequences: {n_after}")

    return self.results

def convert_to_internal(self, mods: Dict[str, str]):
def convert_to_internal(self, mods: dict[str, str], ptm_unimod_id: int | None, ptm_sites: list[str] | None):
"""
Convert all columns in the MSAmanda output to the internal format used by Oktoberfest.
:param mods: dictionary mapping MSAmanda-specific mod patterns (keys) to ProForma standard (values)
:param ptm_unimod_id: unimod id used for site localization
:param ptm_sites: possible sites that the ptm can exist on
"""
df = self.results
df["REVERSE"] = df["Protein Accessions"].str.startswith("REV_")
Expand Down
Loading

0 comments on commit d1eeab7

Please sign in to comment.