Skip to content

Commit

Permalink
Added api for spectrum io
Browse files Browse the repository at this point in the history
  • Loading branch information
victorgiurcoiu committed Aug 16, 2023
1 parent e98c491 commit 89dae69
Show file tree
Hide file tree
Showing 12 changed files with 296 additions and 313 deletions.
1 change: 1 addition & 0 deletions .flake8
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,4 @@ per-file-ignores =
spectrum_io/raw/thermo_raw.py:S603,S404
spectrum_io/raw/msraw.py:S405,S314
docs/conf.py:S404,S607,S603
spectrum_io/search_result/__init__.py:F403
1 change: 0 additions & 1 deletion spectrum_io/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
import time

from . import file, raw
from .search_result import MaxQuant
from .spectral_library import DLib, Spectronaut

CONSOLE_LOG_LEVEL = logging.INFO
Expand Down
4 changes: 1 addition & 3 deletions spectrum_io/search_result/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,2 @@
"""Initialize search result."""
from .mascot import Mascot
from .maxquant import MaxQuant
from .msfragger import MSFragger
from .process import *
29 changes: 29 additions & 0 deletions spectrum_io/search_result/filter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
import logging
import re

import pandas as pd

logger = logging.getLogger(__name__)


def filter_valid_prosit_sequences(df: pd.DataFrame) -> pd.DataFrame:
    """
    Filter valid Prosit sequences.

    A row is kept only if all of the following hold:

    - PEPTIDE_LENGTH lies within [7, 30]
    - MODIFIED_SEQUENCE contains none of the unsupported modification markers
    - SEQUENCE contains neither "U" nor "O"
    - PRECURSOR_CHARGE is at most 6

    Rows whose SEQUENCE or MODIFIED_SEQUENCE is missing are dropped as well, since they
    cannot be checked. The input frame is not modified; the original index labels of the
    surviving rows are preserved.

    :param df: df to filter; must provide the columns PEPTIDE_LENGTH, MODIFIED_SEQUENCE,
        SEQUENCE and PRECURSOR_CHARGE
    :return: df after filtering out unsupported peptides
    """
    logger.info("#sequences before filtering for valid prosit sequences: %d", len(df.index))

    # retain only peptides that fall within [7, 30] length supported by Prosit
    supported_length = (df["PEPTIDE_LENGTH"] <= 30) & (df["PEPTIDE_LENGTH"] >= 7)

    # remove unsupported mods to exclude
    unsupported_mods = [r"Acetyl \(Protein N\-term\)", "ac", r"\[[0-9]+\]"]
    # The pattern is passed as a plain string (not a compiled object) because that is the
    # documented input of str.contains. na=True marks missing values as "to exclude", which
    # also keeps the mask strictly boolean so that it can be negated safely.
    has_unsupported_mod = df["MODIFIED_SEQUENCE"].str.contains("|".join(unsupported_mods), regex=True, na=True)

    # remove non-canonical aas
    has_non_canonical_aa = df["SEQUENCE"].str.contains("U|O", regex=True, na=True)

    # remove precursor charges greater than 6
    supported_charge = df["PRECURSOR_CHARGE"] <= 6

    df = df.loc[supported_length & ~has_unsupported_mod & ~has_non_canonical_aa & supported_charge]
    logger.info("#sequences after filtering for valid prosit sequences: %d", len(df.index))

    return df
160 changes: 77 additions & 83 deletions spectrum_io/search_result/mascot.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,98 +7,92 @@
import spectrum_fundamentals.constants as c
from spectrum_fundamentals.mod_string import internal_without_mods

from .search_results import SearchResults, filter_valid_prosit_sequences
from .filter import filter_valid_prosit_sequences

logger = logging.getLogger(__name__)


class Mascot(SearchResults):
"""Handle search results from Mascot."""
def read_mascot(path: Union[str, Path], tmt_labeled: str) -> pd.DataFrame:
"""
Function to read a mascot msf file and perform some basic formatting.
@staticmethod
def read_result(path: Union[str, Path], tmt_labeled: str) -> pd.DataFrame:
"""
Function to read a mascot msf file and perform some basic formatting.
:param path: path to msms.txt to read
:param tmt_labeled: tmt label as str
:return: pd.DataFrame with the formatted data
"""
logger.info("Reading mascot msf file")
connection = sqlite3.connect(path)
# cursor = connection.cursor()
# cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
df = pd.read_sql("SELECT * FROM MSnSpectrumInfo", connection)[
["SpectrumID", "SpectrumFileName", "RetentionTime", "Mass", "Charge"]
]
df_id_map = pd.read_sql("SELECT * FROM TargetPsmsMSnSpectrumInfo", connection)[
["MSnSpectrumInfoSpectrumID", "TargetPsmsPeptideID"]
]
df = df.merge(df_id_map, left_on="SpectrumID", right_on="MSnSpectrumInfoSpectrumID")
df_target_psms = pd.read_sql("SELECT * FROM TargetPsms", connection)[
["PeptideID", "Sequence", "ModifiedSequence", "Modifications", "XCorr"]
]
df = df.merge(df_target_psms, left_on="TargetPsmsPeptideID", right_on="PeptideID")
df_modif = pd.read_sql("SELECT * FROM TargetPsmsFoundModifications", connection)[
["TargetPsmsPeptideID", "FoundModificationsModificationID", "Position"]
]
df_modif_mass = pd.read_sql("SELECT * FROM FoundModifications", connection)[
["ModificationID", "DeltaMonoisotopicMass"]
]
df_modif = df_modif.merge(df_modif_mass, left_on="FoundModificationsModificationID", right_on="ModificationID")
df = df.merge(df_modif, on="TargetPsmsPeptideID")

:param path: path to msms.txt to read
:param tmt_labeled: tmt label as str
:return: pd.DataFrame with the formatted data
"""
logger.info("Reading mascot msf file")
connection = sqlite3.connect(path)
# cursor = connection.cursor()
# cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
df = pd.read_sql("SELECT * FROM MSnSpectrumInfo", connection)[
["SpectrumID", "SpectrumFileName", "RetentionTime", "Mass", "Charge"]
]
df_id_map = pd.read_sql("SELECT * FROM TargetPsmsMSnSpectrumInfo", connection)[
["MSnSpectrumInfoSpectrumID", "TargetPsmsPeptideID"]
]
df = df.merge(df_id_map, left_on="SpectrumID", right_on="MSnSpectrumInfoSpectrumID")
df_target_psms = pd.read_sql("SELECT * FROM TargetPsms", connection)[
["PeptideID", "Sequence", "ModifiedSequence", "Modifications", "XCorr"]
]
df = df.merge(df_target_psms, left_on="TargetPsmsPeptideID", right_on="PeptideID")
df_modif = pd.read_sql("SELECT * FROM TargetPsmsFoundModifications", connection)[
["TargetPsmsPeptideID", "FoundModificationsModificationID", "Position"]
]
df_modif_mass = pd.read_sql("SELECT * FROM FoundModifications", connection)[
["ModificationID", "DeltaMonoisotopicMass"]
]
df_modif = df_modif.merge(df_modif_mass, left_on="FoundModificationsModificationID", right_on="ModificationID")
df = df.merge(df_modif, on="TargetPsmsPeptideID")
logger.info("Finished reading mascot msf file.")

logger.info("Finished reading mascot msf file.")
df.rename(
columns={
"SpectrumID": "SCAN NUMBER",
"ModifiedSequence": "MODIFIED SEQUENCE",
"Charge": "PRECURSOR CHARGE",
"XCorr": "SCORE",
"SpectrumFileName": "RAW FILE",
},
inplace=True,
)

df.rename(
columns={
"SpectrumID": "SCAN NUMBER",
"ModifiedSequence": "MODIFIED SEQUENCE",
"Charge": "PRECURSOR CHARGE",
"XCorr": "SCORE",
"SpectrumFileName": "RAW FILE",
},
inplace=True,
)
# Standardize column names
df.columns = df.columns.str.upper()
df.columns = df.columns.str.replace(" ", "_")
# TODO reverse
df["REVERSE"] = df["SEQUENCE"].str.contains("Reverse")
logger.info("Converting MSFragger peptide sequence to internal format")
df["RAW_FILE"] = df["RAW_FILE"].str.replace(".raw", "")
df["MODIFICATIONS"] = (df["POSITION"].astype(int) - 1).astype(str) + "$" + df["DELTAMONOISOTOPICMASS"].astype(str)
df = df.groupby("SCAN_NUMBER", as_index=False).apply(lambda x: x.sort_values("POSITION"))
df = df.groupby(
["SCAN_NUMBER", "PRECURSOR_CHARGE", "SCORE", "RAW_FILE", "SEQUENCE", "REVERSE"],
as_index=False,
).agg({"MODIFICATIONS": "|".join})
mod_masses_reverse = {round(float(v), 3): k for k, v in c.MOD_MASSES.items()}

# Standardize column names
df.columns = df.columns.str.upper()
df.columns = df.columns.str.replace(" ", "_")
# TODO reverse
df["REVERSE"] = df["SEQUENCE"].str.contains("Reverse")
logger.info("Converting MSFragger peptide sequence to internal format")
df["RAW_FILE"] = df["RAW_FILE"].str.replace(".raw", "")
df["MODIFICATIONS"] = (
(df["POSITION"].astype(int) - 1).astype(str) + "$" + df["DELTAMONOISOTOPICMASS"].astype(str)
)
df = df.groupby("SCAN_NUMBER", as_index=False).apply(lambda x: x.sort_values("POSITION"))
df = df.groupby(
["SCAN_NUMBER", "PRECURSOR_CHARGE", "SCORE", "RAW_FILE", "SEQUENCE", "REVERSE"],
as_index=False,
).agg({"MODIFICATIONS": "|".join})
mod_masses_reverse = {round(float(v), 3): k for k, v in c.MOD_MASSES.items()}
sequences = []
for _, row in df.iterrows():
modifications = row["MODIFICATIONS"].split("|")
if len(modifications) == 0:
sequences.append(row["SEQUENCE"])
else:
sequence = row["SEQUENCE"]
skip = 0
for mod in modifications:
pos, mass = mod.split("$")
sequence = (
sequence[: int(pos) + 1 + skip]
+ mod_masses_reverse[round(float(mass), 3)]
+ sequence[int(pos) + 1 + skip :]
)
skip = skip + len(mod_masses_reverse[round(float(mass), 3)])
sequences.append(sequence)

sequences = []
for _, row in df.iterrows():
modifications = row["MODIFICATIONS"].split("|")
if len(modifications) == 0:
sequences.append(row["SEQUENCE"])
else:
sequence = row["SEQUENCE"]
skip = 0
for mod in modifications:
pos, mass = mod.split("$")
sequence = (
sequence[: int(pos) + 1 + skip]
+ mod_masses_reverse[round(float(mass), 3)]
+ sequence[int(pos) + 1 + skip :]
)
skip = skip + len(mod_masses_reverse[round(float(mass), 3)])
sequences.append(sequence)
df["MODIFIED_SEQUENCE"] = sequences

df["MODIFIED_SEQUENCE"] = sequences
df["SEQUENCE"] = internal_without_mods(df["MODIFIED_SEQUENCE"])
df["PEPTIDE_LENGTH"] = df["SEQUENCE"].apply(lambda x: len(x))

df["SEQUENCE"] = internal_without_mods(df["MODIFIED_SEQUENCE"])
df["PEPTIDE_LENGTH"] = df["SEQUENCE"].apply(lambda x: len(x))

return filter_valid_prosit_sequences(df)
return filter_valid_prosit_sequences(df)
Loading

0 comments on commit 89dae69

Please sign in to comment.