Skip to content

Commit

Permalink
working on nox-based errors
Browse files Browse the repository at this point in the history
  • Loading branch information
Mostafa Kalhor committed Apr 10, 2024
1 parent 7edebff commit 00e2f8a
Show file tree
Hide file tree
Showing 4 changed files with 113 additions and 90 deletions.
1 change: 0 additions & 1 deletion .gitattributes
Original file line number Diff line number Diff line change
@@ -1,2 +1 @@
* -text

2 changes: 1 addition & 1 deletion spectrum_io/search_result/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,5 @@
from .mascot import Mascot
from .maxquant import MaxQuant
from .msfragger import MSFragger
from .xisearch import Xisearch
from .sage import Sage
from .xisearch import Xisearch
199 changes: 111 additions & 88 deletions spectrum_io/search_result/xisearch.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,17 @@
import glob
import logging
import os
import re

import numpy as np
import pandas as pd
import spectrum_fundamentals.constants as c
from .search_results import SearchResults
import os
import glob
import numpy as np

from .search_results import SearchResults

logger = logging.getLogger(__name__)


class Xisearch(SearchResults):
"""Handle search results from xisearch."""

Expand All @@ -20,75 +22,92 @@ def read_result(self, tmt_labeled: str = "") -> pd.DataFrame:
:param path: path to msms.csv to read
:return: pd.DataFrame with the formatted data
"""

logger.info("Reading msms.csv file")
columns_to_read = ["run_name",
"scan_number",
"precursor_mass",
"precursor_charge",
"crosslinker_name",
"decoy_p1",
"base_sequence_p1",
"aa_len_p1",
"link_pos_p1",
"linked_aa_p1",
"mods_p1",
"mod_pos_p1",
"decoy_p2",
"base_sequence_p2",
"aa_len_p2",
"link_pos_p2",
"linked_aa_p2",
"mods_p2",
"mod_pos_p2",
"linear",
"match_score"]

columns_to_read = [
"run_name",
"scan_number",
"precursor_mass",
"precursor_charge",
"crosslinker_name",
"decoy_p1",
"base_sequence_p1",
"aa_len_p1",
"link_pos_p1",
"linked_aa_p1",
"mods_p1",
"mod_pos_p1",
"decoy_p2",
"base_sequence_p2",
"aa_len_p2",
"link_pos_p2",
"linked_aa_p2",
"mods_p2",
"mod_pos_p2",
"linear",
"match_score",
]

# Initialize path variable
path = self.path

if str(self.path).endswith(".txt"):
path = self.path.with_suffix('.tsv')
df = pd.read_csv(path, sep='\t', usecols= columns_to_read)
path = self.path.with_suffix(".tsv")
df = pd.read_csv(path, sep="\t", usecols=columns_to_read)
logger.info("Finished reading msms.tsv file")
# Standardize column names
df = Xisearch.filter_xisearch_result(df)
df = Xisearch.update_columns_for_prosit(df)
df = Xisearch.filter_valid_prosit_sequences(df)
df.to_csv("/cmnfs/home/m.kalhor/wilhelmlab/spectrum_io/tests/unit_tests/data/xisearch_output_internal.tsv", index=False)
df.to_csv(
"/cmnfs/home/m.kalhor/wilhelmlab/spectrum_io/tests/unit_tests/data/xisearch_output_internal.tsv",
index=False,
)
return df

def filter_xisearch_result(df: pd.DataFrame) -> pd.DataFrame:
    """
    Remove linear matches, non K-K links and peptides with unsupported modifications.

    A row is kept only if all of the following hold:

    - its "linear" flag is not True,
    - "linked_aa_p1" and "linked_aa_p2" are both strings containing "K",
    - "mods_p1" and "mods_p2" are each either missing or a ";"-separated list in which
      every entry is "cm" or "ox".

    :param df: df to filter
    :return: filtered df as pd.DataFrame (a copy; the input is not modified)
    """
    supported_modifications = {"cm", "ox"}

    def links_lysine(value) -> bool:
        # Missing or non-string entries cannot be a K link.
        return isinstance(value, str) and "K" in value

    def has_only_supported_mods(value) -> bool:
        # A missing value means the peptide carries no modification at all.
        if pd.isna(value):
            return True
        # Every listed modification must be supported; this also rejects "dsso-hyd",
        # so no separate substring filter is needed.
        return all(mod in supported_modifications for mod in str(value).split(";"))

    # Build one boolean mask instead of chaining filters. The explicit bool cast keeps
    # the mask a valid row indexer even for empty frames or all-NaN columns, where the
    # .str accessor or an object-typed apply() result would fail or select columns.
    keep = ~df["linear"].eq(True)
    for column in ("linked_aa_p1", "linked_aa_p2"):
        keep &= df[column].map(links_lysine).astype(bool)
    for column in ("mods_p1", "mods_p2"):
        keep &= df[column].map(has_only_supported_mods).astype(bool)

    return df[keep].copy()
def add_mod_sequence(seq_a: str,
seq_b: str,
mod_a: str,
mod_b: str,
crosslinker_position_a: int,
crosslinker_position_b: int,
mod_a_positions: str,
mod_b_positions: str
):

def add_mod_sequence(
seq_a: str,
seq_b: str,
mod_a: str,
mod_b: str,
crosslinker_position_a: int,
crosslinker_position_b: int,
mod_a_positions: str,
mod_b_positions: str,
):
"""
Function adds modification in peptide sequence for xl-prosit
Function adds modification in peptide sequence for xl-prosit
:seq_a: unmodified peptide a
:seq_b: unmodified peptide b
:mod_a: all modifications of pep a
Expand All @@ -100,12 +119,11 @@ def add_mod_sequence(seq_a: str,
:mod_b_positions: position of all modifications of peptide b
:return: modified sequence a and b
"""

split_seq_a = [x for x in seq_a]
split_seq_b = [x for x in seq_b]
mod_a_positions = str(mod_a_positions)
mod_b_positions = str(mod_b_positions)


if mod_a_positions not in ["nan", "null"]:
if ";" in mod_a_positions:
Expand All @@ -117,15 +135,15 @@ def add_mod_sequence(seq_a: str,
if split_mod_a[index] == "cm":
modification = "C[UNIMOD:4]"
pos_mod_a = int(pos_a)
split_seq_a[pos_mod_a-1] = modification
split_seq_a[pos_mod_a - 1] = modification
else:
if mod_a == "ox" :
if mod_a == "ox":
modification = "M[UNIMOD:35]"
if mod_a == "cm" :
if mod_a == "cm":
modification = "C[UNIMOD:4]"
try:
mod_a_positions_float = float(mod_a_positions)
split_seq_a[int(mod_a_positions_float)-1] = modification
split_seq_a[int(mod_a_positions_float) - 1] = modification
except ValueError:
print(f"Error occurred with mod_a_positions value: {mod_a_positions}")

Expand All @@ -139,23 +157,23 @@ def add_mod_sequence(seq_a: str,
if split_mod_b[index] == "cm":
modification = "C[UNIMOD:4]"
pos_mod_b = int(pos_b)
split_seq_b[pos_mod_b-1] = modification
split_seq_b[pos_mod_b - 1] = modification
else:
if mod_b == "ox" :
if mod_b == "ox":
modification = "M[UNIMOD:35]"
if mod_b == "cm" :
if mod_b == "cm":
modification = "C[UNIMOD:4]"
try:
mod_b_positions_float = float(mod_b_positions)
split_seq_b[int(mod_b_positions_float)-1] = modification
split_seq_b[int(mod_b_positions_float) - 1] = modification
except ValueError:
print(f"Error occurred with mod_a_positions value: {mod_b_positions}")

split_seq_a[int(crosslinker_position_a)-1] = "K[UNIMOD:1896]"
split_seq_b[int(crosslinker_position_b)-1] = "K[UNIMOD:1896]"

seq_mod_a = ''.join(split_seq_a)
seq_mod_b = ''.join(split_seq_b)
split_seq_a[int(crosslinker_position_a) - 1] = "K[UNIMOD:1896]"
split_seq_b[int(crosslinker_position_b) - 1] = "K[UNIMOD:1896]"

seq_mod_a = "".join(split_seq_a)
seq_mod_b = "".join(split_seq_b)

return seq_mod_a, seq_mod_b

Expand All @@ -168,8 +186,8 @@ def update_columns_for_prosit(df: pd.DataFrame) -> pd.DataFrame:
:return: modified df as pd.DataFrame
"""

df['decoy'] = df['decoy_p1'] | df['decoy_p2']
df["RAW_FILE"] = df["run_name" ]
df["decoy"] = df["decoy_p1"] | df["decoy_p2"]
df["RAW_FILE"] = df["run_name"]
df["MASS"] = df["precursor_mass"]
df["PRECURSOR_CHARGE"] = df["precursor_charge"]
df["CROSSLINKER_TYPE"] = df["crosslinker_name"]
Expand All @@ -188,23 +206,28 @@ def update_columns_for_prosit(df: pd.DataFrame) -> pd.DataFrame:
df["PEPTIDE_LENGTH_B"] = df["aa_len_p2"]
logger.info("Converting xisearch peptide sequence to internal format")

df['RAW_FILE'] = df['RAW_FILE'].str.replace('.raw', '')
df['Modifications_A'] = df['Modifications_A'].astype('str')
df['Modifications_B'] = df['Modifications_B'].astype('str')

df['CROSSLINKER_POSITION_A'] = df['CROSSLINKER_POSITION_A'].astype('int')
df['CROSSLINKER_POSITION_B'] = df['CROSSLINKER_POSITION_B'].astype('int')


df[['MODIFIED_SEQUENCE_A','MODIFIED_SEQUENCE_B']] = df.apply(lambda row: Xisearch.add_mod_sequence(row['SEQUENCE_A'],
row['SEQUENCE_B'],
row['Modifications_A'],
row['Modifications_B'],
row['CROSSLINKER_POSITION_A'],
row['CROSSLINKER_POSITION_B'],
row['ModificationPositions1'],
row['ModificationPositions2']), axis=1, result_type='expand')

df["RAW_FILE"] = df["RAW_FILE"].str.replace(".raw", "")
df["Modifications_A"] = df["Modifications_A"].astype("str")
df["Modifications_B"] = df["Modifications_B"].astype("str")

df["CROSSLINKER_POSITION_A"] = df["CROSSLINKER_POSITION_A"].astype("int")
df["CROSSLINKER_POSITION_B"] = df["CROSSLINKER_POSITION_B"].astype("int")

df[["MODIFIED_SEQUENCE_A", "MODIFIED_SEQUENCE_B"]] = df.apply(
lambda row: Xisearch.add_mod_sequence(
row["SEQUENCE_A"],
row["SEQUENCE_B"],
row["Modifications_A"],
row["Modifications_B"],
row["CROSSLINKER_POSITION_A"],
row["CROSSLINKER_POSITION_B"],
row["ModificationPositions1"],
row["ModificationPositions2"],
),
axis=1,
result_type="expand",
)

return df

@staticmethod
Expand Down
1 change: 1 addition & 0 deletions tests/unit_tests/test_xisearch.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import unittest
from pathlib import Path

import numpy as np
import pandas as pd

Expand Down

0 comments on commit 00e2f8a

Please sign in to comment.