From 00e2f8affe99cb2b630edaa7e1209b836494754e Mon Sep 17 00:00:00 2001 From: Mostafa Kalhor Date: Wed, 10 Apr 2024 09:34:18 +0200 Subject: [PATCH] working on nox-based errors --- .gitattributes | 1 - spectrum_io/search_result/__init__.py | 2 +- spectrum_io/search_result/xisearch.py | 199 ++++++++++++++------------ tests/unit_tests/test_xisearch.py | 1 + 4 files changed, 113 insertions(+), 90 deletions(-) diff --git a/.gitattributes b/.gitattributes index 1135ec5..fa1385d 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,2 +1 @@ * -text - diff --git a/spectrum_io/search_result/__init__.py b/spectrum_io/search_result/__init__.py index 94b7b33..1da55b4 100644 --- a/spectrum_io/search_result/__init__.py +++ b/spectrum_io/search_result/__init__.py @@ -3,5 +3,5 @@ from .mascot import Mascot from .maxquant import MaxQuant from .msfragger import MSFragger -from .xisearch import Xisearch from .sage import Sage +from .xisearch import Xisearch diff --git a/spectrum_io/search_result/xisearch.py b/spectrum_io/search_result/xisearch.py index b392c8c..83abb8e 100644 --- a/spectrum_io/search_result/xisearch.py +++ b/spectrum_io/search_result/xisearch.py @@ -1,15 +1,17 @@ +import glob import logging +import os import re + +import numpy as np import pandas as pd import spectrum_fundamentals.constants as c -from .search_results import SearchResults -import os -import glob -import numpy as np +from .search_results import SearchResults logger = logging.getLogger(__name__) + class Xisearch(SearchResults): """Handle search results from xisearch.""" @@ -20,75 +22,92 @@ def read_result(self, tmt_labeled: str = "") -> pd.DataFrame: :param path: path to msms.csv to read :return: pd.DataFrame with the formatted data """ - + logger.info("Reading msms.csv file") - columns_to_read = ["run_name", - "scan_number", - "precursor_mass", - "precursor_charge", - "crosslinker_name", - "decoy_p1", - "base_sequence_p1", - "aa_len_p1", - "link_pos_p1", - "linked_aa_p1", - "mods_p1", - "mod_pos_p1", - "decoy_p2", - "base_sequence_p2", - "aa_len_p2", - "link_pos_p2", - "linked_aa_p2", - "mods_p2", - "mod_pos_p2", - "linear", - "match_score"] - + columns_to_read = [ + "run_name", + "scan_number", + "precursor_mass", + "precursor_charge", + "crosslinker_name", + "decoy_p1", + "base_sequence_p1", + "aa_len_p1", + "link_pos_p1", + "linked_aa_p1", + "mods_p1", + "mod_pos_p1", + "decoy_p2", + "base_sequence_p2", + "aa_len_p2", + "link_pos_p2", + "linked_aa_p2", + "mods_p2", + "mod_pos_p2", + "linear", + "match_score", + ] + # Initialize path variable path = self.path if str(self.path).endswith(".txt"): - path = self.path.with_suffix('.tsv') - df = pd.read_csv(path, sep='\t', usecols= columns_to_read) + path = self.path.with_suffix(".tsv") + df = pd.read_csv(path, sep="\t", usecols=columns_to_read) logger.info("Finished reading msms.tsv file") # Standardize column names df = Xisearch.filter_xisearch_result(df) df = Xisearch.update_columns_for_prosit(df) df = Xisearch.filter_valid_prosit_sequences(df) - df.to_csv("/cmnfs/home/m.kalhor/wilhelmlab/spectrum_io/tests/unit_tests/data/xisearch_output_internal.tsv", index=False) + df.to_csv( + "/cmnfs/home/m.kalhor/wilhelmlab/spectrum_io/tests/unit_tests/data/xisearch_output_internal.tsv", + index=False, + ) return df - def filter_xisearch_result (df: pd.DataFrame) -> pd.DataFrame: + def filter_xisearch_result(df: pd.DataFrame) -> pd.DataFrame: """ remove unsupported modifications and keep only k-k as linked amino acid . :param df: df to filter :return: filtered df as pd.DataFrame - """ - df = df[df['linear'] != True] - df = df[df['linked_aa_p1'].notna() & df['linked_aa_p1'].str.contains('K')] - df = df[df['linked_aa_p2'].notna() & df['linked_aa_p2'].str.contains('K')] - df = df[~df['mods_p1'].str.contains('dsso-hyd', na=False)] - df = df[~df['mods_p2'].str.contains('dsso-hyd', na=False)] - valid_modifications = ['cm', 'ox', pd.NA] - df = df[df['mods_p1'].apply(lambda x: any(mod in str(x).split(';') if pd.notnull(x) else mod is pd.NA for mod in valid_modifications))] - df = df[df['mods_p2'].apply(lambda x: any(mod in str(x).split(';') if pd.notnull(x) else mod is pd.NA for mod in valid_modifications))] - + """ + df = df[df["linear"] != True] + df = df[df["linked_aa_p1"].notna() & df["linked_aa_p1"].str.contains("K")] + df = df[df["linked_aa_p2"].notna() & df["linked_aa_p2"].str.contains("K")] + df = df[~df["mods_p1"].str.contains("dsso-hyd", na=False)] + df = df[~df["mods_p2"].str.contains("dsso-hyd", na=False)] + valid_modifications = ["cm", "ox", pd.NA] + df = df[ + df["mods_p1"].apply( + lambda x: any( + mod in str(x).split(";") if pd.notnull(x) else mod is pd.NA for mod in valid_modifications + ) + ) + ] + df = df[ + df["mods_p2"].apply( + lambda x: any( + mod in str(x).split(";") if pd.notnull(x) else mod is pd.NA for mod in valid_modifications + ) + ) + ] + return df - - def add_mod_sequence(seq_a: str, - seq_b: str, - mod_a: str, - mod_b: str, - crosslinker_position_a: int, - crosslinker_position_b: int, - mod_a_positions: str, - mod_b_positions: str - ): - + + def add_mod_sequence( + seq_a: str, + seq_b: str, + mod_a: str, + mod_b: str, + crosslinker_position_a: int, + crosslinker_position_b: int, + mod_a_positions: str, + mod_b_positions: str, + ): """ - Function adds modification in peptide sequence for xl-prosit - + Function adds modification in peptide sequence for xl-prosit + :seq_a: unmodified peptide a :seq_b: unmodified peptide b :mod_a: all modifications of pep a @@ -100,12 +119,11 @@ def add_mod_sequence(seq_a: str, :mod_b_positions: position of all modifications of peptide b :return: modified sequence a and b """ - + split_seq_a = [x for x in seq_a] split_seq_b = [x for x in seq_b] mod_a_positions = str(mod_a_positions) mod_b_positions = str(mod_b_positions) - if mod_a_positions not in ["nan", "null"]: if ";" in mod_a_positions: @@ -117,15 +135,15 @@ def add_mod_sequence(seq_a: str, if split_mod_a[index] == "cm": modification = "C[UNIMOD:4]" pos_mod_a = int(pos_a) - split_seq_a[pos_mod_a-1] = modification + split_seq_a[pos_mod_a - 1] = modification else: - if mod_a == "ox" : + if mod_a == "ox": modification = "M[UNIMOD:35]" - if mod_a == "cm" : + if mod_a == "cm": modification = "C[UNIMOD:4]" try: mod_a_positions_float = float(mod_a_positions) - split_seq_a[int(mod_a_positions_float)-1] = modification + split_seq_a[int(mod_a_positions_float) - 1] = modification except ValueError: print(f"Error occurred with mod_a_positions value: {mod_a_positions}") @@ -139,23 +157,23 @@ def add_mod_sequence(seq_a: str, if split_mod_b[index] == "cm": modification = "C[UNIMOD:4]" pos_mod_b = int(pos_b) - split_seq_b[pos_mod_b-1] = modification + split_seq_b[pos_mod_b - 1] = modification else: - if mod_b == "ox" : + if mod_b == "ox": modification = "M[UNIMOD:35]" - if mod_b == "cm" : + if mod_b == "cm": modification = "C[UNIMOD:4]" try: mod_b_positions_float = float(mod_b_positions) - split_seq_b[int(mod_b_positions_float)-1] = modification + split_seq_b[int(mod_b_positions_float) - 1] = modification except ValueError: print(f"Error occurred with mod_a_positions value: {mod_b_positions}") - - split_seq_a[int(crosslinker_position_a)-1] = "K[UNIMOD:1896]" - split_seq_b[int(crosslinker_position_b)-1] = "K[UNIMOD:1896]" - seq_mod_a = ''.join(split_seq_a) - seq_mod_b = ''.join(split_seq_b) + split_seq_a[int(crosslinker_position_a) - 1] = "K[UNIMOD:1896]" + split_seq_b[int(crosslinker_position_b) - 1] = "K[UNIMOD:1896]" + + seq_mod_a = "".join(split_seq_a) + seq_mod_b = "".join(split_seq_b) return seq_mod_a, seq_mod_b @@ -168,8 +186,8 @@ def update_columns_for_prosit(df: pd.DataFrame) -> pd.DataFrame: :return: modified df as pd.DataFrame """ - df['decoy'] = df['decoy_p1'] | df['decoy_p2'] - df["RAW_FILE"] = df["run_name" ] + df["decoy"] = df["decoy_p1"] | df["decoy_p2"] + df["RAW_FILE"] = df["run_name"] df["MASS"] = df["precursor_mass"] df["PRECURSOR_CHARGE"] = df["precursor_charge"] df["CROSSLINKER_TYPE"] = df["crosslinker_name"] @@ -188,23 +206,28 @@ def update_columns_for_prosit(df: pd.DataFrame) -> pd.DataFrame: df["PEPTIDE_LENGTH_B"] = df["aa_len_p2"] logger.info("Converting xisearch peptide sequence to internal format") - df['RAW_FILE'] = df['RAW_FILE'].str.replace('.raw', '') - df['Modifications_A'] = df['Modifications_A'].astype('str') - df['Modifications_B'] = df['Modifications_B'].astype('str') - - df['CROSSLINKER_POSITION_A'] = df['CROSSLINKER_POSITION_A'].astype('int') - df['CROSSLINKER_POSITION_B'] = df['CROSSLINKER_POSITION_B'].astype('int') - - - df[['MODIFIED_SEQUENCE_A','MODIFIED_SEQUENCE_B']] = df.apply(lambda row: Xisearch.add_mod_sequence(row['SEQUENCE_A'], - row['SEQUENCE_B'], - row['Modifications_A'], - row['Modifications_B'], - row['CROSSLINKER_POSITION_A'], - row['CROSSLINKER_POSITION_B'], - row['ModificationPositions1'], - row['ModificationPositions2']), axis=1, result_type='expand') - + df["RAW_FILE"] = df["RAW_FILE"].str.replace(".raw", "") + df["Modifications_A"] = df["Modifications_A"].astype("str") + df["Modifications_B"] = df["Modifications_B"].astype("str") + + df["CROSSLINKER_POSITION_A"] = df["CROSSLINKER_POSITION_A"].astype("int") + df["CROSSLINKER_POSITION_B"] = df["CROSSLINKER_POSITION_B"].astype("int") + + df[["MODIFIED_SEQUENCE_A", "MODIFIED_SEQUENCE_B"]] = df.apply( + lambda row: Xisearch.add_mod_sequence( + row["SEQUENCE_A"], + row["SEQUENCE_B"], + row["Modifications_A"], + row["Modifications_B"], + row["CROSSLINKER_POSITION_A"], + row["CROSSLINKER_POSITION_B"], + row["ModificationPositions1"], + row["ModificationPositions2"], + ), + axis=1, + result_type="expand", + ) + return df @staticmethod diff --git a/tests/unit_tests/test_xisearch.py b/tests/unit_tests/test_xisearch.py index f57b78f..e45c23e 100644 --- a/tests/unit_tests/test_xisearch.py +++ b/tests/unit_tests/test_xisearch.py @@ -1,5 +1,6 @@ import unittest from pathlib import Path + import numpy as np import pandas as pd