From 6c71a98fb856a143a830549a77d137e042aa9303 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CKarim?= <“karimwael48@gmail.com”> Date: Mon, 22 Jan 2024 14:22:33 +0100 Subject: [PATCH 1/3] added and rearranged protein column in the end for both sage and mq --- spectrum_io/search_result/maxquant.py | 4 ++++ spectrum_io/search_result/sage.py | 3 ++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/spectrum_io/search_result/maxquant.py b/spectrum_io/search_result/maxquant.py index e165e71..bd1e42d 100644 --- a/spectrum_io/search_result/maxquant.py +++ b/spectrum_io/search_result/maxquant.py @@ -51,6 +51,7 @@ def read_result(path: Union[str, Path], tmt_labeled: str) -> pd.DataFrame: "MASS", # = Calculated Precursor mass; TODO get column with experimental Precursor mass instead "SCORE", "REVERSE", + "PROTEINS", ], sep="\t", ) @@ -106,5 +107,8 @@ def update_columns_for_prosit(df: pd.DataFrame, tmt_labeled: str) -> pd.DataFram df["MODIFIED_SEQUENCE"] = maxquant_to_internal(df["MODIFIED_SEQUENCE"].to_numpy()) df["SEQUENCE"] = internal_without_mods(df["MODIFIED_SEQUENCE"]) df["PEPTIDE_LENGTH"] = df["SEQUENCE"].apply(lambda x: len(x)) + # adding protein column in the end + df['PROTEIN'] = df.pop('PROTEINS') + return df diff --git a/spectrum_io/search_result/sage.py b/spectrum_io/search_result/sage.py index 920a33a..b77754c 100644 --- a/spectrum_io/search_result/sage.py +++ b/spectrum_io/search_result/sage.py @@ -73,6 +73,7 @@ def update_columns_for_prosit(df: pd.DataFrame, tmt_labeled: str) -> pd.DataFram # converting proforma to unimode print(df) df["MODIFIED_SEQUENCE"] = sage_to_internal(df["MODIFIED_SEQUENCE"]) - + #adding protein column in the end + df['PROTEIN'] = df.pop('PROTEINS') print(df.columns) return df From 3cc2395a47a2c782382da1a3a464dea1c77e12a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CKarim?= <“karimwael48@gmail.com”> Date: Mon, 22 Jan 2024 14:33:11 +0100 Subject: [PATCH 2/3] added a minor fix typo PROTEIN to PROTEINS --- 
spectrum_io/search_result/maxquant.py | 2 +- spectrum_io/search_result/sage.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/spectrum_io/search_result/maxquant.py b/spectrum_io/search_result/maxquant.py index bd1e42d..d82fa72 100644 --- a/spectrum_io/search_result/maxquant.py +++ b/spectrum_io/search_result/maxquant.py @@ -108,7 +108,7 @@ def update_columns_for_prosit(df: pd.DataFrame, tmt_labeled: str) -> pd.DataFram df["SEQUENCE"] = internal_without_mods(df["MODIFIED_SEQUENCE"]) df["PEPTIDE_LENGTH"] = df["SEQUENCE"].apply(lambda x: len(x)) # adding protein column in the end - df['PROTEIN'] = df.pop('PROTEINS') + df['PROTEINS'] = df.pop('PROTEINS') return df diff --git a/spectrum_io/search_result/sage.py b/spectrum_io/search_result/sage.py index b77754c..26a430c 100644 --- a/spectrum_io/search_result/sage.py +++ b/spectrum_io/search_result/sage.py @@ -74,6 +74,6 @@ def update_columns_for_prosit(df: pd.DataFrame, tmt_labeled: str) -> pd.DataFram print(df) df["MODIFIED_SEQUENCE"] = sage_to_internal(df["MODIFIED_SEQUENCE"]) #adding protein column in the end - df['PROTEIN'] = df.pop('PROTEINS') + df['PROTEINS'] = df.pop('PROTEINS') print(df.columns) return df From a56a26daa6b2b7023da2e4fbc30657c3a95c9e99 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CKarim?= <“karimwael48@gmail.com”> Date: Mon, 29 Jan 2024 10:06:33 +0100 Subject: [PATCH 3/3] added protein column to maxquant and sage, added a replacement function for the empty protein in mq --- spectrum_io/search_result/maxquant.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/spectrum_io/search_result/maxquant.py b/spectrum_io/search_result/maxquant.py index d82fa72..1f3aa5f 100644 --- a/spectrum_io/search_result/maxquant.py +++ b/spectrum_io/search_result/maxquant.py @@ -63,7 +63,13 @@ def read_result(path: Union[str, Path], tmt_labeled: str) -> pd.DataFrame: df = MaxQuant.update_columns_for_prosit(df, tmt_labeled) return filter_valid_prosit_sequences(df) + # 
a method for replacing missing proteins + @staticmethod + def sanity_check(PROTEINS: pd.Series) -> pd.Series: + return PROTEINS.apply(lambda x: 'missing_protein' if pd.isna(x) else x) + + @staticmethod def update_columns_for_prosit(df: pd.DataFrame, tmt_labeled: str) -> pd.DataFrame: """ @@ -109,6 +115,7 @@ def update_columns_for_prosit(df: pd.DataFrame, tmt_labeled: str) -> pd.DataFram df["PEPTIDE_LENGTH"] = df["SEQUENCE"].apply(lambda x: len(x)) # adding protein column in the end df['PROTEINS'] = df.pop('PROTEINS') - - + # calling the static method to replace unknown proteins + df['PROTEINS'] = MaxQuant.sanity_check(df['PROTEINS']) + return df