diff --git a/spectrum_io/search_result/sage.py b/spectrum_io/search_result/sage.py index ccec3a8..920a33a 100644 --- a/spectrum_io/search_result/sage.py +++ b/spectrum_io/search_result/sage.py @@ -26,15 +26,7 @@ def read_result(path: Union[str, Path], tmt_labeled: str = "") -> pd.DataFrame: logger.info("Reading msms.tsv file") df = pd.read_csv( path, - usecols=[ - "filename", - "scannr", - "peptide", - "charge", - "hyperscore", - "calcmass", - "proteins" - ], + usecols=["filename", "scannr", "peptide", "charge", "hyperscore", "calcmass", "proteins"], sep="\t", ) logger.info("Finished reading msms.tsv file") @@ -65,9 +57,9 @@ def update_columns_for_prosit(df: pd.DataFrame, tmt_labeled: str) -> pd.DataFram ) # removing .mzML - df['RAW_FILE'] = df['RAW_FILE'].str.replace(".mzML","",regex=True) - # extracting only the scan number - df['SCAN_NUMBER'] = df['SCAN_NUMBER'].str.split('=').str[3:].str.join('=') + df["RAW_FILE"] = df["RAW_FILE"].str.replace(".mzML", "", regex=True) + # extracting only the scan number + df["SCAN_NUMBER"] = [int(x.rsplit("=", 1)[-1]) for x in df["SCAN_NUMBER"]] # creating a column of decoys and targets df["REVERSE"] = df["PROTEINS"].str.startswith("rev_") # removing modification to create the unmodified sequences diff --git a/tests/unit_tests/test_sage.py b/tests/unit_tests/test_sage.py index 33085e1..d136356 100644 --- a/tests/unit_tests/test_sage.py +++ b/tests/unit_tests/test_sage.py @@ -2,6 +2,7 @@ from pathlib import Path import pandas as pd + from spectrum_io.search_result import Sage @@ -14,8 +15,6 @@ def test_read_sage(self): expected_sage_internal_path = Path(__file__).parent / "data" / "sage_output_internal.csv" internal_search_results_df = Sage.read_result(sage_output_path) + expected_df = pd.read_csv(expected_sage_internal_path, index_col=0) - # execute only once, then remove and test again - internal_search_results_df.to_csv(expected_sage_internal_path) - - self.assertEqual(internal_search_results_df, pd.read_csv(expected_sage_internal_path)) + pd.testing.assert_frame_equal(internal_search_results_df, expected_df)