From 125adee938830bd878c1b81c3690b20e43eb69a7 Mon Sep 17 00:00:00 2001 From: Mostafa Kalhor Date: Wed, 15 Nov 2023 17:00:27 +0000 Subject: [PATCH] cover xisearch and fix issue of mach_peaks are zero --- .../annotation/annotation.py | 58 ++++++++++++++----- .../metrics/fragments_ratio.py | 4 +- spectrum_fundamentals/metrics/percolator.py | 28 +++++++-- spectrum_fundamentals/metrics/similarity.py | 4 +- 4 files changed, 69 insertions(+), 25 deletions(-) diff --git a/spectrum_fundamentals/annotation/annotation.py b/spectrum_fundamentals/annotation/annotation.py index a9ce3bf..834bf96 100644 --- a/spectrum_fundamentals/annotation/annotation.py +++ b/spectrum_fundamentals/annotation/annotation.py @@ -320,6 +320,7 @@ def generate_annotation_matrix( return intensity, mass + def parallel_annotate(spectrum: np.ndarray, index_columns: dict, xl:bool = False) -> Optional[Tuple[np.ndarray, np.ndarray, float, int]]: """ Perform parallel annotation of a spectrum. @@ -368,6 +369,7 @@ def parallel_annotate(spectrum: np.ndarray, index_columns: dict, xl:bool = Fals unmod_sequence_a, spectrum[index_columns["PRECURSOR_CHARGE"]], ) + matched_peaks_b = match_peaks( fragments_meta_data_b, spectrum[index_columns["INTENSITIES"]], @@ -377,23 +379,38 @@ def parallel_annotate(spectrum: np.ndarray, index_columns: dict, xl:bool = Fals spectrum[index_columns["PRECURSOR_CHARGE"]], ) - if len(matched_peaks_a) == 0: - intensity = np.full(348, 0.0) - mass = np.full(348, 0.0) - return intensity_a, mass_a, calc_mass_a, 0 - if len(matched_peaks_b) == 0: - intensity = np.full(348, 0.0) - mass = np.full(348, 0.0) - return intensity_b, mass_b, calc_mass_b, 0 - matched_peaks_a, removed_peaks_a = handle_multiple_matches(matched_peaks_a) - matched_peaks_b, removed_peaks_b = handle_multiple_matches(matched_peaks_b) - intensities_a, mass_a = generate_annotation_matrix_xl( - matched_peaks_a, unmod_sequence_a, spectrum[index_columns["CROSSLINKER_POSITION_A"]] + if len(matched_peaks_a) == 0 and len(matched_peaks_b)==0 : + intensities_a = np.full(348, 0.0) + mass_a = np.full(348, 0.0) + intensities_b = np.full(348, 0.0) + mass_b = np.full(348, 0.0) + return intensities_a, intensities_b, mass_a, mass_b, calc_mass_a, calc_mass_b, 0, 0 + elif len(matched_peaks_a) == 0 and len(matched_peaks_b)!=0 : + intensities_a = np.full(348, 0.0) + mass_a = np.full(348, 0.0) + matched_peaks_b, removed_peaks_b = handle_multiple_matches(matched_peaks_b) + intensities_b, mass_b = generate_annotation_matrix_xl( + matched_peaks_b, unmod_sequence_b, spectrum[index_columns["CROSSLINKER_POSITION_B"]] ) - intensities_b, mass_b = generate_annotation_matrix_xl( - matched_peaks_b, unmod_sequence_b, spectrum[index_columns["CROSSLINKER_POSITION_B"]] + return intensities_a, intensities_b, mass_a, mass_b, calc_mass_a, calc_mass_b, 0, removed_peaks_b + elif len(matched_peaks_a) != 0 and len(matched_peaks_b)==0 : + intensities_b = np.full(348, 0.0) + mass_b = np.full(348, 0.0) + matched_peaks_a, removed_peaks_a = handle_multiple_matches(matched_peaks_a) + intensities_a, mass_a = generate_annotation_matrix_xl( + matched_peaks_a, unmod_sequence_a, spectrum[index_columns["CROSSLINKER_POSITION_A"]] ) - return intensities_a, intensities_b, mass_a, mass_b, calc_mass_a, calc_mass_b, removed_peaks_a, removed_peaks_b + return intensities_a, intensities_b, mass_a, mass_b, calc_mass_a, calc_mass_b, removed_peaks_a, 0 + else: + matched_peaks_a, removed_peaks_a = handle_multiple_matches(matched_peaks_a) + matched_peaks_b, removed_peaks_b = handle_multiple_matches(matched_peaks_b) + intensities_a, mass_a = generate_annotation_matrix_xl( + matched_peaks_a, unmod_sequence_a, spectrum[index_columns["CROSSLINKER_POSITION_A"]] + ) + intensities_b, mass_b = generate_annotation_matrix_xl( + matched_peaks_b, unmod_sequence_b, spectrum[index_columns["CROSSLINKER_POSITION_B"]] + ) + return intensities_a, intensities_b, mass_a, mass_b, calc_mass_a, calc_mass_b, removed_peaks_a, removed_peaks_b else: @@ -425,4 +442,13 @@ def parallel_annotate(spectrum: np.ndarray, index_columns: dict, xl:bool = Fals matched_peaks, unmod_sequence, spectrum[index_columns["PRECURSOR_CHARGE"]] ) return intensities, mass, calc_mass, removed_peaks - \ No newline at end of file + + + + + + + + + + diff --git a/spectrum_fundamentals/metrics/fragments_ratio.py b/spectrum_fundamentals/metrics/fragments_ratio.py index a8176f2..4113e19 100644 --- a/spectrum_fundamentals/metrics/fragments_ratio.py +++ b/spectrum_fundamentals/metrics/fragments_ratio.py @@ -50,7 +50,7 @@ def count_with_ion_mask( config = Config() config.read(CONFIG_PATH) - if any(config.search_type.lower() == s.lower() for s in ["plink2", "xlinkx"]): + if any(config.search_type.lower() == s.lower() for s in ["plink2", "xlinkx","xisearch"]): if len(ion_mask) == 0: ion_mask = scipy.sparse.csr_matrix(np.ones((348, 1))) @@ -147,7 +147,7 @@ def calc(self): """Adds columns with count, fraction and fraction_predicted features to metrics_val dataframe.""" config = Config() config.read(CONFIG_PATH) - if any(config.search_type.lower() == s.lower() for s in ["plink2", "xlinkx"]): + if any(config.search_type.lower() == s.lower() for s in ["plink2", "xlinkx","xisearch"]): true_intensities_a = self.true_intensities[:,0:348] true_intensities_b = self.true_intensities[:,348:] pred_intensities_a = self.pred_intensities[:,0:348] diff --git a/spectrum_fundamentals/metrics/percolator.py b/spectrum_fundamentals/metrics/percolator.py index e3bcf35..1b2153c 100644 --- a/spectrum_fundamentals/metrics/percolator.py +++ b/spectrum_fundamentals/metrics/percolator.py @@ -291,7 +291,7 @@ def get_target_decoy_label(reverse: bool): def add_common_features(self): """Add features used by both Andromeda and Prosit feature scoring sets.""" - if any(self.config.search_type.lower() == s.lower() for s in ["plink2", "xlinkx"]): + if any(self.config.search_type.lower() == s.lower() for s in ["plink2", "xlinkx","xisearch"]): self.metrics_val["missedCleavages_A"] = self.metadata["SEQUENCE_A"].apply(Percolator.count_missed_cleavages) self.metrics_val["missedCleavages_B"] = self.metadata["SEQUENCE_B"].apply(Percolator.count_missed_cleavages) self.metrics_val["KR_A"] = self.metadata["SEQUENCE_A"].apply(Percolator.count_arginines_and_lysines) @@ -319,12 +319,15 @@ def add_common_features(self): def add_percolator_metadata_columns(self): """Add metadata columns needed by percolator, e.g. to identify a PSM.""" - if any(self.config.search_type.lower() == s.lower() for s in ["plink2", "xlinkx"]): + if any(self.config.search_type.lower() == s.lower() for s in ["plink2", "xlinkx","xisearch"]): spec_id_cols = ["RAW_FILE", "SCAN_NUMBER", "MODIFIED_SEQUENCE_A","MODIFIED_SEQUENCE_B", "PRECURSOR_CHARGE"] self.metrics_val["Peptide"] = (self.metadata["MODIFIED_SEQUENCE_A"]+ "_" + self.metadata["MODIFIED_SEQUENCE_B"]).apply(lambda x: "_." + x + "._") self.metrics_val[self.prot_col_name] = (self.metadata["MODIFIED_SEQUENCE_A"]+ "_" + self.metadata["MODIFIED_SEQUENCE_B"]) + #self.metrics_val["label_pep_a"] = self.metadata["label_pep_a"] + #self.metrics_val["label_pep_b"] = self.metadata["label_pep_b"] + self.metrics_val["Label"] = self.target_decoy_labels else: spec_id_cols = ["RAW_FILE", "SCAN_NUMBER", "MODIFIED_SEQUENCE", "PRECURSOR_CHARGE"] self.metrics_val["Peptide"] = self.metadata["MODIFIED_SEQUENCE"].apply(lambda x: "_." + x + "._") @@ -380,9 +383,8 @@ def get_indices_below_fdr(self, feature_name: str, fdr_cutoff: float = 0.01) -> # scores_df['Sequence'] = self.metadata['SEQUENCE'] scores_df = scores_df.sort_values(feature_name, ascending=False) logger.debug(scores_df.head(100)) - + scores_df["fdr"] = Percolator.calculate_fdrs(scores_df["Label"]) - # filter for targets only scores_df = scores_df[scores_df["Label"] == TargetDecoyLabel.TARGET] @@ -413,6 +415,22 @@ def calculate_fdrs(sorted_labels: Union[pd.Series, np.ndarray]) -> np.ndarray: cumulative_decoy_count = np.cumsum(sorted_labels == TargetDecoyLabel.DECOY) + 1 cumulative_target_count = np.cumsum(sorted_labels == TargetDecoyLabel.TARGET) + 1 return Percolator.fdrs_to_qvals(cumulative_decoy_count / cumulative_target_count) + + #def calculate_fdrs_xl(sorted_labels: Union[pd.Series, np.ndarray]) -> np.ndarray: + """ + Calculate FDR. + + :param sorted_labels: array with labels sorted (target, decoy) + :return: array with calculated FDRs + """ + #if isinstance(sorted_labels, pd.Series): + #sorted_labels = sorted_labels.to_numpy() + #cumulative_decoy_count_1 = np.cumsum(sorted_labels == TargetDecoyLabel.DECOY) + 1 + #cumulative_decoy_count_2 = np.cumsum(sorted_labels == TargetDecoyLabel.DECOY) + 1 + #cumulative_target_count = np.cumsum(sorted_labels == TargetDecoyLabel.TARGET) + 1 + #return Percolator.fdrs_to_qvals(cumulative_decoy_count_1 - cumulative_decoy_count_2) / (cumulative_target_count) + + @staticmethod def fdrs_to_qvals(fdrs: np.ndarray) -> np.ndarray: @@ -465,7 +483,7 @@ def calc(self): if current_fdr >= 0.1: lda_failed = True break - if any(self.config.search_type.lower() == s.lower() for s in ["plink2", "xlinkx"]): + if any(self.config.search_type.lower() == s.lower() for s in ["plink2", "xlinkx","xisearch"]): self.metrics_val["collision_energy_aligned"] = self.metadata["COLLISION_ENERGY"] / 100.0 else: if lda_failed: diff --git a/spectrum_fundamentals/metrics/similarity.py b/spectrum_fundamentals/metrics/similarity.py index 0bf17d5..334f592 100644 --- a/spectrum_fundamentals/metrics/similarity.py +++ b/spectrum_fundamentals/metrics/similarity.py @@ -61,7 +61,7 @@ def spectral_angle( """ config = Config() config.read(CONFIG_PATH) - if any(config.search_type.lower() == s.lower() for s in ["plink2", "xlinkx"]): + if any(config.search_type.lower() == s.lower() for s in ["plink2", "xlinkx", "xisearch"]): if charge != 0: if charge == 1: boolean_array = constants.SINGLE_CHARGED_MASK_XL @@ -435,7 +435,7 @@ def calc(self, all_features: bool): """ config = Config() config.read(CONFIG_PATH) - if any(config.search_type.lower() == s.lower() for s in ["plink2", "xlinkx"]): + if any(config.search_type.lower() == s.lower() for s in ["plink2", "xlinkx","xisearch"]): true_intensities_a = self.true_intensities[:,0:348] true_intensities_b = self.true_intensities[:,348:] pred_intensities_a = self.pred_intensities[:,0:348]