Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

use custom columns as input features for percolator #122

Merged
merged 8 commits into from
Jul 31, 2024
47 changes: 47 additions & 0 deletions spectrum_fundamentals/metrics/percolator.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,14 +57,40 @@ def __init__(
all_features_flag: bool = False,
regression_method: str = "lowess",
fdr_cutoff: float = 0.01,
additional_columns: Optional[Union[str, list]] = None,
):
"""Initialize a Percolator obj."""
self.metadata = metadata
self.input_type = input_type
self.all_features_flag = all_features_flag
self.additional_columns = additional_columns
self.regression_method = regression_method
self.fdr_cutoff = fdr_cutoff
self.xl = "CROSSLINKER_TYPE" in self.metadata.columns
self.base_columns = [
"raw_file",
"scan_number",
"modified_sequence",
"precursor_charge",
"scan_event_number",
"mass",
"score",
"reverse",
"sequence",
"peptide_length",
"fragmentation",
"calculated_mass",
"sequence_a",
"sequence_b",
"modified_sequence_a",
"modified_sequence_b",
"retention_time",
"predicted_irt",
"instrument_types",
"mass_analyzer",
"mz_range",
"collision_energy",
]

super().__init__(pred_intensities, true_intensities, mz)

Expand Down Expand Up @@ -274,6 +300,24 @@ def add_common_features(self):
self.metrics_val["HCD"] = (self.metadata["FRAGMENTATION"] == "HCD").astype(int)
self.metrics_val["CID"] = (self.metadata["FRAGMENTATION"] == "CID").astype(int)

def add_additional_features(self):
    """
    Add additional features from custom search results if specified.

    Which metadata columns are used depends on ``self.additional_columns``:

    - a list: exactly the listed columns are used.
    - the string "all" (case-insensitive): every metadata column whose lower-cased
      name is not in ``self.base_columns`` and does not start with "unnamed".
    - anything else (e.g. None or any other string): no columns are added.

    Every selected column is copied from ``self.metadata`` into ``self.metrics_val``.

    :raises ValueError: if a selected column is missing from the metadata or is not of a
        numerical dtype
    """
    feature_cols = []

    if isinstance(self.additional_columns, list):
        feature_cols = self.additional_columns
    elif isinstance(self.additional_columns, str) and self.additional_columns.lower() == "all":
        # Build the lookup set once instead of once per metadata column.
        base_columns = set(self.base_columns)
        for col in self.metadata.columns:
            # Column labels are not guaranteed to be strings (e.g. integer labels),
            # so normalise through str() before the case-insensitive comparison.
            col_name = str(col).lower()
            if col_name in base_columns:
                continue
            if col_name.startswith("unnamed"):  # remove Unnamed cols
                continue
            feature_cols.append(col)

    for col in feature_cols:
        if col not in self.metadata.columns:
            raise ValueError(f"provided column: {col} cannot be found in search results.")
        if not pd.api.types.is_numeric_dtype(self.metadata[col]):
            raise ValueError(f"wrong datatype for column: {col}, datatype must be numerical.")
        self.metrics_val[col] = self.metadata[col]

def add_percolator_metadata_columns(self):
"""Add metadata columns needed by percolator, e.g. to identify a PSM."""
if self.xl:
Expand Down Expand Up @@ -405,6 +449,8 @@ def calc(self):
np.random.seed(1)
# add Prosit or Andromeda features
if self.input_type == "rescore":
# add additional features
self.add_additional_features()
fragments_ratio = fr.FragmentsRatio(self.pred_intensities, self.true_intensities)
fragments_ratio.calc(xl=self.xl)
similarity = sim.SimilarityMetrics(self.pred_intensities, self.true_intensities, self.mz)
Expand Down Expand Up @@ -457,6 +503,7 @@ def calc(self):
)

else:
self.add_additional_features()
self.metrics_val["andromeda"] = self.metadata["SCORE"]

self.add_percolator_metadata_columns()
Expand Down
35 changes: 35 additions & 0 deletions tests/unit_tests/test_percolator.py
Original file line number Diff line number Diff line change
Expand Up @@ -315,6 +315,41 @@ def test_get_delta_score(self):
perc.Percolator.get_delta_score(df, "spectral_angle"), np.array([20, 40, 0, 40, 250, 0])
)

def test_add_additional_features(self):
    """Check which columns add_additional_features copies for "all", an explicit list and "none"."""
    column_dtypes = {
        "RAW_FILE": str,
        "SCAN_NUMBER": int,
        "MODIFIED_SEQUENCE": str,
        "PRECURSOR_CHARGE": int,
        "SCAN_EVENT_NUMBER": int,
        "MASS": float,
        "SCORE": float,
        "REVERSE": int,
        "SEQUENCE": str,
        "PEPTIDE_LENGTH": float,
        "A": float,
        "B": float,
        "precursor_charge": float,
        "Unnamed 1": int,
    }

    # Empty search-result table that only carries the column names and dtypes.
    perc_input = pd.DataFrame(columns=column_dtypes).astype(column_dtypes)

    # (value passed as additional_columns, metrics_val expected afterwards)
    cases = [
        ("all", pd.DataFrame(columns=["A", "B"]).astype({"A": float, "B": float})),
        (["A"], pd.DataFrame(columns=["A"]).astype({"A": float})),
        ("none", pd.DataFrame()),
    ]

    for additional_columns, expected in cases:
        percolator = perc.Percolator(
            metadata=perc_input, input_type="rescore", additional_columns=additional_columns
        )
        percolator.add_additional_features()
        pd.testing.assert_frame_equal(expected, percolator.metrics_val)

def test_calc(self):
"""Test calc."""
perc_input = pd.read_csv(Path(__file__).parent / "data/perc_input.csv")
Expand Down
Loading