Skip to content

Commit

Permalink
Merge branch 'development' of github.com:wilhelm-lab/spectrum_fundamentals into development
Browse files Browse the repository at this point in the history
  • Loading branch information
picciama committed Sep 19, 2023
2 parents a4fd363 + 9518361 commit 86695a3
Show file tree
Hide file tree
Showing 3 changed files with 9 additions and 40 deletions.
36 changes: 5 additions & 31 deletions spectrum_fundamentals/metrics/percolator.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,5 @@
import enum
import hashlib
import logging
import re
import subprocess
from typing import Optional, Tuple, Union

import numpy as np
Expand Down Expand Up @@ -60,26 +57,15 @@ def __init__(
all_features_flag: bool = False,
regression_method: str = "lowess",
fdr_cutoff: float = 0.01,
percolator_version: Optional[float] = 3.05,
):
"""Initialize a Percolator obj."""
self.metadata = metadata
self.input_type = input_type
self.all_features_flag = all_features_flag
self.regression_method = regression_method
self.fdr_cutoff = fdr_cutoff

self._resolve_percolator_compatibility(percolator_version)
super().__init__(pred_intensities, true_intensities, mz)

def _resolve_percolator_compatibility(self, percolator_version: Optional[float] = None):
    """
    Choose the protein column name that matches the targeted Percolator release.

    Sets ``self.prot_col_name`` to "Proteins" for version 3.06 and newer and to "Protein" otherwise.
    If no version is given, ``percolator -h`` is executed and the version is read from the first
    line written to stderr.

    :param percolator_version: version as a major.minor float (e.g. 3.05); None triggers detection
        by calling the ``percolator`` executable
    :raises RuntimeError: if no version can be read from the output of ``percolator -h``
    """
    if percolator_version is not None:
        # Explicit values keep the historic numeric comparison (a float cannot tell 3.1 from 3.10).
        uses_plural_column = percolator_version >= 3.06
    else:
        result = subprocess.run(["percolator", "-h"], capture_output=True, text=True)
        stderr_lines = result.stderr.splitlines()
        # Guard against empty stderr instead of failing with a bare IndexError.
        banner = stderr_lines[0].strip() if stderr_lines else ""
        # Compare integer (major, minor) pairs so that e.g. 3.10 is ordered after 3.06.
        uses_plural_column = self._parse_percolator_version(banner) >= (3, 6)
    self.prot_col_name = "Proteins" if uses_plural_column else "Protein"

@staticmethod
def _parse_percolator_version(banner: str) -> Tuple[int, int]:
    """
    Extract the (major, minor) version from a Percolator banner line.

    :param banner: text containing "version <major>.<minor>", optionally followed by more components
    :raises RuntimeError: if the text contains no such version string
    :return: tuple of (major, minor) as integers
    """
    match = re.search(r"version\s+(\d+)\.(\d+)", banner)
    if match is None:
        raise RuntimeError(f"Could not determine the percolator version from: {banner!r}")
    return int(match.group(1)), int(match.group(2))

@staticmethod
def sample_balanced_over_bins(retention_time_df: pd.DataFrame, sample_size: int = 5000) -> pd.Index:
"""
Expand Down Expand Up @@ -170,18 +156,6 @@ def get_aligned_predicted_retention_times(

return aligned_rts_predicted

@staticmethod
def get_scannr(metadata_subset: Union[pd.Series, Tuple[str, int]]) -> int:
    """
    Derive an integer scan id for percolator from a raw file name and a scan number.

    Both values are concatenated, hashed with SHA-224, and the first six hex digits of the digest are
    read as an integer, so the result lies between 0 and 16**6 - 1.

    :param metadata_subset: two-element sequence of (raw_file, scan_number)
    :return: integer built from the leading six hex digits of the digest
    """
    raw_file, scan_number = metadata_subset
    key = "{}{}".format(raw_file, scan_number)
    digest = hashlib.sha224(key.encode()).hexdigest()
    # Only the leading 6 hex characters (24 bits) of the digest are kept.
    return int(digest[:6], 16)

@staticmethod
def get_delta_score(scores_df: pd.DataFrame, scoring_feature: str) -> np.ndarray:
"""
Expand Down Expand Up @@ -296,11 +270,11 @@ def add_percolator_metadata_columns(self):
spec_id_cols.append("SCAN_EVENT_NUMBER")
self.metrics_val["SpecId"] = self.metadata[spec_id_cols].apply(Percolator.get_specid, axis=1)
self.metrics_val["Label"] = self.target_decoy_labels
self.metrics_val["ScanNr"] = self.metadata[["RAW_FILE", "SCAN_NUMBER"]].apply(Percolator.get_scannr, axis=1)

self.metrics_val["ScanNr"] = self.metadata["SCAN_NUMBER"]
self.metrics_val["filename"] = self.metadata["RAW_FILE"]
self.metrics_val["Peptide"] = self.metadata["MODIFIED_SEQUENCE"].apply(lambda x: "_." + x + "._")

self.metrics_val[self.prot_col_name] = self.metadata[
self.metrics_val["Proteins"] = self.metadata[
"MODIFIED_SEQUENCE"
] # we don't need the protein ID to get PSM / peptide results, fill with peptide sequence

Expand Down Expand Up @@ -397,8 +371,8 @@ def fdrs_to_qvals(fdrs: np.ndarray) -> np.ndarray:

def _reorder_columns_for_percolator(self):
all_columns = self.metrics_val.columns
first_columns = ["SpecId", "Label", "ScanNr"]
last_columns = ["Peptide", "Protein"] if "Protein" in all_columns else ["Peptide", "Proteins"]
first_columns = ["SpecId", "Label", "ScanNr", "filename"]
last_columns = ["Peptide", "Proteins"]
mid_columns = list(set(all_columns) - set(first_columns) - set(last_columns))
new_columns = first_columns + sorted(mid_columns) + last_columns
self.metrics_val = self.metrics_val[new_columns]
Expand Down
2 changes: 1 addition & 1 deletion tests/unit_tests/data/perc_output.csv
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
,SpecId,Label,ScanNr,CID,Charge1,Charge2,Charge3,Charge4,Charge5,Charge6,HCD,KR,Mass,RT,UnknownFragmentationMethod,abs_rt_diff,collision_energy_aligned,count_not_observed_and_not_predicted,count_not_observed_and_not_predicted_b,count_not_observed_and_not_predicted_y,count_not_observed_but_predicted,count_not_observed_but_predicted_b,count_not_observed_but_predicted_y,count_observed,count_observed_and_predicted,count_observed_and_predicted_b,count_observed_and_predicted_y,count_observed_b,count_observed_but_not_predicted,count_observed_but_not_predicted_b,count_observed_but_not_predicted_y,count_observed_y,count_predicted,count_predicted_b,count_predicted_y,fraction_not_observed_and_not_predicted,fraction_not_observed_and_not_predicted_b,fraction_not_observed_and_not_predicted_b_vs_predicted_b,fraction_not_observed_and_not_predicted_vs_predicted,fraction_not_observed_and_not_predicted_y,fraction_not_observed_and_not_predicted_y_vs_predicted_y,fraction_not_observed_but_predicted,fraction_not_observed_but_predicted_b,fraction_not_observed_but_predicted_b_vs_predicted,fraction_not_observed_but_predicted_vs_predicted,fraction_not_observed_but_predicted_y,fraction_not_observed_but_predicted_y_vs_predicted,fraction_observed,fraction_observed_and_predicted,fraction_observed_and_predicted_b,fraction_observed_and_predicted_b_vs_predicted_b,fraction_observed_and_predicted_vs_predicted,fraction_observed_and_predicted_y,fraction_observed_and_predicted_y_vs_predicted_y,fraction_observed_b,fraction_observed_but_not_predicted,fraction_observed_but_not_predicted_b,fraction_observed_but_not_predicted_b_vs_predicted_b,fraction_observed_but_not_predicted_vs_predicted,fraction_observed_but_not_predicted_y,fraction_observed_but_not_predicted_y_vs_predicted_y,fraction_observed_y,fraction_predicted,fraction_predicted_b,fraction_predicted_y,iRT,lda_scores,missedCleavages,modified_cosine,pearson_corr,pred_RT,sequence_length,spectral_angle,Peptide,Protein
,SpecId,Label,ScanNr,CID,Charge1,Charge2,Charge3,Charge4,Charge5,Charge6,HCD,KR,Mass,RT,UnknownFragmentationMethod,abs_rt_diff,collision_energy_aligned,count_not_observed_and_not_predicted,count_not_observed_and_not_predicted_b,count_not_observed_and_not_predicted_y,count_not_observed_but_predicted,count_not_observed_but_predicted_b,count_not_observed_but_predicted_y,count_observed,count_observed_and_predicted,count_observed_and_predicted_b,count_observed_and_predicted_y,count_observed_b,count_observed_but_not_predicted,count_observed_but_not_predicted_b,count_observed_but_not_predicted_y,count_observed_y,count_predicted,count_predicted_b,count_predicted_y,fraction_not_observed_and_not_predicted,fraction_not_observed_and_not_predicted_b,fraction_not_observed_and_not_predicted_b_vs_predicted_b,fraction_not_observed_and_not_predicted_vs_predicted,fraction_not_observed_and_not_predicted_y,fraction_not_observed_and_not_predicted_y_vs_predicted_y,fraction_not_observed_but_predicted,fraction_not_observed_but_predicted_b,fraction_not_observed_but_predicted_b_vs_predicted,fraction_not_observed_but_predicted_vs_predicted,fraction_not_observed_but_predicted_y,fraction_not_observed_but_predicted_y_vs_predicted,fraction_observed,fraction_observed_and_predicted,fraction_observed_and_predicted_b,fraction_observed_and_predicted_b_vs_predicted_b,fraction_observed_and_predicted_vs_predicted,fraction_observed_and_predicted_y,fraction_observed_and_predicted_y_vs_predicted_y,fraction_observed_b,fraction_observed_but_not_predicted,fraction_observed_but_not_predicted_b,fraction_observed_but_not_predicted_b_vs_predicted_b,fraction_observed_but_not_predicted_vs_predicted,fraction_observed_but_not_predicted_y,fraction_observed_but_not_predicted_y_vs_predicted_y,fraction_observed_y,fraction_predicted,fraction_predicted_b,fraction_predicted_y,iRT,lda_scores,missedCleavages,modified_cosine,pearson_corr,pred_RT,sequence_length,spectral_angle,Peptide,Proteins
0,20210122_0263_TMUCLHan_Peiru_DDA_IP_C797S_02-7978-AAIGEATRL-2-1,1,10203379,0,0,1,0,0,0,0,1,1,900.50288029264,0.5000000183883155,0,0.0,0.3,2.0,1,1,2.0,1,1,4.0,2.0,1,1,1,2.0,0,2,3,4.0,2,2,0.25,0.3333333333333333,0.5,0.5,0.2,0.5,0.25,0.3333333333333333,0.5,0.5,0.2,0.5,0.5,0.25,0.3333333333333333,0.5,0.5,0.2,0.5,0.3333333333333333,0.25,0.0,0.0,0.5,0.4,1.0,0.6,0.5,0.6666666666666666,0.4,0.5000000183883155,1.7027402478448157e+30,1,0.8959462788501606,0.8831116463997931,0.5000000323590892,9,0.7449534694193893,_.AAIGEATRL._,AAIGEATRL
1,20210122_0263_TMUCLHan_Peiru_DDA_IP_C797S_02-12304-AAVPRAAFL-2-2,-1,5831230,0,0,1,0,0,0,0,1,1,914.53379,1.000000038995633,0,2.533209730870567e-08,0.3,1.0,0,1,1.0,0,1,2.0,1.0,0,1,1,1.0,1,0,1,2.0,0,2,0.25,0.0,0.0,0.5,0.3333333333333333,0.5,0.25,0.0,0.0,0.5,0.3333333333333333,0.5,0.5,0.25,0.0,0.0,0.5,0.3333333333333333,0.5,1.0,0.25,1.0,1.0,0.5,0.0,0.0,0.3333333333333333,0.5,0.0,0.6666666666666666,1.0000000136635356,-1.7027402478448157e+30,1,1.0,0.0,1.5000000246189773,9,0.5903344918223923,_.AAVPRAAFL._,AAVPRAAFL
2,20210122_0263_TMUCLHan_Peiru_DDA_IP_C797S_02-12398-AAYFGVYDTAK-2-3,-1,16059765,0,0,1,0,0,0,0,1,1,1204.5764,1.5000000993570155,0,1.0185683851915428e-07,0.3,1.0,0,1,1.0,0,1,2.0,1.0,0,1,1,1.0,1,0,1,2.0,0,2,0.25,0.0,0.0,0.5,0.3333333333333333,0.5,0.25,0.0,0.0,0.5,0.3333333333333333,0.5,0.5,0.25,0.0,0.0,0.5,0.3333333333333333,0.5,1.0,0.25,1.0,1.0,0.5,0.0,0.0,0.3333333333333333,0.5,0.0,0.6666666666666666,1.499999997500177,-1.7027402478448157e+30,0,1.0,0.0,2.500000056694346,11,0.5903344918223923,_.AAYFGVYDTAK._,AAYFGVYDTAK
Expand Down
11 changes: 3 additions & 8 deletions tests/unit_tests/test_percolator.py
Original file line number Diff line number Diff line change
Expand Up @@ -274,12 +274,6 @@ def _create_noisy_logistic_data():
class TestPercolator:
"""Class to test percolator."""

def test_get_scannr(self):
    """Check that get_scannr returns the expected id for a known raw file and scan number."""
    raw_file = "20210122_0263_TMUCLHan_Peiru_DDA_IP_C797S_02"
    scan_id = perc.Percolator.get_scannr((raw_file, 7978))
    np.testing.assert_equal(scan_id, 10203379)

def test_get_specid(self):
"""Test get_specid."""
np.testing.assert_string_equal(
Expand Down Expand Up @@ -361,11 +355,12 @@ def test_calc(self):
percolator.metrics_val["SpecId"][0], "20210122_0263_TMUCLHan_Peiru_DDA_IP_C797S_02-7978-AAIGEATRL-2-1"
)
np.testing.assert_equal(percolator.metrics_val["Label"][0], 1)
np.testing.assert_equal(percolator.metrics_val["ScanNr"][0], 10203379)
np.testing.assert_equal(percolator.metrics_val["ScanNr"][0], 7978)
np.testing.assert_equal(percolator.metrics_val["filename"][0], "20210122_0263_TMUCLHan_Peiru_DDA_IP_C797S_02")
# np.testing.assert_almost_equal(percolator.metrics_val['ExpMass'][0], 900.50345678)
np.testing.assert_string_equal(percolator.metrics_val["Peptide"][0], "_.AAIGEATRL._")
np.testing.assert_string_equal(
percolator.metrics_val["Protein"][0], "AAIGEATRL"
percolator.metrics_val["Proteins"][0], "AAIGEATRL"
) # we don't need the protein ID to get PSM / peptide results

# features
Expand Down

0 comments on commit 86695a3

Please sign in to comment.