Skip to content

Commit

Permalink
readded Spectra instead of df + added dlomix check
Browse files Browse the repository at this point in the history
  • Loading branch information
picciama committed Sep 13, 2024
1 parent 3490afa commit 9be535c
Show file tree
Hide file tree
Showing 3 changed files with 101 additions and 19 deletions.
59 changes: 48 additions & 11 deletions oktoberfest/predict/predictor.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
PredictionInterface = Union[DLomix, Koina, ZeroPredictor]
else:
PredictionInterface = Union[Koina, ZeroPredictor]
DLomix = None


class Predictor:
Expand Down Expand Up @@ -148,10 +149,10 @@ def predict_intensities(self, data: Spectra, chunk_idx: Optional[list[pd.Index]]
>>> print(library.layers["pred_int"])
"""
if chunk_idx is None:
intensities = self.predict_at_once(data=data.obs, **kwargs)
intensities = self.predict_at_once(data=data, **kwargs)
data.add_intensities(intensities["intensities"], intensities["annotation"], fragment_type=FragmentType.PRED)
else:
chunked_intensities = self.predict_in_chunks(data=data.obs, chunk_idx=chunk_idx, **kwargs)
chunked_intensities = self.predict_in_chunks(data=data, chunk_idx=chunk_idx, **kwargs)
data.add_list_of_predicted_intensities(
chunked_intensities["intensities"], chunked_intensities["annotation"], chunk_idx
)
Expand Down Expand Up @@ -189,10 +190,46 @@ def predict_rt(self, data: Spectra, **kwargs):
>>> irt_predictor.predict_rt(data=library)
>>> print(library.obs["PREDICTED_IRT"])
"""
pred_irts = self.predict_at_once(data=data.obs, **kwargs)
pred_irts = self.predict_at_once(data=data, **kwargs)
data.add_column(pred_irts["irt"].squeeze(), name="PREDICTED_IRT")

def predict_at_once(self, data: pd.DataFrame, **kwargs) -> dict[str, np.ndarray]:
def predict_at_once(self, data: Spectra, **kwargs) -> dict[str, np.ndarray]:
    """
    Predict peptide properties for all PSMs with a single call.

    The Spectra object is handed to the wrapped predictor as a whole. The keyword arguments are first
    passed through this predictor's kwargs filter and then forwarded to the predict call.
    See the Koina or DLomix predict functions for details.

    :param data: Spectra object containing the PSMs to predict for.
    :param kwargs: Additional parameters that are forwarded to Koina/DLomix::predict
    :return: a dictionary with targets (keys) and predictions (values)

    :Example:

    .. code-block:: python

        >>> from oktoberfest import predict as pr
        >>> from oktoberfest.data import Spectra
        >>> import pandas as pd
        >>> # Required columns: MODIFIED_SEQUENCE, COLLISION_ENERGY, PRECURSOR_CHARGE and FRAGMENTATION
        >>> meta_df = pd.DataFrame({"MODIFIED_SEQUENCE": ["AAAC[UNIMOD:4]RFVQ","RM[UNIMOD:35]PC[UNIMOD:4]HKPYL"],
        >>>                         "COLLISION_ENERGY": [30,35],
        >>>                         "PRECURSOR_CHARGE": [1,2],
        >>>                         "FRAGMENTATION": ["HCD","HCD"]})
        >>> var = Spectra._gen_vars_df()
        >>> library = Spectra(obs=meta_df, var=var)
        >>> intensity_predictor = pr.Predictor.from_koina(
        >>>     model_name="Prosit_2020_intensity_HCD",
        >>>     server_url="koina.wilhelmlab.org:443",
        >>>     ssl=True,
        >>>     targets=["intensities", "annotation"])
        >>> predictions = intensity_predictor.predict_at_once(data=library)
        >>> print(predictions)
    """
    # Filter the kwargs before forwarding them to the wrapped predictor.
    forwarded_kwargs = self._filter_kwargs(**kwargs)
    predictions = self._predictor.predict(data, **forwarded_kwargs)
    return predictions

def _predict_at_once_df(self, data: pd.DataFrame, **kwargs) -> dict[str, np.ndarray]:
"""
Retrieve and return predictions in one go.
Expand All @@ -211,7 +248,7 @@ def predict_at_once(self, data: pd.DataFrame, **kwargs) -> dict[str, np.ndarray]
>>> from oktoberfest import predict as pr
>>> import pandas as pd
>>> # Requiered columns: MODIFIED_SEQUENCE, COLLISION_ENERGY, PRECURSOR_CHARGE and FRAGMENTATION
>>> # Required columns: MODIFIED_SEQUENCE, COLLISION_ENERGY, PRECURSOR_CHARGE and FRAGMENTATION
>>> meta_df = pd.DataFrame({"MODIFIED_SEQUENCE": ["AAAC[UNIMOD:4]RFVQ","RM[UNIMOD:35]PC[UNIMOD:4]HKPYL"],
>>> "COLLISION_ENERGY": [30,35],
>>> "PRECURSOR_CHARGE": [1,2],
Expand All @@ -228,15 +265,15 @@ def predict_at_once(self, data: pd.DataFrame, **kwargs) -> dict[str, np.ndarray]
"""
return self._predictor.predict(data, **self._filter_kwargs(**kwargs))

def predict_in_chunks(self, data: pd.DataFrame, chunk_idx: list[pd.Index], **kwargs) -> dict[str, list[np.ndarray]]:
def predict_in_chunks(self, data: Spectra, chunk_idx: list[pd.Index], **kwargs) -> dict[str, list[np.ndarray]]:
"""
Retrieve and return predictions in chunks.
This function takes a dataframe containing information about PSMs and predicts peptide properties.The
This function takes a Spectra object containing information about PSMs and predicts peptide properties.The
configuration of Koina/DLomix is set using the kwargs.
See the Koina or DLomix predict functions for details. TODO, link this properly.
:param data: Dataframe containing the data for the prediction.
:param data: Spectra object containing the data for the prediction.
:param chunk_idx: The chunked indices of the provided dataframe. This is required in some cases,
e.g. if padding should be avoided when predicting peptides of different length.
For alphapept, this is required as padding is only performed within one batch, leading to
Expand All @@ -252,7 +289,7 @@ def predict_in_chunks(self, data: pd.DataFrame, chunk_idx: list[pd.Index], **kwa
>>> from oktoberfest import predict as pr
>>> from oktoberfest.utils import group_iterator
>>> # Requiered columns: MODIFIED_SEQUENCE, COLLISION_ENERGY, PRECURSOR_CHARGE, FRAGMENTATION and PEPTIDE_LENGTH
>>> # Required columns: MODIFIED_SEQUENCE, COLLISION_ENERGY, PRECURSOR_CHARGE, FRAGMENTATION and PEPTIDE_LENGTH
>>> meta_df = pd.DataFrame({"MODIFIED_SEQUENCE": ["AAAC[UNIMOD:4]RFVQ","RM[UNIMOD:35]PC[UNIMOD:4]HKPYL"],
>>> "COLLISION_ENERGY": [30,35],
>>> "PRECURSOR_CHARGE": [1,2],
Expand All @@ -266,12 +303,12 @@ def predict_in_chunks(self, data: pd.DataFrame, chunk_idx: list[pd.Index], **kwa
>>> server_url="koina.wilhelmlab.org:443",
>>> ssl=True,
>>> targets=["intensities", "annotation"])
>>> predictions = intensity_predictor.predict_in_chunks(data=library.obs, chunk_idx=idx)
>>> predictions = intensity_predictor.predict_in_chunks(data=library, chunk_idx=idx)
>>> print(predictions)
"""
results = []
for idx in chunk_idx:
results.append(self._predictor.predict(data.loc[idx], **self._filter_kwargs(**kwargs)))
results.append(self._predictor.predict(data[idx], **self._filter_kwargs(**kwargs)))
ret_val = {key: [item[key] for item in results] for key in results[0].keys()}
return ret_val

Expand Down
2 changes: 1 addition & 1 deletion oktoberfest/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ def _make_predictions(config, queue_out, progress, lock, batch_df):
predictions = {
output_name: output
for predictor in predictors.values()
for output_name, output in predictor.predict_at_once(batch_df).items()
for output_name, output in predictor._predictor.predict(batch_df).items()
}
queue_out.put((predictions, batch_df))
with lock:
Expand Down
59 changes: 52 additions & 7 deletions tests/unit_tests/test_predict.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

from oktoberfest.data import Spectra
from oktoberfest.data.spectra import FragmentType
from oktoberfest.pr import Koina, Predictor
from oktoberfest.predict import Koina, Predictor
from oktoberfest.utils import Config

DATA_PATH = Path(__file__).parent / "data"
Expand Down Expand Up @@ -98,6 +98,9 @@ def test_from_koina(self, mock_koina):
@patch("oktoberfest.pr.predictor.DLomix")
def test_from_dlomix(self, mock_dlomix):
"""Test DLomix constructor for Predictor."""
if mock_dlomix is None:
self.assertTrue(True)
return
predictor = Predictor.from_dlomix(
model_type=self.model_type,
model_path=self.temp_dir / "prosit_baseline.keras",
Expand All @@ -118,6 +121,9 @@ def test_koina_from_config(self, mock_koina):
@patch("oktoberfest.pr.predictor.DLomix")
def test_dlomix_from_config(self, mock_dlomix):
"""Test config constructor for Predictor with DLomix."""
if mock_dlomix is None:
self.assertTrue(True)
return
self.mock_config.predict_intensity_locally = True
self.mock_config.download_baseline_intensity_predictor = False
self.mock_config.dlomix_inference_batch_size = 1024
Expand All @@ -128,6 +134,9 @@ def test_dlomix_from_config(self, mock_dlomix):
@patch("oktoberfest.pr.predictor.DLomix")
def test_download_new_model(self, mock_dlomix):
"""Test if new baseline model is downloaded if requested."""
if mock_dlomix is None:
self.assertTrue(True)
return
self.mock_config.download_baseline_intensity_predictor = True
predictor = Predictor.from_config(self.mock_config, model_type=self.model_type)
self.assertIsInstance(predictor._predictor, type(mock_dlomix.return_value))
Expand All @@ -147,7 +156,7 @@ def test_predict_intensities_at_once(self):
return_value={"intensities": self.intensities, "annotation": self.ion_annotations}
)
predictor.predict_intensities(self.mock_spectra)
predictor.predict_at_once.assert_called_once_with(data=self.mock_spectra.obs)
predictor.predict_at_once.assert_called_once_with(data=self.mock_spectra)
self.mock_spectra.add_intensities.assert_called_once_with(
self.intensities, self.ion_annotations, fragment_type=FragmentType.PRED
)
Expand All @@ -163,7 +172,7 @@ def test_predict_intensities_chunked(self):
}
)
predictor.predict_intensities(self.mock_spectra, chunk_idx=self.chunk_idx)
predictor.predict_in_chunks.assert_called_once_with(data=self.mock_spectra.obs, chunk_idx=self.chunk_idx)
predictor.predict_in_chunks.assert_called_once_with(data=self.mock_spectra, chunk_idx=self.chunk_idx)
self.mock_spectra.add_list_of_predicted_intensities.assert_called_once_with(
[self.intensities, self.intensities], [self.ion_annotations, self.ion_annotations], self.chunk_idx
)
Expand All @@ -174,16 +183,16 @@ def test_predict_rt(self):
predictor = Predictor(self.mock_koina, model_name=self.model_name)
predictor.predict_at_once = MagicMock(return_value={"irt": self.retention_times})
predictor.predict_rt(self.mock_spectra)
predictor.predict_at_once.assert_called_once_with(data=self.mock_spectra.obs)
predictor.predict_at_once.assert_called_once_with(data=self.mock_spectra)
self.mock_spectra.add_column.assert_called_once_with(self.retention_times, name="PREDICTED_IRT")

def test_predict_at_once(self):
"""Test prediction in one go."""
predictor = Predictor(self.mock_koina, model_name=self.model_name)
result = {"intensities": self.intensities, "annotation": self.ion_annotations}
predictor._predictor.predict = MagicMock(return_value=result)
output = predictor.predict_at_once(self.mock_spectra.obs)
predictor._predictor.predict.assert_called_once_with(self.mock_spectra.obs)
output = predictor.predict_at_once(self.mock_spectra)
predictor._predictor.predict.assert_called_once_with(self.mock_spectra)
self.assertEqual(output, result)

def test_predict_in_chunks(self):
Expand All @@ -196,7 +205,7 @@ def test_predict_in_chunks(self):
{"intensities": self.intensities, "annotation": self.ion_annotations},
]
)
output = predictor.predict_in_chunks(self.mock_spectra.obs, chunk_idx=self.chunk_idx)
output = predictor.predict_in_chunks(self.mock_spectra, chunk_idx=self.chunk_idx)
self.assertEqual(
output,
{
Expand Down Expand Up @@ -282,6 +291,42 @@ def test_ce_calibration_chunked(self):
class TestLocalPrediction(unittest.TestCase):
    """Test class for local prediction."""

    @classmethod
    def setUpClass(cls):  # noqa: 402
        """Create the fixtures shared by all tests of this class.

        Sets up a scratch directory, a mocked Config pointing at it, a mocked Spectra object and
        small arrays that serve as fake prediction results.
        """
        import shutil  # local import: only needed to clean up the scratch directory

        cls.model_name = "Prosit_2019_intensity"
        cls.model_type = "intensity"

        cls.temp_dir = Path(tempfile.mkdtemp())
        # Register the cleanup right after creation so the directory is removed even if the
        # remaining setup fails; without this every test run leaves a temp directory behind.
        cls.addClassCleanup(shutil.rmtree, cls.temp_dir, ignore_errors=True)
        cls.data_dir = cls.temp_dir / "data"
        cls.data_dir.mkdir()

        cls.mock_config = create_autospec(Config, instance=True)
        cls.mock_config.data = {}
        cls.mock_config.data["models"] = {cls.model_type: cls.model_name}
        cls.mock_config.output = cls.temp_dir

        cls.mock_spectra = create_autospec(Spectra, instance=True)
        cls.intensities = np.array([[0.0, 0.0, -1.0], [1.0, 0, -1.0], [1.0, 0.0, 0.0]])
        cls.ion_annotations = np.array(
            [["y1+1", "y1+2", "y1+3"], ["y1+1", "y1+2", "y1+3"], ["y1+1", "y1+2", "y1+3"]], dtype=object
        )
        cls.retention_times = np.array([30.0, 100.0, 160.0, 140.0, -2.0, 17.0])
        cls.chunk_idx = [pd.Index([0, 1, 2]), pd.Index([3, 4, 5])]
        cls.ce_range = (25, 30)

    @patch("oktoberfest.pr.predictor.DLomix")
    def test_predict_rt(self, mock_dlomix):
        """Test iRT prediction."""
        # TODO add state-based test
        if mock_dlomix is None:
            # Report a skip instead of silently passing, so test reports show that nothing was checked.
            # NOTE(review): patch() normally injects a MagicMock here, so this guard is not expected
            # to trigger; checking the real module attribute before patching may be intended - confirm.
            self.skipTest("DLomix is not available")
        predictor = Predictor(mock_dlomix, model_name=self.model_name)
        predictor.predict_at_once = MagicMock(return_value={"irt": self.retention_times})
        predictor.predict_rt(self.mock_spectra)
        predictor.predict_at_once.assert_called_once_with(data=self.mock_spectra)
        self.mock_spectra.add_column.assert_called_once_with(self.retention_times, name="PREDICTED_IRT")


class TestRefinementLearning(unittest.TestCase):
"""Test class for refinement learning."""

0 comments on commit 9be535c

Please sign in to comment.