From 9be535ca208469fcbd8edf4187affac19adff4d6 Mon Sep 17 00:00:00 2001 From: Mario Picciani Date: Fri, 13 Sep 2024 16:49:21 +0200 Subject: [PATCH] readded Spectra instead of df + added dlomix check --- oktoberfest/predict/predictor.py | 59 ++++++++++++++++++++++++++------ oktoberfest/runner.py | 2 +- tests/unit_tests/test_predict.py | 59 ++++++++++++++++++++++++++++---- 3 files changed, 101 insertions(+), 19 deletions(-) diff --git a/oktoberfest/predict/predictor.py b/oktoberfest/predict/predictor.py index 00960ba..f7faa7f 100644 --- a/oktoberfest/predict/predictor.py +++ b/oktoberfest/predict/predictor.py @@ -23,6 +23,7 @@ PredictionInterface = Union[DLomix, Koina, ZeroPredictor] else: PredictionInterface = Union[Koina, ZeroPredictor] + DLomix = None class Predictor: @@ -148,10 +149,10 @@ def predict_intensities(self, data: Spectra, chunk_idx: Optional[list[pd.Index]] >>> print(library.layers["pred_int"]) """ if chunk_idx is None: - intensities = self.predict_at_once(data=data.obs, **kwargs) + intensities = self.predict_at_once(data=data, **kwargs) data.add_intensities(intensities["intensities"], intensities["annotation"], fragment_type=FragmentType.PRED) else: - chunked_intensities = self.predict_in_chunks(data=data.obs, chunk_idx=chunk_idx, **kwargs) + chunked_intensities = self.predict_in_chunks(data=data, chunk_idx=chunk_idx, **kwargs) data.add_list_of_predicted_intensities( chunked_intensities["intensities"], chunked_intensities["annotation"], chunk_idx ) @@ -189,10 +190,46 @@ def predict_rt(self, data: Spectra, **kwargs): >>> irt_predictor.predict_rt(data=library) >>> print(library.obs["PREDICTED_IRT"]) """ - pred_irts = self.predict_at_once(data=data.obs, **kwargs) + pred_irts = self.predict_at_once(data=data, **kwargs) data.add_column(pred_irts["irt"].squeeze(), name="PREDICTED_IRT") - def predict_at_once(self, data: pd.DataFrame, **kwargs) -> dict[str, np.ndarray]: + def predict_at_once(self, data: Spectra, **kwargs) -> dict[str, np.ndarray]: + """ + Retrieve and return predictions in one go. + + This function takes a Spectra object containing information about PSMs and predicts peptide properties. The + configuration of Koina/DLomix is set using the kwargs. + See the Koina or DLomix predict functions for details. TODO, link this properly. + + :param data: Spectra containing the data for the prediction. + :param kwargs: Additional parameters that are forwarded to Koina/DLomix::predict + + :return: a dictionary with targets (keys) and predictions (values) + + :Example: + + .. code-block:: python + + >>> from oktoberfest import predict as pr + >>> import pandas as pd + >>> # Required columns: MODIFIED_SEQUENCE, COLLISION_ENERGY, PRECURSOR_CHARGE and FRAGMENTATION + >>> meta_df = pd.DataFrame({"MODIFIED_SEQUENCE": ["AAAC[UNIMOD:4]RFVQ","RM[UNIMOD:35]PC[UNIMOD:4]HKPYL"], + >>> "COLLISION_ENERGY": [30,35], + >>> "PRECURSOR_CHARGE": [1,2], + >>> "FRAGMENTATION": ["HCD","HCD"]}) + >>> var = Spectra._gen_vars_df() + >>> library = Spectra(obs=meta_df, var=var) + >>> intensity_predictor = pr.Predictor.from_koina( + >>> model_name="Prosit_2020_intensity_HCD", + >>> server_url="koina.wilhelmlab.org:443", + >>> ssl=True, + >>> targets=["intensities", "annotation"]) + >>> predictions = intensity_predictor.predict_at_once(data=library) + >>> print(predictions) + """ + return self._predictor.predict(data, **self._filter_kwargs(**kwargs)) + + def _predict_at_once_df(self, data: pd.DataFrame, **kwargs) -> dict[str, np.ndarray]: """ Retrieve and return predictions in one go. @@ -211,7 +248,7 @@ def predict_at_once(self, data: pd.DataFrame, **kwargs) -> dict[str, np.ndarray] >>> from oktoberfest import predict as pr >>> import pandas as pd - >>> # Requiered columns: MODIFIED_SEQUENCE, COLLISION_ENERGY, PRECURSOR_CHARGE and FRAGMENTATION + >>> # Required columns: MODIFIED_SEQUENCE, COLLISION_ENERGY, PRECURSOR_CHARGE and FRAGMENTATION >>> meta_df = pd.DataFrame({"MODIFIED_SEQUENCE": ["AAAC[UNIMOD:4]RFVQ","RM[UNIMOD:35]PC[UNIMOD:4]HKPYL"], >>> "COLLISION_ENERGY": [30,35], >>> "PRECURSOR_CHARGE": [1,2], @@ -228,15 +265,15 @@ def predict_at_once(self, data: pd.DataFrame, **kwargs) -> dict[str, np.ndarray] """ return self._predictor.predict(data, **self._filter_kwargs(**kwargs)) - def predict_in_chunks(self, data: pd.DataFrame, chunk_idx: list[pd.Index], **kwargs) -> dict[str, list[np.ndarray]]: + def predict_in_chunks(self, data: Spectra, chunk_idx: list[pd.Index], **kwargs) -> dict[str, list[np.ndarray]]: """ Retrieve and return predictions in chunks. - This function takes a dataframe containing information about PSMs and predicts peptide properties.The + This function takes a Spectra object containing information about PSMs and predicts peptide properties.The configuration of Koina/DLomix is set using the kwargs. See the Koina or DLomix predict functions for details. TODO, link this properly. - :param data: Dataframe containing the data for the prediction. + :param data: Spectra object containing the data for the prediction. :param chunk_idx: The chunked indices of the provided dataframe. This is required in some cases, e.g. if padding should be avoided when predicting peptides of different length. For alphapept, this is required as padding is only performed within one batch, leading to @@ -252,7 +289,7 @@ def predict_in_chunks(self, data: pd.DataFrame, chunk_idx: list[pd.Index], **kwa >>> from oktoberfest import predict as pr >>> from oktoberfest.utils import group_iterator - >>> # Requiered columns: MODIFIED_SEQUENCE, COLLISION_ENERGY, PRECURSOR_CHARGE, FRAGMENTATION and PEPTIDE_LENGTH + >>> # Required columns: MODIFIED_SEQUENCE, COLLISION_ENERGY, PRECURSOR_CHARGE, FRAGMENTATION and PEPTIDE_LENGTH >>> meta_df = pd.DataFrame({"MODIFIED_SEQUENCE": ["AAAC[UNIMOD:4]RFVQ","RM[UNIMOD:35]PC[UNIMOD:4]HKPYL"], >>> "COLLISION_ENERGY": [30,35], >>> "PRECURSOR_CHARGE": [1,2], @@ -266,12 +303,12 @@ def predict_in_chunks(self, data: pd.DataFrame, chunk_idx: list[pd.Index], **kwa >>> server_url="koina.wilhelmlab.org:443", >>> ssl=True, >>> targets=["intensities", "annotation"]) - >>> predictions = intensity_predictor.predict_in_chunks(data=library.obs, chunk_idx=idx) + >>> predictions = intensity_predictor.predict_in_chunks(data=library, chunk_idx=idx) >>> print(predictions) """ results = [] for idx in chunk_idx: - results.append(self._predictor.predict(data.loc[idx], **self._filter_kwargs(**kwargs))) + results.append(self._predictor.predict(data[idx], **self._filter_kwargs(**kwargs))) ret_val = {key: [item[key] for item in results] for key in results[0].keys()} return ret_val diff --git a/oktoberfest/runner.py b/oktoberfest/runner.py index 76f9d15..a434d61 100644 --- a/oktoberfest/runner.py +++ b/oktoberfest/runner.py @@ -43,7 +43,7 @@ def _make_predictions(config, queue_out, progress, lock, batch_df): predictions = { output_name: output for predictor in predictors.values() - for output_name, output in predictor.predict_at_once(batch_df).items() + for output_name, output in predictor._predictor.predict(batch_df).items() } queue_out.put((predictions, batch_df)) with lock: diff --git a/tests/unit_tests/test_predict.py b/tests/unit_tests/test_predict.py index a188bea..87230b8 100644 --- a/tests/unit_tests/test_predict.py +++ b/tests/unit_tests/test_predict.py @@ -10,7 +10,7 @@ from oktoberfest.data import Spectra from oktoberfest.data.spectra import FragmentType -from oktoberfest.pr import Koina, Predictor +from oktoberfest.predict import Koina, Predictor from oktoberfest.utils import Config DATA_PATH = Path(__file__).parent / "data" @@ -98,6 +98,9 @@ def test_from_koina(self, mock_koina): @patch("oktoberfest.pr.predictor.DLomix") def test_from_dlomix(self, mock_dlomix): """Test DLomix constructor for Predictor.""" + if mock_dlomix is None: + self.assertTrue(True) + return predictor = Predictor.from_dlomix( model_type=self.model_type, model_path=self.temp_dir / "prosit_baseline.keras", @@ -118,6 +121,9 @@ def test_koina_from_config(self, mock_koina): @patch("oktoberfest.pr.predictor.DLomix") def test_dlomix_from_config(self, mock_dlomix): """Test config constructor for Predictor with DLomix.""" + if mock_dlomix is None: + self.assertTrue(True) + return self.mock_config.predict_intensity_locally = True self.mock_config.download_baseline_intensity_predictor = False self.mock_config.dlomix_inference_batch_size = 1024 @@ -128,6 +134,9 @@ def test_dlomix_from_config(self, mock_dlomix): @patch("oktoberfest.pr.predictor.DLomix") def test_download_new_model(self, mock_dlomix): """Test if new baseline model is downloaded if requested.""" + if mock_dlomix is None: + self.assertTrue(True) + return self.mock_config.download_baseline_intensity_predictor = True predictor = Predictor.from_config(self.mock_config, model_type=self.model_type) self.assertIsInstance(predictor._predictor, type(mock_dlomix.return_value)) @@ -147,7 +156,7 @@ def test_predict_intensities_at_once(self): return_value={"intensities": self.intensities, "annotation": self.ion_annotations} ) predictor.predict_intensities(self.mock_spectra) - predictor.predict_at_once.assert_called_once_with(data=self.mock_spectra.obs) + predictor.predict_at_once.assert_called_once_with(data=self.mock_spectra) self.mock_spectra.add_intensities.assert_called_once_with( self.intensities, self.ion_annotations, fragment_type=FragmentType.PRED ) @@ -163,7 +172,7 @@ def test_predict_intensities_chunked(self): } ) predictor.predict_intensities(self.mock_spectra, chunk_idx=self.chunk_idx) - predictor.predict_in_chunks.assert_called_once_with(data=self.mock_spectra.obs, chunk_idx=self.chunk_idx) + predictor.predict_in_chunks.assert_called_once_with(data=self.mock_spectra, chunk_idx=self.chunk_idx) self.mock_spectra.add_list_of_predicted_intensities.assert_called_once_with( [self.intensities, self.intensities], [self.ion_annotations, self.ion_annotations], self.chunk_idx ) @@ -174,7 +183,7 @@ def test_predict_rt(self): predictor = Predictor(self.mock_koina, model_name=self.model_name) predictor.predict_at_once = MagicMock(return_value={"irt": self.retention_times}) predictor.predict_rt(self.mock_spectra) - predictor.predict_at_once.assert_called_once_with(data=self.mock_spectra.obs) + predictor.predict_at_once.assert_called_once_with(data=self.mock_spectra) self.mock_spectra.add_column.assert_called_once_with(self.retention_times, name="PREDICTED_IRT") def test_predict_at_once(self): @@ -182,8 +191,8 @@ def test_predict_at_once(self): predictor = Predictor(self.mock_koina, model_name=self.model_name) result = {"intensities": self.intensities, "annotation": self.ion_annotations} predictor._predictor.predict = MagicMock(return_value=result) - output = predictor.predict_at_once(self.mock_spectra.obs) - predictor._predictor.predict.assert_called_once_with(self.mock_spectra.obs) + output = predictor.predict_at_once(self.mock_spectra) + predictor._predictor.predict.assert_called_once_with(self.mock_spectra) self.assertEqual(output, result) def test_predict_in_chunks(self): @@ -196,7 +205,7 @@ def test_predict_in_chunks(self): {"intensities": self.intensities, "annotation": self.ion_annotations}, ] ) - output = predictor.predict_in_chunks(self.mock_spectra.obs, chunk_idx=self.chunk_idx) + output = predictor.predict_in_chunks(self.mock_spectra, chunk_idx=self.chunk_idx) self.assertEqual( output, { @@ -282,6 +291,42 @@ def test_ce_calibration_chunked(self): class TestLocalPrediction(unittest.TestCase): """Test class for local prediction.""" + @classmethod + def setUpClass(cls): # noqa: 402 + cls.model_name = "Prosit_2019_intensity" + cls.model_type = "intensity" + + cls.temp_dir = Path(tempfile.mkdtemp()) + cls.data_dir = cls.temp_dir / "data" + cls.data_dir.mkdir() + + cls.mock_config = create_autospec(Config, instance=True) + cls.mock_config.data = {} + cls.mock_config.data["models"] = {cls.model_type: cls.model_name} + cls.mock_config.output = cls.temp_dir + + cls.mock_spectra = create_autospec(Spectra, instance=True) + cls.intensities = np.array([[0.0, 0.0, -1.0], [1.0, 0, -1.0], [1.0, 0.0, 0.0]]) + cls.ion_annotations = np.array( + [["y1+1", "y1+2", "y1+3"], ["y1+1", "y1+2", "y1+3"], ["y1+1", "y1+2", "y1+3"]], dtype=object + ) + cls.retention_times = np.array([30.0, 100.0, 160.0, 140.0, -2.0, 17.0]) + cls.chunk_idx = [pd.Index([0, 1, 2]), pd.Index([3, 4, 5])] + cls.ce_range = (25, 30) + + @patch("oktoberfest.pr.predictor.DLomix") + def test_predict_rt(self, mock_dlomix): + """Test iRT prediction.""" + # TODO add state-based test + if mock_dlomix is None: + self.assertTrue(True) + return + predictor = Predictor(mock_dlomix, model_name=self.model_name) + predictor.predict_at_once = MagicMock(return_value={"irt": self.retention_times}) + predictor.predict_rt(self.mock_spectra) + predictor.predict_at_once.assert_called_once_with(data=self.mock_spectra) + self.mock_spectra.add_column.assert_called_once_with(self.retention_times, name="PREDICTED_IRT") + class TestRefinementLearning(unittest.TestCase): """Test class for refinement learning."""