Commit
Merge pull request #293 from wilhelm-lab/release/0.8.3
Release/0.8.3
mostafakalhor authored Oct 30, 2024
2 parents 280a2b7 + eba7526 commit df3ce03
Showing 24 changed files with 462,592 additions and 662 deletions.
2 changes: 1 addition & 1 deletion .cookietemple.yml
@@ -15,5 +15,5 @@ full_name: Victor Giurcoiu
email: [email protected]
project_name: oktoberfest
project_short_description: Public repo oktoberfest
version: 0.8.2
version: 0.8.3
license: MIT
4 changes: 2 additions & 2 deletions .github/release-drafter.yml
@@ -1,5 +1,5 @@
name-template: "0.8.2 🌈" # <<COOKIETEMPLE_FORCE_BUMP>>
tag-template: 0.8.2 # <<COOKIETEMPLE_FORCE_BUMP>>
name-template: "0.8.3 🌈" # <<COOKIETEMPLE_FORCE_BUMP>>
tag-template: 0.8.3 # <<COOKIETEMPLE_FORCE_BUMP>>
exclude-labels:
- "skip-changelog"

2 changes: 1 addition & 1 deletion cookietemple.cfg
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 0.8.2
current_version = 0.8.3

[bumpversion_files_whitelisted]
init_file = oktoberfest/__init__.py
4 changes: 2 additions & 2 deletions docs/conf.py
@@ -55,9 +55,9 @@
# the built documents.
#
# The short X.Y version.
version = "0.8.2"
version = "0.8.3"
# The full version, including alpha/beta/rc tags.
release = "0.8.2"
release = "0.8.3"

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
2 changes: 1 addition & 1 deletion oktoberfest/__init__.py
@@ -5,7 +5,7 @@
__author__ = """The Oktoberfest development team (Wilhelmlab at Technical University of Munich)"""
__copyright__ = f"Copyright {datetime.now():%Y}, Wilhelmlab at Technical University of Munich"
__license__ = "MIT"
__version__ = "0.8.2"
__version__ = "0.8.3"

import logging.handlers
import sys
86 changes: 77 additions & 9 deletions oktoberfest/data/spectra.py
@@ -26,34 +26,53 @@ class FragmentType(Enum):
"""FragmentType class to enumerate pred, raw, and mz."""

PRED = 1
RAW = 2
MZ = 3
PRED_A = 2
PRED_B = 3
RAW = 4
RAW_A = 5
RAW_B = 6
MZ = 7
MZ_A = 8
MZ_B = 9


class Spectra(anndata.AnnData):
"""Main to init spectra data."""

INTENSITY_COLUMN_PREFIX = "INTENSITY_RAW"
INTENSITY_COLUMN_PREFIX_A = "INTENSITY_RAW_A"
INTENSITY_COLUMN_PREFIX_B = "INTENSITY_RAW_B"
INTENSITY_PRED_PREFIX = "INTENSITY_PRED"
INTENSITY_PRED_PREFIX_A = "INTENSITY_PRED_A"
INTENSITY_PRED_PREFIX_B = "INTENSITY_PRED_B"
MZ_COLUMN_PREFIX = "MZ_RAW"
MZ_COLUMN_PREFIX_A = "MZ_RAW_A"
MZ_COLUMN_PREFIX_B = "MZ_RAW_B"
INTENSITY_PRED_LAYER_NAME = "pred_int"
INTENSITY_PRED_LAYER_NAME_A = "pred_int_A"
INTENSITY_PRED_LAYER_NAME_B = "pred_int_B"
INTENSITY_LAYER_NAME = "raw_int"
INTENSITY_LAYER_NAME_A = "raw_int_A"
INTENSITY_LAYER_NAME_B = "raw_int_B"
MZ_LAYER_NAME = "mz"
MZ_LAYER_NAME_A = "mz_A"
MZ_LAYER_NAME_B = "mz_B"
COLUMNS_FRAGMENT_ION = ["Y1+", "Y1++", "Y1+++", "B1+", "B1++", "B1+++"]
MAX_CHARGE = 3

@staticmethod
def _gen_vars_df(ion_types: list[str] = c.FRAGMENTATION_TO_IONS_BY_PAIRS["HCD"]) -> pd.DataFrame:
def _gen_vars_df(ion_types: list[str] = c.FRAGMENTATION_TO_IONS_BY_PAIRS["HCD"], xl: bool = False) -> pd.DataFrame:
"""
Create annotation dataframe for vars in AnnData object.
:param ion_types: ion types that are expected to be in the spectra
:param xl: crosslinked or linear peptide
:return: pd.Dataframe of fragment annotations
"""
df = pd.DataFrame(
[
{"ion": f"{ion_type}{pos}+{charge}", "num": pos, "type": ion_type, "charge": charge}
for pos in c.POSITIONS
for pos in (c.POSITIONS_XL if xl else c.POSITIONS)
for ion_type in ion_types
for charge in c.CHARGES
]
@@ -62,24 +81,32 @@ def _gen_vars_df(ion_types: list[str] = c.FRAGMENTATION_TO_IONS_BY_PAIRS["HCD"])
return df

@staticmethod
def _gen_column_names(fragment_type: FragmentType) -> list[str]:
def _gen_column_names(fragment_type: FragmentType, xl: bool = False) -> list[str]:
"""
Get column names of the spectra data.
:param fragment_type: choose predicted, raw, or mz
:param xl: crosslinked or linear peptide
:return: A list of column names
"""
prefix = Spectra._resolve_prefix(fragment_type)
columns = []
for i in range(1, 30):
if xl:
max_range = 59
else:
max_range = 30
for i in range(1, max_range):
for column in Spectra.COLUMNS_FRAGMENT_ION:
columns.append(prefix + "_" + column.replace("1", str(i)))
return columns

@staticmethod
def _resolve_prefix(fragment_type: FragmentType) -> str:
"""
Resolve prefix given fragment type (1 for pred, 2 for raw, 3 for mz).
Resolve prefix given fragment type.
(1 for pred, 2 for xl_pred_a, 3 for xl_pred_b, 4 for raw, 5 for xl_raw_a,
6 for xl_raw_b, 7 for mz, 8 for xl_mz_a, 9 for xl_mz_b).
:param fragment_type: choose predicted, raw, or mz
:return: prefix as string
@@ -88,9 +115,21 @@ def _resolve_prefix(fragment_type: FragmentType) -> str:
if fragment_type.value == 1:
prefix = Spectra.INTENSITY_PRED_PREFIX
elif fragment_type.value == 2:
prefix = Spectra.INTENSITY_COLUMN_PREFIX
prefix = Spectra.INTENSITY_PRED_PREFIX_A
elif fragment_type.value == 3:
prefix = Spectra.INTENSITY_PRED_PREFIX_B
elif fragment_type.value == 4:
prefix = Spectra.INTENSITY_COLUMN_PREFIX
elif fragment_type.value == 5:
prefix = Spectra.INTENSITY_COLUMN_PREFIX_A
elif fragment_type.value == 6:
prefix = Spectra.INTENSITY_COLUMN_PREFIX_B
elif fragment_type.value == 7:
prefix = Spectra.MZ_COLUMN_PREFIX
elif fragment_type.value == 8:
prefix = Spectra.MZ_COLUMN_PREFIX_A
else:
prefix = Spectra.MZ_COLUMN_PREFIX_B
return prefix

@staticmethod
@@ -104,9 +143,21 @@ def _resolve_layer_name(fragment_type: FragmentType) -> str:
if fragment_type.value == 1:
layer = Spectra.INTENSITY_PRED_LAYER_NAME
elif fragment_type.value == 2:
layer = Spectra.INTENSITY_LAYER_NAME
layer = Spectra.INTENSITY_PRED_LAYER_NAME_A
elif fragment_type.value == 3:
layer = Spectra.INTENSITY_PRED_LAYER_NAME_B
elif fragment_type.value == 4:
layer = Spectra.INTENSITY_LAYER_NAME
elif fragment_type.value == 5:
layer = Spectra.INTENSITY_LAYER_NAME_A
elif fragment_type.value == 6:
layer = Spectra.INTENSITY_LAYER_NAME_B
elif fragment_type.value == 7:
layer = Spectra.MZ_LAYER_NAME
elif fragment_type.value == 8:
layer = Spectra.MZ_LAYER_NAME_A
elif fragment_type.value == 9:
layer = Spectra.MZ_LAYER_NAME_B
return layer

def __getitem__(self, index: Index):
@@ -191,6 +242,23 @@ def add_intensities(self, intensities: np.ndarray, annotation: np.ndarray, fragm
layer = self._resolve_layer_name(fragment_type)
self.layers[layer] = csr_matrix(sparse_intensity_matrix)

def add_intensities_without_mapping(self, intensities: np.ndarray, fragment_type: FragmentType):
"""
Add intensities and convert them to a sparse matrix.
This function takes a numpy array containing intensities.
The intensity array is expected to have the same shape as this object and will be added to
the respective layer without checking the order of fragment annotations.
:param intensities: intensity numpy array to add with shapes (n x m)
:param fragment_type: the type of intensities to add. Can be FragmentType.RAW or FragmentType.PRED.
"""
intensities[intensities == 0] = c.EPSILON
intensities[intensities == -1] = 0.0
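# Illustration of the masking above: a row such as [0.0, 0.3, -1.0] becomes
# [EPSILON, 0.3, 0.0], so true zeros survive the sparse conversion below as
# EPSILON while -1 padding values are dropped from the CSR matrix.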

layer = self._resolve_layer_name(fragment_type)
self.layers[layer] = csr_matrix(intensities)

def add_list_of_predicted_intensities(
self,
intensities: list[np.ndarray],
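A minimal usage sketch of the static helpers extended in this file (import path assumed from the file location; the values in the comments follow from the constants and ranges shown in this diff):

from oktoberfest.data.spectra import FragmentType, Spectra

# prefix and layer resolution for the new crosslink (A/B) fragment types
Spectra._resolve_prefix(FragmentType.PRED_A)        # "INTENSITY_PRED_A"
Spectra._resolve_layer_name(FragmentType.RAW_B)     # "raw_int_B"

# column names: 29 positions x 6 ion columns for linear peptides,
# 58 positions x 6 ion columns with xl=True
linear_cols = Spectra._gen_column_names(FragmentType.PRED)          # 174 names
xl_cols = Spectra._gen_column_names(FragmentType.PRED_A, xl=True)   # 348 names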
38 changes: 28 additions & 10 deletions oktoberfest/predict/alignment.py
@@ -9,20 +9,23 @@
logger = logging.getLogger(__name__)


def _prepare_alignment_df(library: Spectra, ce_range: tuple[int, int], group_by_charge: bool = False) -> Spectra:
def _prepare_alignment_df(
library: Spectra, ce_range: tuple[int, int], group_by_charge: bool = False, xl: bool = False
) -> Spectra:
"""
Prepare an alignment DataFrame from the given Spectra library.
This function creates an alignment DataFrame by removing decoys and non-HCD-fragmented spectra
from the input library, selecting the top 1000 (or however many are available if <1000) highest-scoring spectra, and
repeating the DataFrame for each collision energy (CE) in the given range.
This function creates an alignment DataFrame by removing decoys and non-HCD-fragmented spectra
from the input library, selecting the top 1000 highest-scoring spectra for linear peptides (top 20 for cross-linked peptides),
and repeating the DataFrame for each collision energy (CE) in the given range.
:param library: the library to be propagated
:param ce_range: the min and max CE to be propagated for alignment in the dataframe
:param group_by_charge: if true, select the top 1000 spectra independently for each precursor charge
:param xl: if true, select the top 20 spectra for cross-linked peptides
:return: a library that is modified according to the description above
"""
top_n = 1000
top_n = 1000 if not xl else 20

if group_by_charge:
groups = ["RAW_FILE", "PRECURSOR_CHARGE"]
@@ -48,16 +51,31 @@ def _prepare_alignment_df(library: Spectra, ce_range: tuple[int, int], group_by_
return alignment_library


def _alignment(alignment_library: Spectra):
def _alignment(alignment_library: Spectra, xl: bool = False):
"""
Perform the alignment of predicted versus raw intensities.
The function calculates the spectral angle between predicted and observed fragment intensities and
adds it as a column to the alignment library.
:param alignment_library: the library to perform the alignment on
:param xl: crosslinked or linear peptide
"""
pred_intensity = alignment_library.get_matrix(FragmentType.PRED)
raw_intensity = alignment_library.get_matrix(FragmentType.RAW)
sm = SimilarityMetrics(pred_intensity, raw_intensity)
alignment_library.add_column(sm.spectral_angle(raw_intensity, pred_intensity, 0), "SPECTRAL_ANGLE")
if xl:
pred_intensity_a = alignment_library.get_matrix(FragmentType.PRED_A)
pred_intensity_b = alignment_library.get_matrix(FragmentType.PRED_B)
raw_intensity_a = alignment_library.get_matrix(FragmentType.RAW_A)
raw_intensity_b = alignment_library.get_matrix(FragmentType.RAW_B)
sm_a = SimilarityMetrics(pred_intensity_a, raw_intensity_a)
sm_b = SimilarityMetrics(pred_intensity_b, raw_intensity_b)
alignment_library.add_column(sm_a.spectral_angle(raw_intensity_a, pred_intensity_a, 0), "SPECTRAL_ANGLE_A")
alignment_library.add_column(sm_b.spectral_angle(raw_intensity_b, pred_intensity_b, 0), "SPECTRAL_ANGLE_B")
alignment_library.add_column(
(alignment_library.obs["SPECTRAL_ANGLE_A"] + alignment_library.obs["SPECTRAL_ANGLE_B"]) / 2,
"SPECTRAL_ANGLE",
)
else:
pred_intensity = alignment_library.get_matrix(FragmentType.PRED)
raw_intensity = alignment_library.get_matrix(FragmentType.RAW)
sm = SimilarityMetrics(pred_intensity, raw_intensity)
alignment_library.add_column(sm.spectral_angle(raw_intensity, pred_intensity, 0), "SPECTRAL_ANGLE")
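A small numeric sketch of how the cross-linked spectral angle is combined in _alignment above (toy values, numpy only):

import numpy as np

# toy spectral angles for three cross-linked PSMs
sa_a = np.array([0.82, 0.75, 0.90])    # SPECTRAL_ANGLE_A
sa_b = np.array([0.78, 0.71, 0.88])    # SPECTRAL_ANGLE_B

# combined SPECTRAL_ANGLE column added to the alignment library
spectral_angle = (sa_a + sa_b) / 2     # -> approximately [0.80, 0.73, 0.89]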
15 changes: 15 additions & 0 deletions oktoberfest/predict/dlomix.py
@@ -294,3 +294,18 @@ def predict(self, data: Spectra, dataset_name: str, keep_dataset: bool = True) -
shutil.rmtree(self.output_path / dataset_name)

return {self.output_name: preds, "annotation": annotations}

def predict_xl(
self, data: Spectra, dataset_name: str, keep_dataset: bool = True
) -> tuple[dict[str, np.ndarray], dict[str, np.ndarray]]:
"""
Perform inference on the xl data using the DLOmix model.
This is currently not implemented.
:param data: spectral library to predict features for
:param dataset_name: Name of the dataset for storing processed files for DLomix
:param keep_dataset: Whether to keep or discard the pre-processed dataset after inference
:raises NotImplementedError: Always.
"""
raise NotImplementedError("This method is not implemented")
70 changes: 70 additions & 0 deletions oktoberfest/predict/koina.py
@@ -22,6 +22,26 @@
"instrument_types": "INSTRUMENT_TYPES",
}

alternative_column_map_xl = {
"peptide_sequences_1": "MODIFIED_SEQUENCE_A",
"peptide_sequences_2": "MODIFIED_SEQUENCE_B",
"precursor_charges": "PRECURSOR_CHARGE",
"collision_energies": "COLLISION_ENERGY",
"fragmentation_types": "FRAGMENTATION",
"instrument_types": "INSTRUMENT_TYPES",
}

# Create a new mapping with switched keys and values for peptide_sequences_1 and peptide_sequences_2
alternative_column_map_xl_switched = {
"peptide_sequences_1": "MODIFIED_SEQUENCE_B",
"peptide_sequences_2": "MODIFIED_SEQUENCE_A",
**{
key: value
for key, value in alternative_column_map_xl.items()
if key not in ["peptide_sequences_1", "peptide_sequences_2"]
},
}
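# Net effect of the switch above, spelled out:
#   peptide_sequences_1 -> MODIFIED_SEQUENCE_B
#   peptide_sequences_2 -> MODIFIED_SEQUENCE_A
#   all other keys keep their mapping from alternative_column_map_xl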


class Koina(_KoinaGRPC):
"""Extension of the Koina GRPC class in koinapy, to add required logic for Oktoberfest."""
@@ -62,3 +82,53 @@ def predict(self, data: dict[str, np.ndarray] | pd.DataFrame | Spectra, **kwargs
for input_field in self.model_inputs.keys()
}
return super().predict(inputs=data, **kwargs)

def predict_xl(
self, data: dict[str, np.ndarray] | pd.DataFrame | Spectra, **kwargs
) -> tuple[dict[str, np.ndarray], dict[str, np.ndarray]]:
"""
Perform inference on the xl data using the Koina model.
This method allows you to perform inference on the provided input data using the configured Koina model. You can
choose to perform inference asynchronously (in parallel) or sequentially, depending on the value of the '_async'
parameter. If asynchronous inference is selected, the method will return when all inference tasks are complete.
Note: Ensure that the model and server are properly configured and that the input data matches the model's
input requirements.
:param data: A dictionary or dataframe containing input data for inference. For the dictionary, keys are input names,
and values are numpy arrays. In case of a dataframe, the input fields for the requested model must be present
in the column names.
:param kwargs: Additional params that are forwarded to super().predict
:return: A tuple of two dictionaries containing the model's predictions for the original and the switched
alpha/beta peptide order. Keys are output names, and values are numpy arrays representing the model's output.
:raises ValueError: If `data` is not of type `Spectra`, `pd.DataFrame`, or a dictionary.
Example::
model = Koina("Prosit_XL_CMS2_intensity")
input_data = {
"peptide_sequences_1": np.array(["PEPTIDEK" for _ in range(size)]),
"peptide_sequences_2": np.array(["PEPTIDEK" for _ in range(size)]),
"precursor_charges": np.array([2 for _ in range(size)]),
"collision_energies": np.array([20 for _ in range(size)]),
"fragmentation_types": np.array(["HCD" for _ in range(size)]),
"instrument_types": np.array(["QE" for _ in range(size)])
}
predictions_ab, predictions_ba = model.predict_xl(input_data)
"""
if isinstance(data, Spectra):
data = data.obs
if isinstance(data, pd.DataFrame):
data = {
input_field: data[[alternative_column_map_xl[input_field]]].to_numpy()
for input_field in self.model_inputs.keys()
}
prediction_ab = super().predict(inputs=data, debug=True, **kwargs)
data["peptide_sequences_1"], data["peptide_sequences_2"] = (
data["peptide_sequences_2"],
data["peptide_sequences_1"],
)
prediction_ba = super().predict(inputs=data, debug=True, **kwargs)

return prediction_ab, prediction_ba

raise ValueError("Input data must be of type Spectra, pd.DataFrame, or a dictionary.")
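An illustrative note on the two values returned by predict_xl (the variable names below are placeholders, not from the commit):

# For a Spectra or DataFrame input, predict_xl issues two Koina requests and returns both:
#   predictions_ab, predictions_ba = model.predict_xl(alignment_library)
# predictions_ab maps peptide_sequences_1/2 to MODIFIED_SEQUENCE_A/B;
# predictions_ba repeats the request with the two sequence columns swapped.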
