Commit
Merge pull request #293 from wilhelm-lab/release/0.8.3
Release/0.8.3
mostafakalhor authored Oct 30, 2024
2 parents 280a2b7 + eba7526 commit df3ce03
Showing 24 changed files with 462,592 additions and 662 deletions.
2 changes: 1 addition & 1 deletion .cookietemple.yml
@@ -15,5 +15,5 @@ full_name: Victor Giurcoiu
email: [email protected]
project_name: oktoberfest
project_short_description: Public repo oktoberfest
version: 0.8.2
version: 0.8.3
license: MIT
4 changes: 2 additions & 2 deletions .github/release-drafter.yml
@@ -1,5 +1,5 @@
name-template: "0.8.2 🌈" # <<COOKIETEMPLE_FORCE_BUMP>>
tag-template: 0.8.2 # <<COOKIETEMPLE_FORCE_BUMP>>
name-template: "0.8.3 🌈" # <<COOKIETEMPLE_FORCE_BUMP>>
tag-template: 0.8.3 # <<COOKIETEMPLE_FORCE_BUMP>>
exclude-labels:
- "skip-changelog"

2 changes: 1 addition & 1 deletion cookietemple.cfg
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 0.8.2
current_version = 0.8.3

[bumpversion_files_whitelisted]
init_file = oktoberfest/__init__.py
4 changes: 2 additions & 2 deletions docs/conf.py
@@ -55,9 +55,9 @@
# the built documents.
#
# The short X.Y version.
version = "0.8.2"
version = "0.8.3"
# The full version, including alpha/beta/rc tags.
release = "0.8.2"
release = "0.8.3"

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
2 changes: 1 addition & 1 deletion oktoberfest/__init__.py
@@ -5,7 +5,7 @@
__author__ = """The Oktoberfest development team (Wilhelmlab at Technical University of Munich)"""
__copyright__ = f"Copyright {datetime.now():%Y}, Wilhelmlab at Technical University of Munich"
__license__ = "MIT"
__version__ = "0.8.2"
__version__ = "0.8.3"

import logging.handlers
import sys
86 changes: 77 additions & 9 deletions oktoberfest/data/spectra.py
@@ -26,34 +26,53 @@ class FragmentType(Enum):
"""FragmentType class to enumerate pred, raw, and mz."""

PRED = 1
RAW = 2
MZ = 3
PRED_A = 2
PRED_B = 3
RAW = 4
RAW_A = 5
RAW_B = 6
MZ = 7
MZ_A = 8
MZ_B = 9


class Spectra(anndata.AnnData):
"""Main to init spectra data."""

INTENSITY_COLUMN_PREFIX = "INTENSITY_RAW"
INTENSITY_COLUMN_PREFIX_A = "INTENSITY_RAW_A"
INTENSITY_COLUMN_PREFIX_B = "INTENSITY_RAW_B"
INTENSITY_PRED_PREFIX = "INTENSITY_PRED"
INTENSITY_PRED_PREFIX_A = "INTENSITY_PRED_A"
INTENSITY_PRED_PREFIX_B = "INTENSITY_PRED_B"
MZ_COLUMN_PREFIX = "MZ_RAW"
MZ_COLUMN_PREFIX_A = "MZ_RAW_A"
MZ_COLUMN_PREFIX_B = "MZ_RAW_B"
INTENSITY_PRED_LAYER_NAME = "pred_int"
INTENSITY_PRED_LAYER_NAME_A = "pred_int_A"
INTENSITY_PRED_LAYER_NAME_B = "pred_int_B"
INTENSITY_LAYER_NAME = "raw_int"
INTENSITY_LAYER_NAME_A = "raw_int_A"
INTENSITY_LAYER_NAME_B = "raw_int_B"
MZ_LAYER_NAME = "mz"
MZ_LAYER_NAME_A = "mz_A"
MZ_LAYER_NAME_B = "mz_B"
COLUMNS_FRAGMENT_ION = ["Y1+", "Y1++", "Y1+++", "B1+", "B1++", "B1+++"]
MAX_CHARGE = 3

@staticmethod
def _gen_vars_df(ion_types: list[str] = c.FRAGMENTATION_TO_IONS_BY_PAIRS["HCD"]) -> pd.DataFrame:
def _gen_vars_df(ion_types: list[str] = c.FRAGMENTATION_TO_IONS_BY_PAIRS["HCD"], xl: bool = False) -> pd.DataFrame:
"""
Create annotation dataframe for vars in AnnData object.
:param ion_types: ion types that are expected to be in the spectra
:param xl: crosslinked or linear peptide
:return: pd.Dataframe of fragment annotations
"""
df = pd.DataFrame(
[
{"ion": f"{ion_type}{pos}+{charge}", "num": pos, "type": ion_type, "charge": charge}
for pos in c.POSITIONS
for pos in (c.POSITIONS_XL if xl else c.POSITIONS)
for ion_type in ion_types
for charge in c.CHARGES
]
@@ -62,24 +81,32 @@ def _gen_vars_df(ion_types: list[str] = c.FRAGMENTATION_TO_IONS_BY_PAIRS["HCD"])
return df

@staticmethod
def _gen_column_names(fragment_type: FragmentType) -> list[str]:
def _gen_column_names(fragment_type: FragmentType, xl: bool = False) -> list[str]:
"""
Get column names of the spectra data.
:param fragment_type: choose predicted, raw, or mz
:param xl: crosslinked or linear peptide
:return: A list of column names
"""
prefix = Spectra._resolve_prefix(fragment_type)
columns = []
for i in range(1, 30):
if xl:
max_range = 59
else:
max_range = 30
for i in range(1, max_range):
for column in Spectra.COLUMNS_FRAGMENT_ION:
columns.append(prefix + "_" + column.replace("1", str(i)))
return columns

@staticmethod
def _resolve_prefix(fragment_type: FragmentType) -> str:
"""
Resolve prefix given fragment type (1 for pred, 2 for raw, 3 for mz).
Resolve prefix given fragment type.
(1 for pred, 2 for xl_pred_a, 3 for xl_pred_b, 4 for raw, 5 for xl_raw_a,
6 for xl_raw_b, 7 for mz, 8 for xl_mz_a, 9 for xl_mz_b).
:param fragment_type: choose predicted, raw, or mz
:return: prefix as string
@@ -88,9 +115,21 @@ def _resolve_prefix(fragment_type: FragmentType) -> str:
if fragment_type.value == 1:
prefix = Spectra.INTENSITY_PRED_PREFIX
elif fragment_type.value == 2:
prefix = Spectra.INTENSITY_COLUMN_PREFIX
prefix = Spectra.INTENSITY_PRED_PREFIX_A
elif fragment_type.value == 3:
prefix = Spectra.INTENSITY_PRED_PREFIX_B
elif fragment_type.value == 4:
prefix = Spectra.INTENSITY_COLUMN_PREFIX
elif fragment_type.value == 5:
prefix = Spectra.INTENSITY_COLUMN_PREFIX_A
elif fragment_type.value == 6:
prefix = Spectra.INTENSITY_COLUMN_PREFIX_B
elif fragment_type.value == 7:
prefix = Spectra.MZ_COLUMN_PREFIX
elif fragment_type.value == 8:
prefix = Spectra.MZ_COLUMN_PREFIX_A
else:
prefix = Spectra.MZ_COLUMN_PREFIX_B
return prefix

@staticmethod
@@ -104,9 +143,21 @@ def _resolve_layer_name(fragment_type: FragmentType) -> str:
if fragment_type.value == 1:
layer = Spectra.INTENSITY_PRED_LAYER_NAME
elif fragment_type.value == 2:
layer = Spectra.INTENSITY_LAYER_NAME
layer = Spectra.INTENSITY_PRED_LAYER_NAME_A
elif fragment_type.value == 3:
layer = Spectra.INTENSITY_PRED_LAYER_NAME_B
elif fragment_type.value == 4:
layer = Spectra.INTENSITY_LAYER_NAME
elif fragment_type.value == 5:
layer = Spectra.INTENSITY_LAYER_NAME_A
elif fragment_type.value == 6:
layer = Spectra.INTENSITY_LAYER_NAME_B
elif fragment_type.value == 7:
layer = Spectra.MZ_LAYER_NAME
elif fragment_type.value == 8:
layer = Spectra.MZ_LAYER_NAME_A
elif fragment_type.value == 9:
layer = Spectra.MZ_LAYER_NAME_B
return layer

def __getitem__(self, index: Index):
@@ -191,6 +242,23 @@ def add_intensities(self, intensities: np.ndarray, annotation: np.ndarray, fragm
layer = self._resolve_layer_name(fragment_type)
self.layers[layer] = csr_matrix(sparse_intensity_matrix)

def add_intensities_without_mapping(self, intensities: np.ndarray, fragment_type: FragmentType):
"""
Add intensities and convert them to a sparse matrix.
This function takes a numpy array containing intensities.
The intensity array is expected to have the same shape as this object and will be added to
the respective layer without checking the order of fragment annotations.
:param intensities: intensity numpy array to add with shapes (n x m)
:param fragment_type: the type of intensities to add. Can be FragmentType.RAW or FragmentType.PRED.
"""
intensities[intensities == 0] = c.EPSILON
intensities[intensities == -1] = 0.0
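# Illustration of the masking above: a row such as [0.0, 0.3, -1.0] becomes
# [EPSILON, 0.3, 0.0], so true zeros survive the sparse conversion below as
# EPSILON while -1 padding values are dropped from the CSR matrix.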

layer = self._resolve_layer_name(fragment_type)
self.layers[layer] = csr_matrix(intensities)

def add_list_of_predicted_intensities(
self,
intensities: list[np.ndarray],
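A minimal usage sketch of the static helpers extended in this file (import path assumed from the file location; the values in the comments follow from the constants and ranges shown in this diff):

from oktoberfest.data.spectra import FragmentType, Spectra

# prefix and layer resolution for the new crosslink (A/B) fragment types
Spectra._resolve_prefix(FragmentType.PRED_A)        # "INTENSITY_PRED_A"
Spectra._resolve_layer_name(FragmentType.RAW_B)     # "raw_int_B"

# column names: 29 positions x 6 ion columns for linear peptides,
# 58 positions x 6 ion columns with xl=True
linear_cols = Spectra._gen_column_names(FragmentType.PRED)          # 174 names
xl_cols = Spectra._gen_column_names(FragmentType.PRED_A, xl=True)   # 348 names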
38 changes: 28 additions & 10 deletions oktoberfest/predict/alignment.py
@@ -9,20 +9,23 @@
logger = logging.getLogger(__name__)


def _prepare_alignment_df(library: Spectra, ce_range: tuple[int, int], group_by_charge: bool = False) -> Spectra:
def _prepare_alignment_df(
library: Spectra, ce_range: tuple[int, int], group_by_charge: bool = False, xl: bool = False
) -> Spectra:
"""
Prepare an alignment DataFrame from the given Spectra library.
This function creates an alignment DataFrame by removing decoys and non-HCD-fragmented spectra
from the input library, selecting the top 1000 (or however many are available if <1000) highest-scoring spectra, and
repeating the DataFrame for each collision energy (CE) in the given range.
This function creates an alignment DataFrame by removing decoys and non-HCD-fragmented spectra
from the input library, selecting the top 1000 highest-scoring spectra for linear peptides (top 20 for cross-linked peptides),
and repeating the DataFrame for each collision energy (CE) in the given range.
:param library: the library to be propagated
:param ce_range: the min and max CE to be propagated for alignment in the dataframe
:param group_by_charge: if true, select the top 1000 spectra independently for each precursor charge
:param xl: if true, select the top 20 spectra for cross-linked peptides
:return: a library that is modified according to the description above
"""
top_n = 1000
top_n = 1000 if not xl else 20

if group_by_charge:
groups = ["RAW_FILE", "PRECURSOR_CHARGE"]
@@ -48,16 +51,31 @@ def _prepare_alignment_df(library: Spectra, ce_range: tuple[int, int], group_by_
return alignment_library


def _alignment(alignment_library: Spectra):
def _alignment(alignment_library: Spectra, xl: bool = False):
"""
Perform the alignment of predicted versus raw intensities.
The function calculates the spectral angle between predicted and observed fragment intensities and
adds it as a column to the alignment library.
:param alignment_library: the library to perform the alignment on
:param xl: crosslinked or linear peptide
"""
pred_intensity = alignment_library.get_matrix(FragmentType.PRED)
raw_intensity = alignment_library.get_matrix(FragmentType.RAW)
sm = SimilarityMetrics(pred_intensity, raw_intensity)
alignment_library.add_column(sm.spectral_angle(raw_intensity, pred_intensity, 0), "SPECTRAL_ANGLE")
if xl:
pred_intensity_a = alignment_library.get_matrix(FragmentType.PRED_A)
pred_intensity_b = alignment_library.get_matrix(FragmentType.PRED_B)
raw_intensity_a = alignment_library.get_matrix(FragmentType.RAW_A)
raw_intensity_b = alignment_library.get_matrix(FragmentType.RAW_B)
sm_a = SimilarityMetrics(pred_intensity_a, raw_intensity_a)
sm_b = SimilarityMetrics(pred_intensity_b, raw_intensity_b)
alignment_library.add_column(sm_a.spectral_angle(raw_intensity_a, pred_intensity_a, 0), "SPECTRAL_ANGLE_A")
alignment_library.add_column(sm_b.spectral_angle(raw_intensity_b, pred_intensity_b, 0), "SPECTRAL_ANGLE_B")
alignment_library.add_column(
(alignment_library.obs["SPECTRAL_ANGLE_A"] + alignment_library.obs["SPECTRAL_ANGLE_B"]) / 2,
"SPECTRAL_ANGLE",
)
else:
pred_intensity = alignment_library.get_matrix(FragmentType.PRED)
raw_intensity = alignment_library.get_matrix(FragmentType.RAW)
sm = SimilarityMetrics(pred_intensity, raw_intensity)
alignment_library.add_column(sm.spectral_angle(raw_intensity, pred_intensity, 0), "SPECTRAL_ANGLE")
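A small numeric sketch of how the cross-linked spectral angle is combined in _alignment above (toy values, numpy only):

import numpy as np

# toy spectral angles for three cross-linked PSMs
sa_a = np.array([0.82, 0.75, 0.90])    # SPECTRAL_ANGLE_A
sa_b = np.array([0.78, 0.71, 0.88])    # SPECTRAL_ANGLE_B

# combined SPECTRAL_ANGLE column added to the alignment library
spectral_angle = (sa_a + sa_b) / 2     # -> approximately [0.80, 0.73, 0.89]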
15 changes: 15 additions & 0 deletions oktoberfest/predict/dlomix.py
@@ -294,3 +294,18 @@ def predict(self, data: Spectra, dataset_name: str, keep_dataset: bool = True) -
shutil.rmtree(self.output_path / dataset_name)

return {self.output_name: preds, "annotation": annotations}

def predict_xl(
self, data: Spectra, dataset_name: str, keep_dataset: bool = True
) -> tuple[dict[str, np.ndarray], dict[str, np.ndarray]]:
"""
Perform inference on the xl data using the DLOmix model.
This is currently not implemented.
:param data: spectral library to predict features for
:param dataset_name: Name of the dataset for storing processed files for DLomix
:param keep_dataset: Whether to keep or discard the pre-processed dataset after inference
:raises NotImplementedError: Always.
"""
raise NotImplementedError("This method is not implemented")
70 changes: 70 additions & 0 deletions oktoberfest/predict/koina.py
@@ -22,6 +22,26 @@
"instrument_types": "INSTRUMENT_TYPES",
}

alternative_column_map_xl = {
"peptide_sequences_1": "MODIFIED_SEQUENCE_A",
"peptide_sequences_2": "MODIFIED_SEQUENCE_B",
"precursor_charges": "PRECURSOR_CHARGE",
"collision_energies": "COLLISION_ENERGY",
"fragmentation_types": "FRAGMENTATION",
"instrument_types": "INSTRUMENT_TYPES",
}

# Create a new mapping with switched keys and values for peptide_sequences_1 and peptide_sequences_2
alternative_column_map_xl_switched = {
"peptide_sequences_1": "MODIFIED_SEQUENCE_B",
"peptide_sequences_2": "MODIFIED_SEQUENCE_A",
**{
key: value
for key, value in alternative_column_map_xl.items()
if key not in ["peptide_sequences_1", "peptide_sequences_2"]
},
}
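# Net effect of the switch above, spelled out:
#   peptide_sequences_1 -> MODIFIED_SEQUENCE_B
#   peptide_sequences_2 -> MODIFIED_SEQUENCE_A
#   all other keys keep their mapping from alternative_column_map_xl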


class Koina(_KoinaGRPC):
"""Extension of the Koina GRPC class in koinapy, to add required logic for Oktoberfest."""
@@ -62,3 +82,53 @@ def predict(self, data: dict[str, np.ndarray] | pd.DataFrame | Spectra, **kwargs
for input_field in self.model_inputs.keys()
}
return super().predict(inputs=data, **kwargs)

def predict_xl(
self, data: dict[str, np.ndarray] | pd.DataFrame | Spectra, **kwargs
) -> tuple[dict[str, np.ndarray], dict[str, np.ndarray]]:
"""
Perform inference on the xl data using the Koina model.
This method allows you to perform inference on the provided input data using the configured Koina model. You can
choose to perform inference asynchronously (in parallel) or sequentially, depending on the value of the '_async'
parameter. If asynchronous inference is selected, the method will return when all inference tasks are complete.
Note: Ensure that the model and server are properly configured and that the input data matches the model's
input requirements.
:param data: A dictionary or dataframe containing input data for inference. For the dictionary, keys are input names,
and values are numpy arrays. In case of a dataframe, the input fields for the requested model must be present
in the column names.
:param kwargs: Additional params that are forwarded to super().predict
:return: A tuple of two dictionaries containing the model's predictions for the original and the switched
alpha/beta peptide order. Keys are output names, and values are numpy arrays representing the model's output.
:raises ValueError: If `data` is not of type `Spectra`, `pd.DataFrame`, or a dictionary.
Example::
model = Koina("Prosit_XL_CMS2_intensity")
input_data = {
"peptide_sequences_1": np.array(["PEPTIDEK" for _ in range(size)]),
"peptide_sequences_2": np.array(["PEPTIDEK" for _ in range(size)]),
"precursor_charges": np.array([2 for _ in range(size)]),
"collision_energies": np.array([20 for _ in range(size)]),
"fragmentation_types": np.array(["HCD" for _ in range(size)]),
"instrument_types": np.array(["QE" for _ in range(size)])
}
predictions_ab, predictions_ba = model.predict_xl(input_data)
"""
if isinstance(data, Spectra):
data = data.obs
if isinstance(data, pd.DataFrame):
data = {
input_field: data[[alternative_column_map_xl[input_field]]].to_numpy()
for input_field in self.model_inputs.keys()
}
prediction_ab = super().predict(inputs=data, debug=True, **kwargs)
data["peptide_sequences_1"], data["peptide_sequences_2"] = (
data["peptide_sequences_2"],
data["peptide_sequences_1"],
)
prediction_ba = super().predict(inputs=data, debug=True, **kwargs)

return prediction_ab, prediction_ba

raise ValueError("Input data must be of type Spectra, pd.DataFrame, or a dictionary.")
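An illustrative note on the two values returned by predict_xl (the variable names below are placeholders, not from the commit):

# For a Spectra or DataFrame input, predict_xl issues two Koina requests and returns both:
#   predictions_ab, predictions_ba = model.predict_xl(alignment_library)
# predictions_ab maps peptide_sequences_1/2 to MODIFIED_SEQUENCE_A/B;
# predictions_ba repeats the request with the two sequence columns swapped.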
