fixed dlib for koina
picciama committed Sep 22, 2023
1 parent 178b08f commit 78741e2
Showing 3 changed files with 66 additions and 72 deletions.
1 change: 1 addition & 0 deletions spectrum_io/spectral_library/__init__.py
@@ -4,6 +4,7 @@
from . import digest
from .dlib import DLib
from .msp import MSP
from .spectral_library import SpectralLibrary
from .spectronaut import Spectronaut

logger = logging.getLogger(__name__)
122 changes: 52 additions & 70 deletions spectrum_io/spectral_library/dlib.py
@@ -5,6 +5,7 @@

import numpy as np
import pandas as pd
from spectrum_fundamentals.constants import PARTICLE_MASSES
from spectrum_fundamentals.mod_string import internal_to_mod_mass, internal_without_mods

from .spectral_library import SpectralLibrary
@@ -25,62 +26,9 @@
class DLib(SpectralLibrary):
"""Main to init a DLib obj."""

def __init__(
self,
precursor_mz: Union[List[float], np.ndarray],
precursor_charges: Union[List[int], np.ndarray],
modified_sequences: List[str],
retention_times: Union[List[float], np.ndarray],
fragmentmz: List[np.ndarray],
intensities: List[np.ndarray],
path: Union[str, Path],
min_intensity_threshold: Optional[float] = 0.05,
):
def _calculate_masked_values(self, fragmentmz: List[np.ndarray], intensities: List[np.ndarray]):
"""
Initializer for the DLib class.
:param precursor_mz: precursor mass to charge ratios
:param precursor_charges: precursor charges
:param modified_sequences: modified sequences in internal format
:param retention_times: retention times
:param fragmentmz: mass to charge ratio of fragments
:param intensities: intensities
:param path: path to the file the dlib is written to
:param min_intensity_threshold: minimal intensity required when masking fragmentmz and intensities
"""
self.path = path
self.create_database(self.path)

# gather all values for the entries table and create pandas DataFrame
masked_values = self._calculate_masked_values(fragmentmz, intensities, min_intensity_threshold)
mass_mod_sequences = internal_to_mod_mass(modified_sequences)
sequences = internal_without_mods(modified_sequences)
data_list = [*masked_values, precursor_charges, mass_mod_sequences, sequences, retention_times, precursor_mz]
self.entries = pd.DataFrame(dict(zip(DLIB_COL_NAMES, data_list)))

# hardcoded entries that we currently do not use.
# Visit https://bitbucket.org/searleb/encyclopedia/wiki/EncyclopeDIA%20File%20Formats for dlib specs
self.entries["Copies"] = 1 # this is hardcorded for now and unused
self.entries["Score"] = 0
self.entries["CorrelationEncodedLength"] = None
self.entries["CorrelationArray"] = None
self.entries["RTInSecondsStart"] = None
self.entries["RTInSecondsStop"] = None
self.entries["MedianChromatogramEncodedLength"] = None
self.entries["MedianChromatogramArray"] = None
self.entries["SourceFile"] = "Prosit"

# gather all values for the p2p table and create pandas DataFrame
self.p2p = pd.DataFrame({"PeptideSeq": sequences, "isDecoy": False, "ProteinAccession": "unknown"})

@staticmethod
def _calculate_masked_values(
fragmentmz: List[np.ndarray],
intensities: List[np.ndarray],
intensity_min_threshold: Optional[float] = 0.05,
):
"""
Internal function called during __init__ that masks, filters, byte encodes, swaps and compresses fragmentmz \
Internal function that masks, filters, byte encodes, swaps and compresses fragmentmz \
and intensities.
This will produce the data for the following columns in this order:
@@ -90,7 +38,6 @@ def _calculate_masked_values(
- 'IntensityEncodedLength'.
:param fragmentmz: fragmentmz provided in __init__
:param intensities: intensities provided in __init__
:param intensity_min_threshold: minimum threshold for the intensity; default=0.05
:return: 4 lists as described above
"""
mz_bytes_list = []
@@ -99,8 +46,7 @@
i_lengths = []
for mz, i in zip(fragmentmz, intensities):
# mask to only existing peaks
mask = i >= intensity_min_threshold
print(mask)
mask = i >= self.min_intensity_threshold

sort_index = np.argsort(mz[mask])
masked_mz_ordered = mz[mask][sort_index]
masked_i_ordered = i[mask][sort_index]
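The remainder of this function is collapsed in the diff. As orientation only, here is a minimal sketch of the byte-encoding step the docstring describes; big-endian 8-byte doubles for masses, big-endian 4-byte floats for intensities, zlib compression, and the *EncodedLength values being the uncompressed byte lengths are all assumptions based on the EncyclopeDIA dlib wiki linked in this file, not the repository's actual code.

import zlib

import numpy as np


def encode_peaks(mz: np.ndarray, intensity: np.ndarray, threshold: float = 0.05):
    """Sketch: mask low-intensity peaks, sort by m/z, byte-swap and compress."""
    mask = intensity >= threshold
    order = np.argsort(mz[mask])
    mz_be = mz[mask][order].astype(">f8")        # big-endian doubles (assumed for MassArray)
    i_be = intensity[mask][order].astype(">f4")  # big-endian floats (assumed for IntensityArray)
    mz_bytes = zlib.compress(mz_be.tobytes())
    i_bytes = zlib.compress(i_be.tobytes())
    # Assumed column order: MassEncodedLength, MassArray, IntensityArray, IntensityEncodedLength
    return mz_be.nbytes, mz_bytes, i_bytes, i_be.nbytes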
@@ -165,14 +111,10 @@ def create_database(path: Union[str, Path]):
c.execute(sql_insert_meta, ["staleProteinMapping", "true"])
conn.commit()
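Most of create_database is collapsed above. For orientation, a hedged sketch of the tables this diff writes to (entries, peptidetoprotein, and a key/value metadata table); the table and column names come from this file and the linked dlib wiki, while the SQL types and statement layout are assumptions, not the repository's implementation.

import sqlite3


def create_database_sketch(path):
    # Assumed schema sketch only.
    conn = sqlite3.connect(path)
    c = conn.cursor()
    c.execute("CREATE TABLE IF NOT EXISTS metadata (Key TEXT, Value TEXT)")
    c.execute(
        "CREATE TABLE IF NOT EXISTS peptidetoprotein "
        "(PeptideSeq TEXT, isDecoy BOOLEAN, ProteinAccession TEXT)"
    )
    # The entries table additionally holds the columns from DLIB_COL_NAMES plus the
    # hardcoded ones assigned further down (Copies, Score, CorrelationEncodedLength, ...,
    # SourceFile); see the linked dlib wiki for the full specification.
    conn.commit()
    return conn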

def write(self, chunksize: Optional[Union[None, int]]):
"""
Writes the entries and p2p tables to file.
:param chunksize: optional size of chunks to insert at once
"""
self._write_entries(index=False, if_exists="append", method="multi", chunksize=chunksize)
self._write_p2p(index=False, if_exists="append", method="multi", chunksize=chunksize)
def write(self):
"""Writes the entries ad p2p table to file."""
self._write_entries(index=False, if_exists="append", method="multi", chunksize=self.chunksize)
self._write_p2p(index=False, if_exists="append", method="multi", chunksize=self.chunksize)
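Design note: with method="multi", pandas emits multi-row INSERT statements, so a chunksize is typically needed to keep each statement within SQLite's limit on bound parameters; it is now read from self.chunksize, which the SpectralLibrary constructor further down accepts, instead of being passed to write().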


def _write_entries(self, *args, **kwargs):
"""
@@ -181,7 +123,7 @@ def _write_entries(self, *args, **kwargs):
:param args: forwarded to pandas.to_sql
:param kwargs: forwarded to pandas.to_sql
"""
conn = sqlite3.connect(self.path)
conn = sqlite3.connect(self.out_path)

self.entries.to_sql(name="entries", con=conn, *args, **kwargs)
conn.commit()

@@ -192,10 +134,50 @@ def _write_p2p(self, *args, **kwargs):
:param args: forwarded to pandas.to_sql
:param kwargs: forwarded to pandas.to_sql
"""
conn = sqlite3.connect(self.path)
conn = sqlite3.connect(self.out_path)

self.p2p.to_sql(name="peptidetoprotein", con=conn, *args, **kwargs)
conn.commit()

def prepare_spectrum(self):
"""Prepare spectrum."""
pass
"""Converts grpc output and metadata dataframe into dlib format."""
# precursor_mz: Union[List[float], np.ndarray],
# precursor_charges: Union[List[int], np.ndarray],
# modified_sequences: List[str],
# retention_times: Union[List[float], np.ndarray],
# fragmentmz: List[np.ndarray],
# intensities: List[np.ndarray],

intensities = self.grpc_output[list(self.grpc_output)[0]]["intensity"]
fragment_mz = self.grpc_output[list(self.grpc_output)[0]]["fragmentmz"]

# annotation = self.grpc_output[list(self.grpc_output)[0]]["annotation"]
irt = self.grpc_output[list(self.grpc_output)[1]]
retention_times = irt.flatten()
modified_sequences = self.spectra_input["MODIFIED_SEQUENCE"]


precursor_charges = self.spectra_input["PRECURSOR_CHARGE"]
precursor_masses = self.spectra_input["MASS"]
precursor_mz = (precursor_masses + (precursor_charges * PARTICLE_MASSES["PROTON"])) / precursor_charges
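For example, a peptide with a neutral mass of 1000.5 Da and charge 2 gives (1000.5 + 2 * 1.00728) / 2 ≈ 501.26 m/z, assuming PARTICLE_MASSES["PROTON"] is approximately 1.00728 Da.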


self.create_database(self.out_path)


# gather all values for the entries table and create pandas DataFrame
masked_values = self._calculate_masked_values(fragment_mz, intensities)
mass_mod_sequences = internal_to_mod_mass(modified_sequences)
sequences = internal_without_mods(modified_sequences)
data_list = [*masked_values, precursor_charges, mass_mod_sequences, sequences, retention_times, precursor_mz]
self.entries = pd.DataFrame(dict(zip(DLIB_COL_NAMES, data_list)))


# hardcoded entries that we currently do not use.
# Visit https://bitbucket.org/searleb/encyclopedia/wiki/EncyclopeDIA%20File%20Formats for dlib specs
self.entries["Copies"] = 1 # this is hardcorded for now and unused
self.entries["Score"] = 0
self.entries["CorrelationEncodedLength"] = None
self.entries["CorrelationArray"] = None
self.entries["RTInSecondsStart"] = None
self.entries["RTInSecondsStop"] = None
self.entries["MedianChromatogramEncodedLength"] = None
self.entries["MedianChromatogramArray"] = None
self.entries["SourceFile"] = "Prosit"


# gather all values for the p2p table and create pandas DataFrame
self.p2p = pd.DataFrame({"PeptideSeq": sequences, "isDecoy": False, "ProteinAccession": "unknown"})

15 changes: 13 additions & 2 deletions spectrum_io/spectral_library/spectral_library.py
@@ -9,25 +9,36 @@
class SpectralLibrary:
"""Main to initialze a SpectralLibrary obj."""

def __init__(self, input_dataframe: pd.DataFrame, grpc_dict: dict, output_path: Union[str, Path]):
def __init__(
self,
input_dataframe: pd.DataFrame,
grpc_dict: dict,
output_path: Union[str, Path],
min_intensity_threshold: Optional[float] = 0.05,
chunksize: Optional[int] = None,
):
"""
Initialize a SpectralLibrary obj.
:param input_dataframe: dataframe of sequences, charges, and masses of all library peptides
:param grpc_dict: GRPC client output dictionary with spectrum, irt, and proteotypicity prediction
:param output_path: path to output file including file name
:param min_intensity_threshold: minimum intensity a peak must have to be kept (default 0.05)
:param chunksize: optional number of rows per batch when writing tables to the dlib file
"""
if isinstance(output_path, str):
output_path = Path(output_path)
self.spectra_input = input_dataframe
self.grpc_output = grpc_dict
self.out_path = output_path
self.min_intensity_threshold = min_intensity_threshold
self.chunksize = chunksize
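A minimal usage sketch of the reworked interface, assuming a DLib instance and inputs shaped as sketched in the dlib.py diff above; the file name and chunksize value are placeholders.

from spectrum_io.spectral_library import DLib

lib = DLib(
    input_dataframe=input_dataframe,  # columns MODIFIED_SEQUENCE, PRECURSOR_CHARGE, MASS
    grpc_dict=grpc_dict,              # prediction output as sketched above
    output_path="library.dlib",
    min_intensity_threshold=0.05,     # peaks below this intensity are masked out
    chunksize=1000,                   # rows per batch forwarded to pandas.to_sql
)
lib.prepare_spectrum()  # builds the entries and peptidetoprotein tables and creates the SQLite file
lib.write()             # appends both tables via to_sql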

def load(self):
"""Load predictions from hdf5 file."""

@abstractmethod
def write(self, chunksize: Optional[Union[None, int]]):
def write(self):
"""Write predictions."""
pass
