Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fixed dlib for koina #73

Merged
merged 1 commit into from
Sep 22, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions spectrum_io/spectral_library/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from . import digest
from .dlib import DLib
from .msp import MSP
from .spectral_library import SpectralLibrary
from .spectronaut import Spectronaut

logger = logging.getLogger(__name__)
122 changes: 52 additions & 70 deletions spectrum_io/spectral_library/dlib.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

import numpy as np
import pandas as pd
from spectrum_fundamentals.constants import PARTICLE_MASSES
from spectrum_fundamentals.mod_string import internal_to_mod_mass, internal_without_mods

from .spectral_library import SpectralLibrary
Expand All @@ -25,62 +26,9 @@
class DLib(SpectralLibrary):
"""Main to init a DLib obj."""

def __init__(
self,
precursor_mz: Union[List[float], np.ndarray],
precursor_charges: Union[List[int], np.ndarray],
modified_sequences: List[str],
retention_times: Union[List[float], np.ndarray],
fragmentmz: List[np.ndarray],
intensities: List[np.ndarray],
path: Union[str, Path],
min_intensity_threshold: Optional[float] = 0.05,
):
def _calculate_masked_values(self, fragmentmz: List[np.ndarray], intensities: List[np.ndarray]):
"""
Initializer for the DLib class.

:param precursor_mz: precursor mass to charge ratios
:param precursor_charges: precurosr charges
:param modified_sequences: modified sequences in internal format
:param retention_times: retention times
:param fragmentmz: mass to charge ratio of fragments
:param intensities: intensities
:param path: path to the file the dlib is written to
:param min_intensity_threshold: minimal intensity required when masking fragmentmz and intensities
"""
self.path = path
self.create_database(self.path)

# gather all values for the entries table and create pandas DataFrame
masked_values = self._calculate_masked_values(fragmentmz, intensities, min_intensity_threshold)
mass_mod_sequences = internal_to_mod_mass(modified_sequences)
sequences = internal_without_mods(modified_sequences)
data_list = [*masked_values, precursor_charges, mass_mod_sequences, sequences, retention_times, precursor_mz]
self.entries = pd.DataFrame(dict(zip(DLIB_COL_NAMES, data_list)))

# hardcoded entries that we currently do not use.
# Visit https://bitbucket.org/searleb/encyclopedia/wiki/EncyclopeDIA%20File%20Formats for dlib specs
self.entries["Copies"] = 1 # this is hardcoded for now and unused
self.entries["Score"] = 0
self.entries["CorrelationEncodedLength"] = None
self.entries["CorrelationArray"] = None
self.entries["RTInSecondsStart"] = None
self.entries["RTInSecondsStop"] = None
self.entries["MedianChromatogramEncodedLength"] = None
self.entries["MedianChromatogramArray"] = None
self.entries["SourceFile"] = "Prosit"

# gather all values for the p2p table and create pandas DataFrame
self.p2p = pd.DataFrame({"PeptideSeq": sequences, "isDecoy": False, "ProteinAccession": "unknown"})

@staticmethod
def _calculate_masked_values(
fragmentmz: List[np.ndarray],
intensities: List[np.ndarray],
intensity_min_threshold: Optional[float] = 0.05,
):
"""
Internal function called during __init__ that masks, filters, byte encodes, swaps and compresses fragmentmz \
Internal function that masks, filters, byte encodes, swaps and compresses fragmentmz \
and intensities.

This will produce the data for the following columns in this order:
Expand All @@ -90,7 +38,6 @@
- 'IntensityEncodedLength'.
:param fragmentmz: fragmentmz provided in __init__
:param intensities: intensities provided in __init__
:param intensity_min_threshold: minimum threshold for the intensity; default=0.05
:return: 4 lists as described above
"""
mz_bytes_list = []
Expand All @@ -99,8 +46,7 @@
i_lengths = []
for mz, i in zip(fragmentmz, intensities):
# mask to only existing peaks
mask = i >= intensity_min_threshold
print(mask)
mask = i >= self.min_intensity_threshold

Check warning on line 49 in spectrum_io/spectral_library/dlib.py

View check run for this annotation

Codecov / codecov/patch

spectrum_io/spectral_library/dlib.py#L49

Added line #L49 was not covered by tests
sort_index = np.argsort(mz[mask])
masked_mz_ordered = mz[mask][sort_index]
masked_i_ordered = i[mask][sort_index]
Expand Down Expand Up @@ -165,14 +111,10 @@
c.execute(sql_insert_meta, ["staleProteinMapping", "true"])
conn.commit()

def write(self, chunksize: Optional[Union[None, int]]):
"""
Writes the entries and p2p tables to file.

:param chunksize: optional size of chunks to insert at once
"""
self._write_entries(index=False, if_exists="append", method="multi", chunksize=chunksize)
self._write_p2p(index=False, if_exists="append", method="multi", chunksize=chunksize)
def write(self):
    """Write the entries table and the p2p table to file."""
    # Both tables are appended using the same set of to_sql options.
    to_sql_options = {
        "index": False,
        "if_exists": "append",
        "method": "multi",
        "chunksize": self.chunksize,
    }
    self._write_entries(**to_sql_options)
    self._write_p2p(**to_sql_options)

Check warning on line 117 in spectrum_io/spectral_library/dlib.py

View check run for this annotation

Codecov / codecov/patch

spectrum_io/spectral_library/dlib.py#L116-L117

Added lines #L116 - L117 were not covered by tests

def _write_entries(self, *args, **kwargs):
"""
Expand All @@ -181,7 +123,7 @@
:param args: forwarded to pandas.to_sql
:param kwargs: forwarded to pandas.to_sql
"""
conn = sqlite3.connect(self.path)
conn = sqlite3.connect(self.out_path)

Check warning on line 126 in spectrum_io/spectral_library/dlib.py

View check run for this annotation

Codecov / codecov/patch

spectrum_io/spectral_library/dlib.py#L126

Added line #L126 was not covered by tests
self.entries.to_sql(name="entries", con=conn, *args, **kwargs)
conn.commit()

Expand All @@ -192,10 +134,50 @@
:param args: forwarded to pandas.to_sql
:param kwargs: forwarded to pandas.to_sql
"""
conn = sqlite3.connect(self.path)
conn = sqlite3.connect(self.out_path)

Check warning on line 137 in spectrum_io/spectral_library/dlib.py

View check run for this annotation

Codecov / codecov/patch

spectrum_io/spectral_library/dlib.py#L137

Added line #L137 was not covered by tests
self.p2p.to_sql(name="peptidetoprotein", con=conn, *args, **kwargs)
conn.commit()

def prepare_spectrum(self):
"""Prepare spectrum."""
pass
"""Converts grpc output and metadata dataframe into dlib format."""
# precursor_mz: Union[List[float], np.ndarray],
# precursor_charges: Union[List[int], np.ndarray],
# modified_sequences: List[str],
# retention_times: Union[List[float], np.ndarray],
# fragmentmz: List[np.ndarray],
# intensities: List[np.ndarray],

intensities = self.grpc_output[list(self.grpc_output)[0]]["intensity"]
fragment_mz = self.grpc_output[list(self.grpc_output)[0]]["fragmentmz"]

Check warning on line 151 in spectrum_io/spectral_library/dlib.py

View check run for this annotation

Codecov / codecov/patch

spectrum_io/spectral_library/dlib.py#L150-L151

Added lines #L150 - L151 were not covered by tests
# annotation = self.grpc_output[list(self.grpc_output)[0]]["annotation"]
irt = self.grpc_output[list(self.grpc_output)[1]]
retention_times = irt.flatten()
modified_sequences = self.spectra_input["MODIFIED_SEQUENCE"]

Check warning on line 155 in spectrum_io/spectral_library/dlib.py

View check run for this annotation

Codecov / codecov/patch

spectrum_io/spectral_library/dlib.py#L153-L155

Added lines #L153 - L155 were not covered by tests

precursor_charges = self.spectra_input["PRECURSOR_CHARGE"]
precursor_masses = self.spectra_input["MASS"]
precursor_mz = (precursor_masses + (precursor_charges * PARTICLE_MASSES["PROTON"])) / precursor_charges

Check warning on line 159 in spectrum_io/spectral_library/dlib.py

View check run for this annotation

Codecov / codecov/patch

spectrum_io/spectral_library/dlib.py#L157-L159

Added lines #L157 - L159 were not covered by tests

self.create_database(self.out_path)

Check warning on line 161 in spectrum_io/spectral_library/dlib.py

View check run for this annotation

Codecov / codecov/patch

spectrum_io/spectral_library/dlib.py#L161

Added line #L161 was not covered by tests

# gather all values for the entries table and create pandas DataFrame
masked_values = self._calculate_masked_values(fragment_mz, intensities)
mass_mod_sequences = internal_to_mod_mass(modified_sequences)
sequences = internal_without_mods(modified_sequences)
data_list = [*masked_values, precursor_charges, mass_mod_sequences, sequences, retention_times, precursor_mz]
self.entries = pd.DataFrame(dict(zip(DLIB_COL_NAMES, data_list)))

Check warning on line 168 in spectrum_io/spectral_library/dlib.py

View check run for this annotation

Codecov / codecov/patch

spectrum_io/spectral_library/dlib.py#L164-L168

Added lines #L164 - L168 were not covered by tests

# hardcoded entries that we currently do not use.
# Visit https://bitbucket.org/searleb/encyclopedia/wiki/EncyclopeDIA%20File%20Formats for dlib specs
self.entries["Copies"] = 1 # this is hardcoded for now and unused
self.entries["Score"] = 0
self.entries["CorrelationEncodedLength"] = None
self.entries["CorrelationArray"] = None
self.entries["RTInSecondsStart"] = None
self.entries["RTInSecondsStop"] = None
self.entries["MedianChromatogramEncodedLength"] = None
self.entries["MedianChromatogramArray"] = None
self.entries["SourceFile"] = "Prosit"

Check warning on line 180 in spectrum_io/spectral_library/dlib.py

View check run for this annotation

Codecov / codecov/patch

spectrum_io/spectral_library/dlib.py#L172-L180

Added lines #L172 - L180 were not covered by tests

# gather all values for the p2p table and create pandas DataFrame
self.p2p = pd.DataFrame({"PeptideSeq": sequences, "isDecoy": False, "ProteinAccession": "unknown"})

Check warning on line 183 in spectrum_io/spectral_library/dlib.py

View check run for this annotation

Codecov / codecov/patch

spectrum_io/spectral_library/dlib.py#L183

Added line #L183 was not covered by tests
15 changes: 13 additions & 2 deletions spectrum_io/spectral_library/spectral_library.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,25 +9,36 @@
class SpectralLibrary:
"""Main to initialize a SpectralLibrary obj."""

def __init__(self, input_dataframe: pd.DataFrame, grpc_dict: dict, output_path: Union[str, Path]):
def __init__(
    self,
    input_dataframe: pd.DataFrame,
    grpc_dict: dict,
    output_path: Union[str, Path],
    min_intensity_threshold: Optional[float] = 0.05,
    chunksize: Optional[int] = None,
):
    """
    Store the prediction inputs and the output settings on the instance.

    :param input_dataframe: dataframe of sequences, charges, and masses of all library peptides
    :param grpc_dict: GRPC client output dictionary with spectrum, irt, and proteotypicity prediction
    :param output_path: path to output file including file name; a str is converted to a Path
    :param min_intensity_threshold: optional filter for low intensity peaks
    :param chunksize: optional chunksize for dlib
    """
    # Peptide metadata and raw prediction output.
    self.spectra_input, self.grpc_output = input_dataframe, grpc_dict
    # Plain strings become Path objects; any other value is stored unchanged.
    self.out_path = Path(output_path) if isinstance(output_path, str) else output_path
    # Settings used later when the library is written.
    self.min_intensity_threshold, self.chunksize = min_intensity_threshold, chunksize

def load(self):
    """Load predictions from hdf5 file (placeholder: the body contains no code yet)."""

@abstractmethod
def write(self, chunksize: Optional[Union[None, int]]):
def write(self):
"""Write predictions."""
pass

Expand Down
Loading