Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/dlib support #99

Merged
merged 9 commits into from
Apr 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
887 changes: 317 additions & 570 deletions poetry.lock

Large diffs are not rendered by default.

3 changes: 1 addition & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,7 @@ h5py = "^3.1.0"
pymzml = "^2.5.0"
pyteomics = "^4.3.3"
lxml= '^4.5.2'
tables = "^3.6.1"
spectrum-fundamentals = ">=0.5.1,<0.6.0"
spectrum-fundamentals = ">=0.5.2,<0.6.0"
alphatims = "^1.0.8"
sortedcontainers = "^2.4.0"

Expand Down
16 changes: 5 additions & 11 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,12 +1,10 @@
alphatims==1.0.8 ; python_version >= "3.8" and python_full_version < "3.11.0"
blosc2==2.0.0 ; python_version >= "3.8" and python_full_version < "3.11.0"
click==8.1.7 ; python_version >= "3.8" and python_full_version < "3.11.0"
colorama==0.4.6 ; python_version >= "3.8" and python_full_version < "3.11.0" and platform_system == "Windows"
contourpy==1.1.1 ; python_version >= "3.8" and python_full_version < "3.11.0"
cycler==0.12.1 ; python_version >= "3.8" and python_full_version < "3.11.0"
cython==3.0.10 ; python_version >= "3.8" and python_full_version < "3.11.0"
fonttools==4.51.0 ; python_version >= "3.8" and python_full_version < "3.11.0"
h5py==3.10.0 ; python_version >= "3.8" and python_full_version < "3.11.0"
h5py==3.11.0 ; python_version >= "3.8" and python_full_version < "3.11.0"
importlib-metadata==7.1.0 ; python_version >= "3.8" and python_version < "3.9"
importlib-resources==6.4.0 ; python_version >= "3.8" and python_version < "3.10"
joblib==1.4.0 ; python_version >= "3.8" and python_full_version < "3.11.0"
Expand All @@ -17,31 +15,27 @@ markdown-it-py==3.0.0 ; python_version >= "3.8" and python_full_version < "3.11.
matplotlib==3.7.5 ; python_version >= "3.8" and python_full_version < "3.11.0"
mdurl==0.1.2 ; python_version >= "3.8" and python_full_version < "3.11.0"
moepy==1.1.4 ; python_full_version >= "3.8.0" and python_full_version < "3.11.0"
msgpack==1.0.8 ; python_version >= "3.8" and python_full_version < "3.11.0"
numba==0.58.1 ; python_version >= "3.8" and python_full_version < "3.11.0"
numexpr==2.8.6 ; python_version >= "3.8" and python_full_version < "3.11.0"
numpy==1.24.4 ; python_version >= "3.8" and python_full_version < "3.11.0"
packaging==24.0 ; python_version >= "3.8" and python_full_version < "3.11.0"
pandas==1.5.3 ; python_version >= "3.8" and python_full_version < "3.11.0"
pillow==10.3.0 ; python_version >= "3.8" and python_full_version < "3.11.0"
psutil==5.9.8 ; python_version >= "3.8" and python_full_version < "3.11.0"
py-cpuinfo==9.0.0 ; python_version >= "3.8" and python_full_version < "3.11.0"
pygments==2.17.2 ; python_full_version >= "3.8.0" and python_full_version < "3.11.0"
pymzml==2.5.6 ; python_full_version >= "3.8.0" and python_full_version < "3.11.0"
pymzml==2.5.9 ; python_full_version >= "3.8.0" and python_full_version < "3.11.0"
pyparsing==3.1.2 ; python_version >= "3.8" and python_full_version < "3.11.0"
pyteomics==4.7.1 ; python_full_version >= "3.8.0" and python_full_version < "3.11.0"
pyteomics==4.7.2 ; python_full_version >= "3.8.0" and python_full_version < "3.11.0"
python-dateutil==2.9.0.post0 ; python_version >= "3.8" and python_full_version < "3.11.0"
pytz==2024.1 ; python_version >= "3.8" and python_full_version < "3.11.0"
pyyaml==6.0.1 ; python_full_version >= "3.8.0" and python_full_version < "3.11.0"
pyzstd==0.15.10 ; python_version >= "3.8" and python_full_version < "3.11.0"
regex==2023.12.25 ; python_full_version >= "3.8.0" and python_full_version < "3.11.0"
regex==2024.4.16 ; python_full_version >= "3.8.0" and python_full_version < "3.11.0"
rich==13.7.1 ; python_full_version >= "3.8.0" and python_full_version < "3.11.0"
scikit-learn==1.3.2 ; python_version >= "3.8" and python_full_version < "3.11.0"
scipy==1.10.1 ; python_version >= "3.8" and python_full_version < "3.11.0"
six==1.16.0 ; python_version >= "3.8" and python_full_version < "3.11.0"
sortedcontainers==2.4.0 ; python_full_version >= "3.8.0" and python_full_version < "3.11.0"
spectrum-fundamentals==0.5.0 ; python_full_version >= "3.8.0" and python_full_version < "3.11.0"
tables==3.8.0 ; python_version >= "3.8" and python_full_version < "3.11.0"
spectrum-fundamentals==0.5.2 ; python_full_version >= "3.8.0" and python_full_version < "3.11.0"
threadpoolctl==3.4.0 ; python_version >= "3.8" and python_full_version < "3.11.0"
tqdm==4.66.2 ; python_version >= "3.8" and python_full_version < "3.11.0"
typing-extensions==4.11.0 ; python_version >= "3.8" and python_version < "3.9"
Expand Down
172 changes: 95 additions & 77 deletions spectrum_io/spectral_library/dlib.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import sqlite3
import zlib
from pathlib import Path
from typing import List, Optional, Union
from typing import IO, Dict, Union

import numpy as np
import pandas as pd
Expand All @@ -26,7 +26,16 @@
class DLib(SpectralLibrary):
"""Main to init a DLib obj."""

def _calculate_masked_values(self, fragmentmz: List[np.ndarray], intensities: List[np.ndarray]):
def _initialize(self, out: Union[IO, sqlite3.Connection]):
    """
    Prepare the output handle before any spectra are written.

    Rejects every handle that is not a sqlite3 connection and, in write mode ("w"), creates the
    dlib tables through DLib._create_database.

    :param out: handle to initialize; must be an open sqlite3 connection
    :raises TypeError: if out is not a sqlite3 connection (e.g. a text file handle)
    """
    # isinstance(out, IO) is always False for real file objects, because the io classes do not
    # derive from typing.IO; testing for the one supported type makes the guard effective.
    if not isinstance(out, sqlite3.Connection):
        raise TypeError("Not supported. Use msp/spectronaut if you want to write a text file.")
    if self.mode == "w":
        DLib._create_database(out)

def _get_handle(self):
    """
    Open the sqlite3 connection that serves as output handle for this library.

    NOTE(review): a sqlite3 connection used in a ``with`` statement only commits or rolls back
    on exit, it is not closed; confirm whether callers need to close it explicitly.

    :return: sqlite3 connection to the database at self.out_path
    """
    connection = sqlite3.connect(self.out_path)
    return connection

def _calculate_masked_values(self, fragmentmz: np.ndarray, intensities: np.ndarray):
"""
Internal function that masks, filters, byte encodes, swaps and compresses fragmentmz \
and intensities.
Expand All @@ -44,9 +53,10 @@ def _calculate_masked_values(self, fragmentmz: List[np.ndarray], intensities: Li
i_bytes_list = []
mz_lengths = []
i_lengths = []
for mz, i in zip(fragmentmz, intensities):

full_mask = self._fragment_filter_passed(fragmentmz, intensities)
for mz, i, mask in zip(fragmentmz, intensities, np.array(full_mask)):
# mask to only existing peaks
mask = i >= self.min_intensity_threshold
sort_index = np.argsort(mz[mask])
masked_mz_ordered = mz[mask][sort_index]
masked_i_ordered = i[mask][sort_index]
Expand All @@ -59,50 +69,54 @@ def _calculate_masked_values(self, fragmentmz: List[np.ndarray], intensities: Li
bytes_mz = bytes(masked_mz_ordered)
bytes_i = bytes(masked_i_ordered)
mz_bytes_list.append(zlib.compress(bytes_mz))
i_bytes_list.append(zlib.compress(bytes(bytes_i)))
i_bytes_list.append(zlib.compress(bytes_i))
mz_lengths.append(len(bytes_mz))
i_lengths.append(len(bytes_i))
return mz_bytes_list, i_bytes_list, mz_lengths, i_lengths

@staticmethod
def create_database(path: Union[str, Path]):
def _create_database(conn: sqlite3.Connection):
"""
Creates the database file with prefab tables entries, peptidetoprotein (p2p) and metadata, according to the \
dlib specification.

:param path: specifies the path of the created database file
:param conn: open sqlite3 connection to the database in which the tables are created
"""
sql_create_entries = """
CREATE TABLE entries
( PrecursorMz double not null,
PrecursorCharge int not null,
PeptideModSeq string not null,
PeptideSeq string not null,
Copies int not null,
RTInSeconds double not null,
Score double not null,
MassEncodedLength int not null,
MassArray blob not null,
IntensityEncodedLength int not null,
IntensityArray blob not null,
CorrelationEncodedLength int,
CorrelationArray blob,
RTInSecondsStart double,
RTInSecondsStop double,
MedianChromatogramEncodedLength int,
MedianChromatogramArray blob,
SourceFile string not null
-- Library spectra table; Copies, Score and SourceFile fall back to their defaults when not supplied.
CREATE TABLE IF NOT EXISTS entries
(
    PrecursorMz REAL NOT NULL,
    PrecursorCharge INTEGER NOT NULL,
    PeptideModSeq TEXT NOT NULL,
    PeptideSeq TEXT NOT NULL,
    Copies INTEGER NOT NULL DEFAULT 1,
    RTInSeconds REAL NOT NULL,
    Score REAL NOT NULL DEFAULT 0,
    MassEncodedLength INTEGER NOT NULL,
    MassArray BLOB NOT NULL,
    IntensityEncodedLength INTEGER NOT NULL,
    IntensityArray BLOB NOT NULL,
    CorrelationEncodedLength INTEGER,
    CorrelationArray BLOB,
    RTInSecondsStart REAL,
    RTInSecondsStop REAL,
    MedianChromatogramEncodedLength INTEGER,
    MedianChromatogramArray BLOB,
    SourceFile TEXT NOT NULL DEFAULT 'Oktoberfest'
)
"""
sql_create_p2p = """
CREATE TABLE peptidetoprotein
(PeptideSeq string not null, isDecoy boolean, ProteinAccession string not null)
-- Peptide-to-protein mapping; isDecoy defaults to 0 (false) because the FALSE keyword needs SQLite >= 3.23.0.
CREATE TABLE IF NOT EXISTS peptidetoprotein
(
    PeptideSeq TEXT NOT NULL,
    isDecoy BOOL DEFAULT 0,
    ProteinAccession TEXT NOT NULL DEFAULT 'UNKNOWN'
)
"""
sql_create_meta = """
CREATE TABLE metadata (Key string not null, Value string not null)
CREATE TABLE IF NOT EXISTS metadata (Key string not null, Value string not null)
"""
sql_insert_meta = "INSERT INTO metadata VALUES (?,?)"
conn = sqlite3.connect(path)
c = conn.cursor()
c.execute(sql_create_entries)
c.execute(sql_create_p2p)
Expand All @@ -111,34 +125,38 @@ def create_database(path: Union[str, Path]):
c.execute(sql_insert_meta, ["staleProteinMapping", "true"])
conn.commit()

def write(self):
"""Writes the entries ad p2p table to file."""
self._write_entries(index=False, if_exists="append", method="multi", chunksize=self.chunksize)
self._write_p2p(index=False, if_exists="append", method="multi", chunksize=self.chunksize)
def _write(self, out: Union[IO, sqlite3.Connection], data: Dict[str, np.ndarray], metadata: pd.DataFrame):
if isinstance(out, IO):
raise TypeError("Not supported. Use msp/spectronaut if you want to write a text file.")
seqs = metadata["SEQUENCE"]
modseqs = metadata["MODIFIED_SEQUENCE"]
mass_mod_sequences = internal_to_mod_mass(modseqs)

def _write_entries(self, *args, **kwargs):
"""
Internal function to write the entries table.
p_charges = metadata["PRECURSOR_CHARGE"]
p_mzs = (metadata["MASS"] + (p_charges * PARTICLE_MASSES["PROTON"])) / p_charges
# ces = metadata["COLLISION_ENERGY"]

:param args: forwarded to pandas.to_sql
:param kwargs: forwarded to pandas.to_sql
"""
conn = sqlite3.connect(self.out_path)
self.entries.to_sql(name="entries", con=conn, *args, **kwargs)
conn.commit()
# prepare spectra
irts = data["irt"][:, 0] # should create a 1D view of the (n_peptides, 1) shaped array
f_mzss = data["mz"]
f_intss = data["intensities"]
# f_annotss = data["annotation"].astype("S", copy=False)

def _write_p2p(self, *args, **kwargs):
"""
Internal function to write the p2p table.
masked_values = self._calculate_masked_values(f_mzss, f_intss)

:param args: forwarded to pandas.to_sql
:param kwargs: forwarded to pandas.to_sql
"""
conn = sqlite3.connect(self.out_path)
self.p2p.to_sql(name="peptidetoprotein", con=conn, *args, **kwargs)
conn.commit()
data_list = [*masked_values, p_charges, mass_mod_sequences, seqs, irts, p_mzs]
entries = pd.DataFrame(dict(zip(DLIB_COL_NAMES, data_list)))
p2p = pd.DataFrame({"PeptideSeq": seqs})

out.execute("BEGIN")

entries.to_sql(index=False, name="entries", con=out, if_exists="append", method="multi")
p2p.to_sql(index=False, name="peptidetoprotein", con=out, if_exists="append", method="multi")

out.commit()
# conn.close()

def prepare_spectrum(self):
# def prepare_spectrum(self):
"""Converts grpc output and metadata dataframe into dlib format."""
# precursor_mz: Union[List[float], np.ndarray],
# precursor_charges: Union[List[int], np.ndarray],
Expand All @@ -147,37 +165,37 @@ def prepare_spectrum(self):
# fragmentmz: List[np.ndarray],
# intensities: List[np.ndarray],

intensities = self.grpc_output[list(self.grpc_output)[0]]["intensity"]
fragment_mz = self.grpc_output[list(self.grpc_output)[0]]["fragmentmz"]
# intensities = self.grpc_output[list(self.grpc_output)[0]]["intensity"]
# fragment_mz = self.grpc_output[list(self.grpc_output)[0]]["fragmentmz"]
# annotation = self.grpc_output[list(self.grpc_output)[0]]["annotation"]
irt = self.grpc_output[list(self.grpc_output)[1]]
retention_times = irt.flatten()
modified_sequences = self.spectra_input["MODIFIED_SEQUENCE"]
# irt = self.grpc_output[list(self.grpc_output)[1]]
# retention_times = irt.flatten()
# modified_sequences = self.spectra_input["MODIFIED_SEQUENCE"]

precursor_charges = self.spectra_input["PRECURSOR_CHARGE"]
precursor_masses = self.spectra_input["MASS"]
precursor_mz = (precursor_masses + (precursor_charges * PARTICLE_MASSES["PROTON"])) / precursor_charges
# precursor_charges = self.spectra_input["PRECURSOR_CHARGE"]
# precursor_masses = self.spectra_input["MASS"]
# precursor_mz = (precursor_masses + (precursor_charges * PARTICLE_MASSES["PROTON"])) / precursor_charges

self.create_database(self.out_path)
# self.create_database(self.out_path)

# gather all values for the entries table and create pandas DataFrame
masked_values = self._calculate_masked_values(fragment_mz, intensities)
mass_mod_sequences = internal_to_mod_mass(modified_sequences)
sequences = internal_without_mods(modified_sequences)
data_list = [*masked_values, precursor_charges, mass_mod_sequences, sequences, retention_times, precursor_mz]
self.entries = pd.DataFrame(dict(zip(DLIB_COL_NAMES, data_list)))
# masked_values = self._calculate_masked_values(fragment_mz, intensities)
# mass_mod_sequences = internal_to_mod_mass(modified_sequences)
# sequences = internal_without_mods(modified_sequences)
# data_list = [*masked_values, precursor_charges, mass_mod_sequences, sequences, retention_times, precursor_mz]
# self.entries = pd.DataFrame(dict(zip(DLIB_COL_NAMES, data_list)))

# hardcoded entries that we currently not use.
# Visit https://bitbucket.org/searleb/encyclopedia/wiki/EncyclopeDIA%20File%20Formats for dlib specs
self.entries["Copies"] = 1 # this is hardcorded for now and unused
self.entries["Score"] = 0
self.entries["CorrelationEncodedLength"] = None
self.entries["CorrelationArray"] = None
self.entries["RTInSecondsStart"] = None
self.entries["RTInSecondsStop"] = None
self.entries["MedianChromatogramEncodedLength"] = None
self.entries["MedianChromatogramArray"] = None
self.entries["SourceFile"] = "Prosit"
# self.entries["Copies"] = 1 # this is hardcorded for now and unused
# self.entries["Score"] = 0
# self.entries["CorrelationEncodedLength"] = None
# self.entries["CorrelationArray"] = None
# self.entries["RTInSecondsStart"] = None
# self.entries["RTInSecondsStop"] = None
# self.entries["MedianChromatogramEncodedLength"] = None
# self.entries["MedianChromatogramArray"] = None
# self.entries["SourceFile"] = "Prosit"

# gather all values for the p2p table and create pandas DataFrame
self.p2p = pd.DataFrame({"PeptideSeq": sequences, "isDecoy": False, "ProteinAccession": "unknown"})
# self.p2p = pd.DataFrame({"PeptideSeq": sequences, "isDecoy": False, "ProteinAccession": "unknown"})
9 changes: 6 additions & 3 deletions spectrum_io/spectral_library/msp.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from typing import IO, Dict
from sqlite3 import Connection
from typing import IO, Dict, Union

import numpy as np
import pandas as pd
Expand All @@ -16,8 +17,10 @@ def _assemble_fragment_string(f_mz: float, f_int: float, f_a: bytes):
annot = f_a[:-2].decode() if f_a.endswith(b"1") else f_a.replace(b"+", b"^").decode()
return f'{f_mz:.8f}\t{f_int:.4f}\t"{annot}/0.0ppm"\n'

def _write(self, out: IO, data: Dict[str, np.ndarray], metadata: pd.DataFrame):
def _write(self, out: Union[IO, Connection], data: Dict[str, np.ndarray], metadata: pd.DataFrame):
# prepare metadata
if isinstance(out, Connection):
raise TypeError("Not supported. Use DLib if you want to write a database file.")
stripped_peptides = metadata["SEQUENCE"]
modss = internal_to_mod_names(metadata["MODIFIED_SEQUENCE"])
p_charges = metadata["PRECURSOR_CHARGE"]
Expand Down Expand Up @@ -50,5 +53,5 @@ def _write(self, out: IO, data: Dict[str, np.ndarray], metadata: pd.DataFrame):
lines.extend(fragment_list)
out.writelines(lines)

def _write_header(self, out: IO):
def _initialize(self, out: Union[IO, Connection]):
    """
    No-op: nothing is done with the output handle before writing.

    :param out: output handle; unused here
    """
    pass
21 changes: 10 additions & 11 deletions spectrum_io/spectral_library/spectral_library.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from multiprocessing import Queue
from multiprocessing.managers import ValueProxy
from pathlib import Path
from sqlite3 import Connection
from typing import IO, Dict, Optional, Union

import numpy as np
Expand Down Expand Up @@ -44,19 +45,22 @@ def write(self, *args, **kwargs):
:param args: Positional arguments to be passed to the internal _write method.
:param kwargs: Keyword arguments to be passed to the internal _write method.
"""
with open(self.out_path, self.mode) as out:
self._write_header(out)
with self._get_handle() as out:
self._initialize(out)
self._write(out, *args, **kwargs)

def _get_handle(self):
    """
    Open the output file.

    :return: file object for self.out_path opened with self.mode
    """
    handle = open(self.out_path, self.mode)
    return handle

def async_write(self, queue: Queue, progress: ValueProxy):
"""
Asynchronously write content to the output file from a queue.

:param queue: A queue from which content will be retrieved for writing.
:param progress: An integer value representing the progress of the writing process.
"""
with open(self.out_path, self.mode) as out:
self._write_header(out)
with self._get_handle() as out:
self._initialize(out)
while True:
content = queue.get()
if content is None:
Expand All @@ -82,7 +86,7 @@ def _fragment_filter_passed(
return (f_mz != -1) & (f_int >= self.min_intensity_threshold)

@abstractmethod
def _write(self, out: IO, data: Dict[str, np.ndarray], metadata: pd.DataFrame):
def _write(self, out: Union[IO, Connection], data: Dict[str, np.ndarray], metadata: pd.DataFrame):
"""
Internal writer function.

Expand All @@ -97,10 +101,5 @@ def _write(self, out: IO, data: Dict[str, np.ndarray], metadata: pd.DataFrame):
pass

@abstractmethod
def _write_header(self, out: IO):
pass

@abstractmethod
def prepare_spectrum(self):
"""Prepare spectrum."""
def _initialize(self, out: Union[IO, Connection]):
    """
    Hook called by write and async_write on the freshly opened handle before any content is written.

    :param out: handle returned by _get_handle
    """
    pass
Loading
Loading