Skip to content

Commit

Permalink
Merge pull request #99 from wilhelm-lab/feature/dlib_support
Browse files Browse the repository at this point in the history
Feature/dlib support
  • Loading branch information
picciama authored Apr 27, 2024
2 parents 100afb2 + 4a0c100 commit 275ef32
Show file tree
Hide file tree
Showing 8 changed files with 501 additions and 690 deletions.
887 changes: 317 additions & 570 deletions poetry.lock

Large diffs are not rendered by default.

3 changes: 1 addition & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,7 @@ h5py = "^3.1.0"
pymzml = "^2.5.0"
pyteomics = "^4.3.3"
lxml= '^4.5.2'
tables = "^3.6.1"
spectrum-fundamentals = ">=0.5.1,<0.6.0"
spectrum-fundamentals = ">=0.5.2,<0.6.0"
alphatims = "^1.0.8"
sortedcontainers = "^2.4.0"

Expand Down
16 changes: 5 additions & 11 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,12 +1,10 @@
alphatims==1.0.8 ; python_version >= "3.8" and python_full_version < "3.11.0"
blosc2==2.0.0 ; python_version >= "3.8" and python_full_version < "3.11.0"
click==8.1.7 ; python_version >= "3.8" and python_full_version < "3.11.0"
colorama==0.4.6 ; python_version >= "3.8" and python_full_version < "3.11.0" and platform_system == "Windows"
contourpy==1.1.1 ; python_version >= "3.8" and python_full_version < "3.11.0"
cycler==0.12.1 ; python_version >= "3.8" and python_full_version < "3.11.0"
cython==3.0.10 ; python_version >= "3.8" and python_full_version < "3.11.0"
fonttools==4.51.0 ; python_version >= "3.8" and python_full_version < "3.11.0"
h5py==3.10.0 ; python_version >= "3.8" and python_full_version < "3.11.0"
h5py==3.11.0 ; python_version >= "3.8" and python_full_version < "3.11.0"
importlib-metadata==7.1.0 ; python_version >= "3.8" and python_version < "3.9"
importlib-resources==6.4.0 ; python_version >= "3.8" and python_version < "3.10"
joblib==1.4.0 ; python_version >= "3.8" and python_full_version < "3.11.0"
Expand All @@ -17,31 +15,27 @@ markdown-it-py==3.0.0 ; python_version >= "3.8" and python_full_version < "3.11.
matplotlib==3.7.5 ; python_version >= "3.8" and python_full_version < "3.11.0"
mdurl==0.1.2 ; python_version >= "3.8" and python_full_version < "3.11.0"
moepy==1.1.4 ; python_full_version >= "3.8.0" and python_full_version < "3.11.0"
msgpack==1.0.8 ; python_version >= "3.8" and python_full_version < "3.11.0"
numba==0.58.1 ; python_version >= "3.8" and python_full_version < "3.11.0"
numexpr==2.8.6 ; python_version >= "3.8" and python_full_version < "3.11.0"
numpy==1.24.4 ; python_version >= "3.8" and python_full_version < "3.11.0"
packaging==24.0 ; python_version >= "3.8" and python_full_version < "3.11.0"
pandas==1.5.3 ; python_version >= "3.8" and python_full_version < "3.11.0"
pillow==10.3.0 ; python_version >= "3.8" and python_full_version < "3.11.0"
psutil==5.9.8 ; python_version >= "3.8" and python_full_version < "3.11.0"
py-cpuinfo==9.0.0 ; python_version >= "3.8" and python_full_version < "3.11.0"
pygments==2.17.2 ; python_full_version >= "3.8.0" and python_full_version < "3.11.0"
pymzml==2.5.6 ; python_full_version >= "3.8.0" and python_full_version < "3.11.0"
pymzml==2.5.9 ; python_full_version >= "3.8.0" and python_full_version < "3.11.0"
pyparsing==3.1.2 ; python_version >= "3.8" and python_full_version < "3.11.0"
pyteomics==4.7.1 ; python_full_version >= "3.8.0" and python_full_version < "3.11.0"
pyteomics==4.7.2 ; python_full_version >= "3.8.0" and python_full_version < "3.11.0"
python-dateutil==2.9.0.post0 ; python_version >= "3.8" and python_full_version < "3.11.0"
pytz==2024.1 ; python_version >= "3.8" and python_full_version < "3.11.0"
pyyaml==6.0.1 ; python_full_version >= "3.8.0" and python_full_version < "3.11.0"
pyzstd==0.15.10 ; python_version >= "3.8" and python_full_version < "3.11.0"
regex==2023.12.25 ; python_full_version >= "3.8.0" and python_full_version < "3.11.0"
regex==2024.4.16 ; python_full_version >= "3.8.0" and python_full_version < "3.11.0"
rich==13.7.1 ; python_full_version >= "3.8.0" and python_full_version < "3.11.0"
scikit-learn==1.3.2 ; python_version >= "3.8" and python_full_version < "3.11.0"
scipy==1.10.1 ; python_version >= "3.8" and python_full_version < "3.11.0"
six==1.16.0 ; python_version >= "3.8" and python_full_version < "3.11.0"
sortedcontainers==2.4.0 ; python_full_version >= "3.8.0" and python_full_version < "3.11.0"
spectrum-fundamentals==0.5.0 ; python_full_version >= "3.8.0" and python_full_version < "3.11.0"
tables==3.8.0 ; python_version >= "3.8" and python_full_version < "3.11.0"
spectrum-fundamentals==0.5.2 ; python_full_version >= "3.8.0" and python_full_version < "3.11.0"
threadpoolctl==3.4.0 ; python_version >= "3.8" and python_full_version < "3.11.0"
tqdm==4.66.2 ; python_version >= "3.8" and python_full_version < "3.11.0"
typing-extensions==4.11.0 ; python_version >= "3.8" and python_version < "3.9"
Expand Down
172 changes: 95 additions & 77 deletions spectrum_io/spectral_library/dlib.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import sqlite3
import zlib
from pathlib import Path
from typing import List, Optional, Union
from typing import IO, Dict, Union

import numpy as np
import pandas as pd
Expand All @@ -26,7 +26,16 @@
class DLib(SpectralLibrary):
"""Main to init a DLib obj."""

def _calculate_masked_values(self, fragmentmz: List[np.ndarray], intensities: List[np.ndarray]):
def _initialize(self, out: Union[IO, sqlite3.Connection]):
if isinstance(out, IO):
raise TypeError("Not supported. Use msp/spectronaut if you want to write a text file.")
if self.mode == "w":
DLib._create_database(out)

def _get_handle(self):
return sqlite3.connect(self.out_path)

def _calculate_masked_values(self, fragmentmz: np.ndarray, intensities: np.ndarray):
"""
Internal function that masks, filters, byte encodes, swaps and compresses fragmentmz \
and intensities.
Expand All @@ -44,9 +53,10 @@ def _calculate_masked_values(self, fragmentmz: List[np.ndarray], intensities: Li
i_bytes_list = []
mz_lengths = []
i_lengths = []
for mz, i in zip(fragmentmz, intensities):

full_mask = self._fragment_filter_passed(fragmentmz, intensities)
for mz, i, mask in zip(fragmentmz, intensities, np.array(full_mask)):
# mask to only existing peaks
mask = i >= self.min_intensity_threshold
sort_index = np.argsort(mz[mask])
masked_mz_ordered = mz[mask][sort_index]
masked_i_ordered = i[mask][sort_index]
Expand All @@ -59,50 +69,54 @@ def _calculate_masked_values(self, fragmentmz: List[np.ndarray], intensities: Li
bytes_mz = bytes(masked_mz_ordered)
bytes_i = bytes(masked_i_ordered)
mz_bytes_list.append(zlib.compress(bytes_mz))
i_bytes_list.append(zlib.compress(bytes(bytes_i)))
i_bytes_list.append(zlib.compress(bytes_i))
mz_lengths.append(len(bytes_mz))
i_lengths.append(len(bytes_i))
return mz_bytes_list, i_bytes_list, mz_lengths, i_lengths

@staticmethod
def create_database(path: Union[str, Path]):
def _create_database(conn: sqlite3.Connection):
"""
Creates the database file with prefab tables entries, peptidetoprotein (p2p) and metadata, according to the \
dlib specification.
:param path: specifies the path of the created database file
        :param conn: an open sqlite3 connection to the database file being created
"""
sql_create_entries = """
CREATE TABLE entries
( PrecursorMz double not null,
PrecursorCharge int not null,
PeptideModSeq string not null,
PeptideSeq string not null,
Copies int not null,
RTInSeconds double not null,
Score double not null,
MassEncodedLength int not null,
MassArray blob not null,
IntensityEncodedLength int not null,
IntensityArray blob not null,
CorrelationEncodedLength int,
CorrelationArray blob,
RTInSecondsStart double,
RTInSecondsStop double,
MedianChromatogramEncodedLength int,
MedianChromatogramArray blob,
SourceFile string not null
CREATE TABLE IF NOT EXISTS entries
(
PrecursorMz REAL NOT NULL,
PrecursorCharge INTEGER NOT NULL,
PeptideModSeq TEXT NOT NULL,
PeptideSeq TEXT NOT NULL,
Copies INTEGER NOT NULL DEFAULT 1,
                RTInSeconds REAL NOT NULL,
Score REAL NOT NULL DEFAULT 0,
MassEncodedLength INTEGER NOT NULL,
MassArray BLOB NOT NULL,
IntensityEncodedLength INTEGER NOT NULL,
IntensityArray BLOB NOT NULL,
CorrelationEncodedLength INTEGER,
CorrelationArray BLOB,
RTInSecondsStart REAL,
RTInSecondsStop REAL,
MedianChromatogramEncodedLength INTEGER,
MedianChromatogramArray BLOB,
SourceFile TEXT NOT NULL DEFAULT 'Oktoberfest'
)
"""
sql_create_p2p = """
CREATE TABLE peptidetoprotein
(PeptideSeq string not null, isDecoy boolean, ProteinAccession string not null)
CREATE TABLE IF NOT EXISTS peptidetoprotein
(
PeptideSeq TEXT NOT NULL,
isDecoy BOOL DEFAULT FALSE,
ProteinAccession TEXT NOT NULL DEFAULT 'UNKNOWN'
)
"""
sql_create_meta = """
CREATE TABLE metadata (Key string not null, Value string not null)
CREATE TABLE IF NOT EXISTS metadata (Key string not null, Value string not null)
"""
sql_insert_meta = "INSERT INTO metadata VALUES (?,?)"
conn = sqlite3.connect(path)
c = conn.cursor()
c.execute(sql_create_entries)
c.execute(sql_create_p2p)
Expand All @@ -111,34 +125,38 @@ def create_database(path: Union[str, Path]):
c.execute(sql_insert_meta, ["staleProteinMapping", "true"])
conn.commit()

def write(self):
"""Writes the entries ad p2p table to file."""
self._write_entries(index=False, if_exists="append", method="multi", chunksize=self.chunksize)
self._write_p2p(index=False, if_exists="append", method="multi", chunksize=self.chunksize)
def _write(self, out: Union[IO, sqlite3.Connection], data: Dict[str, np.ndarray], metadata: pd.DataFrame):
if isinstance(out, IO):
raise TypeError("Not supported. Use msp/spectronaut if you want to write a text file.")
seqs = metadata["SEQUENCE"]
modseqs = metadata["MODIFIED_SEQUENCE"]
mass_mod_sequences = internal_to_mod_mass(modseqs)

def _write_entries(self, *args, **kwargs):
"""
Internal function to write the entries table.
p_charges = metadata["PRECURSOR_CHARGE"]
p_mzs = (metadata["MASS"] + (p_charges * PARTICLE_MASSES["PROTON"])) / p_charges
# ces = metadata["COLLISION_ENERGY"]

:param args: forwarded to pandas.to_sql
:param kwargs: forwarded to pandas.to_sql
"""
conn = sqlite3.connect(self.out_path)
self.entries.to_sql(name="entries", con=conn, *args, **kwargs)
conn.commit()
# prepare spectra
irts = data["irt"][:, 0] # should create a 1D view of the (n_peptides, 1) shaped array
f_mzss = data["mz"]
f_intss = data["intensities"]
# f_annotss = data["annotation"].astype("S", copy=False)

def _write_p2p(self, *args, **kwargs):
"""
Internal function to write the p2p table.
masked_values = self._calculate_masked_values(f_mzss, f_intss)

:param args: forwarded to pandas.to_sql
:param kwargs: forwarded to pandas.to_sql
"""
conn = sqlite3.connect(self.out_path)
self.p2p.to_sql(name="peptidetoprotein", con=conn, *args, **kwargs)
conn.commit()
data_list = [*masked_values, p_charges, mass_mod_sequences, seqs, irts, p_mzs]
entries = pd.DataFrame(dict(zip(DLIB_COL_NAMES, data_list)))
p2p = pd.DataFrame({"PeptideSeq": seqs})

out.execute("BEGIN")

entries.to_sql(index=False, name="entries", con=out, if_exists="append", method="multi")
p2p.to_sql(index=False, name="peptidetoprotein", con=out, if_exists="append", method="multi")

out.commit()
# conn.close()

def prepare_spectrum(self):
# def prepare_spectrum(self):
"""Converts grpc output and metadata dataframe into dlib format."""
# precursor_mz: Union[List[float], np.ndarray],
# precursor_charges: Union[List[int], np.ndarray],
Expand All @@ -147,37 +165,37 @@ def prepare_spectrum(self):
# fragmentmz: List[np.ndarray],
# intensities: List[np.ndarray],

intensities = self.grpc_output[list(self.grpc_output)[0]]["intensity"]
fragment_mz = self.grpc_output[list(self.grpc_output)[0]]["fragmentmz"]
# intensities = self.grpc_output[list(self.grpc_output)[0]]["intensity"]
# fragment_mz = self.grpc_output[list(self.grpc_output)[0]]["fragmentmz"]
# annotation = self.grpc_output[list(self.grpc_output)[0]]["annotation"]
irt = self.grpc_output[list(self.grpc_output)[1]]
retention_times = irt.flatten()
modified_sequences = self.spectra_input["MODIFIED_SEQUENCE"]
# irt = self.grpc_output[list(self.grpc_output)[1]]
# retention_times = irt.flatten()
# modified_sequences = self.spectra_input["MODIFIED_SEQUENCE"]

precursor_charges = self.spectra_input["PRECURSOR_CHARGE"]
precursor_masses = self.spectra_input["MASS"]
precursor_mz = (precursor_masses + (precursor_charges * PARTICLE_MASSES["PROTON"])) / precursor_charges
# precursor_charges = self.spectra_input["PRECURSOR_CHARGE"]
# precursor_masses = self.spectra_input["MASS"]
# precursor_mz = (precursor_masses + (precursor_charges * PARTICLE_MASSES["PROTON"])) / precursor_charges

self.create_database(self.out_path)
# self.create_database(self.out_path)

# gather all values for the entries table and create pandas DataFrame
masked_values = self._calculate_masked_values(fragment_mz, intensities)
mass_mod_sequences = internal_to_mod_mass(modified_sequences)
sequences = internal_without_mods(modified_sequences)
data_list = [*masked_values, precursor_charges, mass_mod_sequences, sequences, retention_times, precursor_mz]
self.entries = pd.DataFrame(dict(zip(DLIB_COL_NAMES, data_list)))
# masked_values = self._calculate_masked_values(fragment_mz, intensities)
# mass_mod_sequences = internal_to_mod_mass(modified_sequences)
# sequences = internal_without_mods(modified_sequences)
# data_list = [*masked_values, precursor_charges, mass_mod_sequences, sequences, retention_times, precursor_mz]
# self.entries = pd.DataFrame(dict(zip(DLIB_COL_NAMES, data_list)))

# hardcoded entries that we currently not use.
# Visit https://bitbucket.org/searleb/encyclopedia/wiki/EncyclopeDIA%20File%20Formats for dlib specs
self.entries["Copies"] = 1 # this is hardcorded for now and unused
self.entries["Score"] = 0
self.entries["CorrelationEncodedLength"] = None
self.entries["CorrelationArray"] = None
self.entries["RTInSecondsStart"] = None
self.entries["RTInSecondsStop"] = None
self.entries["MedianChromatogramEncodedLength"] = None
self.entries["MedianChromatogramArray"] = None
self.entries["SourceFile"] = "Prosit"
        # self.entries["Copies"] = 1  # this is hardcoded for now and unused
# self.entries["Score"] = 0
# self.entries["CorrelationEncodedLength"] = None
# self.entries["CorrelationArray"] = None
# self.entries["RTInSecondsStart"] = None
# self.entries["RTInSecondsStop"] = None
# self.entries["MedianChromatogramEncodedLength"] = None
# self.entries["MedianChromatogramArray"] = None
# self.entries["SourceFile"] = "Prosit"

# gather all values for the p2p table and create pandas DataFrame
self.p2p = pd.DataFrame({"PeptideSeq": sequences, "isDecoy": False, "ProteinAccession": "unknown"})
# self.p2p = pd.DataFrame({"PeptideSeq": sequences, "isDecoy": False, "ProteinAccession": "unknown"})
9 changes: 6 additions & 3 deletions spectrum_io/spectral_library/msp.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from typing import IO, Dict
from sqlite3 import Connection
from typing import IO, Dict, Union

import numpy as np
import pandas as pd
Expand All @@ -16,8 +17,10 @@ def _assemble_fragment_string(f_mz: float, f_int: float, f_a: bytes):
annot = f_a[:-2].decode() if f_a.endswith(b"1") else f_a.replace(b"+", b"^").decode()
return f'{f_mz:.8f}\t{f_int:.4f}\t"{annot}/0.0ppm"\n'

def _write(self, out: IO, data: Dict[str, np.ndarray], metadata: pd.DataFrame):
def _write(self, out: Union[IO, Connection], data: Dict[str, np.ndarray], metadata: pd.DataFrame):
# prepare metadata
if isinstance(out, Connection):
raise TypeError("Not supported. Use DLib if you want to write a database file.")
stripped_peptides = metadata["SEQUENCE"]
modss = internal_to_mod_names(metadata["MODIFIED_SEQUENCE"])
p_charges = metadata["PRECURSOR_CHARGE"]
Expand Down Expand Up @@ -50,5 +53,5 @@ def _write(self, out: IO, data: Dict[str, np.ndarray], metadata: pd.DataFrame):
lines.extend(fragment_list)
out.writelines(lines)

def _write_header(self, out: IO):
def _initialize(self, out: Union[IO, Connection]):
pass
21 changes: 10 additions & 11 deletions spectrum_io/spectral_library/spectral_library.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from multiprocessing import Queue
from multiprocessing.managers import ValueProxy
from pathlib import Path
from sqlite3 import Connection
from typing import IO, Dict, Optional, Union

import numpy as np
Expand Down Expand Up @@ -44,19 +45,22 @@ def write(self, *args, **kwargs):
:param args: Positional arguments to be passed to the internal _write method.
:param kwargs: Keyword arguments to be passed to the internal _write method.
"""
with open(self.out_path, self.mode) as out:
self._write_header(out)
with self._get_handle() as out:
self._initialize(out)
self._write(out, *args, **kwargs)

def _get_handle(self):
return open(self.out_path, self.mode)

def async_write(self, queue: Queue, progress: ValueProxy):
"""
Asynchronously write content to the output file from a queue.
:param queue: A queue from which content will be retrieved for writing.
:param progress: An integer value representing the progress of the writing process.
"""
with open(self.out_path, self.mode) as out:
self._write_header(out)
with self._get_handle() as out:
self._initialize(out)
while True:
content = queue.get()
if content is None:
Expand All @@ -82,7 +86,7 @@ def _fragment_filter_passed(
return (f_mz != -1) & (f_int >= self.min_intensity_threshold)

@abstractmethod
def _write(self, out: IO, data: Dict[str, np.ndarray], metadata: pd.DataFrame):
def _write(self, out: Union[IO, Connection], data: Dict[str, np.ndarray], metadata: pd.DataFrame):
"""
Internal writer function.
Expand All @@ -97,10 +101,5 @@ def _write(self, out: IO, data: Dict[str, np.ndarray], metadata: pd.DataFrame):
pass

@abstractmethod
def _write_header(self, out: IO):
pass

@abstractmethod
def prepare_spectrum(self):
"""Prepare spectrum."""
def _initialize(self, out: Union[IO, Connection]):
pass
Loading

0 comments on commit 275ef32

Please sign in to comment.