Release/0.3.2 #72

Merged
10 commits merged on Oct 3, 2023
2 changes: 1 addition & 1 deletion .cookietemple.yml
@@ -15,5 +15,5 @@ full_name: Mario Picciani
email: [email protected]
project_name: spectrum_io
project_short_description: IO related functionalities for oktoberfest.
version: 0.3.1
version: 0.3.2
license: MIT
4 changes: 2 additions & 2 deletions .github/release-drafter.yml
@@ -1,5 +1,5 @@
name-template: "0.3.1 🌈" # <<COOKIETEMPLE_FORCE_BUMP>>
tag-template: 0.3.1 # <<COOKIETEMPLE_FORCE_BUMP>>
name-template: "0.3.2 🌈" # <<COOKIETEMPLE_FORCE_BUMP>>
tag-template: 0.3.2 # <<COOKIETEMPLE_FORCE_BUMP>>
exclude-labels:
- "skip-changelog"

2 changes: 1 addition & 1 deletion cookietemple.cfg
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 0.3.1
current_version = 0.3.2

[bumpversion_files_whitelisted]
init_file = spectrum_io/__init__.py
4 changes: 2 additions & 2 deletions docs/conf.py
@@ -53,9 +53,9 @@
# the built documents.
#
# The short X.Y version.
version = "0.3.1"
version = "0.3.2"
# The full version, including alpha/beta/rc tags.
release = "0.3.1"
release = "0.3.2"

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
1,331 changes: 705 additions & 626 deletions poetry.lock

Large diffs are not rendered by default.

6 changes: 4 additions & 2 deletions pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "spectrum_io"
version = "0.3.1" # <<COOKIETEMPLE_FORCE_BUMP>>
version = "0.3.2" # <<COOKIETEMPLE_FORCE_BUMP>>
description = "IO related functionalities for oktoberfest."
authors = ["Mario Picciani <[email protected]>"]
license = "MIT"
@@ -30,7 +30,7 @@ pymzml = "^2.5.0"
pyteomics = "^4.3.3"
lxml= '^4.5.2'
tables = "^3.6.1"
spectrum-fundamentals = "^0.4.0"
spectrum-fundamentals = ">=0.4.3,<0.5.0"

[tool.poetry.dev-dependencies]
pytest = ">=6.2.3"
@@ -99,3 +99,5 @@ requires = [
"wheel"
]
build-backend = "poetry.core.masonry.api"

[tool.setuptools_scm]
2 changes: 1 addition & 1 deletion spectrum_io/__init__.py
@@ -2,7 +2,7 @@

__author__ = "Mario Picciani"
__email__ = "[email protected]"
__version__ = "0.3.1"
__version__ = "0.3.2"

import logging
import logging.handlers
2 changes: 1 addition & 1 deletion spectrum_io/__main__.py
@@ -5,7 +5,7 @@


@click.command()
@click.version_option(version="0.3.1", message=click.style("spectrum_io Version: 0.3.1"))
@click.version_option(version="0.3.2", message=click.style("spectrum_io Version: 0.3.2"))
def main() -> None:
"""spectrum_io."""

2 changes: 1 addition & 1 deletion spectrum_io/search_result/maxquant.py
@@ -82,7 +82,7 @@ def update_columns_for_prosit(df: pd.DataFrame, tmt_labeled: str) -> pd.DataFrame:
logger.info("Adding TMT fixed modifications")
df["MODIFIED_SEQUENCE"] = maxquant_to_internal(
df["MODIFIED_SEQUENCE"].to_numpy(),
fixed_mods={"C": "C[UNIMOD:4]", "^_": f"_{unimod_tag}", "K": f"K{unimod_tag}"},
fixed_mods={"C": "C[UNIMOD:4]", "^_": f"_{unimod_tag}-", "K": f"K{unimod_tag}"},
)
df["MASS"] = df.apply(lambda x: MaxQuant.add_tmt_mod(x.MASS, x.MODIFIED_SEQUENCE, unimod_tag), axis=1)
if "msa" in tmt_labeled:
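The functional change in this file is the trailing dash after the N-terminal TMT tag, so converted sequences now read [UNIMOD:737]-PEPTIDE... instead of [UNIMOD:737]PEPTIDE... (the updated assertions in tests/unit_tests/test_maxquant.py below expect the new format). A minimal sketch of the intended effect, assuming maxquant_to_internal is imported from spectrum_fundamentals.mod_string and handles the underscore trimming itself; the input sequence is a made-up example:

import numpy as np
from spectrum_fundamentals.mod_string import maxquant_to_internal  # assumed import path

unimod_tag = "[UNIMOD:737]"  # TMT tag used by update_columns_for_prosit when tmt_labeled="tmt"
converted = maxquant_to_internal(
    np.array(["_DSDSWDADAFSVEDPVRK_"]),  # hypothetical MaxQuant-style modified sequence
    fixed_mods={"C": "C[UNIMOD:4]", "^_": f"_{unimod_tag}-", "K": f"K{unimod_tag}"},
)
# expected per the new format (and the updated tests): "[UNIMOD:737]-DSDSWDADAFSVEDPVRK[UNIMOD:737]"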
1 change: 1 addition & 0 deletions spectrum_io/spectral_library/__init__.py
@@ -4,6 +4,7 @@
from . import digest
from .dlib import DLib
from .msp import MSP
from .spectral_library import SpectralLibrary
from .spectronaut import Spectronaut

logger = logging.getLogger(__name__)
122 changes: 52 additions & 70 deletions spectrum_io/spectral_library/dlib.py
@@ -5,6 +5,7 @@

import numpy as np
import pandas as pd
from spectrum_fundamentals.constants import PARTICLE_MASSES
from spectrum_fundamentals.mod_string import internal_to_mod_mass, internal_without_mods

from .spectral_library import SpectralLibrary
@@ -25,62 +26,9 @@
class DLib(SpectralLibrary):
"""Main to init a DLib obj."""

def __init__(
self,
precursor_mz: Union[List[float], np.ndarray],
precursor_charges: Union[List[int], np.ndarray],
modified_sequences: List[str],
retention_times: Union[List[float], np.ndarray],
fragmentmz: List[np.ndarray],
intensities: List[np.ndarray],
path: Union[str, Path],
min_intensity_threshold: Optional[float] = 0.05,
):
def _calculate_masked_values(self, fragmentmz: List[np.ndarray], intensities: List[np.ndarray]):
"""
Initializer for the DLib class.

:param precursor_mz: precursor mass to charge ratios
:param precursor_charges: precursor charges
:param modified_sequences: modified sequences in internal format
:param retention_times: retention times
:param fragmentmz: mass to charge ratio of fragments
:param intensities: intensities
:param path: path to the file the dlib is written to
:param min_intensity_threshold: minimal intensity required when masking fragmentmz and intensities
"""
self.path = path
self.create_database(self.path)

# gather all values for the entries table and create pandas DataFrame
masked_values = self._calculate_masked_values(fragmentmz, intensities, min_intensity_threshold)
mass_mod_sequences = internal_to_mod_mass(modified_sequences)
sequences = internal_without_mods(modified_sequences)
data_list = [*masked_values, precursor_charges, mass_mod_sequences, sequences, retention_times, precursor_mz]
self.entries = pd.DataFrame(dict(zip(DLIB_COL_NAMES, data_list)))

# hardcoded entries that we currently do not use.
# Visit https://bitbucket.org/searleb/encyclopedia/wiki/EncyclopeDIA%20File%20Formats for dlib specs
self.entries["Copies"] = 1 # this is hardcoded for now and unused
self.entries["Score"] = 0
self.entries["CorrelationEncodedLength"] = None
self.entries["CorrelationArray"] = None
self.entries["RTInSecondsStart"] = None
self.entries["RTInSecondsStop"] = None
self.entries["MedianChromatogramEncodedLength"] = None
self.entries["MedianChromatogramArray"] = None
self.entries["SourceFile"] = "Prosit"

# gather all values for the p2p table and create pandas DataFrame
self.p2p = pd.DataFrame({"PeptideSeq": sequences, "isDecoy": False, "ProteinAccession": "unknown"})

@staticmethod
def _calculate_masked_values(
fragmentmz: List[np.ndarray],
intensities: List[np.ndarray],
intensity_min_threshold: Optional[float] = 0.05,
):
"""
Internal function called during __init__ that masks, filters, byte encodes, swaps and compresses fragmentmz \
Internal function that masks, filters, byte encodes, swaps and compresses fragmentmz \
and intensities.

This will produce the data for the following columns in this order:
Expand All @@ -90,7 +38,6 @@
- 'IntensityEncodedLength'.
:param fragmentmz: fragmentmz provided in __init__
:param intensities: intensities provided in __init__
:param intensity_min_threshold: minimum threshold for the intensity; default=0.05
:return: 4 lists as described above
"""
mz_bytes_list = []
Expand All @@ -99,8 +46,7 @@
i_lengths = []
for mz, i in zip(fragmentmz, intensities):
# mask to only existing peaks
mask = i >= intensity_min_threshold
print(mask)
mask = i >= self.min_intensity_threshold

Codecov / codecov/patch: added line #L49 in spectrum_io/spectral_library/dlib.py was not covered by tests.
sort_index = np.argsort(mz[mask])
masked_mz_ordered = mz[mask][sort_index]
masked_i_ordered = i[mask][sort_index]
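The threshold now comes from the instance attribute set in the shared SpectralLibrary constructor (default 0.05) instead of a per-call argument, and the stray print(mask) is removed. A hedged sketch of the masking and sorting step shown above, together with the byte encoding the docstring describes; the encoding itself is not part of this diff, so the big-endian/zlib details below follow the EncyclopeDIA dlib convention and are an assumption:

import zlib

import numpy as np

min_intensity_threshold = 0.05                 # default from the new base-class constructor
mz = np.array([200.1, 150.2, 400.3])           # made-up fragment m/z values
i = np.array([0.90, 0.01, 0.30])               # made-up intensities

mask = i >= min_intensity_threshold            # keep only peaks at or above the threshold
order = np.argsort(mz[mask])                   # dlib expects m/z-sorted arrays
mz_sorted, i_sorted = mz[mask][order], i[mask][order]

mz_blob = zlib.compress(mz_sorted.astype(">f8").tobytes())  # assumed: big-endian doubles, zlib-compressed
i_blob = zlib.compress(i_sorted.astype(">f4").tobytes())    # assumed: big-endian floats, zlib-compressed
mass_encoded_length = mz_sorted.nbytes                      # assumed meaning of 'MassEncodedLength'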
@@ -165,14 +111,10 @@
c.execute(sql_insert_meta, ["staleProteinMapping", "true"])
conn.commit()

def write(self, chunksize: Optional[Union[None, int]]):
"""
Writes the entries and p2p tables to file.

:param chunksize: optional size of chunks to insert at once
"""
self._write_entries(index=False, if_exists="append", method="multi", chunksize=chunksize)
self._write_p2p(index=False, if_exists="append", method="multi", chunksize=chunksize)
def write(self):
"""Writes the entries ad p2p table to file."""
self._write_entries(index=False, if_exists="append", method="multi", chunksize=self.chunksize)
self._write_p2p(index=False, if_exists="append", method="multi", chunksize=self.chunksize)

Codecov / codecov/patch: added lines #L116-L117 in spectrum_io/spectral_library/dlib.py were not covered by tests.

def _write_entries(self, *args, **kwargs):
"""
@@ -181,7 +123,7 @@
:param args: forwarded to pandas.to_sql
:param kwargs: forwarded to pandas.to_sql
"""
conn = sqlite3.connect(self.path)
conn = sqlite3.connect(self.out_path)

Codecov / codecov/patch: added line #L126 in spectrum_io/spectral_library/dlib.py was not covered by tests.
self.entries.to_sql(name="entries", con=conn, *args, **kwargs)
conn.commit()

@@ -192,10 +134,50 @@
:param args: forwarded to pandas.to_sql
:param kwargs: forwarded to pandas.to_sql
"""
conn = sqlite3.connect(self.path)
conn = sqlite3.connect(self.out_path)

Codecov / codecov/patch: added line #L137 in spectrum_io/spectral_library/dlib.py was not covered by tests.
self.p2p.to_sql(name="peptidetoprotein", con=conn, *args, **kwargs)
conn.commit()

def prepare_spectrum(self):
"""Prepare spectrum."""
pass
"""Converts grpc output and metadata dataframe into dlib format."""
# precursor_mz: Union[List[float], np.ndarray],
# precursor_charges: Union[List[int], np.ndarray],
# modified_sequences: List[str],
# retention_times: Union[List[float], np.ndarray],
# fragmentmz: List[np.ndarray],
# intensities: List[np.ndarray],

intensities = self.grpc_output[list(self.grpc_output)[0]]["intensity"]
fragment_mz = self.grpc_output[list(self.grpc_output)[0]]["fragmentmz"]

Codecov / codecov/patch: added lines #L150-L151 in spectrum_io/spectral_library/dlib.py were not covered by tests.
# annotation = self.grpc_output[list(self.grpc_output)[0]]["annotation"]
irt = self.grpc_output[list(self.grpc_output)[1]]
retention_times = irt.flatten()
modified_sequences = self.spectra_input["MODIFIED_SEQUENCE"]

Codecov / codecov/patch: added lines #L153-L155 in spectrum_io/spectral_library/dlib.py were not covered by tests.

precursor_charges = self.spectra_input["PRECURSOR_CHARGE"]
precursor_masses = self.spectra_input["MASS"]
precursor_mz = (precursor_masses + (precursor_charges * PARTICLE_MASSES["PROTON"])) / precursor_charges

Codecov / codecov/patch: added lines #L157-L159 in spectrum_io/spectral_library/dlib.py were not covered by tests.

self.create_database(self.out_path)

Codecov / codecov/patch: added line #L161 in spectrum_io/spectral_library/dlib.py was not covered by tests.

# gather all values for the entries table and create pandas DataFrame
masked_values = self._calculate_masked_values(fragment_mz, intensities)
mass_mod_sequences = internal_to_mod_mass(modified_sequences)
sequences = internal_without_mods(modified_sequences)
data_list = [*masked_values, precursor_charges, mass_mod_sequences, sequences, retention_times, precursor_mz]
self.entries = pd.DataFrame(dict(zip(DLIB_COL_NAMES, data_list)))

Codecov / codecov/patch: added lines #L164-L168 in spectrum_io/spectral_library/dlib.py were not covered by tests.

# hardcoded entries that we currently do not use.
# Visit https://bitbucket.org/searleb/encyclopedia/wiki/EncyclopeDIA%20File%20Formats for dlib specs
self.entries["Copies"] = 1 # this is hardcoded for now and unused
self.entries["Score"] = 0
self.entries["CorrelationEncodedLength"] = None
self.entries["CorrelationArray"] = None
self.entries["RTInSecondsStart"] = None
self.entries["RTInSecondsStop"] = None
self.entries["MedianChromatogramEncodedLength"] = None
self.entries["MedianChromatogramArray"] = None
self.entries["SourceFile"] = "Prosit"

Codecov / codecov/patch: added lines #L172-L180 in spectrum_io/spectral_library/dlib.py were not covered by tests.

# gather all values for the p2p table and create pandas DataFrame
self.p2p = pd.DataFrame({"PeptideSeq": sequences, "isDecoy": False, "ProteinAccession": "unknown"})

Codecov / codecov/patch: added line #L183 in spectrum_io/spectral_library/dlib.py was not covered by tests.
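prepare_spectrum now derives the precursor m/z from the MASS and PRECURSOR_CHARGE columns using the proton mass imported from spectrum_fundamentals.constants at the top of this file. A short worked sketch of that formula; the proton mass is approximated here and the input values are made up:

# m/z = (M + z * m_proton) / z, as computed in prepare_spectrum above
PROTON = 1.007276                      # approximate value of PARTICLE_MASSES["PROTON"], in Da

mass, charge = 1499.67, 2              # hypothetical neutral precursor mass [Da] and charge state
precursor_mz = (mass + charge * PROTON) / charge
# (1499.67 + 2 * 1.007276) / 2 = 750.842276, i.e. roughly 750.84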
15 changes: 13 additions & 2 deletions spectrum_io/spectral_library/spectral_library.py
@@ -9,25 +9,36 @@
class SpectralLibrary:
"""Main to initialze a SpectralLibrary obj."""

def __init__(self, input_dataframe: pd.DataFrame, grpc_dict: dict, output_path: Union[str, Path]):
def __init__(
self,
input_dataframe: pd.DataFrame,
grpc_dict: dict,
output_path: Union[str, Path],
min_intensity_threshold: Optional[float] = 0.05,
chunksize: Optional[int] = None,
):
"""
Initialize a SpectralLibrary obj.

:param input_dataframe: dataframe of sequences, charges, and masses of all library peptides
:param grpc_dict: GRPC client output dictionary with spectrum, irt, and proteotypicity prediction
:param output_path: path to output file including file name
:param min_intensity_threshold: optional filter for low intensity peaks
:param chunksize: optional chunksize for dlib
"""
if isinstance(output_path, str):
output_path = Path(output_path)
self.spectra_input = input_dataframe
self.grpc_output = grpc_dict
self.out_path = output_path
self.min_intensity_threshold = min_intensity_threshold
self.chunksize = chunksize

def load(self):
"""Load predictions from hdf5 file."""

@abstractmethod
def write(self, chunksize: Optional[Union[None, int]]):
def write(self):
"""Write predictions."""
pass

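With min_intensity_threshold and chunksize moved into this shared constructor, DLib no longer defines its own __init__; the format-specific work happens in prepare_spectrum() and write(), and chunksize is forwarded to pandas.to_sql. A hedged usage sketch of the refactored API; the grpc_dict key names and all values are hypothetical, only the column names and the dict layout follow the code in dlib.py above:

import numpy as np
import pandas as pd

from spectrum_io.spectral_library import DLib

meta = pd.DataFrame(
    {
        "MODIFIED_SEQUENCE": ["DSDSWDADAFSVEDPVRK"],  # internal-format sequences
        "PRECURSOR_CHARGE": [2],
        "MASS": [1499.67],                            # hypothetical neutral precursor mass
    }
)
grpc_dict = {
    "intensity_model": {                              # hypothetical key; the first entry holds
        "intensity": [np.array([0.90, 0.01, 0.30])],  # predicted intensities and fragment m/z
        "fragmentmz": [np.array([200.1, 150.2, 400.3])],
    },
    "irt_model": np.array([[55.3]]),                  # hypothetical key; the second entry holds the iRT predictions
}

lib = DLib(meta, grpc_dict, "library.dlib", min_intensity_threshold=0.05, chunksize=1000)
lib.prepare_spectrum()  # builds the entries and peptidetoprotein DataFrames
lib.write()             # inserts them via pandas.to_sql using the configured chunksize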
8 changes: 4 additions & 4 deletions tests/unit_tests/test_maxquant.py
@@ -62,8 +62,8 @@ def test_update_columns_tmt(self, maxquant_df: pd.DataFrame):
:param maxquant_df: maxquant df as pd.DataFrame
"""
prosit_df = mq.MaxQuant.update_columns_for_prosit(maxquant_df, tmt_labeled="tmt")
assert prosit_df["MODIFIED_SEQUENCE"][0] == "[UNIMOD:737]DS[UNIMOD:21]DS[UNIMOD:21]WDADAFSVEDPVRK[UNIMOD:737]"
assert prosit_df["MODIFIED_SEQUENCE"][3] == "[UNIMOD:737]SS[UNIMOD:21]PTPES[UNIMOD:21]PTMLTK[UNIMOD:737]"
assert prosit_df["MODIFIED_SEQUENCE"][0] == "[UNIMOD:737]-DS[UNIMOD:21]DS[UNIMOD:21]WDADAFSVEDPVRK[UNIMOD:737]"
assert prosit_df["MODIFIED_SEQUENCE"][3] == "[UNIMOD:737]-SS[UNIMOD:21]PTPES[UNIMOD:21]PTMLTK[UNIMOD:737]"

assert prosit_df["MASS"][0] == 1.0 + 2 * 229.162932
assert prosit_df["MASS"][3] == 2.0 + 2 * 229.162932
@@ -76,9 +76,9 @@ def test_update_columns_tmt_msa(self, maxquant_df: pd.DataFrame):
"""
prosit_df = mq.MaxQuant.update_columns_for_prosit(maxquant_df, tmt_labeled="tmt_msa")
assert (
prosit_df["MODIFIED_SEQUENCE_MSA"][0] == "[UNIMOD:737]DS[UNIMOD:23]DS[UNIMOD:23]WDADAFSVEDPVRK[UNIMOD:737]"
prosit_df["MODIFIED_SEQUENCE_MSA"][0] == "[UNIMOD:737]-DS[UNIMOD:23]DS[UNIMOD:23]WDADAFSVEDPVRK[UNIMOD:737]"
)
assert prosit_df["MODIFIED_SEQUENCE_MSA"][3] == "[UNIMOD:737]SS[UNIMOD:23]PTPES[UNIMOD:23]PTMLTK[UNIMOD:737]"
assert prosit_df["MODIFIED_SEQUENCE_MSA"][3] == "[UNIMOD:737]-SS[UNIMOD:23]PTPES[UNIMOD:23]PTMLTK[UNIMOD:737]"

def test_filter_valid_prosit_sequences(self, invalid_df: pd.DataFrame):
"""Test filter_valid_prosit_sequences."""