Merge pull request #79 from wilhelm-lab/release/0.3.4
Release/0.3.4
picciama authored Nov 13, 2023
2 parents 42ecdfb + e544c1b commit 6d28b8e
Showing 21 changed files with 1,160 additions and 792 deletions.
2 changes: 1 addition & 1 deletion .cookietemple.yml
@@ -15,5 +15,5 @@ full_name: Mario Picciani
email: [email protected]
project_name: spectrum_io
project_short_description: IO related functionalities for oktoberfest.
version: 0.3.3
version: 0.3.4
license: MIT
2 changes: 1 addition & 1 deletion .flake8
@@ -5,7 +5,7 @@ max-line-length = 120
max-complexity = 10
docstring-convention = google
per-file-ignores =
tests/*:S101
tests/*:S101,S301,S403
noxfile.py:DAR101
spectrum_io/raw/thermo_raw.py:S603,S404
spectrum_io/raw/msraw.py:S405,S314
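Note: S301 and S403 are the flake8-bandit codes for pickle deserialization and pickle imports. A hedged illustration (not part of this diff) of the kind of test code the new per-file ignores permit under tests/:

```python
# Hypothetical test, for illustration only: pickle use like this is what the
# added per-file ignores S301/S403 allow inside tests/.
import pickle  # would normally flag S403


def test_roundtrip_pickled_fixture(tmp_path):
    fixture = tmp_path / "expected.pkl"
    fixture.write_bytes(pickle.dumps({"scan": 1, "mz": [100.0, 200.0]}))
    loaded = pickle.loads(fixture.read_bytes())  # would normally flag S301
    assert loaded["scan"] == 1
```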
4 changes: 2 additions & 2 deletions .github/release-drafter.yml
@@ -1,5 +1,5 @@
name-template: "0.3.3 🌈" # <<COOKIETEMPLE_FORCE_BUMP>>
tag-template: 0.3.3 # <<COOKIETEMPLE_FORCE_BUMP>>
name-template: "0.3.4 🌈" # <<COOKIETEMPLE_FORCE_BUMP>>
tag-template: 0.3.4 # <<COOKIETEMPLE_FORCE_BUMP>>
exclude-labels:
- "skip-changelog"

2 changes: 1 addition & 1 deletion cookietemple.cfg
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 0.3.3
current_version = 0.3.4

[bumpversion_files_whitelisted]
init_file = spectrum_io/__init__.py
4 changes: 2 additions & 2 deletions docs/conf.py
@@ -53,9 +53,9 @@
# the built documents.
#
# The short X.Y version.
version = "0.3.3"
version = "0.3.4"
# The full version, including alpha/beta/rc tags.
release = "0.3.3"
release = "0.3.4"

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
1,347 changes: 684 additions & 663 deletions poetry.lock

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "spectrum_io"
version = "0.3.3" # <<COOKIETEMPLE_FORCE_BUMP>>
version = "0.3.4" # <<COOKIETEMPLE_FORCE_BUMP>>
description = "IO related functionalities for oktoberfest."
authors = ["Mario Picciani <[email protected]>"]
license = "MIT"
@@ -30,7 +30,7 @@ pymzml = "^2.5.0"
pyteomics = "^4.3.3"
lxml= '^4.5.2'
tables = "^3.6.1"
spectrum-fundamentals = ">=0.4.3,<0.5.0"
spectrum-fundamentals = ">=0.4.4,<0.5.0"

[tool.poetry.dev-dependencies]
pytest = ">=6.2.3"
2 changes: 1 addition & 1 deletion spectrum_io/__init__.py
@@ -2,7 +2,7 @@

__author__ = "Mario Picciani"
__email__ = "[email protected]"
__version__ = "0.3.3"
__version__ = "0.3.4"

import logging
import logging.handlers
2 changes: 1 addition & 1 deletion spectrum_io/__main__.py
@@ -5,7 +5,7 @@


@click.command()
@click.version_option(version="0.3.3", message=click.style("spectrum_io Version: 0.3.3"))
@click.version_option(version="0.3.4", message=click.style("spectrum_io Version: 0.3.4"))
def main() -> None:
"""spectrum_io."""

202 changes: 112 additions & 90 deletions spectrum_io/raw/msraw.py
@@ -25,7 +25,7 @@ def check_analyzer(mass_analyzers: Dict[str, str]) -> Dict[str, str]:
accession = mass_analyzers[elem]
if accession in ["MS:1000079", "MS:1000484"]: # fourier transform ion cyclotron, orbitrap
mass_analyzers[elem] = "FTMS"
elif accession in ["MS:1000082", "MS:1000264" "MS:1000078"]: # quadrupole ion-trap, ion-trap, linear ion-trap
elif accession in ["MS:1000082", "MS:1000264", "MS:1000078"]: # quadrupole ion-trap, ion-trap, linear ion-trap
mass_analyzers[elem] = "ITMS"
elif accession in ["MS:1000084"]: # TOF
mass_analyzers[elem] = "TOF"
@@ -135,43 +135,124 @@ def read_mzml(
:return: pd.DataFrame with intensities and m/z values
"""
file_list = MSRaw.get_file_list(source, ext)
data = {} # type: Dict[str, Any]

if package == "pymzml":
with warnings.catch_warnings():
warnings.filterwarnings("ignore", category=ImportWarning)
for file_path in file_list:
logger.info(f"Reading mzML file: {file_path}")
MSRaw._get_scans_pymzml(file_path, data, scanidx, *args, **kwargs)
data = MSRaw._read_mzml_pymzml(file_list, scanidx, *args, **kwargs)
elif package == "pyteomics":
data = MSRaw._read_mzml_pyteomics(file_list, *args, **kwargs)
else:
raise AssertionError("Choose either 'pymzml' or 'pyteomics'")

data["SCAN_NUMBER"] = pd.to_numeric(data["SCAN_NUMBER"])
return data

@staticmethod
def _read_mzml_pymzml(file_list: List[Path], scanidx: Optional[List] = None, *args, **kwargs) -> pd.DataFrame:
data_dict = {}
with warnings.catch_warnings():
warnings.filterwarnings("ignore", category=ImportWarning)
for file_path in file_list:
mass_analyzer = get_mass_analyzer(file_path)
logger.info(f"Reading mzML file: {file_path}")
data_iter = mzml.read(source=str(file_path), *args, **kwargs)
data_iter = pymzml.run.Reader(file_path, args=args, kwargs=kwargs)
file_name = file_path.stem
for spec in data_iter:
if spec["ms level"] != 1: # filter out ms1 spectra if there are any
spec_id = spec["id"].split("scan=")[-1]
instrument_configuration_ref = spec["scanList"]["scan"][0]["instrumentConfigurationRef"]
fragmentation = spec["scanList"]["scan"][0]["filter string"].split("@")[1][:3].upper()
mz_range = spec["scanList"]["scan"][0]["filter string"].split("[")[1][:-1]
rt = spec["scanList"]["scan"][0]["scan start time"]
key = f"{file_name}_{spec_id}"
data[key] = [
file_name,
spec_id,
spec["intensity array"],
spec["m/z array"],
mz_range,
rt,
mass_analyzer[instrument_configuration_ref],
fragmentation,
]
mass_analyzer = get_mass_analyzer(file_path)
namespace = "{http://psi.hupo.org/ms/mzml}"

if scanidx is None:
spectra = data_iter
else:
# this does not work if some spectra are filtered out, e.g. mzML files with only MS2 spectra, see:
# https://github.com/pymzml/pymzML/blob/a883ff0e61fd97465b0a74667233ff594238e335/pymzml/file_classes
# /standardMzml.py#L81-L84
spectra = (data_iter[idx] for idx in scanidx)

for spec in spectra:
if spec.ms_level != 2:
continue # filter out ms1 spectra if there are any
key = f"{file_name}_{spec.ID}"
scan = spec.get_element_by_path(["scanList", "scan"])[0]
instrument_configuration_ref = scan.get("instrumentConfigurationRef", "")
activation = spec.get_element_by_path(["precursorList", "precursor", "activation"])[0]
fragmentation = "unknown"
collision_energy = 0.0
for cv_param in activation:
name = cv_param.get("name")
if name == "collision energy":
collision_energy = float(cv_param.get("value"))
continue
if "beam-type" in name:
fragmentation = "HCD"
elif "collision-induced dissociation" in name:
fragmentation = "CID"
else:
fragmentation = name
scan_window = scan.find(f".//{namespace}scanWindow")
scan_lower_limit = float(
scan_window.find(f'./{namespace}cvParam[@accession="MS:1000501"]').get("value")
)
scan_upper_limit = float(
scan_window.find(f'./{namespace}cvParam[@accession="MS:1000500"]').get("value")
)
mz_range = f"{scan_lower_limit}-{scan_upper_limit}"
data_dict[key] = [
file_name,
spec.ID,
spec.i,
spec.mz,
mz_range,
spec.scan_time_in_minutes(),
mass_analyzer.get(instrument_configuration_ref, "unknown"),
fragmentation,
collision_energy,
]
data_iter.close()
else:
raise AssertionError("Choose either 'pymzml' or 'pyteomics'")
data = pd.DataFrame.from_dict(data_dict, orient="index", columns=MZML_DATA_COLUMNS)
return data

data = pd.DataFrame.from_dict(data, orient="index", columns=MZML_DATA_COLUMNS)
data["SCAN_NUMBER"] = pd.to_numeric(data["SCAN_NUMBER"])
@staticmethod
def _read_mzml_pyteomics(file_list: List[Path], *args, **kwargs) -> pd.DataFrame:
data_dict = {}
for file_path in file_list:
mass_analyzer = get_mass_analyzer(file_path)
logger.info(f"Reading mzML file: {file_path}")
data_iter = mzml.read(source=str(file_path), *args, **kwargs)
file_name = file_path.stem
for spec in data_iter:
if spec["ms level"] != 2:
continue # filter out ms1 spectra if there are any
spec_id = spec["id"].split("scan=")[-1]
scan = spec["scanList"]["scan"][0]
instrument_configuration_ref = scan.get("instrumentConfigurationRef", "")
activation = spec["precursorList"]["precursor"][0]["activation"]
fragmentation = "unknown"
collision_energy = 0.0
for key, value in activation.items():
if key == "collision energy":
collision_energy = value
elif "beam-type" in key:
fragmentation = "HCD"
elif "collision-induced dissociation" in key:
fragmentation = "CID"
else:
fragmentation = key
scan_lower_limit = scan["scanWindowList"]["scanWindow"][0]["scan window lower limit"]
scan_upper_limit = scan["scanWindowList"]["scanWindow"][0]["scan window upper limit"]
mz_range = f"{scan_lower_limit}-{scan_upper_limit}"
rt = spec["scanList"]["scan"][0]["scan start time"]
key = f"{file_name}_{spec_id}"
data_dict[key] = [
file_name,
spec_id,
spec["intensity array"],
spec["m/z array"],
mz_range,
rt,
mass_analyzer.get(instrument_configuration_ref, "unknown"),
fragmentation,
collision_energy,
]
data_iter.close()
data = pd.DataFrame.from_dict(data_dict, orient="index", columns=MZML_DATA_COLUMNS)
return data

@staticmethod
@@ -206,62 +287,3 @@ def get_file_list(source: Union[str, Path, List[Union[str, Path]]], ext: str = "
else:
raise TypeError("source can only be a single str or Path or a list of files.")
return file_list

@staticmethod
def _get_scans_pymzml(
file_path: Union[str, Path], data: Dict, scanidx: Optional[List] = None, *args, **kwargs
) -> None:
"""
Reads mzml and generates a dataframe containing intensities and m/z values.
:param file_path: path to a single mzml file.
:param data: dictionary to be added to by this function
:param scanidx: optional list of scan numbers to extract. if not specified, all scans will be extracted
:param args: additional positional arguments
:param kwargs: additional keyword arguments
"""
if isinstance(file_path, str):
file_path = Path(file_path)
data_iter = pymzml.run.Reader(file_path, args=args, kwargs=kwargs)
file_name = file_path.stem
mass_analyzer = get_mass_analyzer(file_path)
if scanidx is None:
for spec in data_iter:
if spec.ms_level != 1: # filter out ms1 spectra if there are any
key = f"{file_name}_{spec.ID}"
instrument_configuration_ref = spec["scanList"]["scan"][0]["instrumentConfigurationRef"]
filter_string = str(spec.element.find(".//*[@accession='MS:1000512']").get("value"))
fragmentation = filter_string.split("@")[1][:3].upper()
mz_range = filter_string.split("[")[1][:-1]
data[key] = [
file_name,
spec.ID,
spec.i,
spec.mz,
mz_range,
spec.scan_time_in_minutes(),
mass_analyzer[instrument_configuration_ref],
fragmentation,
]
else:
for idx in scanidx:
spec = data_iter[idx]
# this does not work if some spectra are filtered out, e.g. mzML files with only MS2 spectra, see:
# https://github.com/pymzml/pymzML/blob/a883ff0e61fd97465b0a74667233ff594238e335/pymzml/file_classes
# /standardMzml.py#L81-L84
key = f"{file_name}_{spec.ID}"
instrument_configuration_ref = spec["scanList"]["scan"][0]["instrumentConfigurationRef"]
filter_string = str(spec.element.find(".//*[@accession='MS:1000512']").get("value"))
fragmentation = filter_string.split("@")[1][:3].upper()
mz_range = filter_string.split("[")[1][:-1]
data[key] = [
file_name,
spec.ID,
spec.i,
spec.mz,
mz_range,
spec.scan_time_in_minutes(),
mass_analyzer[instrument_configuration_ref],
fragmentation,
]
data_iter.close()
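With this change, read_mzml only dispatches to backend-specific helpers (_read_mzml_pymzml / _read_mzml_pyteomics), both of which now take the fragmentation method and collision energy from the activation cvParams and the m/z range from the scan window, instead of parsing the Thermo filter string. A hedged usage sketch, assuming read_mzml is callable as shown with the argument names from the hunks above:

```python
# Hedged sketch, not taken from the repository's docs: argument names follow the
# hunks above; the exact signature and defaults may differ.
from spectrum_io.raw.msraw import MSRaw

# Reads MS2 spectra from the given file(s) with the chosen backend and returns
# a DataFrame with one row per spectrum (columns defined by MZML_DATA_COLUMNS).
df = MSRaw.read_mzml(source="run01.mzML", package="pyteomics")
print(df["SCAN_NUMBER"].head())
```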
1 change: 1 addition & 0 deletions spectrum_io/search_result/__init__.py
@@ -2,3 +2,4 @@
from .mascot import Mascot
from .maxquant import MaxQuant
from .msfragger import MSFragger
from .sage import Sage
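The package now also exports the new Sage search-result reader. A hedged import sketch (its interface is assumed to follow the other readers in this module, e.g. MSFragger below):

```python
# Assumption: Sage follows the same SearchResults pattern as MSFragger shown
# below (a read_result(path, tmt_labeled) entry point); not verified here.
from spectrum_io.search_result import Sage
```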
33 changes: 13 additions & 20 deletions spectrum_io/search_result/msfragger.py
@@ -5,7 +5,7 @@
import pandas as pd
import spectrum_fundamentals.constants as c
from pyteomics import pepxml
from spectrum_fundamentals.mod_string import internal_without_mods
from spectrum_fundamentals.mod_string import internal_without_mods, msfragger_to_internal
from tqdm import tqdm

from .search_results import SearchResults, filter_valid_prosit_sequences
@@ -42,7 +42,7 @@ def read_result(path: Union[str, Path], tmt_labeled: str) -> pd.DataFrame:

df = pd.concat(ms_frag_results)

df = update_columns_for_prosit(df, "")
df = update_columns_for_prosit(df, tmt_labeled)
return filter_valid_prosit_sequences(df)


@@ -58,7 +58,17 @@ def update_columns_for_prosit(df, tmt_labeled: str) -> pd.DataFrame:
df["RAW_FILE"] = df["spectrum"].apply(lambda x: x.split(".")[0])
df["MASS"] = df["precursor_neutral_mass"]
df["PEPTIDE_LENGTH"] = df["peptide"].apply(lambda x: len(x))
df["MODIFIED_SEQUENCE"] = msfragger_to_internal(df["modified_peptide"])

if tmt_labeled != "":
unimod_tag = c.TMT_MODS[tmt_labeled]
logger.info("Adding TMT fixed modifications")
df["MODIFIED_SEQUENCE"] = msfragger_to_internal(
df["modified_peptide"].to_list(),
fixed_mods={"C": "C[UNIMOD:4]", r"n[\d+]": f"{unimod_tag}-", "K": f"K{unimod_tag}"},
)
else:
df["MODIFIED_SEQUENCE"] = msfragger_to_internal(df["modified_peptide"].to_list())

df.rename(
columns={
"assumed_charge": "PRECURSOR_CHARGE",
@@ -84,20 +94,3 @@ def update_columns_for_prosit(df, tmt_labeled: str) -> pd.DataFrame:
"PEPTIDE_LENGTH",
]
]


def msfragger_to_internal(modstrings: pd.Series):
"""
Transform modstring from msfragger format to internal format.
This function takes a modstrings column from a pandas dataframe and converts each
supported modification (M[147] and C[160]) to the internal representation that is
M[UNIMOD:35] and C[UNIMOD:4], respectively. Since C is considered a fixed modification,
every occurence of a C is transformed to C[UNIMOD:4] as well.
:param modstrings: pd.Series containing the msfragger modstrings
:return: pd.Series with internal modstrings
"""
modstrings = modstrings.str.replace("M[147]", "M[UNIMOD:35]", regex=False)
modstrings = modstrings.str.replace(r"C\[160\]|C", "C[UNIMOD:4]", regex=True)
return modstrings
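The local msfragger_to_internal helper above is dropped in favour of the shared spectrum_fundamentals implementation, which accepts a fixed_mods mapping so TMT tags can be injected when tmt_labeled is set. A hedged sketch of the documented substitutions, re-implemented only for illustration (the real spectrum_fundamentals function may behave differently):

```python
import re

# Illustration only: reproduces the substitutions described in the removed
# docstring above (M[147] -> M[UNIMOD:35]; C[160] and bare C -> C[UNIMOD:4]).
# It is NOT the spectrum_fundamentals implementation and ignores fixed_mods.
def msfragger_to_internal_sketch(modstring: str) -> str:
    modstring = modstring.replace("M[147]", "M[UNIMOD:35]")
    return re.sub(r"C\[160\]|C", "C[UNIMOD:4]", modstring)


print(msfragger_to_internal_sketch("AM[147]CDEK"))  # AM[UNIMOD:35]C[UNIMOD:4]DEK
```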