Skip to content

Commit

Permalink
Merge branch 'development' into feature/msfragger_mods_tmt
Browse files Browse the repository at this point in the history
  • Loading branch information
picciama committed Nov 10, 2023
2 parents c1f4b4a + 2f5c0e8 commit 31ca434
Show file tree
Hide file tree
Showing 6 changed files with 815 additions and 763 deletions.
2 changes: 1 addition & 1 deletion .flake8
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ max-line-length = 120
max-complexity = 10
docstring-convention = google
per-file-ignores =
tests/*:S101
tests/*:S101,S301,S403
noxfile.py:DAR101
spectrum_io/raw/thermo_raw.py:S603,S404
spectrum_io/raw/msraw.py:S405,S314
Expand Down
1,347 changes: 684 additions & 663 deletions poetry.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ pymzml = "^2.5.0"
pyteomics = "^4.3.3"
lxml= '^4.5.2'
tables = "^3.6.1"
spectrum-fundamentals = ">=0.4.3,<0.5.0"
spectrum-fundamentals = ">=0.4.4,<0.5.0"

[tool.poetry.dev-dependencies]
pytest = ">=6.2.3"
Expand Down
202 changes: 112 additions & 90 deletions spectrum_io/raw/msraw.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def check_analyzer(mass_analyzers: Dict[str, str]) -> Dict[str, str]:
accession = mass_analyzers[elem]
if accession in ["MS:1000079", "MS:1000484"]: # fourier transform ion cyclotron, orbitrap
mass_analyzers[elem] = "FTMS"
elif accession in ["MS:1000082", "MS:1000264" "MS:1000078"]: # quadrupole ion-trap, ion-trap, linear ion-trap
elif accession in ["MS:1000082", "MS:1000264", "MS:1000078"]: # quadrupole ion-trap, ion-trap, linear ion-trap
mass_analyzers[elem] = "ITMS"
elif accession in ["MS:1000084"]: # TOF
mass_analyzers[elem] = "TOF"
Expand Down Expand Up @@ -135,43 +135,124 @@ def read_mzml(
:return: pd.DataFrame with intensities and m/z values
"""
file_list = MSRaw.get_file_list(source, ext)
data = {} # type: Dict[str, Any]

if package == "pymzml":
with warnings.catch_warnings():
warnings.filterwarnings("ignore", category=ImportWarning)
for file_path in file_list:
logger.info(f"Reading mzML file: {file_path}")
MSRaw._get_scans_pymzml(file_path, data, scanidx, *args, **kwargs)
data = MSRaw._read_mzml_pymzml(file_list, scanidx, *args, **kwargs)
elif package == "pyteomics":
data = MSRaw._read_mzml_pyteomics(file_list, *args, **kwargs)
else:
raise AssertionError("Choose either 'pymzml' or 'pyteomics'")

data["SCAN_NUMBER"] = pd.to_numeric(data["SCAN_NUMBER"])
return data

@staticmethod
def _read_mzml_pymzml(file_list: List[Path], scanidx: Optional[List] = None, *args, **kwargs) -> pd.DataFrame:
data_dict = {}
with warnings.catch_warnings():
warnings.filterwarnings("ignore", category=ImportWarning)
for file_path in file_list:
mass_analyzer = get_mass_analyzer(file_path)
logger.info(f"Reading mzML file: {file_path}")
data_iter = mzml.read(source=str(file_path), *args, **kwargs)
data_iter = pymzml.run.Reader(file_path, args=args, kwargs=kwargs)
file_name = file_path.stem
for spec in data_iter:
if spec["ms level"] != 1: # filter out ms1 spectra if there are any
spec_id = spec["id"].split("scan=")[-1]
instrument_configuration_ref = spec["scanList"]["scan"][0]["instrumentConfigurationRef"]
fragmentation = spec["scanList"]["scan"][0]["filter string"].split("@")[1][:3].upper()
mz_range = spec["scanList"]["scan"][0]["filter string"].split("[")[1][:-1]
rt = spec["scanList"]["scan"][0]["scan start time"]
key = f"{file_name}_{spec_id}"
data[key] = [
file_name,
spec_id,
spec["intensity array"],
spec["m/z array"],
mz_range,
rt,
mass_analyzer[instrument_configuration_ref],
fragmentation,
]
mass_analyzer = get_mass_analyzer(file_path)
namespace = "{http://psi.hupo.org/ms/mzml}"

if scanidx is None:
spectra = data_iter
else:
# this does not work if some spectra are filtered out, e.g. mzML files with only MS2 spectra, see:
# https://github.com/pymzml/pymzML/blob/a883ff0e61fd97465b0a74667233ff594238e335/pymzml/file_classes
# /standardMzml.py#L81-L84
spectra = (data_iter[idx] for idx in scanidx)

for spec in spectra:
if spec.ms_level != 2:
continue # filter out ms1 spectra if there are any
key = f"{file_name}_{spec.ID}"
scan = spec.get_element_by_path(["scanList", "scan"])[0]
instrument_configuration_ref = scan.get("instrumentConfigurationRef", "")
activation = spec.get_element_by_path(["precursorList", "precursor", "activation"])[0]
fragmentation = "unknown"
collision_energy = 0.0
for cv_param in activation:
name = cv_param.get("name")
if name == "collision energy":
collision_energy = float(cv_param.get("value"))
continue
if "beam-type" in name:
fragmentation = "HCD"
elif "collision-induced dissociation" in name:
fragmentation = "CID"
else:
fragmentation = name
scan_window = scan.find(f".//{namespace}scanWindow")
scan_lower_limit = float(
scan_window.find(f'./{namespace}cvParam[@accession="MS:1000501"]').get("value")
)
scan_upper_limit = float(
scan_window.find(f'./{namespace}cvParam[@accession="MS:1000500"]').get("value")
)
mz_range = f"{scan_lower_limit}-{scan_upper_limit}"
data_dict[key] = [
file_name,
spec.ID,
spec.i,
spec.mz,
mz_range,
spec.scan_time_in_minutes(),
mass_analyzer.get(instrument_configuration_ref, "unknown"),
fragmentation,
collision_energy,
]
data_iter.close()
else:
raise AssertionError("Choose either 'pymzml' or 'pyteomics'")
data = pd.DataFrame.from_dict(data_dict, orient="index", columns=MZML_DATA_COLUMNS)
return data

data = pd.DataFrame.from_dict(data, orient="index", columns=MZML_DATA_COLUMNS)
data["SCAN_NUMBER"] = pd.to_numeric(data["SCAN_NUMBER"])
@staticmethod
def _read_mzml_pyteomics(file_list: List[Path], *args, **kwargs) -> pd.DataFrame:
data_dict = {}
for file_path in file_list:
mass_analyzer = get_mass_analyzer(file_path)
logger.info(f"Reading mzML file: {file_path}")
data_iter = mzml.read(source=str(file_path), *args, **kwargs)
file_name = file_path.stem
for spec in data_iter:
if spec["ms level"] != 2:
continue # filter out ms1 spectra if there are any
spec_id = spec["id"].split("scan=")[-1]
scan = spec["scanList"]["scan"][0]
instrument_configuration_ref = scan.get("instrumentConfigurationRef", "")
activation = spec["precursorList"]["precursor"][0]["activation"]
fragmentation = "unknown"
collision_energy = 0.0
for key, value in activation.items():
if key == "collision energy":
collision_energy = value
elif "beam-type" in key:
fragmentation = "HCD"
elif "collision-induced dissociation" in key:
fragmentation = "CID"
else:
fragmentation = key
scan_lower_limit = scan["scanWindowList"]["scanWindow"][0]["scan window lower limit"]
scan_upper_limit = scan["scanWindowList"]["scanWindow"][0]["scan window upper limit"]
mz_range = f"{scan_lower_limit}-{scan_upper_limit}"
rt = spec["scanList"]["scan"][0]["scan start time"]
key = f"{file_name}_{spec_id}"
data_dict[key] = [
file_name,
spec_id,
spec["intensity array"],
spec["m/z array"],
mz_range,
rt,
mass_analyzer.get(instrument_configuration_ref, "unknown"),
fragmentation,
collision_energy,
]
data_iter.close()
data = pd.DataFrame.from_dict(data_dict, orient="index", columns=MZML_DATA_COLUMNS)
return data

@staticmethod
Expand Down Expand Up @@ -206,62 +287,3 @@ def get_file_list(source: Union[str, Path, List[Union[str, Path]]], ext: str = "
else:
raise TypeError("source can only be a single str or Path or a list of files.")
return file_list

@staticmethod
def _get_scans_pymzml(
file_path: Union[str, Path], data: Dict, scanidx: Optional[List] = None, *args, **kwargs
) -> None:
"""
Reads mzml and generates a dataframe containing intensities and m/z values.
:param file_path: path to a single mzml file.
:param data: dictionary to be added to by this function
:param scanidx: optional list of scan numbers to extract. if not specified, all scans will be extracted
:param args: additional positional arguments
:param kwargs: additional keyword arguments
"""
if isinstance(file_path, str):
file_path = Path(file_path)
data_iter = pymzml.run.Reader(file_path, args=args, kwargs=kwargs)
file_name = file_path.stem
mass_analyzer = get_mass_analyzer(file_path)
if scanidx is None:
for spec in data_iter:
if spec.ms_level != 1: # filter out ms1 spectra if there are any
key = f"{file_name}_{spec.ID}"
instrument_configuration_ref = spec["scanList"]["scan"][0]["instrumentConfigurationRef"]
filter_string = str(spec.element.find(".//*[@accession='MS:1000512']").get("value"))
fragmentation = filter_string.split("@")[1][:3].upper()
mz_range = filter_string.split("[")[1][:-1]
data[key] = [
file_name,
spec.ID,
spec.i,
spec.mz,
mz_range,
spec.scan_time_in_minutes(),
mass_analyzer[instrument_configuration_ref],
fragmentation,
]
else:
for idx in scanidx:
spec = data_iter[idx]
# this does not work if some spectra are filtered out, e.g. mzML files with only MS2 spectra, see:
# https://github.com/pymzml/pymzML/blob/a883ff0e61fd97465b0a74667233ff594238e335/pymzml/file_classes
# /standardMzml.py#L81-L84
key = f"{file_name}_{spec.ID}"
instrument_configuration_ref = spec["scanList"]["scan"][0]["instrumentConfigurationRef"]
filter_string = str(spec.element.find(".//*[@accession='MS:1000512']").get("value"))
fragmentation = filter_string.split("@")[1][:3].upper()
mz_range = filter_string.split("[")[1][:-1]
data[key] = [
file_name,
spec.ID,
spec.i,
spec.mz,
mz_range,
spec.scan_time_in_minutes(),
mass_analyzer[instrument_configuration_ref],
fragmentation,
]
data_iter.close()
Binary file added tests/unit_tests/data/testdf.pkl
Binary file not shown.
25 changes: 17 additions & 8 deletions tests/unit_tests/test_msraw.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import pickle
import unittest
from pathlib import Path

Expand All @@ -8,16 +9,24 @@
import spectrum_io.raw.msraw as msraw


def _test_read_mzml(package: str):
source = Path(__file__).parent / "data/test.mzml"

msraw_obj = msraw.MSRaw(source, source)
df = msraw_obj.read_mzml(source, package=package)
# pickle.dump(df, open(Path(__file__).parent / "data/testdf.pkl", "wb"))

target_df = pickle.load(open(Path(__file__).parent / "data/testdf.pkl", "rb"))
pd.testing.assert_frame_equal(df, target_df)


class TestMsraw(unittest.TestCase):
"""Class to test msraw."""

def test_read_mzml(self):
def test_read_mzml_with_pyteomics(self):
"""Test read_mzml."""
source = Path(__file__).parent / "data/test.mzml"

msraw_obj = msraw.MSRaw(source, source)
df = msraw_obj.read_mzml(source)
_test_read_mzml(package="pyteomics")

self.assertIsInstance(df, pd.DataFrame)
self.assertEqual(df.shape[1], 8)
self.assertListEqual(list(df.columns), MZML_DATA_COLUMNS)
def test_read_mzml_with_pymzml(self):
"""Test read_mzml."""
_test_read_mzml(package="pymzml")

0 comments on commit 31ca434

Please sign in to comment.