Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fixed instrumentConfigurationRef missing #75

Merged
merged 10 commits into from
Nov 10, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .flake8
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ max-line-length = 120
max-complexity = 10
docstring-convention = google
per-file-ignores =
tests/*:S101
tests/*:S101,S301,S403
noxfile.py:DAR101
spectrum_io/raw/thermo_raw.py:S603,S404
spectrum_io/raw/msraw.py:S405,S314
Expand Down
1,347 changes: 684 additions & 663 deletions poetry.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ pymzml = "^2.5.0"
pyteomics = "^4.3.3"
lxml= '^4.5.2'
tables = "^3.6.1"
spectrum-fundamentals = ">=0.4.3,<0.5.0"
spectrum-fundamentals = ">=0.4.4,<0.5.0"

[tool.poetry.dev-dependencies]
pytest = ">=6.2.3"
Expand Down
202 changes: 112 additions & 90 deletions spectrum_io/raw/msraw.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
accession = mass_analyzers[elem]
if accession in ["MS:1000079", "MS:1000484"]: # fourier transform ion cyclotron, orbitrap
mass_analyzers[elem] = "FTMS"
elif accession in ["MS:1000082", "MS:1000264" "MS:1000078"]: # quadrupole ion-trap, ion-trap, linear ion-trap
elif accession in ["MS:1000082", "MS:1000264", "MS:1000078"]: # quadrupole ion-trap, ion-trap, linear ion-trap
mass_analyzers[elem] = "ITMS"
elif accession in ["MS:1000084"]: # TOF
mass_analyzers[elem] = "TOF"
Expand Down Expand Up @@ -135,43 +135,124 @@
:return: pd.DataFrame with intensities and m/z values
"""
file_list = MSRaw.get_file_list(source, ext)
data = {} # type: Dict[str, Any]

if package == "pymzml":
with warnings.catch_warnings():
warnings.filterwarnings("ignore", category=ImportWarning)
for file_path in file_list:
logger.info(f"Reading mzML file: {file_path}")
MSRaw._get_scans_pymzml(file_path, data, scanidx, *args, **kwargs)
data = MSRaw._read_mzml_pymzml(file_list, scanidx, *args, **kwargs)
elif package == "pyteomics":
data = MSRaw._read_mzml_pyteomics(file_list, *args, **kwargs)
else:
raise AssertionError("Choose either 'pymzml' or 'pyteomics'")

Check warning on line 144 in spectrum_io/raw/msraw.py

View check run for this annotation

Codecov / codecov/patch

spectrum_io/raw/msraw.py#L144

Added line #L144 was not covered by tests

data["SCAN_NUMBER"] = pd.to_numeric(data["SCAN_NUMBER"])
return data

@staticmethod
def _read_mzml_pymzml(file_list: List[Path], scanidx: Optional[List] = None, *args, **kwargs) -> pd.DataFrame:
data_dict = {}
with warnings.catch_warnings():
warnings.filterwarnings("ignore", category=ImportWarning)
for file_path in file_list:
mass_analyzer = get_mass_analyzer(file_path)
logger.info(f"Reading mzML file: {file_path}")
data_iter = mzml.read(source=str(file_path), *args, **kwargs)
data_iter = pymzml.run.Reader(file_path, args=args, kwargs=kwargs)
file_name = file_path.stem
for spec in data_iter:
if spec["ms level"] != 1: # filter out ms1 spectra if there are any
spec_id = spec["id"].split("scan=")[-1]
instrument_configuration_ref = spec["scanList"]["scan"][0]["instrumentConfigurationRef"]
fragmentation = spec["scanList"]["scan"][0]["filter string"].split("@")[1][:3].upper()
mz_range = spec["scanList"]["scan"][0]["filter string"].split("[")[1][:-1]
rt = spec["scanList"]["scan"][0]["scan start time"]
key = f"{file_name}_{spec_id}"
data[key] = [
file_name,
spec_id,
spec["intensity array"],
spec["m/z array"],
mz_range,
rt,
mass_analyzer[instrument_configuration_ref],
fragmentation,
]
mass_analyzer = get_mass_analyzer(file_path)
namespace = "{http://psi.hupo.org/ms/mzml}"

if scanidx is None:
spectra = data_iter
else:
# this does not work if some spectra are filtered out, e.g. mzML files with only MS2 spectra, see:
# https://github.com/pymzml/pymzML/blob/a883ff0e61fd97465b0a74667233ff594238e335/pymzml/file_classes
# /standardMzml.py#L81-L84
spectra = (data_iter[idx] for idx in scanidx)

for spec in spectra:
if spec.ms_level != 2:
continue # filter out ms1 spectra if there are any
key = f"{file_name}_{spec.ID}"
scan = spec.get_element_by_path(["scanList", "scan"])[0]
instrument_configuration_ref = scan.get("instrumentConfigurationRef", "")
activation = spec.get_element_by_path(["precursorList", "precursor", "activation"])[0]
fragmentation = "unknown"
collision_energy = 0.0
for cv_param in activation:
name = cv_param.get("name")
if name == "collision energy":
collision_energy = float(cv_param.get("value"))
continue
if "beam-type" in name:
fragmentation = "HCD"
elif "collision-induced dissociation" in name:
fragmentation = "CID"

Check warning on line 186 in spectrum_io/raw/msraw.py

View check run for this annotation

Codecov / codecov/patch

spectrum_io/raw/msraw.py#L186

Added line #L186 was not covered by tests
else:
fragmentation = name

Check warning on line 188 in spectrum_io/raw/msraw.py

View check run for this annotation

Codecov / codecov/patch

spectrum_io/raw/msraw.py#L188

Added line #L188 was not covered by tests
scan_window = scan.find(f".//{namespace}scanWindow")
scan_lower_limit = float(
scan_window.find(f'./{namespace}cvParam[@accession="MS:1000501"]').get("value")
)
scan_upper_limit = float(
scan_window.find(f'./{namespace}cvParam[@accession="MS:1000500"]').get("value")
)
mz_range = f"{scan_lower_limit}-{scan_upper_limit}"
data_dict[key] = [
file_name,
spec.ID,
spec.i,
spec.mz,
mz_range,
spec.scan_time_in_minutes(),
mass_analyzer.get(instrument_configuration_ref, "unknown"),
fragmentation,
collision_energy,
]
data_iter.close()
else:
raise AssertionError("Choose either 'pymzml' or 'pyteomics'")
data = pd.DataFrame.from_dict(data_dict, orient="index", columns=MZML_DATA_COLUMNS)
return data

data = pd.DataFrame.from_dict(data, orient="index", columns=MZML_DATA_COLUMNS)
data["SCAN_NUMBER"] = pd.to_numeric(data["SCAN_NUMBER"])
@staticmethod
def _read_mzml_pyteomics(file_list: List[Path], *args, **kwargs) -> pd.DataFrame:
data_dict = {}
for file_path in file_list:
mass_analyzer = get_mass_analyzer(file_path)
logger.info(f"Reading mzML file: {file_path}")
data_iter = mzml.read(source=str(file_path), *args, **kwargs)
file_name = file_path.stem
for spec in data_iter:
if spec["ms level"] != 2:
continue # filter out ms1 spectra if there are any
spec_id = spec["id"].split("scan=")[-1]
scan = spec["scanList"]["scan"][0]
instrument_configuration_ref = scan.get("instrumentConfigurationRef", "")
activation = spec["precursorList"]["precursor"][0]["activation"]
fragmentation = "unknown"
collision_energy = 0.0
for key, value in activation.items():
if key == "collision energy":
collision_energy = value
elif "beam-type" in key:
fragmentation = "HCD"
elif "collision-induced dissociation" in key:
fragmentation = "CID"

Check warning on line 235 in spectrum_io/raw/msraw.py

View check run for this annotation

Codecov / codecov/patch

spectrum_io/raw/msraw.py#L235

Added line #L235 was not covered by tests
else:
fragmentation = key

Check warning on line 237 in spectrum_io/raw/msraw.py

View check run for this annotation

Codecov / codecov/patch

spectrum_io/raw/msraw.py#L237

Added line #L237 was not covered by tests
scan_lower_limit = scan["scanWindowList"]["scanWindow"][0]["scan window lower limit"]
scan_upper_limit = scan["scanWindowList"]["scanWindow"][0]["scan window upper limit"]
mz_range = f"{scan_lower_limit}-{scan_upper_limit}"
rt = spec["scanList"]["scan"][0]["scan start time"]
key = f"{file_name}_{spec_id}"
data_dict[key] = [
file_name,
spec_id,
spec["intensity array"],
spec["m/z array"],
mz_range,
rt,
mass_analyzer.get(instrument_configuration_ref, "unknown"),
fragmentation,
collision_energy,
]
data_iter.close()
data = pd.DataFrame.from_dict(data_dict, orient="index", columns=MZML_DATA_COLUMNS)
return data

@staticmethod
Expand Down Expand Up @@ -206,62 +287,3 @@
else:
raise TypeError("source can only be a single str or Path or a list of files.")
return file_list

@staticmethod
def _get_scans_pymzml(
file_path: Union[str, Path], data: Dict, scanidx: Optional[List] = None, *args, **kwargs
) -> None:
"""
Reads mzml and generates a dataframe containing intensities and m/z values.

:param file_path: path to a single mzml file.
:param data: dictionary to be added to by this function
:param scanidx: optional list of scan numbers to extract. if not specified, all scans will be extracted
:param args: additional positional arguments
:param kwargs: additional keyword arguments
"""
if isinstance(file_path, str):
file_path = Path(file_path)
data_iter = pymzml.run.Reader(file_path, args=args, kwargs=kwargs)
file_name = file_path.stem
mass_analyzer = get_mass_analyzer(file_path)
if scanidx is None:
for spec in data_iter:
if spec.ms_level != 1: # filter out ms1 spectra if there are any
key = f"{file_name}_{spec.ID}"
instrument_configuration_ref = spec["scanList"]["scan"][0]["instrumentConfigurationRef"]
filter_string = str(spec.element.find(".//*[@accession='MS:1000512']").get("value"))
fragmentation = filter_string.split("@")[1][:3].upper()
mz_range = filter_string.split("[")[1][:-1]
data[key] = [
file_name,
spec.ID,
spec.i,
spec.mz,
mz_range,
spec.scan_time_in_minutes(),
mass_analyzer[instrument_configuration_ref],
fragmentation,
]
else:
for idx in scanidx:
spec = data_iter[idx]
# this does not work if some spectra are filtered out, e.g. mzML files with only MS2 spectra, see:
# https://github.com/pymzml/pymzML/blob/a883ff0e61fd97465b0a74667233ff594238e335/pymzml/file_classes
# /standardMzml.py#L81-L84
key = f"{file_name}_{spec.ID}"
instrument_configuration_ref = spec["scanList"]["scan"][0]["instrumentConfigurationRef"]
filter_string = str(spec.element.find(".//*[@accession='MS:1000512']").get("value"))
fragmentation = filter_string.split("@")[1][:3].upper()
mz_range = filter_string.split("[")[1][:-1]
data[key] = [
file_name,
spec.ID,
spec.i,
spec.mz,
mz_range,
spec.scan_time_in_minutes(),
mass_analyzer[instrument_configuration_ref],
fragmentation,
]
data_iter.close()
Binary file added tests/unit_tests/data/testdf.pkl
Binary file not shown.
25 changes: 17 additions & 8 deletions tests/unit_tests/test_msraw.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import pickle
import unittest
from pathlib import Path

Expand All @@ -8,16 +9,24 @@
import spectrum_io.raw.msraw as msraw


def _test_read_mzml(package: str):
source = Path(__file__).parent / "data/test.mzml"

msraw_obj = msraw.MSRaw(source, source)
df = msraw_obj.read_mzml(source, package=package)
# pickle.dump(df, open(Path(__file__).parent / "data/testdf.pkl", "wb"))

target_df = pickle.load(open(Path(__file__).parent / "data/testdf.pkl", "rb"))
pd.testing.assert_frame_equal(df, target_df)


class TestMsraw(unittest.TestCase):
"""Class to test msraw."""

def test_read_mzml(self):
def test_read_mzml_with_pyteomics(self):
"""Test read_mzml."""
source = Path(__file__).parent / "data/test.mzml"

msraw_obj = msraw.MSRaw(source, source)
df = msraw_obj.read_mzml(source)
_test_read_mzml(package="pyteomics")

self.assertIsInstance(df, pd.DataFrame)
self.assertEqual(df.shape[1], 8)
self.assertListEqual(list(df.columns), MZML_DATA_COLUMNS)
def test_read_mzml_with_pymzml(self):
"""Test read_mzml."""
_test_read_mzml(package="pymzml")
Loading