Merge pull request #144 from wilhelm-lab/patch/0.6.1
Patch/0.6.1
picciama authored Aug 8, 2024
2 parents 472db6b + 182fae3 commit cb45899
Showing 14 changed files with 423 additions and 372 deletions.
2 changes: 1 addition & 1 deletion .cookietemple.yml
@@ -15,5 +15,5 @@ full_name: Mario Picciani
email: [email protected]
project_name: spectrum_io
project_short_description: IO related functionalities for oktoberfest.
version: 0.6.0
version: 0.6.1
license: MIT
4 changes: 2 additions & 2 deletions .github/release-drafter.yml
@@ -1,5 +1,5 @@
name-template: "0.6.0 🌈" # <<COOKIETEMPLE_FORCE_BUMP>>
tag-template: 0.6.0 # <<COOKIETEMPLE_FORCE_BUMP>>
name-template: "0.6.1 🌈" # <<COOKIETEMPLE_FORCE_BUMP>>
tag-template: 0.6.1 # <<COOKIETEMPLE_FORCE_BUMP>>
exclude-labels:
- "skip-changelog"

2 changes: 1 addition & 1 deletion cookietemple.cfg
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 0.6.0
current_version = 0.6.1

[bumpversion_files_whitelisted]
init_file = spectrum_io/__init__.py
4 changes: 2 additions & 2 deletions docs/conf.py
@@ -52,9 +52,9 @@
# the built documents.
#
# The short X.Y version.
version = "0.6.0"
version = "0.6.1"
# The full version, including alpha/beta/rc tags.
release = "0.6.0"
release = "0.6.1"

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
582 changes: 322 additions & 260 deletions poetry.lock

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "spectrum_io"
version = "0.6.0" # <<COOKIETEMPLE_FORCE_BUMP>>
version = "0.6.1" # <<COOKIETEMPLE_FORCE_BUMP>>
description = "IO related functionalities for oktoberfest."
authors = ["Wilhelmlab at Technical University of Munich"]
license = "MIT"
@@ -23,7 +23,7 @@ click = ">=8.0.0"
rich = ">=10.3.0"
PyYAML = ">=5.4.1"
numpy = "^1.18.1"
pandas = "^1.3.0"
pandas = ">=1.3,<3.0"
h5py = "^3.1.0"
pyarrow = ">=16.0.0"
pymzml = "^2.5.0"
19 changes: 10 additions & 9 deletions requirements.txt
@@ -2,13 +2,13 @@ alabaster==0.7.16 ; python_version >= "3.9" and python_full_version < "3.11.0"
alphatims==1.0.8 ; python_full_version >= "3.9.0" and python_full_version < "3.11.0"
annotated-types==0.7.0 ; python_full_version >= "3.9.0" and python_full_version < "3.11.0"
anyio==4.4.0 ; python_version >= "3.9" and python_full_version < "3.11.0"
attrs==24.1.0 ; python_full_version >= "3.9.0" and python_full_version < "3.11.0"
attrs==24.2.0 ; python_full_version >= "3.9.0" and python_full_version < "3.11.0"
authlib==1.3.1 ; python_full_version >= "3.9.0" and python_full_version < "3.11.0"
babel==2.15.0 ; python_version >= "3.9" and python_full_version < "3.11.0"
bandit==1.7.9 ; python_full_version >= "3.9.0" and python_full_version < "3.11.0"
black==24.8.0 ; python_full_version >= "3.9.0" and python_full_version < "3.11.0"
certifi==2024.7.4 ; python_version >= "3.9" and python_full_version < "3.11.0"
cffi==1.16.0 ; python_full_version >= "3.9.0" and python_full_version < "3.11.0" and platform_python_implementation != "PyPy"
cffi==1.17.0 ; python_full_version >= "3.9.0" and python_full_version < "3.11.0" and platform_python_implementation != "PyPy"
cfgv==3.4.0 ; python_version >= "3.9" and python_full_version < "3.11.0"
charset-normalizer==3.3.2 ; python_version >= "3.9" and python_full_version < "3.11.0"
click==8.1.7 ; python_version >= "3.9" and python_full_version < "3.11.0"
@@ -45,17 +45,17 @@ lxml==5.2.2 ; python_full_version >= "3.9.0" and python_full_version < "3.11.0"
markdown-it-py==3.0.0 ; python_full_version >= "3.9.0" and python_full_version < "3.11.0"
markupsafe==2.1.5 ; python_version >= "3.9" and python_full_version < "3.11.0"
marshmallow==3.21.3 ; python_full_version >= "3.9.0" and python_full_version < "3.11.0"
matplotlib==3.9.0 ; python_version >= "3.9" and python_full_version < "3.11.0"
matplotlib==3.9.1.post1 ; python_version >= "3.9" and python_full_version < "3.11.0"
mccabe==0.7.0 ; python_full_version >= "3.9.0" and python_full_version < "3.11.0"
mdurl==0.1.2 ; python_full_version >= "3.9.0" and python_full_version < "3.11.0"
moepy==1.1.4 ; python_full_version >= "3.9.0" and python_full_version < "3.11.0"
mypy-extensions==1.0.0 ; python_full_version >= "3.9.0" and python_full_version < "3.11.0"
mypy==1.11.1 ; python_full_version >= "3.9.0" and python_full_version < "3.11.0"
nodeenv==1.9.1 ; python_version >= "3.9" and python_full_version < "3.11.0"
numba==0.60.0 ; python_version >= "3.9" and python_full_version < "3.11.0"
numpy==1.24.4 ; python_version >= "3.9" and python_full_version < "3.11.0"
numpy==1.24.4 ; python_version >= "3.9" and python_version < "3.11"
packaging==24.1 ; python_version >= "3.9" and python_full_version < "3.11.0"
pandas==1.5.3 ; python_full_version >= "3.9.0" and python_full_version < "3.11.0"
pandas==2.2.2 ; python_version >= "3.9" and python_full_version < "3.11.0"
pathspec==0.12.1 ; python_full_version >= "3.9.0" and python_full_version < "3.11.0"
pbr==6.0.0 ; python_full_version >= "3.9.0" and python_full_version < "3.11.0"
pep8-naming==0.14.1 ; python_full_version >= "3.9.0" and python_full_version < "3.11.0"
@@ -78,9 +78,9 @@ pyparsing==3.1.2 ; python_version >= "3.9" and python_full_version < "3.11.0"
pyteomics==4.7.3 ; python_full_version >= "3.9.0" and python_full_version < "3.11.0"
pytest==8.3.2 ; python_full_version >= "3.9.0" and python_full_version < "3.11.0"
python-dateutil==2.9.0.post0 ; python_version >= "3.9" and python_full_version < "3.11.0"
pytz==2024.1 ; python_full_version >= "3.9.0" and python_full_version < "3.11.0"
pytz==2024.1 ; python_version >= "3.9" and python_full_version < "3.11.0"
pyupgrade==3.17.0 ; python_version >= "3.9" and python_full_version < "3.11.0"
pyyaml==6.0.1 ; python_version >= "3.9" and python_full_version < "3.11.0"
pyyaml==6.0.2 ; python_version >= "3.9" and python_full_version < "3.11.0"
pyzstd==0.16.1 ; python_full_version >= "3.9.0" and python_full_version < "3.11.0"
regex==2024.7.24 ; python_full_version >= "3.9.0" and python_full_version < "3.11.0"
requests==2.32.3 ; python_version >= "3.9" and python_full_version < "3.11.0"
@@ -98,7 +98,7 @@ six==1.16.0 ; python_version >= "3.9" and python_full_version < "3.11.0"
sniffio==1.3.1 ; python_version >= "3.9" and python_full_version < "3.11.0"
snowballstemmer==2.2.0 ; python_version >= "3.9" and python_full_version < "3.11.0"
sortedcontainers==2.4.0 ; python_full_version >= "3.9.0" and python_full_version < "3.11.0"
spectrum-fundamentals==0.7.1 ; python_full_version >= "3.9.0" and python_full_version < "3.11.0"
spectrum-fundamentals==0.7.2 ; python_full_version >= "3.9.0" and python_full_version < "3.11.0"
sphinx-autobuild==2024.4.16 ; python_version >= "3.9" and python_full_version < "3.11.0"
sphinx-autodoc-typehints==2.2.3 ; python_version >= "3.9" and python_full_version < "3.11.0"
sphinx-click==6.0.0 ; python_full_version >= "3.9.0" and python_full_version < "3.11.0"
@@ -124,10 +124,11 @@ types-attrs==19.1.0 ; python_full_version >= "3.9.0" and python_full_version < "3.11.0"
types-pkg-resources==0.1.3 ; python_full_version >= "3.9.0" and python_full_version < "3.11.0"
types-requests==2.32.0.20240712 ; python_full_version >= "3.9.0" and python_full_version < "3.11.0"
typing-extensions==4.12.2 ; python_version >= "3.9" and python_version < "3.11"
tzdata==2024.1 ; python_version >= "3.9" and python_full_version < "3.11.0"
urllib3==2.2.2 ; python_version >= "3.9" and python_full_version < "3.11.0"
uvicorn==0.30.5 ; python_version >= "3.9" and python_full_version < "3.11.0"
virtualenv==20.26.3 ; python_version >= "3.9" and python_full_version < "3.11.0"
watchfiles==0.22.0 ; python_version >= "3.9" and python_full_version < "3.11.0"
watchfiles==0.23.0 ; python_version >= "3.9" and python_full_version < "3.11.0"
websockets==12.0 ; python_version >= "3.9" and python_full_version < "3.11.0"
xdoctest[colors]==1.1.6 ; python_full_version >= "3.9.0" and python_full_version < "3.11.0"
zipp==3.19.2 ; python_version >= "3.9" and python_version < "3.10"
2 changes: 1 addition & 1 deletion spectrum_io/__init__.py
@@ -5,7 +5,7 @@
__author__ = """The Oktoberfest development team (Wilhelmlab at Technical University of Munich)"""
__copyright__ = f"Copyright {datetime.now():%Y}, Wilhelmlab at Technical University of Munich"
__license__ = "MIT"
__version__ = "0.6.0"
__version__ = "0.6.1"

import logging
import logging.handlers
2 changes: 1 addition & 1 deletion spectrum_io/__main__.py
@@ -5,7 +5,7 @@


@click.command()
@click.version_option(version="0.6.0", message=click.style("spectrum_io Version: 0.6.0"))
@click.version_option(version="0.6.1", message=click.style("spectrum_io Version: 0.6.1"))
def main() -> None:
"""spectrum_io."""

27 changes: 6 additions & 21 deletions spectrum_io/file/parquet.py
@@ -7,12 +7,7 @@
import pyarrow.parquet as pq
import scipy

# TODO add sparse matrix / anndata support
# TODO add speed benchmarks
# TODO add support for HuggingFace datasets API

Pathlike = Union[Path, str]
Dataset = Union[pd.DataFrame, scipy.sparse.spmatrix]

logger = logging.getLogger(__name__)

@@ -53,36 +48,26 @@ def read_partition(path: Pathlike, dataset_name: str) -> pd.DataFrame:
return df


def write_file(data: Dataset, path: Pathlike) -> None:
def write_file(data: pd.DataFrame, path: Pathlike) -> None:
"""Writes a single DataFrame or matrix to a Parquet file.
:param data: Data to store
:param path: Path to write the Parquet file to
:raises NotImplementedError: if anything else but a Pandas DataFrame is used as the dataset
"""
if isinstance(data, pd.DataFrame):
data.to_parquet(path)
else:
raise NotImplementedError
data.to_parquet(path)


def write_partition(datasets: List[Dataset], path: Pathlike, dataset_names: List[str]) -> None:
def write_partition(datasets: List[pd.DataFrame], path: Pathlike, dataset_names: List[str]) -> None:
"""
Write several datasets to a Parquet dataset as a directory containing subdirectories partinioned by dataset name.
Write several datasets to a Parquet dataset as a directory containing subdirectories partitioned by dataset name.
:param datasets: Datasets to write
:param path: Root path to write the partitioned dataset to
:param dataset_names: Names to assign to the datasets for retrieval. Careful: If all of these are strings of ints,
Parquet will convert them to raw integers!
:raises NotImplementedError: if anything else but a Pandas DataFrame is used as the dataset
"""
if all(isinstance(x, pd.DataFrame) for x in datasets):
df = pd.concat([dataset.assign(dataset=name) for dataset, name in zip(datasets, dataset_names)])
table = pa.Table.from_pandas(df)
else:
raise NotImplementedError
df = pd.concat([dataset.assign(dataset=name) for dataset, name in zip(datasets, dataset_names)])
table = pa.Table.from_pandas(df)

if isinstance(path, str):
path = Path(path)
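
Note on the parquet.py change above: write_file and write_partition now accept plain pandas DataFrames only; the sparse-matrix branch and its NotImplementedError are gone. A minimal round-trip sketch follows — the import path and the example frames are assumptions, only the function names and argument order come from the diff:

import pandas as pd

from spectrum_io.file import parquet  # assumed import path for spectrum_io/file/parquet.py

run_a = pd.DataFrame({"mz": [100.1, 200.2], "intensity": [1.0, 2.0]})
run_b = pd.DataFrame({"mz": [300.3], "intensity": [3.0]})

# One DataFrame -> one Parquet file; no isinstance check or NotImplementedError anymore.
parquet.write_file(run_a, "run_a.parquet")

# Several DataFrames -> one dataset partitioned by the assigned dataset names.
parquet.write_partition([run_a, run_b], "all_runs", ["run_a", "run_b"])

# Read a single partition back by its name.
round_trip = parquet.read_partition("all_runs", "run_a")
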
2 changes: 0 additions & 2 deletions spectrum_io/raw/msraw.py
@@ -116,7 +116,6 @@ def read_mzml(
source: Union[str, Path, List[Union[str, Path]]],
ext: str = "mzml",
package: str = "pyteomics",
search_type: str = "Maxquant",
scanidx: Optional[List] = None,
*args,
**kwargs,
@@ -128,7 +127,6 @@
:param ext: file extension for searching a specified directory
:param package: package for parsing the mzml file. Can eiter be "pymzml" or "pyteomics"
:param scanidx: optional list of scan numbers to extract. if not specified, all scans will be extracted
:param search_type: type of the search (Maxquant, Mascot, Msfragger)
:param args: additional positional arguments
:param kwargs: additional keyword arguments
:raises AssertionError: if package has an unexpected type
16 changes: 15 additions & 1 deletion spectrum_io/search_result/maxquant.py
@@ -71,7 +71,21 @@ def read_result(
parsed_mods["^_"] = f"_{unimod_tag}-"

logger.info("Reading msms.txt file")
self.results = pd.read_csv(self.path / "msms.txt", sep="\t")
self.results = pd.read_csv(
self.path / "msms.txt",
usecols=[
"Raw file",
"Scan number",
"Modified sequence",
"Charge",
"Scan event number",
"Mass", # = Calculated Precursor mass; TODO get column with experimental Precursor mass instead
"Score",
"Reverse",
"Proteins",
],
sep="\t",
)

logger.info("Finished reading msms.txt file")

27 changes: 19 additions & 8 deletions spectrum_io/search_result/search_results.py
@@ -11,6 +11,20 @@
logger = logging.getLogger(__name__)


COLUMNS = [
"RAW_FILE",
"SCAN_NUMBER",
"MODIFIED_SEQUENCE",
"PRECURSOR_CHARGE",
"MASS",
"SCORE",
"REVERSE",
"SEQUENCE",
"PEPTIDE_LENGTH",
"PROTEINS",
]


def filter_valid_prosit_sequences(df: pd.DataFrame) -> pd.DataFrame:
"""
Filter valid Prosit sequences.
@@ -22,11 +36,8 @@ def filter_valid_prosit_sequences(df: pd.DataFrame) -> pd.DataFrame:
# retain only peptides that fall within [7, 30] length supported by Prosit
df = df[(df["PEPTIDE_LENGTH"] <= 30) & (df["PEPTIDE_LENGTH"] >= 7)]
# remove unsupported mods to exclude
unsupported_mods = [r"Acetyl \(Protein N\-term\)", "ac", r"\[[0-9]+\]", r"\+"]
exclude_mods_pattern = re.compile("|".join(unsupported_mods))
df = df[~df["MODIFIED_SEQUENCE"].str.contains(exclude_mods_pattern)]
# remove non-canonical aas
df = df[(~df["SEQUENCE"].str.contains("U|O"))]
supported_pattern = re.compile(r"^(?:\[UNIMOD:\d+\]\-)?(?:[ACDEFGHIKLMNPQRSTVWY]+(?:\[UNIMOD:\d+\])?)*$")
df = df[df["MODIFIED_SEQUENCE"].str.match(supported_pattern)]
# remove precursor charges greater than 6
df = df[df["PRECURSOR_CHARGE"] <= 6]
logger.info(f"#sequences after filtering for valid prosit sequences: {len(df.index)}")
@@ -124,8 +135,8 @@ def generate_internal(
"""
if out_path is None:
# convert and return
return self.read_result(tmt_label, custom_mods=custom_mods)

filtered_df = self.read_result(tmt_label, custom_mods=custom_mods)
return filtered_df[COLUMNS]
if isinstance(out_path, str):
out_path = Path(out_path)

@@ -136,7 +147,7 @@
return csv.read_file(out_path)

# convert, save and return
df = self.read_result(tmt_label, custom_mods=custom_mods)
df = self.read_result(tmt_label, custom_mods=custom_mods)[COLUMNS]
csv.write_file(df, out_path)
return df
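
Aside on generate_internal: with this patch, every branch now returns the DataFrame restricted to the shared COLUMNS schema. A hedged usage sketch — the MaxQuant class name, its constructor argument, and the availability of defaults for tmt_label/custom_mods are assumptions; only the read_result/generate_internal behaviour is taken from the diff:

from spectrum_io.search_result.maxquant import MaxQuant

# Assumed: the parser is constructed with the directory containing msms.txt and
# generate_internal can be called without tmt_label/custom_mods.
search_results = MaxQuant("path/to/maxquant_output")
internal_df = search_results.generate_internal(out_path=None)

# The returned frame now contains exactly the internal schema defined at the top
# of search_results.py, in this order:
# RAW_FILE, SCAN_NUMBER, MODIFIED_SEQUENCE, PRECURSOR_CHARGE, MASS, SCORE,
# REVERSE, SEQUENCE, PEPTIDE_LENGTH, PROTEINS
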
