Merge pull request #144 from wilhelm-lab/patch/0.6.1
Patch/0.6.1
picciama authored Aug 8, 2024
2 parents 472db6b + 182fae3 commit cb45899
Showing 14 changed files with 423 additions and 372 deletions.
2 changes: 1 addition & 1 deletion .cookietemple.yml
@@ -15,5 +15,5 @@ full_name: Mario Picciani
email: [email protected]
project_name: spectrum_io
project_short_description: IO related functionalities for oktoberfest.
version: 0.6.0
version: 0.6.1
license: MIT
4 changes: 2 additions & 2 deletions .github/release-drafter.yml
@@ -1,5 +1,5 @@
name-template: "0.6.0 🌈" # <<COOKIETEMPLE_FORCE_BUMP>>
tag-template: 0.6.0 # <<COOKIETEMPLE_FORCE_BUMP>>
name-template: "0.6.1 🌈" # <<COOKIETEMPLE_FORCE_BUMP>>
tag-template: 0.6.1 # <<COOKIETEMPLE_FORCE_BUMP>>
exclude-labels:
- "skip-changelog"

2 changes: 1 addition & 1 deletion cookietemple.cfg
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 0.6.0
current_version = 0.6.1

[bumpversion_files_whitelisted]
init_file = spectrum_io/__init__.py
4 changes: 2 additions & 2 deletions docs/conf.py
@@ -52,9 +52,9 @@
# the built documents.
#
# The short X.Y version.
version = "0.6.0"
version = "0.6.1"
# The full version, including alpha/beta/rc tags.
release = "0.6.0"
release = "0.6.1"

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
582 changes: 322 additions & 260 deletions poetry.lock

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "spectrum_io"
version = "0.6.0" # <<COOKIETEMPLE_FORCE_BUMP>>
version = "0.6.1" # <<COOKIETEMPLE_FORCE_BUMP>>
description = "IO related functionalities for oktoberfest."
authors = ["Wilhelmlab at Technical University of Munich"]
license = "MIT"
@@ -23,7 +23,7 @@ click = ">=8.0.0"
rich = ">=10.3.0"
PyYAML = ">=5.4.1"
numpy = "^1.18.1"
pandas = "^1.3.0"
pandas = ">=1.3,<3.0"
h5py = "^3.1.0"
pyarrow = ">=16.0.0"
pymzml = "^2.5.0"
19 changes: 10 additions & 9 deletions requirements.txt
@@ -2,13 +2,13 @@ alabaster==0.7.16 ; python_version >= "3.9" and python_full_version < "3.11.0"
alphatims==1.0.8 ; python_full_version >= "3.9.0" and python_full_version < "3.11.0"
annotated-types==0.7.0 ; python_full_version >= "3.9.0" and python_full_version < "3.11.0"
anyio==4.4.0 ; python_version >= "3.9" and python_full_version < "3.11.0"
attrs==24.1.0 ; python_full_version >= "3.9.0" and python_full_version < "3.11.0"
attrs==24.2.0 ; python_full_version >= "3.9.0" and python_full_version < "3.11.0"
authlib==1.3.1 ; python_full_version >= "3.9.0" and python_full_version < "3.11.0"
babel==2.15.0 ; python_version >= "3.9" and python_full_version < "3.11.0"
bandit==1.7.9 ; python_full_version >= "3.9.0" and python_full_version < "3.11.0"
black==24.8.0 ; python_full_version >= "3.9.0" and python_full_version < "3.11.0"
certifi==2024.7.4 ; python_version >= "3.9" and python_full_version < "3.11.0"
cffi==1.16.0 ; python_full_version >= "3.9.0" and python_full_version < "3.11.0" and platform_python_implementation != "PyPy"
cffi==1.17.0 ; python_full_version >= "3.9.0" and python_full_version < "3.11.0" and platform_python_implementation != "PyPy"
cfgv==3.4.0 ; python_version >= "3.9" and python_full_version < "3.11.0"
charset-normalizer==3.3.2 ; python_version >= "3.9" and python_full_version < "3.11.0"
click==8.1.7 ; python_version >= "3.9" and python_full_version < "3.11.0"
@@ -45,17 +45,17 @@ lxml==5.2.2 ; python_full_version >= "3.9.0" and python_full_version < "3.11.0"
markdown-it-py==3.0.0 ; python_full_version >= "3.9.0" and python_full_version < "3.11.0"
markupsafe==2.1.5 ; python_version >= "3.9" and python_full_version < "3.11.0"
marshmallow==3.21.3 ; python_full_version >= "3.9.0" and python_full_version < "3.11.0"
matplotlib==3.9.0 ; python_version >= "3.9" and python_full_version < "3.11.0"
matplotlib==3.9.1.post1 ; python_version >= "3.9" and python_full_version < "3.11.0"
mccabe==0.7.0 ; python_full_version >= "3.9.0" and python_full_version < "3.11.0"
mdurl==0.1.2 ; python_full_version >= "3.9.0" and python_full_version < "3.11.0"
moepy==1.1.4 ; python_full_version >= "3.9.0" and python_full_version < "3.11.0"
mypy-extensions==1.0.0 ; python_full_version >= "3.9.0" and python_full_version < "3.11.0"
mypy==1.11.1 ; python_full_version >= "3.9.0" and python_full_version < "3.11.0"
nodeenv==1.9.1 ; python_version >= "3.9" and python_full_version < "3.11.0"
numba==0.60.0 ; python_version >= "3.9" and python_full_version < "3.11.0"
numpy==1.24.4 ; python_version >= "3.9" and python_full_version < "3.11.0"
numpy==1.24.4 ; python_version >= "3.9" and python_version < "3.11"
packaging==24.1 ; python_version >= "3.9" and python_full_version < "3.11.0"
pandas==1.5.3 ; python_full_version >= "3.9.0" and python_full_version < "3.11.0"
pandas==2.2.2 ; python_version >= "3.9" and python_full_version < "3.11.0"
pathspec==0.12.1 ; python_full_version >= "3.9.0" and python_full_version < "3.11.0"
pbr==6.0.0 ; python_full_version >= "3.9.0" and python_full_version < "3.11.0"
pep8-naming==0.14.1 ; python_full_version >= "3.9.0" and python_full_version < "3.11.0"
@@ -78,9 +78,9 @@ pyparsing==3.1.2 ; python_version >= "3.9" and python_full_version < "3.11.0"
pyteomics==4.7.3 ; python_full_version >= "3.9.0" and python_full_version < "3.11.0"
pytest==8.3.2 ; python_full_version >= "3.9.0" and python_full_version < "3.11.0"
python-dateutil==2.9.0.post0 ; python_version >= "3.9" and python_full_version < "3.11.0"
pytz==2024.1 ; python_full_version >= "3.9.0" and python_full_version < "3.11.0"
pytz==2024.1 ; python_version >= "3.9" and python_full_version < "3.11.0"
pyupgrade==3.17.0 ; python_version >= "3.9" and python_full_version < "3.11.0"
pyyaml==6.0.1 ; python_version >= "3.9" and python_full_version < "3.11.0"
pyyaml==6.0.2 ; python_version >= "3.9" and python_full_version < "3.11.0"
pyzstd==0.16.1 ; python_full_version >= "3.9.0" and python_full_version < "3.11.0"
regex==2024.7.24 ; python_full_version >= "3.9.0" and python_full_version < "3.11.0"
requests==2.32.3 ; python_version >= "3.9" and python_full_version < "3.11.0"
@@ -98,7 +98,7 @@ six==1.16.0 ; python_version >= "3.9" and python_full_version < "3.11.0"
sniffio==1.3.1 ; python_version >= "3.9" and python_full_version < "3.11.0"
snowballstemmer==2.2.0 ; python_version >= "3.9" and python_full_version < "3.11.0"
sortedcontainers==2.4.0 ; python_full_version >= "3.9.0" and python_full_version < "3.11.0"
spectrum-fundamentals==0.7.1 ; python_full_version >= "3.9.0" and python_full_version < "3.11.0"
spectrum-fundamentals==0.7.2 ; python_full_version >= "3.9.0" and python_full_version < "3.11.0"
sphinx-autobuild==2024.4.16 ; python_version >= "3.9" and python_full_version < "3.11.0"
sphinx-autodoc-typehints==2.2.3 ; python_version >= "3.9" and python_full_version < "3.11.0"
sphinx-click==6.0.0 ; python_full_version >= "3.9.0" and python_full_version < "3.11.0"
@@ -124,10 +124,11 @@ types-attrs==19.1.0 ; python_full_version >= "3.9.0" and python_full_version < "3.11.0"
types-pkg-resources==0.1.3 ; python_full_version >= "3.9.0" and python_full_version < "3.11.0"
types-requests==2.32.0.20240712 ; python_full_version >= "3.9.0" and python_full_version < "3.11.0"
typing-extensions==4.12.2 ; python_version >= "3.9" and python_version < "3.11"
tzdata==2024.1 ; python_version >= "3.9" and python_full_version < "3.11.0"
urllib3==2.2.2 ; python_version >= "3.9" and python_full_version < "3.11.0"
uvicorn==0.30.5 ; python_version >= "3.9" and python_full_version < "3.11.0"
virtualenv==20.26.3 ; python_version >= "3.9" and python_full_version < "3.11.0"
watchfiles==0.22.0 ; python_version >= "3.9" and python_full_version < "3.11.0"
watchfiles==0.23.0 ; python_version >= "3.9" and python_full_version < "3.11.0"
websockets==12.0 ; python_version >= "3.9" and python_full_version < "3.11.0"
xdoctest[colors]==1.1.6 ; python_full_version >= "3.9.0" and python_full_version < "3.11.0"
zipp==3.19.2 ; python_version >= "3.9" and python_version < "3.10"
2 changes: 1 addition & 1 deletion spectrum_io/__init__.py
@@ -5,7 +5,7 @@
__author__ = """The Oktoberfest development team (Wilhelmlab at Technical University of Munich)"""
__copyright__ = f"Copyright {datetime.now():%Y}, Wilhelmlab at Technical University of Munich"
__license__ = "MIT"
__version__ = "0.6.0"
__version__ = "0.6.1"

import logging
import logging.handlers
2 changes: 1 addition & 1 deletion spectrum_io/__main__.py
@@ -5,7 +5,7 @@


@click.command()
@click.version_option(version="0.6.0", message=click.style("spectrum_io Version: 0.6.0"))
@click.version_option(version="0.6.1", message=click.style("spectrum_io Version: 0.6.1"))
def main() -> None:
"""spectrum_io."""

27 changes: 6 additions & 21 deletions spectrum_io/file/parquet.py
@@ -7,12 +7,7 @@
import pyarrow.parquet as pq
import scipy

# TODO add sparse matrix / anndata support
# TODO add speed benchmarks
# TODO add support for HuggingFace datasets API

Pathlike = Union[Path, str]
Dataset = Union[pd.DataFrame, scipy.sparse.spmatrix]

logger = logging.getLogger(__name__)

@@ -53,36 +48,26 @@ def read_partition(path: Pathlike, dataset_name: str) -> pd.DataFrame:
return df


def write_file(data: Dataset, path: Pathlike) -> None:
def write_file(data: pd.DataFrame, path: Pathlike) -> None:
"""Writes a single DataFrame or matrix to a Parquet file.
:param data: Data to store
:param path: Path to write the Parquet file to
:raises NotImplementedError: if anything else but a Pandas DataFrame is used as the dataset
"""
if isinstance(data, pd.DataFrame):
data.to_parquet(path)
else:
raise NotImplementedError
data.to_parquet(path)


def write_partition(datasets: List[Dataset], path: Pathlike, dataset_names: List[str]) -> None:
def write_partition(datasets: List[pd.DataFrame], path: Pathlike, dataset_names: List[str]) -> None:
"""
Write several datasets to a Parquet dataset as a directory containing subdirectories partinioned by dataset name.
Write several datasets to a Parquet dataset as a directory containing subdirectories partitioned by dataset name.
:param datasets: Datasets to write
:param path: Root path to write the partitioned dataset to
:param dataset_names: Names to assign to the datasets for retrieval. Careful: If all of these are strings of ints,
Parquet will convert them to raw integers!
:raises NotImplementedError: if anything else but a Pandas DataFrame is used as the dataset
"""
if all(isinstance(x, pd.DataFrame) for x in datasets):
df = pd.concat([dataset.assign(dataset=name) for dataset, name in zip(datasets, dataset_names)])
table = pa.Table.from_pandas(df)
else:
raise NotImplementedError
df = pd.concat([dataset.assign(dataset=name) for dataset, name in zip(datasets, dataset_names)])
table = pa.Table.from_pandas(df)

if isinstance(path, str):
path = Path(path)
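
Note on the parquet.py change above: write_file and write_partition now accept plain pandas DataFrames only; the sparse-matrix branch and its NotImplementedError are gone. A minimal round-trip sketch follows — the import path and the example frames are assumptions, only the function names and argument order come from the diff:

import pandas as pd

from spectrum_io.file import parquet  # assumed import path for spectrum_io/file/parquet.py

run_a = pd.DataFrame({"mz": [100.1, 200.2], "intensity": [1.0, 2.0]})
run_b = pd.DataFrame({"mz": [300.3], "intensity": [3.0]})

# One DataFrame -> one Parquet file; no isinstance check or NotImplementedError anymore.
parquet.write_file(run_a, "run_a.parquet")

# Several DataFrames -> one dataset partitioned by the assigned dataset names.
parquet.write_partition([run_a, run_b], "all_runs", ["run_a", "run_b"])

# Read a single partition back by its name.
round_trip = parquet.read_partition("all_runs", "run_a")
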
2 changes: 0 additions & 2 deletions spectrum_io/raw/msraw.py
@@ -116,7 +116,6 @@ def read_mzml(
source: Union[str, Path, List[Union[str, Path]]],
ext: str = "mzml",
package: str = "pyteomics",
search_type: str = "Maxquant",
scanidx: Optional[List] = None,
*args,
**kwargs,
@@ -128,7 +127,6 @@
:param ext: file extension for searching a specified directory
:param package: package for parsing the mzml file. Can eiter be "pymzml" or "pyteomics"
:param scanidx: optional list of scan numbers to extract. if not specified, all scans will be extracted
:param search_type: type of the search (Maxquant, Mascot, Msfragger)
:param args: additional positional arguments
:param kwargs: additional keyword arguments
:raises AssertionError: if package has an unexpected type
16 changes: 15 additions & 1 deletion spectrum_io/search_result/maxquant.py
@@ -71,7 +71,21 @@ def read_result(
parsed_mods["^_"] = f"_{unimod_tag}-"

logger.info("Reading msms.txt file")
self.results = pd.read_csv(self.path / "msms.txt", sep="\t")
self.results = pd.read_csv(
self.path / "msms.txt",
usecols=[
"Raw file",
"Scan number",
"Modified sequence",
"Charge",
"Scan event number",
"Mass", # = Calculated Precursor mass; TODO get column with experimental Precursor mass instead
"Score",
"Reverse",
"Proteins",
],
sep="\t",
)

logger.info("Finished reading msms.txt file")

27 changes: 19 additions & 8 deletions spectrum_io/search_result/search_results.py
@@ -11,6 +11,20 @@
logger = logging.getLogger(__name__)


COLUMNS = [
"RAW_FILE",
"SCAN_NUMBER",
"MODIFIED_SEQUENCE",
"PRECURSOR_CHARGE",
"MASS",
"SCORE",
"REVERSE",
"SEQUENCE",
"PEPTIDE_LENGTH",
"PROTEINS",
]


def filter_valid_prosit_sequences(df: pd.DataFrame) -> pd.DataFrame:
"""
Filter valid Prosit sequences.
@@ -22,11 +36,8 @@ def filter_valid_prosit_sequences(df: pd.DataFrame) -> pd.DataFrame:
# retain only peptides that fall within [7, 30] length supported by Prosit
df = df[(df["PEPTIDE_LENGTH"] <= 30) & (df["PEPTIDE_LENGTH"] >= 7)]
# remove unsupported mods to exclude
unsupported_mods = [r"Acetyl \(Protein N\-term\)", "ac", r"\[[0-9]+\]", r"\+"]
exclude_mods_pattern = re.compile("|".join(unsupported_mods))
df = df[~df["MODIFIED_SEQUENCE"].str.contains(exclude_mods_pattern)]
# remove non-canonical aas
df = df[(~df["SEQUENCE"].str.contains("U|O"))]
supported_pattern = re.compile(r"^(?:\[UNIMOD:\d+\]\-)?(?:[ACDEFGHIKLMNPQRSTVWY]+(?:\[UNIMOD:\d+\])?)*$")
df = df[df["MODIFIED_SEQUENCE"].str.match(supported_pattern)]
# remove precursor charges greater than 6
df = df[df["PRECURSOR_CHARGE"] <= 6]
logger.info(f"#sequences after filtering for valid prosit sequences: {len(df.index)}")
@@ -124,8 +135,8 @@ def generate_internal(
"""
if out_path is None:
# convert and return
return self.read_result(tmt_label, custom_mods=custom_mods)

filtered_df = self.read_result(tmt_label, custom_mods=custom_mods)
return filtered_df[COLUMNS]
if isinstance(out_path, str):
out_path = Path(out_path)

@@ -136,7 +147,7 @@
return csv.read_file(out_path)

# convert, save and return
df = self.read_result(tmt_label, custom_mods=custom_mods)
df = self.read_result(tmt_label, custom_mods=custom_mods)[COLUMNS]
csv.write_file(df, out_path)
return df
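
Aside on generate_internal: with this patch, every branch now returns the DataFrame restricted to the shared COLUMNS schema. A hedged usage sketch — the MaxQuant class name, its constructor argument, and the availability of defaults for tmt_label/custom_mods are assumptions; only the read_result/generate_internal behaviour is taken from the diff:

from spectrum_io.search_result.maxquant import MaxQuant

# Assumed: the parser is constructed with the directory containing msms.txt and
# generate_internal can be called without tmt_label/custom_mods.
search_results = MaxQuant("path/to/maxquant_output")
internal_df = search_results.generate_internal(out_path=None)

# The returned frame now contains exactly the internal schema defined at the top
# of search_results.py, in this order:
# RAW_FILE, SCAN_NUMBER, MODIFIED_SEQUENCE, PRECURSOR_CHARGE, MASS, SCORE,
# REVERSE, SEQUENCE, PEPTIDE_LENGTH, PROTEINS
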
