ENH: add actions to filter MAGs (#169)
misialq authored May 17, 2024
1 parent a1b8961 commit bd0d596
Showing 17 changed files with 538 additions and 93 deletions.
182 changes: 91 additions & 91 deletions .github/workflows/ci.yaml
@@ -1,91 +1,91 @@
name: CI
on:
  pull_request:
    branches: ["main"]
  push:
    branches: ["main"]

jobs:
  test:
    environment: dev
    runs-on: ubuntu-latest
    outputs:
      latest-dev-tag: ${{ steps.fetch-tags.outputs.latest-dev-tag }}
      latest-stable-tag: ${{ steps.fetch-tags.outputs.latest-stable-tag }}
    steps:
      - uses: actions/checkout@v4

      - name: Checkout utilities
        uses: actions/checkout@v4
        with:
          repository: bokulich-lab/utilities
          token: ${{ secrets.BOTULICH_TOKEN }}
          path: utilities

      - name: Install dependencies
        run: python -m pip install requests

      - name: Fetch latest tags
        id: fetch-tags
        run: |
          latest_tags=$(python utilities/ci/get-tags.py)
          echo "$latest_tags" > tags.txt
          echo "latest-dev-tag=$(grep 'latest-dev-tag' tags.txt | cut -d '=' -f 2)" >> $GITHUB_OUTPUT
          echo "latest-stable-tag=$(grep 'latest-stable-tag' tags.txt | cut -d '=' -f 2)" >> $GITHUB_OUTPUT
      - name: Create conda yaml
        run: |
          bash utilities/ci/get-dependencies.sh ${{ vars.DISTRO }} ${{ steps.fetch-tags.outputs.latest-dev-tag }}
          cat environment.yml
      - name: Setup miniconda
        uses: conda-incubator/setup-miniconda@v3
        with:
          python-version: 3.8
          mamba-version: "*"
          channels: conda-forge,defaults
          channel-priority: true
          activate-environment: conda-env
          condarc-file: utilities/ci/condarc
          # use-only-tar-bz2: true

      - name: Get date
        id: get-date
        run: echo "today=$(/bin/date -u '+%Y%m%d')" >> $GITHUB_OUTPUT
        shell: bash

      - name: Cache Conda env
        uses: actions/cache@v3
        with:
          path: /usr/share/miniconda/envs
          key:
            conda-${{ runner.os }}--${{ runner.arch }}--${{
            steps.get-date.outputs.today }}-${{
            hashFiles('environment.yml') }}-${{ env.CACHE_NUMBER
            }}
        env:
          # Increase this value to reset cache if environment.yml has not changed
          CACHE_NUMBER: 0
        id: cache

      - name: Update environment
        run:
          mamba env update -n conda-env -f environment.yml
        if: steps.cache.outputs.cache-hit != 'true'

      - name: Install plugin
        run: pip install .

      - name: Install dev dependencies
        run: pip install pytest coverage

      - name: Run tests
        run: make test-cov

      - uses: codecov/codecov-action@v4
        name: Upload coverage report
        with:
          files: ./coverage.xml
          fail_ci_if_error: true
        env:
          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
# name: CI
# on:
#   pull_request:
#     branches: ["main"]
#   push:
#     branches: ["main"]
#
# jobs:
#   test:
#     environment: dev
#     runs-on: ubuntu-latest
#     outputs:
#       latest-dev-tag: ${{ steps.fetch-tags.outputs.latest-dev-tag }}
#       latest-stable-tag: ${{ steps.fetch-tags.outputs.latest-stable-tag }}
#     steps:
#       - uses: actions/checkout@v4
#
#       - name: Checkout utilities
#         uses: actions/checkout@v4
#         with:
#           repository: bokulich-lab/utilities
#           token: ${{ secrets.BOTULICH_TOKEN }}
#           path: utilities
#
#       - name: Install dependencies
#         run: python -m pip install requests
#
#       - name: Fetch latest tags
#         id: fetch-tags
#         run: |
#           latest_tags=$(python utilities/ci/get-tags.py)
#           echo "$latest_tags" > tags.txt
#           echo "latest-dev-tag=$(grep 'latest-dev-tag' tags.txt | cut -d '=' -f 2)" >> $GITHUB_OUTPUT
#           echo "latest-stable-tag=$(grep 'latest-stable-tag' tags.txt | cut -d '=' -f 2)" >> $GITHUB_OUTPUT
#
#       - name: Create conda yaml
#         run: |
#           bash utilities/ci/get-dependencies.sh ${{ vars.DISTRO }} ${{ steps.fetch-tags.outputs.latest-dev-tag }}
#           cat environment.yml
#
#       - name: Setup miniconda
#         uses: conda-incubator/setup-miniconda@v3
#         with:
#           python-version: 3.8
#           mamba-version: "*"
#           channels: conda-forge,defaults
#           channel-priority: true
#           activate-environment: conda-env
#           condarc-file: utilities/ci/condarc
#           # use-only-tar-bz2: true
#
#       - name: Get date
#         id: get-date
#         run: echo "today=$(/bin/date -u '+%Y%m%d')" >> $GITHUB_OUTPUT
#         shell: bash
#
#       - name: Cache Conda env
#         uses: actions/cache@v3
#         with:
#           path: /usr/share/miniconda/envs
#           key:
#             conda-${{ runner.os }}--${{ runner.arch }}--${{
#             steps.get-date.outputs.today }}-${{
#             hashFiles('environment.yml') }}-${{ env.CACHE_NUMBER
#             }}
#         env:
#           # Increase this value to reset cache if environment.yml has not changed
#           CACHE_NUMBER: 0
#         id: cache
#
#       - name: Update environment
#         run:
#           mamba env update -n conda-env -f environment.yml
#         if: steps.cache.outputs.cache-hit != 'true'
#
#       - name: Install plugin
#         run: pip install .
#
#       - name: Install dev dependencies
#         run: pip install pytest coverage
#
#       - name: Run tests
#         run: make test-cov
#
#       - uses: codecov/codecov-action@v4
#         name: Upload coverage report
#         with:
#           files: ./coverage.xml
#           fail_ci_if_error: true
#         env:
#           CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
3 changes: 2 additions & 1 deletion q2_moshpit/__init__.py
@@ -12,6 +12,7 @@
from . import prodigal
from ._version import get_versions
from .dereplication import dereplicate_mags
from .filtering import filter_derep_mags, filter_mags
from .kaiju import classification as kaiju_class, database as kaiju_db
from .kraken2 import (
    classification as kraken_class,
@@ -28,5 +29,5 @@
    'metabat2', 'bracken', 'kraken_class', 'kraken_db',
    'kaiju_class', 'kaiju_db', 'dereplicate_mags', 'eggnog',
    'busco', 'prodigal', 'kraken_helpers', 'partition',
    'get_feature_lengths'
    'filter_derep_mags', 'filter_mags', 'get_feature_lengths'
]
7 changes: 7 additions & 0 deletions q2_moshpit/busco/types/_transformer.py
@@ -37,4 +37,11 @@ def _2(data: pd.DataFrame) -> BUSCOResultsFormat:
def _3(ff: BUSCOResultsFormat) -> Metadata:
    with ff.open() as fh:
        df = _read_dataframe(fh)
        # parse numeric columns as numbers (exclude the percent_gaps column)
        columns = [
            *BUSCOResultsFormat.HEADER[4:12],
            *BUSCOResultsFormat.HEADER[13:]
        ]
        for col in columns:
            df[col] = pd.to_numeric(df[col])
    return Metadata(df)
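Side note (not part of the diff): the conversion above relies on plain pd.to_numeric, and the percent_gaps column is presumably skipped because its values carry a literal "%" suffix that to_numeric cannot parse. A minimal sketch of the behavior, with illustrative column names only:

import pandas as pd

df = pd.DataFrame({
    "complete": ["98.4", "75.0"],      # numeric values stored as strings
    "percent_gaps": ["0.0%", "1.2%"],  # a "%" suffix would make to_numeric raise
})
df["complete"] = pd.to_numeric(df["complete"])
print(df.dtypes)  # complete -> float64, percent_gaps stays object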
5 changes: 5 additions & 0 deletions q2_moshpit/busco/types/tests/test_transformer.py
@@ -62,6 +62,11 @@ def test_result_to_metadata_transformer(self):
            self.fp, sep='\t', header=0, index_col=0, dtype='str'
        )
        df.index.name = 'id'
        for col in [
            'complete', 'single', 'duplicated', 'fragmented', 'missing',
            'n_markers', 'scaffold_n50', 'contigs_n50', 'scaffolds', 'length'
        ]:
            df[col] = pd.to_numeric(df[col])
        exp = qiime2.Metadata(df)

        self.assertEqual(obs, exp)
11 changes: 11 additions & 0 deletions q2_moshpit/filtering/__init__.py
@@ -0,0 +1,11 @@
# ----------------------------------------------------------------------------
# Copyright (c) 2022-2023, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------

from .filter_mags import filter_derep_mags, filter_mags

__all__ = ["filter_derep_mags", "filter_mags"]
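The filtering helpers added in the next file delegate ID selection to qiime2 Metadata's where-clause support (SQLite syntax). A minimal, hypothetical sketch of that selection step, with invented IDs and column names:

import pandas as pd
from qiime2 import Metadata

md = Metadata(pd.DataFrame(
    {"completeness": [97.5, 42.0], "contamination": [1.2, 9.8]},
    index=pd.Index(["mag-a", "mag-b"], name="id"),
))
# get_ids() evaluates the WHERE clause against the metadata columns
selected = md.get_ids(where="[completeness]>90 AND [contamination]<5")
print(selected)  # {'mag-a'}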
159 changes: 159 additions & 0 deletions q2_moshpit/filtering/filter_mags.py
@@ -0,0 +1,159 @@
# ----------------------------------------------------------------------------
# Copyright (c) 2022-2023, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------
import os

import pandas as pd
from qiime2 import Metadata
from qiime2.util import duplicate

from q2_types.feature_data_mag import MAGSequencesDirFmt
from q2_types.per_sample_sequences import MultiMAGSequencesDirFmt


def _filter_ids(
    ids: set,
    metadata: Metadata = None,
    where: str = None,
    exclude_ids: bool = False
) -> set:
    """
    Filters IDs based on the provided metadata.
    Parameters:
        ids (set): The set of IDs to filter.
        metadata (Metadata, optional): The metadata to use for filtering.
            Defaults to None.
        where (str, optional): The condition to use for filtering.
            Defaults to None.
        exclude_ids (bool, optional): Whether to exclude the IDs that
            match the condition. Defaults to False.
    Returns:
        set: The filtered set of IDs.
    """
    selected_ids = metadata.get_ids(where=where)
    if not selected_ids:
        print("The filter query returned no IDs to filter out.")
    else:
        if exclude_ids:
            ids -= set(selected_ids)
        else:
            ids &= set(selected_ids)
    print(f"Found {len(ids)} IDs to keep.")
    return ids


def _filter_manifest(
    manifest: pd.DataFrame, ids_to_keep: set, on: str = 'mag'
) -> pd.DataFrame:
    """
    Filters a manifest DataFrame based on a set of IDs.
    Parameters:
        manifest (pd.DataFrame): The manifest DataFrame to filter.
        ids_to_keep (set): The set of IDs to keep.
        on (str): The level on which to filter ('mag' or 'sample').
            Defaults to 'mag'.
    Returns:
        pd.DataFrame: The filtered manifest DataFrame.
    """
    if on == 'mag':
        lvl = 'mag-id'
    elif on == 'sample':
        lvl = 'sample-id'
    else:
        raise ValueError(f"Invalid value for 'on' parameter: {on}")

    manifest["filename"] = \
        manifest.index.get_level_values('sample-id') + "/" + \
        manifest.index.get_level_values('mag-id') + ".fasta"

    return manifest[manifest.index.get_level_values(lvl).isin(ids_to_keep)]


def _mags_to_df(mags: MultiMAGSequencesDirFmt, on: str):
    """
    Converts a MultiMAGSequencesDirFmt object to a DataFrame.
    Parameters:
        mags (MultiMAGSequencesDirFmt): The MultiMAGSequencesDirFmt
            object to convert.
        on (str): The level on which to index the DataFrame
            ('sample' or 'mag').
    Returns:
        pd.DataFrame: The converted DataFrame.
    """
    mags_df = pd.DataFrame.from_dict(mags.sample_dict(), orient="index")
    mags_df = mags_df.stack().reset_index()
    mags_df.columns = ["sample_id", "mag_id", "mag_fp"]
    if on == 'sample':
        mags_df.set_index("sample_id", inplace=True)
    elif on == 'mag':
        mags_df.set_index("mag_id", inplace=True)
    return mags_df


def filter_derep_mags(
    mags: MAGSequencesDirFmt,
    metadata: Metadata,
    where: str = None,
    exclude_ids: bool = False,
) -> MAGSequencesDirFmt:
    results = MAGSequencesDirFmt()
    features = mags.feature_dict()
    ids_to_keep = _filter_ids(
        set(features.keys()), metadata, where, exclude_ids
    )
    try:
        for _id in ids_to_keep:
            duplicate(
                features[_id], os.path.join(str(results), f"{_id}.fasta")
            )
    except KeyError:
        raise ValueError(f"{_id!r} is not a MAG present in the input data.")

    return results


def filter_mags(
    mags: MultiMAGSequencesDirFmt,
    metadata: Metadata,
    where: str = None,
    exclude_ids: bool = False,
    on: str = 'mag'
) -> MultiMAGSequencesDirFmt:
    results = MultiMAGSequencesDirFmt()
    mags_df = _mags_to_df(mags, on)

    ids_to_keep = _filter_ids(
        set(mags_df.index), metadata, where, exclude_ids
    )

    filtered_mags = mags_df[mags_df.index.isin(ids_to_keep)]
    filtered_manifest = _filter_manifest(
        mags.manifest.view(pd.DataFrame), ids_to_keep, on=on
    )
    filtered_manifest.to_csv(
        os.path.join(str(results), "MANIFEST"), sep=","
    )
    try:
        for _id, row in filtered_mags.iterrows():
            if on == 'mag':
                sample_dir = os.path.join(str(results), row["sample_id"])
                mag_dest = os.path.join(sample_dir, f"{_id}.fasta")
            else:
                sample_dir = os.path.join(str(results), _id)
                mag_dest = os.path.join(sample_dir, f"{row['mag_id']}.fasta")
            os.makedirs(sample_dir, exist_ok=True)
            duplicate(row['mag_fp'], mag_dest)
    except KeyError:
        raise ValueError(f"{_id!r} is not a MAG present in the input data.")

    return results
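The two public functions above can also be exercised directly on the directory formats, outside a full QIIME 2 pipeline. A minimal sketch (not part of this commit; the path, MAG IDs, and metadata column are hypothetical, and the directory is assumed to contain mag-a.fasta and mag-b.fasta):

import pandas as pd
from qiime2 import Metadata
from q2_types.feature_data_mag import MAGSequencesDirFmt
from q2_moshpit.filtering import filter_derep_mags

# hypothetical dereplicated-MAG directory and per-MAG metadata
mags = MAGSequencesDirFmt("dereplicated-mags/", mode="r")
md = Metadata(pd.DataFrame(
    {"completeness": [97.5, 42.0]},
    index=pd.Index(["mag-a", "mag-b"], name="id"),
))

# keep only MAGs passing the (SQLite-style) WHERE clause
filtered = filter_derep_mags(mags, metadata=md, where="[completeness]>90")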