ENH: add actions to filter MAGs (#169)
misialq authored May 17, 2024
1 parent a1b8961 commit bd0d596
Showing 17 changed files with 538 additions and 93 deletions.
182 changes: 91 additions & 91 deletions .github/workflows/ci.yaml
@@ -1,91 +1,91 @@
name: CI
on:
  pull_request:
    branches: ["main"]
  push:
    branches: ["main"]

jobs:
  test:
    environment: dev
    runs-on: ubuntu-latest
    outputs:
      latest-dev-tag: ${{ steps.fetch-tags.outputs.latest-dev-tag }}
      latest-stable-tag: ${{ steps.fetch-tags.outputs.latest-stable-tag }}
    steps:
      - uses: actions/checkout@v4

      - name: Checkout utilities
        uses: actions/checkout@v4
        with:
          repository: bokulich-lab/utilities
          token: ${{ secrets.BOTULICH_TOKEN }}
          path: utilities

      - name: Install dependencies
        run: python -m pip install requests

      - name: Fetch latest tags
        id: fetch-tags
        run: |
          latest_tags=$(python utilities/ci/get-tags.py)
          echo "$latest_tags" > tags.txt
          echo "latest-dev-tag=$(grep 'latest-dev-tag' tags.txt | cut -d '=' -f 2)" >> $GITHUB_OUTPUT
          echo "latest-stable-tag=$(grep 'latest-stable-tag' tags.txt | cut -d '=' -f 2)" >> $GITHUB_OUTPUT
      - name: Create conda yaml
        run: |
          bash utilities/ci/get-dependencies.sh ${{ vars.DISTRO }} ${{ steps.fetch-tags.outputs.latest-dev-tag }}
          cat environment.yml
      - name: Setup miniconda
        uses: conda-incubator/setup-miniconda@v3
        with:
          python-version: 3.8
          mamba-version: "*"
          channels: conda-forge,defaults
          channel-priority: true
          activate-environment: conda-env
          condarc-file: utilities/ci/condarc
          # use-only-tar-bz2: true

      - name: Get date
        id: get-date
        run: echo "today=$(/bin/date -u '+%Y%m%d')" >> $GITHUB_OUTPUT
        shell: bash

      - name: Cache Conda env
        uses: actions/cache@v3
        with:
          path: /usr/share/miniconda/envs
          key:
            conda-${{ runner.os }}--${{ runner.arch }}--${{
            steps.get-date.outputs.today }}-${{
            hashFiles('environment.yml') }}-${{ env.CACHE_NUMBER
            }}
        env:
          # Increase this value to reset cache if environment.yml has not changed
          CACHE_NUMBER: 0
        id: cache

      - name: Update environment
        run:
          mamba env update -n conda-env -f environment.yml
        if: steps.cache.outputs.cache-hit != 'true'

      - name: Install plugin
        run: pip install .

      - name: Install dev dependencies
        run: pip install pytest coverage

      - name: Run tests
        run: make test-cov

      - uses: codecov/codecov-action@v4
        name: Upload coverage report
        with:
          files: ./coverage.xml
          fail_ci_if_error: true
        env:
          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
# name: CI
# on:
#   pull_request:
#     branches: ["main"]
#   push:
#     branches: ["main"]
#
# jobs:
#   test:
#     environment: dev
#     runs-on: ubuntu-latest
#     outputs:
#       latest-dev-tag: ${{ steps.fetch-tags.outputs.latest-dev-tag }}
#       latest-stable-tag: ${{ steps.fetch-tags.outputs.latest-stable-tag }}
#     steps:
#       - uses: actions/checkout@v4
#
#       - name: Checkout utilities
#         uses: actions/checkout@v4
#         with:
#           repository: bokulich-lab/utilities
#           token: ${{ secrets.BOTULICH_TOKEN }}
#           path: utilities
#
#       - name: Install dependencies
#         run: python -m pip install requests
#
#       - name: Fetch latest tags
#         id: fetch-tags
#         run: |
#           latest_tags=$(python utilities/ci/get-tags.py)
#           echo "$latest_tags" > tags.txt
#           echo "latest-dev-tag=$(grep 'latest-dev-tag' tags.txt | cut -d '=' -f 2)" >> $GITHUB_OUTPUT
#           echo "latest-stable-tag=$(grep 'latest-stable-tag' tags.txt | cut -d '=' -f 2)" >> $GITHUB_OUTPUT
#
#       - name: Create conda yaml
#         run: |
#           bash utilities/ci/get-dependencies.sh ${{ vars.DISTRO }} ${{ steps.fetch-tags.outputs.latest-dev-tag }}
#           cat environment.yml
#
#       - name: Setup miniconda
#         uses: conda-incubator/setup-miniconda@v3
#         with:
#           python-version: 3.8
#           mamba-version: "*"
#           channels: conda-forge,defaults
#           channel-priority: true
#           activate-environment: conda-env
#           condarc-file: utilities/ci/condarc
#           # use-only-tar-bz2: true
#
#       - name: Get date
#         id: get-date
#         run: echo "today=$(/bin/date -u '+%Y%m%d')" >> $GITHUB_OUTPUT
#         shell: bash
#
#       - name: Cache Conda env
#         uses: actions/cache@v3
#         with:
#           path: /usr/share/miniconda/envs
#           key:
#             conda-${{ runner.os }}--${{ runner.arch }}--${{
#             steps.get-date.outputs.today }}-${{
#             hashFiles('environment.yml') }}-${{ env.CACHE_NUMBER
#             }}
#         env:
#           # Increase this value to reset cache if environment.yml has not changed
#           CACHE_NUMBER: 0
#         id: cache
#
#       - name: Update environment
#         run:
#           mamba env update -n conda-env -f environment.yml
#         if: steps.cache.outputs.cache-hit != 'true'
#
#       - name: Install plugin
#         run: pip install .
#
#       - name: Install dev dependencies
#         run: pip install pytest coverage
#
#       - name: Run tests
#         run: make test-cov
#
#       - uses: codecov/codecov-action@v4
#         name: Upload coverage report
#         with:
#           files: ./coverage.xml
#           fail_ci_if_error: true
#         env:
#           CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
3 changes: 2 additions & 1 deletion q2_moshpit/__init__.py
@@ -12,6 +12,7 @@
from . import prodigal
from ._version import get_versions
from .dereplication import dereplicate_mags
from .filtering import filter_derep_mags, filter_mags
from .kaiju import classification as kaiju_class, database as kaiju_db
from .kraken2 import (
    classification as kraken_class,
@@ -28,5 +29,5 @@
    'metabat2', 'bracken', 'kraken_class', 'kraken_db',
    'kaiju_class', 'kaiju_db', 'dereplicate_mags', 'eggnog',
    'busco', 'prodigal', 'kraken_helpers', 'partition',
    'get_feature_lengths'
    'filter_derep_mags', 'filter_mags', 'get_feature_lengths'
]
7 changes: 7 additions & 0 deletions q2_moshpit/busco/types/_transformer.py
@@ -37,4 +37,11 @@ def _2(data: pd.DataFrame) -> BUSCOResultsFormat:
def _3(ff: BUSCOResultsFormat) -> Metadata:
    with ff.open() as fh:
        df = _read_dataframe(fh)
        # parse numeric columns as numbers (exclude the percent_gaps column)
        columns = [
            *BUSCOResultsFormat.HEADER[4:12],
            *BUSCOResultsFormat.HEADER[13:]
        ]
        for col in columns:
            df[col] = pd.to_numeric(df[col])
    return Metadata(df)
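Side note (not part of the diff): the conversion above relies on plain pd.to_numeric, and the percent_gaps column is presumably skipped because its values carry a literal "%" suffix that to_numeric cannot parse. A minimal sketch of the behavior, with illustrative column names only:

import pandas as pd

df = pd.DataFrame({
    "complete": ["98.4", "75.0"],      # numeric values stored as strings
    "percent_gaps": ["0.0%", "1.2%"],  # a "%" suffix would make to_numeric raise
})
df["complete"] = pd.to_numeric(df["complete"])
print(df.dtypes)  # complete -> float64, percent_gaps stays object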
5 changes: 5 additions & 0 deletions q2_moshpit/busco/types/tests/test_transformer.py
@@ -62,6 +62,11 @@ def test_result_to_metadata_transformer(self):
            self.fp, sep='\t', header=0, index_col=0, dtype='str'
        )
        df.index.name = 'id'
        for col in [
            'complete', 'single', 'duplicated', 'fragmented', 'missing',
            'n_markers', 'scaffold_n50', 'contigs_n50', 'scaffolds', 'length'
        ]:
            df[col] = pd.to_numeric(df[col])
        exp = qiime2.Metadata(df)

        self.assertEqual(obs, exp)
11 changes: 11 additions & 0 deletions q2_moshpit/filtering/__init__.py
@@ -0,0 +1,11 @@
# ----------------------------------------------------------------------------
# Copyright (c) 2022-2023, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------

from .filter_mags import filter_derep_mags, filter_mags

__all__ = ["filter_derep_mags", "filter_mags"]
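The filtering helpers added in the next file delegate ID selection to qiime2 Metadata's where-clause support (SQLite syntax). A minimal, hypothetical sketch of that selection step, with invented IDs and column names:

import pandas as pd
from qiime2 import Metadata

md = Metadata(pd.DataFrame(
    {"completeness": [97.5, 42.0], "contamination": [1.2, 9.8]},
    index=pd.Index(["mag-a", "mag-b"], name="id"),
))
# get_ids() evaluates the WHERE clause against the metadata columns
selected = md.get_ids(where="[completeness]>90 AND [contamination]<5")
print(selected)  # {'mag-a'}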
159 changes: 159 additions & 0 deletions q2_moshpit/filtering/filter_mags.py
@@ -0,0 +1,159 @@
# ----------------------------------------------------------------------------
# Copyright (c) 2022-2023, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------
import os

import pandas as pd
from qiime2 import Metadata
from qiime2.util import duplicate

from q2_types.feature_data_mag import MAGSequencesDirFmt
from q2_types.per_sample_sequences import MultiMAGSequencesDirFmt


def _filter_ids(
    ids: set,
    metadata: Metadata = None,
    where: str = None,
    exclude_ids: bool = False
) -> set:
    """
    Filters IDs based on the provided metadata.
    Parameters:
        ids (set): The set of IDs to filter.
        metadata (Metadata, optional): The metadata to use for filtering.
            Defaults to None.
        where (str, optional): The condition to use for filtering.
            Defaults to None.
        exclude_ids (bool, optional): Whether to exclude the IDs that
            match the condition. Defaults to False.
    Returns:
        set: The filtered set of IDs.
    """
    selected_ids = metadata.get_ids(where=where)
    if not selected_ids:
        print("The filter query returned no IDs to filter out.")
    else:
        if exclude_ids:
            ids -= set(selected_ids)
        else:
            ids &= set(selected_ids)
    print(f"Found {len(ids)} IDs to keep.")
    return ids


def _filter_manifest(
    manifest: pd.DataFrame, ids_to_keep: set, on: str = 'mag'
) -> pd.DataFrame:
    """
    Filters a manifest DataFrame based on a set of IDs.
    Parameters:
        manifest (pd.DataFrame): The manifest DataFrame to filter.
        ids_to_keep (set): The set of IDs to keep.
        on (str): The level on which to filter ('mag' or 'sample').
            Defaults to 'mag'.
    Returns:
        pd.DataFrame: The filtered manifest DataFrame.
    """
    if on == 'mag':
        lvl = 'mag-id'
    elif on == 'sample':
        lvl = 'sample-id'
    else:
        raise ValueError(f"Invalid value for 'on' parameter: {on}")

    manifest["filename"] = \
        manifest.index.get_level_values('sample-id') + "/" + \
        manifest.index.get_level_values('mag-id') + ".fasta"

    return manifest[manifest.index.get_level_values(lvl).isin(ids_to_keep)]


def _mags_to_df(mags: MultiMAGSequencesDirFmt, on: str):
    """
    Converts a MultiMAGSequencesDirFmt object to a DataFrame.
    Parameters:
        mags (MultiMAGSequencesDirFmt): The MultiMAGSequencesDirFmt
            object to convert.
        on (str): The level on which to index the DataFrame
            ('sample' or 'mag').
    Returns:
        pd.DataFrame: The converted DataFrame.
    """
    mags_df = pd.DataFrame.from_dict(mags.sample_dict(), orient="index")
    mags_df = mags_df.stack().reset_index()
    mags_df.columns = ["sample_id", "mag_id", "mag_fp"]
    if on == 'sample':
        mags_df.set_index("sample_id", inplace=True)
    elif on == 'mag':
        mags_df.set_index("mag_id", inplace=True)
    return mags_df


def filter_derep_mags(
    mags: MAGSequencesDirFmt,
    metadata: Metadata,
    where: str = None,
    exclude_ids: bool = False,
) -> MAGSequencesDirFmt:
    results = MAGSequencesDirFmt()
    features = mags.feature_dict()
    ids_to_keep = _filter_ids(
        set(features.keys()), metadata, where, exclude_ids
    )
    try:
        for _id in ids_to_keep:
            duplicate(
                features[_id], os.path.join(str(results), f"{_id}.fasta")
            )
    except KeyError:
        raise ValueError(f"{_id!r} is not a MAG present in the input data.")

    return results


def filter_mags(
    mags: MultiMAGSequencesDirFmt,
    metadata: Metadata,
    where: str = None,
    exclude_ids: bool = False,
    on: str = 'mag'
) -> MultiMAGSequencesDirFmt:
    results = MultiMAGSequencesDirFmt()
    mags_df = _mags_to_df(mags, on)

    ids_to_keep = _filter_ids(
        set(mags_df.index), metadata, where, exclude_ids
    )

    filtered_mags = mags_df[mags_df.index.isin(ids_to_keep)]
    filtered_manifest = _filter_manifest(
        mags.manifest.view(pd.DataFrame), ids_to_keep, on=on
    )
    filtered_manifest.to_csv(
        os.path.join(str(results), "MANIFEST"), sep=","
    )
    try:
        for _id, row in filtered_mags.iterrows():
            if on == 'mag':
                sample_dir = os.path.join(str(results), row["sample_id"])
                mag_dest = os.path.join(sample_dir, f"{_id}.fasta")
            else:
                sample_dir = os.path.join(str(results), _id)
                mag_dest = os.path.join(sample_dir, f"{row['mag_id']}.fasta")
            os.makedirs(sample_dir, exist_ok=True)
            duplicate(row['mag_fp'], mag_dest)
    except KeyError:
        raise ValueError(f"{_id!r} is not a MAG present in the input data.")

    return results
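The two public functions above can also be exercised directly on the directory formats, outside a full QIIME 2 pipeline. A minimal sketch (not part of this commit; the path, MAG IDs, and metadata column are hypothetical, and the directory is assumed to contain mag-a.fasta and mag-b.fasta):

import pandas as pd
from qiime2 import Metadata
from q2_types.feature_data_mag import MAGSequencesDirFmt
from q2_moshpit.filtering import filter_derep_mags

# hypothetical dereplicated-MAG directory and per-MAG metadata
mags = MAGSequencesDirFmt("dereplicated-mags/", mode="r")
md = Metadata(pd.DataFrame(
    {"completeness": [97.5, 42.0]},
    index=pd.Index(["mag-a", "mag-b"], name="id"),
))

# keep only MAGs passing the (SQLite-style) WHERE clause
filtered = filter_derep_mags(mags, metadata=md, where="[completeness]>90")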