generated from bokulich-lab/q2-plugin-template
-
Notifications
You must be signed in to change notification settings - Fork 14
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
ENH: add actions to filter MAGs (#169)
- Loading branch information
Showing
17 changed files
with
538 additions
and
93 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,91 +1,91 @@ | ||
# Continuous-integration workflow: builds a conda environment for the plugin,
# installs the plugin into it, runs the test suite with coverage and uploads
# the coverage report to Codecov.
name: CI
on:
  pull_request:
    branches: ["main"]
  push:
    branches: ["main"]

jobs:
  test:
    environment: dev
    runs-on: ubuntu-latest
    outputs:
      # These expressions must use the id of the "Fetch latest tags" step
      # ("fetch-tags"); referencing a non-existent step id silently yields
      # empty job outputs.
      latest-dev-tag: ${{ steps.fetch-tags.outputs.latest-dev-tag }}
      latest-stable-tag: ${{ steps.fetch-tags.outputs.latest-stable-tag }}
    steps:
      - uses: actions/checkout@v4

      # Helper scripts used below (ci/get-tags.py, ci/get-dependencies.sh,
      # ci/condarc) are checked out from a separate repository.
      - name: Checkout utilities
        uses: actions/checkout@v4
        with:
          repository: bokulich-lab/utilities
          token: ${{ secrets.BOTULICH_TOKEN }}
          path: utilities

      - name: Install dependencies
        run: python -m pip install requests

      # Extracts the values following "latest-dev-tag=" and
      # "latest-stable-tag=" from the script output and exposes them as
      # step outputs.
      - name: Fetch latest tags
        id: fetch-tags
        run: |
          latest_tags=$(python utilities/ci/get-tags.py)
          echo "$latest_tags" > tags.txt
          echo "latest-dev-tag=$(grep 'latest-dev-tag' tags.txt | cut -d '=' -f 2)" >> $GITHUB_OUTPUT
          echo "latest-stable-tag=$(grep 'latest-stable-tag' tags.txt | cut -d '=' -f 2)" >> $GITHUB_OUTPUT

      - name: Create conda yaml
        run: |
          bash utilities/ci/get-dependencies.sh ${{ vars.DISTRO }} ${{ steps.fetch-tags.outputs.latest-dev-tag }}
          cat environment.yml

      - name: Setup miniconda
        uses: conda-incubator/setup-miniconda@v3
        with:
          # Quoted so YAML treats the version as a string, not a float.
          python-version: "3.8"
          mamba-version: "*"
          channels: conda-forge,defaults
          channel-priority: true
          activate-environment: conda-env
          condarc-file: utilities/ci/condarc
          # use-only-tar-bz2: true

      # The date is part of the cache key below, so the cached environment
      # is rebuilt at least once per (UTC) day.
      - name: Get date
        id: get-date
        run: echo "today=$(/bin/date -u '+%Y%m%d')" >> $GITHUB_OUTPUT
        shell: bash

      - name: Cache Conda env
        uses: actions/cache@v3
        with:
          path: /usr/share/miniconda/envs
          key:
            conda-${{ runner.os }}--${{ runner.arch }}--${{
            steps.get-date.outputs.today }}-${{
            hashFiles('environment.yml') }}-${{ env.CACHE_NUMBER
            }}
        env:
          # Increase this value to reset cache if environment.yml has not changed
          CACHE_NUMBER: 0
        id: cache

      # Only solve/update the environment when it was not restored from cache.
      - name: Update environment
        run: mamba env update -n conda-env -f environment.yml
        if: steps.cache.outputs.cache-hit != 'true'

      - name: Install plugin
        run: pip install .

      - name: Install dev dependencies
        run: pip install pytest coverage

      - name: Run tests
        run: make test-cov

      - uses: codecov/codecov-action@v4
        name: Upload coverage report
        with:
          files: ./coverage.xml
          fail_ci_if_error: true
        env:
          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
# name: CI | ||
# on: | ||
# pull_request: | ||
# branches: ["main"] | ||
# push: | ||
# branches: ["main"] | ||
# | ||
# jobs: | ||
# test: | ||
# environment: dev | ||
# runs-on: ubuntu-latest | ||
# outputs: | ||
# latest-dev-tag: ${{ steps.fetch_latest_tags.outputs.latest-dev-tag }} | ||
# latest-stable-tag: ${{ steps.fetch_latest_tags.outputs.latest-stable-tag }} | ||
# steps: | ||
# - uses: actions/checkout@v4 | ||
# | ||
# - name: Checkout utilities | ||
# uses: actions/checkout@v4 | ||
# with: | ||
# repository: bokulich-lab/utilities | ||
# token: ${{ secrets.BOTULICH_TOKEN }} | ||
# path: utilities | ||
# | ||
# - name: Install dependencies | ||
# run: python -m pip install requests | ||
# | ||
# - name: Fetch latest tags | ||
# id: fetch-tags | ||
# run: | | ||
# latest_tags=$(python utilities/ci/get-tags.py) | ||
# echo "$latest_tags" > tags.txt | ||
# echo "latest-dev-tag=$(grep 'latest-dev-tag' tags.txt | cut -d '=' -f 2)" >> $GITHUB_OUTPUT | ||
# echo "latest-stable-tag=$(grep 'latest-stable-tag' tags.txt | cut -d '=' -f 2)" >> $GITHUB_OUTPUT | ||
# | ||
# - name: Create conda yaml | ||
# run: | | ||
# bash utilities/ci/get-dependencies.sh ${{ vars.DISTRO }} ${{ steps.fetch-tags.outputs.latest-dev-tag }} | ||
# cat environment.yml | ||
# | ||
# - name: Setup miniconda | ||
# uses: conda-incubator/setup-miniconda@v3 | ||
# with: | ||
# python-version: 3.8 | ||
# mamba-version: "*" | ||
# channels: conda-forge,defaults | ||
# channel-priority: true | ||
# activate-environment: conda-env | ||
# condarc-file: utilities/ci/condarc | ||
# # use-only-tar-bz2: true | ||
# | ||
# - name: Get date | ||
# id: get-date | ||
# run: echo "today=$(/bin/date -u '+%Y%m%d')" >> $GITHUB_OUTPUT | ||
# shell: bash | ||
# | ||
# - name: Cache Conda env | ||
# uses: actions/cache@v3 | ||
# with: | ||
# path: /usr/share/miniconda/envs | ||
# key: | ||
# conda-${{ runner.os }}--${{ runner.arch }}--${{ | ||
# steps.get-date.outputs.today }}-${{ | ||
# hashFiles('environment.yml') }}-${{ env.CACHE_NUMBER | ||
# }} | ||
# env: | ||
# # Increase this value to reset cache if environment.yml has not changed | ||
# CACHE_NUMBER: 0 | ||
# id: cache | ||
# | ||
# - name: Update environment | ||
# run: | ||
# mamba env update -n conda-env -f environment.yml | ||
# if: steps.cache.outputs.cache-hit != 'true' | ||
# | ||
# - name: Install plugin | ||
# run: pip install . | ||
# | ||
# - name: Install dev dependencies | ||
# run: pip install pytest coverage | ||
# | ||
# - name: Run tests | ||
# run: make test-cov | ||
# | ||
# - uses: codecov/codecov-action@v4 | ||
# name: Upload coverage report | ||
# with: | ||
# files: ./coverage.xml | ||
# fail_ci_if_error: true | ||
# env: | ||
# CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
# ---------------------------------------------------------------------------- | ||
# Copyright (c) 2022-2023, QIIME 2 development team. | ||
# | ||
# Distributed under the terms of the Modified BSD License. | ||
# | ||
# The full license is in the file LICENSE, distributed with this software. | ||
# ---------------------------------------------------------------------------- | ||
|
||
from .filter_mags import filter_derep_mags, filter_mags | ||
|
||
__all__ = ["filter_derep_mags", "filter_mags"] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,159 @@ | ||
# ---------------------------------------------------------------------------- | ||
# Copyright (c) 2022-2023, QIIME 2 development team. | ||
# | ||
# Distributed under the terms of the Modified BSD License. | ||
# | ||
# The full license is in the file LICENSE, distributed with this software. | ||
# ---------------------------------------------------------------------------- | ||
import os | ||
|
||
import pandas as pd | ||
from qiime2 import Metadata | ||
from qiime2.util import duplicate | ||
|
||
from q2_types.feature_data_mag import MAGSequencesDirFmt | ||
from q2_types.per_sample_sequences import MultiMAGSequencesDirFmt | ||
|
||
|
||
def _filter_ids(
    ids: set,
    metadata: Metadata = None,
    where: str = None,
    exclude_ids: bool = False
) -> set:
    """
    Filters IDs based on the provided metadata.

    The input set is never modified; a new set is always returned.

    Parameters:
        ids (set): The set of IDs to filter.
        metadata (Metadata, optional): The metadata to use for filtering.
            Defaults to None, in which case no filtering is applied.
        where (str, optional): The condition to use for filtering,
            passed to ``metadata.get_ids``. Defaults to None.
        exclude_ids (bool, optional): Whether to exclude the IDs that
            match the condition. Defaults to False.

    Returns:
        set: The filtered set of IDs.
    """
    # Work on a copy so the caller's set is left untouched.
    kept = set(ids)

    if metadata is not None:
        selected_ids = set(metadata.get_ids(where=where))
        if not selected_ids:
            # Nothing was selected by the query: all IDs are kept as they
            # are, regardless of the value of `exclude_ids`.
            print("The filter query returned no IDs to filter out.")
        elif exclude_ids:
            kept -= selected_ids
        else:
            kept &= selected_ids

    print(f"Found {len(kept)} IDs to keep.")
    return kept
|
||
|
||
def _filter_manifest(
    manifest: pd.DataFrame, ids_to_keep: set, on: str = 'mag'
) -> pd.DataFrame:
    """
    Filters a manifest DataFrame based on a set of IDs.

    The "filename" column of the result is rewritten to
    "<sample-id>/<mag-id>.fasta", built from the index levels. The input
    DataFrame is not modified.

    Parameters:
        manifest (pd.DataFrame): The manifest DataFrame to filter. Its
            index must provide the 'sample-id' and 'mag-id' levels.
        ids_to_keep (set): The set of IDs to keep.
        on (str): The level on which to filter ('mag' or 'sample').
            Defaults to 'mag'.

    Returns:
        pd.DataFrame: The filtered manifest DataFrame.

    Raises:
        ValueError: If `on` is neither 'mag' nor 'sample'.
    """
    if on == 'mag':
        lvl = 'mag-id'
    elif on == 'sample':
        lvl = 'sample-id'
    else:
        raise ValueError(f"Invalid value for 'on' parameter: {on}")

    # Copy first: assigning the column below would otherwise overwrite
    # the "filename" column of the DataFrame owned by the caller.
    manifest = manifest.copy()
    manifest["filename"] = (
        manifest.index.get_level_values('sample-id') + "/"
        + manifest.index.get_level_values('mag-id') + ".fasta"
    )

    keep_mask = manifest.index.get_level_values(lvl).isin(ids_to_keep)
    return manifest[keep_mask]
|
||
|
||
def _mags_to_df(mags: MultiMAGSequencesDirFmt, on: str):
    """
    Converts a MultiMAGSequencesDirFmt object to a DataFrame.

    One row is produced per (sample, MAG) pair found in
    ``mags.sample_dict()``, with the columns "sample_id", "mag_id" and
    "mag_fp"; the column selected by `on` becomes the index.

    Parameters:
        mags (MultiMAGSequencesDirFmt): The MultiMAGSequencesDirFmt
            object to convert.
        on (str): The level on which to index the DataFrame
            ('sample' or 'mag').

    Returns:
        pd.DataFrame: The converted DataFrame.

    Raises:
        ValueError: If `on` is neither 'sample' nor 'mag'.
    """
    if on == 'sample':
        index_col = "sample_id"
    elif on == 'mag':
        index_col = "mag_id"
    else:
        raise ValueError(f"Invalid value for 'on' parameter: {on}")

    # Build the long-format rows directly from the nested mapping. This
    # avoids going through a wide (sample x MAG) frame and stack(), whose
    # handling of the resulting missing values differs between pandas
    # versions and which fails on an empty input.
    records = [
        (sample_id, mag_id, mag_fp)
        for sample_id, sample_mags in mags.sample_dict().items()
        for mag_id, mag_fp in sample_mags.items()
    ]
    mags_df = pd.DataFrame(
        records, columns=["sample_id", "mag_id", "mag_fp"]
    )
    return mags_df.set_index(index_col)
|
||
|
||
def filter_derep_mags(
    mags: MAGSequencesDirFmt,
    metadata: Metadata,
    where: str = None,
    exclude_ids: bool = False,
) -> MAGSequencesDirFmt:
    """
    Filters dereplicated MAGs based on the provided metadata.

    Parameters:
        mags (MAGSequencesDirFmt): The MAGs to filter.
        metadata (Metadata): The metadata to use for filtering.
        where (str, optional): The condition to use for filtering.
            Defaults to None.
        exclude_ids (bool, optional): Whether to exclude the IDs that
            match the condition. Defaults to False.

    Returns:
        MAGSequencesDirFmt: A new directory format containing one
            "<id>.fasta" file per retained MAG.

    Raises:
        ValueError: If an ID selected for retention is not present in
            the input MAGs.
    """
    results = MAGSequencesDirFmt()
    features = mags.feature_dict()
    ids_to_keep = _filter_ids(
        set(features.keys()), metadata, where, exclude_ids
    )

    for _id in ids_to_keep:
        # Only the lookup is guarded, so that a KeyError raised anywhere
        # else is not misreported as a missing MAG.
        try:
            mag_fp = features[_id]
        except KeyError as err:
            raise ValueError(
                f"{_id!r} is not a MAG present in the input data."
            ) from err
        duplicate(mag_fp, os.path.join(str(results), f"{_id}.fasta"))

    return results
|
||
|
||
def filter_mags(
    mags: MultiMAGSequencesDirFmt,
    metadata: Metadata,
    where: str = None,
    exclude_ids: bool = False,
    on: str = 'mag'
) -> MultiMAGSequencesDirFmt:
    """
    Filters per-sample MAGs based on the provided metadata.

    Parameters:
        mags (MultiMAGSequencesDirFmt): The MAGs to filter.
        metadata (Metadata): The metadata to use for filtering.
        where (str, optional): The condition to use for filtering.
            Defaults to None.
        exclude_ids (bool, optional): Whether to exclude the IDs that
            match the condition. Defaults to False.
        on (str): Whether the metadata IDs refer to MAGs ('mag') or
            to samples ('sample'). Defaults to 'mag'.

    Returns:
        MultiMAGSequencesDirFmt: A new directory format containing a
            MANIFEST file and the retained MAGs, laid out as
            "<sample-id>/<mag-id>.fasta".

    Raises:
        ValueError: If `on` is neither 'mag' nor 'sample', or if a
            required field is missing for a retained MAG.
    """
    # Fail fast, before any output is created.
    if on not in ('mag', 'sample'):
        raise ValueError(f"Invalid value for 'on' parameter: {on}")

    results = MultiMAGSequencesDirFmt()
    mags_df = _mags_to_df(mags, on)

    ids_to_keep = _filter_ids(
        set(mags_df.index), metadata, where, exclude_ids
    )

    filtered_mags = mags_df[mags_df.index.isin(ids_to_keep)]
    filtered_manifest = _filter_manifest(
        mags.manifest.view(pd.DataFrame), ids_to_keep, on=on
    )
    filtered_manifest.to_csv(
        os.path.join(str(results), "MANIFEST"), sep=","
    )

    for _id, row in filtered_mags.iterrows():
        # The index holds the ID level selected by `on`; the other ID is
        # stored as a regular column of the row.
        try:
            if on == 'mag':
                sample_id, mag_id = row["sample_id"], _id
            else:
                sample_id, mag_id = _id, row["mag_id"]
            mag_fp = row["mag_fp"]
        except KeyError as err:
            raise ValueError(
                f"{_id!r} is not a MAG present in the input data."
            ) from err

        sample_dir = os.path.join(str(results), sample_id)
        os.makedirs(sample_dir, exist_ok=True)
        duplicate(mag_fp, os.path.join(sample_dir, f"{mag_id}.fasta"))

    return results
Oops, something went wrong.