Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: add 'filter-contigs' action #92

Draft
wants to merge 4 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 11 additions & 1 deletion q2_assembly/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

from ._version import get_versions
from .bowtie2 import indexing, mapping
from .filter import filter
from .helpers import helpers
from .iss import iss
from .megahit import megahit
Expand All @@ -17,4 +18,13 @@
__version__ = get_versions()["version"]
del get_versions

__all__ = ["indexing", "mapping", "iss", "megahit", "quast", "spades", "helpers"]
__all__ = [
"indexing",
"mapping",
"iss",
"megahit",
"quast",
"spades",
"helpers",
"filter",
]
28 changes: 28 additions & 0 deletions q2_assembly/_action_params.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
# ----------------------------------------------------------------------------

from qiime2.core.type import Bool, Choices, Float, Int, List, Range, Str
from qiime2.plugin import Metadata

megahit_params = {
"presets": Str % Choices(["meta", "meta-sensitive", "meta-large", "disabled"]),
Expand Down Expand Up @@ -455,3 +456,30 @@
" samples."
}
# fmt: on

filter_contigs_params = {
"metadata": Metadata,
"where": Str,
"exclude_ids": Bool,
"remove_empty": Bool,
"length_threshold": Int % Range(0, None),
}
# fmt: off
filter_contigs_param_descriptions = {
"metadata": "Sample metadata indicating which sample ids to filter. "
"The optional `where` parameter may be used to filter ids "
"based on specified conditions in the metadata. The "
"optional `exclude_ids` parameter may be used to exclude "
"the ids specified in the metadata from the filter.",
"where": "Optional SQLite WHERE clause specifying sample metadata "
"criteria that must be met to be included in the filtered "
"data. If not provided, all samples in `metadata` that are "
"also in the contig data will be retained.",
"exclude_ids": "Defaults to False. If True, the samples selected by "
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
"exclude_ids": "Defaults to False. If True, the samples selected by "
"exclude_ids": "If True, the samples selected by "

Default is already displayed

"the `metadata` and optional `where` parameter will be "
"excluded from the filtered data.",
"remove_empty": "If True, samples with no contigs will be removed from "
"the filtered data.",
"length_threshold": "Only keep contigs of the given length and longer."
}
# fmt: on
13 changes: 13 additions & 0 deletions q2_assembly/filter/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# ----------------------------------------------------------------------------
# Copyright (c) 2023, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------

from .filter import filter_contigs

__all__ = [
"filter_contigs",
]
88 changes: 88 additions & 0 deletions q2_assembly/filter/filter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
# ----------------------------------------------------------------------------
# Copyright (c) 2023, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------
import os

import skbio
from q2_types.per_sample_sequences import ContigSequencesDirFmt
from qiime2 import Metadata
from qiime2.util import duplicate


def _find_empty_samples(samples: dict) -> set:
empty_samples = set()
for sample_id, sample_fp in samples.items():
if os.path.getsize(sample_fp) == 0:
empty_samples.add(sample_id)
return empty_samples


def _filter_by_length(
contigs: ContigSequencesDirFmt, threshold: int
) -> ContigSequencesDirFmt:
results = ContigSequencesDirFmt()
print(
f"Filtering contigs by length - only contigs >= {threshold} bp long will "
f"be retained."
)
for sample_id, sample_fp in contigs.sample_dict().items():
out_fp = os.path.join(str(results), f"{sample_id}_contigs.fa")
keep, remove = 0, 0
with open(out_fp, "w") as f_out:
for contig in skbio.io.read(sample_fp, format="fasta"):
if len(contig) >= threshold:
skbio.io.write(contig, format="fasta", into=f_out)
keep += 1
else:
remove += 1
print(
f"Sample {sample_id}: {remove + keep} contigs\n {remove} contigs "
f"removed\n {keep} contigs retained"
)
return results


def filter_contigs(
contigs: ContigSequencesDirFmt,
metadata: Metadata = None,
where: str = None,
exclude_ids: bool = False,
length_threshold: int = 0,
remove_empty: bool = False,
) -> ContigSequencesDirFmt:
if length_threshold > 0:
contigs = _filter_by_length(contigs, length_threshold)

results = ContigSequencesDirFmt()
samples = contigs.sample_dict()
ids_to_keep = set(samples.keys())
if remove_empty:
ids_to_remove = _find_empty_samples(samples)
ids_to_keep -= ids_to_remove
if ids_to_remove:
print(f"Removing empty samples: {', '.join(sorted(ids_to_remove))}")

if metadata:
selected_ids = metadata.get_ids(where=where)
if not selected_ids:
print("The filter query returned no IDs to filter out.")

if exclude_ids:
ids_to_keep -= set(selected_ids)
else:
ids_to_keep &= set(selected_ids)

if len(ids_to_keep) == 0:
raise ValueError("No samples remain after filtering.")

try:
for _id in ids_to_keep:
duplicate(samples[_id], os.path.join(str(results), f"{_id}_contigs.fa"))
except KeyError:
raise ValueError(f"{_id!r} is not a sample present in the contig data.")

return results
14 changes: 14 additions & 0 deletions q2_assembly/plugin_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
SingleBowtie2Index,
)
from q2_types.sample_data import SampleData
from qiime2 import Metadata
from qiime2.core.type import Bool, Choices, Properties, Str, TypeMap, Visualization
from qiime2.plugin import Citations, Collection, Int, List, Plugin, Range

Expand All @@ -41,6 +42,8 @@
quast_params,
spades_param_descriptions,
spades_params,
filter_contigs_params,
filter_contigs_param_descriptions,
)
from q2_assembly.quast.types import (
QUASTResults,
Expand Down Expand Up @@ -438,6 +441,17 @@
"GenomeData[DNASequence] to a GenomeData[DNASequence] artifact.",
)

plugin.methods.register_function(
function=q2_assembly.filter.filter_contigs,
inputs={"contigs": SampleData[Contigs]},
parameters=filter_contigs_params,
outputs={"filtered_contigs": SampleData[Contigs]},
input_descriptions={"contigs": "The contigs to filter."},
parameter_descriptions=filter_contigs_param_descriptions,
name="Filter contigs.",
description="Filter contigs based on metadata.",
)

plugin.register_semantic_types(QUASTResults)
plugin.register_semantic_type_to_format(
QUASTResults, artifact_format=QUASTResultsDirectoryFormat
Expand Down
Loading