From 10066c46fa87790e18154f45ab52758754feb401 Mon Sep 17 00:00:00 2001 From: Michal Ziemski Date: Mon, 11 Mar 2024 10:42:58 +0100 Subject: [PATCH] ENH: add 'filter-contigs' action --- q2_assembly/__init__.py | 12 ++++- q2_assembly/_action_params.py | 28 +++++++++++ q2_assembly/filter/__init__.py | 13 +++++ q2_assembly/filter/filter.py | 88 ++++++++++++++++++++++++++++++++++ q2_assembly/plugin_setup.py | 14 ++++++ 5 files changed, 154 insertions(+), 1 deletion(-) create mode 100644 q2_assembly/filter/__init__.py create mode 100644 q2_assembly/filter/filter.py diff --git a/q2_assembly/__init__.py b/q2_assembly/__init__.py index 24bafa0..3c4f738 100644 --- a/q2_assembly/__init__.py +++ b/q2_assembly/__init__.py @@ -8,6 +8,7 @@ from ._version import get_versions from .bowtie2 import indexing, mapping +from .filter import filter from .helpers import helpers from .iss import iss from .megahit import megahit @@ -17,4 +18,13 @@ __version__ = get_versions()["version"] del get_versions -__all__ = ["indexing", "mapping", "iss", "megahit", "quast", "spades", "helpers"] +__all__ = [ + "indexing", + "mapping", + "iss", + "megahit", + "quast", + "spades", + "helpers", + "filter", +] diff --git a/q2_assembly/_action_params.py b/q2_assembly/_action_params.py index ea3ca27..6857785 100644 --- a/q2_assembly/_action_params.py +++ b/q2_assembly/_action_params.py @@ -7,6 +7,7 @@ # ---------------------------------------------------------------------------- from qiime2.core.type import Bool, Choices, Float, Int, List, Range, Str +from qiime2.plugin import Metadata megahit_params = { "presets": Str % Choices(["meta", "meta-sensitive", "meta-large", "disabled"]), @@ -451,3 +452,30 @@ " samples." } # fmt: on + +filter_contigs_params = { + "metadata": Metadata, + "where": Str, + "exclude_ids": Bool, + "remove_empty": Bool, + "length_threshold": Int % Range(0, None), +} +# fmt: off +filter_contigs_param_descriptions = { + "metadata": "Sample metadata indicating which sample ids to filter. " + "The optional `where` parameter may be used to filter ids " + "based on specified conditions in the metadata. The " + "optional `exclude_ids` parameter may be used to exclude " + "the ids specified in the metadata from the filter.", + "where": "Optional SQLite WHERE clause specifying sample metadata " + "criteria that must be met to be included in the filtered " + "data. If not provided, all samples in `metadata` that are " + "also in the contig data will be retained.", + "exclude_ids": "Defaults to False. If True, the samples selected by " + "the `metadata` and optional `where` parameter will be " + "excluded from the filtered data.", + "remove_empty": "If True, samples with no contigs will be removed from " + "the filtered data.", + "length_threshold": "Only keep contigs of the given length and longer." +} +# fmt: on diff --git a/q2_assembly/filter/__init__.py b/q2_assembly/filter/__init__.py new file mode 100644 index 0000000..e7b64dc --- /dev/null +++ b/q2_assembly/filter/__init__.py @@ -0,0 +1,13 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2023, QIIME 2 development team. +# +# Distributed under the terms of the Modified BSD License. +# +# The full license is in the file LICENSE, distributed with this software. +# ---------------------------------------------------------------------------- + +from .filter import filter_contigs + +__all__ = [ + "filter_contigs", +] diff --git a/q2_assembly/filter/filter.py b/q2_assembly/filter/filter.py new file mode 100644 index 0000000..9a3f46f --- /dev/null +++ b/q2_assembly/filter/filter.py @@ -0,0 +1,88 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2023, QIIME 2 development team. +# +# Distributed under the terms of the Modified BSD License. +# +# The full license is in the file LICENSE, distributed with this software. +# ---------------------------------------------------------------------------- +import os + +import skbio +from q2_types.per_sample_sequences import ContigSequencesDirFmt +from qiime2 import Metadata +from qiime2.util import duplicate + + +def _find_empty_samples(samples: dict) -> set: + empty_samples = set() + for sample_id, sample_fp in samples.items(): + if os.path.getsize(sample_fp) == 0: + empty_samples.add(sample_id) + return empty_samples + + +def _filter_by_length( + contigs: ContigSequencesDirFmt, threshold: int +) -> ContigSequencesDirFmt: + results = ContigSequencesDirFmt() + print( + f"Filtering contigs by length - only contigs >= {threshold} bp long will " + f"be retained." + ) + for sample_id, sample_fp in contigs.sample_dict().items(): + out_fp = os.path.join(str(results), f"{sample_id}_contigs.fa") + keep, remove = 0, 0 + with open(out_fp, "w") as f_out: + for contig in skbio.io.read(sample_fp, format="fasta"): + if len(contig) >= threshold: + skbio.io.write(contig, format="fasta", into=f_out) + keep += 1 + else: + remove += 1 + print( + f"Sample {sample_id}: {remove + keep} contigs\n {remove} contigs " + f"removed\n {keep} contigs retained" + ) + return results + + +def filter_contigs( + contigs: ContigSequencesDirFmt, + metadata: Metadata = None, + where: str = None, + exclude_ids: bool = False, + length_threshold: int = 0, + remove_empty: bool = False, +) -> ContigSequencesDirFmt: + if length_threshold > 0: + contigs = _filter_by_length(contigs, length_threshold) + + results = ContigSequencesDirFmt() + samples = contigs.sample_dict() + ids_to_keep = set(samples.keys()) + if remove_empty: + ids_to_remove = _find_empty_samples(samples) + ids_to_keep -= ids_to_remove + if ids_to_remove: + print(f"Removing empty samples: {', '.join(sorted(ids_to_remove))}") + + if metadata: + selected_ids = metadata.get_ids(where=where) + if not selected_ids: + print("The filter query returned no IDs to filter out.") + + if exclude_ids: + ids_to_keep -= set(selected_ids) + else: + ids_to_keep &= set(selected_ids) + + if len(ids_to_keep) == 0: + raise ValueError("No samples remain after filtering.") + + try: + for _id in ids_to_keep: + duplicate(samples[_id], os.path.join(str(results), f"{_id}_contigs.fa")) + except KeyError: + raise ValueError(f"{_id!r} is not a sample present in the contig data.") + + return results diff --git a/q2_assembly/plugin_setup.py b/q2_assembly/plugin_setup.py index 25e7c79..a2bb265 100644 --- a/q2_assembly/plugin_setup.py +++ b/q2_assembly/plugin_setup.py @@ -20,6 +20,7 @@ ) from q2_types.per_sample_sequences._type import AlignmentMap from q2_types.sample_data import SampleData +from qiime2 import Metadata from qiime2.core.type import Bool, Choices, Properties, TypeMap, Visualization from qiime2.plugin import Citations, Collection, Int, List, Plugin, Range @@ -40,6 +41,8 @@ quast_params, spades_param_descriptions, spades_params, + filter_contigs_params, + filter_contigs_param_descriptions, ) from q2_assembly.quast.types import ( QUASTResults, @@ -392,6 +395,17 @@ description="Not to be called directly. Used by map_reads.", ) +plugin.methods.register_function( + function=q2_assembly.filter.filter_contigs, + inputs={"contigs": SampleData[Contigs]}, + parameters=filter_contigs_params, + outputs={"filtered_contigs": SampleData[Contigs]}, + input_descriptions={"contigs": "The contigs to filter."}, + parameter_descriptions=filter_contigs_param_descriptions, + name="Filter contigs.", + description="Filter contigs based on metadata.", +) + plugin.register_semantic_types(QUASTResults) plugin.register_semantic_type_to_format( QUASTResults, artifact_format=QUASTResultsDirectoryFormat