Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: Add filter-kraken-reports #212

Open
wants to merge 7 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion q2_moshpit/kraken2/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,9 @@
from .bracken import estimate_bracken
from .database import build_kraken_db
from .classification import classify_kraken2, _classify_kraken2
from .filter import filter_kraken_reports
from .select import kraken2_to_features, kraken2_to_mag_features

__all__ = ['build_kraken_db', 'classify_kraken2', '_classify_kraken2',
'estimate_bracken', 'kraken2_to_features',
'kraken2_to_mag_features']
'kraken2_to_mag_features', 'filter_kraken_reports']
119 changes: 119 additions & 0 deletions q2_moshpit/kraken2/filter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
# ----------------------------------------------------------------------------
# Copyright (c) 2022-2023, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------
import os

from q2_types.kraken2 import Kraken2ReportDirectoryFormat
from qiime2 import Metadata
from qiime2.util import duplicate


def _validate_parameters(metadata, remove_empty, where, exclude_ids):
if not metadata and not remove_empty:
raise ValueError('Please specify parameters "--m-metadata-file" or '
'"--p-remove-empty" to filter accordingly.')

if where and not metadata:
raise ValueError('The parameter "--p-where" can only be specified in '
'combination with the parameter '
'"--m-metadata-file".')

if exclude_ids and not metadata:
raise ValueError('The parameter "--p-exclude-ids" can only be '
'specified in combination with the parameter '
'"--m-metadata-file".')


def _find_empty_reports(file_dict: dict) -> set:
empty_ids = set()
for inner_dict in file_dict.values():
for inner_id, file_fp in inner_dict.items():
with open(file_fp, 'r') as file:
# Read the first line and check if there's a second line
first_line = file.readline().strip()
second_line = file.readline()

# Only process if the file has exactly one line
if not second_line:
columns = first_line.split('\t')

# Check if the 6th column contains "unclassified" or
# "root"
if len(columns) > 5 and columns[5] in ["unclassified",
"root"]:
empty_ids.add(inner_id)

return empty_ids


def _create_filtered_results(file_dict, ids_to_keep):
# Specify output format and file name suffix
results = Kraken2ReportDirectoryFormat()
suffix = ".report"

for outer_id, inner_dict in file_dict.items():
for inner_id, file_fp in inner_dict.items():
if inner_id in ids_to_keep:
if outer_id:
os.makedirs(os.path.join(str(results), outer_id),
exist_ok=True)
duplicate(
src=file_dict[outer_id][inner_id],
dst=os.path.join(str(results), outer_id,
f"{inner_id}{suffix}.txt")
)

return results


def filter_kraken_reports(
reports: Kraken2ReportDirectoryFormat,
metadata: Metadata = None,
where: str = None,
exclude_ids: bool = False,
remove_empty: bool = False,
) -> Kraken2ReportDirectoryFormat:
# Validate parameters
_validate_parameters(metadata, remove_empty, where, exclude_ids)

# Create file_dict
file_dict = reports.file_dict(
suffixes=[".report"],
)

# Create fake outer ID if there is none, to make it easier to iterate
if not any(isinstance(value, dict) for value in file_dict.values()):
file_dict = {"": file_dict}

# Extract all inner IDs
ids_to_keep = {key for inner in file_dict.values() for key in inner}

# Remove IDs that are linked to an empty report
if remove_empty:
ids_to_remove = _find_empty_reports(file_dict)
ids_to_keep -= ids_to_remove
if ids_to_remove:
print(f"Removing empty IDs: {', '.join(sorted(ids_to_remove))}")

if metadata:
selected_ids = metadata.get_ids(where=where)
if not selected_ids:
print("The filter query returned no IDs to filter out.")

if not (set(selected_ids) - ids_to_keep):
print(f"IDs {', '.join(sorted(set(selected_ids) - ids_to_keep))} "
f"are not present in the data.")

if exclude_ids:
ids_to_keep -= set(selected_ids)
else:
ids_to_keep &= set(selected_ids)

if len(ids_to_keep) == 0:
raise ValueError("No IDs remain after filtering.")

return _create_filtered_results(file_dict, ids_to_keep)
3 changes: 3 additions & 0 deletions q2_moshpit/kraken2/tests/data/metadata/metadata.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
id timepoint seed
3b72d1a7-ddb0-4dc7-ac36-080ceda04aaa 0 Forasteiro
8894435a-c836-4c18-b475-8b38a9ab6c6b 24 Hybrid
130 changes: 130 additions & 0 deletions q2_moshpit/kraken2/tests/test_filter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
# ----------------------------------------------------------------------------
# Copyright (c) 2023-2023, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------
import os.path

import pandas as pd
import qiime2
from q2_types.kraken2 import Kraken2ReportDirectoryFormat
from qiime2.plugin.testing import TestPluginBase

from q2_moshpit.kraken2.filter import _validate_parameters, \
_find_empty_reports, _create_filtered_results, \
filter_kraken_reports


class TestFilterKrakenReports(TestPluginBase):
package = "q2_moshpit.kraken2.tests"

@classmethod
def setUpClass(cls):
super().setUpClass()

instance = cls()

cls.report_mags = Kraken2ReportDirectoryFormat(
instance.get_data_path("reports-mags"), "r"
)
cls.report_mags_unclassified_missing_frac = (
Kraken2ReportDirectoryFormat(
instance.get_data_path(
"reports-mags-unclassified-missing-frac"),
"r"))

cls.file_dict_report_mags = cls.report_mags.file_dict(
suffixes=[".report"])
cls.file_dict_report_unclassified = (
cls.report_mags_unclassified_missing_frac.file_dict(
suffixes=[".report"]))

cls.metadata_df = pd.read_csv(
instance.get_data_path("metadata/metadata.tsv"),
sep="\t",
index_col="id"
)

cls.metadata1 = qiime2.Metadata(cls.metadata_df)

cls.metadata_df.drop(
"8894435a-c836-4c18-b475-8b38a9ab6c6b", inplace=True)
cls.metadata2 = qiime2.Metadata(cls.metadata_df)

def test_find_empty_reports(self):
empty_reports = _find_empty_reports(
file_dict={"": self.file_dict_report_mags}
)
self.assertEqual(
empty_reports, {"8894435a-c836-4c18-b475-8b38a9ab6c6b"}
)

def test_find_empty_reports_missing_frac(self):
empty_reports = _find_empty_reports(
file_dict={"": self.file_dict_report_unclassified}
)
self.assertEqual(
empty_reports, {"8894435a-c836-4c18-b475-8b38a9ab6c6b"}
)

def test_create_filter_results_reports(self):
results = _create_filtered_results(
file_dict={"sample1": self.file_dict_report_unclassified},
ids_to_keep={"8894435a-c836-4c18-b475-8b38a9ab6c6b"}
)
self.assertTrue(
os.path.exists(
os.path.join(
str(results),
"sample1",
"8894435a-c836-4c18-b475-8b38a9ab6c6b.report.txt"
)
)
)

def test_filter_kraken_reports_error(self):
with self.assertRaisesRegex(
ValueError, "No IDs remain after filtering."
):
filter_kraken_reports(
reports=self.report_mags_unclassified_missing_frac,
metadata=self.metadata1,
exclude_ids=True
)

def test_filter_kraken_reports_metadata(self):
results = filter_kraken_reports(
reports=self.report_mags_unclassified_missing_frac,
metadata=self.metadata2,
)
self.assertTrue(
os.path.exists(
os.path.join(
str(results),
"3b72d1a7-ddb0-4dc7-ac36-080ceda04aaa.report.txt"
)
)
)

def test_missing_metadata_and_remove_empty(self):
with self.assertRaisesRegex(
ValueError, r'--m-metadata-file.*--p-remove-empty'
):
_validate_parameters(metadata=None, remove_empty=False,
where=None, exclude_ids=False)

def test_where_without_metadata(self):
with self.assertRaisesRegex(
ValueError, r'--p-where.*--m-metadata-file'
):
_validate_parameters(metadata=None, remove_empty=True,
where=True, exclude_ids=False)

def test_exclude_ids_without_metadata(self):
with self.assertRaisesRegex(
ValueError, r'--p-exclude-ids.*--m-metadata-file'
):
_validate_parameters(metadata=None, remove_empty=True,
where=None, exclude_ids=True)
55 changes: 54 additions & 1 deletion q2_moshpit/plugin_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
from q2_types.sample_data import SampleData
from q2_types.feature_map import FeatureMap, MAGtoContigs
from qiime2.core.type import (
Bool, Range, Int, Str, Float, List, Choices, Visualization
Bool, Range, Int, Str, Float, List, Choices, Visualization, TypeMatch
)
from qiime2.core.type import (Properties, TypeMap)
from qiime2.plugin import (Plugin, Citations)
Expand Down Expand Up @@ -1791,6 +1791,59 @@
citations=[]
)

T_filter_kraken_reports = TypeMatch([
SampleData[Kraken2Reports % Properties('reads', 'contigs', 'mags')],
SampleData[Kraken2Reports % Properties('reads', 'contigs', 'mags')],
SampleData[Kraken2Reports % Properties('reads', 'contigs')],
SampleData[Kraken2Reports % Properties('reads', 'mags')],
SampleData[Kraken2Reports % Properties('contigs', 'mags')],
SampleData[Kraken2Reports % Properties('reads')],
SampleData[Kraken2Reports % Properties('contigs')],
SampleData[Kraken2Reports % Properties('mags')],
FeatureData[Kraken2Reports % Properties('reads', 'contigs', 'mags')],
FeatureData[Kraken2Reports % Properties('reads', 'contigs')],
FeatureData[Kraken2Reports % Properties('reads', 'mags')],
FeatureData[Kraken2Reports % Properties('contigs', 'mags')],
FeatureData[Kraken2Reports % Properties('reads')],
FeatureData[Kraken2Reports % Properties('contigs')],
FeatureData[Kraken2Reports % Properties('mags')],
])

filter_reports_param_descriptions = {
"metadata": "Metadata indicating which IDs to filter. The optional "
"`where` parameter may be used to filter IDs based on "
"specified conditions in the metadata. The optional "
"`exclude_ids` parameter may be used to exclude the IDs "
"specified in the metadata from the filter.",
"where": "Optional SQLite WHERE clause specifying metadata criteria that "
"must be met to be included in the filtered data. If not "
"provided, all IDs in `metadata` that are also in the data will "
"be retained.",
"exclude_ids": "If True, the samples selected by the `metadata` and "
"optional `where` parameter will be excluded from the "
"filtered data.",
"remove_empty": "If True, reports with 100% unclassified reads will be "
"removed from the filtered data.",
}

plugin.methods.register_function(
function=q2_moshpit.kraken2.filter_kraken_reports,
inputs={"reports": T_filter_kraken_reports},
parameters={
"metadata": Metadata,
"where": Str,
"exclude_ids": Bool,
"remove_empty": Bool,
},
outputs={"filtered_reports": T_filter_kraken_reports},
input_descriptions={"reports": "The reports to filter."},
parameter_descriptions=filter_contigs_param_descriptions,
name="Filter Kraken reports.",
description="Filter Kraken reports based on metadata or remove reports "
"with 100% unclassified reads.",
)


plugin.register_semantic_types(BUSCOResults, BuscoDB)
plugin.register_formats(
BUSCOResultsFormat, BUSCOResultsDirectoryFormat, BuscoDatabaseDirFmt
Expand Down
Loading