Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: Add busco-results input to dereplicate-mags #213

Merged
merged 17 commits into from
Dec 13, 2024
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
77 changes: 70 additions & 7 deletions q2_moshpit/dereplication/derep.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import pandas as pd
import skbio
from q2_types.feature_data import DNAFASTAFormat
from qiime2 import Metadata
from scipy.cluster.hierarchy import ward, fcluster

from q2_types.feature_data_mag import MAGSequencesDirFmt
Expand Down Expand Up @@ -119,7 +120,7 @@ def _get_bin_lengths(mags: MultiMAGSequencesDirFmt) -> pd.Series:

def _remap_bins(
bin_clusters: List[List[str]],
longest_bins: List[str],
representative_bins: List[str],
distances: pd.DataFrame
) -> Dict[str, str]:
"""
Expand All @@ -129,7 +130,7 @@ def _remap_bins(
Args:
bin_clusters (list): A list of lists, where each inner list contains
the IDs of similar bins.
longest_bins (str): A list of longest bin for each cluster.
representative_bins (str): A list of longest bin for each cluster.
distances (pd.DataFrame): The original bin distance matrix.

Returns:
Expand All @@ -153,7 +154,7 @@ def _remap_bins(
final_bins = {}
for i, similar_bins in enumerate(bin_clusters):
for bin in similar_bins:
final_bins[bin] = longest_bins[i]
final_bins[bin] = representative_bins[i]
for bin in distances.index:
if bin not in final_bins:
final_bins[bin] = bin
Expand Down Expand Up @@ -257,22 +258,84 @@ def _generate_pa_table(
return presence_absence


def _get_representatives(mags, metadata, column_name, bin_clusters, find_max):
"""
This function identifies the representative bin for each cluster of bins.
If metadata is provided, the selection is based on a numerical metadata
column. In case of a tie, the longest bin is chosen. If metadata is not
provided, the longest bin is selected by default.

Args:
mags: A MultiMAGSequencesDirFmt object containing all bins.
metadata: Qiime metadata.
column_name: Name of a column in metadata.
bin_clusters: A list of lists where each inner list contains the IDs
of bins grouped by similarity.
find_max: Boolean, if true the bin with the highest value in the
metadata column will be chosen, if false the lowest value
is chosen.

Returns:
A list of representative bin IDs, one for each cluster.
"""
bin_lengths = _get_bin_lengths(mags)

# Choose by metadata
if metadata is not None:
try:
metadata_column = (
metadata.to_dataframe()[column_name].astype(float)
)
except KeyError:
raise KeyError(f'The column "{column_name}" does not exist '
f'in the metadata.')
except ValueError:
raise ValueError('The specified metadata column has to be '
'numerical.')

representative_bins = []
for bins in bin_clusters:
# Get bin IDs with the max or min column values in cluster
max_min_value_bins = (
values := metadata_column[bins]
)[values == (values.max() if find_max else values.min())].index

# If there's a tie, resolve by selecting the longest bin
if len(max_min_value_bins) > 1:
representative_bins.append(
bin_lengths[max_min_value_bins].idxmax()
)
else:
representative_bins.append(max_min_value_bins[0])

# Choose by length
else:
representative_bins = \
[bin_lengths[ids].idxmax() for ids in bin_clusters]

return representative_bins


def dereplicate_mags(
mags: MultiMAGSequencesDirFmt,
distance_matrix: skbio.DistanceMatrix,
threshold: float = 0.99,
metadata: Metadata = None,
metadata_column: str = "complete",
find_max: bool = True,
) -> (MAGSequencesDirFmt, pd.DataFrame):
distances = distance_matrix.to_data_frame()

# find similar bins, according to the threshold
bin_clusters = _find_similar_bins_fcluster(distances, threshold)

# find the longest bin in each cluster
bin_lengths = _get_bin_lengths(mags)
longest_bins = [bin_lengths[ids].idxmax() for ids in bin_clusters]
# select one representative bin per cluster by metadata and length
representative_bins = _get_representatives(
mags, metadata, metadata_column, bin_clusters, find_max
)

# generate a map between the original bins and the dereplicated bins
final_bins = _remap_bins(bin_clusters, longest_bins, distances)
final_bins = _remap_bins(bin_clusters, representative_bins, distances)

# generate dereplicated bin sequences
unique_bin_seqs = _write_unique_bins(mags, final_bins)
Expand Down
7 changes: 7 additions & 0 deletions q2_moshpit/dereplication/tests/data/busco_results.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
mag_id sample_id input_file dataset complete single duplicated fragmented missing n_markers scaffold_n50 contigs_n50 percent_gaps scaffolds length
24dee6fe-9b84-45bb-8145-de7b092533a1 SRR13221817 bec9c09a-62c3-4fbb-8f7f-9fdf9aefc02f.fasta bacteria_odb10 28.2 27.4 0.8 8.9 62.9 124 4785 4785 0.000% 265 1219165
ca7012fc-ba65-40c3-84f5-05aa478a7585 SRR13221817 5978e667-0476-4921-8cc2-34b9d1b508c1.fasta bacteria_odb10 1.6 1.6 0.0 1.6 96.8 124 3548 3548 0.000% 67 245922
fb0bc871-04f6-486b-a10e-8e0cb66f8de3 SRR13221817 625c95e6-ac2f-4e6e-9470-af8cd11c75dd.fasta bacteria_odb10 26.6 26.6 0.0 3.2 70.2 124 78679 78679 0.000% 17 714893
d65a71fa-4279-4588-b937-0747ed5d604d SRR13221817 77029d46-09f7-4d6e-940f-efcafc6ade9e.fasta bacteria_odb10 96.0 95.2 0.8 1.6 2.4 124 35651 35651 0.000% 165 4293244
db03f8b6-28e1-48c5-a47c-9c65f38f7357 SRR13221817 0367f168-9d6a-48d6-8ff8-b532123a13f5.fasta bacteria_odb10 96.0 96.0 0.0 0.8 3.2 124 52720 52720 0.000% 166 4893325
fa4d7420-d0a4-455a-b4d7-4fa66e54c9bf SRR13221817 509a5fe7-a5ab-42ea-8645-0510aa77e1fe.fasta bacteria_odb10 96.0 19.4 0.0 1.6 79.0 124 71847 71847 0.000% 20 761157
78 changes: 76 additions & 2 deletions q2_moshpit/dereplication/tests/test_dereplication.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,13 @@
import unittest

import pandas as pd
import qiime2
import skbio

from q2_moshpit.dereplication.derep import (
_find_similar_bins_fcluster, _get_bin_lengths, _remap_bins,
_reassign_bins_to_samples, _write_unique_bins, _generate_pa_table,
dereplicate_mags
dereplicate_mags, _get_representatives
)
from q2_types.feature_data_mag import MAGSequencesDirFmt
from q2_types.per_sample_sequences import MultiMAGSequencesDirFmt
Expand Down Expand Up @@ -72,6 +73,13 @@ def setUp(self):
'24dee6fe-9b84-45bb-8145-de7b092533a1': 0
}
}
self.busco_results = pd.read_csv(
filepath_or_buffer=self.get_data_path("busco_results.tsv"),
sep='\t',
index_col=0,
dtype='str'
)
self.busco_results.index.name = 'id'

def test_find_clusters_fcluster_similar(self):
obs = _find_similar_bins_fcluster(self.dist_matrix_df, 0.99)
Expand Down Expand Up @@ -138,7 +146,8 @@ def test_generate_pa_table(self):
def test_dereplicate_mags(self):
mags = MultiMAGSequencesDirFmt(self.get_data_path('mags'), mode='r')

obs_mags, obs_pa = dereplicate_mags(mags, self.dist_matrix, 0.99)
obs_mags, obs_pa = dereplicate_mags(mags, self.dist_matrix,
threshold=0.99)
exp_mags = MAGSequencesDirFmt(
self.get_data_path('mags-unique'), mode='r'
)
Expand All @@ -157,6 +166,71 @@ def test_dereplicate_mags(self):
# assert correct PA table was generated
pd.testing.assert_frame_equal(exp_pa, obs_pa)

def test_get_representatives_metadata_max_value(self):
obs = _get_representatives(
mags=self.bins,
metadata=qiime2.Metadata(self.busco_results),
column_name="complete",
bin_clusters=self.clusters_99,
find_max=True
)
exp = [
'24dee6fe-9b84-45bb-8145-de7b092533a1',
'fa4d7420-d0a4-455a-b4d7-4fa66e54c9bf',
'd65a71fa-4279-4588-b937-0747ed5d604d'
]
self.assertListEqual(exp, obs)

def test_get_representatives_metadata_min_value(self):
obs = _get_representatives(
mags=self.bins,
metadata=qiime2.Metadata(self.busco_results),
column_name="complete",
bin_clusters=self.clusters_99,
find_max=False
)
exp = [
'24dee6fe-9b84-45bb-8145-de7b092533a1',
'ca7012fc-ba65-40c3-84f5-05aa478a7585',
'fb0bc871-04f6-486b-a10e-8e0cb66f8de3'
]
self.assertListEqual(exp, obs)

def test_get_representatives_length(self):
obs = _get_representatives(
mags=self.bins,
metadata=None,
column_name=None,
bin_clusters=self.clusters_99,
find_max=True
)
exp = [
'24dee6fe-9b84-45bb-8145-de7b092533a1',
'ca7012fc-ba65-40c3-84f5-05aa478a7585',
'd65a71fa-4279-4588-b937-0747ed5d604d'
]
self.assertListEqual(exp, obs)

def test_get_representatives_key_error(self):
with self.assertRaisesRegex(KeyError, "version"):
_get_representatives(
mags=self.bins,
metadata=qiime2.Metadata(self.busco_results),
column_name="version",
bin_clusters=self.clusters_99,
find_max=True
)

def test_get_representatives_value_error(self):
with self.assertRaisesRegex(ValueError, "numerical"):
_get_representatives(
mags=self.bins,
metadata=qiime2.Metadata(self.busco_results),
column_name="dataset",
bin_clusters=self.clusters_99,
find_max=True
)


if __name__ == '__main__':
unittest.main()
27 changes: 22 additions & 5 deletions q2_moshpit/plugin_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -418,22 +418,32 @@
function=q2_moshpit.dereplication.dereplicate_mags,
inputs={
"mags": SampleData[MAGs],
"distance_matrix": DistanceMatrix
"distance_matrix": DistanceMatrix,
},
parameters={
"threshold": Float % Range(0, 1, inclusive_end=True)
"threshold": Float % Range(0, 1, inclusive_end=True),
"metadata": Metadata,
"metadata_column": Str,
"find_max": Bool
},
outputs=[
('dereplicated_mags', FeatureData[MAG]),
('feature_table', FeatureTable[PresenceAbsence])
],
input_descriptions={
"mags": "MAGs to be dereplicated.",
"distance_matrix": "Matrix of distances between MAGs."
"distance_matrix": "Matrix of distances between MAGs.",
},
parameter_descriptions={
"threshold": "Similarity threshold required to consider "
"two bins identical."
"two bins identical.",
"metadata": "Metadata table.",
"metadata_column": "Name of the metadata column used to choose the "
"most representative bins.",
"find_max": "Set to True to choose the bin with the highest value in "
"the metadata column. Set to False to choose the bin "
"with the lowest value.",

},
output_descriptions={
"dereplicated_mags": "Dereplicated MAGs.",
Expand All @@ -443,7 +453,14 @@
description='This method dereplicates MAGs from multiple samples '
'using distances between them found in the provided '
'distance matrix. For each cluster of similar MAGs, '
'the longest one will be selected as the representative.',
'the longest one will be selected as the representative. If '
'metadata is given as input, the MAG with the '
'highest or lowest value in the specified metadata column '
'is chosen, depending on the parameter "find-max". '
'If there are MAGs with identical values, the longer one is '
'chosen. For example an artifact of type BUSCOResults can be '
'passed as metadata, and the dereplication can be done by '
'highest "completness".',
misialq marked this conversation as resolved.
Show resolved Hide resolved
citations=[]
)

Expand Down
Loading