Skip to content

Commit

Permalink
ENH: Add busco-results input to dereplicate-mags (#213)
Browse files Browse the repository at this point in the history
Co-authored-by: Michal Ziemski <[email protected]>
Co-authored-by: Copilot <[email protected]>
  • Loading branch information
3 people authored Dec 13, 2024
1 parent 57d5bbd commit f09a381
Show file tree
Hide file tree
Showing 4 changed files with 171 additions and 14 deletions.
73 changes: 66 additions & 7 deletions q2_moshpit/dereplication/derep.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import pandas as pd
import skbio
from q2_types.feature_data import DNAFASTAFormat
from qiime2 import Metadata
from scipy.cluster.hierarchy import ward, fcluster

from q2_types.feature_data_mag import MAGSequencesDirFmt
Expand Down Expand Up @@ -119,7 +120,7 @@ def _get_bin_lengths(mags: MultiMAGSequencesDirFmt) -> pd.Series:

def _remap_bins(
bin_clusters: List[List[str]],
longest_bins: List[str],
representative_bins: List[str],
distances: pd.DataFrame
) -> Dict[str, str]:
"""
Expand All @@ -129,7 +130,7 @@ def _remap_bins(
Args:
bin_clusters (list): A list of lists, where each inner list contains
the IDs of similar bins.
longest_bins (str): A list of longest bin for each cluster.
representative_bins (str): A list of longest bin for each cluster.
distances (pd.DataFrame): The original bin distance matrix.
Returns:
Expand All @@ -153,7 +154,7 @@ def _remap_bins(
final_bins = {}
for i, similar_bins in enumerate(bin_clusters):
for bin in similar_bins:
final_bins[bin] = longest_bins[i]
final_bins[bin] = representative_bins[i]
for bin in distances.index:
if bin not in final_bins:
final_bins[bin] = bin
Expand Down Expand Up @@ -257,22 +258,80 @@ def _generate_pa_table(
return presence_absence


def _get_representatives(mags, metadata, column, bin_clusters, find_max):
"""
This function identifies the representative bin for each cluster of bins.
If metadata is provided, the selection is based on a numerical metadata
column. In case of a tie, the longest bin is chosen. If metadata is not
provided, the longest bin is selected by default.
Args:
mags: A MultiMAGSequencesDirFmt object containing all bins.
metadata: Qiime metadata.
column: Name of a column in metadata.
bin_clusters: A list of lists where each inner list contains the IDs
of bins grouped by similarity.
find_max: Boolean, if true the bin with the highest value in the
metadata column will be chosen, if false the lowest value
is chosen.
Returns:
A list of representative bin IDs, one for each cluster.
"""
bin_lengths = _get_bin_lengths(mags)
method = pd.Series.max if find_max else pd.Series.min

# Choose by metadata
if metadata is not None:
try:
metadata_col = metadata.to_dataframe()[column].astype(float)
except KeyError:
raise KeyError(f'The column "{column}" does not exist '
f'in the metadata.')
except ValueError:
raise ValueError('The specified metadata column has to be '
'numerical.')

representative_bins = []
for bins in bin_clusters:
# Get bin IDs with the max or min column values in cluster
values = metadata_col[bins]
selected_bins = values[values == method(values)].index

# If there's a tie, resolve by selecting the longest bin
if len(selected_bins) > 1:
representative_bins.append(bin_lengths[selected_bins].idxmax())
else:
representative_bins.append(selected_bins[0])

# Choose by length
else:
representative_bins = \
[bin_lengths[ids].idxmax() for ids in bin_clusters]

return representative_bins


def dereplicate_mags(
mags: MultiMAGSequencesDirFmt,
distance_matrix: skbio.DistanceMatrix,
threshold: float = 0.99,
metadata: Metadata = None,
metadata_column: str = "complete",
find_max: bool = True,
) -> (MAGSequencesDirFmt, pd.DataFrame):
distances = distance_matrix.to_data_frame()

# find similar bins, according to the threshold
bin_clusters = _find_similar_bins_fcluster(distances, threshold)

# find the longest bin in each cluster
bin_lengths = _get_bin_lengths(mags)
longest_bins = [bin_lengths[ids].idxmax() for ids in bin_clusters]
# select one representative bin per cluster by metadata and length
representative_bins = _get_representatives(
mags, metadata, metadata_column, bin_clusters, find_max
)

# generate a map between the original bins and the dereplicated bins
final_bins = _remap_bins(bin_clusters, longest_bins, distances)
final_bins = _remap_bins(bin_clusters, representative_bins, distances)

# generate dereplicated bin sequences
unique_bin_seqs = _write_unique_bins(mags, final_bins)
Expand Down
7 changes: 7 additions & 0 deletions q2_moshpit/dereplication/tests/data/busco_results.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
mag_id sample_id input_file dataset complete single duplicated fragmented missing n_markers scaffold_n50 contigs_n50 percent_gaps scaffolds length
24dee6fe-9b84-45bb-8145-de7b092533a1 SRR13221817 bec9c09a-62c3-4fbb-8f7f-9fdf9aefc02f.fasta bacteria_odb10 28.2 27.4 0.8 8.9 62.9 124 4785 4785 0.000% 265 1219165
ca7012fc-ba65-40c3-84f5-05aa478a7585 SRR13221817 5978e667-0476-4921-8cc2-34b9d1b508c1.fasta bacteria_odb10 1.6 1.6 0.0 1.6 96.8 124 3548 3548 0.000% 67 245922
fb0bc871-04f6-486b-a10e-8e0cb66f8de3 SRR13221817 625c95e6-ac2f-4e6e-9470-af8cd11c75dd.fasta bacteria_odb10 26.6 26.6 0.0 3.2 70.2 124 78679 78679 0.000% 17 714893
d65a71fa-4279-4588-b937-0747ed5d604d SRR13221817 77029d46-09f7-4d6e-940f-efcafc6ade9e.fasta bacteria_odb10 96.0 95.2 0.8 1.6 2.4 124 35651 35651 0.000% 165 4293244
db03f8b6-28e1-48c5-a47c-9c65f38f7357 SRR13221817 0367f168-9d6a-48d6-8ff8-b532123a13f5.fasta bacteria_odb10 96.0 96.0 0.0 0.8 3.2 124 52720 52720 0.000% 166 4893325
fa4d7420-d0a4-455a-b4d7-4fa66e54c9bf SRR13221817 509a5fe7-a5ab-42ea-8645-0510aa77e1fe.fasta bacteria_odb10 96.0 19.4 0.0 1.6 79.0 124 71847 71847 0.000% 20 761157
78 changes: 76 additions & 2 deletions q2_moshpit/dereplication/tests/test_dereplication.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,13 @@
import unittest

import pandas as pd
import qiime2
import skbio

from q2_moshpit.dereplication.derep import (
_find_similar_bins_fcluster, _get_bin_lengths, _remap_bins,
_reassign_bins_to_samples, _write_unique_bins, _generate_pa_table,
dereplicate_mags
dereplicate_mags, _get_representatives
)
from q2_types.feature_data_mag import MAGSequencesDirFmt
from q2_types.per_sample_sequences import MultiMAGSequencesDirFmt
Expand Down Expand Up @@ -72,6 +73,13 @@ def setUp(self):
'24dee6fe-9b84-45bb-8145-de7b092533a1': 0
}
}
self.busco_results = pd.read_csv(
filepath_or_buffer=self.get_data_path("busco_results.tsv"),
sep='\t',
index_col=0,
dtype='str'
)
self.busco_results.index.name = 'id'

def test_find_clusters_fcluster_similar(self):
obs = _find_similar_bins_fcluster(self.dist_matrix_df, 0.99)
Expand Down Expand Up @@ -138,7 +146,8 @@ def test_generate_pa_table(self):
def test_dereplicate_mags(self):
mags = MultiMAGSequencesDirFmt(self.get_data_path('mags'), mode='r')

obs_mags, obs_pa = dereplicate_mags(mags, self.dist_matrix, 0.99)
obs_mags, obs_pa = dereplicate_mags(mags, self.dist_matrix,
threshold=0.99)
exp_mags = MAGSequencesDirFmt(
self.get_data_path('mags-unique'), mode='r'
)
Expand All @@ -157,6 +166,71 @@ def test_dereplicate_mags(self):
# assert correct PA table was generated
pd.testing.assert_frame_equal(exp_pa, obs_pa)

def test_get_representatives_metadata_max_value(self):
obs = _get_representatives(
mags=self.bins,
metadata=qiime2.Metadata(self.busco_results),
column="complete",
bin_clusters=self.clusters_99,
find_max=True
)
exp = [
'24dee6fe-9b84-45bb-8145-de7b092533a1',
'fa4d7420-d0a4-455a-b4d7-4fa66e54c9bf',
'd65a71fa-4279-4588-b937-0747ed5d604d'
]
self.assertListEqual(exp, obs)

def test_get_representatives_metadata_min_value(self):
obs = _get_representatives(
mags=self.bins,
metadata=qiime2.Metadata(self.busco_results),
column="complete",
bin_clusters=self.clusters_99,
find_max=False
)
exp = [
'24dee6fe-9b84-45bb-8145-de7b092533a1',
'ca7012fc-ba65-40c3-84f5-05aa478a7585',
'fb0bc871-04f6-486b-a10e-8e0cb66f8de3'
]
self.assertListEqual(exp, obs)

def test_get_representatives_length(self):
obs = _get_representatives(
mags=self.bins,
metadata=None,
column=None,
bin_clusters=self.clusters_99,
find_max=True
)
exp = [
'24dee6fe-9b84-45bb-8145-de7b092533a1',
'ca7012fc-ba65-40c3-84f5-05aa478a7585',
'd65a71fa-4279-4588-b937-0747ed5d604d'
]
self.assertListEqual(exp, obs)

def test_get_representatives_key_error(self):
with self.assertRaisesRegex(KeyError, "version"):
_get_representatives(
mags=self.bins,
metadata=qiime2.Metadata(self.busco_results),
column="version",
bin_clusters=self.clusters_99,
find_max=True
)

def test_get_representatives_value_error(self):
with self.assertRaisesRegex(ValueError, "numerical"):
_get_representatives(
mags=self.bins,
metadata=qiime2.Metadata(self.busco_results),
column="dataset",
bin_clusters=self.clusters_99,
find_max=True
)


if __name__ == '__main__':
unittest.main()
27 changes: 22 additions & 5 deletions q2_moshpit/plugin_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -418,22 +418,32 @@
function=q2_moshpit.dereplication.dereplicate_mags,
inputs={
"mags": SampleData[MAGs],
"distance_matrix": DistanceMatrix
"distance_matrix": DistanceMatrix,
},
parameters={
"threshold": Float % Range(0, 1, inclusive_end=True)
"threshold": Float % Range(0, 1, inclusive_end=True),
"metadata": Metadata,
"metadata_column": Str,
"find_max": Bool
},
outputs=[
('dereplicated_mags', FeatureData[MAG]),
('feature_table', FeatureTable[PresenceAbsence])
],
input_descriptions={
"mags": "MAGs to be dereplicated.",
"distance_matrix": "Matrix of distances between MAGs."
"distance_matrix": "Matrix of distances between MAGs.",
},
parameter_descriptions={
"threshold": "Similarity threshold required to consider "
"two bins identical."
"two bins identical.",
"metadata": "Metadata table.",
"metadata_column": "Name of the metadata column used to choose the "
"most representative bins.",
"find_max": "Set to True to choose the bin with the highest value in "
"the metadata column. Set to False to choose the bin "
"with the lowest value.",

},
output_descriptions={
"dereplicated_mags": "Dereplicated MAGs.",
Expand All @@ -443,7 +453,14 @@
description='This method dereplicates MAGs from multiple samples '
'using distances between them found in the provided '
'distance matrix. For each cluster of similar MAGs, '
'the longest one will be selected as the representative.',
'the longest one will be selected as the representative. If '
'metadata is given as input, the MAG with the '
'highest or lowest value in the specified metadata column '
'is chosen, depending on the parameter "find-max". '
'If there are MAGs with identical values, the longer one is '
'chosen. For example an artifact of type BUSCOResults can be '
'passed as metadata, and the dereplication can be done by '
'highest "completeness".',
citations=[]
)

Expand Down

0 comments on commit f09a381

Please sign in to comment.