Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: Add busco-results input to dereplicate-mags #213

Merged
merged 17 commits into from
Dec 13, 2024
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 52 additions & 7 deletions q2_moshpit/dereplication/derep.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,7 @@ def _get_bin_lengths(mags: MultiMAGSequencesDirFmt) -> pd.Series:

def _remap_bins(
bin_clusters: List[List[str]],
longest_bins: List[str],
representative_bins: List[str],
distances: pd.DataFrame
) -> Dict[str, str]:
"""
Expand All @@ -129,7 +129,7 @@ def _remap_bins(
Args:
bin_clusters (list): A list of lists, where each inner list contains
the IDs of similar bins.
longest_bins (str): A list of longest bin for each cluster.
representative_bins (str): A list of longest bin for each cluster.
distances (pd.DataFrame): The original bin distance matrix.

Returns:
Expand All @@ -153,7 +153,7 @@ def _remap_bins(
final_bins = {}
for i, similar_bins in enumerate(bin_clusters):
for bin in similar_bins:
final_bins[bin] = longest_bins[i]
final_bins[bin] = representative_bins[i]
for bin in distances.index:
if bin not in final_bins:
final_bins[bin] = bin
Expand Down Expand Up @@ -257,22 +257,67 @@ def _generate_pa_table(
return presence_absence


def _get_representatives(mags, busco_results, bin_clusters):
"""
This function identifies the representative bin for each cluster of bins.
If `busco_results` is provided, the selection is first based on
completeness, and in case of a tie, the longest bin is chosen. If
`busco_results` is not available, the longest bin is selected by default.

Args:
mags: A MultiMAGSequencesDirFmt object containing all bins.
busco_results: A DataFrame containing BUSCO results.
bin_clusters: A list of lists where each inner list contains the IDs
of bins grouped by similarity.

Returns:
A list of representative bin IDs, one for each cluster.
"""
bin_lengths = _get_bin_lengths(mags)

# Choose by BUSCO results
if busco_results is not None:
bin_completeness = busco_results['complete']

representative_bins = []
for bins in bin_clusters:
# Get bins with the highest completeness values in cluster
completest_bins = (completeness_values := bin_completeness[bins])[
completeness_values == completeness_values.max()].index

# If there's a tie, resolve by selecting the longest bin
if len(completest_bins) > 1:
lengths_of_best_bins = bin_lengths[completest_bins]
representative_bins.append(lengths_of_best_bins.idxmax())
else:
representative_bins.append(completest_bins[0])
VinzentRisch marked this conversation as resolved.
Show resolved Hide resolved

# Choose by length
else:
representative_bins = \
[bin_lengths[ids].idxmax() for ids in bin_clusters]

return representative_bins


def dereplicate_mags(
mags: MultiMAGSequencesDirFmt,
distance_matrix: skbio.DistanceMatrix,
threshold: float = 0.99,
busco_results: pd.DataFrame = None
) -> (MAGSequencesDirFmt, pd.DataFrame):
distances = distance_matrix.to_data_frame()

# find similar bins, according to the threshold
bin_clusters = _find_similar_bins_fcluster(distances, threshold)

# find the longest bin in each cluster
bin_lengths = _get_bin_lengths(mags)
longest_bins = [bin_lengths[ids].idxmax() for ids in bin_clusters]
# select one representative bin per cluster by BUSCO results and length
representative_bins = (
_get_representatives(mags, busco_results, bin_clusters)
)
VinzentRisch marked this conversation as resolved.
Show resolved Hide resolved

# generate a map between the original bins and the dereplicated bins
final_bins = _remap_bins(bin_clusters, longest_bins, distances)
final_bins = _remap_bins(bin_clusters, representative_bins, distances)

# generate dereplicated bin sequences
unique_bin_seqs = _write_unique_bins(mags, final_bins)
Expand Down
7 changes: 7 additions & 0 deletions q2_moshpit/dereplication/tests/data/busco_results.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
mag_id sample_id input_file dataset complete single duplicated fragmented missing n_markers scaffold_n50 contigs_n50 percent_gaps scaffolds length
24dee6fe-9b84-45bb-8145-de7b092533a1 SRR13221817 bec9c09a-62c3-4fbb-8f7f-9fdf9aefc02f.fasta bacteria_odb10 28.2 27.4 0.8 8.9 62.9 124 4785 4785 0.000% 265 1219165
ca7012fc-ba65-40c3-84f5-05aa478a7585 SRR13221817 5978e667-0476-4921-8cc2-34b9d1b508c1.fasta bacteria_odb10 1.6 1.6 0.0 1.6 96.8 124 3548 3548 0.000% 67 245922
fb0bc871-04f6-486b-a10e-8e0cb66f8de3 SRR13221817 625c95e6-ac2f-4e6e-9470-af8cd11c75dd.fasta bacteria_odb10 26.6 26.6 0.0 3.2 70.2 124 78679 78679 0.000% 17 714893
d65a71fa-4279-4588-b937-0747ed5d604d SRR13221817 77029d46-09f7-4d6e-940f-efcafc6ade9e.fasta bacteria_odb10 96.0 95.2 0.8 1.6 2.4 124 35651 35651 0.000% 165 4293244
db03f8b6-28e1-48c5-a47c-9c65f38f7357 SRR13221817 0367f168-9d6a-48d6-8ff8-b532123a13f5.fasta bacteria_odb10 96.0 96.0 0.0 0.8 3.2 124 52720 52720 0.000% 166 4893325
fa4d7420-d0a4-455a-b4d7-4fa66e54c9bf SRR13221817 509a5fe7-a5ab-42ea-8645-0510aa77e1fe.fasta bacteria_odb10 96.0 19.4 0.0 1.6 79.0 124 71847 71847 0.000% 20 761157
35 changes: 34 additions & 1 deletion q2_moshpit/dereplication/tests/test_dereplication.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from q2_moshpit.dereplication.derep import (
_find_similar_bins_fcluster, _get_bin_lengths, _remap_bins,
_reassign_bins_to_samples, _write_unique_bins, _generate_pa_table,
dereplicate_mags
dereplicate_mags, _get_representatives
)
from q2_types.feature_data_mag import MAGSequencesDirFmt
from q2_types.per_sample_sequences import MultiMAGSequencesDirFmt
Expand Down Expand Up @@ -157,6 +157,39 @@ def test_dereplicate_mags(self):
# assert correct PA table was generated
pd.testing.assert_frame_equal(exp_pa, obs_pa)

def test_get_representatives_busco(self):
busco_results = pd.read_csv(
filepath_or_buffer=self.get_data_path("busco_results.tsv"),
sep='\t',
index_col=0,
dtype='str'
)
busco_results.index.name = 'id'
obs = _get_representatives(
mags=self.bins,
busco_results=busco_results,
bin_clusters=self.clusters_99,
)
exp = [
'24dee6fe-9b84-45bb-8145-de7b092533a1',
'fa4d7420-d0a4-455a-b4d7-4fa66e54c9bf',
'd65a71fa-4279-4588-b937-0747ed5d604d'
]
self.assertListEqual(exp, obs)

def test_get_representatives_length(self):
obs = _get_representatives(
mags=self.bins,
busco_results=None,
bin_clusters=self.clusters_99,
)
exp = [
'24dee6fe-9b84-45bb-8145-de7b092533a1',
'ca7012fc-ba65-40c3-84f5-05aa478a7585',
'd65a71fa-4279-4588-b937-0747ed5d604d'
]
self.assertListEqual(exp, obs)


if __name__ == '__main__':
unittest.main()
11 changes: 8 additions & 3 deletions q2_moshpit/plugin_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -418,7 +418,8 @@
function=q2_moshpit.dereplication.dereplicate_mags,
inputs={
"mags": SampleData[MAGs],
"distance_matrix": DistanceMatrix
"distance_matrix": DistanceMatrix,
"busco_results": BUSCOResults
VinzentRisch marked this conversation as resolved.
Show resolved Hide resolved
},
parameters={
"threshold": Float % Range(0, 1, inclusive_end=True)
Expand All @@ -429,7 +430,8 @@
],
input_descriptions={
"mags": "MAGs to be dereplicated.",
"distance_matrix": "Matrix of distances between MAGs."
"distance_matrix": "Matrix of distances between MAGs.",
"busco_results": "BUSCO results.",
},
parameter_descriptions={
"threshold": "Similarity threshold required to consider "
Expand All @@ -443,7 +445,10 @@
description='This method dereplicates MAGs from multiple samples '
'using distances between them found in the provided '
'distance matrix. For each cluster of similar MAGs, '
'the longest one will be selected as the representative.',
'the longest one will be selected as the representative. If '
'"busco-results" are given as input, the MAG with the '
'highest completness value is chosen. If there are MAGs with '
'identical completeness, the longer one is chosen.',
citations=[]
)

Expand Down
Loading