From 3553fb031e0893d5dce454b44653529aef0c371b Mon Sep 17 00:00:00 2001 From: VinzentRisch Date: Wed, 23 Oct 2024 19:35:01 +0200 Subject: [PATCH 01/13] added busco results to dereplicate_mags --- q2_moshpit/dereplication/derep.py | 30 +++++++++++++++++-- .../tests/data/busco_results.tsv | 7 +++++ .../dereplication/tests/test_dereplication.py | 20 ++++++++++++- q2_moshpit/plugin_setup.py | 9 ++++-- 4 files changed, 60 insertions(+), 6 deletions(-) create mode 100644 q2_moshpit/dereplication/tests/data/busco_results.tsv diff --git a/q2_moshpit/dereplication/derep.py b/q2_moshpit/dereplication/derep.py index 060decb6..d0ce7f4d 100644 --- a/q2_moshpit/dereplication/derep.py +++ b/q2_moshpit/dereplication/derep.py @@ -257,19 +257,45 @@ def _generate_pa_table( return presence_absence +def _get_busco_completeness(busco_results, bin_clusters, bin_lengths): + bin_completeness = busco_results['complete'] + + # Iterate through clusters and select the representative bin + representative_bins = [] + for ids in bin_clusters: + # Get completeness and length for the current cluster + best_bins = (completeness_values := bin_completeness[ids])[ + completeness_values == completeness_values.max()].index + # If there's a tie, resolve by selecting the longest bin + if len(best_bins) > 1: + lengths_of_best_bins = bin_lengths[best_bins] + representative_bins.append(lengths_of_best_bins.idxmax()) + else: + representative_bins.append(best_bins[0]) + + return representative_bins + + def dereplicate_mags( mags: MultiMAGSequencesDirFmt, distance_matrix: skbio.DistanceMatrix, threshold: float = 0.99, + busco_results: pd.DataFrame = None ) -> (MAGSequencesDirFmt, pd.DataFrame): distances = distance_matrix.to_data_frame() # find similar bins, according to the threshold bin_clusters = _find_similar_bins_fcluster(distances, threshold) - # find the longest bin in each cluster bin_lengths = _get_bin_lengths(mags) - longest_bins = [bin_lengths[ids].idxmax() for ids in bin_clusters] + + if busco_results is not None: + representative_bins = ( + _get_busco_completeness(busco_results, bin_clusters, bin_lengths) + ) + else: + representative_bins = \ + [bin_lengths[ids].idxmax() for ids in bin_clusters] # generate a map between the original bins and the dereplicated bins final_bins = _remap_bins(bin_clusters, longest_bins, distances) diff --git a/q2_moshpit/dereplication/tests/data/busco_results.tsv b/q2_moshpit/dereplication/tests/data/busco_results.tsv new file mode 100644 index 00000000..e845ef3f --- /dev/null +++ b/q2_moshpit/dereplication/tests/data/busco_results.tsv @@ -0,0 +1,7 @@ +mag_id sample_id input_file dataset complete single duplicated fragmented missing n_markers scaffold_n50 contigs_n50 percent_gaps scaffolds length +24dee6fe-9b84-45bb-8145-de7b092533a1 SRR13221817 bec9c09a-62c3-4fbb-8f7f-9fdf9aefc02f.fasta bacteria_odb10 28.2 27.4 0.8 8.9 62.9 124 4785 4785 0.000% 265 1219165 +ca7012fc-ba65-40c3-84f5-05aa478a7585 SRR13221817 5978e667-0476-4921-8cc2-34b9d1b508c1.fasta bacteria_odb10 1.6 1.6 0.0 1.6 96.8 124 3548 3548 0.000% 67 245922 +fb0bc871-04f6-486b-a10e-8e0cb66f8de3 SRR13221817 625c95e6-ac2f-4e6e-9470-af8cd11c75dd.fasta bacteria_odb10 26.6 26.6 0.0 3.2 70.2 124 78679 78679 0.000% 17 714893 +d65a71fa-4279-4588-b937-0747ed5d604d SRR13221817 77029d46-09f7-4d6e-940f-efcafc6ade9e.fasta bacteria_odb10 96.0 95.2 0.8 1.6 2.4 124 35651 35651 0.000% 165 4293244 +db03f8b6-28e1-48c5-a47c-9c65f38f7357 SRR13221817 0367f168-9d6a-48d6-8ff8-b532123a13f5.fasta bacteria_odb10 96.0 96.0 0.0 0.8 3.2 124 52720 52720 0.000% 166 4893325 +fa4d7420-d0a4-455a-b4d7-4fa66e54c9bf SRR13221817 509a5fe7-a5ab-42ea-8645-0510aa77e1fe.fasta bacteria_odb10 96.0 19.4 0.0 1.6 79.0 124 71847 71847 0.000% 20 761157 \ No newline at end of file diff --git a/q2_moshpit/dereplication/tests/test_dereplication.py b/q2_moshpit/dereplication/tests/test_dereplication.py index f95fd3c7..abe8ed7b 100644 --- a/q2_moshpit/dereplication/tests/test_dereplication.py +++ b/q2_moshpit/dereplication/tests/test_dereplication.py @@ -16,7 +16,7 @@ from q2_moshpit.dereplication.derep import ( _find_similar_bins_fcluster, _get_bin_lengths, _remap_bins, _reassign_bins_to_samples, _write_unique_bins, _generate_pa_table, - dereplicate_mags + dereplicate_mags, _get_busco_completeness ) from q2_types.feature_data_mag import MAGSequencesDirFmt from q2_types.per_sample_sequences import MultiMAGSequencesDirFmt @@ -157,6 +157,24 @@ def test_dereplicate_mags(self): # assert correct PA table was generated pd.testing.assert_frame_equal(exp_pa, obs_pa) + def test_get_busco_completeness(self): + busco_results = pd.read_csv(self.get_data_path("busco_results.tsv"), sep='\t', header=0, index_col=0, + dtype='str') + busco_results.index.name = 'id' + bin_lengths = pd.Series( + [1935, 3000, 2000, 3000, 2000, 3000], name='length', + index=[ + '24dee6fe-9b84-45bb-8145-de7b092533a1', + 'ca7012fc-ba65-40c3-84f5-05aa478a7585', + 'fb0bc871-04f6-486b-a10e-8e0cb66f8de3', + 'd65a71fa-4279-4588-b937-0747ed5d604d', + 'db03f8b6-28e1-48c5-a47c-9c65f38f7357', + 'fa4d7420-d0a4-455a-b4d7-4fa66e54c9bf', + + ] + ) + chosen_bins = _get_busco_completeness(busco_results, self.clusters_99, bin_lengths) + print(chosen_bins) if __name__ == '__main__': unittest.main() diff --git a/q2_moshpit/plugin_setup.py b/q2_moshpit/plugin_setup.py index 829ede6c..48c196b3 100644 --- a/q2_moshpit/plugin_setup.py +++ b/q2_moshpit/plugin_setup.py @@ -418,7 +418,8 @@ function=q2_moshpit.dereplication.dereplicate_mags, inputs={ "mags": SampleData[MAGs], - "distance_matrix": DistanceMatrix + "distance_matrix": DistanceMatrix, + "busco_results": BUSCOResults }, parameters={ "threshold": Float % Range(0, 1, inclusive_end=True) @@ -429,8 +430,10 @@ ], input_descriptions={ "mags": "MAGs to be dereplicated.", - "distance_matrix": "Matrix of distances between MAGs." - }, + "distance_matrix": "Matrix of distances between MAGs.", + "busco_results": "BUSCO results.", + +}, parameter_descriptions={ "threshold": "Similarity threshold required to consider " "two bins identical." From 67303747e9d2d30316e0a51179c231f0b2312d12 Mon Sep 17 00:00:00 2001 From: VinzentRisch Date: Thu, 24 Oct 2024 10:56:02 +0200 Subject: [PATCH 02/13] moved if logic into helper function renamed variables --- q2_moshpit/dereplication/derep.py | 71 ++++++++++++------- .../dereplication/tests/test_dereplication.py | 47 +++++++----- q2_moshpit/plugin_setup.py | 5 +- 3 files changed, 80 insertions(+), 43 deletions(-) diff --git a/q2_moshpit/dereplication/derep.py b/q2_moshpit/dereplication/derep.py index d0ce7f4d..84c626fa 100644 --- a/q2_moshpit/dereplication/derep.py +++ b/q2_moshpit/dereplication/derep.py @@ -119,7 +119,7 @@ def _get_bin_lengths(mags: MultiMAGSequencesDirFmt) -> pd.Series: def _remap_bins( bin_clusters: List[List[str]], - longest_bins: List[str], + representative_bins: List[str], distances: pd.DataFrame ) -> Dict[str, str]: """ @@ -129,7 +129,7 @@ def _remap_bins( Args: bin_clusters (list): A list of lists, where each inner list contains the IDs of similar bins. - longest_bins (str): A list of longest bin for each cluster. + representative_bins (str): A list of longest bin for each cluster. distances (pd.DataFrame): The original bin distance matrix. Returns: @@ -153,7 +153,7 @@ def _remap_bins( final_bins = {} for i, similar_bins in enumerate(bin_clusters): for bin in similar_bins: - final_bins[bin] = longest_bins[i] + final_bins[bin] = representative_bins[i] for bin in distances.index: if bin not in final_bins: final_bins[bin] = bin @@ -257,21 +257,45 @@ def _generate_pa_table( return presence_absence -def _get_busco_completeness(busco_results, bin_clusters, bin_lengths): - bin_completeness = busco_results['complete'] +def _get_representatives(mags, busco_results, bin_clusters): + """ + This function identifies the representative bin for each cluster of bins. + If `busco_results` is provided, the selection is first based on + completeness, and in case of a tie, the longest bin is chosen. If + `busco_results` is not available, the longest bin is selected by default. - # Iterate through clusters and select the representative bin - representative_bins = [] - for ids in bin_clusters: - # Get completeness and length for the current cluster - best_bins = (completeness_values := bin_completeness[ids])[ - completeness_values == completeness_values.max()].index - # If there's a tie, resolve by selecting the longest bin - if len(best_bins) > 1: - lengths_of_best_bins = bin_lengths[best_bins] - representative_bins.append(lengths_of_best_bins.idxmax()) - else: - representative_bins.append(best_bins[0]) + Args: + mags: A MultiMAGSequencesDirFmt object containing all bins. + busco_results: A DataFrame containing BUSCO results. + bin_clusters: A list of lists where each inner list contains the IDs + of bins grouped by similarity. + + Returns: + A list of representative bin IDs, one for each cluster. + """ + bin_lengths = _get_bin_lengths(mags) + + # Choose by BUSCO results + if busco_results is not None: + bin_completeness = busco_results['complete'] + + representative_bins = [] + for bins in bin_clusters: + # Get bins with the highest completeness values in cluster + completest_bins = (completeness_values := bin_completeness[bins])[ + completeness_values == completeness_values.max()].index + + # If there's a tie, resolve by selecting the longest bin + if len(completest_bins) > 1: + lengths_of_best_bins = bin_lengths[completest_bins] + representative_bins.append(lengths_of_best_bins.idxmax()) + else: + representative_bins.append(completest_bins[0]) + + # Choose by length + else: + representative_bins = \ + [bin_lengths[ids].idxmax() for ids in bin_clusters] return representative_bins @@ -287,18 +311,13 @@ def dereplicate_mags( # find similar bins, according to the threshold bin_clusters = _find_similar_bins_fcluster(distances, threshold) - bin_lengths = _get_bin_lengths(mags) - - if busco_results is not None: - representative_bins = ( - _get_busco_completeness(busco_results, bin_clusters, bin_lengths) + # select one representative bin per cluster by BUSCO results and length + representative_bins = ( + _get_representatives(mags, busco_results, bin_clusters) ) - else: - representative_bins = \ - [bin_lengths[ids].idxmax() for ids in bin_clusters] # generate a map between the original bins and the dereplicated bins - final_bins = _remap_bins(bin_clusters, longest_bins, distances) + final_bins = _remap_bins(bin_clusters, representative_bins, distances) # generate dereplicated bin sequences unique_bin_seqs = _write_unique_bins(mags, final_bins) diff --git a/q2_moshpit/dereplication/tests/test_dereplication.py b/q2_moshpit/dereplication/tests/test_dereplication.py index abe8ed7b..ff1adfd5 100644 --- a/q2_moshpit/dereplication/tests/test_dereplication.py +++ b/q2_moshpit/dereplication/tests/test_dereplication.py @@ -16,7 +16,7 @@ from q2_moshpit.dereplication.derep import ( _find_similar_bins_fcluster, _get_bin_lengths, _remap_bins, _reassign_bins_to_samples, _write_unique_bins, _generate_pa_table, - dereplicate_mags, _get_busco_completeness + dereplicate_mags, _get_representatives ) from q2_types.feature_data_mag import MAGSequencesDirFmt from q2_types.per_sample_sequences import MultiMAGSequencesDirFmt @@ -157,24 +157,39 @@ def test_dereplicate_mags(self): # assert correct PA table was generated pd.testing.assert_frame_equal(exp_pa, obs_pa) - def test_get_busco_completeness(self): - busco_results = pd.read_csv(self.get_data_path("busco_results.tsv"), sep='\t', header=0, index_col=0, - dtype='str') + def test_get_representatives_busco(self): + busco_results = pd.read_csv( + filepath_or_buffer=self.get_data_path("busco_results.tsv"), + sep='\t', + index_col=0, + dtype='str' + ) busco_results.index.name = 'id' - bin_lengths = pd.Series( - [1935, 3000, 2000, 3000, 2000, 3000], name='length', - index=[ - '24dee6fe-9b84-45bb-8145-de7b092533a1', - 'ca7012fc-ba65-40c3-84f5-05aa478a7585', - 'fb0bc871-04f6-486b-a10e-8e0cb66f8de3', - 'd65a71fa-4279-4588-b937-0747ed5d604d', - 'db03f8b6-28e1-48c5-a47c-9c65f38f7357', - 'fa4d7420-d0a4-455a-b4d7-4fa66e54c9bf', + obs = _get_representatives( + mags=self.bins, + busco_results=busco_results, + bin_clusters=self.clusters_99, + ) + exp = [ + '24dee6fe-9b84-45bb-8145-de7b092533a1', + 'fa4d7420-d0a4-455a-b4d7-4fa66e54c9bf', + 'd65a71fa-4279-4588-b937-0747ed5d604d' + ] + self.assertListEqual(exp, obs) - ] + def test_get_representatives_length(self): + obs = _get_representatives( + mags=self.bins, + busco_results=None, + bin_clusters=self.clusters_99, ) - chosen_bins = _get_busco_completeness(busco_results, self.clusters_99, bin_lengths) - print(chosen_bins) + exp = [ + '24dee6fe-9b84-45bb-8145-de7b092533a1', + 'ca7012fc-ba65-40c3-84f5-05aa478a7585', + 'd65a71fa-4279-4588-b937-0747ed5d604d' + ] + self.assertListEqual(exp, obs) + if __name__ == '__main__': unittest.main() diff --git a/q2_moshpit/plugin_setup.py b/q2_moshpit/plugin_setup.py index 48c196b3..79e6befe 100644 --- a/q2_moshpit/plugin_setup.py +++ b/q2_moshpit/plugin_setup.py @@ -446,7 +446,10 @@ description='This method dereplicates MAGs from multiple samples ' 'using distances between them found in the provided ' 'distance matrix. For each cluster of similar MAGs, ' - 'the longest one will be selected as the representative.', + 'the longest one will be selected as the representative. If ' + '"busco-results" are given as input, the MAG with the ' + 'highest completness value is chosen. If there are MAGs with ' + 'identical completeness, the longer one is chosen.', citations=[] ) From 2a92759c97987e6e9dce7ca48080cc52092e52f2 Mon Sep 17 00:00:00 2001 From: VinzentRisch Date: Thu, 24 Oct 2024 10:57:37 +0200 Subject: [PATCH 03/13] lint --- q2_moshpit/plugin_setup.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/q2_moshpit/plugin_setup.py b/q2_moshpit/plugin_setup.py index 79e6befe..de495f33 100644 --- a/q2_moshpit/plugin_setup.py +++ b/q2_moshpit/plugin_setup.py @@ -432,8 +432,7 @@ "mags": "MAGs to be dereplicated.", "distance_matrix": "Matrix of distances between MAGs.", "busco_results": "BUSCO results.", - -}, + }, parameter_descriptions={ "threshold": "Similarity threshold required to consider " "two bins identical." From ee666c020bf78265648a710dd7cc04bf0d0e7cc6 Mon Sep 17 00:00:00 2001 From: VinzentRisch <100149044+VinzentRisch@users.noreply.github.com> Date: Mon, 4 Nov 2024 10:04:28 +0100 Subject: [PATCH 04/13] Update q2_moshpit/dereplication/derep.py Co-authored-by: Michal Ziemski --- q2_moshpit/dereplication/derep.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/q2_moshpit/dereplication/derep.py b/q2_moshpit/dereplication/derep.py index 84c626fa..270eae22 100644 --- a/q2_moshpit/dereplication/derep.py +++ b/q2_moshpit/dereplication/derep.py @@ -312,9 +312,9 @@ def dereplicate_mags( bin_clusters = _find_similar_bins_fcluster(distances, threshold) # select one representative bin per cluster by BUSCO results and length - representative_bins = ( - _get_representatives(mags, busco_results, bin_clusters) - ) + representative_bins = _get_representatives( + mags, busco_results, bin_clusters + ) # generate a map between the original bins and the dereplicated bins final_bins = _remap_bins(bin_clusters, representative_bins, distances) From 7e1e59de4a2573894699e400f617479726beb4d8 Mon Sep 17 00:00:00 2001 From: VinzentRisch <100149044+VinzentRisch@users.noreply.github.com> Date: Mon, 4 Nov 2024 10:04:44 +0100 Subject: [PATCH 05/13] Update q2_moshpit/dereplication/derep.py Co-authored-by: Michal Ziemski --- q2_moshpit/dereplication/derep.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/q2_moshpit/dereplication/derep.py b/q2_moshpit/dereplication/derep.py index 270eae22..14849af6 100644 --- a/q2_moshpit/dereplication/derep.py +++ b/q2_moshpit/dereplication/derep.py @@ -278,17 +278,15 @@ def _get_representatives(mags, busco_results, bin_clusters): # Choose by BUSCO results if busco_results is not None: bin_completeness = busco_results['complete'] - + representative_bins = [] for bins in bin_clusters: # Get bins with the highest completeness values in cluster - completest_bins = (completeness_values := bin_completeness[bins])[ - completeness_values == completeness_values.max()].index - + completest_bins = bin_completeness[bins].idxmax() + # If there's a tie, resolve by selecting the longest bin if len(completest_bins) > 1: - lengths_of_best_bins = bin_lengths[completest_bins] - representative_bins.append(lengths_of_best_bins.idxmax()) + representative_bins.append(bin_lengths[completest_bins].idxmax()) else: representative_bins.append(completest_bins[0]) From e9b3e91df03696443c669e8858b2fc5210b2c4e9 Mon Sep 17 00:00:00 2001 From: VinzentRisch Date: Mon, 4 Nov 2024 16:40:54 +0100 Subject: [PATCH 06/13] added metadata and column parameters --- q2_moshpit/dereplication/derep.py | 50 +++++++++++++------ .../dereplication/tests/test_dereplication.py | 42 ++++++++++++---- q2_moshpit/plugin_setup.py | 20 +++++--- 3 files changed, 80 insertions(+), 32 deletions(-) diff --git a/q2_moshpit/dereplication/derep.py b/q2_moshpit/dereplication/derep.py index 14849af6..101a1039 100644 --- a/q2_moshpit/dereplication/derep.py +++ b/q2_moshpit/dereplication/derep.py @@ -12,6 +12,7 @@ import pandas as pd import skbio from q2_types.feature_data import DNAFASTAFormat +from qiime2 import Metadata from scipy.cluster.hierarchy import ward, fcluster from q2_types.feature_data_mag import MAGSequencesDirFmt @@ -257,16 +258,17 @@ def _generate_pa_table( return presence_absence -def _get_representatives(mags, busco_results, bin_clusters): +def _get_representatives(mags, metadata, column_name, bin_clusters): """ This function identifies the representative bin for each cluster of bins. - If `busco_results` is provided, the selection is first based on - completeness, and in case of a tie, the longest bin is chosen. If - `busco_results` is not available, the longest bin is selected by default. + If metadata is provided, the selection is based on a numerical metadata + column. In case of a tie, the longest bin is chosen. If metadata is not + provided, the longest bin is selected by default. Args: mags: A MultiMAGSequencesDirFmt object containing all bins. - busco_results: A DataFrame containing BUSCO results. + metadata: Qiime metadata. + column_name: Name of a column in metadata. bin_clusters: A list of lists where each inner list contains the IDs of bins grouped by similarity. @@ -275,20 +277,37 @@ def _get_representatives(mags, busco_results, bin_clusters): """ bin_lengths = _get_bin_lengths(mags) - # Choose by BUSCO results - if busco_results is not None: - bin_completeness = busco_results['complete'] + # Choose by metadata + if metadata is not None: + try: + metadata_column = ( + metadata.to_dataframe()[column_name].astype(float) + ) + except KeyError: + raise KeyError(f'The column "{column_name}" does not exist ' + f'in the metadata.') + except ValueError: + raise ValueError('The specified metadata column has to be ' + 'numerical.') representative_bins = [] for bins in bin_clusters: - # Get bins with the highest completeness values in cluster - completest_bins = bin_completeness[bins].idxmax() + # Get bin IDs with the highest column values in cluster + try: + highest_value_bins = ( + values := metadata_column[bins] + )[values == values.max()].index + except TypeError: + raise TypeError(f'The column "{column_name}" has to be ' + f'numerical.') # If there's a tie, resolve by selecting the longest bin - if len(completest_bins) > 1: - representative_bins.append(bin_lengths[completest_bins].idxmax()) + if len(highest_value_bins) > 1: + representative_bins.append( + bin_lengths[highest_value_bins].idxmax() + ) else: - representative_bins.append(completest_bins[0]) + representative_bins.append(highest_value_bins[0]) # Choose by length else: @@ -301,8 +320,9 @@ def _get_representatives(mags, busco_results, bin_clusters): def dereplicate_mags( mags: MultiMAGSequencesDirFmt, distance_matrix: skbio.DistanceMatrix, + metadata: Metadata = None, + metadata_column: str = "complete", threshold: float = 0.99, - busco_results: pd.DataFrame = None ) -> (MAGSequencesDirFmt, pd.DataFrame): distances = distance_matrix.to_data_frame() @@ -311,7 +331,7 @@ def dereplicate_mags( # select one representative bin per cluster by BUSCO results and length representative_bins = _get_representatives( - mags, busco_results, bin_clusters + mags, metadata, metadata_column, bin_clusters ) # generate a map between the original bins and the dereplicated bins diff --git a/q2_moshpit/dereplication/tests/test_dereplication.py b/q2_moshpit/dereplication/tests/test_dereplication.py index ff1adfd5..8b67eb01 100644 --- a/q2_moshpit/dereplication/tests/test_dereplication.py +++ b/q2_moshpit/dereplication/tests/test_dereplication.py @@ -11,6 +11,7 @@ import unittest import pandas as pd +import qiime2 import skbio from q2_moshpit.dereplication.derep import ( @@ -72,6 +73,13 @@ def setUp(self): '24dee6fe-9b84-45bb-8145-de7b092533a1': 0 } } + self.busco_results = pd.read_csv( + filepath_or_buffer=self.get_data_path("busco_results.tsv"), + sep='\t', + index_col=0, + dtype='str' + ) + self.busco_results.index.name = 'id' def test_find_clusters_fcluster_similar(self): obs = _find_similar_bins_fcluster(self.dist_matrix_df, 0.99) @@ -138,7 +146,8 @@ def test_generate_pa_table(self): def test_dereplicate_mags(self): mags = MultiMAGSequencesDirFmt(self.get_data_path('mags'), mode='r') - obs_mags, obs_pa = dereplicate_mags(mags, self.dist_matrix, 0.99) + obs_mags, obs_pa = dereplicate_mags(mags, self.dist_matrix, + threshold=0.99) exp_mags = MAGSequencesDirFmt( self.get_data_path('mags-unique'), mode='r' ) @@ -158,16 +167,10 @@ def test_dereplicate_mags(self): pd.testing.assert_frame_equal(exp_pa, obs_pa) def test_get_representatives_busco(self): - busco_results = pd.read_csv( - filepath_or_buffer=self.get_data_path("busco_results.tsv"), - sep='\t', - index_col=0, - dtype='str' - ) - busco_results.index.name = 'id' obs = _get_representatives( mags=self.bins, - busco_results=busco_results, + metadata=qiime2.Metadata(self.busco_results), + column_name="complete", bin_clusters=self.clusters_99, ) exp = [ @@ -180,7 +183,8 @@ def test_get_representatives_busco(self): def test_get_representatives_length(self): obs = _get_representatives( mags=self.bins, - busco_results=None, + metadata=None, + column_name=None, bin_clusters=self.clusters_99, ) exp = [ @@ -190,6 +194,24 @@ def test_get_representatives_length(self): ] self.assertListEqual(exp, obs) + def test_get_representatives_key_error(self): + with self.assertRaisesRegex(KeyError, "version"): + _get_representatives( + mags=self.bins, + metadata=qiime2.Metadata(self.busco_results), + column_name="version", + bin_clusters=self.clusters_99, + ) + + def test_get_representatives_value_error(self): + with self.assertRaisesRegex(ValueError, "numerical"): + _get_representatives( + mags=self.bins, + metadata=qiime2.Metadata(self.busco_results), + column_name="dataset", + bin_clusters=self.clusters_99, + ) + if __name__ == '__main__': unittest.main() diff --git a/q2_moshpit/plugin_setup.py b/q2_moshpit/plugin_setup.py index de495f33..a5d301ef 100644 --- a/q2_moshpit/plugin_setup.py +++ b/q2_moshpit/plugin_setup.py @@ -419,10 +419,11 @@ inputs={ "mags": SampleData[MAGs], "distance_matrix": DistanceMatrix, - "busco_results": BUSCOResults }, parameters={ - "threshold": Float % Range(0, 1, inclusive_end=True) + "threshold": Float % Range(0, 1, inclusive_end=True), + "metadata": Metadata, + "metadata_column": Str }, outputs=[ ('dereplicated_mags', FeatureData[MAG]), @@ -431,11 +432,13 @@ input_descriptions={ "mags": "MAGs to be dereplicated.", "distance_matrix": "Matrix of distances between MAGs.", - "busco_results": "BUSCO results.", }, parameter_descriptions={ "threshold": "Similarity threshold required to consider " - "two bins identical." + "two bins identical.", + "metadata": "Metadata table.", + "metadata_column": "Name of the metadata column used to choose the " + "most representative bins." }, output_descriptions={ "dereplicated_mags": "Dereplicated MAGs.", @@ -446,9 +449,12 @@ 'using distances between them found in the provided ' 'distance matrix. For each cluster of similar MAGs, ' 'the longest one will be selected as the representative. If ' - '"busco-results" are given as input, the MAG with the ' - 'highest completness value is chosen. If there are MAGs with ' - 'identical completeness, the longer one is chosen.', + 'metadata is given as input, the MAG with the ' + 'highest value in the specified metadata column is chosen. ' + 'If there are MAGs with identical values, the longer one is ' + 'chosen. For example an artifact of type BUSCOResults can be ' + 'passed as metadata, and the dereplication can be done by ' + 'completness.', citations=[] ) From 992c947494eeae49c2cea8f93fad69f150e87b6f Mon Sep 17 00:00:00 2001 From: VinzentRisch Date: Mon, 4 Nov 2024 16:44:00 +0100 Subject: [PATCH 07/13] lint --- q2_moshpit/dereplication/derep.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/q2_moshpit/dereplication/derep.py b/q2_moshpit/dereplication/derep.py index 101a1039..77c71c07 100644 --- a/q2_moshpit/dereplication/derep.py +++ b/q2_moshpit/dereplication/derep.py @@ -289,7 +289,7 @@ def _get_representatives(mags, metadata, column_name, bin_clusters): except ValueError: raise ValueError('The specified metadata column has to be ' 'numerical.') - + representative_bins = [] for bins in bin_clusters: # Get bin IDs with the highest column values in cluster @@ -300,7 +300,7 @@ def _get_representatives(mags, metadata, column_name, bin_clusters): except TypeError: raise TypeError(f'The column "{column_name}" has to be ' f'numerical.') - + # If there's a tie, resolve by selecting the longest bin if len(highest_value_bins) > 1: representative_bins.append( From d077ae200e611dc3ff633e83b83f362c802cf446 Mon Sep 17 00:00:00 2001 From: VinzentRisch Date: Wed, 6 Nov 2024 15:22:36 +0100 Subject: [PATCH 08/13] removed rudentant value error --- q2_moshpit/dereplication/derep.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/q2_moshpit/dereplication/derep.py b/q2_moshpit/dereplication/derep.py index 77c71c07..3fe8b053 100644 --- a/q2_moshpit/dereplication/derep.py +++ b/q2_moshpit/dereplication/derep.py @@ -293,13 +293,9 @@ def _get_representatives(mags, metadata, column_name, bin_clusters): representative_bins = [] for bins in bin_clusters: # Get bin IDs with the highest column values in cluster - try: - highest_value_bins = ( - values := metadata_column[bins] - )[values == values.max()].index - except TypeError: - raise TypeError(f'The column "{column_name}" has to be ' - f'numerical.') + highest_value_bins = ( + values := metadata_column[bins] + )[values == values.max()].index # If there's a tie, resolve by selecting the longest bin if len(highest_value_bins) > 1: From ef6f3d3c6ce44d6874ba36bf9a639f515750e7ac Mon Sep 17 00:00:00 2001 From: VinzentRisch Date: Thu, 7 Nov 2024 11:28:49 +0100 Subject: [PATCH 09/13] added find_max parameter --- q2_moshpit/dereplication/derep.py | 24 +++++++++++-------- .../dereplication/tests/test_dereplication.py | 21 +++++++++++++++- q2_moshpit/plugin_setup.py | 16 +++++++++---- 3 files changed, 45 insertions(+), 16 deletions(-) diff --git a/q2_moshpit/dereplication/derep.py b/q2_moshpit/dereplication/derep.py index 3fe8b053..1eabf680 100644 --- a/q2_moshpit/dereplication/derep.py +++ b/q2_moshpit/dereplication/derep.py @@ -258,7 +258,7 @@ def _generate_pa_table( return presence_absence -def _get_representatives(mags, metadata, column_name, bin_clusters): +def _get_representatives(mags, metadata, column_name, bin_clusters, find_max): """ This function identifies the representative bin for each cluster of bins. If metadata is provided, the selection is based on a numerical metadata @@ -271,6 +271,9 @@ def _get_representatives(mags, metadata, column_name, bin_clusters): column_name: Name of a column in metadata. bin_clusters: A list of lists where each inner list contains the IDs of bins grouped by similarity. + find_max: Boolean, if true the bin with the highest value in the + metadata column will be chosen, if false the lowest value + is chosen. Returns: A list of representative bin IDs, one for each cluster. @@ -292,18 +295,18 @@ def _get_representatives(mags, metadata, column_name, bin_clusters): representative_bins = [] for bins in bin_clusters: - # Get bin IDs with the highest column values in cluster - highest_value_bins = ( + # Get bin IDs with the max or min column values in cluster + max_min_value_bins = ( values := metadata_column[bins] - )[values == values.max()].index + )[values == (values.max() if find_max else values.min())].index # If there's a tie, resolve by selecting the longest bin - if len(highest_value_bins) > 1: + if len(max_min_value_bins) > 1: representative_bins.append( - bin_lengths[highest_value_bins].idxmax() + bin_lengths[max_min_value_bins].idxmax() ) else: - representative_bins.append(highest_value_bins[0]) + representative_bins.append(max_min_value_bins[0]) # Choose by length else: @@ -316,18 +319,19 @@ def _get_representatives(mags, metadata, column_name, bin_clusters): def dereplicate_mags( mags: MultiMAGSequencesDirFmt, distance_matrix: skbio.DistanceMatrix, + threshold: float = 0.99, metadata: Metadata = None, metadata_column: str = "complete", - threshold: float = 0.99, + find_max: bool = True, ) -> (MAGSequencesDirFmt, pd.DataFrame): distances = distance_matrix.to_data_frame() # find similar bins, according to the threshold bin_clusters = _find_similar_bins_fcluster(distances, threshold) - # select one representative bin per cluster by BUSCO results and length + # select one representative bin per cluster by metadata and length representative_bins = _get_representatives( - mags, metadata, metadata_column, bin_clusters + mags, metadata, metadata_column, bin_clusters, find_max ) # generate a map between the original bins and the dereplicated bins diff --git a/q2_moshpit/dereplication/tests/test_dereplication.py b/q2_moshpit/dereplication/tests/test_dereplication.py index 8b67eb01..f6e9fa80 100644 --- a/q2_moshpit/dereplication/tests/test_dereplication.py +++ b/q2_moshpit/dereplication/tests/test_dereplication.py @@ -166,12 +166,13 @@ def test_dereplicate_mags(self): # assert correct PA table was generated pd.testing.assert_frame_equal(exp_pa, obs_pa) - def test_get_representatives_busco(self): + def test_get_representatives_metadata_max_value(self): obs = _get_representatives( mags=self.bins, metadata=qiime2.Metadata(self.busco_results), column_name="complete", bin_clusters=self.clusters_99, + find_max=True ) exp = [ '24dee6fe-9b84-45bb-8145-de7b092533a1', @@ -180,12 +181,28 @@ def test_get_representatives_busco(self): ] self.assertListEqual(exp, obs) + def test_get_representatives_metadata_min_value(self): + obs = _get_representatives( + mags=self.bins, + metadata=qiime2.Metadata(self.busco_results), + column_name="complete", + bin_clusters=self.clusters_99, + find_max=False + ) + exp = [ + '24dee6fe-9b84-45bb-8145-de7b092533a1', + 'ca7012fc-ba65-40c3-84f5-05aa478a7585', + 'fb0bc871-04f6-486b-a10e-8e0cb66f8de3' + ] + self.assertListEqual(exp, obs) + def test_get_representatives_length(self): obs = _get_representatives( mags=self.bins, metadata=None, column_name=None, bin_clusters=self.clusters_99, + find_max=True ) exp = [ '24dee6fe-9b84-45bb-8145-de7b092533a1', @@ -201,6 +218,7 @@ def test_get_representatives_key_error(self): metadata=qiime2.Metadata(self.busco_results), column_name="version", bin_clusters=self.clusters_99, + find_max=True ) def test_get_representatives_value_error(self): @@ -210,6 +228,7 @@ def test_get_representatives_value_error(self): metadata=qiime2.Metadata(self.busco_results), column_name="dataset", bin_clusters=self.clusters_99, + find_max=True ) diff --git a/q2_moshpit/plugin_setup.py b/q2_moshpit/plugin_setup.py index dc975af1..728dd272 100644 --- a/q2_moshpit/plugin_setup.py +++ b/q2_moshpit/plugin_setup.py @@ -423,7 +423,8 @@ parameters={ "threshold": Float % Range(0, 1, inclusive_end=True), "metadata": Metadata, - "metadata_column": Str + "metadata_column": Str, + "find_max": Bool }, outputs=[ ('dereplicated_mags', FeatureData[MAG]), @@ -438,8 +439,12 @@ "two bins identical.", "metadata": "Metadata table.", "metadata_column": "Name of the metadata column used to choose the " - "most representative bins." - }, + "most representative bins.", + "find_max": "Set to True to choose the bin with the highest value in " + "the metadata column. Set to False to choose the bin " + "with the lowest value.", + +}, output_descriptions={ "dereplicated_mags": "Dereplicated MAGs.", "feature_table": "Mapping between MAGs and samples." @@ -450,11 +455,12 @@ 'distance matrix. For each cluster of similar MAGs, ' 'the longest one will be selected as the representative. If ' 'metadata is given as input, the MAG with the ' - 'highest value in the specified metadata column is chosen. ' + 'highest or lowest value in the specified metadata column ' + 'is chosen, depending on the parameter "find-max". ' 'If there are MAGs with identical values, the longer one is ' 'chosen. For example an artifact of type BUSCOResults can be ' 'passed as metadata, and the dereplication can be done by ' - 'completness.', + 'highest "completness".', citations=[] ) From 924384fb5272412981c4dded04d4ab69555a2bc3 Mon Sep 17 00:00:00 2001 From: VinzentRisch Date: Thu, 7 Nov 2024 11:30:23 +0100 Subject: [PATCH 10/13] lint --- q2_moshpit/plugin_setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/q2_moshpit/plugin_setup.py b/q2_moshpit/plugin_setup.py index 728dd272..5ea561e6 100644 --- a/q2_moshpit/plugin_setup.py +++ b/q2_moshpit/plugin_setup.py @@ -444,7 +444,7 @@ "the metadata column. Set to False to choose the bin " "with the lowest value.", -}, + }, output_descriptions={ "dereplicated_mags": "Dereplicated MAGs.", "feature_table": "Mapping between MAGs and samples." From c90536ab68392928276444566a7a3f8e7be1b3fe Mon Sep 17 00:00:00 2001 From: Michal Ziemski Date: Thu, 12 Dec 2024 15:58:05 +0100 Subject: [PATCH 11/13] Update q2_moshpit/plugin_setup.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- q2_moshpit/plugin_setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/q2_moshpit/plugin_setup.py b/q2_moshpit/plugin_setup.py index 5ea561e6..a2c290c2 100644 --- a/q2_moshpit/plugin_setup.py +++ b/q2_moshpit/plugin_setup.py @@ -460,7 +460,7 @@ 'If there are MAGs with identical values, the longer one is ' 'chosen. For example an artifact of type BUSCOResults can be ' 'passed as metadata, and the dereplication can be done by ' - 'highest "completness".', + 'highest "completeness".', citations=[] ) From 3892a7ff07170e94c3faae2dbab51bb502f2d75c Mon Sep 17 00:00:00 2001 From: Michal Ziemski Date: Thu, 12 Dec 2024 16:18:58 +0100 Subject: [PATCH 12/13] Trigger CI From a18c13864b0f2f3d1d8b340260f46df097bdb6f4 Mon Sep 17 00:00:00 2001 From: Michal Ziemski Date: Fri, 13 Dec 2024 11:45:11 +0100 Subject: [PATCH 13/13] Simplify --- q2_moshpit/dereplication/derep.py | 24 ++++++++----------- .../dereplication/tests/test_dereplication.py | 10 ++++---- 2 files changed, 15 insertions(+), 19 deletions(-) diff --git a/q2_moshpit/dereplication/derep.py b/q2_moshpit/dereplication/derep.py index 1eabf680..5470162e 100644 --- a/q2_moshpit/dereplication/derep.py +++ b/q2_moshpit/dereplication/derep.py @@ -258,7 +258,7 @@ def _generate_pa_table( return presence_absence -def _get_representatives(mags, metadata, column_name, bin_clusters, find_max): +def _get_representatives(mags, metadata, column, bin_clusters, find_max): """ This function identifies the representative bin for each cluster of bins. If metadata is provided, the selection is based on a numerical metadata @@ -268,7 +268,7 @@ def _get_representatives(mags, metadata, column_name, bin_clusters, find_max): Args: mags: A MultiMAGSequencesDirFmt object containing all bins. metadata: Qiime metadata. - column_name: Name of a column in metadata. + column: Name of a column in metadata. bin_clusters: A list of lists where each inner list contains the IDs of bins grouped by similarity. find_max: Boolean, if true the bin with the highest value in the @@ -279,15 +279,14 @@ def _get_representatives(mags, metadata, column_name, bin_clusters, find_max): A list of representative bin IDs, one for each cluster. """ bin_lengths = _get_bin_lengths(mags) + method = pd.Series.max if find_max else pd.Series.min # Choose by metadata if metadata is not None: try: - metadata_column = ( - metadata.to_dataframe()[column_name].astype(float) - ) + metadata_col = metadata.to_dataframe()[column].astype(float) except KeyError: - raise KeyError(f'The column "{column_name}" does not exist ' + raise KeyError(f'The column "{column}" does not exist ' f'in the metadata.') except ValueError: raise ValueError('The specified metadata column has to be ' @@ -296,17 +295,14 @@ def _get_representatives(mags, metadata, column_name, bin_clusters, find_max): representative_bins = [] for bins in bin_clusters: # Get bin IDs with the max or min column values in cluster - max_min_value_bins = ( - values := metadata_column[bins] - )[values == (values.max() if find_max else values.min())].index + values = metadata_col[bins] + selected_bins = values[values == method(values)].index # If there's a tie, resolve by selecting the longest bin - if len(max_min_value_bins) > 1: - representative_bins.append( - bin_lengths[max_min_value_bins].idxmax() - ) + if len(selected_bins) > 1: + representative_bins.append(bin_lengths[selected_bins].idxmax()) else: - representative_bins.append(max_min_value_bins[0]) + representative_bins.append(selected_bins[0]) # Choose by length else: diff --git a/q2_moshpit/dereplication/tests/test_dereplication.py b/q2_moshpit/dereplication/tests/test_dereplication.py index f6e9fa80..3b4f345d 100644 --- a/q2_moshpit/dereplication/tests/test_dereplication.py +++ b/q2_moshpit/dereplication/tests/test_dereplication.py @@ -170,7 +170,7 @@ def test_get_representatives_metadata_max_value(self): obs = _get_representatives( mags=self.bins, metadata=qiime2.Metadata(self.busco_results), - column_name="complete", + column="complete", bin_clusters=self.clusters_99, find_max=True ) @@ -185,7 +185,7 @@ def test_get_representatives_metadata_min_value(self): obs = _get_representatives( mags=self.bins, metadata=qiime2.Metadata(self.busco_results), - column_name="complete", + column="complete", bin_clusters=self.clusters_99, find_max=False ) @@ -200,7 +200,7 @@ def test_get_representatives_length(self): obs = _get_representatives( mags=self.bins, metadata=None, - column_name=None, + column=None, bin_clusters=self.clusters_99, find_max=True ) @@ -216,7 +216,7 @@ def test_get_representatives_key_error(self): _get_representatives( mags=self.bins, metadata=qiime2.Metadata(self.busco_results), - column_name="version", + column="version", bin_clusters=self.clusters_99, find_max=True ) @@ -226,7 +226,7 @@ def test_get_representatives_value_error(self): _get_representatives( mags=self.bins, metadata=qiime2.Metadata(self.busco_results), - column_name="dataset", + column="dataset", bin_clusters=self.clusters_99, find_max=True )