diff --git a/q2_moshpit/dereplication/derep.py b/q2_moshpit/dereplication/derep.py index 060decb6..5470162e 100644 --- a/q2_moshpit/dereplication/derep.py +++ b/q2_moshpit/dereplication/derep.py @@ -12,6 +12,7 @@ import pandas as pd import skbio from q2_types.feature_data import DNAFASTAFormat +from qiime2 import Metadata from scipy.cluster.hierarchy import ward, fcluster from q2_types.feature_data_mag import MAGSequencesDirFmt @@ -119,7 +120,7 @@ def _get_bin_lengths(mags: MultiMAGSequencesDirFmt) -> pd.Series: def _remap_bins( bin_clusters: List[List[str]], - longest_bins: List[str], + representative_bins: List[str], distances: pd.DataFrame ) -> Dict[str, str]: """ @@ -129,7 +130,7 @@ def _remap_bins( Args: bin_clusters (list): A list of lists, where each inner list contains the IDs of similar bins. - longest_bins (str): A list of longest bin for each cluster. + representative_bins (str): A list of longest bin for each cluster. distances (pd.DataFrame): The original bin distance matrix. Returns: @@ -153,7 +154,7 @@ def _remap_bins( final_bins = {} for i, similar_bins in enumerate(bin_clusters): for bin in similar_bins: - final_bins[bin] = longest_bins[i] + final_bins[bin] = representative_bins[i] for bin in distances.index: if bin not in final_bins: final_bins[bin] = bin @@ -257,22 +258,80 @@ def _generate_pa_table( return presence_absence +def _get_representatives(mags, metadata, column, bin_clusters, find_max): + """ + This function identifies the representative bin for each cluster of bins. + If metadata is provided, the selection is based on a numerical metadata + column. In case of a tie, the longest bin is chosen. If metadata is not + provided, the longest bin is selected by default. + + Args: + mags: A MultiMAGSequencesDirFmt object containing all bins. + metadata: Qiime metadata. + column: Name of a column in metadata. + bin_clusters: A list of lists where each inner list contains the IDs + of bins grouped by similarity. + find_max: Boolean, if true the bin with the highest value in the + metadata column will be chosen, if false the lowest value + is chosen. + + Returns: + A list of representative bin IDs, one for each cluster. + """ + bin_lengths = _get_bin_lengths(mags) + method = pd.Series.max if find_max else pd.Series.min + + # Choose by metadata + if metadata is not None: + try: + metadata_col = metadata.to_dataframe()[column].astype(float) + except KeyError: + raise KeyError(f'The column "{column}" does not exist ' + f'in the metadata.') + except ValueError: + raise ValueError('The specified metadata column has to be ' + 'numerical.') + + representative_bins = [] + for bins in bin_clusters: + # Get bin IDs with the max or min column values in cluster + values = metadata_col[bins] + selected_bins = values[values == method(values)].index + + # If there's a tie, resolve by selecting the longest bin + if len(selected_bins) > 1: + representative_bins.append(bin_lengths[selected_bins].idxmax()) + else: + representative_bins.append(selected_bins[0]) + + # Choose by length + else: + representative_bins = \ + [bin_lengths[ids].idxmax() for ids in bin_clusters] + + return representative_bins + + def dereplicate_mags( mags: MultiMAGSequencesDirFmt, distance_matrix: skbio.DistanceMatrix, threshold: float = 0.99, + metadata: Metadata = None, + metadata_column: str = "complete", + find_max: bool = True, ) -> (MAGSequencesDirFmt, pd.DataFrame): distances = distance_matrix.to_data_frame() # find similar bins, according to the threshold bin_clusters = _find_similar_bins_fcluster(distances, threshold) - # find the longest bin in each cluster - bin_lengths = _get_bin_lengths(mags) - longest_bins = [bin_lengths[ids].idxmax() for ids in bin_clusters] + # select one representative bin per cluster by metadata and length + representative_bins = _get_representatives( + mags, metadata, metadata_column, bin_clusters, find_max + ) # generate a map between the original bins and the dereplicated bins - final_bins = _remap_bins(bin_clusters, longest_bins, distances) + final_bins = _remap_bins(bin_clusters, representative_bins, distances) # generate dereplicated bin sequences unique_bin_seqs = _write_unique_bins(mags, final_bins) diff --git a/q2_moshpit/dereplication/tests/data/busco_results.tsv b/q2_moshpit/dereplication/tests/data/busco_results.tsv new file mode 100644 index 00000000..e845ef3f --- /dev/null +++ b/q2_moshpit/dereplication/tests/data/busco_results.tsv @@ -0,0 +1,7 @@ +mag_id sample_id input_file dataset complete single duplicated fragmented missing n_markers scaffold_n50 contigs_n50 percent_gaps scaffolds length +24dee6fe-9b84-45bb-8145-de7b092533a1 SRR13221817 bec9c09a-62c3-4fbb-8f7f-9fdf9aefc02f.fasta bacteria_odb10 28.2 27.4 0.8 8.9 62.9 124 4785 4785 0.000% 265 1219165 +ca7012fc-ba65-40c3-84f5-05aa478a7585 SRR13221817 5978e667-0476-4921-8cc2-34b9d1b508c1.fasta bacteria_odb10 1.6 1.6 0.0 1.6 96.8 124 3548 3548 0.000% 67 245922 +fb0bc871-04f6-486b-a10e-8e0cb66f8de3 SRR13221817 625c95e6-ac2f-4e6e-9470-af8cd11c75dd.fasta bacteria_odb10 26.6 26.6 0.0 3.2 70.2 124 78679 78679 0.000% 17 714893 +d65a71fa-4279-4588-b937-0747ed5d604d SRR13221817 77029d46-09f7-4d6e-940f-efcafc6ade9e.fasta bacteria_odb10 96.0 95.2 0.8 1.6 2.4 124 35651 35651 0.000% 165 4293244 +db03f8b6-28e1-48c5-a47c-9c65f38f7357 SRR13221817 0367f168-9d6a-48d6-8ff8-b532123a13f5.fasta bacteria_odb10 96.0 96.0 0.0 0.8 3.2 124 52720 52720 0.000% 166 4893325 +fa4d7420-d0a4-455a-b4d7-4fa66e54c9bf SRR13221817 509a5fe7-a5ab-42ea-8645-0510aa77e1fe.fasta bacteria_odb10 96.0 19.4 0.0 1.6 79.0 124 71847 71847 0.000% 20 761157 \ No newline at end of file diff --git a/q2_moshpit/dereplication/tests/test_dereplication.py b/q2_moshpit/dereplication/tests/test_dereplication.py index f95fd3c7..3b4f345d 100644 --- a/q2_moshpit/dereplication/tests/test_dereplication.py +++ b/q2_moshpit/dereplication/tests/test_dereplication.py @@ -11,12 +11,13 @@ import unittest import pandas as pd +import qiime2 import skbio from q2_moshpit.dereplication.derep import ( _find_similar_bins_fcluster, _get_bin_lengths, _remap_bins, _reassign_bins_to_samples, _write_unique_bins, _generate_pa_table, - dereplicate_mags + dereplicate_mags, _get_representatives ) from q2_types.feature_data_mag import MAGSequencesDirFmt from q2_types.per_sample_sequences import MultiMAGSequencesDirFmt @@ -72,6 +73,13 @@ def setUp(self): '24dee6fe-9b84-45bb-8145-de7b092533a1': 0 } } + self.busco_results = pd.read_csv( + filepath_or_buffer=self.get_data_path("busco_results.tsv"), + sep='\t', + index_col=0, + dtype='str' + ) + self.busco_results.index.name = 'id' def test_find_clusters_fcluster_similar(self): obs = _find_similar_bins_fcluster(self.dist_matrix_df, 0.99) @@ -138,7 +146,8 @@ def test_generate_pa_table(self): def test_dereplicate_mags(self): mags = MultiMAGSequencesDirFmt(self.get_data_path('mags'), mode='r') - obs_mags, obs_pa = dereplicate_mags(mags, self.dist_matrix, 0.99) + obs_mags, obs_pa = dereplicate_mags(mags, self.dist_matrix, + threshold=0.99) exp_mags = MAGSequencesDirFmt( self.get_data_path('mags-unique'), mode='r' ) @@ -157,6 +166,71 @@ def test_dereplicate_mags(self): # assert correct PA table was generated pd.testing.assert_frame_equal(exp_pa, obs_pa) + def test_get_representatives_metadata_max_value(self): + obs = _get_representatives( + mags=self.bins, + metadata=qiime2.Metadata(self.busco_results), + column="complete", + bin_clusters=self.clusters_99, + find_max=True + ) + exp = [ + '24dee6fe-9b84-45bb-8145-de7b092533a1', + 'fa4d7420-d0a4-455a-b4d7-4fa66e54c9bf', + 'd65a71fa-4279-4588-b937-0747ed5d604d' + ] + self.assertListEqual(exp, obs) + + def test_get_representatives_metadata_min_value(self): + obs = _get_representatives( + mags=self.bins, + metadata=qiime2.Metadata(self.busco_results), + column="complete", + bin_clusters=self.clusters_99, + find_max=False + ) + exp = [ + '24dee6fe-9b84-45bb-8145-de7b092533a1', + 'ca7012fc-ba65-40c3-84f5-05aa478a7585', + 'fb0bc871-04f6-486b-a10e-8e0cb66f8de3' + ] + self.assertListEqual(exp, obs) + + def test_get_representatives_length(self): + obs = _get_representatives( + mags=self.bins, + metadata=None, + column=None, + bin_clusters=self.clusters_99, + find_max=True + ) + exp = [ + '24dee6fe-9b84-45bb-8145-de7b092533a1', + 'ca7012fc-ba65-40c3-84f5-05aa478a7585', + 'd65a71fa-4279-4588-b937-0747ed5d604d' + ] + self.assertListEqual(exp, obs) + + def test_get_representatives_key_error(self): + with self.assertRaisesRegex(KeyError, "version"): + _get_representatives( + mags=self.bins, + metadata=qiime2.Metadata(self.busco_results), + column="version", + bin_clusters=self.clusters_99, + find_max=True + ) + + def test_get_representatives_value_error(self): + with self.assertRaisesRegex(ValueError, "numerical"): + _get_representatives( + mags=self.bins, + metadata=qiime2.Metadata(self.busco_results), + column="dataset", + bin_clusters=self.clusters_99, + find_max=True + ) + if __name__ == '__main__': unittest.main() diff --git a/q2_moshpit/plugin_setup.py b/q2_moshpit/plugin_setup.py index 4bf77433..a2c290c2 100644 --- a/q2_moshpit/plugin_setup.py +++ b/q2_moshpit/plugin_setup.py @@ -418,10 +418,13 @@ function=q2_moshpit.dereplication.dereplicate_mags, inputs={ "mags": SampleData[MAGs], - "distance_matrix": DistanceMatrix + "distance_matrix": DistanceMatrix, }, parameters={ - "threshold": Float % Range(0, 1, inclusive_end=True) + "threshold": Float % Range(0, 1, inclusive_end=True), + "metadata": Metadata, + "metadata_column": Str, + "find_max": Bool }, outputs=[ ('dereplicated_mags', FeatureData[MAG]), @@ -429,11 +432,18 @@ ], input_descriptions={ "mags": "MAGs to be dereplicated.", - "distance_matrix": "Matrix of distances between MAGs." + "distance_matrix": "Matrix of distances between MAGs.", }, parameter_descriptions={ "threshold": "Similarity threshold required to consider " - "two bins identical." + "two bins identical.", + "metadata": "Metadata table.", + "metadata_column": "Name of the metadata column used to choose the " + "most representative bins.", + "find_max": "Set to True to choose the bin with the highest value in " + "the metadata column. Set to False to choose the bin " + "with the lowest value.", + }, output_descriptions={ "dereplicated_mags": "Dereplicated MAGs.", @@ -443,7 +453,14 @@ description='This method dereplicates MAGs from multiple samples ' 'using distances between them found in the provided ' 'distance matrix. For each cluster of similar MAGs, ' - 'the longest one will be selected as the representative.', + 'the longest one will be selected as the representative. If ' + 'metadata is given as input, the MAG with the ' + 'highest or lowest value in the specified metadata column ' + 'is chosen, depending on the parameter "find-max". ' + 'If there are MAGs with identical values, the longer one is ' + 'chosen. For example an artifact of type BUSCOResults can be ' + 'passed as metadata, and the dereplication can be done by ' + 'highest "completeness".', citations=[] )