diff --git a/rescript/bv_brc.py b/rescript/bv_brc.py index fa91619..ec2e736 100644 --- a/rescript/bv_brc.py +++ b/rescript/bv_brc.py @@ -25,7 +25,7 @@ def get_bv_brc_metadata( ids_metadata: Union[qiime2.NumericMetadataColumn, - qiime2.CategoricalMetadataColumn] = None, + qiime2.CategoricalMetadataColumn] = None, data_type: str = None, rql_query: str = None, data_field: str = None, @@ -67,22 +67,22 @@ def get_bv_brc_genomes( # Parameter validation rql_query = parameter_validation(rql_query=rql_query, ids=ids, - data_type="genome_sequence", + data_type="genome", data_field=data_field, metadata=ids_metadata ) - # Get requests response for genome sequences - sequences = download_data(data_type="genome_sequence", - query=rql_query, - accept="application/json", - ) + response_genomes = download_data(data_type="genome", + query=rql_query, + accept="application/json", + select=["genome_id", "taxon_id"] + ) - # Convert sequences in JSON to FASTA file - genomes = create_genome_fasta(genome_sequences=sequences) + # Get genome sequences and create FASTA files + genomes = get_genome_sequences(response_genomes=response_genomes) # Get taxonomy for sequences - taxonomy = get_taxonomy(response_sequences=sequences, + taxonomy = get_taxonomy(response_sequences=response_genomes, ranks=ranks, rank_propagation=rank_propagation, accession_name="accession") @@ -90,6 +90,25 @@ def get_bv_brc_genomes( return genomes, taxonomy +def get_genome_sequences(response_genomes): + # Extract genome ids from response (list of dicts) + genome_ids = set([str(entry['genome_id']) for entry in response_genomes]) + + # Fetch the genome sequences for all genome ids + genome_sequences = download_data( + data_type="genome_sequence", + query=f"in(genome_id,({','.join(genome_ids)}))", + accept="application/json", + select=["accession", "description", "genome_name", + "genome_id", "sequence"] + ) + + # Create FASTA files from sequences + genomes = create_genome_fasta(genome_sequences=genome_sequences) + + return genomes + + def get_bv_brc_genome_features( ids_metadata: Union[qiime2.NumericMetadataColumn, qiime2.CategoricalMetadataColumn] = None, diff --git a/rescript/tests/test_bv_brc.py b/rescript/tests/test_bv_brc.py index ee9b2c2..585e95a 100644 --- a/rescript/tests/test_bv_brc.py +++ b/rescript/tests/test_bv_brc.py @@ -16,7 +16,7 @@ get_bv_brc_genome_features, parameter_validation, \ error_handling, download_data, create_genome_fasta, \ create_taxonomy_entry, get_loci, read_tsv_data_with_dtypes, process_loci, \ - get_sequences, get_taxonomy, create_taxonomy + get_sequences, get_taxonomy, create_taxonomy, get_genome_sequences class TestParameterValidation(TestPluginBase): @@ -343,20 +343,12 @@ class TestGetBvBrcGenomes(TestPluginBase): package = 'rescript.tests' @patch('rescript.bv_brc.get_taxonomy') - @patch('rescript.bv_brc.create_genome_fasta') + @patch('rescript.bv_brc.get_genome_sequences') @patch('rescript.bv_brc.download_data') @patch('rescript.bv_brc.parameter_validation') def test_get_bv_brc_genomes(self, mock_parameter_validation, - mock_download_data, mock_create_genome_fasta, + mock_download_data, mock_get_genome_sequences, mock_get_taxonomy): - # Mocked return values for the external functions - mock_parameter_validation.return_value = "mocked_rql_query" - mock_download_data.return_value = [ - {'id': 'genome1', 'sequence': 'ATGC'}, - {'id': 'genome2', 'sequence': 'GCTA'}] - mock_create_genome_fasta.return_value = MagicMock( - name='GenomeSequencesDirectoryFormat') - mock_get_taxonomy.return_value = MagicMock(name='TSVTaxonomyFormat') # Call the function get_bv_brc_genomes( @@ -764,3 +756,29 @@ def test_get_sequences(self, mock_open, mock_download_data): # Check if the correct sequences were written to the proteins file mock_open().write.assert_any_call('>feature1\nMKV\n') + + +class TestGetGenomeSequences(TestPluginBase): + package = 'rescript.tests' + + @patch('rescript.bv_brc.download_data') + @patch('rescript.bv_brc.create_genome_fasta') + def test_get_genome_sequences(self, mock_create_genome_fasta, + mock_download_data): + # Sample response_genomes to be used as input + response_genomes = [ + {'genome_id': '12345'}, + {'genome_id': '67890'} + ] + + # Call the function + get_genome_sequences(response_genomes) + + # Assert that download_data was called with the correct arguments + mock_download_data.assert_called_once_with( + data_type="genome_sequence", + query=unittest.mock.ANY, + accept="application/json", + select=["accession", "description", "genome_name", "genome_id", + "sequence"] + )