diff --git a/rescript/bv_brc.py b/rescript/bv_brc.py index a13134e..3bac483 100644 --- a/rescript/bv_brc.py +++ b/rescript/bv_brc.py @@ -178,34 +178,46 @@ def fetch_genome_features_bv_brc( response_proteins = download_data(url=proteins_url, data_type="genome_feature") - genes_fasta_upper = convert_fasta_to_uppercase(response_genes.text) - proteins_fasta_upper = convert_fasta_to_uppercase(response_proteins.text) - - # Save genes and proteins as FASTA files - with open(os.path.join(str(genes), "genes.fasta"), 'w') as fasta_file: - fasta_file.write(genes_fasta_upper) - - with open(os.path.join(str(proteins), "proteins.fasta"), - 'w') as fasta_file: - fasta_file.write(proteins_fasta_upper) + # Convert all sequences to upper case characters to conform with + # DNAFASTAFormat + genes_fasta = parse_fasta_to_dict(response_genes.text) + proteins_fasta = parse_fasta_to_dict(response_proteins.text) + + # Save genes and proteins as FASTA files one file per genome_id + for genome_id, fasta_sequences in genes_fasta.items(): + with open(os.path.join(str(genes), f"{genome_id}.fasta"), + 'w') as fasta_file: + fasta_file.write(fasta_sequences) + + for genome_id, fasta_sequences in proteins_fasta.items(): + with open(os.path.join(str(proteins), f"{genome_id}.fasta"), + 'w') as fasta_file: + fasta_file.write(fasta_sequences) return genes, proteins -def convert_fasta_to_uppercase(fasta_string): - # Split string into lines - lines = fasta_string.splitlines() - result_lines = [] - - # Loop through all lines. 
def parse_fasta_to_dict(fasta_string):
    """Group the records of a FASTA string by genome ID.

    The genome ID is read from the last ``|``-separated field of each
    header line, with the closing ``]`` removed, e.g.
    ``>fig|2030927| GTPase [ABC | 2030927.4755]`` yields ``2030927.4755``.
    Sequence lines are converted to upper case; header lines are kept
    unchanged.

    Parameters
    ----------
    fasta_string : str
        FASTA formatted text, possibly holding records of several genomes.

    Returns
    -------
    dict
        Maps each genome ID to one string holding all FASTA records of
        that genome, in input order, every line terminated by a newline.

    Raises
    ------
    ValueError
        If sequence data appears before the first header line.
    """
    lines_per_genome = {}
    current_lines = None

    for line in fasta_string.splitlines():
        if line.startswith(">"):
            # Strip surrounding whitespace first, so a header with trailing
            # blanks still loses its closing bracket, and only drop the last
            # character when it really is that bracket.
            genome_id = line.rsplit("|", 1)[-1].strip()
            if genome_id.endswith("]"):
                genome_id = genome_id[:-1].strip()
            # Records sharing a genome ID are collected in the same list
            current_lines = lines_per_genome.setdefault(genome_id, [])
            current_lines.append(line)
        elif current_lines is not None:
            current_lines.append(line.upper())
        elif line.strip():
            # Sequence data that cannot be attributed to any genome
            raise ValueError(
                "Invalid FASTA: sequence data found before the first "
                "header line."
            )
        # Blank lines ahead of the first header carry no data: ignore them

    # Join once per genome instead of growing a string line by line
    return {
        genome_id: "\n".join(lines) + "\n"
        for genome_id, lines in lines_per_genome.items()
    }
class TestFetchGenomeFeaturesBVBR(TestPluginBase):
    """Tests for fetch_genome_features_bv_brc and parse_fasta_to_dict."""
    package = 'rescript.tests'

    @patch('rescript.bv_brc.parse_fasta_to_dict')
    @patch('rescript.bv_brc.download_data')
    @patch('rescript.bv_brc.id_list_handling')
    @patch('builtins.open', new_callable=mock_open)
    def test_fetch_genome_features_bv_brc(self, mock_file,
                                          mock_id_list_handling,
                                          mock_download_data,
                                          mock_parse_fasta_to_dict):
        # NOTE: the patched ``open`` is injected as ``mock_file`` so that it
        # does not shadow the imported ``mock_open`` helper.

        # Mock the id_list_handling function
        mock_id_list_handling.return_value = ("in(feature_id,"
                                              "(feature1,feature2))")

        # Mock the download_data function responses (genes first, then
        # proteins, matching the order of the side_effect list)
        mock_response_genes = MagicMock()
        mock_response_genes.text = "mocked_genes_fasta_data"
        mock_response_proteins = MagicMock()
        mock_response_proteins.text = "mocked_proteins_fasta_data"
        mock_download_data.side_effect = [mock_response_genes,
                                          mock_response_proteins]

        # Mock the parse_fasta_to_dict function
        mock_parse_fasta_to_dict.side_effect = [
            {'2030927.4755': '>fig|2030927| GTPase [ABC | '
                             '2030927.4755]\nATGA\n'},
            {'1234567.89': '>fig|1234567| protein [XYZ | 1234567.89]\nGCGT\n'}
        ]

        # Call the function with the test RQL query
        genes, proteins = fetch_genome_features_bv_brc(
            rql_query="in(feature_id,(feature1,feature2))"
        )

        # Assertions to ensure the correct calls were made
        mock_id_list_handling.assert_called_once_with(
            rql_query="in(feature_id,(feature1,feature2))",
            ids=None,
            parameter_name="feature_ids",
            data_field="feature_id"
        )

        mock_download_data.assert_any_call(
            url="https://www.bv-brc.org/api/genome_feature/?in(feature_id,"
                "(feature1,feature2))&http_accept=application/dna+fasta",
            data_type="genome_feature"
        )
        mock_download_data.assert_any_call(
            url="https://www.bv-brc.org/api/genome_feature/?in(feature_id,"
                "(feature1,feature2))&http_accept=application/protein+fasta",
            data_type="genome_feature"
        )

        mock_parse_fasta_to_dict.assert_any_call("mocked_genes_fasta_data")
        mock_parse_fasta_to_dict.assert_any_call("mocked_proteins_fasta_data")

        # Check that the files were written correctly for genes
        mock_file.assert_any_call(
            os.path.join(str(genes), "2030927.4755.fasta"), 'w')
        mock_file().write.assert_any_call(
            '>fig|2030927| GTPase [ABC | 2030927.4755]\nATGA\n')

        # Check that the files were written correctly for proteins
        mock_file.assert_any_call(
            os.path.join(str(proteins), "1234567.89.fasta"), 'w')
        mock_file().write.assert_any_call(
            '>fig|1234567| protein [XYZ | 1234567.89]\nGCGT\n')

        # Check that the return types are correct
        self.assertIsInstance(genes, GenesDirectoryFormat)
        self.assertIsInstance(proteins, ProteinsDirectoryFormat)

    def test_parse_fasta_to_dict(self):
        # One record per genome: each genome ID gets its own entry
        fasta_string = (
            ">fig|2030927| GTPase [ABC | 2030927.4755]\natga\n"
            ">fig|1234567| protein [XYZ | 1234567.89]\ngcgt\n"
        )
        expected_output = {
            '2030927.4755': (
                ">fig|2030927| GTPase [ABC | 2030927.4755]\nATGA\n"
            ),
            '1234567.89': (
                ">fig|1234567| protein [XYZ | 1234567.89]\nGCGT\n"
            )
        }

        result = parse_fasta_to_dict(fasta_string)
        self.assertEqual(result, expected_output)

    def test_parse_fasta_to_dict_same_genome(self):
        # Several records of the same genome are merged into one entry,
        # keeping their input order
        fasta_string = (
            ">fig|1111111| geneA [ABC | 1111111.1]\natga\n"
            ">fig|1111111| geneB [ABC | 1111111.1]\ngcgt\n"
        )
        expected_output = {
            '1111111.1': (
                ">fig|1111111| geneA [ABC | 1111111.1]\nATGA\n"
                ">fig|1111111| geneB [ABC | 1111111.1]\nGCGT\n"
            )
        }

        result = parse_fasta_to_dict(fasta_string)
        self.assertEqual(result, expected_output)