Changed genome features to be saved in one file per genome

bokulich-lab · Aug 21, 2024 · 0613365 · 0613365
1 parent d491eda
commit 0613365
Show file tree

Hide file tree

Showing 2 changed files with 100 additions and 69 deletions.
diff --git a/rescript/bv_brc.py b/rescript/bv_brc.py
@@ -178,34 +178,46 @@ def fetch_genome_features_bv_brc(
     response_proteins = download_data(url=proteins_url,
                                       data_type="genome_feature")
 
-    genes_fasta_upper = convert_fasta_to_uppercase(response_genes.text)
-    proteins_fasta_upper = convert_fasta_to_uppercase(response_proteins.text)
-
-    # Save genes and proteins as FASTA files
-    with open(os.path.join(str(genes), "genes.fasta"), 'w') as fasta_file:
-        fasta_file.write(genes_fasta_upper)
-
-    with open(os.path.join(str(proteins), "proteins.fasta"),
-              'w') as fasta_file:
-        fasta_file.write(proteins_fasta_upper)
+    # Group FASTA entries by genome ID and convert all sequences to upper
+    # case characters to conform with DNAFASTAFormat
+    genes_fasta = parse_fasta_to_dict(response_genes.text)
+    proteins_fasta = parse_fasta_to_dict(response_proteins.text)
+
+    # Save genes and proteins as FASTA files, one file per genome_id
+    for genome_id, fasta_sequences in genes_fasta.items():
+        with open(os.path.join(str(genes), f"{genome_id}.fasta"),
+                  'w') as fasta_file:
+            fasta_file.write(fasta_sequences)
+
+    for genome_id, fasta_sequences in proteins_fasta.items():
+        with open(os.path.join(str(proteins), f"{genome_id}.fasta"),
+                  'w') as fasta_file:
+            fasta_file.write(fasta_sequences)
 
     return genes, proteins
 
 
-def convert_fasta_to_uppercase(fasta_string):
-    # Split string into lines
-    lines = fasta_string.splitlines()
-    result_lines = []
-
-    # Loop through all lines. If line does not start with ">" the characters
-    # get converted to upper case
-    for line in lines:
-        if line.startswith(">"):  # This is a header line
-            result_lines.append(line)
-        else:  # This is a sequence line
-            result_lines.append(line.upper())
+def parse_fasta_to_dict(fasta_string):
+    # Creates a dict with genome_ids as keys and the corresponding FASTA
+    # entries as values (headers unchanged, sequence lines in upper case)
+    fasta_dict = {}
+
+    genome_id = None
+    for line in fasta_string.splitlines():
+        if line.startswith(">"):
+            # Extract genome ID: text after the last "|", minus final "]"
+            genome_id = line.split("|")[-1][:-1].strip()
+            if genome_id not in fasta_dict:
+                # Start a new entry with the header
+                fasta_dict[genome_id] = line + "\n"
+            else:
+                # Append the header to the existing entry
+                fasta_dict[genome_id] += line + "\n"
+        else:
+            # Append the sequence line in uppercase
+            fasta_dict[genome_id] += line.upper() + "\n"
 
-    return "\n".join(result_lines)
+    return fasta_dict
 
 
 def json_to_fasta(json, output_dir):

diff --git a/rescript/tests/test_bv_brc.py b/rescript/tests/test_bv_brc.py
@@ -5,20 +5,22 @@
 #
 # The full license is in the file LICENSE, distributed with this software.
 # ----------------------------------------------------------------------------
+import os
 import unittest
 from unittest.mock import Mock, patch, mock_open, MagicMock
 
 import pandas as pd
 from q2_types.feature_data import (MixedCaseDNAFASTAFormat,
                                    ProteinFASTAFormat,
                                    TSVTaxonomyDirectoryFormat)
-from q2_types.genome_data import GenomeSequencesDirectoryFormat
+from q2_types.genome_data import GenomeSequencesDirectoryFormat, \
+    GenesDirectoryFormat, ProteinsDirectoryFormat
 from qiime2.plugin.testing import TestPluginBase
 
 from rescript.bv_brc import fetch_genomes_bv_brc, fetch_metadata_bv_brc, \
     fetch_genome_features_bv_brc, fetch_taxonomy_bv_brc, id_list_handling, \
     error_handling, download_data, json_to_fasta, transform_taxonomy_df, \
-    parse_lineage_names_with_ranks, convert_fasta_to_uppercase
+    parse_lineage_names_with_ranks, parse_fasta_to_dict
 
 
 class TestIDListHandling(TestPluginBase):
@@ -219,74 +221,91 @@ def test_json_to_fasta_multiple_sequences_same_genome(self, mock_file):
 class TestFetchGenomeFeaturesBVBR(TestPluginBase):
     package = 'rescript.tests'
 
+    @patch('rescript.bv_brc.parse_fasta_to_dict')
     @patch('rescript.bv_brc.download_data')
     @patch('rescript.bv_brc.id_list_handling')
-    @patch.object(MixedCaseDNAFASTAFormat, 'open')
-    @patch.object(ProteinFASTAFormat, 'open')
-    def test_fetch_genome_features_bv_brc(
-            self, mock_protein_open, mock_genes_open, mock_id_list_handling,
-            mock_download_data
-    ):
+    @patch('builtins.open', new_callable=mock_open)
+    def test_fetch_genome_features_bv_brc(self, mock_open,
+                                          mock_id_list_handling,
+                                          mock_download_data,
+                                          mock_parse_fasta_to_dict):
         # Mock the id_list_handling function
-        mock_id_list_handling.return_value = ("in(feature_id, "
+        mock_id_list_handling.return_value = ("in(feature_id,"
                                               "(feature1,feature2))")
 
-        # Mock the responses from download_data
-        mock_genes_response = MagicMock()
-        mock_genes_response.text = ">gene1\nATGC\n>gene2\nATGC"
-        mock_proteins_response = MagicMock()
-        mock_proteins_response.text = (
-            ">protein1\nMVLSPADKTNVK\n>protein2\nMVLSPADKTNVK")
-        mock_download_data.side_effect = [mock_genes_response,
-                                          mock_proteins_response]
-
-        # Mock file write actions
-        mock_genes_file = MagicMock()
-        mock_protein_file = MagicMock()
-        mock_genes_open.return_value.__enter__.return_value = mock_genes_file
-        mock_protein_open.return_value.__enter__.return_value = (
-            mock_protein_file)
+        # Mock the download_data function responses
+        mock_response_genes = MagicMock()
+        mock_response_genes.text = "mocked_genes_fasta_data"
+        mock_response_proteins = MagicMock()
+        mock_response_proteins.text = "mocked_proteins_fasta_data"
+        mock_download_data.side_effect = [mock_response_genes,
+                                          mock_response_proteins]
+
+        # Mock the parse_fasta_to_dict function
+        mock_parse_fasta_to_dict.side_effect = [
+            {'2030927.4755': '>fig|2030927| GTPase [ABC | '
+                             '2030927.4755]\nATGA\n'},
+            {'1234567.89': '>fig|1234567| protein [XYZ | 1234567.89]\nGCGT\n'}
+        ]
 
-        # Call the function
+        # Call the function with the test RQL query
         genes, proteins = fetch_genome_features_bv_brc(
-            rql_query="in(feature_id, (feature1,feature2))",
-            feature_ids=["feature1", "feature2"]
+            rql_query="in(feature_id,(feature1,feature2))"
         )
 
-        # Assertions
+        # Assertions to ensure the correct calls were made
         mock_id_list_handling.assert_called_once_with(
-            rql_query="in(feature_id, (feature1,feature2))",
-            ids=["feature1", "feature2"],
+            rql_query="in(feature_id,(feature1,feature2))",
+            ids=None,
             parameter_name="feature_ids",
             data_field="feature_id"
         )
 
         mock_download_data.assert_any_call(
-            url="https://www.bv-brc.org/api/genome_feature/?in(feature_id, "
+            url="https://www.bv-brc.org/api/genome_feature/?in(feature_id,"
                 "(feature1,feature2))&http_accept=application/dna+fasta",
             data_type="genome_feature"
         )
 
         mock_download_data.assert_any_call(
-            url="https://www.bv-brc.org/api/genome_feature/?in(feature_id, "
+            url="https://www.bv-brc.org/api/genome_feature/?in(feature_id,"
                 "(feature1,feature2))&http_accept=application/protein+fasta",
             data_type="genome_feature"
         )
 
-        # Check that the correct data is written to the correct files
-        mock_genes_file.write.assert_called_once_with(
-            ">gene1\nATGC\n>gene2\nATGC")
-        mock_protein_file.write.assert_called_once_with(
-            ">protein1\nMVLSPADKTNVK\n>protein2\nMVLSPADKTNVK")
-
-        self.assertIsInstance(genes, MixedCaseDNAFASTAFormat)
-        self.assertIsInstance(proteins, ProteinFASTAFormat)
-
-    def test_convert_fasta_to_uppercase(self):
-        input_fasta = ">header1\natgca\ngtacg\n>header2\nttgaa\ncctg"
-        expected_output = ">header1\nATGCA\nGTACG\n>header2\nTTGAA\nCCTG"
-
-        result = convert_fasta_to_uppercase(input_fasta)
+        mock_parse_fasta_to_dict.assert_any_call("mocked_genes_fasta_data")
+        mock_parse_fasta_to_dict.assert_any_call("mocked_proteins_fasta_data")
+
+        # Check that the files were written correctly for genes
+        mock_open.assert_any_call(
+            os.path.join(str(genes), "2030927.4755.fasta"), 'w')
+        mock_open().write.assert_any_call(
+            '>fig|2030927| GTPase [ABC | 2030927.4755]\nATGA\n')
+
+        # Check that the files were written correctly for proteins
+        mock_open.assert_any_call(
+            os.path.join(str(proteins), "1234567.89.fasta"), 'w')
+        mock_open().write.assert_any_call(
+            '>fig|1234567| protein [XYZ | 1234567.89]\nGCGT\n')
+
+        # Check that the return types are correct
+        self.assertIsInstance(genes, GenesDirectoryFormat)
+        self.assertIsInstance(proteins, ProteinsDirectoryFormat)
+
+    def test_parse_fasta_to_dict(self):
+        fasta_string = (
+            ">fig|2030927| GTPase [ABC | 2030927.4755]\natga\n"
+            ">fig|1234567| protein [XYZ | 1234567.89]\ngcgt\n"
+        )
+        expected_output = {
+            '2030927.4755': (
+                ">fig|2030927| GTPase [ABC | 2030927.4755]\nATGA\n"
+            ),
+            '1234567.89': (
+                ">fig|1234567| protein [XYZ | 1234567.89]\nGCGT\n"
+            )
+        }
+        result = parse_fasta_to_dict(fasta_string)
         self.assertEqual(result, expected_output)