From 86427e11ab3cbfad656afd3492de9886679f5f4c Mon Sep 17 00:00:00 2001
From: VinzentRisch <risch.vinzent@gmail.com>
Date: Tue, 20 Aug 2024 16:32:56 +0200
Subject: [PATCH] bv_brc: lint, use directory formats, upper-case FASTA

---
 rescript/bv_brc.py            | 98 ++++++++++++++++++++++-------------
 rescript/tests/test_bv_brc.py | 90 ++++++++++++++++++++------------
 2 files changed, 118 insertions(+), 70 deletions(-)

diff --git a/rescript/bv_brc.py b/rescript/bv_brc.py
index d1860ff..a13134e 100644
--- a/rescript/bv_brc.py
+++ b/rescript/bv_brc.py
@@ -10,9 +10,10 @@
 import qiime2
 import pandas as pd
 import requests
-from q2_types.feature_data import (MixedCaseDNAFASTAFormat, ProteinFASTAFormat,
-                                   TSVTaxonomyDirectoryFormat)
-from q2_types.genome_data import GenomeSequencesDirectoryFormat
+from q2_types.feature_data import TSVTaxonomyDirectoryFormat
+from q2_types.genome_data import (GenomeSequencesDirectoryFormat,
+                                  GenesDirectoryFormat,
+                                  ProteinsDirectoryFormat)
 
 from rescript.ncbi import _allowed_ranks, _default_ranks
 import json
@@ -22,7 +23,6 @@ def fetch_genomes_bv_brc(
         rql_query: str = None,
         genome_ids: list = None
 ) -> GenomeSequencesDirectoryFormat:
-
     # Parameter validation
     rql_query = id_list_handling(rql_query=rql_query,
                                  ids=genome_ids,
@@ -46,10 +46,10 @@ def fetch_genomes_bv_brc(
 
 
 def fetch_metadata_bv_brc(data_type: str, rql_query: str) -> qiime2.Metadata:
-
     # Download data
     response = download_data(
-        url=f"https://www.bv-brc.org/api/{data_type}/?{rql_query}&http_accept=text/tsv",
+        url=f"https://www.bv-brc.org/api/{data_type}/"
+            f"?{rql_query}&http_accept=text/tsv",
         data_type=data_type
     )
 
@@ -68,7 +68,6 @@ def fetch_taxonomy_bv_brc(
         ranks: list = None,
         taxon_ids: list = None,
 ) -> TSVTaxonomyDirectoryFormat:
-
     # Parameter validation
     rql_query = id_list_handling(rql_query=rql_query,
                                  ids=taxon_ids,
@@ -81,7 +80,8 @@ def fetch_taxonomy_bv_brc(
 
     # Get requests response
     response = download_data(
-        url=f"https://www.bv-brc.org/api/taxonomy/?{rql_query}&http_accept=text/tsv",
+        url="https://www.bv-brc.org/api/taxonomy/"
+            f"?{rql_query}&http_accept=text/tsv",
         data_type="taxonomy"
     )
 
@@ -105,8 +105,10 @@ def parse_lineage_names_with_ranks(lineage_names, lineage_ranks, ranks):
     lineage_split = lineage_names.split(';')
     rank_split = lineage_ranks.split(';')
 
-    # Dictionary to map taxonomic ranks to their prefixes for the specified ranks
-    rank_to_prefix = {key: _allowed_ranks[key] for key in ranks if key in ranks}
+    # Dictionary to map taxonomic ranks to their prefixes for the specified
+    # ranks
+    rank_to_prefix = {key: _allowed_ranks[key]
+                      for key in ranks if key in ranks}
 
     # Initialize the list for the parsed lineage
     parsed_lineage = []
@@ -119,13 +121,14 @@ def parse_lineage_names_with_ranks(lineage_names, lineage_ranks, ranks):
         else:
             pass
 
-    # Ensure all taxonomic levels are covered (fill in missing levels with just the
-    # prefix)
+    # Ensure all taxonomic levels are covered (fill in missing levels with
+    # just the prefix)
     final_lineage = []
     for required_prefix in rank_to_prefix.values():
         # Check if any parsed_lineage item starts with the required prefix
         match = next(
-            (item for item in parsed_lineage if item.startswith(required_prefix)), None)
+            (item for item in parsed_lineage
+             if item.startswith(required_prefix)), None)
         if match:
             final_lineage.append(match)
         else:
@@ -137,10 +140,11 @@ def parse_lineage_names_with_ranks(lineage_names, lineage_ranks, ranks):
 
 def transform_taxonomy_df(df, ranks):
     # Apply the transformation
-    df['Taxon'] = df.apply(
-        lambda row: parse_lineage_names_with_ranks(lineage_names=row['lineage_names'],
-                                                   lineage_ranks=row['lineage_ranks'],
-                                                   ranks=ranks), axis=1)
+    df['Taxon'] = df.apply(lambda row:
+                           parse_lineage_names_with_ranks(
+                               lineage_names=row['lineage_names'],
+                               lineage_ranks=row['lineage_ranks'],
+                               ranks=ranks), axis=1)
 
     # Rename columns and set index
     df = df.rename(columns={'taxon_id': 'Feature ID'})
@@ -152,8 +156,7 @@ def transform_taxonomy_df(df, ranks):
 def fetch_genome_features_bv_brc(
         rql_query: str = None,
         feature_ids: list = None,
-) -> (MixedCaseDNAFASTAFormat, ProteinFASTAFormat):
-
+) -> (GenesDirectoryFormat, ProteinsDirectoryFormat):
     # Parameter validation
     rql_query = id_list_handling(rql_query=rql_query,
                                  ids=feature_ids,
@@ -161,30 +164,50 @@ def fetch_genome_features_bv_brc(
                                  data_field="feature_id")
 
     # Define output formats
-    genes = MixedCaseDNAFASTAFormat()
-    proteins = ProteinFASTAFormat()
+    genes = GenesDirectoryFormat()
+    proteins = ProteinsDirectoryFormat()
 
     # Construct URLs for genes and proteins downloads
     base_url = "https://www.bv-brc.org/api/genome_feature/?"
     genes_url = base_url + f"{rql_query}&http_accept=application/dna+fasta"
-    proteins_url = base_url + f"{rql_query}&http_accept=application/protein+fasta"
+    proteins_url = base_url + (f"{rql_query}&http_accept=application/"
+                               "protein+fasta")
 
     # Get requests response for genes and proteins
     response_genes = download_data(url=genes_url, data_type="genome_feature")
-    response_proteins = download_data(url=proteins_url, data_type="genome_feature")
+    response_proteins = download_data(url=proteins_url,
+                                      data_type="genome_feature")
+
+    genes_fasta_upper = convert_fasta_to_uppercase(response_genes.text)
+    proteins_fasta_upper = convert_fasta_to_uppercase(response_proteins.text)
 
     # Save genes and proteins as FASTA files
-    fasta_genes = response_genes.text
-    with genes.open() as file:
-        file.write(fasta_genes)
+    with open(os.path.join(str(genes), "genes.fasta"), 'w') as fasta_file:
+        fasta_file.write(genes_fasta_upper)
 
-    fasta_proteins = response_proteins.text
-    with proteins.open() as file:
-        file.write(fasta_proteins)
+    with open(os.path.join(str(proteins), "proteins.fasta"),
+              'w') as fasta_file:
+        fasta_file.write(proteins_fasta_upper)
 
     return genes, proteins
 
 
+def convert_fasta_to_uppercase(fasta_string):
+    """Return ``fasta_string`` with every sequence line upper-cased.
+
+    Lines starting with ">" are treated as headers and copied unchanged;
+    every other line is upper-cased. The text is split with
+    ``str.splitlines`` and re-joined with a single newline, so a trailing
+    line break in the input is not present in the returned string.
+    """
+    # Only non-header lines are case-converted
+    converted = (
+        entry if entry.startswith(">") else entry.upper()
+        for entry in fasta_string.splitlines()
+    )
+    return "\n".join(converted)
+
+
 def json_to_fasta(json, output_dir):
     # Dictionary to hold sequences grouped by genome_id
     fasta_files = {}
@@ -229,8 +252,8 @@ def error_handling(response, data_type):
     # No data found for query or incorrect RQL query
     if response.text == "[]":
         raise ValueError("No data could be retrieved. Either because of an "
-                         "incorrect RQL query or because no data exists for the "
-                         "query.")
+                         "incorrect RQL query or because no data exists for "
+                         "the query.")
 
     elif response.text.startswith("A Database Error Occured:"):
 
@@ -248,8 +271,9 @@ def error_handling(response, data_type):
         # Incorrect field for data type
         elif response_dict['msg'].startswith("undefined field"):
             raise ValueError(
-                f"Error code {response_dict['code']}: {response_dict['msg']}. \n"
-                f"Allowed fields for data type {data_type}: \n{data_fields[data_type]}"
+                f"Error code {response_dict['code']}: {response_dict['msg']}. "
+                f"\nAllowed fields for data type {data_type}: "
+                f"\n{data_fields[data_type]}"
             )
 
         # Handling any other errors that start with "A Database Error Occured:"
@@ -263,11 +287,13 @@ def error_handling(response, data_type):
         raise ValueError(response.text)
 
 
-def id_list_handling(rql_query: str, ids: list, parameter_name: str, data_field: str):
+def id_list_handling(rql_query: str, ids: list, parameter_name: str,
+                     data_field: str):
     # Error if rql_query and ids parameters are given
     if rql_query and ids:
-        raise ValueError(f"Parameters rql_query and {parameter_name} can't be used "
-                         "simultaneously.")
+        raise ValueError(
+            f"Parameters rql_query and {parameter_name} can't be used "
+            "simultaneously.")
 
     # Error if rql_query and ids parameters are not given
     elif not rql_query and not ids:
diff --git a/rescript/tests/test_bv_brc.py b/rescript/tests/test_bv_brc.py
index 48d07e6..a28afd3 100644
--- a/rescript/tests/test_bv_brc.py
+++ b/rescript/tests/test_bv_brc.py
@@ -9,15 +9,16 @@
 from unittest.mock import Mock, patch, mock_open, MagicMock
 
 import pandas as pd
-from q2_types.feature_data import MixedCaseDNAFASTAFormat, ProteinFASTAFormat, \
-    TSVTaxonomyDirectoryFormat
+from q2_types.feature_data import (MixedCaseDNAFASTAFormat,
+                                   ProteinFASTAFormat,
+                                   TSVTaxonomyDirectoryFormat)
 from q2_types.genome_data import GenomeSequencesDirectoryFormat
 from qiime2.plugin.testing import TestPluginBase
 
 from rescript.bv_brc import fetch_genomes_bv_brc, fetch_metadata_bv_brc, \
     fetch_genome_features_bv_brc, fetch_taxonomy_bv_brc, id_list_handling, \
     error_handling, download_data, json_to_fasta, transform_taxonomy_df, \
-    parse_lineage_names_with_ranks
+    parse_lineage_names_with_ranks, convert_fasta_to_uppercase
 
 
 class TestIDListHandling(TestPluginBase):
@@ -25,8 +26,8 @@ class TestIDListHandling(TestPluginBase):
 
     def test_error_both_parameters_given(self):
         with self.assertRaisesRegex(ValueError,
-                                    "Parameters rql_query and ids can't be used "
-                                    "simultaneously."):
+                                    "Parameters rql_query and ids can't be "
+                                    "used simultaneously."):
             id_list_handling(rql_query="some_query",
                              ids=[1, 2, 3],
                              parameter_name="ids",
@@ -34,8 +35,8 @@ def test_error_both_parameters_given(self):
 
     def test_error_neither_parameter_given(self):
         with self.assertRaisesRegex(ValueError,
-                                    "At least one of the parameters rql_query and ids "
-                                    "has to be given."):
+                                    "At least one of the parameters rql_query "
+                                    "and ids has to be given."):
             id_list_handling(rql_query="",
                              ids=[],
                              parameter_name="ids",
@@ -97,7 +98,8 @@ class TestDownloadData(TestPluginBase):
 
     @patch('rescript.bv_brc.requests.get')
     @patch('rescript.bv_brc.error_handling')
-    def test_download_data_success(self, mock_error_handling, mock_requests_get):
+    def test_download_data_success(self, mock_error_handling,
+                                   mock_requests_get):
         # Mock the requests.get response for a successful request
         mock_response = Mock()
         mock_response.status_code = 200
@@ -113,7 +115,8 @@ def test_download_data_success(self, mock_error_handling, mock_requests_get):
 
     @patch('rescript.bv_brc.requests.get')
     @patch('rescript.bv_brc.error_handling')
-    def test_download_data_error_400(self, mock_error_handling, mock_requests_get):
+    def test_download_data_error_400(self, mock_error_handling,
+                                     mock_requests_get):
         # Mock the requests.get response for a 400 Bad Request
         mock_response = Mock()
         mock_response.status_code = 400
@@ -129,7 +132,8 @@ def test_download_data_error_400(self, mock_error_handling, mock_requests_get):
 
     @patch('rescript.bv_brc.requests.get')
     @patch('rescript.bv_brc.error_handling')
-    def test_download_data_other_error(self, mock_error_handling, mock_requests_get):
+    def test_download_data_other_error(self, mock_error_handling,
+                                       mock_requests_get):
         # Mock the requests.get response for any other error
         mock_response = Mock()
         mock_response.status_code = 500
@@ -188,8 +192,10 @@ def test_json_to_fasta_multiple_genomes(self, mock_file):
         json_to_fasta(self.json_input_1 + self.json_input_2, "/fake/dir")
 
         # Expected FASTA content
-        expected_fasta_genome1 = ">accn|acc1   desc1   [genome_name1 | genome1]\nATGC"
-        expected_fasta_genome2 = ">accn|acc2   desc2   [genome_name2 | genome2]\nCGTA"
+        expected_fasta_genome1 = (">accn|acc1   desc1   [genome_name1 | "
+                                  "genome1]\nATGC")
+        expected_fasta_genome2 = (">accn|acc2   desc2   [genome_name2 | "
+                                  "genome2]\nCGTA")
 
         # Check if the files were created with the correct path and content
         mock_file().write.assert_any_call(expected_fasta_genome1)
@@ -197,12 +203,13 @@ def test_json_to_fasta_multiple_genomes(self, mock_file):
 
     @patch('rescript.bv_brc.open', new_callable=mock_open)
     def test_json_to_fasta_multiple_sequences_same_genome(self, mock_file):
-
         json_to_fasta(self.json_input_1 + self.json_input_1, "/fake/dir")
 
         # Expected FASTA content
-        expected_fasta = (">accn|acc1   desc1   [genome_name1 | genome1]\nATGC\n"
-                          ">accn|acc1   desc1   [genome_name1 | genome1]\nATGC")
+        expected_fasta = (">accn|acc1   desc1   [genome_name1 | "
+                          "genome1]\nATGC\n"
+                          ">accn|acc1   desc1   [genome_name1 | "
+                          "genome1]\nATGC")
 
         # Check if the file was created with the correct path and content
         mock_file.assert_called_once_with("/fake/dir/genome1.fasta", 'w')
@@ -221,13 +228,15 @@ def test_fetch_genome_features_bv_brc(
             mock_download_data
     ):
         # Mock the id_list_handling function
-        mock_id_list_handling.return_value = "in(feature_id, (feature1,feature2))"
+        mock_id_list_handling.return_value = ("in(feature_id, "
+                                              "(feature1,feature2))")
 
         # Mock the responses from download_data
         mock_genes_response = MagicMock()
         mock_genes_response.text = ">gene1\nATGC\n>gene2\nATGC"
         mock_proteins_response = MagicMock()
-        mock_proteins_response.text = ">protein1\nMVLSPADKTNVK\n>protein2\nMVLSPADKTNVK"
+        mock_proteins_response.text = (
+            ">protein1\nMVLSPADKTNVK\n>protein2\nMVLSPADKTNVK")
         mock_download_data.side_effect = [mock_genes_response,
                                           mock_proteins_response]
 
@@ -235,7 +244,8 @@ def test_fetch_genome_features_bv_brc(
         mock_genes_file = MagicMock()
         mock_protein_file = MagicMock()
         mock_genes_open.return_value.__enter__.return_value = mock_genes_file
-        mock_protein_open.return_value.__enter__.return_value = mock_protein_file
+        mock_protein_open.return_value.__enter__.return_value = (
+            mock_protein_file)
 
         # Call the function
         genes, proteins = fetch_genome_features_bv_brc(
@@ -264,13 +274,21 @@ def test_fetch_genome_features_bv_brc(
         )
 
         # Check that the correct data is written to the correct files
-        mock_genes_file.write.assert_called_once_with(">gene1\nATGC\n>gene2\nATGC")
+        mock_genes_file.write.assert_called_once_with(
+            ">gene1\nATGC\n>gene2\nATGC")
         mock_protein_file.write.assert_called_once_with(
             ">protein1\nMVLSPADKTNVK\n>protein2\nMVLSPADKTNVK")
 
         self.assertIsInstance(genes, MixedCaseDNAFASTAFormat)
         self.assertIsInstance(proteins, ProteinFASTAFormat)
 
+    def test_convert_fasta_to_uppercase(self):
+        # Header lines must pass through unchanged; sequences are upper-cased
+        mixed_case = ">header1\natgca\ngtacg\n>header2\nttgaa\ncctg"
+        upper_case = ">header1\nATGCA\nGTACG\n>header2\nTTGAA\nCCTG"
+
+        self.assertEqual(convert_fasta_to_uppercase(mixed_case), upper_case)
+
 
 class TestFetchGenomesBVBRC(TestPluginBase):
     package = 'rescript.tests'
@@ -279,7 +297,7 @@ class TestFetchGenomesBVBRC(TestPluginBase):
     @patch('rescript.bv_brc.download_data')
     @patch('rescript.bv_brc.id_list_handling')
     def test_fetch_genomes_bv_brc(
-        self, mock_id_list_handling, mock_download_data, mock_json_to_fasta
+            self, mock_id_list_handling, mock_download_data, mock_json_to_fasta
     ):
         # Mock the id_list_handling function
         mock_id_list_handling.return_value = "genome_id=in(genome1,genome2)"
@@ -327,7 +345,8 @@ def test_fetch_metadata_bv_brc(self, mock_download_data,
                                    mock_read_csv, mock_metadata):
         # Mock the download_data response
         mock_response = MagicMock()
-        mock_response.text = "id\tcolumn1\tcolumn2\n1\tdata1\tdata2\n2\tdata3\tdata4"
+        mock_response.text = (
+            "id\tcolumn1\tcolumn2\n1\tdata1\tdata2\n2\tdata3\tdata4")
         mock_download_data.return_value = mock_response
 
         # Mock the pandas read_csv return value
@@ -358,11 +377,10 @@ def test_fetch_metadata_bv_brc(self, mock_download_data,
         args, kwargs = mock_read_csv.call_args
         self.assertEqual(kwargs['sep'], '\t')
 
-        self.assertEqual(args[0].getvalue(),
-                         "id\tcolumn1\tcolumn2\n1\tdata1\tdata2\n2\tdata3\tdata4")
+        self.assertEqual(args[0].getvalue(), "id\tcolumn1\tcolumn2\n1\tdata1"
+                                             "\tdata2\n2\tdata3\tdata4")
 
         mock_metadata.assert_called_once_with(mock_df)
-        self.assertEqual(result, mock_metadata_instance)
 
 
 class TestFetchTaxonomyBVBR(TestPluginBase):
@@ -374,7 +392,7 @@ class TestFetchTaxonomyBVBR(TestPluginBase):
     @patch('rescript.bv_brc.pd.read_csv')
     @patch('rescript.bv_brc.id_list_handling')
     def test_fetch_taxonomy_bv_brc(
-        self, mock_id_list_handling, mock_read_csv, mock_download_data,
+            self, mock_id_list_handling, mock_read_csv, mock_download_data,
             mock_transform_taxonomy_df, mock_to_csv
     ):
         # Mock the id_list_handling function
@@ -382,7 +400,8 @@ def test_fetch_taxonomy_bv_brc(
 
         # Mock the download_data response
         mock_response = MagicMock()
-        mock_response.text = "id\trank1\trank2\n1\tdata1\tdata2\n2\tdata3\tdata4"
+        mock_response.text = (
+            "id\trank1\trank2\n1\tdata1\tdata2\n2\tdata3\tdata4")
         mock_download_data.return_value = mock_response
 
         # Prepare mocks for file output
@@ -402,8 +421,8 @@ def test_fetch_taxonomy_bv_brc(
             )
 
             mock_download_data.assert_called_once_with(
-                url="https://www.bv-brc.org/api/taxonomy/?taxon_id=in(taxon1,taxon2)"
-                    "&http_accept=text/tsv",
+                url="https://www.bv-brc.org/api/taxonomy/"
+                    "?taxon_id=in(taxon1,taxon2)&http_accept=text/tsv",
                 data_type="taxonomy"
             )
 
@@ -441,20 +460,23 @@ def test_parse_with_missing_ranks(self):
         lineage_ranks = "kingdom;phylum;family"
         ranks = ['kingdom', 'phylum', 'class', 'order', 'genus', 'species']
 
-        result = parse_lineage_names_with_ranks(lineage_names, lineage_ranks, ranks)
+        result = parse_lineage_names_with_ranks(lineage_names, lineage_ranks,
+                                                ranks)
         expected = "k__Bacteria; p__Proteobacteria; c__; o__; g__; s__"
 
         self.assertEqual(result, expected)
 
     def test_parse_with_no_ranks_provided(self):
-        lineage_names = ("Bacteria;Proteobacteria;Gammaproteobacteria;Enterobacterales;"
-                         "Enterobacteriaceae;Escherichia;coli")
+        lineage_names = (
+            "Bacteria;Proteobacteria;Gammaproteobacteria;Enterobacterales;"
+            "Enterobacteriaceae;Escherichia;coli")
         lineage_ranks = "kingdom;phylum;class;order;family;genus;species"
         ranks = None  # Should fall back to _default_ranks
 
-        result = parse_lineage_names_with_ranks(lineage_names, lineage_ranks, ranks)
+        result = parse_lineage_names_with_ranks(lineage_names, lineage_ranks,
+                                                ranks)
         expected = ("k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; "
-                    "o__Enterobacterales; f__Enterobacteriaceae; g__Escherichia; "
-                    "s__coli")
+                    "o__Enterobacterales; f__Enterobacteriaceae; "
+                    "g__Escherichia; s__coli")
 
         self.assertEqual(result, expected)