From 86427e11ab3cbfad656afd3492de9886679f5f4c Mon Sep 17 00:00:00 2001 From: VinzentRisch Date: Tue, 20 Aug 2024 16:32:56 +0200 Subject: [PATCH] lint --- rescript/bv_brc.py | 98 ++++++++++++++++++++++------------- rescript/tests/test_bv_brc.py | 90 ++++++++++++++++++++------------ 2 files changed, 118 insertions(+), 70 deletions(-) diff --git a/rescript/bv_brc.py b/rescript/bv_brc.py index d1860ff..a13134e 100644 --- a/rescript/bv_brc.py +++ b/rescript/bv_brc.py @@ -10,9 +10,10 @@ import qiime2 import pandas as pd import requests -from q2_types.feature_data import (MixedCaseDNAFASTAFormat, ProteinFASTAFormat, - TSVTaxonomyDirectoryFormat) -from q2_types.genome_data import GenomeSequencesDirectoryFormat +from q2_types.feature_data import TSVTaxonomyDirectoryFormat +from q2_types.genome_data import (GenomeSequencesDirectoryFormat, + GenesDirectoryFormat, + ProteinsDirectoryFormat) from rescript.ncbi import _allowed_ranks, _default_ranks import json @@ -22,7 +23,6 @@ def fetch_genomes_bv_brc( rql_query: str = None, genome_ids: list = None ) -> GenomeSequencesDirectoryFormat: - # Parameter validation rql_query = id_list_handling(rql_query=rql_query, ids=genome_ids, @@ -46,10 +46,10 @@ def fetch_genomes_bv_brc( def fetch_metadata_bv_brc(data_type: str, rql_query: str) -> qiime2.Metadata: - # Download data response = download_data( - url=f"https://www.bv-brc.org/api/{data_type}/?{rql_query}&http_accept=text/tsv", + url=f"https://www.bv-brc.org/api/{data_type}/" + f"?{rql_query}&http_accept=text/tsv", data_type=data_type ) @@ -68,7 +68,6 @@ def fetch_taxonomy_bv_brc( ranks: list = None, taxon_ids: list = None, ) -> TSVTaxonomyDirectoryFormat: - # Parameter validation rql_query = id_list_handling(rql_query=rql_query, ids=taxon_ids, @@ -81,7 +80,8 @@ def fetch_taxonomy_bv_brc( # Get requests response response = download_data( - url=f"https://www.bv-brc.org/api/taxonomy/?{rql_query}&http_accept=text/tsv", + url="https://www.bv-brc.org/api/taxonomy/" + 
f"?{rql_query}&http_accept=text/tsv", data_type="taxonomy" ) @@ -105,8 +105,10 @@ def parse_lineage_names_with_ranks(lineage_names, lineage_ranks, ranks): lineage_split = lineage_names.split(';') rank_split = lineage_ranks.split(';') - # Dictionary to map taxonomic ranks to their prefixes for the specified ranks - rank_to_prefix = {key: _allowed_ranks[key] for key in ranks if key in ranks} + # Dictionary to map taxonomic ranks to their prefixes for the specified + # ranks + rank_to_prefix = {key: _allowed_ranks[key] + for key in ranks if key in ranks} # Initialize the list for the parsed lineage parsed_lineage = [] @@ -119,13 +121,14 @@ def parse_lineage_names_with_ranks(lineage_names, lineage_ranks, ranks): else: pass - # Ensure all taxonomic levels are covered (fill in missing levels with just the - # prefix) + # Ensure all taxonomic levels are covered (fill in missing levels with + # just the prefix) final_lineage = [] for required_prefix in rank_to_prefix.values(): # Check if any parsed_lineage item starts with the required prefix match = next( - (item for item in parsed_lineage if item.startswith(required_prefix)), None) + (item for item in parsed_lineage + if item.startswith(required_prefix)), None) if match: final_lineage.append(match) else: @@ -137,10 +140,11 @@ def parse_lineage_names_with_ranks(lineage_names, lineage_ranks, ranks): def transform_taxonomy_df(df, ranks): # Apply the transformation - df['Taxon'] = df.apply( - lambda row: parse_lineage_names_with_ranks(lineage_names=row['lineage_names'], - lineage_ranks=row['lineage_ranks'], - ranks=ranks), axis=1) + df['Taxon'] = df.apply(lambda row: + parse_lineage_names_with_ranks( + lineage_names=row['lineage_names'], + lineage_ranks=row['lineage_ranks'], + ranks=ranks), axis=1) # Rename columns and set index df = df.rename(columns={'taxon_id': 'Feature ID'}) @@ -152,8 +156,7 @@ def transform_taxonomy_df(df, ranks): def fetch_genome_features_bv_brc( rql_query: str = None, feature_ids: list = None, -) -> 
def convert_fasta_to_uppercase(fasta_string):
    """Return ``fasta_string`` with all sequence lines in upper case.

    Lines that start with ">" are treated as FASTA header lines and are
    copied through unchanged; every other line is upper-cased.

    Line terminators are preserved exactly as they appear in the input,
    including a trailing newline at the end of the text, so the result can
    be written to a file without losing the final line break.

    Parameters
    ----------
    fasta_string : str
        FASTA formatted text.

    Returns
    -------
    str
        The same text with sequence characters converted to upper case.
    """
    # keepends=True keeps each line's own terminator attached, so joining
    # with the empty string reproduces the original layout byte-for-byte
    # apart from the case of the sequence characters.
    return "".join(
        line if line.startswith(">") else line.upper()
        for line in fasta_string.splitlines(keepends=True)
    )
" + f"\nAllowed fields for data type {data_type}: " + f"\n{data_fields[data_type]}" ) # Handling any other errors that start with "A Database Error Occured:" @@ -263,11 +287,13 @@ def error_handling(response, data_type): raise ValueError(response.text) -def id_list_handling(rql_query: str, ids: list, parameter_name: str, data_field: str): +def id_list_handling(rql_query: str, ids: list, parameter_name: str, + data_field: str): # Error if rql_query and ids parameters are given if rql_query and ids: - raise ValueError(f"Parameters rql_query and {parameter_name} can't be used " - "simultaneously.") + raise ValueError( + f"Parameters rql_query and {parameter_name} can't be used " + "simultaneously.") # Error if rql_query and ids parameters are not given elif not rql_query and not ids: diff --git a/rescript/tests/test_bv_brc.py b/rescript/tests/test_bv_brc.py index 48d07e6..a28afd3 100644 --- a/rescript/tests/test_bv_brc.py +++ b/rescript/tests/test_bv_brc.py @@ -9,15 +9,16 @@ from unittest.mock import Mock, patch, mock_open, MagicMock import pandas as pd -from q2_types.feature_data import MixedCaseDNAFASTAFormat, ProteinFASTAFormat, \ - TSVTaxonomyDirectoryFormat +from q2_types.feature_data import (MixedCaseDNAFASTAFormat, + ProteinFASTAFormat, + TSVTaxonomyDirectoryFormat) from q2_types.genome_data import GenomeSequencesDirectoryFormat from qiime2.plugin.testing import TestPluginBase from rescript.bv_brc import fetch_genomes_bv_brc, fetch_metadata_bv_brc, \ fetch_genome_features_bv_brc, fetch_taxonomy_bv_brc, id_list_handling, \ error_handling, download_data, json_to_fasta, transform_taxonomy_df, \ - parse_lineage_names_with_ranks + parse_lineage_names_with_ranks, convert_fasta_to_uppercase class TestIDListHandling(TestPluginBase): @@ -25,8 +26,8 @@ class TestIDListHandling(TestPluginBase): def test_error_both_parameters_given(self): with self.assertRaisesRegex(ValueError, - "Parameters rql_query and ids can't be used " - "simultaneously."): + "Parameters rql_query 
and ids can't be " + "used simultaneously."): id_list_handling(rql_query="some_query", ids=[1, 2, 3], parameter_name="ids", @@ -34,8 +35,8 @@ def test_error_both_parameters_given(self): def test_error_neither_parameter_given(self): with self.assertRaisesRegex(ValueError, - "At least one of the parameters rql_query and ids " - "has to be given."): + "At least one of the parameters rql_query " + "and ids has to be given."): id_list_handling(rql_query="", ids=[], parameter_name="ids", @@ -97,7 +98,8 @@ class TestDownloadData(TestPluginBase): @patch('rescript.bv_brc.requests.get') @patch('rescript.bv_brc.error_handling') - def test_download_data_success(self, mock_error_handling, mock_requests_get): + def test_download_data_success(self, mock_error_handling, + mock_requests_get): # Mock the requests.get response for a successful request mock_response = Mock() mock_response.status_code = 200 @@ -113,7 +115,8 @@ def test_download_data_success(self, mock_error_handling, mock_requests_get): @patch('rescript.bv_brc.requests.get') @patch('rescript.bv_brc.error_handling') - def test_download_data_error_400(self, mock_error_handling, mock_requests_get): + def test_download_data_error_400(self, mock_error_handling, + mock_requests_get): # Mock the requests.get response for a 400 Bad Request mock_response = Mock() mock_response.status_code = 400 @@ -129,7 +132,8 @@ def test_download_data_error_400(self, mock_error_handling, mock_requests_get): @patch('rescript.bv_brc.requests.get') @patch('rescript.bv_brc.error_handling') - def test_download_data_other_error(self, mock_error_handling, mock_requests_get): + def test_download_data_other_error(self, mock_error_handling, + mock_requests_get): # Mock the requests.get response for any other error mock_response = Mock() mock_response.status_code = 500 @@ -188,8 +192,10 @@ def test_json_to_fasta_multiple_genomes(self, mock_file): json_to_fasta(self.json_input_1 + self.json_input_2, "/fake/dir") # Expected FASTA content - 
expected_fasta_genome1 = ">accn|acc1 desc1 [genome_name1 | genome1]\nATGC" - expected_fasta_genome2 = ">accn|acc2 desc2 [genome_name2 | genome2]\nCGTA" + expected_fasta_genome1 = (">accn|acc1 desc1 [genome_name1 | " + "genome1]\nATGC") + expected_fasta_genome2 = (">accn|acc2 desc2 [genome_name2 | " + "genome2]\nCGTA") # Check if the files were created with the correct path and content mock_file().write.assert_any_call(expected_fasta_genome1) @@ -197,12 +203,13 @@ def test_json_to_fasta_multiple_genomes(self, mock_file): @patch('rescript.bv_brc.open', new_callable=mock_open) def test_json_to_fasta_multiple_sequences_same_genome(self, mock_file): - json_to_fasta(self.json_input_1 + self.json_input_1, "/fake/dir") # Expected FASTA content - expected_fasta = (">accn|acc1 desc1 [genome_name1 | genome1]\nATGC\n" - ">accn|acc1 desc1 [genome_name1 | genome1]\nATGC") + expected_fasta = (">accn|acc1 desc1 [genome_name1 | " + "genome1]\nATGC\n" + ">accn|acc1 desc1 [genome_name1 | " + "genome1]\nATGC") # Check if the file was created with the correct path and content mock_file.assert_called_once_with("/fake/dir/genome1.fasta", 'w') @@ -221,13 +228,15 @@ def test_fetch_genome_features_bv_brc( mock_download_data ): # Mock the id_list_handling function - mock_id_list_handling.return_value = "in(feature_id, (feature1,feature2))" + mock_id_list_handling.return_value = ("in(feature_id, " + "(feature1,feature2))") # Mock the responses from download_data mock_genes_response = MagicMock() mock_genes_response.text = ">gene1\nATGC\n>gene2\nATGC" mock_proteins_response = MagicMock() - mock_proteins_response.text = ">protein1\nMVLSPADKTNVK\n>protein2\nMVLSPADKTNVK" + mock_proteins_response.text = ( + ">protein1\nMVLSPADKTNVK\n>protein2\nMVLSPADKTNVK") mock_download_data.side_effect = [mock_genes_response, mock_proteins_response] @@ -235,7 +244,8 @@ def test_fetch_genome_features_bv_brc( mock_genes_file = MagicMock() mock_protein_file = MagicMock() 
def test_convert_fasta_to_uppercase(self):
    # Header lines (starting with ">") must stay untouched while every
    # sequence line is converted to upper case.
    lowercase_fasta = ">header1\natgca\ngtacg\n>header2\nttgaa\ncctg"
    uppercase_fasta = ">header1\nATGCA\nGTACG\n>header2\nTTGAA\nCCTG"

    self.assertEqual(
        convert_fasta_to_uppercase(lowercase_fasta),
        uppercase_fasta,
    )
+377,10 @@ def test_fetch_metadata_bv_brc(self, mock_download_data, args, kwargs = mock_read_csv.call_args self.assertEqual(kwargs['sep'], '\t') - self.assertEqual(args[0].getvalue(), - "id\tcolumn1\tcolumn2\n1\tdata1\tdata2\n2\tdata3\tdata4") + self.assertEqual(args[0].getvalue(), "id\tcolumn1\tcolumn2\n1\tdata1" + "\tdata2\n2\tdata3\tdata4") mock_metadata.assert_called_once_with(mock_df) - self.assertEqual(result, mock_metadata_instance) class TestFetchTaxonomyBVBR(TestPluginBase): @@ -374,7 +392,7 @@ class TestFetchTaxonomyBVBR(TestPluginBase): @patch('rescript.bv_brc.pd.read_csv') @patch('rescript.bv_brc.id_list_handling') def test_fetch_taxonomy_bv_brc( - self, mock_id_list_handling, mock_read_csv, mock_download_data, + self, mock_id_list_handling, mock_read_csv, mock_download_data, mock_transform_taxonomy_df, mock_to_csv ): # Mock the id_list_handling function @@ -382,7 +400,8 @@ def test_fetch_taxonomy_bv_brc( # Mock the download_data response mock_response = MagicMock() - mock_response.text = "id\trank1\trank2\n1\tdata1\tdata2\n2\tdata3\tdata4" + mock_response.text = ( + "id\trank1\trank2\n1\tdata1\tdata2\n2\tdata3\tdata4") mock_download_data.return_value = mock_response # Prepare mocks for file output @@ -402,8 +421,8 @@ def test_fetch_taxonomy_bv_brc( ) mock_download_data.assert_called_once_with( - url="https://www.bv-brc.org/api/taxonomy/?taxon_id=in(taxon1,taxon2)" - "&http_accept=text/tsv", + url="https://www.bv-brc.org/api/taxonomy/" + "?taxon_id=in(taxon1,taxon2)&http_accept=text/tsv", data_type="taxonomy" ) @@ -441,20 +460,23 @@ def test_parse_with_missing_ranks(self): lineage_ranks = "kingdom;phylum;family" ranks = ['kingdom', 'phylum', 'class', 'order', 'genus', 'species'] - result = parse_lineage_names_with_ranks(lineage_names, lineage_ranks, ranks) + result = parse_lineage_names_with_ranks(lineage_names, lineage_ranks, + ranks) expected = "k__Bacteria; p__Proteobacteria; c__; o__; g__; s__" self.assertEqual(result, expected) def 
test_parse_with_no_ranks_provided(self): - lineage_names = ("Bacteria;Proteobacteria;Gammaproteobacteria;Enterobacterales;" - "Enterobacteriaceae;Escherichia;coli") + lineage_names = ( + "Bacteria;Proteobacteria;Gammaproteobacteria;Enterobacterales;" + "Enterobacteriaceae;Escherichia;coli") lineage_ranks = "kingdom;phylum;class;order;family;genus;species" ranks = None # Should fall back to _default_ranks - result = parse_lineage_names_with_ranks(lineage_names, lineage_ranks, ranks) + result = parse_lineage_names_with_ranks(lineage_names, lineage_ranks, + ranks) expected = ("k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; " - "o__Enterobacterales; f__Enterobacteriaceae; g__Escherichia; " - "s__coli") + "o__Enterobacterales; f__Enterobacteriaceae; " + "g__Escherichia; s__coli") self.assertEqual(result, expected)