Skip to content

Commit

Permalink
lint
Browse files Browse the repository at this point in the history
  • Loading branch information
VinzentRisch committed Aug 20, 2024
1 parent 3a0daf1 commit 86427e1
Show file tree
Hide file tree
Showing 2 changed files with 118 additions and 70 deletions.
98 changes: 62 additions & 36 deletions rescript/bv_brc.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,10 @@
import qiime2
import pandas as pd
import requests
from q2_types.feature_data import (MixedCaseDNAFASTAFormat, ProteinFASTAFormat,
TSVTaxonomyDirectoryFormat)
from q2_types.genome_data import GenomeSequencesDirectoryFormat
from q2_types.feature_data import TSVTaxonomyDirectoryFormat
from q2_types.genome_data import (GenomeSequencesDirectoryFormat,
GenesDirectoryFormat,
ProteinsDirectoryFormat)

from rescript.ncbi import _allowed_ranks, _default_ranks
import json
Expand All @@ -22,7 +23,6 @@ def fetch_genomes_bv_brc(
rql_query: str = None,
genome_ids: list = None
) -> GenomeSequencesDirectoryFormat:

# Parameter validation
rql_query = id_list_handling(rql_query=rql_query,
ids=genome_ids,
Expand All @@ -46,10 +46,10 @@ def fetch_genomes_bv_brc(


def fetch_metadata_bv_brc(data_type: str, rql_query: str) -> qiime2.Metadata:

# Download data
response = download_data(
url=f"https://www.bv-brc.org/api/{data_type}/?{rql_query}&http_accept=text/tsv",
url=f"https://www.bv-brc.org/api/{data_type}/"
f"?{rql_query}&http_accept=text/tsv",
data_type=data_type
)

Expand All @@ -68,7 +68,6 @@ def fetch_taxonomy_bv_brc(
ranks: list = None,
taxon_ids: list = None,
) -> TSVTaxonomyDirectoryFormat:

# Parameter validation
rql_query = id_list_handling(rql_query=rql_query,
ids=taxon_ids,
Expand All @@ -81,7 +80,8 @@ def fetch_taxonomy_bv_brc(

# Get requests response
response = download_data(
url=f"https://www.bv-brc.org/api/taxonomy/?{rql_query}&http_accept=text/tsv",
url="https://www.bv-brc.org/api/taxonomy/"
f"?{rql_query}&http_accept=text/tsv",
data_type="taxonomy"
)

Expand All @@ -105,8 +105,10 @@ def parse_lineage_names_with_ranks(lineage_names, lineage_ranks, ranks):
lineage_split = lineage_names.split(';')
rank_split = lineage_ranks.split(';')

# Dictionary to map taxonomic ranks to their prefixes for the specified ranks
rank_to_prefix = {key: _allowed_ranks[key] for key in ranks if key in ranks}
# Dictionary to map taxonomic ranks to their prefixes for the specified
# ranks
rank_to_prefix = {key: _allowed_ranks[key]
for key in ranks if key in ranks}

# Initialize the list for the parsed lineage
parsed_lineage = []
Expand All @@ -119,13 +121,14 @@ def parse_lineage_names_with_ranks(lineage_names, lineage_ranks, ranks):
else:
pass

# Ensure all taxonomic levels are covered (fill in missing levels with just the
# prefix)
# Ensure all taxonomic levels are covered (fill in missing levels with
# just the prefix)
final_lineage = []
for required_prefix in rank_to_prefix.values():
# Check if any parsed_lineage item starts with the required prefix
match = next(
(item for item in parsed_lineage if item.startswith(required_prefix)), None)
(item for item in parsed_lineage
if item.startswith(required_prefix)), None)
if match:
final_lineage.append(match)
else:
Expand All @@ -137,10 +140,11 @@ def parse_lineage_names_with_ranks(lineage_names, lineage_ranks, ranks):

def transform_taxonomy_df(df, ranks):
# Apply the transformation
df['Taxon'] = df.apply(
lambda row: parse_lineage_names_with_ranks(lineage_names=row['lineage_names'],
lineage_ranks=row['lineage_ranks'],
ranks=ranks), axis=1)
df['Taxon'] = df.apply(lambda row:
parse_lineage_names_with_ranks(
lineage_names=row['lineage_names'],
lineage_ranks=row['lineage_ranks'],
ranks=ranks), axis=1)

# Rename columns and set index
df = df.rename(columns={'taxon_id': 'Feature ID'})
Expand All @@ -152,39 +156,58 @@ def transform_taxonomy_df(df, ranks):
def fetch_genome_features_bv_brc(
rql_query: str = None,
feature_ids: list = None,
) -> (MixedCaseDNAFASTAFormat, ProteinFASTAFormat):

) -> (GenesDirectoryFormat, ProteinsDirectoryFormat):
# Parameter validation
rql_query = id_list_handling(rql_query=rql_query,
ids=feature_ids,
parameter_name="feature_ids",
data_field="feature_id")

# Define output formats
genes = MixedCaseDNAFASTAFormat()
proteins = ProteinFASTAFormat()
genes = GenesDirectoryFormat()
proteins = ProteinsDirectoryFormat()

# Construct URLs for genes and proteins downloads
base_url = "https://www.bv-brc.org/api/genome_feature/?"
genes_url = base_url + f"{rql_query}&http_accept=application/dna+fasta"
proteins_url = base_url + f"{rql_query}&http_accept=application/protein+fasta"
proteins_url = base_url + (f"{rql_query}&http_accept=application/"
"protein+fasta")

# Get requests response for genes and proteins
response_genes = download_data(url=genes_url, data_type="genome_feature")
response_proteins = download_data(url=proteins_url, data_type="genome_feature")
response_proteins = download_data(url=proteins_url,
data_type="genome_feature")

genes_fasta_upper = convert_fasta_to_uppercase(response_genes.text)
proteins_fasta_upper = convert_fasta_to_uppercase(response_proteins.text)

# Save genes and proteins as FASTA files
fasta_genes = response_genes.text
with genes.open() as file:
file.write(fasta_genes)
with open(os.path.join(str(genes), "genes.fasta"), 'w') as fasta_file:
fasta_file.write(genes_fasta_upper)

fasta_proteins = response_proteins.text
with proteins.open() as file:
file.write(fasta_proteins)
with open(os.path.join(str(proteins), "proteins.fasta"),
'w') as fasta_file:
fasta_file.write(proteins_fasta_upper)

return genes, proteins


def convert_fasta_to_uppercase(fasta_string):
    """Return the FASTA text with every sequence line upper-cased.

    Lines that begin with ">" are treated as header lines and are kept
    exactly as they are; every other line is passed through ``str.upper``.

    The text is split with ``str.splitlines`` and re-joined with a single
    newline character, so any line-ending style in the input is normalised
    and a trailing newline at the end of the input is not retained.
    """
    return "\n".join(
        row if row.startswith(">") else row.upper()
        for row in fasta_string.splitlines()
    )


def json_to_fasta(json, output_dir):
# Dictionary to hold sequences grouped by genome_id
fasta_files = {}
Expand Down Expand Up @@ -229,8 +252,8 @@ def error_handling(response, data_type):
# No data found for query or incorrect RQL query
if response.text == "[]":
raise ValueError("No data could be retrieved. Either because of an "
"incorrect RQL query or because no data exists for the "
"query.")
"incorrect RQL query or because no data exists for "
"the query.")

elif response.text.startswith("A Database Error Occured:"):

Expand All @@ -248,8 +271,9 @@ def error_handling(response, data_type):
# Incorrect field for data type
elif response_dict['msg'].startswith("undefined field"):
raise ValueError(
f"Error code {response_dict['code']}: {response_dict['msg']}. \n"
f"Allowed fields for data type {data_type}: \n{data_fields[data_type]}"
f"Error code {response_dict['code']}: {response_dict['msg']}. "
f"\nAllowed fields for data type {data_type}: "
f"\n{data_fields[data_type]}"
)

# Handling any other errors that start with "A Database Error Occured:"
Expand All @@ -263,11 +287,13 @@ def error_handling(response, data_type):
raise ValueError(response.text)


def id_list_handling(rql_query: str, ids: list, parameter_name: str, data_field: str):
def id_list_handling(rql_query: str, ids: list, parameter_name: str,
data_field: str):
# Error if both the rql_query and ids parameters are given
if rql_query and ids:
raise ValueError(f"Parameters rql_query and {parameter_name} can't be used "
"simultaneously.")
raise ValueError(
f"Parameters rql_query and {parameter_name} can't be used "
"simultaneously.")

# Error if neither the rql_query nor the ids parameter is given
elif not rql_query and not ids:
Expand Down
Loading

0 comments on commit 86427e1

Please sign in to comment.