diff --git a/rescript/bv_brc.py b/rescript/bv_brc.py index 2bad090..c8d5ecb 100644 --- a/rescript/bv_brc.py +++ b/rescript/bv_brc.py @@ -8,137 +8,257 @@ from collections import OrderedDict from io import StringIO import os +from typing import Union +from urllib.parse import quote + +import numpy as np import qiime2 import pandas as pd import requests -from q2_types.feature_data import TSVTaxonomyDirectoryFormat, TSVTaxonomyFormat +from q2_types.feature_data import TSVTaxonomyFormat from q2_types.genome_data import (GenomeSequencesDirectoryFormat, GenesDirectoryFormat, - ProteinsDirectoryFormat) + ProteinsDirectoryFormat, LociDirectoryFormat) from rescript.ncbi import _allowed_ranks, _default_ranks import json -def fetch_metadata_bv_brc(data_type: str, rql_query: str) -> qiime2.Metadata: - # Download data - response = download_data( - url=f"https://www.bv-brc.org/api/{data_type}/" - f"?{rql_query}&http_accept=text/tsv", - ) - - # Convert data to data frame - tsv_data = StringIO(response.text) - metadata = pd.read_csv(tsv_data, sep='\t') +def get_bv_brc_metadata( + ids_metadata: Union[qiime2.NumericMetadataColumn, + qiime2.CategoricalMetadataColumn] = None, + data_type: str = None, + rql_query: str = None, + data_field: str = None, + ids: list = None, +) -> qiime2.Metadata: + # Parameter validation and creation of RQL query + rql_query = parameter_validation(rql_query=rql_query, + ids=ids, + data_type=data_type, + data_field=data_field, + metadata=ids_metadata) + + # Download metadata as df + metadata = download_data(data_type=data_type, + query=rql_query, + accept="text/tsv", + ) + + # Set index of metadata df metadata.index.name = "id" metadata.index = metadata.index.astype(str) - # Return as qiime2 metadata + # Replace empty values and values consisting of spaces only with np.nan + metadata.replace(to_replace=r'^\s*$', value=np.nan, regex=True, + inplace=True) + metadata.replace([None], np.nan, inplace=True) + return qiime2.Metadata(metadata) -def fetch_genomes_bv_brc( +def get_bv_brc_genomes( + ids_metadata: Union[qiime2.NumericMetadataColumn, + qiime2.CategoricalMetadataColumn] = None, rql_query: str = None, - genome_ids: list = None, + data_field: str = None, + ids: list = None, ranks: list = None, + rank_propagation: bool = True, ) -> (GenomeSequencesDirectoryFormat, TSVTaxonomyFormat): # Parameter validation - rql_query = id_list_handling(rql_query=rql_query, - ids=genome_ids, - parameter_name="genome_ids", - data_field="genome_id" - ) - - # Define output formats - genomes = GenomeSequencesDirectoryFormat() + rql_query = parameter_validation(rql_query=rql_query, + ids=ids, + data_type="genome_sequence", + data_field=data_field, + metadata=ids_metadata + ) # Get requests response for genome sequences - response_sequences = download_data( - url=f"https://www.bv-brc.org/api/genome_sequence/?{rql_query}", - ) - # Convert sequences in response_sequences JSON to FASTA - json_to_fasta(json=response_sequences.json(), output_dir=str(genomes)) + sequences = download_data(data_type="genome_sequence", + query=rql_query, + accept="application/json", + ) - taxonomy = get_taxonomy(bv_brc_response=response_sequences, ranks=ranks) + # Convert sequences in JSON to FASTA file + genomes = create_genome_fasta(genome_sequences=sequences) + + # Get taxonomy for sequences + taxonomy = get_taxonomy(response_sequences=sequences, + ranks=ranks, + rank_propagation=rank_propagation, + accession_name="accession") return genomes, taxonomy -def fetch_genome_features_bv_brc( +def get_bv_brc_genome_features( + ids_metadata: Union[qiime2.NumericMetadataColumn, + qiime2.CategoricalMetadataColumn] = None, rql_query: str = None, - taxon_ids: list = None, + data_field: str = None, + ids: list = None, ranks: list = None, + rank_propagation: bool = True, ) -> ( GenesDirectoryFormat, ProteinsDirectoryFormat, - TSVTaxonomyFormat): + TSVTaxonomyFormat, LociDirectoryFormat): # Parameter validation - rql_query = id_list_handling(rql_query=rql_query, - ids=taxon_ids, - parameter_name="feature_ids", - data_field="feature_id") + rql_query = parameter_validation(rql_query=rql_query, + ids=ids, + data_type="genome_feature", + data_field=data_field, + metadata=ids_metadata) + + # Download genome_features data object as JSON + genome_features = download_data(data_type="genome_feature", + query=rql_query, + accept="application/json", + select=["genome_id", "feature_id", + "aa_sequence_md5", + "na_sequence_md5", "taxon_id"] + ) + + # Download nucleotide and protein sequences for features + genes, proteins = get_sequences(genome_features=genome_features) + + # Download taxonomy for feature sequences + taxonomy = get_taxonomy(response_sequences=genome_features, + ranks=ranks, + rank_propagation=rank_propagation, + accession_name="feature_id" + ) + + # Download GFF files for genome ids + loci = get_loci(response_sequences=genome_features) + + return genes, proteins, taxonomy, loci + + +def get_sequences(genome_features): + genes = GenesDirectoryFormat() + proteins = ProteinsDirectoryFormat() + + # Extract md5 values for nucleotide and protein sequences from + # genome_features JSON + md5_ids = set([item[key] for item in genome_features for key in + ['aa_sequence_md5', 'na_sequence_md5'] if + key in item]) + + # Download sequences corresponding to the md5 values + feature_sequences = download_data(data_type="feature_sequence", + query=f"in(md5,({','.join(md5_ids)}))", + accept="application/json", + select=["md5", "sequence"]) + + # Create dict with md5 id as keys and sequence as value for faster look up + lookup_dict = {entry['md5']: entry['sequence'] for entry in + feature_sequences} + + fasta_genes = {} + fasta_proteins = {} + + # Loop through list of dicts + for entry in genome_features: + # Extract genome_id + genome_id = entry["genome_id"] + + if 'na_sequence_md5' in entry: + # Create FASTA entry for the sequence + fasta_na = (f">{entry['feature_id']}\n" + f"{lookup_dict[entry['na_sequence_md5']].upper()}\n") + + # Add FASTA entry to entries with the same genome id + fasta_genes[genome_id] = fasta_genes.get(genome_id, "") + fasta_na + + if 'aa_sequence_md5' in entry: + # Create FASTA entry for the sequence + fasta_aa = (f">{entry['feature_id']}\n" + f"{lookup_dict[entry['aa_sequence_md5']].upper()}\n") + + # Add FASTA entry to entries with the same genome id + fasta_proteins[genome_id] = (fasta_proteins.get(genome_id, "") + + fasta_aa) + + # Save genes and proteins as FASTA files one file per genome_id + for genome_id, fasta_sequences in fasta_genes.items(): + with open(os.path.join(str(genes), f"{genome_id}.fasta"), + 'w') as fasta_file: + fasta_file.write(fasta_sequences) - genes = get_sequences(rql_query=rql_query, sequence_type="genes") - proteins = get_sequences(rql_query=rql_query, sequence_type="proteins") + for genome_id, fasta_sequences in fasta_proteins.items(): + with open(os.path.join(str(proteins), f"{genome_id}.fasta"), + 'w') as fasta_file: + fasta_file.write(fasta_sequences) - response_features = download_data( - url=f"https://www.bv-brc.org/api/genome_feature/?{rql_query}" - ) - taxonomy = get_taxonomy(bv_brc_response=response_features, ranks=ranks) + return genes, proteins - return genes, proteins, taxonomy -def get_sequences(rql_query, sequence_type): - base_url = ("https://www.bv-brc.org/api/genome_feature/?" - f"&limit(1000000000){rql_query}&http_accept=application/") +def get_loci(response_sequences): + # Init loci dir format + loci = LociDirectoryFormat() - if sequence_type == "genes": - dir_format = GenesDirectoryFormat() - url = base_url + "dna+fasta" - else: - dir_format = ProteinsDirectoryFormat() - url = base_url + "protein+fasta" + # Extract genome ids from genome feature JSON + genome_ids = set([str(entry['genome_id']) for entry in response_sequences]) - # Get requests response - response = download_data(url=url) + # Download GFF files for all genome ids. For every id there is a separate + # request + for genome_id in genome_ids: + gff = download_data(data_type="genome_feature", + query=f"eq(genome_id,{genome_id})", + accept="application/gff", + ) - # Convert all sequences to upper case characters to conform with - # QIIME formats - sequences_fasta = parse_fasta_to_dict(fasta_string=response.text) + # Process the loci data and write to file + with open(os.path.join(str(loci), f"{genome_id}.gff"), 'w') as file: + file.write(process_loci(gff_string=gff)) - # Save genes and proteins as FASTA files one file per genome_id - for genome_id, fasta_sequences in sequences_fasta.items(): - with open(os.path.join(str(dir_format), f"{genome_id}.fasta"), - 'w') as fasta_file: - fasta_file.write(fasta_sequences) + return loci - return dir_format -# def get_loci(rql_query): +def process_loci(gff_string): + # Split the string into lines + lines = gff_string.splitlines() + modified_lines = [] + for line in lines: + if line.startswith("#"): + # Keep header lines unchanged + modified_lines.append(line) + else: + # Remove the first five characters "accn|" + modified_lines.append(line[5:]) -def get_taxonomy(bv_brc_response, ranks): - # Extract all taxon_ids from bv_brc_response JSON - taxon_ids = [str(entry['taxon_id']) for entry in bv_brc_response.json()] + # Join the lines back into a single string + return "\n".join(modified_lines) - # Get requests response for taxonomies - response_taxonomy = download_data( - url="https://www.bv-brc.org/api/taxonomy/" - f"?in(taxon_id,({','.join(taxon_ids)}))&http_accept=text/tsv", - ) + +def get_taxonomy(response_sequences, ranks, rank_propagation, accession_name): + # Extract all taxon_ids from response_sequences JSON + taxon_ids = set([str(entry['taxon_id']) for entry in response_sequences]) + + # Download taxonomy for all taxon_ids as df + taxonomy = download_data(data_type="taxonomy", + query=f"in(taxon_id,({','.join(taxon_ids)}))", + accept="text/tsv", + select=["taxon_id", "lineage_names", + "lineage_ranks"]) # Transform the df to conform with TSVTaxonomyFormat return create_taxonomy( - response_taxonomy=response_taxonomy, - response_sequences=bv_brc_response.json(), - ranks=ranks + taxonomy_bvbrc=taxonomy, + response_sequences=response_sequences, + ranks=ranks, + rank_propagation=rank_propagation, + accession_name=accession_name, ) -def parse_lineage_names_with_ranks( +def create_taxonomy_entry( lineage_names, lineage_ranks, ranks=None, - rank_propagation=False + rank_propagation=True ): # Set ranks to default if no list is specified if not ranks: @@ -159,16 +279,19 @@ def parse_lineage_names_with_ranks( # Handle genus and species splitting logic if 'genus' in ranks and taxonomy.get('species'): species = taxonomy.get('species') + # If genus includes the species name cut it out and put it into species if taxonomy.get('genus'): if species.startswith(taxonomy['genus'] + ' '): species = species[len(taxonomy['genus']) + 1:] taxonomy['species'] = species + # If species includes the genus name cut it out and put it into genus elif ' ' in species: genus, species = species.split(' ', 1) taxonomy['genus'] = genus taxonomy['species'] = species - # Apply rank propagation if enabled + # Apply rank propagation if enabled. Assignes last higher rank to lower + # undefined ranks if rank_propagation: last_label = None for rank in taxonomy: @@ -176,23 +299,23 @@ def parse_lineage_names_with_ranks( taxonomy[rank] = last_label last_label = taxonomy[rank] - result = '; '.join( + # Create taxonomy entry + taxonomy_entry = '; '.join( f"{_allowed_ranks.get(rank, '')}{name if name else ''}" for rank, name in taxonomy.items() ) - return result - + return taxonomy_entry -def create_taxonomy(response_taxonomy, response_sequences, ranks, data_type): - # Read the taxonomy TSV data into a DataFrame - taxonomy_bvbrc = pd.read_csv(StringIO(response_taxonomy.text), sep='\t') - # Apply the transformation +def create_taxonomy(taxonomy_bvbrc, response_sequences, + ranks, rank_propagation, accession_name): + # Create qiime style taxonomy entries for all rows in taxonomy_bvbrc taxonomy_bvbrc['Taxon'] = ( taxonomy_bvbrc.apply(lambda row: - parse_lineage_names_with_ranks( + create_taxonomy_entry( lineage_names=row['lineage_names'], lineage_ranks=row['lineage_ranks'], + rank_propagation=rank_propagation, ranks=ranks), axis=1)) taxonomy_df = pd.DataFrame(columns=['Feature ID', 'Taxon']) @@ -200,11 +323,8 @@ def create_taxonomy(response_taxonomy, response_sequences, ranks, data_type): # Loop through each JSON dictionary in the list for entry in response_sequences: # Get the accession and taxon_id from the JSON dictionary - if data_type == "genomes": - accession = entry.get('accession') - else: - accession = entry.get('refseq_locus_tag') - taxon_id = entry.get('taxon_id') + accession = entry.get(accession_name) + taxon_id = str(entry.get('taxon_id')) # Look up the corresponding taxon in taxonomy_bvbrc using taxon_id taxon_name = taxonomy_bvbrc.loc[ @@ -230,77 +350,117 @@ def create_taxonomy(response_taxonomy, response_sequences, ranks, data_type): return taxonomy -def parse_fasta_to_dict(fasta_string): - # Creates a dict with genome_id as keys and the corresponding FASTA - # entries in upper case - fasta_dict = {} - - genome_id = None - for line in fasta_string.splitlines(): - if line.startswith(">"): - # Split the header and rearrange so the NCBI accession is first - parts = line.split('|') - rearranged_header = f">{parts[2]}|{parts[0][1:]}|{parts[1]}" - - # Append any remaining parts at the end - if len(parts) > 3: - remaining_parts = '|'.join(parts[3:]) - rearranged_header = f"{rearranged_header}|{remaining_parts}" - - # Extract the genome ID from the header - genome_id = parts[-1][:-1].strip() - - if genome_id not in fasta_dict: - # Start a new entry with the header - fasta_dict[genome_id] = rearranged_header + "\n" - else: - # Append the header to the existing entry - fasta_dict[genome_id] += rearranged_header + "\n" - else: - # Append the sequence line in uppercase - fasta_dict[genome_id] += line.upper() + "\n" - - return fasta_dict - +def create_genome_fasta(genome_sequences): + genomes = GenomeSequencesDirectoryFormat() -def json_to_fasta(json, output_dir): # Dictionary to hold sequences grouped by genome_id - fasta_files = {} + fasta_entries = {} - # Loop over all entries in dict - for entry in json: + # Loop over all dicts in genome_sequences + for entry in genome_sequences: genome_id = entry['genome_id'] - if genome_id not in fasta_files: - fasta_files[genome_id] = [] + if genome_id not in fasta_entries: + fasta_entries[genome_id] = [] # Construct FASTA format to be identical to BV-BRC FASTA headers header = (f">{entry['accession']} {entry['description']} " f"[{entry['genome_name']} | {entry['genome_id']}]") - fasta_files[genome_id].append(f"{header}\n{entry['sequence'].upper()}") + # Append FASTA entry to fasta_entries + fasta_entries[genome_id].append( + f"{header}\n{entry['sequence'].upper()}") # Write each genome_id's sequences to a separate FASTA file - for genome_id, sequences in fasta_files.items(): + for genome_id, sequences in fasta_entries.items(): fasta_content = "\n".join(sequences) - fasta_filename = os.path.join(output_dir, f"{genome_id}.fasta") + fasta_filename = os.path.join(str(genomes), f"{genome_id}.fasta") with open(fasta_filename, 'w') as fasta_file: fasta_file.write(fasta_content) + return genomes + + +def download_data(data_type, query, accept, select=None): + # URL and headers for requests + base_url = "https://www.bv-brc.org/api/" + url = base_url + data_type + "/" + headers = {'Content-Type': 'application/rqlquery+x-www-form-urlencoded', + 'ACCEPT': accept} + + results = [] if accept == "application/json" else pd.DataFrame() + + # BV-BRC sets an upper limit of 25000 entries that can be fetched at once + batch_size = 25000 + start = 0 + + while True: + # Create the data string with current batch start and limit + data = f"{query}&limit({batch_size},{start})" + + # Add select parameter to only download specified fields + if select: + data = data + f"&select({','.join(select)})" + + # POST request + response = requests.post(url=url, data=data, headers=headers) + a = len(response.json()) + # If the response is successful, process the data + if response.status_code == 200: + if accept == "application/json": + result = response.json() + results.extend(result) # Add the current batch to the list + elif accept == "text/tsv": + result = read_tsv_data_with_dtypes(response=response, + data_type=data_type) + + results = pd.concat([results, result], ignore_index=True) + else: # application == application/gff + return response.text + + # If the number of results is less than the batch size, break the + # loop + if len(result) < batch_size: + break + + # Increment the start for the next batch + start += batch_size + + # Handle errors + elif response.status_code == 400: + error_handling(response=response, data_type=data_type) + break + else: + raise ValueError(response.text) -def download_data(url): - # Get requests response - response = requests.get(url) + return results - # If response is correct return it - if response.status_code == 200: - return response - # Error handling if response incorrect - elif response.status_code == 400: - error_handling(response=response, data_type=url.split('/')[4]) - else: - raise ValueError(response.text) +def read_tsv_data_with_dtypes(response, data_type): + tsv_data = StringIO(response.text) + + # Read only the header to get a list of the column names + columns_in_file = pd.read_csv(tsv_data, sep='\t', nrows=0).columns.tolist() + + # Filter the dtype dictionary to include only the columns that + # exist in the file + dtype_dict = data_fields_bvbrc.get(data_type, {}) + filtered_dtype_dict = {col: dtype_dict[col] for col in columns_in_file if + col in dtype_dict} + + # Move the file pointer to the beginning of the file-like object + tsv_data.seek(0) + + # Read the entire file with the filtered dtype dictionary + df = pd.read_csv(tsv_data, sep='\t', dtype=filtered_dtype_dict) + + # Raise value error if no data was retrieved + if len(df) == 0: + raise ValueError("No data could be retrieved. Either because of an " + "incorrect RQL query or because no data exists for " + "the query.") + + return df def error_handling(response, data_type): @@ -328,7 +488,7 @@ def error_handling(response, data_type): raise ValueError( f"Error code {response_dict['code']}: {response_dict['msg']}. " f"\nAllowed fields for data type {data_type}: " - f"\n{data_fields[data_type]}" + f"\n{list(data_fields_bvbrc[data_type].keys())}" ) # Handling any other errors that start with "A Database Error Occured:" @@ -342,1083 +502,1115 @@ def error_handling(response, data_type): raise ValueError(response.text) -def id_list_handling(rql_query: str, ids: list, parameter_name: str, - data_field: str): - # Error if rql_query and ids parameters are given - if rql_query and ids: +def parameter_validation(rql_query=None, + ids=None, + data_type=None, + data_field=None, + metadata=None): + # Error if any data_type is None + if not data_type: + raise ValueError("Parameter 'data-type' has to be specified.") + + local = locals().copy() + # Error if any other parameter is specified simultaneously with rql_query + # or metadata + for parameter_1 in ["rql_query", "metadata"]: + for parameter_2, value in local.items(): + if parameter_2 != parameter_1: + if parameter_2 != "data_type": + if local[parameter_1] is not None and value is not None: + raise ValueError( + f"Parameters '{parameter_1}' and '{parameter_2}' " + "can't be used simultaneously.") + + # Error if ids or data_fields is specified without the other + if (ids and not data_field) or (data_field and not ids): raise ValueError( - f"Parameters rql_query and {parameter_name} can't be used " - "simultaneously.") + "If parameter 'ids' is given, parameter 'data-field' has to be " + "specified and vice versa.") + + # Error if rql_query, metadata and ids parameters are not given + if not rql_query and not ids and not metadata: + raise ValueError("At least one of the parameters 'rql-query', 'ids' " + "or 'ids_metadata' has to be specified.") - # Error if rql_query and ids parameters are not given - elif not rql_query and not ids: - raise ValueError("At least one of the parameters rql_query and " - f"{parameter_name} has to be given.") + if metadata is not None: + data_field = metadata.to_series().name + ids = metadata.to_series() + + if (data_field is not None and + data_field not in data_fields_bvbrc[data_type].keys()): + raise ValueError( + f"The data-field '{data_field}' is not permitted for the " + f"data-type '{data_type}'.\nAllowed data fields are: " + f"{list(data_fields_bvbrc[data_type].keys())}") # Construct the RQL queries - elif ids: - rql_query = f"in({data_field},({','.join(map(str, ids))}))" + if ids is not None or metadata is not None: + # Join the quoted ids with commas + joined_ids = ','.join(quote(f'"{str(id_)}"') for id_ in ids) + # Final result + rql_query = f'in({quote(data_field)},({joined_ids}))' return rql_query -data_fields = { - "antibiotics": [ - "_version_", - "antibiotic_name", - "atc_classification", - "canonical_smiles", - "cas_id", - "date_inserted", - "date_modified", - "description", - "drugbank_interactions", - "inchi_key", - "isomeric_smiles", - "mechanism_of_action", - "molecular_formula", - "molecular_weight", - "pharmacological_classes", - "pharmacology", - "pubchem_cid", - "pubchem_cid_i", - "synonyms" - ], - "enzyme_class_ref": [ - "_version_", - "date_inserted", - "date_modified", - "ec_description", - "ec_number", - "go" - ], - "epitope": [ - "_version_", - "assay_results", - "bcell_assays", - "comments", - "date_inserted", - "date_modified", - "end", - "epitope_id", - "epitope_sequence", - "epitope_type", - "host_name", - "mhc_assays", - "organism", - "protein_accession", - "protein_id", - "protein_name", - "start", - "taxon_id", - "taxon_lineage_ids", - "taxon_lineage_names", - "tcell_assays", - "total_assays" - ], - "epitope_assay": [ - "_version_", - "assay_group", - "assay_id", - "assay_measurement", - "assay_measurement_unit", - "assay_method", - "assay_result", - "assay_type", - "authors", - "date_inserted", - "date_modified", - "end", - "epitope_id", - "epitope_sequence", - "epitope_type", - "host_name", - "host_taxon_id", - "mhc_allele", - "mhc_allele_class", - "organism", - "pdb_id", - "pmid", - "protein_accession", - "protein_id", - "protein_name", - "start", - "taxon_id", - "taxon_lineage_ids", - "taxon_lineage_names", - "title" - ], - "experiment": [ - "_version_", - "additional_data", - "additional_metadata", - "biosets", - "date_inserted", - "date_modified", - "detection_instrument", - "doi", - "exp_description", - "exp_id", - "exp_name", - "exp_poc", - "exp_protocol", - "exp_title", - "exp_type", - "experimenters", - "genome_id", - "measurement_technique", - "organism", - "pmid", - "public_identifier", - "public_repository", - "samples", - "strain", - "study_description", - "study_institution", - "study_name", - "study_pi", - "study_title", - "taxon_id", - "taxon_lineage_ids", - "treatment_amount", - "treatment_duration", - "treatment_name", - "treatment_type" - ], - "bioset": [ - "_version_", - "additional_data", - "additional_metadata", - "analysis_group_1", - "analysis_group_2", - "analysis_method", - "bioset_criteria", - "bioset_description", - "bioset_id", - "bioset_name", - "bioset_result", - "bioset_type", - "date_inserted", - "date_modified", - "entity_count", - "entity_type", - "exp_id", - "exp_name", - "exp_title", - "exp_type", - "genome_id", - "organism", - "protocol", - "result_type", - "strain", - "study_description", - "study_institution", - "study_name", - "study_pi", - "study_title", - "taxon_id", - "taxon_lineage_ids", - "treatment_amount", - "treatment_duration", - "treatment_name", - "treatment_type" - ], - "bioset_result": [ - "_version_", - "bioset_description", - "bioset_id", - "bioset_name", - "bioset_type", - "counts", - "date_inserted", - "date_modified", - "entity_id", - "entity_name", - "entity_type", - "exp_id", - "exp_name", - "exp_title", - "exp_type", - "feature_id", - "fpkm", - "gene", - "gene_id", - "genome_id", - "id", - "locus_tag", - "log2_fc", - "organism", - "other_ids", - "other_value", - "p_value", - "patric_id", - "product", - "protein_id", - "result_type", - "strain", - "taxon_id", - "tpm", - "treatment_amount", - "treatment_duration", - "treatment_name", - "treatment_type", - "uniprot_id", - "z_score" - ], - "gene_ontology_ref": [ - "_version_", - "date_inserted", - "date_modified", - "definition", - "go_id", - "go_name", - "ontology" - ], - "genome": [ - "_version_", - "additional_metadata", - "altitude", - "antimicrobial_resistance", - "antimicrobial_resistance_evidence", - "assembly_accession", - "assembly_method", - "authors", - "bioproject_accession", - "biosample_accession", - "biovar", - "body_sample_site", - "body_sample_subsite", - "cds", - "cds_ratio", - "cell_shape", - "checkm_completeness", - "checkm_contamination", - "chromosomes", - "clade", - "class", - "coarse_consistency", - "collection_date", - "collection_year", - "comments", - "common_name", - "completion_date", - "contig_l50", - "contig_n50", - "contigs", - "core_families", - "core_family_ratio", - "culture_collection", - "date_inserted", - "date_modified", - "depth", - "disease", - "family", - "fine_consistency", - "gc_content", - "genbank_accessions", - "genome_id", - "genome_length", - "genome_name", - "genome_quality", - "genome_quality_flags", - "genome_status", - "genus", - "geographic_group", - "geographic_location", - "gram_stain", - "h1_clade_global", - "h1_clade_us", - "h3_clade", - "h5_clade", - "h_type", - "habitat", - "host_age", - "host_common_name", - "host_gender", - "host_group", - "host_health", - "host_name", - "host_scientific_name", - "hypothetical_cds", - "hypothetical_cds_ratio", - "isolation_comments", - "isolation_country", - "isolation_site", - "isolation_source", - "kingdom", - "lab_host", - "latitude", - "lineage", - "longitude", - "mat_peptide", - "missing_core_family_ids", - "mlst", - "motility", - "n_type", - "ncbi_project_id", - "nearest_genomes", - "optimal_temperature", - "order", - "organism_name", - "other_clinical", - "other_environmental", - "other_names", - "other_typing", - "outgroup_genomes", - "owner", - "oxygen_requirement", - "p2_genome_id", - "partial_cds", - "partial_cds_ratio", - "passage", - "pathovar", - "patric_cds", - "ph1n1_like", - "phenotype", - "phylum", - "plasmids", - "plfam_cds", - "plfam_cds_ratio", - "public", - "publication", - "reference_genome", - "refseq_accessions", - "refseq_cds", - "refseq_project_id", - "rrna", - "salinity", - "season", - "segment", - "segments", - "sequencing_centers", - "sequencing_depth", - "sequencing_platform", - "sequencing_status", - "serovar", - "species", - "sporulation", - "sra_accession", - "state_province", - "strain", - "subclade", - "subtype", - "superkingdom", - "taxon_id", - "taxon_lineage_ids", - "taxon_lineage_names", - "temperature_range", - "trna", - "type_strain", - "user_read", - "user_write" - ], - "strain": [ - "1_pb2", - "2_pb1", - "3_pa", - "4_ha", - "5_np", - "6_na", - "7_mp", - "8_ns", - "_version_", - "collection_date", - "collection_year", - "date_inserted", - "date_modified", - "family", - "genbank_accessions", - "genome_ids", - "genus", - "geographic_group", - "h_type", - "host_common_name", - "host_group", - "host_name", - "id", - "isolation_country", - "l", - "lab_host", - "m", - "n_type", - "other_segments", - "owner", - "passage", - "public", - "s", - "season", - "segment_count", - "species", - "status", - "strain", - "subtype", - "taxon_id", - "taxon_lineage_ids", - "taxon_lineage_names", - "user_read", - "user_write" - ], - "genome_amr": [ - "_version_", - "antibiotic", - "computational_method", - "computational_method_performance", - "computational_method_version", - "date_inserted", - "date_modified", - "evidence", - "genome_id", - "genome_name", - "id", - "laboratory_typing_method", - "laboratory_typing_method_version", - "laboratory_typing_platform", - "measurement", - "measurement_sign", - "measurement_unit", - "measurement_value", - "owner", - "pmid", - "public", - "resistant_phenotype", - "source", - "taxon_id", - "testing_standard", - "testing_standard_year", - "user_read", - "user_write", - "vendor" - ], - "feature_sequence": [ - "_version_", - "date_inserted", - "date_modified", - "md5", - "sequence", - "sequence_type" - ], - "genome_feature": [ - "aa_length", - "aa_sequence_md5", - "accession", - "alt_locus_tag", - "annotation", - "brc_id", - "classifier_round", - "classifier_score", - "codon_start", - "date_inserted", - "date_modified", - "end", - "feature_id", - "feature_type", - "figfam_id", - "gene", - "gene_id", - "genome_id", - "genome_name", - "go", - "location", - "na_length", - "na_sequence_md5", - "notes", - "og_id", - "owner", - "p2_feature_id", - "patric_id", - "pdb_accession", - "pgfam_id", - "plfam_id", - "product", - "property", - "protein_id", - "public", - "refseq_locus_tag", - "segments", - "sequence_id", - "sog_id", - "start", - "strand", - "taxon_id", - "uniprotkb_accession", - "user_read", - "user_write" - ], - "genome_sequence": [ - "_version_", - "accession", - "chromosome", - "date_inserted", - "date_modified", - "description", - "gc_content", - "genome_id", - "genome_name", - "gi", - "length", - "mol_type", - "owner", - "p2_sequence_id", - "plasmid", - "public", - "release_date", - "segment", - "sequence", - "sequence_id", - "sequence_md5", - "sequence_status", - "sequence_type", - "taxon_id", - "topology", - "user_read", - "user_write", - "version" - ], - "id_ref": [ - "_version_", - "date_inserted", - "date_modified", - "id", - "id_type", - "id_value", - "uniprotkb_accession" - ], - "misc_niaid_sgc": [ - "_version_", - "date_inserted", - "date_modified", - "gene_symbol_collection", - "genus", - "has_clones", - "has_proteins", - "selection_criteria", - "species", - "strain", - "target_id", - "target_status" - ], - "pathway": [ - "_version_", - "accession", - "alt_locus_tag", - "annotation", - "date_inserted", - "date_modified", - "ec_description", - "ec_number", - "feature_id", - "gene", - "genome_ec", - "genome_id", - "genome_name", - "id", - "owner", - "pathway_class", - "pathway_ec", - "pathway_id", - "pathway_name", - "patric_id", - "product", - "public", - "refseq_locus_tag", - "sequence_id", - "taxon_id", - "user_read", - "user_write" - ], - "pathway_ref": [ - "_version_", - "date_inserted", - "date_modified", - "ec_description", - "ec_number", - "id", - "map_location", - "map_name", - "map_type", - "occurrence", - "pathway_class", - "pathway_id", - "pathway_name" - ], - "ppi": [ - "_version_", - "category", - "date_inserted", - "date_modified", - "detection_method", - "domain_a", - "domain_b", - "evidence", - "feature_id_a", - "feature_id_b", - "gene_a", - "gene_b", - "genome_id_a", - "genome_id_b", - "genome_name_a", - "genome_name_b", - "id", - "interaction_type", - "interactor_a", - "interactor_b", - "interactor_desc_a", - "interactor_desc_b", - "interactor_type_a", - "interactor_type_b", - "pmid", - "refseq_locus_tag_a", - "refseq_locus_tag_b", - "score", - "source_db", - "source_id", - "taxon_id_a", - "taxon_id_b" - ], - "protein_family_ref": [ - "_version_", - "date_inserted", - "date_modified", - "family_id", - "family_product", - "family_type" - ], - "sequence_feature": [ - "aa_sequence_md5", - "aa_variant", - "additional_metadata", - "comments", - "date_inserted", - "date_modified", - "end", - "evidence_code", - "feature_id", - "genbank_accession", - "gene", - "genome_id", - "genome_name", - "id", - "length", - "patric_id", - "product", - "publication", - "refseq_locus_tag", - "segment", - "segments", - "sf_category", - "sf_id", - "sf_name", - "sf_sequence", - "sf_sequence_md5", - "source", - "source_aa_sequence", - "source_id", - "source_sf_location", - "source_strain", - "start", - "subtype", - "taxon_id", - "variant_types" - ], - "sequence_feature_vt": [ - "additional_metadata", - "comments", - "date_inserted", - "date_modified", - "id", - "sf_category", - "sf_id", - "sf_name", - "sf_sequence", - "sf_sequence_md5", - "sfvt_genome_count", - "sfvt_genome_ids", - "sfvt_id", - "sfvt_sequence", - "sfvt_sequence_md5", - "sfvt_variations" - ], - "sp_gene": [ - "_version_", - "alt_locus_tag", - "antibiotics", - "antibiotics_class", - "classification", - "date_inserted", - "date_modified", - "e_value", - "evidence", - "feature_id", - "function", - "gene", - "genome_id", - "genome_name", - "id", - "identity", - "organism", - "owner", - "patric_id", - "pmid", - "product", - "property", - "property_source", - "public", - "query_coverage", - "refseq_locus_tag", - "same_genome", - "same_genus", - "same_species", - "source", - "source_id", - "subject_coverage", - "taxon_id", - "user_read", - "user_write" - ], - "sp_gene_ref": [ - "_version_", - "antibiotics", - "antibiotics_class", - "assertion", - "classification", - "date_inserted", - "date_modified", - "function", - "gene_id", - "gene_name", - "genus", - "gi", - "id", - "locus_tag", - "organism", - "pmid", - "product", - "property", - "source", - "source_id", - "species" - ], - "spike_lineage": [ - "_version_", - "country", - "date_inserted", - "date_modified", - "growth_rate", - "id", - "lineage", - "lineage_count", - "lineage_of_concern", - "month", - "prevalence", - "region", - "sequence_features", - "total_isolates" - ], - "spike_variant": [ - "_version_", - "aa_variant", - "country", - "date_inserted", - "date_modified", - "growth_rate", - "id", - "lineage_count", - "month", - "prevalence", - "region", - "sequence_features", - "total_isolates" - ], - "structured_assertion": [ - "_version_", - "comment", - "date_inserted", - "date_modified", - "evidence_code", - "feature_id", - "id", - "owner", - "patric_id", - "pmid", - "property", - "public", - "refseq_locus_tag", - "score", - "source", - "user_read", - "user_write", - "value" - ], - "subsystem": [ - "_version_", - "active", - "class", - "date_inserted", - "date_modified", - "feature_id", - "gene", - "genome_id", - "genome_name", - "id", - "owner", - "patric_id", - "product", - "public", - "refseq_locus_tag", - "role_id", - "role_name", - "subclass", - "subsystem_id", - "subsystem_name", - "superclass", - "taxon_id", - "user_read", - "user_write" - ], - "subsystem_ref": [ - "_version_", - "class", - "date_inserted", - "date_modified", - "description", - "id", - "notes", - "pmid", - "role_id", - "role_name", - "subclass", - "subsystem_id", - "subsystem_name", - "superclass" - ], - "taxonomy": [ - "_version_", - "cds_mean", - "cds_sd", - "core_families", - "core_family_ids", - "description", - "division", - "genetic_code", - "genome_count", - "genome_length_mean", - "genome_length_sd", - "genomes", - "genomes_f", - "hypothetical_cds_ratio_mean", - "hypothetical_cds_ratio_sd", - "lineage", - "lineage_ids", - "lineage_names", - "lineage_ranks", - "other_names", - "parent_id", - "plfam_cds_ratio_mean", - "plfam_cds_ratio_sd", - "taxon_id", - "taxon_id_i", - "taxon_name", - "taxon_rank" - ], - "protein_structure": [ - "alignments", - "authors", - "date_inserted", - "date_modified", - "feature_id", - "file_path", - "gene", - "genome_id", - "institution", - "method", - "organism_name", - "patric_id", - "pdb_id", - "pmid", - "product", - "release_date", - "resolution", - "sequence", - "sequence_md5", - "taxon_id", - "taxon_lineage_ids", - "taxon_lineage_names", - "title", - "uniprotkb_accession" - ], - "protein_feature": [ - "aa_sequence_md5", - "classification", - "comments", - "date_inserted", - "date_modified", - "description", - "e_value", - "end", - "evidence", - "feature_id", - "feature_type", - "gene", - "genome_id", - "genome_name", - "id", - "interpro_description", - "interpro_id", - "length", - "patric_id", - "product", - "publication", - "refseq_locus_tag", - "score", - "segments", - "sequence", - "source", - "source_id", - "start", - "taxon_id" - ], - "surveillance": [ - "additional_metadata", - "alcohol_or_other_drug_dependence", - "breastfeeding", - "chest_imaging_interpretation", - "chronic_conditions", - "collection_city", - "collection_country", - "collection_date", - "collection_latitude", - "collection_longitude", - "collection_poi", - "collection_season", - "collection_state_province", - "collection_year", - "collector_institution", - "collector_name", - "comments", - "contact_email_address", - "contributing_institution", - "date_inserted", - "date_modified", - "daycare_attendance", - "days_elapsed_to_disease_status", - "days_elapsed_to_sample_collection", - "days_elapsed_to_vaccination", - "diagnosis", - "dialysis", - "disease_severity", - "disease_status", - "duration_of_exposure", - "duration_of_treatment", - "ecmo", - "education", - "embargo_end_date", - "exposure", - "exposure_type", - "genome_id", - "geographic_group", - "hospitalization_duration", - "hospitalized", - "host_age", - "host_capture_status", - "host_common_name", - "host_ethnicity", - "host_group", - "host_habitat", - "host_health", - "host_height", - "host_id_type", - "host_identifier", - "host_natural_state", - "host_race", - "host_sex", - "host_species", - "host_weight", - "human_leukocyte_antigens", - "id", - "infections_within_five_years", - "influenza_like_illness_over_the_past_year", - "initiation_of_treatment", - "intensive_care_unit", - "last_update_date", - "longitudinal_study", - "maintenance_medication", - "nursing_home_residence", - "onset_hours", - "other_vaccinations", - "oxygen_saturation", - "packs_per_day_for_how_many_years", - "pathogen_test_interpretation", - "pathogen_test_result", - "pathogen_test_type", - "pathogen_type", - "post_visit_medications", - "pre_visit_medications", - "pregnancy", - "primary_living_situation", - "profession", - "project_identifier", - "sample_accession", - "sample_identifier", - "sample_material", - "sample_receipt_date", - "sample_transport_medium", - "sequence_accession", - "source_of_vaccine_information", - "species", - "strain", - "submission_date", - "subtype", - "sudden_onset", - "symptoms", - "taxon_lineage_ids", - "tobacco_use", - "travel_history", - "treatment", - "treatment_dosage", - "treatment_type", - "trimester_of_pregnancy", - "types_of_allergies", - "use_of_personal_protective_equipment", - "vaccination_type", - "vaccine_dosage", - "vaccine_lot_number", - "vaccine_manufacturer", - "ventilation" - ], - "serology": [ - "additional_metadata", - "collection_city", - "collection_country", - "collection_date", - "collection_state", - "collection_year", - "comments", - "contributing_institution", - "date_inserted", - "date_modified", - "genbank_accession", - "geographic_group", - "host_age", - "host_age_group", - "host_common_name", - "host_health", - "host_identifier", - "host_sex", - "host_species", - "host_type", - "id", - "positive_definition", - "project_identifier", - "sample_accession", - "sample_identifier", - "serotype", - "strain", - "taxon_lineage_ids", - "test_antigen", - "test_interpretation", - "test_pathogen", - "test_result", - "test_type", - "virus_identifier" - ] +data_fields_bvbrc = { + "antibiotics": { + "_version_": str, + "antibiotic_name": str, + "atc_classification": str, + "canonical_smiles": str, + "cas_id": str, + "date_inserted": str, + "date_modified": str, + "description": str, + "drugbank_interactions": str, + "inchi_key": str, + "isomeric_smiles": str, + "mechanism_of_action": str, + "molecular_formula": str, + "molecular_weight": str, + "pharmacological_classes": str, + "pharmacology": str, + "pubchem_cid": str, + "pubchem_cid_i": str, + "synonyms": str + }, + "enzyme_class_ref": { + "_version_": str, + "date_inserted": str, + "date_modified": str, + "ec_number": str, + "go": str + }, + "epitope": { + "_version_": str, + "assay_results": str, + "bcell_assays": str, + "comments": str, + "date_inserted": str, + "date_modified": str, + "end": float, + "epitope_id": str, + "epitope_sequence": str, + "epitope_type": str, + "host_name": str, + "mhc_assays": str, + "organism": str, + "protein_accession": str, + "protein_id": str, + "protein_name": str, + "start": float, + "taxon_id": str, + "taxon_lineage_ids": str, + "taxon_lineage_names": str, + "tcell_assays": str, + "total_assays": float + }, + "epitope_assay": { + "_version_": str, + "assay_group": str, + "assay_id": str, + "assay_measurement": str, + "assay_measurement_unit": str, + "assay_method": str, + "assay_result": str, + "assay_type": str, + "authors": str, + "date_inserted": str, + "date_modified": str, + "end": float, + "epitope_id": str, + "epitope_sequence": str, + "epitope_type": str, + "host_name": str, + "host_taxon_id": str, + "mhc_allele": str, + "mhc_allele_class": str, + "organism": str, + "pdb_id": str, + "pmid": str, + "protein_accession": str, + "protein_id": str, + "protein_name": str, + "start": float, + "taxon_id": str, + "taxon_lineage_ids": str, + "taxon_lineage_names": str, + "title": str + }, + "experiment": { + "_version_": str, + "additional_data": str, + "additional_metadata": str, + "biosets": float, + "date_inserted": str, + "date_modified": str, + "detection_instrument": str, + "doi": str, + "exp_description": str, + "exp_id": str, + "exp_name": str, + "exp_poc": str, + "exp_protocol": str, + "exp_title": str, + "exp_type": str, + "experimenters": str, + "genome_id": str, + "measurement_technique": str, + "organism": str, + "pmid": str, + "public_identifier": str, + "public_repository": str, + "samples": float, + "strain": str, + "study_description": str, + "study_institution": str, + "study_name": str, + "study_pi": str, + "study_title": str, + "taxon_id": str, + "taxon_lineage_ids": str, + "treatment_amount": str, + "treatment_duration": str, + "treatment_name": str, + "treatment_type": str + }, + "bioset": { + "_version_": str, + "additional_data": str, + "additional_metadata": str, + "analysis_group_1": str, + "analysis_group_2": str, + "analysis_method": str, + "bioset_criteria": str, + "bioset_description": str, + "bioset_id": str, + "bioset_name": str, + "bioset_result": str, + "bioset_type": str, + "date_inserted": str, + "date_modified": str, + "entity_count": str, + "entity_type": str, + "exp_id": str, + "exp_name": str, + "exp_title": str, + "exp_type": str, + "genome_id": str, + "organism": str, + "protocol": str, + "result_type": str, + "strain": str, + "study_description": str, + "study_institution": str, + "study_name": str, + "study_pi": str, + "study_title": str, + "taxon_id": str, + "taxon_lineage_ids": str, + "treatment_amount": str, + "treatment_duration": str, + "treatment_name": str, + "treatment_type": str + }, + "bioset_result": { + "_version_": str, + "bioset_description": str, + "bioset_id": str, + "bioset_name": str, + "bioset_type": str, + "counts": float, + "date_inserted": str, + "date_modified": str, + "entity_id": str, + "entity_name": str, + "entity_type": str, + "exp_id": str, + "exp_name": str, + "exp_title": str, + "exp_type": str, + "feature_id": str, + "fpkm": float, + "gene": str, + "gene_id": str, + "genome_id": str, + "id": str, + "locus_tag": str, + "log2_fc": float, + "organism": str, + "other_ids": str, + "other_value": float, + "p_value": float, + "patric_id": str, + "product": str, + "protein_id": str, + "result_type": str, + "strain": str, + "taxon_id": str, + "tpm": float, + "treatment_amount": str, + "treatment_duration": str, + "treatment_name": str, + "treatment_type": str, + "uniprot_id": str, + "z_score": float + }, + "gene_ontology_ref": { + "_version_": str, + "date_inserted": str, + "date_modified": str, + "definition": str, + "go_id": str, + "go_name": str, + "ontology": str + }, + "genome": { + "_version_": str, + "additional_metadata": str, + "altitude": str, + "antimicrobial_resistance": str, + "antimicrobial_resistance_evidence": str, + "assembly_accession": str, + "assembly_method": str, + "authors": str, + "bioproject_accession": str, + "biosample_accession": str, + "biovar": str, + "body_sample_site": str, + "body_sample_subsite": str, + "cds": float, + "cds_ratio": float, + "cell_shape": str, + "checkm_completeness": float, + "checkm_contamination": float, + "chromosomes": float, + "clade": str, + "class": str, + "coarse_consistency": float, + "collection_date": str, + "collection_year": float, + "comments": str, + "common_name": str, + "completion_date": str, + "contig_l50": float, + "contig_n50": float, + "contigs": float, + "core_families": float, + "core_family_ratio": float, + "culture_collection": str, + "date_inserted": str, + "date_modified": str, + "depth": str, + "disease": str, + "family": str, + "fine_consistency": float, + "gc_content": float, + "genbank_accessions": str, + "genome_id": str, + "genome_length": float, + "genome_name": str, + "genome_quality": str, + "genome_quality_flags": str, + "genome_status": str, + "genus": str, + "geographic_group": str, + "geographic_location": str, + "gram_stain": str, + "h1_clade_global": str, + "h1_clade_us": str, + "h3_clade": str, + "h5_clade": str, + "h_type": float, + "habitat": str, + "host_age": str, + "host_common_name": str, + "host_gender": str, + "host_group": str, + "host_health": str, + "host_name": str, + "host_scientific_name": str, + "hypothetical_cds": float, + "hypothetical_cds_ratio": float, + "isolation_comments": str, + "isolation_country": str, + "isolation_site": str, + "isolation_source": str, + "kingdom": str, + "lab_host": str, + "latitude": str, + "lineage": str, + "longitude": str, + "mat_peptide": float, + "missing_core_family_ids": str, + "mlst": str, + "motility": str, + "n_type": float, + "ncbi_project_id": str, + "nearest_genomes": str, + "optimal_temperature": str, + "order": str, + "organism_name": str, + "other_clinical": str, + "other_environmental": str, + "other_names": str, + "other_typing": str, + "outgroup_genomes": str, + "owner": str, + "oxygen_requirement": str, + "p2_genome_id": str, + "partial_cds": float, + "partial_cds_ratio": float, + "passage": str, + "pathovar": str, + "patric_cds": float, + "ph1n1_like": str, + "phenotype": str, + "phylum": str, + "plasmids": float, + "plfam_cds": float, + "plfam_cds_ratio": float, + "public": str, + "publication": str, + "reference_genome": str, + "refseq_accessions": str, + "refseq_cds": float, + "refseq_project_id": str, + "rrna": float, + "salinity": str, + "season": str, + "segment": str, + "segments": float, + "sequencing_centers": str, + "sequencing_depth": str, + "sequencing_platform": str, + "sequencing_status": str, + "serovar": str, + "species": str, + "sporulation": str, + "sra_accession": str, + "state_province": str, + "strain": str, + "subclade": str, + "subtype": str, + "superkingdom": str, + "taxon_id": str, + "taxon_lineage_ids": str, + "taxon_lineage_names": str, + "temperature_range": str, + "trna": float, + "type_strain": str, + "user_read": str, + "user_write": str + }, + "strain": { + "1_pb2": str, + "2_pb1": str, + "3_pa": str, + "4_ha": str, + "5_np": str, + "6_na": str, + "7_mp": str, + "8_ns": str, + "_version_": str, + "collection_date": str, + "collection_year": float, + "date_inserted": str, + "date_modified": str, + "family": str, + "genbank_accessions": str, + "genome_ids": str, + "genus": str, + "geographic_group": str, + "h_type": float, + "host_common_name": str, + "host_group": str, + "host_name": str, + "id": str, + "isolation_country": str, + "l": str, + "lab_host": str, + "m": str, + "n_type": float, + "other_segments": str, + "owner": str, + "passage": str, + "public": str, + "s": str, + "season": str, + "segment_count": float, + "species": str, + "status": str, + "strain": str, + "subtype": str, + "taxon_id": str, + "taxon_lineage_ids": str, + "taxon_lineage_names": str, + "user_read": str, + "user_write": str + }, + "genome_amr": { + "_version_": str, + "antibiotic": str, + "computational_method": str, + "computational_method_performance": str, + "computational_method_version": str, + "date_inserted": str, + "date_modified": str, + "evidence": str, + "genome_id": str, + "genome_name": str, + "id": str, + "laboratory_typing_method": str, + "laboratory_typing_method_version": str, + "laboratory_typing_platform": str, + "measurement": str, + "measurement_sign": str, + "measurement_unit": str, + "measurement_value": str, + "owner": str, + "pmid": str, + "public": str, + "resistant_phenotype": str, + "source": str, + "taxon_id": str, + "testing_standard": str, + "testing_standard_year": float, + "user_read": str, + "user_write": str, + "vendor": str + }, + "feature_sequence": { + "_version_": str, + "date_inserted": str, + "date_modified": str, + "md5": str, + "sequence": str, + "sequence_type": str + }, + "genome_feature": { + "aa_length": float, + "aa_sequence_md5": str, + "accession": str, + "alt_locus_tag": str, + "annotation": str, + "brc_id": str, + "classifier_round": float, + "classifier_score": float, + "codon_start": float, + "date_inserted": str, + "date_modified": str, + "end": float, + "feature_id": str, + "feature_type": str, + "figfam_id": str, + "gene": str, + "gene_id": str, + "genome_id": str, + "genome_name": str, + "go": str, + "location": str, + "na_length": float, + "na_sequence_md5": str, + "notes": str, + "og_id": str, + "owner": str, + "p2_feature_id": str, + "patric_id": str, + "pdb_accession": str, + "pgfam_id": str, + "plfam_id": str, + "product": str, + "property": str, + "protein_id": str, + "public": str, + "refseq_locus_tag": str, + "segments": str, + "sequence_id": str, + "sog_id": str, + "start": float, + "strand": str, + "taxon_id": str, + "uniprotkb_accession": str, + "user_read": str, + "user_write": str + }, + "genome_sequence": { + "_version_": str, + "accession": str, + "chromosome": str, + "date_inserted": str, + "date_modified": str, + "description": str, + "gc_content": float, + "genome_id": str, + "genome_name": str, + "gi": float, + "length": float, + "mol_type": str, + "owner": str, + "p2_sequence_id": str, + "plasmid": str, + "public": str, + "release_date": str, + "segment": str, + "sequence": str, + "sequence_id": str, + "sequence_md5": str, + "sequence_status": str, + "sequence_type": str, + "taxon_id": str, + "topology": str, + "user_read": str, + "user_write": str, + "version": str + }, + "id_ref": { + "_version_": str, + "date_inserted": str, + "date_modified": str, + "id": str, + "id_type": str, + "id_value": str, + "uniprotkb_accession": str + }, + "misc_niaid_sgc": { + "_version_": str, + "date_inserted": str, + "date_modified": str, + "gene_symbol_collection": str, + "genus": str, + "has_clones": str, + "has_proteins": str, + "selection_criteria": str, + "species": str, + "strain": str, + "target_id": str, + "target_status": str + }, + "pathway": { + "_version_": str, + "accession": str, + "alt_locus_tag": str, + "annotation": str, + "date_inserted": str, + "date_modified": str, + "ec_description": str, + "ec_number": str, + "feature_id": str, + "gene": str, + "genome_ec": str, + "genome_id": str, + "genome_name": str, + "id": str, + "owner": str, + "pathway_class": str, + "pathway_ec": str, + "pathway_id": str, + "pathway_name": str, + "patric_id": str, + "product": str, + "public": str, + "refseq_locus_tag": str, + "sequence_id": str, + "taxon_id": str, + "user_read": str, + "user_write": str + }, + "pathway_ref": { + "_version_": str, + "date_inserted": str, + "date_modified": str, + "ec_description": str, + "ec_number": str, + "id": str, + "map_location": str, + "map_name": str, + "map_type": str, + "occurrence": float, + "pathway_class": str, + "pathway_id": str, + "pathway_name": str + }, + "ppi": { + "_version_": str, + "category": str, + "date_inserted": str, + "date_modified": str, + "detection_method": str, + "domain_a": str, + "domain_b": str, + "evidence": str, + "feature_id_a": str, + "feature_id_b": str, + "gene_a": str, + "gene_b": str, + "genome_id_a": str, + "genome_id_b": str, + "genome_name_a": str, + "genome_name_b": str, + "id": str, + "interaction_type": str, + "interactor_a": str, + "interactor_b": str, + "interactor_desc_a": str, + "interactor_desc_b": str, + "interactor_type_a": str, + "interactor_type_b": str, + "pmid": str, + "refseq_locus_tag_a": str, + "refseq_locus_tag_b": str, + "score": str, + "source_db": str, + "source_id": str, + "taxon_id_a": str, + "taxon_id_b": str + }, + "protein_family_ref": { + "_version_": str, + "date_inserted": str, + "date_modified": str, + "family_id": str, + "family_product": str, + "family_type": str + }, + "sequence_feature": { + "aa_sequence_md5": str, + "aa_variant": str, + "additional_metadata": str, + "comments": str, + "date_inserted": str, + "date_modified": str, + "end": float, + "evidence_code": str, + "feature_id": str, + "genbank_accession": str, + "gene": str, + "genome_id": str, + "genome_name": str, + "id": str, + "length": float, + "patric_id": str, + "product": str, + "publication": str, + "refseq_locus_tag": str, + "segment": str, + "segments": str, + "sf_category": str, + "sf_id": str, + "sf_name": str, + "sf_sequence": str, + "sf_sequence_md5": str, + "source": str, + "source_aa_sequence": str, + "source_id": str, + "source_sf_location": str, + "source_strain": str, + "start": float, + "subtype": str, + "taxon_id": str, + "variant_types": str + }, + "sequence_feature_vt": { + "additional_metadata": str, + "comments": str, + "date_inserted": str, + "date_modified": str, + "id": str, + "sf_category": str, + "sf_id": str, + "sf_name": str, + "sf_sequence": str, + "sf_sequence_md5": str, + "sfvt_genome_count": str, + "sfvt_genome_ids": str, + "sfvt_id": str, + "sfvt_sequence": str, + "sfvt_sequence_md5": str, + "sfvt_variations": str + }, + "sp_gene": { + "_version_": str, + "alt_locus_tag": str, + "antibiotics": str, + "antibiotics_class": str, + "classification": str, + "date_inserted": str, + "date_modified": str, + "e_value": str, + "evidence": str, + "feature_id": str, + "function": str, + "gene": str, + "genome_id": str, + "genome_name": str, + "id": str, + "identity": float, + "organism": str, + "owner": str, + "patric_id": str, + "pmid": str, + "product": str, + "property": str, + "property_source": str, + "public": str, + "query_coverage": float, + "refseq_locus_tag": str, + "same_genome": float, + "same_genus": float, + "same_species": float, + "source": str, + "source_id": str, + "subject_coverage": float, + "taxon_id": str, + "user_read": str, + "user_write": str + }, + "sp_gene_ref": { + "_version_": str, + "antibiotics": str, + "antibiotics_class": str, + "assertion": str, + "classification": str, + "date_inserted": str, + "date_modified": str, + "function": str, + "gene_id": str, + "gene_name": str, + "genus": str, + "gi": str, + "id": str, + "locus_tag": str, + "organism": str, + "pmid": str, + "product": str, + "property": str, + "source": str, + "source_id": str, + "species": str + }, + "spike_lineage": { + "_version_": str, + "country": str, + "date_inserted": str, + "date_modified": str, + "growth_rate": float, + "id": str, + "lineage": str, + "lineage_count": float, + "lineage_of_concern": str, + "month": str, + "prevalence": float, + "region": str, + "sequence_features": str, + "total_isolates": float + }, + "spike_variant": { + "_version_": str, + "aa_variant": str, + "country": str, + "date_inserted": str, + "date_modified": str, + "growth_rate": float, + "id": str, + "lineage_count": float, + "month": str, + "prevalence": float, + "region": str, + "sequence_features": str, + "total_isolates": float + }, + "structured_assertion": { + "_version_": str, + "comment": str, + "date_inserted": str, + "date_modified": str, + "evidence_code": str, + "feature_id": str, + "id": str, + "owner": str, + "patric_id": str, + "pmid": str, + "property": str, + "public": str, + "refseq_locus_tag": str, + "score": str, + "source": str, + "user_read": str, + "user_write": str, + "value": str + }, + "subsystem": { + "_version_": str, + "active": str, + "class": str, + "date_inserted": str, + "date_modified": str, + "feature_id": str, + "gene": str, + "genome_id": str, + "genome_name": str, + "id": str, + "owner": str, + "patric_id": str, + "product": str, + "public": str, + "refseq_locus_tag": str, + "role_id": str, + "role_name": str, + "subclass": str, + "subsystem_id": str, + "subsystem_name": str, + "superclass": str, + "taxon_id": str, + "user_read": str, + "user_write": str + }, + "subsystem_ref": { + "_version_": str, + "class": str, + "date_inserted": str, + "date_modified": str, + "description": str, + "id": str, + "notes": str, + "pmid": str, + "role_id": str, + "role_name": str, + "subclass": str, + "subsystem_id": str, + "subsystem_name": str, + "superclass": str + }, + "taxonomy": { + "_version_": str, + "cds_mean": float, + "cds_sd": float, + "core_families": float, + "core_family_ids": str, + "description": str, + "division": str, + "genetic_code": str, + "genome_count": float, + "genome_length_mean": float, + "genome_length_sd": float, + "genomes": float, + "genomes_f": str, + "hypothetical_cds_ratio_mean": float, + "hypothetical_cds_ratio_sd": float, + "lineage": str, + "lineage_ids": str, + "lineage_names": str, + "lineage_ranks": str, + "other_names": str, + "parent_id": str, + "plfam_cds_ratio_mean": float, + "plfam_cds_ratio_sd": float, + "taxon_id": str, + "taxon_id_i": str, + "taxon_name": str, + "taxon_rank": str + }, + "protein_structure": { + "alignments": str, + "authors": str, + "date_inserted": str, + "date_modified": str, + "feature_id": str, + "file_path": str, + "gene": str, + "genome_id": str, + "institution": str, + "method": str, + "organism_name": str, + "patric_id": str, + "pdb_id": str, + "pmid": str, + "product": str, + "release_date": str, + "resolution": str, + "sequence": str, + "sequence_md5": str, + "taxon_id": str, + "taxon_lineage_ids": str, + "taxon_lineage_names": str, + "title": str, + "uniprotkb_accession": str + }, + "protein_feature": { + "aa_sequence_md5": str, + "classification": str, + "comments": str, + "date_inserted": str, + "date_modified": str, + "description": str, + "e_value": str, + "end": float, + "evidence": str, + "feature_id": str, + "feature_type": str, + "gene": str, + "genome_id": str, + "genome_name": str, + "id": str, + "interpro_description": str, + "interpro_id": str, + "length": float, + "patric_id": str, + "product": str, + "publication": str, + "refseq_locus_tag": str, + "score": float, + "segments": str, + "sequence": str, + "source": str, + "source_id": str, + "start": float, + "taxon_id": str + }, + "surveillance": { + "additional_metadata": str, + "alcohol_or_other_drug_dependence": str, + "breastfeeding": str, + "chest_imaging_interpretation": str, + "chronic_conditions": str, + "collection_city": str, + "collection_country": str, + "collection_date": str, + "collection_latitude": float, + "collection_longitude": float, + "collection_poi": str, + "collection_season": str, + "collection_state_province": str, + "collection_year": str, + "collector_institution": str, + "collector_name": str, + "comments": str, + "contact_email_address": str, + "contributing_institution": str, + "date_inserted": str, + "date_modified": str, + "daycare_attendance": str, + "days_elapsed_to_disease_status": str, + "days_elapsed_to_sample_collection": str, + "days_elapsed_to_vaccination": str, + "diagnosis": str, + "dialysis": str, + "disease_severity": str, + "disease_status": str, + "duration_of_exposure": str, + "duration_of_treatment": str, + "ecmo": str, + "education": str, + "embargo_end_date": str, + "exposure": str, + "exposure_type": str, + "genome_id": str, + "geographic_group": str, + "hospitalization_duration": str, + "hospitalized": str, + "host_age": str, + "host_capture_status": str, + "host_common_name": str, + "host_ethnicity": str, + "host_group": str, + "host_habitat": str, + "host_health": str, + "host_height": str, + "host_id_type": str, + "host_identifier": str, + "host_natural_state": str, + "host_race": str, + "host_sex": str, + "host_species": str, + "host_weight": str, + "human_leukocyte_antigens": str, + "id": str, + "infections_within_five_years": str, + "influenza_like_illness_over_the_past_year": str, + "initiation_of_treatment": str, + "intensive_care_unit": str, + "last_update_date": str, + "longitudinal_study": str, + "maintenance_medication": str, + "nursing_home_residence": str, + "onset_hours": str, + "other_vaccinations": str, + "oxygen_saturation": str, + "packs_per_day_for_how_many_years": str, + "pathogen_test_interpretation": str, + "pathogen_test_result": str, + "pathogen_test_type": str, + "pathogen_type": str, + "post_visit_medications": str, + "pre_visit_medications": str, + "pregnancy": str, + "primary_living_situation": str, + "profession": str, + "project_identifier": str, + "sample_accession": str, + "sample_identifier": str, + "sample_material": str, + "sample_receipt_date": str, + "sample_transport_medium": str, + "sequence_accession": str, + "source_of_vaccine_information": str, + "species": str, + "strain": str, + "submission_date": str, + "subtype": str, + "sudden_onset": str, + "symptoms": str, + "taxon_lineage_ids": str, + "tobacco_use": str, + "travel_history": str, + "treatment": str, + "treatment_dosage": str, + "treatment_type": str, + "trimester_of_pregnancy": str, + "types_of_allergies": str, + "use_of_personal_protective_equipment": str, + "vaccination_type": str, + "vaccine_dosage": str, + "vaccine_lot_number": str, + "vaccine_manufacturer": str, + "ventilation": str + }, + "serology": { + "additional_metadata": str, + "collection_city": str, + "collection_country": str, + "collection_date": str, + "collection_state": str, + "collection_year": str, + "comments": str, + "contributing_institution": str, + "date_inserted": str, + "date_modified": str, + "genbank_accession": str, + "geographic_group": str, + "host_age": str, + "host_age_group": str, + "host_common_name": str, + "host_health": str, + "host_identifier": str, + "host_sex": str, + "host_species": str, + "host_type": str, + "id": str, + "positive_definition": str, + "project_identifier": str, + "sample_accession": str, + "sample_identifier": str, + "serotype": str, + "strain": str, + "taxon_lineage_ids": str, + "test_antigen": str, + "test_interpretation": str, + "test_pathogen": str, + "test_result": str, + "test_type": str, + "virus_identifier": str + } } diff --git a/rescript/plugin_setup.py b/rescript/plugin_setup.py index 45b8c8e..5961cc0 100644 --- a/rescript/plugin_setup.py +++ b/rescript/plugin_setup.py @@ -13,10 +13,10 @@ from qiime2.core.type import TypeMatch from qiime2.plugin import (Str, Plugin, Choices, List, Citations, Range, Int, Float, Visualization, Bool, TypeMap, Metadata, - MetadataColumn, Categorical) + MetadataColumn, Categorical, Numeric) -from .bv_brc import fetch_genomes_bv_brc, fetch_metadata_bv_brc, \ - fetch_genome_features_bv_brc +from .bv_brc import get_bv_brc_genomes, get_bv_brc_metadata, \ + get_bv_brc_genome_features, data_fields_bvbrc from .subsample import subsample_fasta from .trim_alignment import trim_alignment from .merge import merge_taxa @@ -1239,129 +1239,182 @@ ) -datatypes_metadata = [ - "antibiotics", - "enzyme_class_ref", - "epitope", - "epitope_assay", - "experiment", - "bioset", - "bioset_result", - "gene_ontology_ref", - "genome", - "strain", - "genome_amr", - "feature_sequence", - "genome_feature", - "genome_sequence", - "id_ref", - "misc_niaid_sgc", - "pathway", - "pathway_ref", - "ppi", - "protein_family_ref", - "sequence_feature", - "sequence_feature_vt", - "sp_gene", - "sp_gene_ref", - "spike_lineage", - "spike_variant", - "structured_assertion", - "subsystem", - "subsystem_ref", - "taxonomy", - "protein_structure", - "protein_feature", - "surveillance", - "serology" -] +bv_brc_rql_query = ('Query in RQL format. To download all data ' + 'for genome_ids "224308.43" and "2030927.4755", the RQL ' + 'query looks like this: "in(genome_id,(224308.43,' + '2030927.4755))". While "in" is an RQL operator, ' + '"genome_id" is a data field and "224308.43,' + '2030927.4755" are the values. It is important to percent ' + 'encode values if they contain illegal characters like ' + 'spaces. The values "Bacillus subtilis" and ' + '"Bacteroidales bacterium" have to be provided with ' + 'percent encoded quotes (%22) and spaces (%20) like ' + 'this: "in(species,(%22Bacillus%20subtilis%22,' + '%22Bacteroidales%20bacterium%22))". Check ' + 'https://www.bv-brc.org/api/doc/ for documentation on ' + 'data types and corresponding data fields.') + +bv_brc_ids_metadata = ('A metadata column obtained with the action ' + 'get-bv-brc-metadata that can be used as a query.') +bv_brc_ids = ('IDs/values of the corresponding data field. This parameter can ' + 'only be used in conjunction with the "data-field" parameter. ' + 'Retrieves all data associated with these IDs/values in the ' + 'specified data field.') +bv_brc_data_field = ('Data field of the corrsponding data type. This ' + 'parameter can only be used in conjunction with the ' + '"ids" parameter. Retrieves all data associated with ' + 'the IDs/values specified in parameter "ids" in this ' + 'data field.') + plugin.methods.register_function( - function=fetch_genomes_bv_brc, + function=get_bv_brc_genomes, inputs={}, parameters={ + 'ids_metadata': MetadataColumn[Numeric | Categorical], 'rql_query': Str, - 'genome_ids': List[Str], + 'data_field': Str, + 'ids': Str, 'ranks': List[Str % Choices(_allowed_ranks)], + 'rank_propagation': Bool, }, outputs=[('genomes', GenomeData[DNASequence]), ('taxonomy', FeatureData[Taxonomy])], input_descriptions={}, parameter_descriptions={ - 'rql_query': 'Query in RQL format. Check ' - 'https://www.bv-brc.org/api/doc/genome_sequence ' - 'for documentation.', - 'genome_ids': 'List of genome IDs from BV-BRC.', - 'ranks': 'List of taxonomic ranks for building a taxonomy from the ' - "NCBI Taxonomy database. [default: '" + + 'ids_metadata': bv_brc_ids_metadata, + 'rql_query': bv_brc_rql_query, + 'data_field': 'Data field of the data type "genome_sequence". This ' + 'parameter can only be used in conjunction with the ' + '"ids" parameter. Retrieves all genomes associated ' + 'with the IDs/values specified in parameter "ids" in ' + 'this data field. Check ' + 'https://www.bv-brc.org/api/doc/genome_sequence for ' + 'allowed data fields.', + 'ids': bv_brc_ids, + 'ranks': 'List of taxonomic ranks for building a taxonomy. ' + "[default: '" + "', '".join(_default_ranks) + "']", + 'rank_propagation': RANK_PROPAGATE_DESCRIPTION, }, output_descriptions={ - 'genomes': 'genomes', - 'taxonomy': 'Taxonomy data.' + 'genomes': 'Genome sequences for specified query.', + 'taxonomy': 'Taxonomy data for all sequences.' }, - name='fetch genomes', - description="fetch genomes", + name='Get genome sequences from the BV-BRC database.', + description="Fetch genome sequences from BV-BRC. BV-BRC (Bacterial and " + "Viral Bioinformatics Resource Center) is a database for " + "bacterial and viral genomes, annotations, and metadata. " + "There are three ways to query data: You can use an RQL " + "query to refine your search and get targeted genomes. By " + "providing IDs/values and a corresponding data field, " + "you can retrieve all genomes associated with those specific " + "values in that data field. And as a third option a metadata " + "column can be provided, to use metadata obtained with the " + "action get-bv-brc-metadata as a new query. Check " + "https://www.bv-brc.org/api/doc/ for documentation.", citations=[citations['olson2023introducing']] ) plugin.methods.register_function( - function=fetch_metadata_bv_brc, + function=get_bv_brc_metadata, inputs={}, parameters={ - 'data_type': Str % Choices(datatypes_metadata), - 'rql_query': Str + 'ids_metadata': MetadataColumn[Numeric | Categorical], + 'data_type': Str % Choices(list(data_fields_bvbrc.keys())), + 'rql_query': Str, + 'data_field': Str, + 'ids': Str, }, outputs=[('metadata', ImmutableMetadata)], input_descriptions={}, parameter_descriptions={ - 'data_type': 'BV-BCR data type. Check https://www.bv-brc.org/api/doc/ ' + 'ids_metadata': bv_brc_ids_metadata, + 'data_type': 'BV-BCR data type for which metadata should be ' + 'downloaded. Check https://www.bv-brc.org/api/doc/ ' 'for documentation.', - 'rql_query': 'Query in RQL format. Check ' - 'https://www.bv-brc.org/api/doc/ for documentation.' + 'rql_query': bv_brc_rql_query, + 'data_field': 'Data field of the specified "data-type". This ' + 'parameter can only be used in conjunction with the ' + '"ids" parameter. Retrieves metadata associated ' + 'with the IDs/values specified in parameter "ids" in ' + 'this data field. Check ' + 'https://www.bv-brc.org/api/doc/ for allowed data ' + 'fields in the specified "data-type".', + 'ids': bv_brc_ids, }, output_descriptions={ - 'metadata': 'metadata'}, + 'metadata': 'BV-BCR metadata of specified data type.' + }, name='Fetch BV-BCR metadata.', - description="Fetch BV-BCR metadata for a specific data type with an RQL " - "query.", + description="Fetch BV-BCR metadata for a specific data type. BV-BRC (" + "Bacterial and Viral Bioinformatics Resource Center) is a " + "database for bacterial and viral genomes, annotations, " + "and metadata. There are three ways to query data: You can " + "use an RQL query to refine your search and get targeted " + "results. By providing IDs/values and a corresponding data " + "field, you can retrieve all metadata associated with those " + "specific values in that data field. And as a third option a " + "metadata column can be provided, to use the results from " + "other data types as a new query. Check " + "https://www.bv-brc.org/api/doc/ for documentation.", citations=[citations['olson2023introducing']] ) plugin.methods.register_function( - function=fetch_genome_features_bv_brc, + function=get_bv_brc_genome_features, inputs={}, parameters={ + 'ids_metadata': MetadataColumn[Numeric | Categorical], 'rql_query': Str, + 'data_field': Str, + 'ids': Str, 'ranks': List[Str % Choices(_allowed_ranks)], - 'taxon_ids': List[Str], - + 'rank_propagation': Bool, }, outputs=[ ('genes', GenomeData[Genes]), ('proteins', GenomeData[Proteins]), - ('taxonomy', FeatureData[Taxonomy]) + ('taxonomy', FeatureData[Taxonomy]), + ('loci', GenomeData[Loci]) ], input_descriptions={}, parameter_descriptions={ - 'rql_query': 'Query in RQL format. Check ' - 'https://www.bv-brc.org/api/doc/genome_feature ' - 'for documentation.', - 'taxon_ids': 'List of taxon IDs from BV-BRC.', - 'ranks': 'List of taxonomic ranks for building a taxonomy from the ' - "NCBI Taxonomy database. [default: '" + - "', '".join(_default_ranks) + "']", + 'ids_metadata': bv_brc_ids_metadata, + 'rql_query': bv_brc_rql_query, + 'data_field': 'Data field of the data type "genome_feature". This ' + 'parameter can only be used in conjunction with the ' + '"ids" parameter. Retrieves all data associated with ' + 'the IDs/values specified in parameter "ids" in this ' + 'data field. Check ' + 'https://www.bv-brc.org/api/doc/genome_feature for ' + 'allowed data fields.', + 'ids': bv_brc_ids, + 'ranks': 'List of taxonomic ranks for building a taxonomy ' + "[default: '" + ', '.join(_default_ranks) + "']", + 'rank_propagation': RANK_PROPAGATE_DESCRIPTION, }, output_descriptions={ - 'genes': 'genes', + 'genes': 'Gene', 'proteins': 'proteins', - 'taxonomy': 'taxonomy', + 'taxonomy': 'Taxonomy data for all sequences.', + 'loci': 'loci', }, name='Fetch genome features from BV-BRC.', description='Fetch DNA and protein sequences of genome features from ' - 'BV-BRC.', + 'BV-BRC. BV-BRC (Bacterial and Viral Bioinformatics Resource ' + 'Center) is a database for bacterial and viral genomes, ' + 'annotations, and metadata. There are three ways to query ' + 'data: You can use an RQL query to refine your search and ' + 'get targeted features. By providing IDs/values and a ' + 'corresponding data field, you can retrieve all features ' + 'associated with those specific values in that data field. ' + 'And as a third option a metadata column can be provided, ' + 'to use metadata obtained with the action ' + 'get-bv-brc-metadata as a new query. Check ' + 'https://www.bv-brc.org/api/doc/ for documentation.', citations=[citations['olson2023introducing']] ) diff --git a/rescript/tests/test_bv_brc.py b/rescript/tests/test_bv_brc.py index 75939e4..3a82e58 100644 --- a/rescript/tests/test_bv_brc.py +++ b/rescript/tests/test_bv_brc.py @@ -5,51 +5,91 @@ # # The full license is in the file LICENSE, distributed with this software. # ---------------------------------------------------------------------------- -import os import unittest from unittest.mock import Mock, patch, mock_open, MagicMock import pandas as pd -from q2_types.feature_data import TSVTaxonomyDirectoryFormat -from q2_types.genome_data import GenomeSequencesDirectoryFormat, \ - GenesDirectoryFormat, ProteinsDirectoryFormat +import qiime2 from qiime2.plugin.testing import TestPluginBase -from rescript.bv_brc import fetch_genomes_bv_brc, fetch_metadata_bv_brc, \ - fetch_genome_features_bv_brc, id_list_handling, \ - error_handling, download_data, json_to_fasta, \ - parse_lineage_names_with_ranks, parse_fasta_to_dict +from rescript.bv_brc import get_bv_brc_genomes, get_bv_brc_metadata, \ + get_bv_brc_genome_features, parameter_validation, \ + error_handling, download_data, create_genome_fasta, \ + create_taxonomy_entry, get_loci, read_tsv_data_with_dtypes, process_loci, \ + get_sequences, get_taxonomy, create_taxonomy class TestIDListHandling(TestPluginBase): package = 'rescript.tests' - def test_error_both_parameters_given(self): + def test_missing_data_type(self): + # Test when data_type is None + with self.assertRaisesRegex(ValueError, "data-type"): + parameter_validation() + + def test_rql_query_and_other_params(self): + # Test when rql_query is specified with other conflicting parameters with self.assertRaisesRegex(ValueError, - "Parameters rql_query and ids can't be " - "used simultaneously."): - id_list_handling(rql_query="some_query", - ids=[1, 2, 3], - parameter_name="ids", - data_field="id") - - def test_error_neither_parameter_given(self): + "rql_query.*can't.*simultaneously"): + parameter_validation(rql_query="some_query", ids=[1, 2], + data_field="genome_id", data_type="genome") + + def test_metadata_and_other_params(self): + # Test when metadata is specified with other conflicting parameters with self.assertRaisesRegex(ValueError, - "At least one of the parameters rql_query " - "and ids has to be given."): - id_list_handling(rql_query="", - ids=[], - parameter_name="ids", - data_field="id") - - def test_correct_rql_query_generation(self): - result = id_list_handling( - rql_query="", - ids=[1, 2, 3], - parameter_name="ids", - data_field="id") - expected_query = "in(id,(1,2,3))" - self.assertEqual(result, expected_query) + "metadata.*can't.*simultaneously"): + parameter_validation(metadata="metadata", ids=[1, 2], + data_field="genome_id", data_type="genome") + + def test_ids_without_data_field(self): + # Test when ids is specified without data_field + with self.assertRaisesRegex(ValueError, r"ids.*data-field"): + parameter_validation(ids=[1, 2], data_type="genome") + + def test_no_rql_query_ids_metadata(self): + # Test when neither rql_query, ids, nor metadata is specified + with self.assertRaisesRegex(ValueError, "rql-query.*ids.*metadata"): + parameter_validation(data_type="genome") + + def test_invalid_data_field_for_data_type(self): + # Test when the data_field is not valid for the given data_type + with self.assertRaisesRegex(ValueError, "data-field.*permitted"): + parameter_validation(ids=[1, 2], data_field="invalid_field", + data_type="genome") + + def test_valid_rql_query_generation(self): + rql_query = parameter_validation( + ids=["Bacillus subtilis", "Bacteroidales bacterium"], + data_field="species", + data_type="genome" + ) + self.assertEqual(rql_query, "in(species,(%22Bacillus%20subtilis%22," + "%22Bacteroidales%20bacterium%22))") + + def test_valid_rql_query_with_metadata(self): + # Create mock metadata objects + mock_metadata = MagicMock() + mock_series = MagicMock() + + # Mock the .name attribute of the Series to return "species" + mock_series.name = "species" + + # Mock the return value of to_series to be the mock_series + mock_metadata.to_series.return_value = mock_series + + # Mock the ids in the series (mimicking a list of ids) + mock_series.__iter__.return_value = iter( + ["Bacillus subtilis", "Bacteroidales bacterium"]) + + # Call the function with the mock metadata + rql_query = parameter_validation( + metadata=mock_metadata, + data_type="genome" + ) + + # Assert that the rql_query is correctly generated + self.assertEqual(rql_query, "in(species,(%22Bacillus%20subtilis%22," + "%22Bacteroidales%20bacterium%22))") class TestErrorHandling(TestPluginBase): @@ -96,58 +136,143 @@ def test_unhandled_response(self): class TestDownloadData(TestPluginBase): package = 'rescript.tests' - @patch('rescript.bv_brc.requests.get') + @patch('rescript.bv_brc.requests.post') + @patch('rescript.bv_brc.read_tsv_data_with_dtypes') @patch('rescript.bv_brc.error_handling') - def test_download_data_success(self, mock_error_handling, - mock_requests_get): - # Mock the requests.get response for a successful request - mock_response = Mock() - mock_response.status_code = 200 - mock_requests_get.return_value = mock_response + def test_download_data_json_batch(self, mock_error_handling, + mock_read_data_with_dtypes, mock_post): + # Mock the response for the first batch (25,000 entries) + mock_response_1 = MagicMock() + mock_response_1.status_code = 200 + mock_response_1.json.return_value = [{"id": i} for i in range(25000)] + + # Mock the response for the second batch (fewer than 25,000 entries) + mock_response_2 = MagicMock() + mock_response_2.status_code = 200 + mock_response_2.json.return_value = [{"id": i} for i in + range(25000, 25010)] + + # The first call to requests.post returns 25,000 entries, + # the second call returns 10 entries. + mock_post.side_effect = [mock_response_1, mock_response_2] + + # Call the function for JSON + result = download_data(data_type="genome", query="eq(id,1)", + accept="application/json", select=["genome_id"]) + + # Check that the result is as expected (25,000 + 10 entries) + self.assertEqual(len(result), 25010) + self.assertEqual(result[0], {"id": 0}) + self.assertEqual(result[-1], {"id": 25009}) + + # Ensure requests.post is called with the correct parameters + mock_post.assert_any_call( + url="https://www.bv-brc.org/api/genome/", + data="eq(id,1)&limit(25000,0)&select(genome_id)", + headers={ + 'Content-Type': 'application/rqlquery+x-www-form-urlencoded', + 'ACCEPT': 'application/json'} + ) + + mock_post.assert_any_call( + url="https://www.bv-brc.org/api/genome/", + data="eq(id,1)&limit(25000,25000)&select(genome_id)", + headers={ + 'Content-Type': 'application/rqlquery+x-www-form-urlencoded', + 'ACCEPT': 'application/json'} + ) - url = "http://example.com/data" - data_type = "some_type" + @patch('rescript.bv_brc.requests.post') + @patch('rescript.bv_brc.read_tsv_data_with_dtypes') + @patch('rescript.bv_brc.error_handling') + def test_download_data_tsv(self, mock_error_handling, + mock_read_data_with_dtypes, mock_post): + # Mock the response for TSV data type + mock_response = MagicMock() + mock_response.status_code = 200 + mock_post.return_value = mock_response + + # Mock reading TSV data + mock_df = pd.DataFrame({"id": [1, 2]}) + mock_read_data_with_dtypes.return_value = mock_df + + # Call the function for TSV + result = download_data(data_type="genome", query="eq(id,1)", + accept="text/tsv") + + # Check that the result is a DataFrame and as expected + pd.testing.assert_frame_equal(result, pd.DataFrame({"id": [1, 2]})) + + # Ensure requests.post is called with the correct parameters + mock_post.assert_called_with( + url="https://www.bv-brc.org/api/genome/", + data="eq(id,1)&limit(25000,0)", + headers={ + 'Content-Type': 'application/rqlquery+x-www-form-urlencoded', + 'ACCEPT': 'text/tsv'} + ) - result = download_data(url, data_type) + # Ensure read_tsv_data_with_dtypes was called + mock_read_data_with_dtypes.assert_called_once_with( + response=mock_response, data_type="genome") - mock_requests_get.assert_called_once_with(url) - self.assertEqual(result, mock_response) + @patch('rescript.bv_brc.requests.post') + @patch('rescript.bv_brc.error_handling') + def test_download_data_gff(self, mock_error_handling, mock_post): + # Mock the response for GFF data type + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.text = "mock_gff_data" + mock_post.return_value = mock_response + + # Call the function for GFF + result = download_data(data_type="genome", query="eq(id,1)", + accept="application/gff") + + # Check that the result is as expected + self.assertEqual(result, "mock_gff_data") + + # Ensure requests.post is called with the correct parameters + mock_post.assert_called_with( + url="https://www.bv-brc.org/api/genome/", + data="eq(id,1)&limit(25000,0)", + headers={ + 'Content-Type': 'application/rqlquery+x-www-form-urlencoded', + 'ACCEPT': 'application/gff'} + ) - @patch('rescript.bv_brc.requests.get') + @patch('rescript.bv_brc.requests.post') @patch('rescript.bv_brc.error_handling') - def test_download_data_error_400(self, mock_error_handling, - mock_requests_get): - # Mock the requests.get response for a 400 Bad Request - mock_response = Mock() + def test_download_data_400_error(self, mock_error_handling, mock_post): + # Mock the response for 400 error + mock_response = MagicMock() mock_response.status_code = 400 - mock_requests_get.return_value = mock_response - - url = "http://example.com/data" - data_type = "some_type" + mock_post.return_value = mock_response - download_data(url, data_type) + # Call the function and check that error_handling is called + download_data(data_type="genome", query="eq(id,1)", + accept="application/json") - mock_requests_get.assert_called_once_with(url) - mock_error_handling.assert_called_once_with(mock_response, data_type) + # Ensure error_handling was called for the 400 response + mock_error_handling.assert_called_once_with(response=mock_response, + data_type="genome") - @patch('rescript.bv_brc.requests.get') - @patch('rescript.bv_brc.error_handling') - def test_download_data_other_error(self, mock_error_handling, - mock_requests_get): - # Mock the requests.get response for any other error - mock_response = Mock() + @patch('rescript.bv_brc.requests.post') + def test_download_data_unexpected_error(self, mock_post): + # Mock the response for unexpected error + mock_response = MagicMock() mock_response.status_code = 500 - mock_response.text = "Server Error" - mock_requests_get.return_value = mock_response + mock_response.text = "Internal Server Error" + mock_post.return_value = mock_response - url = "http://example.com/data" - data_type = "some_type" + # Check that the function raises a ValueError for non-200, non-400 + # responses + with self.assertRaises(ValueError) as context: + download_data(data_type="genome", query="eq(id,1)", + accept="application/json") - with self.assertRaisesRegex(ValueError, "Server Error"): - download_data(url, data_type) - - mock_requests_get.assert_called_once_with(url) - mock_error_handling.assert_not_called() + # Ensure the ValueError contains the right message + self.assertEqual(str(context.exception), "Internal Server Error") class TestJsonToFasta(TestPluginBase): @@ -178,23 +303,23 @@ def setUp(self): @patch('rescript.bv_brc.open', new_callable=mock_open) def test_json_to_fasta_single_genome(self, mock_file): - json_to_fasta(self.json_input_1, "/fake/dir") + result = create_genome_fasta(self.json_input_1) # Expected FASTA content - expected_fasta = ">accn|acc1 desc1 [genome_name1 | genome1]\nATGC" + expected_fasta = ">acc1 desc1 [genome_name1 | genome1]\nATGC" # Check if the file was created with the correct path and content - mock_file.assert_called_once_with("/fake/dir/genome1.fasta", 'w') + mock_file.assert_called_once_with(f"{str(result)}/genome1.fasta", 'w') mock_file().write.assert_called_once_with(expected_fasta) @patch('rescript.bv_brc.open', new_callable=mock_open) def test_json_to_fasta_multiple_genomes(self, mock_file): - json_to_fasta(self.json_input_1 + self.json_input_2, "/fake/dir") + create_genome_fasta(self.json_input_1 + self.json_input_2) # Expected FASTA content - expected_fasta_genome1 = (">accn|acc1 desc1 [genome_name1 | " + expected_fasta_genome1 = (">acc1 desc1 [genome_name1 | " "genome1]\nATGC") - expected_fasta_genome2 = (">accn|acc2 desc2 [genome_name2 | " + expected_fasta_genome2 = (">acc2 desc2 [genome_name2 | " "genome2]\nCGTA") # Check if the files were created with the correct path and content @@ -203,277 +328,119 @@ def test_json_to_fasta_multiple_genomes(self, mock_file): @patch('rescript.bv_brc.open', new_callable=mock_open) def test_json_to_fasta_multiple_sequences_same_genome(self, mock_file): - json_to_fasta(self.json_input_1 + self.json_input_1, "/fake/dir") + result = create_genome_fasta(self.json_input_1 + self.json_input_1) # Expected FASTA content - expected_fasta = (">accn|acc1 desc1 [genome_name1 | " - "genome1]\nATGC\n" - ">accn|acc1 desc1 [genome_name1 | " - "genome1]\nATGC") + expected_fasta = (">acc1 desc1 [genome_name1 | genome1]\nATGC\n" + ">acc1 desc1 [genome_name1 | genome1]\nATGC") # Check if the file was created with the correct path and content - mock_file.assert_called_once_with("/fake/dir/genome1.fasta", 'w') + mock_file.assert_called_once_with(f"{str(result)}/genome1.fasta", 'w') mock_file().write.assert_called_once_with(expected_fasta) -class TestFetchGenomeFeaturesBVBR(TestPluginBase): +class TestGetBvBrcGenomes(TestPluginBase): package = 'rescript.tests' - @patch('rescript.bv_brc.parse_fasta_to_dict') + @patch('rescript.bv_brc.get_taxonomy') + @patch('rescript.bv_brc.create_genome_fasta') @patch('rescript.bv_brc.download_data') - @patch('rescript.bv_brc.id_list_handling') - @patch('builtins.open', new_callable=mock_open) - def test_fetch_genome_features_bv_brc(self, mock_open, - mock_id_list_handling, - mock_download_data, - mock_parse_fasta_to_dict): - # Mock the id_list_handling function - mock_id_list_handling.return_value = ("in(feature_id," - "(feature1,feature2))") - - # Mock the download_data function responses - mock_response_genes = MagicMock() - mock_response_genes.text = "mocked_genes_fasta_data" - mock_response_proteins = MagicMock() - mock_response_proteins.text = "mocked_proteins_fasta_data" - mock_download_data.side_effect = [mock_response_genes, - mock_response_proteins] - - # Mock the parse_fasta_to_dict function - mock_parse_fasta_to_dict.side_effect = [ - {'2030927.4755': '>fig|2030927| GTPase [ABC | ' - '2030927.4755]\nATGA\n'}, - {'1234567.89': '>fig|1234567| protein [XYZ | 1234567.89]\nGCGT\n'} - ] - - # Call the function with the test RQL query - genes, proteins = fetch_genome_features_bv_brc( - rql_query="in(feature_id,(feature1,feature2))" - ) - - # Assertions to ensure the correct calls were made - mock_id_list_handling.assert_called_once_with( - rql_query="in(feature_id,(feature1,feature2))", - ids=None, - parameter_name="feature_ids", - data_field="feature_id" - ) - - mock_download_data.assert_any_call( - url="https://www.bv-brc.org/api/genome_feature/?in(feature_id," - "(feature1,feature2))&http_accept=application/dna+fasta", - data_type="genome_feature" - ) - - mock_download_data.assert_any_call( - url="https://www.bv-brc.org/api/genome_feature/?in(feature_id," - "(feature1,feature2))&http_accept=application/protein+fasta", - data_type="genome_feature" - ) - - mock_parse_fasta_to_dict.assert_any_call("mocked_genes_fasta_data") - mock_parse_fasta_to_dict.assert_any_call("mocked_proteins_fasta_data") - - # Check that the files were written correctly for genes - mock_open.assert_any_call( - os.path.join(str(genes), "2030927.4755.fasta"), 'w') - mock_open().write.assert_any_call( - '>fig|2030927| GTPase [ABC | 2030927.4755]\nATGA\n') - - # Check that the files were written correctly for proteins - mock_open.assert_any_call( - os.path.join(str(proteins), "1234567.89.fasta"), 'w') - mock_open().write.assert_any_call( - '>fig|1234567| protein [XYZ | 1234567.89]\nGCGT\n') - - # Check that the return types are correct - self.assertIsInstance(genes, GenesDirectoryFormat) - self.assertIsInstance(proteins, ProteinsDirectoryFormat) - - def test_parse_fasta_to_dict(self): - fasta_string = ( - ">fig|2030927| GTPase [ABC | 2030927.4755]\natga\n" - ">fig|1234567| protein [XYZ | 1234567.89]\ngcgt\n" - ) - expected_output = { - '2030927.4755': ( - ">fig|2030927| GTPase [ABC | 2030927.4755]\nATGA\n" - ), - '1234567.89': ( - ">fig|1234567| protein [XYZ | 1234567.89]\nGCGT\n" - ) - } - result = parse_fasta_to_dict(fasta_string) - self.assertEqual(result, expected_output) - - -class TestFetchGenomesBVBRC(TestPluginBase): - package = 'rescript.tests' - - @patch('rescript.bv_brc.json_to_fasta') - @patch('rescript.bv_brc.download_data') - @patch('rescript.bv_brc.id_list_handling') - def test_fetch_genomes_bv_brc( - self, mock_id_list_handling, mock_download_data, mock_json_to_fasta - ): - # Mock the id_list_handling function - mock_id_list_handling.return_value = "genome_id=in(genome1,genome2)" - - # Mock the download_data response - mock_response = MagicMock() - mock_response.json.return_value = {'genomes': ['genome_data']} - mock_download_data.return_value = mock_response + @patch('rescript.bv_brc.parameter_validation') + def test_get_bv_brc_genomes(self, mock_parameter_validation, + mock_download_data, mock_create_genome_fasta, + mock_get_taxonomy): + # Mocked return values for the external functions + mock_parameter_validation.return_value = "mocked_rql_query" + mock_download_data.return_value = [ + {'id': 'genome1', 'sequence': 'ATGC'}, + {'id': 'genome2', 'sequence': 'GCTA'}] + mock_create_genome_fasta.return_value = MagicMock( + name='GenomeSequencesDirectoryFormat') + mock_get_taxonomy.return_value = MagicMock(name='TSVTaxonomyFormat') # Call the function - genomes = fetch_genomes_bv_brc( - rql_query="genome_id=in(genome1,genome2)", - genome_ids=["genome1", "genome2"] + get_bv_brc_genomes( + ids_metadata=MagicMock(name='NumericMetadataColumn'), + rql_query=None, + data_field="mock_field", + ids=["id1", "id2"], + ranks=["rank1", "rank2"], + rank_propagation=True ) - # Assertions - mock_id_list_handling.assert_called_once_with( - rql_query="genome_id=in(genome1,genome2)", - ids=["genome1", "genome2"], - parameter_name="genome_ids", - data_field="genome_id" - ) - mock_download_data.assert_called_once_with( - url="https://www.bv-brc.org/api/genome_sequence/" - "?genome_id=in(genome1,genome2)", - data_type="genome_sequence" - ) - - mock_json_to_fasta.assert_called_once_with( - {'genomes': ['genome_data']}, - str(genomes) - ) - - self.assertIsInstance(genomes, GenomeSequencesDirectoryFormat) - - -class TestFetchMetadataBVBR(TestPluginBase): +class TestGetBvBrcMetadata(TestPluginBase): package = 'rescript.tests' - @patch('rescript.bv_brc.qiime2.Metadata') - @patch('rescript.bv_brc.pd.read_csv') @patch('rescript.bv_brc.download_data') - def test_fetch_metadata_bv_brc(self, mock_download_data, - mock_read_csv, mock_metadata): - # Mock the download_data response - mock_response = MagicMock() - mock_response.text = ( - "id\tcolumn1\tcolumn2\n1\tdata1\tdata2\n2\tdata3\tdata4") - mock_download_data.return_value = mock_response + @patch('rescript.bv_brc.parameter_validation') + def test_get_bv_brc_metadata(self, mock_parameter_validation, + mock_download_data): + # Mock the return value of parameter_validation + mock_parameter_validation.return_value = 'rql(query)' - # Mock the pandas read_csv return value + # Mock the return value of download_data mock_df = pd.DataFrame({ - 'column1': ['data1', 'data3'], - 'column2': ['data2', 'data4'] - }, index=pd.Index(['1', '2'], name='id')) - mock_read_csv.return_value = mock_df - - # Mock qiime2.Metadata creation - mock_metadata_instance = MagicMock() - mock_metadata.return_value = mock_metadata_instance + 'id': ['id1', 'id2'], + 'feature': ['value1', 'value2'], + 'empty_field': [' ', None] + }).set_index('id') + mock_download_data.return_value = mock_df # Call the function - fetch_metadata_bv_brc( - data_type="genome", - rql_query="genome_id=in(1,2)" + result_metadata = get_bv_brc_metadata( + ids_metadata=None, + data_type='genome', + data_field='genome_id', + ids=['id1', 'id2'] ) - # Assertions - mock_download_data.assert_called_once_with( - url="https://www.bv-brc.org/api/genome/" - "?genome_id=in(1,2)&http_accept=text/tsv", - data_type="genome" - ) + # Assertions on the returned qiime2.Metadata + self.assertIsInstance(result_metadata, qiime2.Metadata) - mock_read_csv.assert_called_once() - args, kwargs = mock_read_csv.call_args - self.assertEqual(kwargs['sep'], '\t') + # Extract the DataFrame from the result and check its values + result_df = result_metadata.to_dataframe() - self.assertEqual(args[0].getvalue(), "id\tcolumn1\tcolumn2\n1\tdata1" - "\tdata2\n2\tdata3\tdata4") + # Check that the DataFrame's index and columns are correct + self.assertEqual(result_df.index.tolist(), ['id1', 'id2']) + self.assertIn('feature', result_df.columns) - mock_metadata.assert_called_once_with(mock_df) + # Check that empty/space-only fields are replaced with NaN + self.assertTrue(pd.isna(result_df.loc['id1', 'empty_field'])) + self.assertTrue(pd.isna(result_df.loc['id2', 'empty_field'])) -class TestFetchTaxonomyBVBR(TestPluginBase): +class TestGetBvBrcGenomeFeatures(TestPluginBase): package = 'rescript.tests' - @patch('pandas.DataFrame.to_csv') - @patch('rescript.bv_brc.transform_taxonomy_df') + @patch('rescript.bv_brc.get_loci') + @patch('rescript.bv_brc.get_taxonomy') + @patch('rescript.bv_brc.get_sequences') @patch('rescript.bv_brc.download_data') - @patch('rescript.bv_brc.pd.read_csv') - @patch('rescript.bv_brc.id_list_handling') - def test_fetch_taxonomy_bv_brc( - self, mock_id_list_handling, mock_read_csv, mock_download_data, - mock_transform_taxonomy_df, mock_to_csv - ): - # Mock the id_list_handling function - mock_id_list_handling.return_value = "taxon_id=in(taxon1,taxon2)" - - # Mock the download_data response - mock_response = MagicMock() - mock_response.text = ( - "id\trank1\trank2\n1\tdata1\tdata2\n2\tdata3\tdata4") - mock_download_data.return_value = mock_response - - # Prepare mocks for file output - with patch('builtins.open', unittest.mock.mock_open()): - directory = fetch_taxonomy_bv_brc( - rql_query="taxon_id=in(taxon1,taxon2)", - ranks=['rank1', 'rank2'], - taxon_ids=["taxon1", "taxon2"] - ) - - # Assertions - mock_id_list_handling.assert_called_once_with( - rql_query="taxon_id=in(taxon1,taxon2)", - ids=["taxon1", "taxon2"], - parameter_name="taxon_ids", - data_field="taxon_id" - ) - - mock_download_data.assert_called_once_with( - url="https://www.bv-brc.org/api/taxonomy/" - "?taxon_id=in(taxon1,taxon2)&http_accept=text/tsv", - data_type="taxonomy" - ) - - self.assertIsInstance(directory, TSVTaxonomyDirectoryFormat) - - @patch('rescript.bv_brc.parse_lineage_names_with_ranks') - def test_transform_taxonomy_df(self, mock_parse_lineage_names_with_ranks): - # Mock the parse_lineage_names_with_ranks function - mock_parse_lineage_names_with_ranks.side_effect = \ - lambda lineage_names, lineage_ranks, ranks: "Mocked Taxon" - - # Create a sample DataFrame - df = pd.DataFrame({ - 'taxon_id': ['taxon1', 'taxon2'], - 'lineage_names': ['name1;name2', 'name3;name4'], - 'lineage_ranks': ['rank1;rank2', 'rank3;rank4'] - }) - - ranks = ['rank1', 'rank2', 'rank3'] + @patch('rescript.bv_brc.parameter_validation') + def test_get_bv_brc_genome_features(self, mock_parameter_validation, + mock_download_data, mock_get_sequences, + mock_get_taxonomy, mock_get_loci): + # Mocked return values for the external functions + mock_parameter_validation.return_value = "mocked_rql_query" + mock_download_data.return_value = [{"genome_id": "genome1"}] + mock_get_sequences.return_value = (MagicMock(), MagicMock()) + mock_get_taxonomy.return_value = MagicMock() + mock_get_loci.return_value = MagicMock() # Call the function - result_df = transform_taxonomy_df(df, ranks) - - # Expected DataFrame after transformation - expected_df = pd.DataFrame({ - 'Feature ID': ['taxon1', 'taxon2'], - 'Taxon': ['Mocked Taxon', 'Mocked Taxon'] - }).set_index('Feature ID') + get_bv_brc_genome_features( + ids_metadata=None, + rql_query=None, + data_field="mock_field", + ids=["id1", "id2"], + ranks=["rank1", "rank2"], + rank_propagation=True + ) - # Assert that the result matches the expected DataFrame - pd.testing.assert_frame_equal(result_df, expected_df) - class TestParseLineageNamesWithRanks(TestPluginBase): - package = 'rescript.tests' +class TestCreateTaxonomyEntry(TestPluginBase): + package = 'rescript.tests' def test_basic_functionality(self): lineage_names = ("Animalia;Chordata;Mammalia;Primates;Hominidae;Homo;" @@ -482,7 +449,7 @@ def test_basic_functionality(self): expected_result = ("k__Animalia; p__Chordata; c__Mammalia; " "o__Primates; f__Hominidae; g__Homo; s__sapiens") - result = parse_lineage_names_with_ranks(lineage_names, lineage_ranks) + result = create_taxonomy_entry(lineage_names, lineage_ranks) self.assertEqual(result, expected_result) def test_with_missing_ranks(self): @@ -491,7 +458,8 @@ def test_with_missing_ranks(self): expected_result = ("k__Animalia; p__Chordata; c__; o__; f__Hominidae; " "g__; s__sapiens") - result = parse_lineage_names_with_ranks(lineage_names, lineage_ranks) + result = create_taxonomy_entry(lineage_names, lineage_ranks, + rank_propagation=False) self.assertEqual(result, expected_result) def test_rank_propagation(self): @@ -500,8 +468,8 @@ def test_rank_propagation(self): expected_result = ("k__Animalia; p__Chordata; c__Mammalia; " "o__Mammalia; f__Mammalia; g__Homo; s__Homo") - result = parse_lineage_names_with_ranks(lineage_names, lineage_ranks, - rank_propagation=True) + result = create_taxonomy_entry(lineage_names, lineage_ranks, + rank_propagation=True) self.assertEqual(result, expected_result) def test_genus_species_split(self): @@ -511,7 +479,7 @@ def test_genus_species_split(self): expected_result = ("k__Animalia; p__Chordata; c__Mammalia;" " o__Primates; f__Hominidae; g__Homo; s__sapiens") - result = parse_lineage_names_with_ranks(lineage_names, lineage_ranks) + result = create_taxonomy_entry(lineage_names, lineage_ranks) self.assertEqual(result, expected_result) def test_genus_only_split(self): @@ -521,7 +489,7 @@ def test_genus_only_split(self): expected_result = ("k__Animalia; p__Chordata; c__Mammalia; " "o__Primates; f__Hominidae; g__Homo; s__sapiens") - result = parse_lineage_names_with_ranks(lineage_names, lineage_ranks) + result = create_taxonomy_entry(lineage_names, lineage_ranks) self.assertEqual(result, expected_result) def test_no_species_in_ranks(self): @@ -531,10 +499,10 @@ def test_no_species_in_ranks(self): expected_result = ("k__Animalia; p__Chordata; c__Mammalia; " "o__Primates; f__Hominidae; g__Homo sapiens") - result = parse_lineage_names_with_ranks(lineage_names, lineage_ranks, - ranks=['kingdom', 'phylum', - 'class', 'order', - 'family', 'genus']) + result = create_taxonomy_entry(lineage_names, lineage_ranks, + ranks=['kingdom', 'phylum', + 'class', 'order', + 'family', 'genus']) self.assertEqual(result, expected_result) def test_custom_ranks(self): @@ -542,15 +510,257 @@ def test_custom_ranks(self): lineage_ranks = "superkingdom" expected_result = "sk__Metazoa" - result = parse_lineage_names_with_ranks(lineage_names, lineage_ranks, - ranks=['superkingdom']) + result = create_taxonomy_entry(lineage_names, lineage_ranks, + ranks=['superkingdom']) self.assertEqual(result, expected_result) - def test_genomes(self): - fetch_genomes_bv_brc(rql_query="eq(taxon_id,1313)") - def test_genome_features(self): - fetch_genome_features_bv_brc(rql_query="eq(taxon_id,1313)") +class TestGetTaxonomy(TestPluginBase): + package = 'rescript.tests' + + @patch('rescript.bv_brc.create_taxonomy') + @patch('rescript.bv_brc.download_data') + def test_get_taxonomy(self, mock_download_data, mock_create_taxonomy): + # Mock response_sequences (list of dicts) + response_sequences = [ + {"taxon_id": "taxon1", "feature_id": "feature1"}, + {"taxon_id": "taxon2", "feature_id": "feature2"}, + {"taxon_id": "taxon3", "feature_id": "feature3"}, + {"taxon_id": "taxon1", "feature_id": "feature4"}, + ] + + # Mock the download_data and create_taxonomy functions + mock_download_data.return_value = MagicMock() + mock_create_taxonomy.return_value = MagicMock() + + # Define test parameters + ranks = ["kingdom", "phylum", "class"] + rank_propagation = True + accession_name = "feature_id" + + # Call the function + get_taxonomy( + response_sequences=response_sequences, + ranks=ranks, + rank_propagation=rank_propagation, + accession_name=accession_name + ) + + # Check that the taxon_ids are extracted correctly and that duplicates + # are removed + expected_taxon_ids = {"taxon1", "taxon2", "taxon3"} + extracted_taxon_ids = {str(entry["taxon_id"]) for entry in + response_sequences} + self.assertEqual(extracted_taxon_ids, expected_taxon_ids) + + +class TestCreateTaxonomy(TestPluginBase): + package = 'rescript.tests' + + @patch('rescript.bv_brc.TSVTaxonomyFormat') + @patch('rescript.bv_brc.create_taxonomy_entry') + def test_create_taxonomy(self, mock_create_taxonomy_entry, + mock_TSVTaxonomyFormat): + # Mock the input data for taxonomy_bvbrc DataFrame + taxonomy_bvbrc = pd.DataFrame({ + 'taxon_id': ['taxon1', 'taxon2'], + 'lineage_names': [['Bacteria', 'Proteobacteria'], + ['Bacteria', 'Firmicutes']], + 'lineage_ranks': [['domain', 'phylum'], ['domain', 'phylum']] + }) + + # Mock the create_taxonomy_entry to return a fake taxonomy string + mock_create_taxonomy_entry.side_effect = \ + lambda lineage_names, lineage_ranks, rank_propagation, ranks: ( + ";".join(lineage_names)) + + # Mock response_sequences (list of dicts) + response_sequences = [ + {"taxon_id": "taxon1", "feature_id": "feature1"}, + {"taxon_id": "taxon2", "feature_id": "feature2"} + ] + + # Mock TSVTaxonomyFormat to return a file-like object + mock_taxonomy_file = MagicMock() + (mock_TSVTaxonomyFormat.return_value.open. + return_value.__enter__).return_value = mock_taxonomy_file + + # Call the create_taxonomy function + create_taxonomy( + taxonomy_bvbrc=taxonomy_bvbrc, + response_sequences=response_sequences, + ranks=['domain', 'phylum'], + rank_propagation=True, + accession_name="feature_id" + ) + + # Ensure that the correct data was written to the file + written_data = "\t".join(["Feature ID", "Taxon"]) + "\n" + \ + "\t".join( + ["feature1", "Bacteria;Proteobacteria"]) + "\n" + \ + "\t".join(["feature2", "Bacteria;Firmicutes"]) + "\n" + + # Check if the 'write' method was actually called with the right data + mock_taxonomy_file.write.assert_called_once_with(written_data) + + +class TestReadDataWithDtypes(TestPluginBase): + package = 'rescript.tests' + + def test_read_data_with_dtypes(self): + # Mock response with a TSV file containing antibiotics data + mock_response = MagicMock() + mock_response.text = """_version_\tantibiotic_name\tcas_id\tdescription +1.0\tPenicillin\tCAS1234\tAntibiotic description +2.0\tAmoxicillin\tCAS5678\tAntibiotic description 2 +""" + + # Call the function with mock data and data_type "antibiotics" + df = read_tsv_data_with_dtypes(mock_response, "antibiotics") + + # Expected DataFrame output + expected_data = { + "_version_": ["1.0", "2.0"], + "antibiotic_name": ["Penicillin", "Amoxicillin"], + "cas_id": ["CAS1234", "CAS5678"], + "description": ["Antibiotic description", + "Antibiotic description 2"] + } + expected_df = pd.DataFrame(expected_data) + + # Check if the DataFrame matches the expected output + pd.testing.assert_frame_equal(df, expected_df) + + def test_no_data_raises_value_error(self): + # Mock response with only the column headers + mock_response = MagicMock() + mock_response.text = """_version_\tantibiotic_name\tcas_id""" + + # Assert that a ValueError is raised when no data rows are present + with self.assertRaises(ValueError): + read_tsv_data_with_dtypes(mock_response, "antibiotics") + + +class TestGetLoci(TestPluginBase): + package = 'rescript.tests' + + @patch('rescript.bv_brc.download_data') + @patch('rescript.bv_brc.process_loci') + @patch('builtins.open', new_callable=mock_open) + @patch('rescript.bv_brc.LociDirectoryFormat') + def test_get_loci(self, mock_loci_dir_format, mock_open_file, + mock_process_loci, mock_download_data): + # Set up the mocks + mock_download_data.return_value = "mock_gff_data" + mock_process_loci.return_value = "processed_gff_data" + + # Create a mock directory format + mock_loci_dir = MagicMock() + mock_loci_dir_format.return_value = mock_loci_dir + + # Mock response_sequences with genome_id information + response_sequences = [ + {"genome_id": "genome1"}, + {"genome_id": "genome2"}, + {"genome_id": "genome1"} + ] + + # Call the function with response_sequences + get_loci(response_sequences) + + # Check that download_data is called for each unique genome_id + mock_download_data.assert_any_call(data_type="genome_feature", + query="eq(genome_id,genome1)", + accept="application/gff") + mock_download_data.assert_any_call(data_type="genome_feature", + query="eq(genome_id,genome2)", + accept="application/gff") + + # Ensure it is only called twice (for unique genome_ids) + self.assertEqual(mock_download_data.call_count, 2) + + # Check that process_loci is called with the right data + mock_process_loci.assert_any_call(gff_string="mock_gff_data") + + # Check that open was called with the correct paths and the data was + # written + mock_open_file.assert_any_call( + mock_loci_dir.__str__() + '/genome1.gff', 'w') + mock_open_file.assert_any_call( + mock_loci_dir.__str__() + '/genome2.gff', 'w') + + # Ensure the processed data was written to the file + mock_open_file().write.assert_any_call("processed_gff_data") + + def test_process_loci(self): + # Input GFF string with both headers and data lines + input_data = """##gff-version 3 +##sequence-region accn|NC_000001.11 1 1000 +accn|NC_000001.11\tRefSeq\tregion\t1\t1000\t.\t+\t.\tID=region0; +accn|NC_000002.11\tRefSeq\tgene\t1\t1000\t.\t+\t.\tID=gene0;""" + + # Expected output after processing (removing "accn|") + expected_output = """##gff-version 3 +##sequence-region accn|NC_000001.11 1 1000 +NC_000001.11\tRefSeq\tregion\t1\t1000\t.\t+\t.\tID=region0; +NC_000002.11\tRefSeq\tgene\t1\t1000\t.\t+\t.\tID=gene0;""" + + # Call the function with the input data + result = process_loci(input_data) + + # Check if the result matches the expected output + self.assertEqual(result, expected_output) + + +class TestGetSequences(TestPluginBase): + package = 'rescript.tests' + + @patch('rescript.bv_brc.download_data') + @patch('builtins.open', new_callable=mock_open) + def test_get_sequences(self, mock_open, mock_download_data): + # Mock data for genome_features + genome_features = [ + { + "genome_id": "genome1", + "feature_id": "feature1", + "na_sequence_md5": "md5na1", + "aa_sequence_md5": "md5aa1" + }, + { + "genome_id": "genome2", + "feature_id": "feature2", + "na_sequence_md5": "md5na2" + } + ] + + # Mock the return value of download_data + mock_download_data.return_value = [ + {"md5": "md5na1", "sequence": "atgc"}, + {"md5": "md5aa1", "sequence": "MKV"}, + {"md5": "md5na2", "sequence": "gtca"} + ] + + # Call the function + get_sequences(genome_features) + + # Assertions to check that download_data was called correctly + mock_download_data.assert_called_once_with( + data_type="feature_sequence", + query=unittest.mock.ANY, + accept="application/json", + select=["md5", "sequence"] + ) + + # Check that genome1.fasta and genome2.fasta are in the paths used + # for file opening + open_calls = [call[0][0] for call in mock_open.call_args_list] + + self.assertTrue(any('genome1.fasta' in call for call in open_calls)) + self.assertTrue(any('genome2.fasta' in call for call in open_calls)) + + # Check if the correct sequences were written to the genes file + mock_open().write.assert_any_call('>feature1\nATGC\n') + mock_open().write.assert_any_call('>feature2\nGTCA\n') - def test_genome_features2(self): - fetch_genome_features_bv_brc(rql_query="eq(genome_id,1313.5550)") \ No newline at end of file + # Check if the correct sequences were written to the proteins file + mock_open().write.assert_any_call('>feature1\nMKV\n')