From 58a0ef6e6599ba21ef32b01431e834b025c65a1a Mon Sep 17 00:00:00 2001 From: VinzentRisch <100149044+VinzentRisch@users.noreply.github.com> Date: Thu, 3 Oct 2024 17:47:58 +0200 Subject: [PATCH] ENH: Add actions to fetch data from BV-BRC. (#202) LGTM! Thank you @VinzentRisch! --- rescript/bv_brc.py | 1638 +++++++++++++++++++++++++++++++++ rescript/citations.bib | 11 + rescript/plugin_setup.py | 188 +++- rescript/tests/test_bv_brc.py | 784 ++++++++++++++++ 4 files changed, 2619 insertions(+), 2 deletions(-) create mode 100644 rescript/bv_brc.py create mode 100644 rescript/tests/test_bv_brc.py diff --git a/rescript/bv_brc.py b/rescript/bv_brc.py new file mode 100644 index 0000000..b8a2497 --- /dev/null +++ b/rescript/bv_brc.py @@ -0,0 +1,1638 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2019-2023, QIIME 2 development team. +# +# Distributed under the terms of the Modified BSD License. +# +# The full license is in the file LICENSE, distributed with this software. +# ---------------------------------------------------------------------------- +from collections import OrderedDict +from io import StringIO +import os +from typing import Union +from urllib.parse import quote + +import qiime2 +import pandas as pd +import requests +from q2_types.feature_data import TSVTaxonomyFormat +from q2_types.genome_data import (GenomeSequencesDirectoryFormat, + GenesDirectoryFormat, + ProteinsDirectoryFormat, LociDirectoryFormat) + +from rescript.ncbi import _allowed_ranks, _default_ranks +import json + + +def get_bv_brc_metadata( + ids_metadata: Union[qiime2.NumericMetadataColumn, + qiime2.CategoricalMetadataColumn] = None, + data_type: str = None, + rql_query: str = None, + data_field: str = None, + ids: list = None, +) -> qiime2.Metadata: + # Parameter validation and creation of RQL query + rql_query = parameter_validation(rql_query=rql_query, + ids=ids, + data_type=data_type, + data_field=data_field, + metadata=ids_metadata) + + # Download metadata as df + metadata = download_data(data_type=data_type, + query=rql_query, + accept="text/tsv", + ) + + # Set index of metadata df + metadata.index.name = "id" + metadata.index = metadata.index.astype(str) + + # Replace empty values and values consisting of spaces only with np.nan + metadata.replace(to_replace=r'^\s*$', value=pd.NA, regex=True, + inplace=True) + metadata.replace([None], pd.NA, inplace=True) + return qiime2.Metadata(metadata) + + +def get_bv_brc_genomes( + ids_metadata: Union[qiime2.NumericMetadataColumn, + qiime2.CategoricalMetadataColumn] = None, + rql_query: str = None, + data_field: str = None, + ids: list = None, + ranks: list = None, + rank_propagation: bool = True, +) -> (GenomeSequencesDirectoryFormat, TSVTaxonomyFormat): + # Parameter validation + rql_query = parameter_validation(rql_query=rql_query, + ids=ids, + data_type="genome", + data_field=data_field, + metadata=ids_metadata + ) + + response_genomes = download_data(data_type="genome", + query=rql_query, + accept="application/json", + select=["genome_id", "taxon_id"] + ) + + # Get genome sequences and create FASTA files + genomes = get_genome_sequences(response_genomes=response_genomes) + + # Get taxonomy for sequences + taxonomy = get_taxonomy(response_sequences=response_genomes, + ranks=ranks, + rank_propagation=rank_propagation, + accession_name="accession") + + return genomes, taxonomy + + +def get_genome_sequences(response_genomes): + # Extract genome ids from response (list of dicts) + genome_ids = set([str(entry['genome_id']) for entry in response_genomes]) + + # Fetch the genome sequences for all genome ids + genome_sequences = download_data( + data_type="genome_sequence", + query=f"in(genome_id,({','.join(genome_ids)}))", + accept="application/json", + select=["accession", "description", "genome_name", + "genome_id", "sequence"] + ) + + # Create FASTA files from sequences + genomes = create_genome_fasta(genome_sequences=genome_sequences) + + return genomes + + +def get_bv_brc_genome_features( + ids_metadata: Union[qiime2.NumericMetadataColumn, + qiime2.CategoricalMetadataColumn] = None, + rql_query: str = None, + data_field: str = None, + ids: list = None, + ranks: list = None, + rank_propagation: bool = True, +) -> ( + GenesDirectoryFormat, ProteinsDirectoryFormat, + TSVTaxonomyFormat, LociDirectoryFormat): + # Parameter validation + rql_query = parameter_validation(rql_query=rql_query, + ids=ids, + data_type="genome_feature", + data_field=data_field, + metadata=ids_metadata + ) + + # Download genome_features data object as JSON + genome_features = download_data(data_type="genome_feature", + query=rql_query, + accept="application/json", + select=["genome_id", "feature_id", + "aa_sequence_md5", + "na_sequence_md5", "taxon_id"] + ) + + # Download nucleotide and protein sequences for features + genes, proteins = get_sequences(genome_features=genome_features) + + # Download taxonomy for feature sequences + taxonomy = get_taxonomy(response_sequences=genome_features, + ranks=ranks, + rank_propagation=rank_propagation, + accession_name="feature_id" + ) + + # Download GFF files for genome ids + loci = get_loci(response_sequences=genome_features) + + return genes, proteins, taxonomy, loci + + +def get_sequences(genome_features): + genes = GenesDirectoryFormat() + proteins = ProteinsDirectoryFormat() + + # Extract md5 values for nucleotide and protein sequences from + # genome_features JSON + md5_ids = set([item[key] for item in genome_features for key in + ['aa_sequence_md5', 'na_sequence_md5'] if + key in item]) + + # Download sequences corresponding to the md5 values + feature_sequences = download_data(data_type="feature_sequence", + query=f"in(md5,({','.join(md5_ids)}))", + accept="application/json", + select=["md5", "sequence"]) + + # Create dict with md5 id as keys and sequence as value for faster look up + lookup_dict = {entry['md5']: entry['sequence'] for entry in + feature_sequences} + + fasta_genes = {} + fasta_proteins = {} + + # Loop through list of dicts + for entry in genome_features: + # Extract genome_id + genome_id = entry["genome_id"] + + if 'na_sequence_md5' in entry: + # Create FASTA entry for the sequence + fasta_na = (f">{entry['feature_id']}\n" + f"{lookup_dict[entry['na_sequence_md5']].upper()}\n") + + # Add FASTA entry to entries with the same genome id + fasta_genes[genome_id] = fasta_genes.get(genome_id, "") + fasta_na + + if 'aa_sequence_md5' in entry: + # Create FASTA entry for the sequence + fasta_aa = (f">{entry['feature_id']}\n" + f"{lookup_dict[entry['aa_sequence_md5']].upper()}\n") + + # Add FASTA entry to entries with the same genome id + fasta_proteins[genome_id] = (fasta_proteins.get(genome_id, "") + + fasta_aa) + + # Save genes and proteins as FASTA files one file per genome_id + for genome_id, fasta_sequences in fasta_genes.items(): + with open(os.path.join(str(genes), f"{genome_id}.fasta"), + 'w') as fasta_file: + fasta_file.write(fasta_sequences) + + for genome_id, fasta_sequences in fasta_proteins.items(): + with open(os.path.join(str(proteins), f"{genome_id}.fasta"), + 'w') as fasta_file: + fasta_file.write(fasta_sequences) + + return genes, proteins + + +def get_loci(response_sequences): + # Init loci dir format + loci = LociDirectoryFormat() + + # Extract genome ids from genome feature JSON + genome_ids = set([str(entry['genome_id']) for entry in response_sequences]) + + # Download GFF files for all genome ids. For every id there is a separate + # request + for genome_id in genome_ids: + gff = download_data(data_type="genome_feature", + query=f"eq(genome_id,{genome_id})", + accept="application/gff", + ) + + # Process the loci data and write to file + with open(os.path.join(str(loci), f"{genome_id}.gff"), 'w') as file: + file.write(process_loci(gff_string=gff)) + + return loci + + +def process_loci(gff_string): + # Split the string into lines + lines = gff_string.splitlines() + modified_lines = [] + + for line in lines: + if line.startswith("#"): + # Keep header lines unchanged + modified_lines.append(line) + else: + # Remove the first five characters "accn|" + modified_lines.append(line[5:]) + + # Join the lines back into a single string + return "\n".join(modified_lines) + + +def get_taxonomy(response_sequences, ranks, rank_propagation, accession_name): + # Extract all taxon_ids from response_sequences JSON + taxon_ids = set([str(entry['taxon_id']) for entry in response_sequences]) + + # Download taxonomy for all taxon_ids as df + taxonomy = download_data(data_type="taxonomy", + query=f"in(taxon_id,({','.join(taxon_ids)}))", + accept="text/tsv", + select=["taxon_id", "lineage_names", + "lineage_ranks"]) + + # Transform the df to conform with TSVTaxonomyFormat + return create_taxonomy( + taxonomy_bvbrc=taxonomy, + response_sequences=response_sequences, + ranks=ranks, + rank_propagation=rank_propagation, + accession_name=accession_name, + ) + + +def create_taxonomy_entry( + lineage_names, + lineage_ranks, + ranks=None, + rank_propagation=True +): + # Set ranks to default if no list is specified + if not ranks: + ranks = _default_ranks + + # Split the lineage names and ranks by ';' + lineage_split = lineage_names.split(';') + rank_split = lineage_ranks.split(';') + + # Initialize the taxonomy dictionary with the allowed ranks + taxonomy = OrderedDict((r, None) for r in ranks) + + # Loop over each rank and assign the corresponding name + for rank, name in zip(rank_split, lineage_split): + if rank in ranks: + taxonomy[rank] = name + + # Handle genus and species splitting logic + if 'genus' in ranks and taxonomy.get('species'): + species = taxonomy.get('species') + # If genus includes the species name cut it out and put it into species + if taxonomy.get('genus'): + if species.startswith(taxonomy['genus'] + ' '): + species = species[len(taxonomy['genus']) + 1:] + taxonomy['species'] = species + # If species includes the genus name cut it out and put it into genus + elif ' ' in species: + genus, species = species.split(' ', 1) + taxonomy['genus'] = genus + taxonomy['species'] = species + + # Apply rank propagation if enabled. Assigns last higher rank to lower + # undefined ranks + if rank_propagation: + last_label = None + for rank in taxonomy: + if taxonomy[rank] is None: + taxonomy[rank] = last_label + last_label = taxonomy[rank] + + # Create taxonomy entry + taxonomy_entry = '; '.join( + f"{_allowed_ranks.get(rank, '')}{name if name else ''}" for rank, name + in taxonomy.items() + ) + return taxonomy_entry + + +def create_taxonomy(taxonomy_bvbrc, response_sequences, + ranks, rank_propagation, accession_name): + # Create qiime style taxonomy entries for all rows in taxonomy_bvbrc + taxonomy_bvbrc['Taxon'] = ( + taxonomy_bvbrc.apply(lambda row: + create_taxonomy_entry( + lineage_names=row['lineage_names'], + lineage_ranks=row['lineage_ranks'], + rank_propagation=rank_propagation, + ranks=ranks), axis=1)) + + taxonomy_df = pd.DataFrame(columns=['Feature ID', 'Taxon']) + + # Loop through each JSON dictionary in the list + for entry in response_sequences: + # Get the accession and taxon_id from the JSON dictionary + accession = entry.get(accession_name) + taxon_id = str(entry.get('taxon_id')) + + # Look up the corresponding taxon in taxonomy_bvbrc using taxon_id + taxon_name = taxonomy_bvbrc.loc[ + taxonomy_bvbrc['taxon_id'] == taxon_id, 'Taxon'].values + + # Create a new row as a DataFrame with accession and corresponding + # taxon + new_row = pd.DataFrame( + {'Feature ID': accession, 'Taxon': taxon_name}) + + # Append the new row to the taxonomy_df + taxonomy_df = pd.concat([taxonomy_df, new_row], ignore_index=True) + + # Set index + taxonomy_df = taxonomy_df.set_index('Feature ID') + + # Write taxonomy_df to taxonomy format + taxonomy = TSVTaxonomyFormat() + + with taxonomy.open() as file_handle: + file_handle.write(taxonomy_df.to_csv(sep="\t")) + + return taxonomy + + +def create_genome_fasta(genome_sequences): + genomes = GenomeSequencesDirectoryFormat() + + # Dictionary to hold sequences grouped by genome_id + fasta_entries = {} + + # Loop over all dicts in genome_sequences + for entry in genome_sequences: + genome_id = entry['genome_id'] + if genome_id not in fasta_entries: + fasta_entries[genome_id] = [] + + # Construct FASTA format to be identical to BV-BRC FASTA headers + header = (f">{entry['accession']} {entry['description']} " + f"[{entry['genome_name']} | {entry['genome_id']}]") + + # Append FASTA entry to fasta_entries + fasta_entries[genome_id].append( + f"{header}\n{entry['sequence'].upper()}") + + # Write each genome_id's sequences to a separate FASTA file + for genome_id, sequences in fasta_entries.items(): + fasta_content = "\n".join(sequences) + fasta_filename = os.path.join(str(genomes), f"{genome_id}.fasta") + + with open(fasta_filename, 'w') as fasta_file: + fasta_file.write(fasta_content) + + return genomes + + +def download_data(data_type, query, accept, select=None): + # URL and headers for requests + base_url = "https://www.bv-brc.org/api/" + url = base_url + data_type + "/" + headers = {'Content-Type': 'application/rqlquery+x-www-form-urlencoded', + 'ACCEPT': accept} + + results = [] if accept == "application/json" else pd.DataFrame() + + # BV-BRC sets an upper limit of 25000 entries that can be fetched at once + batch_size = 25000 + start = 0 + + while True: + # Create the data string with current batch start and limit + data = f"{query}&limit({batch_size},{start})" + + # Add select parameter to only download specified fields + if select: + data = data + f"&select({','.join(select)})" + + # POST request + response = requests.post(url=url, data=data, headers=headers) + + # If the response is successful, process the data + if response.status_code == 200: + if accept == "application/json": + result = response.json() + results.extend(result) # Add the current batch to the list + elif accept == "text/tsv": + result = read_tsv_data_with_dtypes(response=response, + data_type=data_type) + + results = pd.concat([results, result], ignore_index=True) + else: # application == application/gff + return response.text + + # If the number of results is less than the batch size, break the + # loop + if len(result) < batch_size: + break + + # Increment the start for the next batch + start += batch_size + + # Handle errors + elif response.status_code == 400: + error_handling(response=response, data_type=data_type) + break + else: + raise ValueError(response.text) + + return results + + +def read_tsv_data_with_dtypes(response, data_type): + tsv_data = StringIO(response.text) + + # Read only the header to get a list of the column names + columns_in_file = pd.read_csv(tsv_data, sep='\t', nrows=0).columns.tolist() + + # Filter the dtype dictionary to include only the columns that + # exist in the file + dtype_dict = data_fields_bvbrc.get(data_type, {}) + filtered_dtype_dict = {col: dtype_dict[col] for col in columns_in_file if + col in dtype_dict} + + # Move the file pointer to the beginning of the file-like object + tsv_data.seek(0) + + # Read the entire file with the filtered dtype dictionary + df = pd.read_csv(tsv_data, sep='\t', dtype=filtered_dtype_dict) + + # Raise value error if no data was retrieved + if len(df) == 0: + raise ValueError("No data could be retrieved. Either because of an " + "incorrect RQL query or because no data exists for " + "the query.") + + return df + + +def error_handling(response, data_type): + # No data found for query or incorrect RQL query + if response.text == "[]": + raise ValueError("No data could be retrieved. Either because of an " + "incorrect RQL query or because no data exists for " + "the query.") + + elif response.text.startswith("A Database Error Occured:"): + + # Parse the response dict + json_str = response.text[response.text.find('{'):] + response_dict = json.loads(json_str) + + # Incorrect RQL operator + if response_dict['msg'].startswith("undefined field object"): + raise ValueError( + f"Error code {response_dict['code']}: {response_dict['msg']}. " + f"Incorrect RQL query operator." + ) + + # Incorrect field for data type + elif response_dict['msg'].startswith("undefined field"): + raise ValueError( + f"Error code {response_dict['code']}: {response_dict['msg']}. " + f"\nAllowed fields for data type {data_type}: " + f"\n{list(data_fields_bvbrc[data_type].keys())}" + ) + + # Handling any other errors that start with "A Database Error Occured:" + else: + raise ValueError( + f"Error code {response_dict['code']}: {response_dict['msg']}." + ) + + # Handling any other error codes + else: + raise ValueError(response.text) + + +def parameter_validation(rql_query=None, + ids=None, + data_type=None, + data_field=None, + metadata=None): + # Error if any data_type is None + if not data_type: + raise ValueError("Parameter 'data-type' has to be specified.") + + local = locals().copy() + # Error if any other parameter is specified simultaneously with rql_query + # or metadata + for parameter_1 in ["rql_query", "metadata"]: + for parameter_2, value in local.items(): + if (parameter_2 != parameter_1 and + parameter_2 != "data_type" and + local[parameter_1] is not None and + value is not None): + raise ValueError( + f"Parameters '{parameter_1}' and '{parameter_2}' " + "can't be used simultaneously.") + + # Error if ids or data_fields is specified without the other + if (ids and not data_field) or (data_field and not ids): + raise ValueError( + "If parameter 'ids' is given, parameter 'data-field' has to be " + "specified and vice versa.") + + # Error if rql_query, metadata and ids parameters are not given + if not rql_query and not ids and not metadata: + raise ValueError("At least one of the parameters 'rql-query', 'ids' " + "or 'ids_metadata' has to be specified.") + + # Extract data_field and ids from metadata + if metadata is not None: + data_field = metadata.to_series().name + ids = metadata.to_series() + + # Check if data_field is allowed for specified data_type + if (data_field is not None and + data_field not in data_fields_bvbrc[data_type].keys()): + raise ValueError( + f"The data-field '{data_field}' is not permitted for the " + f"data-type '{data_type}'.\nAllowed data fields are: " + f"{list(data_fields_bvbrc[data_type].keys())}") + + # Construct the RQL query + if ids is not None or metadata is not None: + # Join the quoted ids with commas and percent encode the values + joined_ids = ','.join(quote(f'"{str(id_)}"') for id_ in ids) + + # Final query + rql_query = f'in({data_field},({joined_ids}))' + + return rql_query + + +data_fields_bvbrc = { + "antibiotics": { + "_version_": str, + "antibiotic_name": str, + "atc_classification": str, + "canonical_smiles": str, + "cas_id": str, + "date_inserted": str, + "date_modified": str, + "description": str, + "drugbank_interactions": str, + "inchi_key": str, + "isomeric_smiles": str, + "mechanism_of_action": str, + "molecular_formula": str, + "molecular_weight": str, + "pharmacological_classes": str, + "pharmacology": str, + "pubchem_cid": str, + "pubchem_cid_i": str, + "synonyms": str + }, + "enzyme_class_ref": { + "_version_": str, + "date_inserted": str, + "date_modified": str, + "ec_number": str, + "go": str + }, + "epitope": { + "_version_": str, + "assay_results": str, + "bcell_assays": str, + "comments": str, + "date_inserted": str, + "date_modified": str, + "end": float, + "epitope_id": str, + "epitope_sequence": str, + "epitope_type": str, + "host_name": str, + "mhc_assays": str, + "organism": str, + "protein_accession": str, + "protein_id": str, + "protein_name": str, + "start": float, + "taxon_id": str, + "taxon_lineage_ids": str, + "taxon_lineage_names": str, + "tcell_assays": str, + "total_assays": float + }, + "epitope_assay": { + "_version_": str, + "assay_group": str, + "assay_id": str, + "assay_measurement": str, + "assay_measurement_unit": str, + "assay_method": str, + "assay_result": str, + "assay_type": str, + "authors": str, + "date_inserted": str, + "date_modified": str, + "end": float, + "epitope_id": str, + "epitope_sequence": str, + "epitope_type": str, + "host_name": str, + "host_taxon_id": str, + "mhc_allele": str, + "mhc_allele_class": str, + "organism": str, + "pdb_id": str, + "pmid": str, + "protein_accession": str, + "protein_id": str, + "protein_name": str, + "start": float, + "taxon_id": str, + "taxon_lineage_ids": str, + "taxon_lineage_names": str, + "title": str + }, + "experiment": { + "_version_": str, + "additional_data": str, + "additional_metadata": str, + "biosets": float, + "date_inserted": str, + "date_modified": str, + "detection_instrument": str, + "doi": str, + "exp_description": str, + "exp_id": str, + "exp_name": str, + "exp_poc": str, + "exp_protocol": str, + "exp_title": str, + "exp_type": str, + "experimenters": str, + "genome_id": str, + "measurement_technique": str, + "organism": str, + "pmid": str, + "public_identifier": str, + "public_repository": str, + "samples": float, + "strain": str, + "study_description": str, + "study_institution": str, + "study_name": str, + "study_pi": str, + "study_title": str, + "taxon_id": str, + "taxon_lineage_ids": str, + "treatment_amount": str, + "treatment_duration": str, + "treatment_name": str, + "treatment_type": str + }, + "bioset": { + "_version_": str, + "additional_data": str, + "additional_metadata": str, + "analysis_group_1": str, + "analysis_group_2": str, + "analysis_method": str, + "bioset_criteria": str, + "bioset_description": str, + "bioset_id": str, + "bioset_name": str, + "bioset_result": str, + "bioset_type": str, + "date_inserted": str, + "date_modified": str, + "entity_count": str, + "entity_type": str, + "exp_id": str, + "exp_name": str, + "exp_title": str, + "exp_type": str, + "genome_id": str, + "organism": str, + "protocol": str, + "result_type": str, + "strain": str, + "study_description": str, + "study_institution": str, + "study_name": str, + "study_pi": str, + "study_title": str, + "taxon_id": str, + "taxon_lineage_ids": str, + "treatment_amount": str, + "treatment_duration": str, + "treatment_name": str, + "treatment_type": str + }, + "bioset_result": { + "_version_": str, + "bioset_description": str, + "bioset_id": str, + "bioset_name": str, + "bioset_type": str, + "counts": float, + "date_inserted": str, + "date_modified": str, + "entity_id": str, + "entity_name": str, + "entity_type": str, + "exp_id": str, + "exp_name": str, + "exp_title": str, + "exp_type": str, + "feature_id": str, + "fpkm": float, + "gene": str, + "gene_id": str, + "genome_id": str, + "id": str, + "locus_tag": str, + "log2_fc": float, + "organism": str, + "other_ids": str, + "other_value": float, + "p_value": float, + "patric_id": str, + "product": str, + "protein_id": str, + "result_type": str, + "strain": str, + "taxon_id": str, + "tpm": float, + "treatment_amount": str, + "treatment_duration": str, + "treatment_name": str, + "treatment_type": str, + "uniprot_id": str, + "z_score": float + }, + "gene_ontology_ref": { + "_version_": str, + "date_inserted": str, + "date_modified": str, + "definition": str, + "go_id": str, + "go_name": str, + "ontology": str + }, + "genome": { + "_version_": str, + "additional_metadata": str, + "altitude": str, + "antimicrobial_resistance": str, + "antimicrobial_resistance_evidence": str, + "assembly_accession": str, + "assembly_method": str, + "authors": str, + "bioproject_accession": str, + "biosample_accession": str, + "biovar": str, + "body_sample_site": str, + "body_sample_subsite": str, + "cds": float, + "cds_ratio": float, + "cell_shape": str, + "checkm_completeness": float, + "checkm_contamination": float, + "chromosomes": float, + "clade": str, + "class": str, + "coarse_consistency": float, + "collection_date": str, + "collection_year": float, + "comments": str, + "common_name": str, + "completion_date": str, + "contig_l50": float, + "contig_n50": float, + "contigs": float, + "core_families": float, + "core_family_ratio": float, + "culture_collection": str, + "date_inserted": str, + "date_modified": str, + "depth": str, + "disease": str, + "family": str, + "fine_consistency": float, + "gc_content": float, + "genbank_accessions": str, + "genome_id": str, + "genome_length": float, + "genome_name": str, + "genome_quality": str, + "genome_quality_flags": str, + "genome_status": str, + "genus": str, + "geographic_group": str, + "geographic_location": str, + "gram_stain": str, + "h1_clade_global": str, + "h1_clade_us": str, + "h3_clade": str, + "h5_clade": str, + "h_type": float, + "habitat": str, + "host_age": str, + "host_common_name": str, + "host_gender": str, + "host_group": str, + "host_health": str, + "host_name": str, + "host_scientific_name": str, + "hypothetical_cds": float, + "hypothetical_cds_ratio": float, + "isolation_comments": str, + "isolation_country": str, + "isolation_site": str, + "isolation_source": str, + "kingdom": str, + "lab_host": str, + "latitude": str, + "lineage": str, + "longitude": str, + "mat_peptide": float, + "missing_core_family_ids": str, + "mlst": str, + "motility": str, + "n_type": float, + "ncbi_project_id": str, + "nearest_genomes": str, + "optimal_temperature": str, + "order": str, + "organism_name": str, + "other_clinical": str, + "other_environmental": str, + "other_names": str, + "other_typing": str, + "outgroup_genomes": str, + "owner": str, + "oxygen_requirement": str, + "p2_genome_id": str, + "partial_cds": float, + "partial_cds_ratio": float, + "passage": str, + "pathovar": str, + "patric_cds": float, + "ph1n1_like": str, + "phenotype": str, + "phylum": str, + "plasmids": float, + "plfam_cds": float, + "plfam_cds_ratio": float, + "public": str, + "publication": str, + "reference_genome": str, + "refseq_accessions": str, + "refseq_cds": float, + "refseq_project_id": str, + "rrna": float, + "salinity": str, + "season": str, + "segment": str, + "segments": float, + "sequencing_centers": str, + "sequencing_depth": str, + "sequencing_platform": str, + "sequencing_status": str, + "serovar": str, + "species": str, + "sporulation": str, + "sra_accession": str, + "state_province": str, + "strain": str, + "subclade": str, + "subtype": str, + "superkingdom": str, + "taxon_id": str, + "taxon_lineage_ids": str, + "taxon_lineage_names": str, + "temperature_range": str, + "trna": float, + "type_strain": str, + "user_read": str, + "user_write": str + }, + "strain": { + "1_pb2": str, + "2_pb1": str, + "3_pa": str, + "4_ha": str, + "5_np": str, + "6_na": str, + "7_mp": str, + "8_ns": str, + "_version_": str, + "collection_date": str, + "collection_year": float, + "date_inserted": str, + "date_modified": str, + "family": str, + "genbank_accessions": str, + "genome_ids": str, + "genus": str, + "geographic_group": str, + "h_type": float, + "host_common_name": str, + "host_group": str, + "host_name": str, + "id": str, + "isolation_country": str, + "l": str, + "lab_host": str, + "m": str, + "n_type": float, + "other_segments": str, + "owner": str, + "passage": str, + "public": str, + "s": str, + "season": str, + "segment_count": float, + "species": str, + "status": str, + "strain": str, + "subtype": str, + "taxon_id": str, + "taxon_lineage_ids": str, + "taxon_lineage_names": str, + "user_read": str, + "user_write": str + }, + "genome_amr": { + "_version_": str, + "antibiotic": str, + "computational_method": str, + "computational_method_performance": str, + "computational_method_version": str, + "date_inserted": str, + "date_modified": str, + "evidence": str, + "genome_id": str, + "genome_name": str, + "id": str, + "laboratory_typing_method": str, + "laboratory_typing_method_version": str, + "laboratory_typing_platform": str, + "measurement": str, + "measurement_sign": str, + "measurement_unit": str, + "measurement_value": str, + "owner": str, + "pmid": str, + "public": str, + "resistant_phenotype": str, + "source": str, + "taxon_id": str, + "testing_standard": str, + "testing_standard_year": float, + "user_read": str, + "user_write": str, + "vendor": str + }, + "feature_sequence": { + "_version_": str, + "date_inserted": str, + "date_modified": str, + "md5": str, + "sequence": str, + "sequence_type": str + }, + "genome_feature": { + "aa_length": float, + "aa_sequence_md5": str, + "accession": str, + "alt_locus_tag": str, + "annotation": str, + "brc_id": str, + "classifier_round": float, + "classifier_score": float, + "codon_start": float, + "date_inserted": str, + "date_modified": str, + "end": float, + "feature_id": str, + "feature_type": str, + "figfam_id": str, + "gene": str, + "gene_id": str, + "genome_id": str, + "genome_name": str, + "go": str, + "location": str, + "na_length": float, + "na_sequence_md5": str, + "notes": str, + "og_id": str, + "owner": str, + "p2_feature_id": str, + "patric_id": str, + "pdb_accession": str, + "pgfam_id": str, + "plfam_id": str, + "product": str, + "property": str, + "protein_id": str, + "public": str, + "refseq_locus_tag": str, + "segments": str, + "sequence_id": str, + "sog_id": str, + "start": float, + "strand": str, + "taxon_id": str, + "uniprotkb_accession": str, + "user_read": str, + "user_write": str + }, + "genome_sequence": { + "_version_": str, + "accession": str, + "chromosome": str, + "date_inserted": str, + "date_modified": str, + "description": str, + "gc_content": float, + "genome_id": str, + "genome_name": str, + "gi": float, + "length": float, + "mol_type": str, + "owner": str, + "p2_sequence_id": str, + "plasmid": str, + "public": str, + "release_date": str, + "segment": str, + "sequence": str, + "sequence_id": str, + "sequence_md5": str, + "sequence_status": str, + "sequence_type": str, + "taxon_id": str, + "topology": str, + "user_read": str, + "user_write": str, + "version": str + }, + "id_ref": { + "_version_": str, + "date_inserted": str, + "date_modified": str, + "id": str, + "id_type": str, + "id_value": str, + "uniprotkb_accession": str + }, + "misc_niaid_sgc": { + "_version_": str, + "date_inserted": str, + "date_modified": str, + "gene_symbol_collection": str, + "genus": str, + "has_clones": str, + "has_proteins": str, + "selection_criteria": str, + "species": str, + "strain": str, + "target_id": str, + "target_status": str + }, + "pathway": { + "_version_": str, + "accession": str, + "alt_locus_tag": str, + "annotation": str, + "date_inserted": str, + "date_modified": str, + "ec_description": str, + "ec_number": str, + "feature_id": str, + "gene": str, + "genome_ec": str, + "genome_id": str, + "genome_name": str, + "id": str, + "owner": str, + "pathway_class": str, + "pathway_ec": str, + "pathway_id": str, + "pathway_name": str, + "patric_id": str, + "product": str, + "public": str, + "refseq_locus_tag": str, + "sequence_id": str, + "taxon_id": str, + "user_read": str, + "user_write": str + }, + "pathway_ref": { + "_version_": str, + "date_inserted": str, + "date_modified": str, + "ec_description": str, + "ec_number": str, + "id": str, + "map_location": str, + "map_name": str, + "map_type": str, + "occurrence": float, + "pathway_class": str, + "pathway_id": str, + "pathway_name": str + }, + "ppi": { + "_version_": str, + "category": str, + "date_inserted": str, + "date_modified": str, + "detection_method": str, + "domain_a": str, + "domain_b": str, + "evidence": str, + "feature_id_a": str, + "feature_id_b": str, + "gene_a": str, + "gene_b": str, + "genome_id_a": str, + "genome_id_b": str, + "genome_name_a": str, + "genome_name_b": str, + "id": str, + "interaction_type": str, + "interactor_a": str, + "interactor_b": str, + "interactor_desc_a": str, + "interactor_desc_b": str, + "interactor_type_a": str, + "interactor_type_b": str, + "pmid": str, + "refseq_locus_tag_a": str, + "refseq_locus_tag_b": str, + "score": str, + "source_db": str, + "source_id": str, + "taxon_id_a": str, + "taxon_id_b": str + }, + "protein_family_ref": { + "_version_": str, + "date_inserted": str, + "date_modified": str, + "family_id": str, + "family_product": str, + "family_type": str + }, + "sequence_feature": { + "aa_sequence_md5": str, + "aa_variant": str, + "additional_metadata": str, + "comments": str, + "date_inserted": str, + "date_modified": str, + "end": float, + "evidence_code": str, + "feature_id": str, + "genbank_accession": str, + "gene": str, + "genome_id": str, + "genome_name": str, + "id": str, + "length": float, + "patric_id": str, + "product": str, + "publication": str, + "refseq_locus_tag": str, + "segment": str, + "segments": str, + "sf_category": str, + "sf_id": str, + "sf_name": str, + "sf_sequence": str, + "sf_sequence_md5": str, + "source": str, + "source_aa_sequence": str, + "source_id": str, + "source_sf_location": str, + "source_strain": str, + "start": float, + "subtype": str, + "taxon_id": str, + "variant_types": str + }, + "sequence_feature_vt": { + "additional_metadata": str, + "comments": str, + "date_inserted": str, + "date_modified": str, + "id": str, + "sf_category": str, + "sf_id": str, + "sf_name": str, + "sf_sequence": str, + "sf_sequence_md5": str, + "sfvt_genome_count": str, + "sfvt_genome_ids": str, + "sfvt_id": str, + "sfvt_sequence": str, + "sfvt_sequence_md5": str, + "sfvt_variations": str + }, + "sp_gene": { + "_version_": str, + "alt_locus_tag": str, + "antibiotics": str, + "antibiotics_class": str, + "classification": str, + "date_inserted": str, + "date_modified": str, + "e_value": str, + "evidence": str, + "feature_id": str, + "function": str, + "gene": str, + "genome_id": str, + "genome_name": str, + "id": str, + "identity": float, + "organism": str, + "owner": str, + "patric_id": str, + "pmid": str, + "product": str, + "property": str, + "property_source": str, + "public": str, + "query_coverage": float, + "refseq_locus_tag": str, + "same_genome": float, + "same_genus": float, + "same_species": float, + "source": str, + "source_id": str, + "subject_coverage": float, + "taxon_id": str, + "user_read": str, + "user_write": str + }, + "sp_gene_ref": { + "_version_": str, + "antibiotics": str, + "antibiotics_class": str, + "assertion": str, + "classification": str, + "date_inserted": str, + "date_modified": str, + "function": str, + "gene_id": str, + "gene_name": str, + "genus": str, + "gi": str, + "id": str, + "locus_tag": str, + "organism": str, + "pmid": str, + "product": str, + "property": str, + "source": str, + "source_id": str, + "species": str + }, + "spike_lineage": { + "_version_": str, + "country": str, + "date_inserted": str, + "date_modified": str, + "growth_rate": float, + "id": str, + "lineage": str, + "lineage_count": float, + "lineage_of_concern": str, + "month": str, + "prevalence": float, + "region": str, + "sequence_features": str, + "total_isolates": float + }, + "spike_variant": { + "_version_": str, + "aa_variant": str, + "country": str, + "date_inserted": str, + "date_modified": str, + "growth_rate": float, + "id": str, + "lineage_count": float, + "month": str, + "prevalence": float, + "region": str, + "sequence_features": str, + "total_isolates": float + }, + "structured_assertion": { + "_version_": str, + "comment": str, + "date_inserted": str, + "date_modified": str, + "evidence_code": str, + "feature_id": str, + "id": str, + "owner": str, + "patric_id": str, + "pmid": str, + "property": str, + "public": str, + "refseq_locus_tag": str, + "score": str, + "source": str, + "user_read": str, + "user_write": str, + "value": str + }, + "subsystem": { + "_version_": str, + "active": str, + "class": str, + "date_inserted": str, + "date_modified": str, + "feature_id": str, + "gene": str, + "genome_id": str, + "genome_name": str, + "id": str, + "owner": str, + "patric_id": str, + "product": str, + "public": str, + "refseq_locus_tag": str, + "role_id": str, + "role_name": str, + "subclass": str, + "subsystem_id": str, + "subsystem_name": str, + "superclass": str, + "taxon_id": str, + "user_read": str, + "user_write": str + }, + "subsystem_ref": { + "_version_": str, + "class": str, + "date_inserted": str, + "date_modified": str, + "description": str, + "id": str, + "notes": str, + "pmid": str, + "role_id": str, + "role_name": str, + "subclass": str, + "subsystem_id": str, + "subsystem_name": str, + "superclass": str + }, + "taxonomy": { + "_version_": str, + "cds_mean": float, + "cds_sd": float, + "core_families": float, + "core_family_ids": str, + "description": str, + "division": str, + "genetic_code": str, + "genome_count": float, + "genome_length_mean": float, + "genome_length_sd": float, + "genomes": float, + "genomes_f": str, + "hypothetical_cds_ratio_mean": float, + "hypothetical_cds_ratio_sd": float, + "lineage": str, + "lineage_ids": str, + "lineage_names": str, + "lineage_ranks": str, + "other_names": str, + "parent_id": str, + "plfam_cds_ratio_mean": float, + "plfam_cds_ratio_sd": float, + "taxon_id": str, + "taxon_id_i": str, + "taxon_name": str, + "taxon_rank": str + }, + "protein_structure": { + "alignments": str, + "authors": str, + "date_inserted": str, + "date_modified": str, + "feature_id": str, + "file_path": str, + "gene": str, + "genome_id": str, + "institution": str, + "method": str, + "organism_name": str, + "patric_id": str, + "pdb_id": str, + "pmid": str, + "product": str, + "release_date": str, + "resolution": str, + "sequence": str, + "sequence_md5": str, + "taxon_id": str, + "taxon_lineage_ids": str, + "taxon_lineage_names": str, + "title": str, + "uniprotkb_accession": str + }, + "protein_feature": { + "aa_sequence_md5": str, + "classification": str, + "comments": str, + "date_inserted": str, + "date_modified": str, + "description": str, + "e_value": str, + "end": float, + "evidence": str, + "feature_id": str, + "feature_type": str, + "gene": str, + "genome_id": str, + "genome_name": str, + "id": str, + "interpro_description": str, + "interpro_id": str, + "length": float, + "patric_id": str, + "product": str, + "publication": str, + "refseq_locus_tag": str, + "score": float, + "segments": str, + "sequence": str, + "source": str, + "source_id": str, + "start": float, + "taxon_id": str + }, + "surveillance": { + "additional_metadata": str, + "alcohol_or_other_drug_dependence": str, + "breastfeeding": str, + "chest_imaging_interpretation": str, + "chronic_conditions": str, + "collection_city": str, + "collection_country": str, + "collection_date": str, + "collection_latitude": float, + "collection_longitude": float, + "collection_poi": str, + "collection_season": str, + "collection_state_province": str, + "collection_year": str, + "collector_institution": str, + "collector_name": str, + "comments": str, + "contact_email_address": str, + "contributing_institution": str, + "date_inserted": str, + "date_modified": str, + "daycare_attendance": str, + "days_elapsed_to_disease_status": str, + "days_elapsed_to_sample_collection": str, + "days_elapsed_to_vaccination": str, + "diagnosis": str, + "dialysis": str, + "disease_severity": str, + "disease_status": str, + "duration_of_exposure": str, + "duration_of_treatment": str, + "ecmo": str, + "education": str, + "embargo_end_date": str, + "exposure": str, + "exposure_type": str, + "genome_id": str, + "geographic_group": str, + "hospitalization_duration": str, + "hospitalized": str, + "host_age": str, + "host_capture_status": str, + "host_common_name": str, + "host_ethnicity": str, + "host_group": str, + "host_habitat": str, + "host_health": str, + "host_height": str, + "host_id_type": str, + "host_identifier": str, + "host_natural_state": str, + "host_race": str, + "host_sex": str, + "host_species": str, + "host_weight": str, + "human_leukocyte_antigens": str, + "id": str, + "infections_within_five_years": str, + "influenza_like_illness_over_the_past_year": str, + "initiation_of_treatment": str, + "intensive_care_unit": str, + "last_update_date": str, + "longitudinal_study": str, + "maintenance_medication": str, + "nursing_home_residence": str, + "onset_hours": str, + "other_vaccinations": str, + "oxygen_saturation": str, + "packs_per_day_for_how_many_years": str, + "pathogen_test_interpretation": str, + "pathogen_test_result": str, + "pathogen_test_type": str, + "pathogen_type": str, + "post_visit_medications": str, + "pre_visit_medications": str, + "pregnancy": str, + "primary_living_situation": str, + "profession": str, + "project_identifier": str, + "sample_accession": str, + "sample_identifier": str, + "sample_material": str, + "sample_receipt_date": str, + "sample_transport_medium": str, + "sequence_accession": str, + "source_of_vaccine_information": str, + "species": str, + "strain": str, + "submission_date": str, + "subtype": str, + "sudden_onset": str, + "symptoms": str, + "taxon_lineage_ids": str, + "tobacco_use": str, + "travel_history": str, + "treatment": str, + "treatment_dosage": str, + "treatment_type": str, + "trimester_of_pregnancy": str, + "types_of_allergies": str, + "use_of_personal_protective_equipment": str, + "vaccination_type": str, + "vaccine_dosage": str, + "vaccine_lot_number": str, + "vaccine_manufacturer": str, + "ventilation": str + }, + "serology": { + "additional_metadata": str, + "collection_city": str, + "collection_country": str, + "collection_date": str, + "collection_state": str, + "collection_year": str, + "comments": str, + "contributing_institution": str, + "date_inserted": str, + "date_modified": str, + "genbank_accession": str, + "geographic_group": str, + "host_age": str, + "host_age_group": str, + "host_common_name": str, + "host_health": str, + "host_identifier": str, + "host_sex": str, + "host_species": str, + "host_type": str, + "id": str, + "positive_definition": str, + "project_identifier": str, + "sample_accession": str, + "sample_identifier": str, + "serotype": str, + "strain": str, + "taxon_lineage_ids": str, + "test_antigen": str, + "test_interpretation": str, + "test_pathogen": str, + "test_result": str, + "test_type": str, + "virus_identifier": str + } +} diff --git a/rescript/citations.bib b/rescript/citations.bib index bc4b4fa..c5a507c 100644 --- a/rescript/citations.bib +++ b/rescript/citations.bib @@ -178,3 +178,14 @@ @article{nilsson2019unite pmcid = {PMC6324048}, pmid = {30371820}, } + +@article{olson2023introducing, + title={Introducing the bacterial and viral bioinformatics resource center (BV-BRC): a resource combining PATRIC, IRD and ViPR}, + author={Olson, Robert D and Assaf, Rida and Brettin, Thomas and Conrad, Neal and Cucinell, Clark and Davis, James J and Dempsey, Donald M and Dickerman, Allan and Dietrich, Emily M and Kenyon, Ronald W and others}, + journal={Nucleic acids research}, + volume={51}, + number={D1}, + pages={D678--D689}, + year={2023}, + publisher={Oxford University Press} +} diff --git a/rescript/plugin_setup.py b/rescript/plugin_setup.py index 5a5c6ba..59ca4e9 100644 --- a/rescript/plugin_setup.py +++ b/rescript/plugin_setup.py @@ -8,12 +8,15 @@ import importlib -from q2_types.genome_data import GenomeData, Loci, Proteins +from q2_types.genome_data import GenomeData, Loci, Proteins, Genes, DNASequence +from q2_types.metadata import ImmutableMetadata from qiime2.core.type import TypeMatch from qiime2.plugin import (Str, Plugin, Choices, List, Citations, Range, Int, Float, Visualization, Bool, TypeMap, Metadata, - MetadataColumn, Categorical) + MetadataColumn, Categorical, Numeric) +from .bv_brc import get_bv_brc_genomes, get_bv_brc_metadata, \ + get_bv_brc_genome_features, data_fields_bvbrc from .subsample import subsample_fasta from .trim_alignment import trim_alignment from .merge import merge_taxa @@ -1235,6 +1238,187 @@ ] ) + +bv_brc_rql_query = ('Query in RQL format. To download all data ' + 'for genome_ids "224308.43" and "2030927.4755", the RQL ' + 'query looks like this: "in(genome_id,(224308.43,' + '2030927.4755))". While "in" is an RQL operator, ' + '"genome_id" is a data field and "224308.43,' + '2030927.4755" are the values. It is important to percent ' + 'encode values if they contain illegal characters like ' + 'spaces. The values "Bacillus subtilis" and ' + '"Bacteroidales bacterium" have to be provided with ' + 'percent encoded quotes (%22) and spaces (%20) like ' + 'this: "in(species,(%22Bacillus%20subtilis%22,' + '%22Bacteroidales%20bacterium%22))". Check ' + 'https://www.bv-brc.org/api/doc/ for documentation on ' + 'data types and corresponding data fields.') + +bv_brc_ids_metadata = ('A metadata column obtained with the action ' + 'get-bv-brc-metadata that can be used as a query.') +bv_brc_ids = ('IDs/values of the corresponding data field. This parameter can ' + 'only be used in conjunction with the "data-field" parameter. ' + 'Retrieves all data associated with these IDs/values in the ' + 'specified data field.') +bv_brc_data_field = ('Data field of the corrsponding data type. This ' + 'parameter can only be used in conjunction with the ' + '"ids" parameter. Retrieves all data associated with ' + 'the IDs/values specified in parameter "ids" in this ' + 'data field.') + + +plugin.methods.register_function( + function=get_bv_brc_genomes, + inputs={}, + parameters={ + 'ids_metadata': MetadataColumn[Numeric | Categorical], + 'rql_query': Str, + 'data_field': Str, + 'ids': List[Str], + 'ranks': List[Str % Choices(_allowed_ranks)], + 'rank_propagation': Bool, + }, + outputs=[('genomes', GenomeData[DNASequence]), + ('taxonomy', FeatureData[Taxonomy])], + input_descriptions={}, + parameter_descriptions={ + 'ids_metadata': bv_brc_ids_metadata, + 'rql_query': bv_brc_rql_query, + 'data_field': 'Data field of the data type "genome_sequence". This ' + 'parameter can only be used in conjunction with the ' + '"ids" parameter. Retrieves all genomes associated ' + 'with the IDs/values specified in parameter "ids" in ' + 'this data field. Check ' + 'https://www.bv-brc.org/api/doc/genome_sequence for ' + 'allowed data fields.', + 'ids': bv_brc_ids, + 'ranks': 'List of taxonomic ranks for building a taxonomy. ' + "[default: '" + + "', '".join(_default_ranks) + "']", + 'rank_propagation': RANK_PROPAGATE_DESCRIPTION, + }, + output_descriptions={ + 'genomes': 'Genome sequences for specified query.', + 'taxonomy': 'Taxonomy data for all sequences.' + }, + name='Get genome sequences from the BV-BRC database.', + description="Fetch genome sequences from BV-BRC. BV-BRC (Bacterial and " + "Viral Bioinformatics Resource Center) is a database for " + "bacterial and viral genomes, annotations, and metadata. " + "There are three ways to query data: You can use an RQL " + "query to refine your search and get targeted genomes. By " + "providing IDs/values and a corresponding data field, " + "you can retrieve all genomes associated with those specific " + "values in that data field. And as a third option a metadata " + "column can be provided, to use metadata obtained with the " + "action get-bv-brc-metadata as a new query. Check " + "https://www.bv-brc.org/api/doc/ for documentation.", + citations=[citations['olson2023introducing']] +) + + +plugin.methods.register_function( + function=get_bv_brc_metadata, + inputs={}, + parameters={ + 'ids_metadata': MetadataColumn[Numeric | Categorical], + 'data_type': Str % Choices(list(data_fields_bvbrc.keys())), + 'rql_query': Str, + 'data_field': Str, + 'ids': List[Str], + }, + outputs=[('metadata', ImmutableMetadata)], + input_descriptions={}, + parameter_descriptions={ + 'ids_metadata': bv_brc_ids_metadata, + 'data_type': 'BV-BCR data type for which metadata should be ' + 'downloaded. Check https://www.bv-brc.org/api/doc/ ' + 'for documentation.', + 'rql_query': bv_brc_rql_query, + 'data_field': 'Data field of the specified "data-type". This ' + 'parameter can only be used in conjunction with the ' + '"ids" parameter. Retrieves metadata associated ' + 'with the IDs/values specified in parameter "ids" in ' + 'this data field. Check ' + 'https://www.bv-brc.org/api/doc/ for allowed data ' + 'fields in the specified "data-type".', + 'ids': bv_brc_ids, + }, + output_descriptions={ + 'metadata': 'BV-BCR metadata of specified data type.' + }, + name='Fetch BV-BCR metadata.', + description="Fetch BV-BCR metadata for a specific data type. BV-BRC (" + "Bacterial and Viral Bioinformatics Resource Center) is a " + "database for bacterial and viral genomes, annotations, " + "and metadata. There are three ways to query data: You can " + "use an RQL query to refine your search and get targeted " + "results. By providing IDs/values and a corresponding data " + "field, you can retrieve all metadata associated with those " + "specific values in that data field. And as a third option a " + "metadata column can be provided, to use the results from " + "other data types as a new query. Check " + "https://www.bv-brc.org/api/doc/ for documentation.", + citations=[citations['olson2023introducing']] +) + + +plugin.methods.register_function( + function=get_bv_brc_genome_features, + inputs={}, + parameters={ + 'ids_metadata': MetadataColumn[Numeric | Categorical], + 'rql_query': Str, + 'data_field': Str, + 'ids': List[Str], + 'ranks': List[Str % Choices(_allowed_ranks)], + 'rank_propagation': Bool, + }, + outputs=[ + ('genes', GenomeData[Genes]), + ('proteins', GenomeData[Proteins]), + ('taxonomy', FeatureData[Taxonomy]), + ('loci', GenomeData[Loci]) + ], + input_descriptions={}, + parameter_descriptions={ + 'ids_metadata': bv_brc_ids_metadata, + 'rql_query': bv_brc_rql_query, + 'data_field': 'Data field of the data type "genome_feature". This ' + 'parameter can only be used in conjunction with the ' + '"ids" parameter. Retrieves all data associated with ' + 'the IDs/values specified in parameter "ids" in this ' + 'data field. Check ' + 'https://www.bv-brc.org/api/doc/genome_feature for ' + 'allowed data fields.', + 'ids': bv_brc_ids, + 'ranks': 'List of taxonomic ranks for building a taxonomy ' + "[default: '" + ', '.join(_default_ranks) + "']", + 'rank_propagation': RANK_PROPAGATE_DESCRIPTION, + }, + output_descriptions={ + 'genes': 'Gene', + 'proteins': 'proteins', + 'taxonomy': 'Taxonomy data for all sequences.', + 'loci': 'loci', + }, + name='Fetch genome features from BV-BRC.', + description='Fetch DNA and protein sequences of genome features from ' + 'BV-BRC. BV-BRC (Bacterial and Viral Bioinformatics Resource ' + 'Center) is a database for bacterial and viral genomes, ' + 'annotations, and metadata. There are three ways to query ' + 'data: You can use an RQL query to refine your search and ' + 'get targeted features. By providing IDs/values and a ' + 'corresponding data field, you can retrieve all features ' + 'associated with those specific values in that data field. ' + 'And as a third option a metadata column can be provided, ' + 'to use metadata obtained with the action ' + 'get-bv-brc-metadata as a new query. Check ' + 'https://www.bv-brc.org/api/doc/ for documentation.', + citations=[citations['olson2023introducing']] +) + + # Registrations plugin.register_semantic_types(SILVATaxonomy, SILVATaxidMap) plugin.register_semantic_type_to_format( diff --git a/rescript/tests/test_bv_brc.py b/rescript/tests/test_bv_brc.py new file mode 100644 index 0000000..585e95a --- /dev/null +++ b/rescript/tests/test_bv_brc.py @@ -0,0 +1,784 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2019-2023, QIIME 2 development team. +# +# Distributed under the terms of the Modified BSD License. +# +# The full license is in the file LICENSE, distributed with this software. +# ---------------------------------------------------------------------------- +import unittest +from unittest.mock import Mock, patch, mock_open, MagicMock + +import pandas as pd +import qiime2 +from qiime2.plugin.testing import TestPluginBase + +from rescript.bv_brc import get_bv_brc_genomes, get_bv_brc_metadata, \ + get_bv_brc_genome_features, parameter_validation, \ + error_handling, download_data, create_genome_fasta, \ + create_taxonomy_entry, get_loci, read_tsv_data_with_dtypes, process_loci, \ + get_sequences, get_taxonomy, create_taxonomy, get_genome_sequences + + +class TestParameterValidation(TestPluginBase): + package = 'rescript.tests' + + def test_missing_data_type(self): + # Test when data_type is None + with self.assertRaisesRegex(ValueError, "data-type"): + parameter_validation() + + def test_rql_query_and_other_params(self): + # Test when rql_query is specified with other conflicting parameters + with self.assertRaisesRegex(ValueError, + "rql_query.*can't.*simultaneously"): + parameter_validation(rql_query="some_query", ids=[1, 2], + data_field="genome_id", data_type="genome") + + def test_metadata_and_other_params(self): + # Test when metadata is specified with other conflicting parameters + with self.assertRaisesRegex(ValueError, + "metadata.*can't.*simultaneously"): + parameter_validation(metadata="metadata", ids=[1, 2], + data_field="genome_id", data_type="genome") + + def test_ids_without_data_field(self): + # Test when ids is specified without data_field + with self.assertRaisesRegex(ValueError, r"ids.*data-field"): + parameter_validation(ids=[1, 2], data_type="genome") + + def test_no_rql_query_ids_metadata(self): + # Test when neither rql_query, ids, nor metadata is specified + with self.assertRaisesRegex(ValueError, "rql-query.*ids.*metadata"): + parameter_validation(data_type="genome") + + def test_invalid_data_field_for_data_type(self): + # Test when the data_field is not valid for the given data_type + with self.assertRaisesRegex(ValueError, "data-field.*permitted"): + parameter_validation(ids=[1, 2], data_field="invalid_field", + data_type="genome") + + def test_valid_rql_query_generation(self): + rql_query = parameter_validation( + ids=["Bacillus subtilis", "Bacteroidales bacterium"], + data_field="species", + data_type="genome" + ) + self.assertEqual(rql_query, "in(species,(%22Bacillus%20subtilis%22," + "%22Bacteroidales%20bacterium%22))") + + def test_valid_rql_query_with_metadata(self): + # Create mock metadata objects + mock_metadata = MagicMock() + mock_series = MagicMock() + + # Mock the .name attribute of the Series to return "species" + mock_series.name = "species" + + # Mock the return value of to_series to be the mock_series + mock_metadata.to_series.return_value = mock_series + + # Mock the ids in the series (mimicking a list of ids) + mock_series.__iter__.return_value = iter( + ["Bacillus subtilis", "Bacteroidales bacterium"]) + + # Call the function with the mock metadata + rql_query = parameter_validation( + metadata=mock_metadata, + data_type="genome" + ) + + # Assert that the rql_query is correctly generated + self.assertEqual(rql_query, "in(species,(%22Bacillus%20subtilis%22," + "%22Bacteroidales%20bacterium%22))") + + +class TestErrorHandling(TestPluginBase): + package = 'rescript.tests' + + def setUp(self): + super().setUp() + self.response = Mock() + + def test_no_data_found(self): + self.response.text = "[]" + + with self.assertRaisesRegex(ValueError, "No data"): + error_handling(self.response, data_type="genome") + + def test_database_error_occurred_undefined_field_object(self): + self.response.text = ('A Database Error Occured: {"code": 400, ' + '"msg": "undefined field object in RQL"}') + + with self.assertRaisesRegex(ValueError, "undefined field object"): + error_handling(self.response, data_type="genome") + + def test_database_error_occurred_undefined_field(self): + self.response.text = ('A Database Error Occured: {"code": 400, ' + '"msg": "undefined field"}') + + with self.assertRaisesRegex(ValueError, "undefined field"): + error_handling(self.response, data_type="genome") + + def test_database_error_occurred_general_error(self): + self.response.text = ('A Database Error Occured: {"code": 500, "msg": ' + '"Internal Server Error"}') + + with self.assertRaisesRegex(ValueError, "Internal Server Error"): + error_handling(self.response, data_type="genome") + + def test_unhandled_response(self): + self.response.text = "Unexpected error message" + + with self.assertRaisesRegex(ValueError, "Unexpected error"): + error_handling(self.response, data_type="genome") + + +class TestDownloadData(TestPluginBase): + package = 'rescript.tests' + + @patch('rescript.bv_brc.requests.post') + @patch('rescript.bv_brc.read_tsv_data_with_dtypes') + @patch('rescript.bv_brc.error_handling') + def test_download_data_json_batch(self, mock_error_handling, + mock_read_data_with_dtypes, mock_post): + # Mock the response for the first batch (25,000 entries) + mock_response_1 = MagicMock() + mock_response_1.status_code = 200 + mock_response_1.json.return_value = [{"id": i} for i in range(25000)] + + # Mock the response for the second batch (fewer than 25,000 entries) + mock_response_2 = MagicMock() + mock_response_2.status_code = 200 + mock_response_2.json.return_value = [{"id": i} for i in + range(25000, 25010)] + + # The first call to requests.post returns 25,000 entries, + # the second call returns 10 entries. + mock_post.side_effect = [mock_response_1, mock_response_2] + + # Call the function for JSON + result = download_data(data_type="genome", query="eq(id,1)", + accept="application/json", select=["genome_id"]) + + # Check that the result is as expected (25,000 + 10 entries) + self.assertEqual(len(result), 25010) + self.assertEqual(result[0], {"id": 0}) + self.assertEqual(result[-1], {"id": 25009}) + + # Ensure requests.post is called with the correct parameters + mock_post.assert_any_call( + url="https://www.bv-brc.org/api/genome/", + data="eq(id,1)&limit(25000,0)&select(genome_id)", + headers={ + 'Content-Type': 'application/rqlquery+x-www-form-urlencoded', + 'ACCEPT': 'application/json'} + ) + + mock_post.assert_any_call( + url="https://www.bv-brc.org/api/genome/", + data="eq(id,1)&limit(25000,25000)&select(genome_id)", + headers={ + 'Content-Type': 'application/rqlquery+x-www-form-urlencoded', + 'ACCEPT': 'application/json'} + ) + + @patch('rescript.bv_brc.requests.post') + @patch('rescript.bv_brc.read_tsv_data_with_dtypes') + @patch('rescript.bv_brc.error_handling') + def test_download_data_tsv(self, mock_error_handling, + mock_read_data_with_dtypes, mock_post): + # Mock the response for TSV data type + mock_response = MagicMock() + mock_response.status_code = 200 + mock_post.return_value = mock_response + + # Mock reading TSV data + mock_df = pd.DataFrame({"id": [1, 2]}) + mock_read_data_with_dtypes.return_value = mock_df + + # Call the function for TSV + result = download_data(data_type="genome", query="eq(id,1)", + accept="text/tsv") + + # Check that the result is a DataFrame and as expected + pd.testing.assert_frame_equal(result, pd.DataFrame({"id": [1, 2]})) + + # Ensure requests.post is called with the correct parameters + mock_post.assert_called_with( + url="https://www.bv-brc.org/api/genome/", + data="eq(id,1)&limit(25000,0)", + headers={ + 'Content-Type': 'application/rqlquery+x-www-form-urlencoded', + 'ACCEPT': 'text/tsv'} + ) + + # Ensure read_tsv_data_with_dtypes was called + mock_read_data_with_dtypes.assert_called_once_with( + response=mock_response, data_type="genome") + + @patch('rescript.bv_brc.requests.post') + @patch('rescript.bv_brc.error_handling') + def test_download_data_gff(self, mock_error_handling, mock_post): + # Mock the response for GFF data type + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.text = "mock_gff_data" + mock_post.return_value = mock_response + + # Call the function for GFF + result = download_data(data_type="genome", query="eq(id,1)", + accept="application/gff") + + # Check that the result is as expected + self.assertEqual(result, "mock_gff_data") + + # Ensure requests.post is called with the correct parameters + mock_post.assert_called_with( + url="https://www.bv-brc.org/api/genome/", + data="eq(id,1)&limit(25000,0)", + headers={ + 'Content-Type': 'application/rqlquery+x-www-form-urlencoded', + 'ACCEPT': 'application/gff'} + ) + + @patch('rescript.bv_brc.requests.post') + @patch('rescript.bv_brc.error_handling') + def test_download_data_400_error(self, mock_error_handling, mock_post): + # Mock the response for 400 error + mock_response = MagicMock() + mock_response.status_code = 400 + mock_post.return_value = mock_response + + # Call the function and check that error_handling is called + download_data(data_type="genome", query="eq(id,1)", + accept="application/json") + + # Ensure error_handling was called for the 400 response + mock_error_handling.assert_called_once_with(response=mock_response, + data_type="genome") + + @patch('rescript.bv_brc.requests.post') + def test_download_data_unexpected_error(self, mock_post): + # Mock the response for unexpected error + mock_response = MagicMock() + mock_response.status_code = 500 + mock_response.text = "Internal Server Error" + mock_post.return_value = mock_response + + # Check that the function raises a ValueError for non-200, non-400 + # responses + with self.assertRaises(ValueError) as context: + download_data(data_type="genome", query="eq(id,1)", + accept="application/json") + + # Ensure the ValueError contains the right message + self.assertEqual(str(context.exception), "Internal Server Error") + + +class TestJsonToFasta(TestPluginBase): + package = 'rescript.tests' + + def setUp(self): + super().setUp() + + self.json_input_1 = [ + { + "genome_id": "genome1", + "accession": "acc1", + "description": "desc1", + "genome_name": "genome_name1", + "sequence": "ATGC" + } + ] + + self.json_input_2 = [ + { + "genome_id": "genome2", + "accession": "acc2", + "description": "desc2", + "genome_name": "genome_name2", + "sequence": "CGTA" + } + ] + + @patch('rescript.bv_brc.open', new_callable=mock_open) + def test_json_to_fasta_single_genome(self, mock_file): + result = create_genome_fasta(self.json_input_1) + + # Expected FASTA content + expected_fasta = ">acc1 desc1 [genome_name1 | genome1]\nATGC" + + # Check if the file was created with the correct path and content + mock_file.assert_called_once_with(f"{str(result)}/genome1.fasta", 'w') + mock_file().write.assert_called_once_with(expected_fasta) + + @patch('rescript.bv_brc.open', new_callable=mock_open) + def test_json_to_fasta_multiple_genomes(self, mock_file): + create_genome_fasta(self.json_input_1 + self.json_input_2) + + # Expected FASTA content + expected_fasta_genome1 = (">acc1 desc1 [genome_name1 | " + "genome1]\nATGC") + expected_fasta_genome2 = (">acc2 desc2 [genome_name2 | " + "genome2]\nCGTA") + + # Check if the files were created with the correct path and content + mock_file().write.assert_any_call(expected_fasta_genome1) + mock_file().write.assert_any_call(expected_fasta_genome2) + + @patch('rescript.bv_brc.open', new_callable=mock_open) + def test_json_to_fasta_multiple_sequences_same_genome(self, mock_file): + result = create_genome_fasta(self.json_input_1 + self.json_input_1) + + # Expected FASTA content + expected_fasta = (">acc1 desc1 [genome_name1 | genome1]\nATGC\n" + ">acc1 desc1 [genome_name1 | genome1]\nATGC") + + # Check if the file was created with the correct path and content + mock_file.assert_called_once_with(f"{str(result)}/genome1.fasta", 'w') + mock_file().write.assert_called_once_with(expected_fasta) + + +class TestGetBvBrcGenomes(TestPluginBase): + package = 'rescript.tests' + + @patch('rescript.bv_brc.get_taxonomy') + @patch('rescript.bv_brc.get_genome_sequences') + @patch('rescript.bv_brc.download_data') + @patch('rescript.bv_brc.parameter_validation') + def test_get_bv_brc_genomes(self, mock_parameter_validation, + mock_download_data, mock_get_genome_sequences, + mock_get_taxonomy): + + # Call the function + get_bv_brc_genomes( + ids_metadata=MagicMock(name='NumericMetadataColumn'), + rql_query=None, + data_field="mock_field", + ids=["id1", "id2"], + ranks=["rank1", "rank2"], + rank_propagation=True + ) + + +class TestGetBvBrcMetadata(TestPluginBase): + package = 'rescript.tests' + + @patch('rescript.bv_brc.download_data') + @patch('rescript.bv_brc.parameter_validation') + def test_get_bv_brc_metadata(self, mock_parameter_validation, + mock_download_data): + # Mock the return value of parameter_validation + mock_parameter_validation.return_value = 'rql(query)' + + # Mock the return value of download_data + mock_df = pd.DataFrame({ + 'id': ['id1', 'id2'], + 'feature': ['value1', 'value2'], + 'empty_field': [' ', None] + }).set_index('id') + mock_download_data.return_value = mock_df + + # Call the function + result_metadata = get_bv_brc_metadata( + ids_metadata=None, + data_type='genome', + data_field='genome_id', + ids=['id1', 'id2'] + ) + + # Assertions on the returned qiime2.Metadata + self.assertIsInstance(result_metadata, qiime2.Metadata) + + # Extract the DataFrame from the result and check its values + result_df = result_metadata.to_dataframe() + + # Check that the DataFrame's index and columns are correct + self.assertEqual(result_df.index.tolist(), ['id1', 'id2']) + self.assertIn('feature', result_df.columns) + + # Check that empty/space-only fields are replaced with NaN + self.assertTrue(pd.isna(result_df.loc['id1', 'empty_field'])) + self.assertTrue(pd.isna(result_df.loc['id2', 'empty_field'])) + + +class TestGetBvBrcGenomeFeatures(TestPluginBase): + package = 'rescript.tests' + + @patch('rescript.bv_brc.get_loci') + @patch('rescript.bv_brc.get_taxonomy') + @patch('rescript.bv_brc.get_sequences') + @patch('rescript.bv_brc.download_data') + @patch('rescript.bv_brc.parameter_validation') + def test_get_bv_brc_genome_features(self, mock_parameter_validation, + mock_download_data, mock_get_sequences, + mock_get_taxonomy, mock_get_loci): + # Mocked return values for the external functions + mock_parameter_validation.return_value = "mocked_rql_query" + mock_download_data.return_value = [{"genome_id": "genome1"}] + mock_get_sequences.return_value = (MagicMock(), MagicMock()) + mock_get_taxonomy.return_value = MagicMock() + mock_get_loci.return_value = MagicMock() + + # Call the function + get_bv_brc_genome_features( + ids_metadata=None, + rql_query=None, + data_field="mock_field", + ids=["id1", "id2"], + ranks=["rank1", "rank2"], + rank_propagation=True + ) + + +class TestCreateTaxonomyEntry(TestPluginBase): + package = 'rescript.tests' + + def test_basic_functionality(self): + lineage_names = ("Animalia;Chordata;Mammalia;Primates;Hominidae;Homo;" + "sapiens") + lineage_ranks = "kingdom;phylum;class;order;family;genus;species" + expected_result = ("k__Animalia; p__Chordata; c__Mammalia; " + "o__Primates; f__Hominidae; g__Homo; s__sapiens") + + result = create_taxonomy_entry(lineage_names, lineage_ranks) + self.assertEqual(result, expected_result) + + def test_with_missing_ranks(self): + lineage_names = "Animalia;Chordata;Hominidae;sapiens" + lineage_ranks = "kingdom;phylum;family;species" + expected_result = ("k__Animalia; p__Chordata; c__; o__; f__Hominidae; " + "g__; s__sapiens") + + result = create_taxonomy_entry(lineage_names, lineage_ranks, + rank_propagation=False) + self.assertEqual(result, expected_result) + + def test_rank_propagation(self): + lineage_names = "Animalia;Chordata;Mammalia;Homo" + lineage_ranks = "kingdom;phylum;class;genus" + expected_result = ("k__Animalia; p__Chordata; c__Mammalia; " + "o__Mammalia; f__Mammalia; g__Homo; s__Homo") + + result = create_taxonomy_entry(lineage_names, lineage_ranks, + rank_propagation=True) + self.assertEqual(result, expected_result) + + def test_genus_species_split(self): + lineage_names = ("Animalia;Chordata;Mammalia;Primates;" + "Hominidae;Homo sapiens") + lineage_ranks = "kingdom;phylum;class;order;family;species" + expected_result = ("k__Animalia; p__Chordata; c__Mammalia;" + " o__Primates; f__Hominidae; g__Homo; s__sapiens") + + result = create_taxonomy_entry(lineage_names, lineage_ranks) + self.assertEqual(result, expected_result) + + def test_genus_only_split(self): + lineage_names = ("Animalia;Chordata;Mammalia;Primates;Hominidae;" + "Homo;Homo sapiens") + lineage_ranks = "kingdom;phylum;class;order;family;genus;species" + expected_result = ("k__Animalia; p__Chordata; c__Mammalia; " + "o__Primates; f__Hominidae; g__Homo; s__sapiens") + + result = create_taxonomy_entry(lineage_names, lineage_ranks) + self.assertEqual(result, expected_result) + + def test_no_species_in_ranks(self): + lineage_names = ("Animalia;Chordata;Mammalia;Primates;" + "Hominidae;Homo sapiens") + lineage_ranks = "kingdom;phylum;class;order;family;genus" + expected_result = ("k__Animalia; p__Chordata; c__Mammalia; " + "o__Primates; f__Hominidae; g__Homo sapiens") + + result = create_taxonomy_entry(lineage_names, lineage_ranks, + ranks=['kingdom', 'phylum', + 'class', 'order', + 'family', 'genus']) + self.assertEqual(result, expected_result) + + def test_custom_ranks(self): + lineage_names = "Metazoa" + lineage_ranks = "superkingdom" + expected_result = "sk__Metazoa" + + result = create_taxonomy_entry(lineage_names, lineage_ranks, + ranks=['superkingdom']) + self.assertEqual(result, expected_result) + + +class TestGetTaxonomy(TestPluginBase): + package = 'rescript.tests' + + @patch('rescript.bv_brc.create_taxonomy') + @patch('rescript.bv_brc.download_data') + def test_get_taxonomy(self, mock_download_data, mock_create_taxonomy): + # Mock response_sequences (list of dicts) + response_sequences = [ + {"taxon_id": "taxon1", "feature_id": "feature1"}, + {"taxon_id": "taxon2", "feature_id": "feature2"}, + {"taxon_id": "taxon3", "feature_id": "feature3"}, + {"taxon_id": "taxon1", "feature_id": "feature4"}, + ] + + # Mock the download_data and create_taxonomy functions + mock_download_data.return_value = MagicMock() + mock_create_taxonomy.return_value = MagicMock() + + # Define test parameters + ranks = ["kingdom", "phylum", "class"] + rank_propagation = True + accession_name = "feature_id" + + # Call the function + get_taxonomy( + response_sequences=response_sequences, + ranks=ranks, + rank_propagation=rank_propagation, + accession_name=accession_name + ) + + # Check that the taxon_ids are extracted correctly and that duplicates + # are removed + expected_taxon_ids = {"taxon1", "taxon2", "taxon3"} + extracted_taxon_ids = {str(entry["taxon_id"]) for entry in + response_sequences} + self.assertEqual(extracted_taxon_ids, expected_taxon_ids) + + +class TestCreateTaxonomy(TestPluginBase): + package = 'rescript.tests' + + @patch('rescript.bv_brc.TSVTaxonomyFormat') + @patch('rescript.bv_brc.create_taxonomy_entry') + def test_create_taxonomy(self, mock_create_taxonomy_entry, + mock_TSVTaxonomyFormat): + # Mock the input data for taxonomy_bvbrc DataFrame + taxonomy_bvbrc = pd.DataFrame({ + 'taxon_id': ['taxon1', 'taxon2'], + 'lineage_names': [['Bacteria', 'Proteobacteria'], + ['Bacteria', 'Firmicutes']], + 'lineage_ranks': [['domain', 'phylum'], ['domain', 'phylum']] + }) + + # Mock the create_taxonomy_entry to return a fake taxonomy string + mock_create_taxonomy_entry.side_effect = \ + lambda lineage_names, lineage_ranks, rank_propagation, ranks: ( + ";".join(lineage_names)) + + # Mock response_sequences (list of dicts) + response_sequences = [ + {"taxon_id": "taxon1", "feature_id": "feature1"}, + {"taxon_id": "taxon2", "feature_id": "feature2"} + ] + + # Mock TSVTaxonomyFormat to return a file-like object + mock_taxonomy_file = MagicMock() + (mock_TSVTaxonomyFormat.return_value.open. + return_value.__enter__).return_value = mock_taxonomy_file + + # Call the create_taxonomy function + create_taxonomy( + taxonomy_bvbrc=taxonomy_bvbrc, + response_sequences=response_sequences, + ranks=['domain', 'phylum'], + rank_propagation=True, + accession_name="feature_id" + ) + + # Ensure that the correct data was written to the file + written_data = "\t".join(["Feature ID", "Taxon"]) + "\n" + \ + "\t".join( + ["feature1", "Bacteria;Proteobacteria"]) + "\n" + \ + "\t".join(["feature2", "Bacteria;Firmicutes"]) + "\n" + + # Check if the 'write' method was actually called with the right data + mock_taxonomy_file.write.assert_called_once_with(written_data) + + +class TestReadDataWithDtypes(TestPluginBase): + package = 'rescript.tests' + + def test_read_data_with_dtypes(self): + # Mock response with a TSV file containing antibiotics data + mock_response = MagicMock() + mock_response.text = """_version_\tantibiotic_name\tcas_id\tdescription +1.0\tPenicillin\tCAS1234\tAntibiotic description +2.0\tAmoxicillin\tCAS5678\tAntibiotic description 2 +""" + + # Call the function with mock data and data_type "antibiotics" + df = read_tsv_data_with_dtypes(mock_response, "antibiotics") + + # Expected DataFrame output + expected_data = { + "_version_": ["1.0", "2.0"], + "antibiotic_name": ["Penicillin", "Amoxicillin"], + "cas_id": ["CAS1234", "CAS5678"], + "description": ["Antibiotic description", + "Antibiotic description 2"] + } + expected_df = pd.DataFrame(expected_data) + + # Check if the DataFrame matches the expected output + pd.testing.assert_frame_equal(df, expected_df) + + def test_no_data_raises_value_error(self): + # Mock response with only the column headers + mock_response = MagicMock() + mock_response.text = """_version_\tantibiotic_name\tcas_id""" + + # Assert that a ValueError is raised when no data rows are present + with self.assertRaises(ValueError): + read_tsv_data_with_dtypes(mock_response, "antibiotics") + + +class TestGetLoci(TestPluginBase): + package = 'rescript.tests' + + @patch('rescript.bv_brc.download_data') + @patch('rescript.bv_brc.process_loci') + @patch('builtins.open', new_callable=mock_open) + @patch('rescript.bv_brc.LociDirectoryFormat') + def test_get_loci(self, mock_loci_dir_format, mock_open_file, + mock_process_loci, mock_download_data): + # Set up the mocks + mock_download_data.return_value = "mock_gff_data" + mock_process_loci.return_value = "processed_gff_data" + + # Create a mock directory format + mock_loci_dir = MagicMock() + mock_loci_dir_format.return_value = mock_loci_dir + + # Mock response_sequences with genome_id information + response_sequences = [ + {"genome_id": "genome1"}, + {"genome_id": "genome2"}, + {"genome_id": "genome1"} + ] + + # Call the function with response_sequences + get_loci(response_sequences) + + # Check that download_data is called for each unique genome_id + mock_download_data.assert_any_call(data_type="genome_feature", + query="eq(genome_id,genome1)", + accept="application/gff") + mock_download_data.assert_any_call(data_type="genome_feature", + query="eq(genome_id,genome2)", + accept="application/gff") + + # Ensure it is only called twice (for unique genome_ids) + self.assertEqual(mock_download_data.call_count, 2) + + # Check that process_loci is called with the right data + mock_process_loci.assert_any_call(gff_string="mock_gff_data") + + # Check that open was called with the correct paths and the data was + # written + mock_open_file.assert_any_call( + mock_loci_dir.__str__() + '/genome1.gff', 'w') + mock_open_file.assert_any_call( + mock_loci_dir.__str__() + '/genome2.gff', 'w') + + # Ensure the processed data was written to the file + mock_open_file().write.assert_any_call("processed_gff_data") + + def test_process_loci(self): + # Input GFF string with both headers and data lines + input_data = """##gff-version 3 +##sequence-region accn|NC_000001.11 1 1000 +accn|NC_000001.11\tRefSeq\tregion\t1\t1000\t.\t+\t.\tID=region0; +accn|NC_000002.11\tRefSeq\tgene\t1\t1000\t.\t+\t.\tID=gene0;""" + + # Expected output after processing (removing "accn|") + expected_output = """##gff-version 3 +##sequence-region accn|NC_000001.11 1 1000 +NC_000001.11\tRefSeq\tregion\t1\t1000\t.\t+\t.\tID=region0; +NC_000002.11\tRefSeq\tgene\t1\t1000\t.\t+\t.\tID=gene0;""" + + # Call the function with the input data + result = process_loci(input_data) + + # Check if the result matches the expected output + self.assertEqual(result, expected_output) + + +class TestGetSequences(TestPluginBase): + package = 'rescript.tests' + + @patch('rescript.bv_brc.download_data') + @patch('builtins.open', new_callable=mock_open) + def test_get_sequences(self, mock_open, mock_download_data): + # Mock data for genome_features + genome_features = [ + { + "genome_id": "genome1", + "feature_id": "feature1", + "na_sequence_md5": "md5na1", + "aa_sequence_md5": "md5aa1" + }, + { + "genome_id": "genome2", + "feature_id": "feature2", + "na_sequence_md5": "md5na2" + } + ] + + # Mock the return value of download_data + mock_download_data.return_value = [ + {"md5": "md5na1", "sequence": "atgc"}, + {"md5": "md5aa1", "sequence": "MKV"}, + {"md5": "md5na2", "sequence": "gtca"} + ] + + # Call the function + get_sequences(genome_features) + + # Assertions to check that download_data was called correctly + mock_download_data.assert_called_once_with( + data_type="feature_sequence", + query=unittest.mock.ANY, + accept="application/json", + select=["md5", "sequence"] + ) + + # Check that genome1.fasta and genome2.fasta are in the paths used + # for file opening + open_calls = [call[0][0] for call in mock_open.call_args_list] + + self.assertTrue(any('genome1.fasta' in call for call in open_calls)) + self.assertTrue(any('genome2.fasta' in call for call in open_calls)) + + # Check if the correct sequences were written to the genes file + mock_open().write.assert_any_call('>feature1\nATGC\n') + mock_open().write.assert_any_call('>feature2\nGTCA\n') + + # Check if the correct sequences were written to the proteins file + mock_open().write.assert_any_call('>feature1\nMKV\n') + + +class TestGetGenomeSequences(TestPluginBase): + package = 'rescript.tests' + + @patch('rescript.bv_brc.download_data') + @patch('rescript.bv_brc.create_genome_fasta') + def test_get_genome_sequences(self, mock_create_genome_fasta, + mock_download_data): + # Sample response_genomes to be used as input + response_genomes = [ + {'genome_id': '12345'}, + {'genome_id': '67890'} + ] + + # Call the function + get_genome_sequences(response_genomes) + + # Assert that download_data was called with the correct arguments + mock_download_data.assert_called_once_with( + data_type="genome_sequence", + query=unittest.mock.ANY, + accept="application/json", + select=["accession", "description", "genome_name", "genome_id", + "sequence"] + )