Skip to content

Commit

Permalink
changed genome features to be saved in files per genome
Browse files Browse the repository at this point in the history
  • Loading branch information
VinzentRisch committed Aug 21, 2024
1 parent d491eda commit 0613365
Show file tree
Hide file tree
Showing 2 changed files with 100 additions and 69 deletions.
58 changes: 35 additions & 23 deletions rescript/bv_brc.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,34 +178,46 @@ def fetch_genome_features_bv_brc(
response_proteins = download_data(url=proteins_url,
data_type="genome_feature")

genes_fasta_upper = convert_fasta_to_uppercase(response_genes.text)
proteins_fasta_upper = convert_fasta_to_uppercase(response_proteins.text)

# Save genes and proteins as FASTA files
with open(os.path.join(str(genes), "genes.fasta"), 'w') as fasta_file:
fasta_file.write(genes_fasta_upper)

with open(os.path.join(str(proteins), "proteins.fasta"),
'w') as fasta_file:
fasta_file.write(proteins_fasta_upper)
# Group the FASTA records by genome ID and convert all sequences to upper
# case characters to conform with DNAFASTAFormat
genes_fasta = parse_fasta_to_dict(response_genes.text)
proteins_fasta = parse_fasta_to_dict(response_proteins.text)

# Save genes and proteins as FASTA files, one file per genome_id
for genome_id, fasta_sequences in genes_fasta.items():
with open(os.path.join(str(genes), f"{genome_id}.fasta"),
'w') as fasta_file:
fasta_file.write(fasta_sequences)

for genome_id, fasta_sequences in proteins_fasta.items():
with open(os.path.join(str(proteins), f"{genome_id}.fasta"),
'w') as fasta_file:
fasta_file.write(fasta_sequences)

return genes, proteins


def convert_fasta_to_uppercase(fasta_string):
    """Return ``fasta_string`` with every sequence line in upper case.

    Header lines (those starting with ``>``) are kept verbatim. The lines
    are re-joined with a single newline character and no trailing newline
    is appended.
    """
    return "\n".join(
        line if line.startswith(">") else line.upper()
        for line in fasta_string.splitlines()
    )


def parse_fasta_to_dict(fasta_string):
    """Group FASTA records by genome ID and upper-case their sequences.

    The genome ID is the text after the last ``|`` of a header line, with
    surrounding whitespace and one closing ``]`` removed. For example the
    header ``>fig|2030927| GTPase [ABC | 2030927.4755]`` yields the genome
    ID ``2030927.4755``.

    Args:
        fasta_string: FASTA-formatted text.

    Returns:
        A dict that maps each genome ID to the FASTA text of all records
        belonging to it, in input order. Header lines are unchanged,
        sequence lines are upper-cased and every line ends with a newline.
        Keys appear in order of first occurrence. Blank lines in the input
        are dropped.

    Raises:
        ValueError: If sequence data appears before the first header line.
    """
    # Collect the lines of each genome in a list and join once at the end,
    # instead of growing a string with repeated ``+=``.
    lines_per_genome = {}
    current_lines = None

    for line in fasta_string.splitlines():
        if line.startswith(">"):
            genome_id = line.rsplit("|", 1)[-1].strip()
            # Only drop the closing bracket when it is really there, so a
            # header without it does not lose the last character of its ID.
            if genome_id.endswith("]"):
                genome_id = genome_id[:-1].strip()
            current_lines = lines_per_genome.setdefault(genome_id, [])
            current_lines.append(line)
        elif not line.strip():
            # Blank lines carry no sequence data; a leading one used to
            # fail with ``KeyError: None``.
            continue
        elif current_lines is None:
            raise ValueError(
                "Invalid FASTA data: sequence line found before the first "
                "header line."
            )
        else:
            current_lines.append(line.upper())

    return {
        genome_id: "".join(f"{entry}\n" for entry in entries)
        for genome_id, entries in lines_per_genome.items()
    }


def json_to_fasta(json, output_dir):
Expand Down
111 changes: 65 additions & 46 deletions rescript/tests/test_bv_brc.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,20 +5,22 @@
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------
import os
import unittest
from unittest.mock import Mock, patch, mock_open, MagicMock

import pandas as pd
from q2_types.feature_data import (MixedCaseDNAFASTAFormat,
ProteinFASTAFormat,
TSVTaxonomyDirectoryFormat)
from q2_types.genome_data import GenomeSequencesDirectoryFormat
from q2_types.genome_data import GenomeSequencesDirectoryFormat, \
GenesDirectoryFormat, ProteinsDirectoryFormat
from qiime2.plugin.testing import TestPluginBase

from rescript.bv_brc import fetch_genomes_bv_brc, fetch_metadata_bv_brc, \
fetch_genome_features_bv_brc, fetch_taxonomy_bv_brc, id_list_handling, \
error_handling, download_data, json_to_fasta, transform_taxonomy_df, \
parse_lineage_names_with_ranks, convert_fasta_to_uppercase
parse_lineage_names_with_ranks, parse_fasta_to_dict


class TestIDListHandling(TestPluginBase):
Expand Down Expand Up @@ -219,74 +221,91 @@ def test_json_to_fasta_multiple_sequences_same_genome(self, mock_file):
class TestFetchGenomeFeaturesBVBR(TestPluginBase):
package = 'rescript.tests'

@patch('rescript.bv_brc.parse_fasta_to_dict')
@patch('rescript.bv_brc.download_data')
@patch('rescript.bv_brc.id_list_handling')
@patch.object(MixedCaseDNAFASTAFormat, 'open')
@patch.object(ProteinFASTAFormat, 'open')
def test_fetch_genome_features_bv_brc(
self, mock_protein_open, mock_genes_open, mock_id_list_handling,
mock_download_data
):
@patch('builtins.open', new_callable=mock_open)
def test_fetch_genome_features_bv_brc(self, mock_open,
mock_id_list_handling,
mock_download_data,
mock_parse_fasta_to_dict):
# Mock the id_list_handling function
mock_id_list_handling.return_value = ("in(feature_id, "
mock_id_list_handling.return_value = ("in(feature_id,"
"(feature1,feature2))")

# Mock the responses from download_data
mock_genes_response = MagicMock()
mock_genes_response.text = ">gene1\nATGC\n>gene2\nATGC"
mock_proteins_response = MagicMock()
mock_proteins_response.text = (
">protein1\nMVLSPADKTNVK\n>protein2\nMVLSPADKTNVK")
mock_download_data.side_effect = [mock_genes_response,
mock_proteins_response]

# Mock file write actions
mock_genes_file = MagicMock()
mock_protein_file = MagicMock()
mock_genes_open.return_value.__enter__.return_value = mock_genes_file
mock_protein_open.return_value.__enter__.return_value = (
mock_protein_file)
# Mock the download_data function responses
mock_response_genes = MagicMock()
mock_response_genes.text = "mocked_genes_fasta_data"
mock_response_proteins = MagicMock()
mock_response_proteins.text = "mocked_proteins_fasta_data"
mock_download_data.side_effect = [mock_response_genes,
mock_response_proteins]

# Mock the parse_fasta_to_dict function
mock_parse_fasta_to_dict.side_effect = [
{'2030927.4755': '>fig|2030927| GTPase [ABC | '
'2030927.4755]\nATGA\n'},
{'1234567.89': '>fig|1234567| protein [XYZ | 1234567.89]\nGCGT\n'}
]

# Call the function
# Call the function with the test RQL query
genes, proteins = fetch_genome_features_bv_brc(
rql_query="in(feature_id, (feature1,feature2))",
feature_ids=["feature1", "feature2"]
rql_query="in(feature_id,(feature1,feature2))"
)

# Assertions
# Assertions to ensure the correct calls were made
mock_id_list_handling.assert_called_once_with(
rql_query="in(feature_id, (feature1,feature2))",
ids=["feature1", "feature2"],
rql_query="in(feature_id,(feature1,feature2))",
ids=None,
parameter_name="feature_ids",
data_field="feature_id"
)

mock_download_data.assert_any_call(
url="https://www.bv-brc.org/api/genome_feature/?in(feature_id, "
url="https://www.bv-brc.org/api/genome_feature/?in(feature_id,"
"(feature1,feature2))&http_accept=application/dna+fasta",
data_type="genome_feature"
)

mock_download_data.assert_any_call(
url="https://www.bv-brc.org/api/genome_feature/?in(feature_id, "
url="https://www.bv-brc.org/api/genome_feature/?in(feature_id,"
"(feature1,feature2))&http_accept=application/protein+fasta",
data_type="genome_feature"
)

# Check that the correct data is written to the correct files
mock_genes_file.write.assert_called_once_with(
">gene1\nATGC\n>gene2\nATGC")
mock_protein_file.write.assert_called_once_with(
">protein1\nMVLSPADKTNVK\n>protein2\nMVLSPADKTNVK")

self.assertIsInstance(genes, MixedCaseDNAFASTAFormat)
self.assertIsInstance(proteins, ProteinFASTAFormat)

def test_convert_fasta_to_uppercase(self):
input_fasta = ">header1\natgca\ngtacg\n>header2\nttgaa\ncctg"
expected_output = ">header1\nATGCA\nGTACG\n>header2\nTTGAA\nCCTG"

result = convert_fasta_to_uppercase(input_fasta)
mock_parse_fasta_to_dict.assert_any_call("mocked_genes_fasta_data")
mock_parse_fasta_to_dict.assert_any_call("mocked_proteins_fasta_data")

# Check that the files were written correctly for genes
mock_open.assert_any_call(
os.path.join(str(genes), "2030927.4755.fasta"), 'w')
mock_open().write.assert_any_call(
'>fig|2030927| GTPase [ABC | 2030927.4755]\nATGA\n')

# Check that the files were written correctly for proteins
mock_open.assert_any_call(
os.path.join(str(proteins), "1234567.89.fasta"), 'w')
mock_open().write.assert_any_call(
'>fig|1234567| protein [XYZ | 1234567.89]\nGCGT\n')

# Check that the return types are correct
self.assertIsInstance(genes, GenesDirectoryFormat)
self.assertIsInstance(proteins, ProteinsDirectoryFormat)

def test_parse_fasta_to_dict(self):
    """Records are keyed by genome ID and sequences are upper-cased."""
    gene_header = ">fig|2030927| GTPase [ABC | 2030927.4755]"
    protein_header = ">fig|1234567| protein [XYZ | 1234567.89]"
    # Lower-case sequences on input; each record belongs to its own genome.
    fasta_string = f"{gene_header}\natga\n{protein_header}\ngcgt\n"

    result = parse_fasta_to_dict(fasta_string)

    # Headers stay untouched, sequence lines come back in upper case.
    expected_output = {
        '2030927.4755': f"{gene_header}\nATGA\n",
        '1234567.89': f"{protein_header}\nGCGT\n",
    }
    self.assertEqual(result, expected_output)


Expand Down

0 comments on commit 0613365

Please sign in to comment.