Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

validate and parse genome before upload #211

Merged
merged 3 commits into from
Aug 13, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
85 changes: 39 additions & 46 deletions lib/GenomeFileUtil/core/GenbankToGenome.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,9 @@ def __init__(self):
self.assembly_path = None
self.assembly_info = None
self.input_params = None
self.gc_content = None
self.dna_size = None
self.md5 = None


class GenbankToGenome:
Expand Down Expand Up @@ -185,21 +188,25 @@ def _import_genbank_mass(self, params):
files = self._find_input_files(genome_obj.input_directory)
genome_obj.consolidated_file = self._join_files_skip_empty_lines(files)

# load contigs sequence
self._load_contigs(genome_obj)

# validate existing assemblies
self._validate_existing_assemblies(genome_obj)

# parse genbank file
self._parse_genbank(genome_obj)

# gather all objects
genome_objs.append(genome_obj)

self._load_contigs_for_and_validate_existing_assemblies(genome_objs)

self._save_files_to_blobstore_and_set_handle_service_output(genome_objs)

self._save_assemblies(workspace_id, genome_objs)

# TODO parse genbank files before the assemblies are saved
for genome_obj in genome_objs:
self._parse_genbank(genome_obj)
self._update_genome_data(genome_objs)

# clear the temp directory
# TODO move with parse genbank function into for loop above and save disk space
for genome_obj in genome_objs:
shutil.rmtree(genome_obj.input_directory)

# TODO make an internal mass function save_genomes
Expand Down Expand Up @@ -332,11 +339,6 @@ def _parse_genbank(self, genome_obj):
genome = {
"id": params['genome_name'],
"original_source_file_name": os.path.basename(genome_obj.consolidated_file),
"assembly_ref": genome_obj.assembly_ref,
"gc_content": genome_obj.gc_content,
"dna_size": genome_obj.dna_size,
"md5": genome_obj.md5,
"genbank_handle_ref": genome_obj.handle_service_output['handle']['hid'],
MrCreosote marked this conversation as resolved.
Show resolved Hide resolved
"publications": set(),
"contig_ids": [],
"contig_lengths": [],
Expand Down Expand Up @@ -473,38 +475,35 @@ def _validate_existing_assembly(self, assembly_ref, genome_obj):
if len(unmatched_ids_md5s) > 0:
raise ValueError(warnings["assembly_ref_diff_seq"].format(", ".join(unmatched_ids_md5s)))

def _load_contigs_for_and_validate_existing_assemblies(self, genome_objs):
MrCreosote marked this conversation as resolved.
Show resolved Hide resolved
ref2genome = {}
for genome_obj in genome_objs:
assembly_ref = genome_obj.use_existing_assembly
if assembly_ref:
contigs = Bio.SeqIO.parse(genome_obj.consolidated_file, "genbank")
extra_info = self._get_contigs_and_extra_info(
contigs, genome_obj
)
self._validate_existing_assembly(
assembly_ref, genome_obj
)
ref2genome[assembly_ref] = genome_obj
genome_obj.extra_info = extra_info
genome_obj.assembly_ref = assembly_ref
genome_obj.assembly_path = None

if ref2genome:
assembly_refs = list(ref2genome.keys())
assembly_objs_info = self._get_objects_info(assembly_refs)
for assembly_ref, assembly_info in zip(
assembly_refs, assembly_objs_info
):
genome = ref2genome[assembly_ref]
self._set_gc_content_dna_size_and_md5(genome, assembly_info)
def _load_contigs(self, genome_obj):
contigs = Bio.SeqIO.parse(genome_obj.consolidated_file, "genbank")
extra_info = self._get_contigs_and_extra_info(contigs, genome_obj)
genome_obj.extra_info = extra_info
MrCreosote marked this conversation as resolved.
Show resolved Hide resolved

def _validate_existing_assemblies(self, genome_obj):
assembly_ref = genome_obj.use_existing_assembly
if assembly_ref:
self._validate_existing_assembly(assembly_ref, genome_obj)
genome_obj.assembly_ref = assembly_ref
genome_obj.assembly_path = None

assembly_objs_info = self._get_objects_info([assembly_ref])
MrCreosote marked this conversation as resolved.
Show resolved Hide resolved
self._set_gc_content_dna_size_and_md5(genome_obj, assembly_objs_info[0])

def _set_gc_content_dna_size_and_md5(self, genome, assembly_info):
genome.assembly_info = assembly_info
genome.gc_content = float(assembly_info[10]["GC content"])
genome.dna_size = int(assembly_info[10]["Size"])
genome.md5 = assembly_info[10]["MD5"]

def _update_genome_data(self, genome_objs):
for genome_obj in genome_objs:
genome_obj.genome_data["assembly_ref"] = genome_obj.assembly_ref
genome_obj.genome_data["gc_content"] = genome_obj.gc_content
genome_obj.genome_data["dna_size"] = genome_obj.dna_size
genome_obj.genome_data["md5"] = genome_obj.md5
genome_obj.genome_data["genbank_handle_ref"] = genome_obj.handle_service_output['handle']['hid']

def _save_assemblies(self, workspace_id, genome_objs):
id2genome = {}
bulk_inputs = []
Expand All @@ -513,18 +512,12 @@ def _save_assemblies(self, workspace_id, genome_objs):
continue

# contigs is an iterator
# Based on experiments, writing is significantly more time-consuming than reading.
# Therefore, reading twice is not going to slow things down much compared to the write.
contigs = Bio.SeqIO.parse(genome_obj.consolidated_file, "genbank")
genome_obj.assembly_id = f"{genome_obj.genome_name}_assembly"
genome_obj.assembly_path = f"{self.cfg.sharedFolder}/{genome_obj.assembly_id}.fasta"
MrCreosote marked this conversation as resolved.
Show resolved Hide resolved

# populate contig_seq
genome_obj.extra_info = self._get_contigs_and_extra_info(
contigs, genome_obj
)

# Output as fasta file
contigs_output = Bio.SeqIO.parse(genome_obj.consolidated_file, "genbank")
MrCreosote marked this conversation as resolved.
Show resolved Hide resolved
Bio.SeqIO.write(contigs_output, genome_obj.assembly_path, "fasta")
Bio.SeqIO.write(contigs, genome_obj.assembly_path, "fasta")

bulk_inputs.append(
{
Expand Down
Loading