From 6208a9a8fa31b6f84345099a158fdd0ea90ec838 Mon Sep 17 00:00:00 2001
From: Sijie <sijiex@lbl.gov>
Date: Wed, 14 Aug 2024 17:14:23 -0700
Subject: [PATCH 01/24] add save_genomes function

---
 GenomeFileUtil.spec                        |  19 +
 RELEASE_NOTES.md                           |   2 +
 lib/GenomeFileUtil/GenomeFileUtilImpl.py   | 438 ++++++++++++++++++++-
 lib/GenomeFileUtil/GenomeFileUtilServer.py |   4 +
 lib/GenomeFileUtil/core/GenbankToGenome.py |  13 +-
 lib/GenomeFileUtil/core/GenomeInterface.py | 202 +++++++---
 lib/GenomeFileUtil/core/MiscUtils.py       |  26 ++
 test/problematic_tests/save_genome_test.py |   2 +-
 8 files changed, 637 insertions(+), 69 deletions(-)
diff --git a/GenomeFileUtil.spec b/GenomeFileUtil.spec
index 05cca1f9..5c06cff1 100644
--- a/GenomeFileUtil.spec
+++ b/GenomeFileUtil.spec
@@ -332,6 +332,25 @@ module GenomeFileUtil {
     funcdef save_one_genome(SaveOneGenomeParams params)
                 returns (SaveGenomeResult returnVal) authentication required;
 
+    typedef structure {
+        string name;
+        KBaseGenomes.Genome data;
+        boolean hidden;
+        boolean upgrade;
+    } GenomeInput;
+
+    typedef structure {
+        int workspace_id;
+        list<GenomeInput> inputs;
+    } SaveGenomesParams;
+
+    typedef structure {
+        list<SaveGenomeResult> results;
+    } SaveGenomesResults;
+
+    funcdef save_genomes(SaveGenomesParams params)
+                returns(SaveGenomesResults results) authentication required;
+
     /*
     gff_file - object containing path to gff_file
     ws_ref - input Assembly or Genome reference
diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md
index c020b8f0..260b4aa4 100644
--- a/RELEASE_NOTES.md
+++ b/RELEASE_NOTES.md
@@ -6,6 +6,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
 ## [0.11.7] - TBD
+- The `save_genomes` method was added to allow users to save genomes in batches
+- Genome data is now parsed and validated before upload
 - Unusable `export_genome_features_protein_to_fasta` function was removed
 - The `genbanks_to_genomes` method was added to allow users to upload multiple
 genome objects at once
diff --git a/lib/GenomeFileUtil/GenomeFileUtilImpl.py b/lib/GenomeFileUtil/GenomeFileUtilImpl.py
index 0d01dd6e..98543d35 100644
--- a/lib/GenomeFileUtil/GenomeFileUtilImpl.py
+++ b/lib/GenomeFileUtil/GenomeFileUtilImpl.py
@@ -69,7 +69,7 @@ class GenomeFileUtil:
     ######################################### noqa
     VERSION = "0.11.7"
     GIT_URL = "git@github.com:kbaseapps/GenomeFileUtil.git"
-    GIT_COMMIT_HASH = "591a19ccf4d1b42f01cc06486654b6d3a8ea08e4"
+    GIT_COMMIT_HASH = "4819598e65eea38d3c38d8c2ab808aca0d9697fd"
 
     #BEGIN_CLASS_HEADER
     #END_CLASS_HEADER
@@ -1262,6 +1262,442 @@ def save_one_genome(self, ctx, params):
         # return the results
         return [returnVal]
 
+    def save_genomes(self, ctx, params):
+        """
+        :param params: instance of type "SaveGenomesParams" -> structure:
+           parameter "workspace_id" of Long, parameter "inputs" of list of
+           type "GenomeInput" -> structure: parameter "name" of String,
+           parameter "data" of type "Genome" (Genome type -- annotated and
+           assembled genome data. Field descriptions: id - string - KBase
+           legacy data ID scientific_name - string - human readable species
+           name domain - string - human readable phylogenetic domain name
+           (eg. "Bacteria") warnings - list of string - genome-level warnings
+           generated in the annotation process genome_tiers - list of string
+           - controlled vocabulary (based on app input and checked by
+           GenomeFileUtil) A list of labels describing the data source for
+           this genome. Allowed values - Representative, Reference,
+           ExternalDB, User Tier assignments based on genome source: * All
+           phytozome - Representative and ExternalDB * Phytozome flagship
+           genomes - Reference, Representative and ExternalDB * Ensembl -
+           Representative and ExternalDB * RefSeq Reference - Reference,
+           Representative and ExternalDB * RefSeq Representative -
+           Representative and ExternalDB * RefSeq Latest or All Assemblies
+           folder - ExternalDB * User Data - User tagged feature_counts - map
+           of string to integer - total counts of each type of feature keys
+           are a controlled vocabulary of - "CDS", "gene", "misc_feature",
+           "misc_recomb", "mobile_element", "ncRNA" - 72,
+           "non_coding_features", "non_coding_genes",
+           "protein_encoding_gene", "rRNA", "rep_origin", "repeat_region",
+           "tRNA" genetic_code - int - An NCBI-assigned taxonomic category
+           for the organism See here -
+           https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi dna_size
+           - integer - total number of nucleotides num_contigs - integer -
+           total number of contigs in the genome molecule_type - string -
+           controlled vocab - the type of molecule sequenced Possible values
+           are "Unknown", "DNA", "RNA", "genomic DNA", "genomic RNA", "mRNA",
+           "tRNA", "rRNA", "other RNA", "other DNA", "transcribed RNA",
+           "viral cRNA", "unassigned DNA", "unassigned RNA" contig_lengths -
+           list of int - nucleotide length of each contig in the genome
+           Indexes in this list correspond to indexes in the `contig_ids`
+           list. contig_ids - list of str - external database identifiers for
+           each contig (eg. "NC_000913.3") source - str - controlled vocab -
+           descriptor of where this data came from (eg. "RefSeq") Allowed
+           entries RefSeq, Ensembl, Phytozome, RAST, Prokka, User_upload
+           source_id - string - identifier of this genome from the source
+           database (eg. the RefSeq ID such as "NC_000913") md5 - string -
+           checksum of the underlying assembly sequence taxonomy - string -
+           semicolon-delimited taxonomy lineage, in order of parent to child
+           taxon_assignments - mapping of taxonomy namespace to taxon ID.
+           example - {"ncbi": "286", "gtdb": "s__staphylococcus_devriesei"}
+           gc_content - float - ratio of GC count to AT in the genome
+           publications - tuple of (pubmedid, source, title, web_addr, year,
+           authors, journal). See typedef above. ontology_events - A record
+           of the service and method used for a set of ontology assignments
+           on the genome. ontologies_present - a mapping of ontology source
+           id (eg. "GO") to a mapping of term IDs (eg "GO:16209") to term
+           names (eg. "histidine biosynthetic process"). features - array of
+           Feature - protein coding genes (see the separate Feature spec)
+           cdss - array of protein-coding sequences mrnas - array of
+           transcribed messenger RNA sequences (equal to cdss plus 5' and 3'
+           UTRs) non_coding_features - array of features that does not
+           include mRNA, CDS, and protein-encoding genes assembly_ref -
+           workspace reference to an assembly object from which this
+           annotated genome was derived. taxon_ref - workspace reference to a
+           taxon object that classifies the species or strain of this genome.
+           genbank_handle_ref - file server handle reference to the source
+           genbank file for this genome. gff_handle_ref - file server handle
+           reference to the source GFF file for this genome.
+           external_source_origination_date - TODO look at GFU for this
+           release - string - User-supplied release or version of the source
+           data. This most likely will come from an input field in the import
+           app. original_source_file_name - filename from which this genome
+           was derived (eg. genbank or gff filename). notes - TODO
+           quality_scores - TODO suspect - bool - flag of whether this
+           annotation is problematic due to some warning genome_type - string
+           - controlled vocab - One of "draft isolate", "finished isolate",
+           "mag", "sag", "virus", "plasmid", "construct" Features vs. coding
+           sequences: a feature is a sequence in the DNA that codes for a
+           protein, including non-transcribed introns. A coding sequence
+           (stored as `cdss`) includes **only** the sections of the feature
+           that codes for a protein, minus introns and UTRs. @optional
+           warnings contig_lengths contig_ids source_id taxonomy publications
+           @optional ontology_events ontologies_present non_coding_features
+           mrnas genome_type @optional genbank_handle_ref gff_handle_ref
+           external_source_origination_date @optional release
+           original_source_file_name notes quality_scores suspect
+           assembly_ref @optional taxon_ref taxon_assignments @metadata ws
+           gc_content as GC content @metadata ws taxonomy as Taxonomy
+           @metadata ws md5 as MD5 @metadata ws dna_size as Size @metadata ws
+           genetic_code as Genetic code @metadata ws domain as Domain
+           @metadata ws source_id as Source ID @metadata ws source as Source
+           @metadata ws scientific_name as Name @metadata ws genome_type as
+           Type @metadata ws length(features) as Number of Protein Encoding
+           Genes @metadata ws length(cdss) as Number of CDS @metadata ws
+           assembly_ref as Assembly Object @metadata ws num_contigs as Number
+           contigs @metadata ws length(warnings) as Number of Genome Level
+           Warnings @metadata ws suspect as Suspect Genome) -> structure:
+           parameter "id" of type "Genome_id" (KBase legacy data ID @id kb),
+           parameter "scientific_name" of String, parameter "domain" of
+           String, parameter "warnings" of list of String, parameter
+           "genome_tiers" of list of String, parameter "feature_counts" of
+           mapping from String to Long, parameter "genetic_code" of Long,
+           parameter "dna_size" of Long, parameter "num_contigs" of Long,
+           parameter "molecule_type" of String, parameter "contig_lengths" of
+           list of Long, parameter "contig_ids" of list of String, parameter
+           "source" of String, parameter "source_id" of type "source_id"
+           (Reference to a source_id @id external), parameter "md5" of
+           String, parameter "taxonomy" of String, parameter
+           "taxon_assignments" of mapping from String to String, parameter
+           "gc_content" of Double, parameter "publications" of list of type
+           "publication" (Structure for a publication Elements: (0) pubmedid
+           - float (1) source - string - (ex. Pubmed) (2) title - string (3)
+           string web address - string (4) publication year - string (5)
+           authors - string (6) journal - string) -> tuple of size 7:
+           parameter "pubmedid" of Double, parameter "source" of String,
+           parameter "title" of String, parameter "url" of String, parameter
+           "year" of String, parameter "authors" of String, parameter
+           "journal" of String, parameter "ontology_events" of list of type
+           "Ontology_event" (@optional ontology_ref method_version eco) ->
+           structure: parameter "id" of String, parameter "ontology_ref" of
+           type "Ontology_ref" (Reference to a ontology object @id ws
+           KBaseOntology.OntologyDictionary), parameter "method" of String,
+           parameter "method_version" of String, parameter "timestamp" of
+           String, parameter "eco" of String, parameter "ontologies_present"
+           of mapping from String to mapping from String to String, parameter
+           "features" of list of type "Feature" (Structure for a single CDS
+           encoding "gene" of a genome ONLY PUT GENES THAT HAVE A
+           CORRESPONDING CDS IN THIS ARRAY NOTE: Sequence is optional.
+           Ideally we can keep it in here, but Recognize due to space
+           constraints another solution may be needed. We may want to add
+           additional fields for other CDM functions (e.g., atomic regulons,
+           coexpressed fids, co_occurring fids,...)
+           protein_translation_length and protein_translation are for longest
+           coded protein (representative protein for splice variants) NOTE:
+           New Aliases field definitely breaks compatibility. As Does
+           Function. flags are flag fields in GenBank format. This will be a
+           controlled vocabulary. Initially Acceptable values are pseudo,
+           ribosomal_slippage, and trans_splicing Md5 is the md5 of
+           dna_sequence. @optional functions ontology_terms note
+           protein_translation mrnas flags warnings @optional inference_data
+           dna_sequence aliases db_xrefs children functional_descriptions) ->
+           structure: parameter "id" of type "Feature_id" (KBase Feature ID
+           @id external), parameter "location" of list of tuple of size 4:
+           type "Contig_id" (ContigSet contig ID @id external), Long, String,
+           Long, parameter "functions" of list of String, parameter
+           "functional_descriptions" of list of String, parameter
+           "ontology_terms" of mapping from String to mapping from String to
+           list of Long, parameter "note" of String, parameter "md5" of
+           String, parameter "protein_translation" of String, parameter
+           "protein_translation_length" of Long, parameter "cdss" of list of
+           String, parameter "mrnas" of list of String, parameter "children"
+           of list of String, parameter "flags" of list of String, parameter
+           "warnings" of list of String, parameter "inference_data" of list
+           of type "InferenceInfo" (Type spec for the "InferenceInfo" object.
+           TODO docs Found in the `inference_data` fields in mRNAs and CDSs
+           Fields: category - string - TODO type - string - TODO evidence -
+           string - TODO) -> structure: parameter "category" of String,
+           parameter "type" of String, parameter "evidence" of String,
+           parameter "dna_sequence" of String, parameter
+           "dna_sequence_length" of Long, parameter "aliases" of list of
+           tuple of size 2: parameter "fieldname" of String, parameter
+           "alias" of String, parameter "db_xrefs" of list of tuple of size
+           2: parameter "db_source" of String, parameter "db_identifier" of
+           String, parameter "non_coding_features" of list of type
+           "NonCodingFeature" (Structure for a single feature that is NOT one
+           of the following: - Protein encoding gene (gene that has a
+           corresponding CDS) - mRNA - CDS Note pseudo-genes and Non protein
+           encoding genes are put into this flags are flag fields in GenBank
+           format. This will be a controlled vocabulary. Initially Acceptable
+           values are pseudo, ribosomal_slippage, and trans_splicing Md5 is
+           the md5 of dna_sequence. @optional functions ontology_terms note
+           flags warnings functional_descriptions @optional inference_data
+           dna_sequence aliases db_xrefs children parent_gene) -> structure:
+           parameter "id" of type "Feature_id" (KBase Feature ID @id
+           external), parameter "location" of list of tuple of size 4: type
+           "Contig_id" (ContigSet contig ID @id external), Long, String,
+           Long, parameter "type" of String, parameter "functions" of list of
+           String, parameter "functional_descriptions" of list of String,
+           parameter "ontology_terms" of mapping from String to mapping from
+           String to list of Long, parameter "note" of String, parameter
+           "md5" of String, parameter "parent_gene" of String, parameter
+           "children" of list of String, parameter "flags" of list of String,
+           parameter "warnings" of list of String, parameter "inference_data"
+           of list of type "InferenceInfo" (Type spec for the "InferenceInfo"
+           object. TODO docs Found in the `inference_data` fields in mRNAs
+           and CDSs Fields: category - string - TODO type - string - TODO
+           evidence - string - TODO) -> structure: parameter "category" of
+           String, parameter "type" of String, parameter "evidence" of
+           String, parameter "dna_sequence" of String, parameter
+           "dna_sequence_length" of Long, parameter "aliases" of list of
+           tuple of size 2: parameter "fieldname" of String, parameter
+           "alias" of String, parameter "db_xrefs" of list of tuple of size
+           2: parameter "db_source" of String, parameter "db_identifier" of
+           String, parameter "cdss" of list of type "CDS" (Structure for a
+           single coding sequence. Coding sequences are the sections of a
+           feature's sequence that are translated to a protein (minus introns
+           and UTRs). Fields: id - string - identifier of the coding
+           sequence, such as "b0001_CDS_1" location - list<tuple<string, int,
+           string, int>> - list of locations from where this sequence
+           originates in the original assembly. Each sub-sequence in the list
+           constitutes a section of the resulting CDS. The first element in
+           the tuple corresponds to the "contig_id", such as "NC_000913.3".
+           The second element in the tuple is an index in the contig of where
+           the sequence starts. The third element is either a plus or minus
+           sign indicating whether it is on the 5' to 3' leading strand ("+")
+           or on the 3' to 5' lagging strand ("-"). The last element is the
+           length of the sub-sequence. For a location on the leading strand
+           (denoted by "+"), the index is of the leftmost base, and the
+           sequence extends to the right. For a location on the lagging
+           strand (denoted by "-"), the index is of the rightmost base, and
+           the sequence extends to the left. NOTE: the last element in each
+           tuple is the *length* of each sub-sequence. If you have a location
+           such as ("xyz", 100, "+", 50), then your sequence will go from
+           index 100 to index 149 (this has a length of 50). It *does not* go
+           from index 100 to index 150, as that would have a length of 51.
+           Likewise, if you have the location ("xyz", 100, "-", 50), then the
+           sequence extends from 100 down to 51, which has a length of 50
+           bases. It does not go from index 100 to 50, as that would have a
+           length of 51. md5 - string - md5 of the dna sequence - TODO
+           clarification protein_md5 - string - hash of the protein sequence
+           that this CDS encodes parent_gene - string - gene (feature) from
+           which this CDS comes from, including introns and UTRs that have
+           been removed to create this CDS. parent_mrna - string - mRNA
+           sequence from which this sequence is derived, including UTRs but
+           not introns. note - string - TODO functions - list<string> - list
+           of protein products or chemical processes that this sequence
+           creates, facilitates, or influences. functional_descriptions -
+           list<string> - TODO list of protein products or chemical processes
+           that sequence creates, facilitates, or influences. ontology_terms
+           - mapping<string, mapping<string, list<int>>> - a mapping of
+           ontology source id (eg. "GO") to a mapping of term IDs (eg
+           "GO:16209") to a list of indexes into the ontology_events data
+           (found in the top level of the genome object). The index into an
+           ontology event indicates what service and method created this term
+           assignment. flags - list<string>  - (controlled vocab) fields from
+           the genbank source. A common example is "pseudo" for pseudo-genes
+           that do not encode proteins, which shows up as "/pseudo" in the
+           genbank. Values can be: "pseudo", "ribosomal_slippage",
+           "trans_splicing" warnings - list<string> - TODO inference_data -
+           list<InferenceInfo> - TODO protein_translation - string - amino
+           acid sequence that this CDS gets translated into.
+           protein_translation_length - int - length of the above aliases -
+           list<(string, string)> - alternative list of names or identifiers
+           eg: [["gene", "thrA"], ["locus_tag", "b0002"]] db_xrefs -
+           list<(string, string)> - Identifiers from other databases
+           (database cross-references) The first string is the database name,
+           the second is the database identifier. eg: [["ASAP",
+           "ABE-0000006"], ["EcoGene", "EG11277"]] dna_sequence - string -
+           sequence of exons from the genome that constitute this protein
+           encoding sequence. dna_sequence_length - int - length of the above
+           @optional parent_gene parent_mrna functions ontology_terms note
+           flags warnings @optional inference_data dna_sequence aliases
+           db_xrefs functional_descriptions) -> structure: parameter "id" of
+           type "cds_id" (KBase CDS ID @id external), parameter "location" of
+           list of tuple of size 4: type "Contig_id" (ContigSet contig ID @id
+           external), Long, String, Long, parameter "md5" of String,
+           parameter "protein_md5" of String, parameter "parent_gene" of type
+           "Feature_id" (KBase Feature ID @id external), parameter
+           "parent_mrna" of type "mrna_id" (KBase mRNA ID @id external),
+           parameter "note" of String, parameter "functions" of list of
+           String, parameter "functional_descriptions" of list of String,
+           parameter "ontology_terms" of mapping from String to mapping from
+           String to list of Long, parameter "flags" of list of String,
+           parameter "warnings" of list of String, parameter "inference_data"
+           of list of type "InferenceInfo" (Type spec for the "InferenceInfo"
+           object. TODO docs Found in the `inference_data` fields in mRNAs
+           and CDSs Fields: category - string - TODO type - string - TODO
+           evidence - string - TODO) -> structure: parameter "category" of
+           String, parameter "type" of String, parameter "evidence" of
+           String, parameter "protein_translation" of String, parameter
+           "protein_translation_length" of Long, parameter "aliases" of list
+           of tuple of size 2: parameter "fieldname" of String, parameter
+           "alias" of String, parameter "db_xrefs" of list of tuple of size
+           2: parameter "db_source" of String, parameter "db_identifier" of
+           String, parameter "dna_sequence" of String, parameter
+           "dna_sequence_length" of Long, parameter "mrnas" of list of type
+           "mRNA" (The mRNA is the transcribed sequence from the original
+           feature, minus the introns, but including the UTRs. Fields: id -
+           string - identifying string for the mRNA location -
+           list<tuple<string, int, string, int>> - list of locations from
+           where this sequence originates in the original assembly. Each
+           sub-sequence in the list constitutes a section of the resulting
+           CDS. The first element in the tuple corresponds to the
+           "contig_id", such as "NC_000913.3". The second element in the
+           tuple is an index in the contig of where the sequence starts. The
+           third element is either a plus or minus sign indicating whether it
+           is on the 5' to 3' leading strand ("+") or on the 3' to 5' lagging
+           strand ("-"). The last element is the length of the sub-sequence.
+           For a location on the leading strand (denoted by "+"), the index
+           is of the leftmost base, and the sequence extends to the right.
+           For a location on the lagging strand (denoted by "-"), the index
+           is of the rightmost base, and the sequence extends to the left.
+           NOTE: the last element in each tuple is the *length* of each
+           sub-sequence. If you have a location such as ("xyz", 100, "+",
+           50), then your sequence will go from index 100 to index 149 (this
+           has a length of 50). It *does not* go from index 100 to index 150,
+           as that would have a length of 51. Likewise, if you have the
+           location ("xyz", 100, "-", 50), then the sequence extends from 100
+           down to 51, which has a length of 50 bases. It does not go from
+           index 100 to 50, as that would have a length of 51. md5 - string -
+           md5 of the dna sequence - TODO clarification parent_gene -
+           Feature_id - corresponding feature for this sequence, including
+           introns and UTRs cds - string - corresponding coding sequence for
+           this mRNA (the sequence minus UTRs) dna_sequence - string -
+           sequence of UTRs and exons from the genome that constitute this
+           mRNA dna_sequence_length - int - length of the above note - string
+           - TODO functions - list<string> - TODO list of protein products or
+           chemical processes that sequence creates, facilitates, or
+           influences. functional_descriptions - list<string> - TODO list of
+           protein products or chemical processes that sequence creates,
+           facilitates, or influences. ontology_terms - mapping<string,
+           mapping<string, list<int>>> - a mapping of ontology source id (eg.
+           "GO") to a mapping of term IDs (eg "GO:16209") to a list of
+           indexes into the ontology_events data (found in the top level of
+           the genome object). The index into an ontology event indicates
+           what service and method created this term assignment. flags -
+           list<string> - controlled vocab - fields from the genbank source.
+           A common example is "pseudo" for pseudo-genes that do not encode
+           proteins, which shows up as "/pseudo" in the genbank. Values can
+           be: "pseudo", "ribosomal_slippage", "trans_splicing" warnings -
+           list<string> - TODO inference_data - list<InferenceInfo> - TODO
+           aliases - list<(string, string)> - alternative list of names or
+           identifiers eg: [["gene", "thrA"], ["locus_tag", "b0002"]]
+           db_xrefs - list<(string, string)> - Identifiers from other
+           databases (database cross-references). The first string is the
+           database name, the second is the database identifier. eg:
+           [["ASAP", "ABE-0000006"], ["EcoGene", "EG11277"]] @optional
+           parent_gene cds functions ontology_terms note flags warnings
+           @optional inference_data dna_sequence aliases db_xrefs
+           functional_descriptions) -> structure: parameter "id" of type
+           "mrna_id" (KBase mRNA ID @id external), parameter "location" of
+           list of tuple of size 4: type "Contig_id" (ContigSet contig ID @id
+           external), Long, String, Long, parameter "md5" of String,
+           parameter "parent_gene" of type "Feature_id" (KBase Feature ID @id
+           external), parameter "cds" of type "cds_id" (KBase CDS ID @id
+           external), parameter "dna_sequence" of String, parameter
+           "dna_sequence_length" of Long, parameter "note" of String,
+           parameter "functions" of list of String, parameter
+           "functional_descriptions" of list of String, parameter
+           "ontology_terms" of mapping from String to mapping from String to
+           list of Long, parameter "flags" of list of String, parameter
+           "warnings" of list of String, parameter "inference_data" of list
+           of type "InferenceInfo" (Type spec for the "InferenceInfo" object.
+           TODO docs Found in the `inference_data` fields in mRNAs and CDSs
+           Fields: category - string - TODO type - string - TODO evidence -
+           string - TODO) -> structure: parameter "category" of String,
+           parameter "type" of String, parameter "evidence" of String,
+           parameter "aliases" of list of tuple of size 2: parameter
+           "fieldname" of String, parameter "alias" of String, parameter
+           "db_xrefs" of list of tuple of size 2: parameter "db_source" of
+           String, parameter "db_identifier" of String, parameter
+           "assembly_ref" of type "Assembly_ref" (Reference to an Assembly
+           object in the workspace @id ws KBaseGenomeAnnotations.Assembly),
+           parameter "taxon_ref" of type "Taxon_ref" (Reference to a taxon
+           object @id ws KBaseGenomeAnnotations.Taxon), parameter
+           "genbank_handle_ref" of type "genbank_handle_ref" (Reference to a
+           handle to the Genbank file on shock @id handle), parameter
+           "gff_handle_ref" of type "gff_handle_ref" (Reference to a handle
+           to the GFF file on shock @id handle), parameter
+           "external_source_origination_date" of String, parameter "release"
+           of String, parameter "original_source_file_name" of String,
+           parameter "notes" of String, parameter "quality_scores" of list of
+           type "GenomeQualityScore" (Genome quality score Fields: method -
+           string - TODO method_report_ref - string - TODO method_version -
+           string - TODO score: string - TODO score_interpretation - string -
+           TODO timestamp - string - TODO Score_interpretation -
+           fraction_complete - controlled vocabulary managed by API @optional
+           method_report_ref method_version) -> structure: parameter "method"
+           of String, parameter "method_report_ref" of type
+           "Method_report_ref" (Reference to a report object @id ws
+           KBaseReport.Report), parameter "method_version" of String,
+           parameter "score" of String, parameter "score_interpretation" of
+           String, parameter "timestamp" of String, parameter "suspect" of
+           type "Bool", parameter "genome_type" of String, parameter "hidden"
+           of type "boolean" (A boolean - 0 for false, 1 for true. @range (0,
+           1)), parameter "upgrade" of type "boolean" (A boolean - 0 for
+           false, 1 for true. @range (0, 1))
+        :returns: instance of type "SaveGenomesResults" -> structure:
+           parameter "results" of list of type "SaveGenomeResult" ->
+           structure: parameter "info" of type "object_info" (Information
+           about an object, including user provided metadata. obj_id objid -
+           the numerical id of the object. obj_name name - the name of the
+           object. type_string type - the type of the object. timestamp
+           save_date - the save date of the object. obj_ver ver - the version
+           of the object. username saved_by - the user that saved or copied
+           the object. ws_id wsid - the workspace containing the object.
+           ws_name workspace - the workspace containing the object. string
+           chsum - the md5 checksum of the object. int size - the size of the
+           object in bytes. usermeta meta - arbitrary user-supplied metadata
+           about the object.) -> tuple of size 11: parameter "objid" of type
+           "obj_id" (The unique, permanent numerical ID of an object.),
+           parameter "name" of type "obj_name" (A string used as a name for
+           an object. Any string consisting of alphanumeric characters and
+           the characters |._- that is not an integer is acceptable.),
+           parameter "type" of type "type_string" (A type string. Specifies
+           the type and its version in a single string in the format
+           [module].[typename]-[major].[minor]: module - a string. The module
+           name of the typespec containing the type. typename - a string. The
+           name of the type as assigned by the typedef statement. major - an
+           integer. The major version of the type. A change in the major
+           version implies the type has changed in a non-backwards compatible
+           way. minor - an integer. The minor version of the type. A change
+           in the minor version implies that the type has changed in a way
+           that is backwards compatible with previous type definitions. In
+           many cases, the major and minor versions are optional, and if not
+           provided the most recent version will be used. Example:
+           MyModule.MyType-3.1), parameter "save_date" of type "timestamp" (A
+           time in the format YYYY-MM-DDThh:mm:ssZ, where Z is either the
+           character Z (representing the UTC timezone) or the difference in
+           time to UTC in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500
+           (EST time) 2013-04-03T08:56:32+0000 (UTC time)
+           2013-04-03T08:56:32Z (UTC time)), parameter "version" of Long,
+           parameter "saved_by" of type "username" (Login name of a KBase
+           user account.), parameter "wsid" of type "ws_id" (The unique,
+           permanent numerical ID of a workspace.), parameter "workspace" of
+           type "ws_name" (A string used as a name for a workspace. Any
+           string consisting of alphanumeric characters and "_", ".", or "-"
+           that is not an integer is acceptable. The name may optionally be
+           prefixed with the workspace owner's user name and a colon, e.g.
+           kbasetest:my_workspace.), parameter "chsum" of String, parameter
+           "size" of Long, parameter "meta" of type "usermeta" (User provided
+           metadata about an object. Arbitrary key-value pairs provided by
+           the user.) -> mapping from String to String
+        """
+        # ctx is the context object
+        # return variables are: results
+        #BEGIN save_genomes
+        results = {
+            "results": GenomeInterface(self.cfg).save_genome_mass(params)
+        }
+        #END save_genomes
+
+        # At some point might do deeper type checking...
+        if not isinstance(results, dict):
+            raise ValueError('Method save_genomes return value ' +
+                             'results is not type dict as required.')
+        # return the results
+        return [results]
+
     def ws_obj_gff_to_genome(self, ctx, params):
         """
         This function takes in a workspace object of type KBaseGenomes.Genome or KBaseGenomeAnnotations.Assembly and a gff file and produces a KBaseGenomes.Genome reanotated according to the the input gff file.
diff --git a/lib/GenomeFileUtil/GenomeFileUtilServer.py b/lib/GenomeFileUtil/GenomeFileUtilServer.py
index 6ae2c611..2c09c2e0 100644
--- a/lib/GenomeFileUtil/GenomeFileUtilServer.py
+++ b/lib/GenomeFileUtil/GenomeFileUtilServer.py
@@ -394,6 +394,10 @@ def __init__(self):
                              name='GenomeFileUtil.save_one_genome',
                              types=[dict])
         self.method_authentication['GenomeFileUtil.save_one_genome'] = 'required'  # noqa
+        self.rpc_service.add(impl_GenomeFileUtil.save_genomes,
+                             name='GenomeFileUtil.save_genomes',
+                             types=[dict])
+        self.method_authentication['GenomeFileUtil.save_genomes'] = 'required'  # noqa
         self.rpc_service.add(impl_GenomeFileUtil.ws_obj_gff_to_genome,
                              name='GenomeFileUtil.ws_obj_gff_to_genome',
                              types=[dict])
diff --git a/lib/GenomeFileUtil/core/GenbankToGenome.py b/lib/GenomeFileUtil/core/GenbankToGenome.py
index d1851a61..8342e284 100644
--- a/lib/GenomeFileUtil/core/GenbankToGenome.py
+++ b/lib/GenomeFileUtil/core/GenbankToGenome.py
@@ -19,6 +19,7 @@
 from installed_clients.AssemblyUtilClient import AssemblyUtil
 from installed_clients.DataFileUtilClient import DataFileUtil
 from GenomeFileUtil.core.GenomeInterface import GenomeInterface
+from GenomeFileUtil.core.MiscUtils import get_int
 from installed_clients.WorkspaceClient import Workspace
 from GenomeFileUtil.core.GenomeUtils import (
     is_parent, propagate_cds_props_to_gene, warnings, parse_inferences,
@@ -125,7 +126,7 @@ def _set_up_single_params(self, params):
         # avoid side effects and keep variables in params unmodfied
         inputs = dict(params)
         self._validate_params(inputs)
-        ws_id = self._get_int(inputs.pop(_WSID, None), _WSID)
+        ws_id = get_int(inputs.pop(_WSID, None), _WSID)
         ws_name = inputs.pop(_WSNAME, None)
         if (bool(ws_id) == bool(ws_name)):  # xnor
             raise ValueError(f"Exactly one of a '{_WSID}' or a '{_WSNAME}' parameter must be provided")
@@ -137,7 +138,7 @@ def _set_up_single_params(self, params):
         return mass_params
 
     def _validate_mass_params(self, params):
-        ws_id = self._get_int(params.get(_WSID), _WSID)
+        ws_id = get_int(params.get(_WSID), _WSID)
         if not ws_id:
             raise ValueError(f"{_WSID} is required")
         inputs = params.get(_INPUTS)
@@ -151,14 +152,6 @@ def _validate_mass_params(self, params):
             except Exception as e:
                 raise ValueError(f"Entry #{i} in {_INPUTS} field has invalid params: {e}") from e
 
-    def _get_int(self, putative_int, name, minimum=1):
-        if putative_int is not None:
-            if type(putative_int) is not int:
-                raise ValueError(f"{name} must be an integer, got: {putative_int}")
-            if putative_int < minimum:
-                raise ValueError(f"{name} must be an integer >= {minimum}")
-        return putative_int
-
     def _import_genbank_mass(self, params):
 
         workspace_id = params[_WSID]
diff --git a/lib/GenomeFileUtil/core/GenomeInterface.py b/lib/GenomeFileUtil/core/GenomeInterface.py
index def9a499..57cd8848 100644
--- a/lib/GenomeFileUtil/core/GenomeInterface.py
+++ b/lib/GenomeFileUtil/core/GenomeInterface.py
@@ -13,9 +13,14 @@
 from installed_clients.DataFileUtilClient import DataFileUtil
 from installed_clients.WSLargeDataIOClient import WsLargeDataIO
 from GenomeFileUtil.core import GenomeUtils
+from GenomeFileUtil.core.MiscUtils import get_int
 
 MAX_GENOME_SIZE = 2**30
 
+_WS = "workspace"
+_WSID = "workspace_id"
+_INPUTS = "inputs"
+
 
 class GenomeInterface:
     def __init__(self, config):
@@ -32,18 +37,80 @@ def __init__(self, config):
         self.scratch = config.raw['scratch']
         self.ws_large_data = WsLargeDataIO(self.callback_url)
 
-    @staticmethod
-    def _validate_save_one_genome_params(params):
+    def save_one_genome(self, params):
+        print("validating parameters")
+        mass_params = self._set_up_single_params(params)
+        return self._save_genome_mass(mass_params)[0]
+
+    def save_genome_mass(self, params):
+        print("validating parameters")
+        self._validate_mass_params(params)
+        return self._save_genome_mass(params)
+
+    def _set_up_single_params(self, params):
+        inputs = dict(params)
+        self._validate_genome_input_params(inputs)
+        ws_id = get_int(inputs.pop(_WSID, None), _WSID)
+        ws_name = inputs.pop('workspace', None)
+        if bool(ws_id) == bool(ws_name):  # xnor
+            raise ValueError(f"Exactly one of a '{_WSID}' or a '{_WS}' parameter must be provided")
+        if not ws_id:
+            print(f"Translating workspace name {ws_name} to a workspace ID. Prefer submitting "
+                  + "a workspace ID over a mutable workspace name that may cause race conditions")
+            ws_id = self.dfu.ws_name_to_id(ws_name)
+        mass_params = {_WSID: ws_id, _INPUTS: [inputs]}
+        return mass_params
+
+    def _validate_mass_params(self, params):
+        """Raise ValueError unless params has a workspace ID and a non-empty list of valid inputs."""
+        if not get_int(params.get(_WSID), _WSID):
+            raise ValueError(f"{_WSID} is required")
+        entries = params.get(_INPUTS)
+        if not entries or type(entries) is not list:
+            raise ValueError(f"{_INPUTS} field is required and must be a non-empty list")
+        for position, entry in enumerate(entries, start=1):
+            if type(entry) is not dict:
+                raise ValueError(f"Entry #{position} in {_INPUTS} field is not a mapping as required")
+            try:
+                self._validate_genome_input_params(entry)
+            except Exception as err:
+                raise ValueError(f"Entry #{position} in {_INPUTS} field has invalid params: {err}") from err
+
+    def _validate_genome_input_params(self, genome_input):
         """
-        _validate_save_one_genome_params:
-                validates params passed to save_one_genome method
+        Check required parameters are in genome_input
         """
-        logging.info('start validating save_one_genome params')
+        logging.info("start validating genome_input params")
         # check for required parameters
-        for p in ['workspace', 'name', 'data']:
-            if p not in params:
-                raise ValueError(
-                    '"{}" parameter is required, but missing'.format(p))
+        for p in ["name", "data"]:
+            if p not in genome_input:
+                raise ValueError(f"{p} parameter is required, but missing")
+
+    def _save_genome_objects(
+        self,
+        workspace_id,
+        ws_datatypes,
+        data_paths,
+        names,
+        meta_data,
+        hidden_data,
+    ):
+        """
+        Save a batch of genome objects to one workspace in a single call.
+
+        The five list arguments are parallel: position i of each one describes
+        object i (workspace type, path of the JSON dump, object name, metadata,
+        hidden flag).
+
+        Returns whatever ``self.ws_large_data.save_objects`` returns for the batch.
+        """
+        keys = ('type', 'data_json_file', 'name', 'meta', 'hidden')
+        columns = (ws_datatypes, data_paths, names, meta_data, hidden_data)
+        # zip stops at the shortest list, so rows are only built for complete sets
+        objects = [dict(zip(keys, row)) for row in zip(*columns)]
+        return self.ws_large_data.save_objects(
+            {'id': workspace_id, 'objects': objects}
+        )
 
     def _check_shock_response(self, response, errtxt):
         """
@@ -128,54 +195,75 @@ def get_one_genome(self, params):
         return data, res['info']
         # return self.dfu.get_objects(params)['data'][0]
 
-    def save_one_genome(self, params):
-        logging.info('start saving genome object')
-        self._validate_save_one_genome_params(params)
-        workspace = params['workspace']
-        name = params['name']
-        data = params['data']
-        # XXX there is no `workspace_datatype` param in the spec
-        ws_datatype = params.get('workspace_datatype', "KBaseGenomes.Genome")
-        # XXX there is no `meta` param in the spec
-        meta = params.get('meta', {})
-        if "AnnotatedMetagenomeAssembly" in ws_datatype:
-            if params.get('upgrade') or 'feature_counts' not in data:
-                data = self._update_metagenome(data)
-        else:
-            if params.get('upgrade') or 'feature_counts' not in data:
-                data = self._update_genome(data)
-
-        # check all handles point to shock nodes owned by calling user
-        self._own_handle(data, 'genbank_handle_ref')
-        self._own_handle(data, 'gff_handle_ref')
-        if "AnnotatedMetagenomeAssembly" not in ws_datatype:
-            self._check_dna_sequence_in_features(data)
-            data['warnings'] = self.validate_genome(data)
-
-        # sort data
-        data = GenomeUtils.sort_dict(data)
-        # dump genome to scratch for upload
-        data_path = os.path.join(self.scratch, name + ".json")
-        json.dump(data, open(data_path, 'w'))
-        if 'hidden' in params and str(params['hidden']).lower() in ('yes', 'true', 't', '1'):
-            hidden = 1
-        else:
-            hidden = 0
-
-        if isinstance(workspace, int) or workspace.isdigit():
-            workspace_id = workspace
-        else:
-            workspace_id = self.dfu.ws_name_to_id(workspace)
-
-        save_params = {'id': workspace_id,
-                       'objects': [{'type': ws_datatype,
-                                    'data_json_file': data_path,
-                                    'name': name,
-                                    'meta': meta,
-                                    'hidden': hidden}]}
-        dfu_oi = self.ws_large_data.save_objects(save_params)[0]
-        returnVal = {'info': dfu_oi, 'warnings': data.get('warnings', [])}
-        return returnVal
+    def _save_genome_mass(self, params):
+        """
+        Prepare every genome in params['inputs'] and save them in a single batch.
+
+        params is expected to be validated already: 'workspace_id' is the target
+        workspace and 'inputs' is a list of dicts holding 'name' and 'data' plus the
+        optional 'workspace_datatype', 'meta', 'upgrade' and 'hidden' keys.
+
+        Returns a list with one {'info': ..., 'warnings': ...} dict per input.
+        """
+        workspace_id = params[_WSID]
+        inputs = params[_INPUTS]
+
+        # parallel lists: index i of each one describes the i-th input
+        ws_datatypes = []
+        data_paths = []
+        names = []
+        meta_data = []
+        hidden_data = []
+        warnings = []
+
+        for input_params in inputs:
+            # retrieve required params
+            name = input_params['name']
+            data = input_params['data']
+
+            # XXX there is no `workspace_datatype` param in the spec
+            ws_datatype = input_params.get('workspace_datatype', "KBaseGenomes.Genome")
+            # XXX there is no `meta` param in the spec
+            meta = input_params.get('meta', {})
+
+            ws_datatypes.append(ws_datatype)
+            names.append(name)
+            meta_data.append(meta)
+
+            is_metagenome = "AnnotatedMetagenomeAssembly" in ws_datatype
+            if input_params.get('upgrade') or 'feature_counts' not in data:
+                updater = self._update_metagenome if is_metagenome else self._update_genome
+                data = updater(data)
+
+            # check all handles point to shock nodes owned by calling user
+            self._own_handle(data, 'genbank_handle_ref')
+            self._own_handle(data, 'gff_handle_ref')
+            if not is_metagenome:
+                self._check_dna_sequence_in_features(data)
+                data['warnings'] = self.validate_genome(data)
+
+            # sort data
+            data = GenomeUtils.sort_dict(data)
+            # dump genome to scratch for upload; close the file so it is flushed to disk
+            data_path = os.path.join(self.scratch, name + ".json")
+            with open(data_path, 'w') as data_file:
+                json.dump(data, data_file)
+            # 'hidden' is a per-genome flag, so read it from this input, not from params
+            hidden = int(str(input_params.get('hidden')).lower() in ('yes', 'true', 't', '1'))
+
+            data_paths.append(data_path)
+            hidden_data.append(hidden)
+            warnings.append(data.get('warnings', []))
+
+        dfu_infos = self._save_genome_objects(
+            workspace_id, ws_datatypes, data_paths, names, meta_data, hidden_data
+        )
+
+        output = [
+            {'info': dfu_oi, 'warnings': warning}
+            for dfu_oi, warning in zip(dfu_infos, warnings)
+        ]
+        return output
 
     @staticmethod
     def determine_tier(source):
diff --git a/lib/GenomeFileUtil/core/MiscUtils.py b/lib/GenomeFileUtil/core/MiscUtils.py
index a34eb6f7..e99a7b71 100644
--- a/lib/GenomeFileUtil/core/MiscUtils.py
+++ b/lib/GenomeFileUtil/core/MiscUtils.py
@@ -12,3 +12,29 @@ def validate_lists_have_same_elements(l1, l2):
     diff = set(l1) ^ (set(l2))  # get the symmetric difference of the sets
     # check if all ids are shared
     return len(diff) == 0
+
+
+def get_int(putative_int, name, minimum=1):
+    """
+    Check that an optional value is a plain integer no smaller than a lower bound.
+
+    ``None`` means "not supplied" and is returned untouched; the caller decides whether it is mandatory.
+
+    Args:
+        putative_int (int or None): The candidate value. Its exact type must be ``int``, so ``bool`` is rejected.
+        name (str): Label for the value, used only when building error messages.
+        minimum (int, optional): Smallest value accepted. Defaults to 1.
+
+    Returns:
+        int or None: ``putative_int`` unchanged when it passes validation.
+
+    Raises:
+        ValueError: If the value is not an ``int``, or if it is below ``minimum``.
+    """
+    if putative_int is None:
+        return None
+    if type(putative_int) is not int:
+        raise ValueError(f"{name} must be an integer, got: {putative_int}")
+    if putative_int < minimum:
+        raise ValueError(f"{name} must be an integer >= {minimum}")
+    return putative_int
diff --git a/test/problematic_tests/save_genome_test.py b/test/problematic_tests/save_genome_test.py
index 7a93d816..d2582d28 100644
--- a/test/problematic_tests/save_genome_test.py
+++ b/test/problematic_tests/save_genome_test.py
@@ -163,7 +163,7 @@ def test_bad_one_genome_params(self):
         invalidate_params = {'missing_workspace': 'workspace',
                              'name': 'name',
                              'data': 'data'}
-        error_msg = '"workspace" parameter is required, but missing'
+        error_msg = "Exactly one of a 'workspace_id' or a 'workspace' parameter must be provided"
         self.fail_save_one_genome(invalidate_params, error_msg)
 
     def test_one_genome(self):

From a2fabf4f8dd9ecc5ec9be7f545c628f4f6244430 Mon Sep 17 00:00:00 2001
From: Sijie <sijiex@lbl.gov>
Date: Thu, 15 Aug 2024 18:22:14 -0700
Subject: [PATCH 02/24] fix positional arg #1 is the wrong type bug

---
 lib/GenomeFileUtil/core/GenbankToGenome.py | 2 +-
 lib/GenomeFileUtil/core/GenomeInterface.py | 4 +---
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/lib/GenomeFileUtil/core/GenbankToGenome.py b/lib/GenomeFileUtil/core/GenbankToGenome.py
index 8342e284..3bf11b0f 100644
--- a/lib/GenomeFileUtil/core/GenbankToGenome.py
+++ b/lib/GenomeFileUtil/core/GenbankToGenome.py
@@ -223,7 +223,7 @@ def _save_genomes(self, workspace_id, genome_objs):
         results = [
             self.gi.save_one_genome(
                 {
-                    'workspace': workspace_id,
+                    'workspace_id': workspace_id,
                     'name': genome_obj.genome_name,
                     'data': genome_obj.genome_data,
                     "meta": genome_obj.genome_meta,
diff --git a/lib/GenomeFileUtil/core/GenomeInterface.py b/lib/GenomeFileUtil/core/GenomeInterface.py
index 57cd8848..14152493 100644
--- a/lib/GenomeFileUtil/core/GenomeInterface.py
+++ b/lib/GenomeFileUtil/core/GenomeInterface.py
@@ -38,12 +38,10 @@ def __init__(self, config):
         self.ws_large_data = WsLargeDataIO(self.callback_url)
 
     def save_one_genome(self, params):
-        print("validating parameters")
         mass_params = self._set_up_single_params(params)
         return self._save_genome_mass(mass_params)[0]
 
     def save_genome_mass(self, params):
-        print("validating parameters")
         self._validate_mass_params(params)
         return self._save_genome_mass(params)
 
@@ -51,7 +49,7 @@ def _set_up_single_params(self, params):
         inputs = dict(params)
         self._validate_genome_input_params(inputs)
         ws_id = get_int(inputs.pop(_WSID, None), _WSID)
-        ws_name = inputs.pop('workspace', None)
+        ws_name = inputs.pop(_WS, None)
         if bool(ws_id) == bool(ws_name):  # xnor
             raise ValueError(f"Exactly one of a '{_WSID}' or a '{_WS}' parameter must be provided")
         if not ws_id:

From 330d6c2da8a0dc2d57efffe690e8db2928455776 Mon Sep 17 00:00:00 2001
From: Sijie <sijiex@lbl.gov>
Date: Thu, 15 Aug 2024 19:58:38 -0700
Subject: [PATCH 03/24] use batch genome save in GenbankToGenome.py

---
 lib/GenomeFileUtil/core/GenbankToGenome.py | 24 +++++++++++-----------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/lib/GenomeFileUtil/core/GenbankToGenome.py b/lib/GenomeFileUtil/core/GenbankToGenome.py
index 3bf11b0f..606f83e3 100644
--- a/lib/GenomeFileUtil/core/GenbankToGenome.py
+++ b/lib/GenomeFileUtil/core/GenbankToGenome.py
@@ -202,7 +202,6 @@ def _import_genbank_mass(self, params):
         for genome_obj in genome_objs:
             shutil.rmtree(genome_obj.input_directory)
 
-        # TODO make an internal mass function save_genomes
         results = self._save_genomes(workspace_id, genome_objs)
 
         # return the result
@@ -220,17 +219,18 @@ def _import_genbank_mass(self, params):
         return details
 
     def _save_genomes(self, workspace_id, genome_objs):
-        results = [
-            self.gi.save_one_genome(
-                {
-                    'workspace_id': workspace_id,
-                    'name': genome_obj.genome_name,
-                    'data': genome_obj.genome_data,
-                    "meta": genome_obj.genome_meta,
-                }
-            ) for genome_obj in genome_objs
-        ]
-
+        results = self.gi.save_genome_mass(
+            {
+                "workspace_id": workspace_id,
+                "inputs": [
+                    {
+                        "name": genome_obj.genome_name,
+                        "data": genome_obj.genome_data,
+                        "meta": genome_obj.genome_meta,
+                    } for genome_obj in genome_objs
+                ],
+            }
+        )
         return results
 
     def _validate_params(self, params):

From 27158ec45fcfdfb2883200e6500d2c2e34319979 Mon Sep 17 00:00:00 2001
From: Sijie <sijiex@lbl.gov>
Date: Fri, 16 Aug 2024 15:08:27 -0700
Subject: [PATCH 04/24] make save_genome_mass function internal

---
 GenomeFileUtil.spec                        |  19 -
 RELEASE_NOTES.md                           |   3 +-
 lib/GenomeFileUtil/GenomeFileUtilImpl.py   | 438 +--------------------
 lib/GenomeFileUtil/GenomeFileUtilServer.py |   4 -
 4 files changed, 2 insertions(+), 462 deletions(-)

diff --git a/GenomeFileUtil.spec b/GenomeFileUtil.spec
index 5c06cff1..05cca1f9 100644
--- a/GenomeFileUtil.spec
+++ b/GenomeFileUtil.spec
@@ -332,25 +332,6 @@ module GenomeFileUtil {
     funcdef save_one_genome(SaveOneGenomeParams params)
                 returns (SaveGenomeResult returnVal) authentication required;
 
-    typedef structure {
-        string name;
-        KBaseGenomes.Genome data;
-        boolean hidden;
-        boolean upgrade;
-    } GenomeInput;
-
-    typedef structure {
-        int workspace_id;
-        list<GenomeInput> inputs;
-    } SaveGenomesParams;
-
-    typedef structure {
-        list<SaveGenomeResult> results;
-    } SaveGenomesResults;
-
-    funcdef save_genomes(SaveGenomesParams params)
-                returns(SaveGenomesResults results) authentication required;
-
     /*
     gff_file - object containing path to gff_file
     ws_ref - input Assembly or Genome reference
diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md
index 260b4aa4..91c21b83 100644
--- a/RELEASE_NOTES.md
+++ b/RELEASE_NOTES.md
@@ -6,8 +6,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
 ## [0.11.7] - TBD
-- The `save_genomes` method was added to allow users to save genmes in batch
-- Parsed and validated genome before upload
+- The internal method `save_genome_mass` was added to facilitate the batch saving of genomes
 - Unusable `export_genome_features_protein_to_fasta` function was removed
 - The `genbanks_to_genomes` method was added to allow users to upload multiple
 genome objects at once
diff --git a/lib/GenomeFileUtil/GenomeFileUtilImpl.py b/lib/GenomeFileUtil/GenomeFileUtilImpl.py
index 98543d35..9ae1311d 100644
--- a/lib/GenomeFileUtil/GenomeFileUtilImpl.py
+++ b/lib/GenomeFileUtil/GenomeFileUtilImpl.py
@@ -69,7 +69,7 @@ class GenomeFileUtil:
     ######################################### noqa
     VERSION = "0.11.7"
     GIT_URL = "git@github.com:kbaseapps/GenomeFileUtil.git"
-    GIT_COMMIT_HASH = "4819598e65eea38d3c38d8c2ab808aca0d9697fd"
+    GIT_COMMIT_HASH = "330d6c2da8a0dc2d57efffe690e8db2928455776"
 
     #BEGIN_CLASS_HEADER
     #END_CLASS_HEADER
@@ -1262,442 +1262,6 @@ def save_one_genome(self, ctx, params):
         # return the results
         return [returnVal]
 
-    def save_genomes(self, ctx, params):
-        """
-        :param params: instance of type "SaveGenomesParams" -> structure:
-           parameter "workspace_id" of Long, parameter "inputs" of list of
-           type "GenomeInput" -> structure: parameter "name" of String,
-           parameter "data" of type "Genome" (Genome type -- annotated and
-           assembled genome data. Field descriptions: id - string - KBase
-           legacy data ID scientific_name - string - human readable species
-           name domain - string - human readable phylogenetic domain name
-           (eg. "Bacteria") warnings - list of string - genome-level warnings
-           generated in the annotation process genome_tiers - list of string
-           - controlled vocabulary (based on app input and checked by
-           GenomeFileUtil) A list of labels describing the data source for
-           this genome. Allowed values - Representative, Reference,
-           ExternalDB, User Tier assignments based on genome source: * All
-           phytozome - Representative and ExternalDB * Phytozome flagship
-           genomes - Reference, Representative and ExternalDB * Ensembl -
-           Representative and ExternalDB * RefSeq Reference - Reference,
-           Representative and ExternalDB * RefSeq Representative -
-           Representative and ExternalDB * RefSeq Latest or All Assemblies
-           folder - ExternalDB * User Data - User tagged feature_counts - map
-           of string to integer - total counts of each type of feature keys
-           are a controlled vocabulary of - "CDS", "gene", "misc_feature",
-           "misc_recomb", "mobile_element", "ncRNA" - 72,
-           "non_coding_features", "non_coding_genes",
-           "protein_encoding_gene", "rRNA", "rep_origin", "repeat_region",
-           "tRNA" genetic_code - int - An NCBI-assigned taxonomic category
-           for the organism See here -
-           https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi dna_size
-           - integer - total number of nucleotides num_contigs - integer -
-           total number of contigs in the genome molecule_type - string -
-           controlled vocab - the type of molecule sequenced Possible values
-           are "Unknown", "DNA", "RNA", "genomic DNA", "genomic RNA", "mRNA",
-           "tRNA", "rRNA", "other RNA", "other DNA", "transcribed RNA",
-           "viral cRNA", "unassigned DNA", "unassigned RNA" contig_lengths -
-           list of int - nucleotide length of each contig in the genome
-           Indexes in this list correspond to indexes in the `contig_ids`
-           list. contig_ids - list of str - external database identifiers for
-           each contig (eg. "NC_000913.3") source - str - controlled vocab -
-           descriptor of where this data came from (eg. "RefSeq") Allowed
-           entries RefSeq, Ensembl, Phytozome, RAST, Prokka, User_upload
-           source_id - string - identifier of this genome from the source
-           database (eg. the RefSeq ID such as "NC_000913") md5 - string -
-           checksum of the underlying assembly sequence taxonomy - string -
-           semicolon-delimited taxonomy lineage, in order of parent to child
-           taxon_assignments - mapping of taxonomy namespace to taxon ID.
-           example - {"ncbi": "286", "gtdb": "s__staphylococcus_devriesei"}
-           gc_content - float - ratio of GC count to AT in the genome
-           publications - tuple of (pubmedid, source, title, web_addr, year,
-           authors, journal). See typedef above. ontology_events - A record
-           of the service and method used for a set of ontology assignments
-           on the genome. ontologies_present - a mapping of ontology source
-           id (eg. "GO") to a mapping of term IDs (eg "GO:16209") to term
-           names (eg. "histidine biosynthetic process"). features - array of
-           Feature - protein coding genes (see the separate Feature spec)
-           cdss - array of protein-coding sequences mrnas - array of
-           transcribed messenger RNA sequences (equal to cdss plus 5' and 3'
-           UTRs) non_coding_features - array of features that does not
-           include mRNA, CDS, and protein-encoding genes assembly_ref -
-           workspace reference to an assembly object from which this
-           annotated genome was derived. taxon_ref - workspace reference to a
-           taxon object that classifies the species or strain of this genome.
-           genbank_handle_ref - file server handle reference to the source
-           genbank file for this genome. gff_handle_ref - file server handle
-           reference to the source GFF file for this genome.
-           external_source_origination_date - TODO look at GFU for this
-           release - string - User-supplied release or version of the source
-           data. This most likely will come from an input field in the import
-           app. original_source_file_name - filename from which this genome
-           was derived (eg. genbank or gff filename). notes - TODO
-           quality_scores - TODO suspect - bool - flag of whether this
-           annotation is problematic due to some warning genome_type - string
-           - controlled vocab - One of "draft isolate", "finished isolate",
-           "mag", "sag", "virus", "plasmid", "construct" Features vs. coding
-           sequences: a feature is a sequence in the DNA that codes for a
-           protein, including non-transcribed introns. A coding sequence
-           (stored as `cdss`) includes **only** the sections of the feature
-           that codes for a protein, minus introns and UTRs. @optional
-           warnings contig_lengths contig_ids source_id taxonomy publications
-           @optional ontology_events ontologies_present non_coding_features
-           mrnas genome_type @optional genbank_handle_ref gff_handle_ref
-           external_source_origination_date @optional release
-           original_source_file_name notes quality_scores suspect
-           assembly_ref @optional taxon_ref taxon_assignments @metadata ws
-           gc_content as GC content @metadata ws taxonomy as Taxonomy
-           @metadata ws md5 as MD5 @metadata ws dna_size as Size @metadata ws
-           genetic_code as Genetic code @metadata ws domain as Domain
-           @metadata ws source_id as Source ID @metadata ws source as Source
-           @metadata ws scientific_name as Name @metadata ws genome_type as
-           Type @metadata ws length(features) as Number of Protein Encoding
-           Genes @metadata ws length(cdss) as Number of CDS @metadata ws
-           assembly_ref as Assembly Object @metadata ws num_contigs as Number
-           contigs @metadata ws length(warnings) as Number of Genome Level
-           Warnings @metadata ws suspect as Suspect Genome) -> structure:
-           parameter "id" of type "Genome_id" (KBase legacy data ID @id kb),
-           parameter "scientific_name" of String, parameter "domain" of
-           String, parameter "warnings" of list of String, parameter
-           "genome_tiers" of list of String, parameter "feature_counts" of
-           mapping from String to Long, parameter "genetic_code" of Long,
-           parameter "dna_size" of Long, parameter "num_contigs" of Long,
-           parameter "molecule_type" of String, parameter "contig_lengths" of
-           list of Long, parameter "contig_ids" of list of String, parameter
-           "source" of String, parameter "source_id" of type "source_id"
-           (Reference to a source_id @id external), parameter "md5" of
-           String, parameter "taxonomy" of String, parameter
-           "taxon_assignments" of mapping from String to String, parameter
-           "gc_content" of Double, parameter "publications" of list of type
-           "publication" (Structure for a publication Elements: (0) pubmedid
-           - float (1) source - string - (ex. Pubmed) (2) title - string (3)
-           string web address - string (4) publication year - string (5)
-           authors - string (6) journal - string) -> tuple of size 7:
-           parameter "pubmedid" of Double, parameter "source" of String,
-           parameter "title" of String, parameter "url" of String, parameter
-           "year" of String, parameter "authors" of String, parameter
-           "journal" of String, parameter "ontology_events" of list of type
-           "Ontology_event" (@optional ontology_ref method_version eco) ->
-           structure: parameter "id" of String, parameter "ontology_ref" of
-           type "Ontology_ref" (Reference to a ontology object @id ws
-           KBaseOntology.OntologyDictionary), parameter "method" of String,
-           parameter "method_version" of String, parameter "timestamp" of
-           String, parameter "eco" of String, parameter "ontologies_present"
-           of mapping from String to mapping from String to String, parameter
-           "features" of list of type "Feature" (Structure for a single CDS
-           encoding "gene" of a genome ONLY PUT GENES THAT HAVE A
-           CORRESPONDING CDS IN THIS ARRAY NOTE: Sequence is optional.
-           Ideally we can keep it in here, but Recognize due to space
-           constraints another solution may be needed. We may want to add
-           additional fields for other CDM functions (e.g., atomic regulons,
-           coexpressed fids, co_occurring fids,...)
-           protein_translation_length and protein_translation are for longest
-           coded protein (representative protein for splice variants) NOTE:
-           New Aliases field definitely breaks compatibility. As Does
-           Function. flags are flag fields in GenBank format. This will be a
-           controlled vocabulary. Initially Acceptable values are pseudo,
-           ribosomal_slippage, and trans_splicing Md5 is the md5 of
-           dna_sequence. @optional functions ontology_terms note
-           protein_translation mrnas flags warnings @optional inference_data
-           dna_sequence aliases db_xrefs children functional_descriptions) ->
-           structure: parameter "id" of type "Feature_id" (KBase Feature ID
-           @id external), parameter "location" of list of tuple of size 4:
-           type "Contig_id" (ContigSet contig ID @id external), Long, String,
-           Long, parameter "functions" of list of String, parameter
-           "functional_descriptions" of list of String, parameter
-           "ontology_terms" of mapping from String to mapping from String to
-           list of Long, parameter "note" of String, parameter "md5" of
-           String, parameter "protein_translation" of String, parameter
-           "protein_translation_length" of Long, parameter "cdss" of list of
-           String, parameter "mrnas" of list of String, parameter "children"
-           of list of String, parameter "flags" of list of String, parameter
-           "warnings" of list of String, parameter "inference_data" of list
-           of type "InferenceInfo" (Type spec for the "InferenceInfo" object.
-           TODO docs Found in the `inference_data` fields in mRNAs and CDSs
-           Fields: category - string - TODO type - string - TODO evidence -
-           string - TODO) -> structure: parameter "category" of String,
-           parameter "type" of String, parameter "evidence" of String,
-           parameter "dna_sequence" of String, parameter
-           "dna_sequence_length" of Long, parameter "aliases" of list of
-           tuple of size 2: parameter "fieldname" of String, parameter
-           "alias" of String, parameter "db_xrefs" of list of tuple of size
-           2: parameter "db_source" of String, parameter "db_identifier" of
-           String, parameter "non_coding_features" of list of type
-           "NonCodingFeature" (Structure for a single feature that is NOT one
-           of the following: - Protein encoding gene (gene that has a
-           corresponding CDS) - mRNA - CDS Note pseudo-genes and Non protein
-           encoding genes are put into this flags are flag fields in GenBank
-           format. This will be a controlled vocabulary. Initially Acceptable
-           values are pseudo, ribosomal_slippage, and trans_splicing Md5 is
-           the md5 of dna_sequence. @optional functions ontology_terms note
-           flags warnings functional_descriptions @optional inference_data
-           dna_sequence aliases db_xrefs children parent_gene) -> structure:
-           parameter "id" of type "Feature_id" (KBase Feature ID @id
-           external), parameter "location" of list of tuple of size 4: type
-           "Contig_id" (ContigSet contig ID @id external), Long, String,
-           Long, parameter "type" of String, parameter "functions" of list of
-           String, parameter "functional_descriptions" of list of String,
-           parameter "ontology_terms" of mapping from String to mapping from
-           String to list of Long, parameter "note" of String, parameter
-           "md5" of String, parameter "parent_gene" of String, parameter
-           "children" of list of String, parameter "flags" of list of String,
-           parameter "warnings" of list of String, parameter "inference_data"
-           of list of type "InferenceInfo" (Type spec for the "InferenceInfo"
-           object. TODO docs Found in the `inference_data` fields in mRNAs
-           and CDSs Fields: category - string - TODO type - string - TODO
-           evidence - string - TODO) -> structure: parameter "category" of
-           String, parameter "type" of String, parameter "evidence" of
-           String, parameter "dna_sequence" of String, parameter
-           "dna_sequence_length" of Long, parameter "aliases" of list of
-           tuple of size 2: parameter "fieldname" of String, parameter
-           "alias" of String, parameter "db_xrefs" of list of tuple of size
-           2: parameter "db_source" of String, parameter "db_identifier" of
-           String, parameter "cdss" of list of type "CDS" (Structure for a
-           single coding sequence. Coding sequences are the sections of a
-           feature's sequence that are translated to a protein (minus introns
-           and UTRs). Fields: id - string - identifier of the coding
-           sequence, such as "b0001_CDS_1" location - list<tuple<string, int,
-           string, int>> - list of locations from where this sequence
-           originates in the original assembly. Each sub-sequence in the list
-           constitutes a section of the resulting CDS. The first element in
-           the tuple corresponds to the "contig_id", such as "NC_000913.3".
-           The second element in the tuple is an index in the contig of where
-           the sequence starts. The third element is either a plus or minus
-           sign indicating whether it is on the 5' to 3' leading strand ("+")
-           or on the 3' to 5' lagging strand ("-"). The last element is the
-           length of the sub-sequence. For a location on the leading strand
-           (denoted by "+"), the index is of the leftmost base, and the
-           sequence extends to the right. For a location on the lagging
-           strand (denoted by "-"), the index is of the rightmost base, and
-           the sequence extends to the left. NOTE: the last element in each
-           tuple is the *length* of each sub-sequence. If you have a location
-           such as ("xyz", 100, "+", 50), then your sequence will go from
-           index 100 to index 149 (this has a length of 50). It *does not* go
-           from index 100 to index 150, as that would have a length of 51.
-           Likewise, if you have the location ("xyz", 100, "-", 50), then the
-           sequence extends from 100 down to 51, which has a length of 50
-           bases. It does not go from index 100 to 50, as that would have a
-           length of 51. md5 - string - md5 of the dna sequence - TODO
-           clarification protein_md5 - string - hash of the protein sequence
-           that this CDS encodes parent_gene - string - gene (feature) from
-           which this CDS comes from, including introns and UTRs that have
-           been removed to create this CDS. parent_mrna - string - mRNA
-           sequence from which this sequence is derived, including UTRs but
-           not introns. note - string - TODO functions - list<string> - list
-           of protein products or chemical processes that this sequence
-           creates, facilitates, or influences. functional_descriptions -
-           list<string> - TODO list of protein products or chemical processes
-           that sequence creates, facilitates, or influences. ontology_terms
-           - mapping<string, mapping<string, list<int>>> - a mapping of
-           ontology source id (eg. "GO") to a mapping of term IDs (eg
-           "GO:16209") to a list of indexes into the ontology_events data
-           (found in the top level of the genome object). The index into an
-           ontology event indicates what service and method created this term
-           assignment. flags - list<string>  - (controlled vocab) fields from
-           the genbank source. A common example is "pseudo" for pseudo-genes
-           that do not encode proteins, which shows up as "/pseudo" in the
-           genbank. Values can be: "pseudo", "ribosomal_slippage",
-           "trans_splicing" warnings - list<string> - TODO inference_data -
-           list<InferenceInfo> - TODO protein_translation - string - amino
-           acid sequence that this CDS gets translated into.
-           protein_translation_length - int - length of the above aliases -
-           list<(string, string)> - alternative list of names or identifiers
-           eg: [["gene", "thrA"], ["locus_tag", "b0002"]] db_xrefs -
-           list<(string, string)> - Identifiers from other databases
-           (database cross-references) The first string is the database name,
-           the second is the database identifier. eg: [["ASAP",
-           "ABE-0000006"], ["EcoGene", "EG11277"]] dna_sequence - string -
-           sequence of exons from the genome that constitute this protein
-           encoding sequence. dna_sequence_length - int - length of the above
-           @optional parent_gene parent_mrna functions ontology_terms note
-           flags warnings @optional inference_data dna_sequence aliases
-           db_xrefs functional_descriptions) -> structure: parameter "id" of
-           type "cds_id" (KBase CDS ID @id external), parameter "location" of
-           list of tuple of size 4: type "Contig_id" (ContigSet contig ID @id
-           external), Long, String, Long, parameter "md5" of String,
-           parameter "protein_md5" of String, parameter "parent_gene" of type
-           "Feature_id" (KBase Feature ID @id external), parameter
-           "parent_mrna" of type "mrna_id" (KBase mRNA ID @id external),
-           parameter "note" of String, parameter "functions" of list of
-           String, parameter "functional_descriptions" of list of String,
-           parameter "ontology_terms" of mapping from String to mapping from
-           String to list of Long, parameter "flags" of list of String,
-           parameter "warnings" of list of String, parameter "inference_data"
-           of list of type "InferenceInfo" (Type spec for the "InferenceInfo"
-           object. TODO docs Found in the `inference_data` fields in mRNAs
-           and CDSs Fields: category - string - TODO type - string - TODO
-           evidence - string - TODO) -> structure: parameter "category" of
-           String, parameter "type" of String, parameter "evidence" of
-           String, parameter "protein_translation" of String, parameter
-           "protein_translation_length" of Long, parameter "aliases" of list
-           of tuple of size 2: parameter "fieldname" of String, parameter
-           "alias" of String, parameter "db_xrefs" of list of tuple of size
-           2: parameter "db_source" of String, parameter "db_identifier" of
-           String, parameter "dna_sequence" of String, parameter
-           "dna_sequence_length" of Long, parameter "mrnas" of list of type
-           "mRNA" (The mRNA is the transcribed sequence from the original
-           feature, minus the introns, but including the UTRs. Fields: id -
-           string - identifying string for the mRNA location -
-           list<tuple<string, int, string, int>> - list of locations from
-           where this sequence originates in the original assembly. Each
-           sub-sequence in the list constitutes a section of the resulting
-           CDS. The first element in the tuple corresponds to the
-           "contig_id", such as "NC_000913.3". The second element in the
-           tuple is an index in the contig of where the sequence starts. The
-           third element is either a plus or minus sign indicating whether it
-           is on the 5' to 3' leading strand ("+") or on the 3' to 5' lagging
-           strand ("-"). The last element is the length of the sub-sequence.
-           For a location on the leading strand (denoted by "+"), the index
-           is of the leftmost base, and the sequence extends to the right.
-           For a location on the lagging strand (denoted by "-"), the index
-           is of the rightmost base, and the sequence extends to the left.
-           NOTE: the last element in each tuple is the *length* of each
-           sub-sequence. If you have a location such as ("xyz", 100, "+",
-           50), then your sequence will go from index 100 to index 149 (this
-           has a length of 50). It *does not* go from index 100 to index 150,
-           as that would have a length of 51. Likewise, if you have the
-           location ("xyz", 100, "-", 50), then the sequence extends from 100
-           down to 51, which has a length of 50 bases. It does not go from
-           index 100 to 50, as that would have a length of 51. md5 - string -
-           md5 of the dna sequence - TODO clarification parent_gene -
-           Feature_id - corresponding feature for this sequence, including
-           introns and UTRs cds - string - corresponding coding sequence for
-           this mRNA (the sequence minus UTRs) dna_sequence - string -
-           sequence of UTRs and exons from the genome that constitute this
-           mRNA dna_sequence_length - int - length of the above note - string
-           - TODO functions - list<string> - TODO list of protein products or
-           chemical processes that sequence creates, facilitates, or
-           influences. functional_descriptions - list<string> - TODO list of
-           protein products or chemical processes that sequence creates,
-           facilitates, or influences. ontology_terms - mapping<string,
-           mapping<string, list<int>>> - a mapping of ontology source id (eg.
-           "GO") to a mapping of term IDs (eg "GO:16209") to a list of
-           indexes into the ontology_events data (found in the top level of
-           the genome object). The index into an ontology event indicates
-           what service and method created this term assignment. flags -
-           list<string> - controlled vocab - fields from the genbank source.
-           A common example is "pseudo" for pseudo-genes that do not encode
-           proteins, which shows up as "/pseudo" in the genbank. Values can
-           be: "pseudo", "ribosomal_slippage", "trans_splicing" warnings -
-           list<string> - TODO inference_data - list<InferenceInfo> - TODO
-           aliases - list<(string, string)> - alternative list of names or
-           identifiers eg: [["gene", "thrA"], ["locus_tag", "b0002"]]
-           db_xrefs - list<(string, string)> - Identifiers from other
-           databases (database cross-references). The first string is the
-           database name, the second is the database identifier. eg:
-           [["ASAP", "ABE-0000006"], ["EcoGene", "EG11277"]] @optional
-           parent_gene cds functions ontology_terms note flags warnings
-           @optional inference_data dna_sequence aliases db_xrefs
-           functional_descriptions) -> structure: parameter "id" of type
-           "mrna_id" (KBase mRNA ID @id external), parameter "location" of
-           list of tuple of size 4: type "Contig_id" (ContigSet contig ID @id
-           external), Long, String, Long, parameter "md5" of String,
-           parameter "parent_gene" of type "Feature_id" (KBase Feature ID @id
-           external), parameter "cds" of type "cds_id" (KBase CDS ID @id
-           external), parameter "dna_sequence" of String, parameter
-           "dna_sequence_length" of Long, parameter "note" of String,
-           parameter "functions" of list of String, parameter
-           "functional_descriptions" of list of String, parameter
-           "ontology_terms" of mapping from String to mapping from String to
-           list of Long, parameter "flags" of list of String, parameter
-           "warnings" of list of String, parameter "inference_data" of list
-           of type "InferenceInfo" (Type spec for the "InferenceInfo" object.
-           TODO docs Found in the `inference_data` fields in mRNAs and CDSs
-           Fields: category - string - TODO type - string - TODO evidence -
-           string - TODO) -> structure: parameter "category" of String,
-           parameter "type" of String, parameter "evidence" of String,
-           parameter "aliases" of list of tuple of size 2: parameter
-           "fieldname" of String, parameter "alias" of String, parameter
-           "db_xrefs" of list of tuple of size 2: parameter "db_source" of
-           String, parameter "db_identifier" of String, parameter
-           "assembly_ref" of type "Assembly_ref" (Reference to an Assembly
-           object in the workspace @id ws KBaseGenomeAnnotations.Assembly),
-           parameter "taxon_ref" of type "Taxon_ref" (Reference to a taxon
-           object @id ws KBaseGenomeAnnotations.Taxon), parameter
-           "genbank_handle_ref" of type "genbank_handle_ref" (Reference to a
-           handle to the Genbank file on shock @id handle), parameter
-           "gff_handle_ref" of type "gff_handle_ref" (Reference to a handle
-           to the GFF file on shock @id handle), parameter
-           "external_source_origination_date" of String, parameter "release"
-           of String, parameter "original_source_file_name" of String,
-           parameter "notes" of String, parameter "quality_scores" of list of
-           type "GenomeQualityScore" (Genome quality score Fields: method -
-           string - TODO method_report_ref - string - TODO method_version -
-           string - TODO score: string - TODO score_interpretation - string -
-           TODO timestamp - string - TODO Score_interpretation -
-           fraction_complete - controlled vocabulary managed by API @optional
-           method_report_ref method_version) -> structure: parameter "method"
-           of String, parameter "method_report_ref" of type
-           "Method_report_ref" (Reference to a report object @id ws
-           KBaseReport.Report), parameter "method_version" of String,
-           parameter "score" of String, parameter "score_interpretation" of
-           String, parameter "timestamp" of String, parameter "suspect" of
-           type "Bool", parameter "genome_type" of String, parameter "hidden"
-           of type "boolean" (A boolean - 0 for false, 1 for true. @range (0,
-           1)), parameter "upgrade" of type "boolean" (A boolean - 0 for
-           false, 1 for true. @range (0, 1))
-        :returns: instance of type "SaveGenomesResults" -> structure:
-           parameter "results" of list of type "SaveGenomeResult" ->
-           structure: parameter "info" of type "object_info" (Information
-           about an object, including user provided metadata. obj_id objid -
-           the numerical id of the object. obj_name name - the name of the
-           object. type_string type - the type of the object. timestamp
-           save_date - the save date of the object. obj_ver ver - the version
-           of the object. username saved_by - the user that saved or copied
-           the object. ws_id wsid - the workspace containing the object.
-           ws_name workspace - the workspace containing the object. string
-           chsum - the md5 checksum of the object. int size - the size of the
-           object in bytes. usermeta meta - arbitrary user-supplied metadata
-           about the object.) -> tuple of size 11: parameter "objid" of type
-           "obj_id" (The unique, permanent numerical ID of an object.),
-           parameter "name" of type "obj_name" (A string used as a name for
-           an object. Any string consisting of alphanumeric characters and
-           the characters |._- that is not an integer is acceptable.),
-           parameter "type" of type "type_string" (A type string. Specifies
-           the type and its version in a single string in the format
-           [module].[typename]-[major].[minor]: module - a string. The module
-           name of the typespec containing the type. typename - a string. The
-           name of the type as assigned by the typedef statement. major - an
-           integer. The major version of the type. A change in the major
-           version implies the type has changed in a non-backwards compatible
-           way. minor - an integer. The minor version of the type. A change
-           in the minor version implies that the type has changed in a way
-           that is backwards compatible with previous type definitions. In
-           many cases, the major and minor versions are optional, and if not
-           provided the most recent version will be used. Example:
-           MyModule.MyType-3.1), parameter "save_date" of type "timestamp" (A
-           time in the format YYYY-MM-DDThh:mm:ssZ, where Z is either the
-           character Z (representing the UTC timezone) or the difference in
-           time to UTC in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500
-           (EST time) 2013-04-03T08:56:32+0000 (UTC time)
-           2013-04-03T08:56:32Z (UTC time)), parameter "version" of Long,
-           parameter "saved_by" of type "username" (Login name of a KBase
-           user account.), parameter "wsid" of type "ws_id" (The unique,
-           permanent numerical ID of a workspace.), parameter "workspace" of
-           type "ws_name" (A string used as a name for a workspace. Any
-           string consisting of alphanumeric characters and "_", ".", or "-"
-           that is not an integer is acceptable. The name may optionally be
-           prefixed with the workspace owner's user name and a colon, e.g.
-           kbasetest:my_workspace.), parameter "chsum" of String, parameter
-           "size" of Long, parameter "meta" of type "usermeta" (User provided
-           metadata about an object. Arbitrary key-value pairs provided by
-           the user.) -> mapping from String to String
-        """
-        # ctx is the context object
-        # return variables are: results
-        #BEGIN save_genomes
-        results = {
-            "results": GenomeInterface(self.cfg).save_genome_mass(params)
-        }
-        #END save_genomes
-
-        # At some point might do deeper type checking...
-        if not isinstance(results, dict):
-            raise ValueError('Method save_genomes return value ' +
-                             'results is not type dict as required.')
-        # return the results
-        return [results]
-
     def ws_obj_gff_to_genome(self, ctx, params):
         """
         This function takes in a workspace object of type KBaseGenomes.Genome or KBaseGenomeAnnotations.Assembly and a gff file and produces a KBaseGenomes.Genome reanotated according to the the input gff file.
diff --git a/lib/GenomeFileUtil/GenomeFileUtilServer.py b/lib/GenomeFileUtil/GenomeFileUtilServer.py
index 2c09c2e0..6ae2c611 100644
--- a/lib/GenomeFileUtil/GenomeFileUtilServer.py
+++ b/lib/GenomeFileUtil/GenomeFileUtilServer.py
@@ -394,10 +394,6 @@ def __init__(self):
                              name='GenomeFileUtil.save_one_genome',
                              types=[dict])
         self.method_authentication['GenomeFileUtil.save_one_genome'] = 'required'  # noqa
-        self.rpc_service.add(impl_GenomeFileUtil.save_genomes,
-                             name='GenomeFileUtil.save_genomes',
-                             types=[dict])
-        self.method_authentication['GenomeFileUtil.save_genomes'] = 'required'  # noqa
         self.rpc_service.add(impl_GenomeFileUtil.ws_obj_gff_to_genome,
                              name='GenomeFileUtil.ws_obj_gff_to_genome',
                              types=[dict])

From 04aa20302e6deee46b1a318330dd884f167a5a6f Mon Sep 17 00:00:00 2001
From: Sijie <sijiex@lbl.gov>
Date: Fri, 16 Aug 2024 17:33:16 -0700
Subject: [PATCH 05/24] add tests for save_genome_mass function

---
 test/problematic_tests/save_genome_test.py | 85 +++++++++++++++++++++-
 1 file changed, 82 insertions(+), 3 deletions(-)

diff --git a/test/problematic_tests/save_genome_test.py b/test/problematic_tests/save_genome_test.py
index d2582d28..93b2ea31 100644
--- a/test/problematic_tests/save_genome_test.py
+++ b/test/problematic_tests/save_genome_test.py
@@ -92,6 +92,7 @@ def setUpClass(cls):
         suffix = int(time.time() * 1000)
         cls.wsName = "test_SaveGenomeTest_" + str(suffix)
         cls.wsClient.create_workspace({'workspace': cls.wsName})
+        cls.wsID = cls.dfu.ws_name_to_id(cls.wsName)
 
         cls.nodes_to_delete = []
         cls.prepare_data()
@@ -142,9 +143,12 @@ def start_test(self):
         testname = inspect.stack()[1][3]
         print(('\n*** starting test: ' + testname + ' **'))
 
-    def fail_save_one_genome(self, params, error, exception=ValueError, contains=False):
+    def fail_save_genome(self, params, error, exception=ValueError, contains=False, mass=False):
         with self.assertRaises(exception) as context:
-            self.getImpl().save_one_genome(self.ctx, params)
+            if mass:
+                self.genome_interface.save_genome_mass(params)
+            else:
+                self.getImpl().save_one_genome(self.ctx, params)
         if contains:
             self.assertIn(error, str(context.exception))
         else:
@@ -164,7 +168,7 @@ def test_bad_one_genome_params(self):
                              'name': 'name',
                              'data': 'data'}
         error_msg = "Exactly one of a 'workspace_id' or a 'workspace' parameter must be provided"
-        self.fail_save_one_genome(invalidate_params, error_msg)
+        self.fail_save_genome(invalidate_params, error_msg)
 
     def test_one_genome(self):
         self.start_test()
@@ -192,6 +196,81 @@ def test_one_genome_with_hidden(self):
         ret = self.getImpl().save_one_genome(self.ctx, params)[0]
         self.check_save_one_genome_output(ret, genome_name)
 
+    def test_genomes(self):
+        self.start_test()
+        genome_name = 'test_genome'
+        inputs = [
+            {
+                'name': genome_name,
+                'data': self.test_genome_data,
+            }
+        ]
+        params = {'workspace_id': self.wsID, 'inputs': inputs}
+        ret = self.genome_interface.save_genome_mass(params)[0]
+        self.check_save_one_genome_output(ret, genome_name)
+
+    def test_genomes_with_hidden(self):
+        self.start_test()
+        genome_name = 'test_genome_hidden'
+        inputs = [
+            {
+                'name': genome_name,
+                'data': self.test_genome_data,
+                'hidden': 1,
+            }
+        ]
+        params = {'workspace_id': self.wsID, 'inputs': inputs}
+        ret = self.genome_interface.save_genome_mass(params)[0]
+        self.check_save_one_genome_output(ret, genome_name)
+
+        inputs = [
+            {
+                'name': genome_name,
+                'data': self.test_genome_data,
+                'hidden': True,
+            }
+        ]
+        params = {'workspace_id': self.wsID, 'inputs': inputs}
+        ret = self.genome_interface.save_genome_mass(params)[0]
+        self.check_save_one_genome_output(ret, genome_name)
+
+    def test_bad_genomes_params_missing_wsid(self):
+        self.start_test()
+        invalidate_params = {
+            'missing_workspace_id': 'workspace_id',
+            'name': 'name',
+            'data': 'data',
+        }
+        error_msg = "workspace_id is required"
+        self.fail_save_genome(invalidate_params, error_msg, mass=True)
+
+    def test_bad_genomes_params_empty_inputs(self):
+        self.start_test()
+        invalidate_params = {
+            'workspace_id': self.wsID,
+            'inputs': []
+        }
+        error_msg = "inputs field is required and must be a non-empty list"
+        self.fail_save_genome(invalidate_params, error_msg, mass=True)
+
+    def test_bad_genomes_params_invalidate_entry_type(self):
+        self.start_test()
+        invalidate_params = {
+            'workspace_id': self.wsID,
+            'inputs': [['name', 'data']],
+        }
+        error_msg = "Entry #1 in inputs field is not a mapping as required"
+        self.fail_save_genome(invalidate_params, error_msg, mass=True)
+
+    def test_bad_genomes_params_missing_parameter(self):
+        self.start_test()
+        invalidate_params = {
+            'workspace_id': self.wsID,
+            'inputs': [{'data': 'data'}],
+        }
+        error_msg = "Entry #1 in inputs field has invalid params: name parameter is required, but missing"
+        self.fail_save_genome(invalidate_params, error_msg, mass=True)
+
     def test_GenomeInterface_check_dna_sequence_in_features(self):
         # no feature in genome
         genome = {'missing_features': 'features'}

From 8d672d17d45ff7dfb0150d3c4fc080a5bca43b16 Mon Sep 17 00:00:00 2001
From: Sijie <sijiex@lbl.gov>
Date: Fri, 16 Aug 2024 22:40:31 -0700
Subject: [PATCH 06/24] fix bug: read 'hidden' from input_params, not params

---
 lib/GenomeFileUtil/core/GenomeInterface.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/GenomeFileUtil/core/GenomeInterface.py b/lib/GenomeFileUtil/core/GenomeInterface.py
index 14152493..990a19a8 100644
--- a/lib/GenomeFileUtil/core/GenomeInterface.py
+++ b/lib/GenomeFileUtil/core/GenomeInterface.py
@@ -239,7 +239,7 @@ def _save_genome_mass(self, params):
             # dump genome to scratch for upload
             data_path = os.path.join(self.scratch, name + ".json")
             json.dump(data, open(data_path, 'w'))
-            if 'hidden' in params and str(params['hidden']).lower() in ('yes', 'true', 't', '1'):
+            if 'hidden' in input_params and str(input_params['hidden']).lower() in ('yes', 'true', 't', '1'):
                 hidden = 1
             else:
                 hidden = 0

From d1dd768292bfff29067c45e34e2c6ad0c15f2e21 Mon Sep 17 00:00:00 2001
From: Sijie <sijiex@lbl.gov>
Date: Mon, 26 Aug 2024 14:42:37 -0700
Subject: [PATCH 07/24] update release notes && make the dicts in the loop

---
 RELEASE_NOTES.md                           |  2 +-
 lib/GenomeFileUtil/core/GenomeInterface.py | 31 +++++++++++-----------
 2 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md
index 91c21b83..c0e69fda 100644
--- a/RELEASE_NOTES.md
+++ b/RELEASE_NOTES.md
@@ -6,7 +6,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
 ## [0.11.7] - TBD
-- The internal method `save_genome_mass` was added to facilitate the batch saving of genomes
+- Genomes are now saved in batches to the workspace
 - Unusable `export_genome_features_protein_to_fasta` function was removed
 - The `genbanks_to_genomes` method was added to allow users to upload multiple
 genome objects at once
diff --git a/lib/GenomeFileUtil/core/GenomeInterface.py b/lib/GenomeFileUtil/core/GenomeInterface.py
index 990a19a8..66f6bd0a 100644
--- a/lib/GenomeFileUtil/core/GenomeInterface.py
+++ b/lib/GenomeFileUtil/core/GenomeInterface.py
@@ -93,22 +93,23 @@ def _save_genome_objects(
         meta_data,
         hidden_data,
     ):
-        ws_inputs = []
-        for ws_datatype, data_path, name, meta, hidden in zip(
-            ws_datatypes, data_paths, names, meta_data, hidden_data
-        ):
-            ws_inputs.append(
-                {
-                    'type': ws_datatype,
-                    'data_json_file': data_path,
-                    'name': name,
-                    'meta': meta,
-                    'hidden': hidden,
-                }
-            )
-        return self.ws_large_data.save_objects(
-            {'id': workspace_id, 'objects': ws_inputs}
+        dfu_infos = self.ws_large_data.save_objects(
+            {
+                'id': workspace_id,
+                'objects': [
+                    {
+                        'type': ws_datatype,
+                        'data_json_file': data_path,
+                        'name': name,
+                        'meta': meta,
+                        'hidden': hidden,
+                    } for ws_datatype, data_path, name, meta, hidden in zip(
+                        ws_datatypes, data_paths, names, meta_data, hidden_data
+                    )
+                ]
+            }
         )
+        return dfu_infos
 
     def _check_shock_response(self, response, errtxt):
         """

From e79c297ab6522ee00faceb4bfb531d232dbb5b75 Mon Sep 17 00:00:00 2001
From: Sijie <sijiex@lbl.gov>
Date: Mon, 26 Aug 2024 15:23:53 -0700
Subject: [PATCH 08/24] remove logging && add NOTE for workspace_datatype

---
 lib/GenomeFileUtil/core/GenomeInterface.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/GenomeFileUtil/core/GenomeInterface.py b/lib/GenomeFileUtil/core/GenomeInterface.py
index 66f6bd0a..321ae71b 100644
--- a/lib/GenomeFileUtil/core/GenomeInterface.py
+++ b/lib/GenomeFileUtil/core/GenomeInterface.py
@@ -78,7 +78,6 @@ def _validate_genome_input_params(self, genome_input):
         """
         Check required parameters are in genome_input
         """
-        logging.info("start validating genome_input params")
         # check for required parameters
         for p in ["name", "data"]:
             if p not in genome_input:
@@ -213,6 +212,7 @@ def _save_genome_mass(self, params):
             data = input_params['data']
 
             # XXX there is no `workspace_datatype` param in the spec
+            # NOTE: The method caller should not be able to choose an arbitrary workspace type
             ws_datatype = input_params.get('workspace_datatype', "KBaseGenomes.Genome")
             # XXX there is no `meta` param in the spec
             meta = input_params.get('meta', {})

From 35030a03fe3271681dca2fd9c924ac59ba501354 Mon Sep 17 00:00:00 2001
From: Sijie <sijiex@lbl.gov>
Date: Wed, 28 Aug 2024 15:24:54 -0700
Subject: [PATCH 09/24] move set_up_single_params && validate_mass_params into
 GenomeUtils

---
 lib/GenomeFileUtil/core/GenbankToGenome.py | 40 ++--------
 lib/GenomeFileUtil/core/GenomeInterface.py | 48 +++--------
 lib/GenomeFileUtil/core/GenomeUtils.py     | 93 +++++++++++++++++++++-
 test/problematic_tests/save_genome_test.py | 72 ++++++++---------
 4 files changed, 146 insertions(+), 107 deletions(-)

diff --git a/lib/GenomeFileUtil/core/GenbankToGenome.py b/lib/GenomeFileUtil/core/GenbankToGenome.py
index 606f83e3..ffdd55c3 100644
--- a/lib/GenomeFileUtil/core/GenbankToGenome.py
+++ b/lib/GenomeFileUtil/core/GenbankToGenome.py
@@ -19,11 +19,11 @@
 from installed_clients.AssemblyUtilClient import AssemblyUtil
 from installed_clients.DataFileUtilClient import DataFileUtil
 from GenomeFileUtil.core.GenomeInterface import GenomeInterface
-from GenomeFileUtil.core.MiscUtils import get_int
 from installed_clients.WorkspaceClient import Workspace
 from GenomeFileUtil.core.GenomeUtils import (
     is_parent, propagate_cds_props_to_gene, warnings, parse_inferences,
-    load_ontology_mappings, set_taxon_data, set_default_taxon_data
+    load_ontology_mappings, set_taxon_data, set_default_taxon_data,
+    set_up_single_params, validate_mass_params
 )
 
 MAX_MISC_FEATURE_SIZE = 10000
@@ -114,44 +114,16 @@ def __init__(self, config):
 
     def import_genbank(self, params):
         print('validating parameters')
-        mass_params = self._set_up_single_params(params)
+        mass_params = set_up_single_params(
+            params, _WSNAME, self._validate_params, self.dfu.ws_name_to_id
+        )
         return self._import_genbank_mass(mass_params)[0]
 
     def import_genbank_mass(self, params):
         print('validating parameters')
-        self._validate_mass_params(params)
+        validate_mass_params(params, self._validate_params)
         return self._import_genbank_mass(params)
 
-    def _set_up_single_params(self, params):
-        # avoid side effects and keep variables in params unmodfied
-        inputs = dict(params)
-        self._validate_params(inputs)
-        ws_id = get_int(inputs.pop(_WSID, None), _WSID)
-        ws_name = inputs.pop(_WSNAME, None)
-        if (bool(ws_id) == bool(ws_name)):  # xnor
-            raise ValueError(f"Exactly one of a '{_WSID}' or a '{_WSNAME}' parameter must be provided")
-        if not ws_id:
-            print(f"Translating workspace name {ws_name} to a workspace ID. Prefer submitting "
-                  + "a workspace ID over a mutable workspace name that may cause race conditions")
-            ws_id = self.dfu.ws_name_to_id(ws_name)
-        mass_params = {_WSID: ws_id, _INPUTS: [inputs]}
-        return mass_params
-
-    def _validate_mass_params(self, params):
-        ws_id = get_int(params.get(_WSID), _WSID)
-        if not ws_id:
-            raise ValueError(f"{_WSID} is required")
-        inputs = params.get(_INPUTS)
-        if not inputs or type(inputs) is not list:
-            raise ValueError(f"{_INPUTS} field is required and must be a non-empty list")
-        for i, inp in enumerate(inputs, start=1):
-            if type(inp) is not dict:
-                raise ValueError(f"Entry #{i} in {_INPUTS} field is not a mapping as required")
-            try:
-                self._validate_params(inp)
-            except Exception as e:
-                raise ValueError(f"Entry #{i} in {_INPUTS} field has invalid params: {e}") from e
-
     def _import_genbank_mass(self, params):
 
         workspace_id = params[_WSID]
diff --git a/lib/GenomeFileUtil/core/GenomeInterface.py b/lib/GenomeFileUtil/core/GenomeInterface.py
index 321ae71b..33b0ff07 100644
--- a/lib/GenomeFileUtil/core/GenomeInterface.py
+++ b/lib/GenomeFileUtil/core/GenomeInterface.py
@@ -12,8 +12,10 @@
 from installed_clients.AssemblySequenceAPIServiceClient import AssemblySequenceAPI
 from installed_clients.DataFileUtilClient import DataFileUtil
 from installed_clients.WSLargeDataIOClient import WsLargeDataIO
-from GenomeFileUtil.core import GenomeUtils
-from GenomeFileUtil.core.MiscUtils import get_int
+from GenomeFileUtil.core.GenomeUtils import (
+    set_taxon_data, set_default_taxon_data, sort_dict,
+    set_up_single_params, validate_mass_params
+)
 
 MAX_GENOME_SIZE = 2**30
 
@@ -38,42 +40,16 @@ def __init__(self, config):
         self.ws_large_data = WsLargeDataIO(self.callback_url)
 
     def save_one_genome(self, params):
-        mass_params = self._set_up_single_params(params)
+        mass_params = set_up_single_params(
+            params, _WS, self._validate_genome_input_params, self.dfu.ws_name_to_id
+        )
         return self._save_genome_mass(mass_params)[0]
 
+    # NOTE If there is more than 1GB of data or more than 10,000 genomes to upload, the workspace will fail.
     def save_genome_mass(self, params):
-        self._validate_mass_params(params)
+        validate_mass_params(params, self._validate_genome_input_params)
         return self._save_genome_mass(params)
 
-    def _set_up_single_params(self, params):
-        inputs = dict(params)
-        self._validate_genome_input_params(inputs)
-        ws_id = get_int(inputs.pop(_WSID, None), _WSID)
-        ws_name = inputs.pop(_WS, None)
-        if bool(ws_id) == bool(ws_name):  # xnor
-            raise ValueError(f"Exactly one of a '{_WSID}' or a '{_WS}' parameter must be provided")
-        if not ws_id:
-            print(f"Translating workspace name {ws_name} to a workspace ID. Prefer submitting "
-                  + "a workspace ID over a mutable workspace name that may cause race conditions")
-            ws_id = self.dfu.ws_name_to_id(ws_name)
-        mass_params = {_WSID: ws_id, _INPUTS: [inputs]}
-        return mass_params
-
-    def _validate_mass_params(self, params):
-        ws_id = get_int(params.get(_WSID), _WSID)
-        if not ws_id:
-            raise ValueError(f"{_WSID} is required")
-        inputs = params.get(_INPUTS)
-        if not inputs or type(inputs) != list:
-            raise ValueError(f"{_INPUTS} field is required and must be a non-empty list")
-        for i, inp in enumerate(inputs, start=1):
-            if type(inp) != dict:
-                raise ValueError(f"Entry #{i} in {_INPUTS} field is not a mapping as required")
-            try:
-                self._validate_genome_input_params(inp)
-            except Exception as e:
-                raise ValueError(f"Entry #{i} in {_INPUTS} field has invalid params: {e}") from e
-
     def _validate_genome_input_params(self, genome_input):
         """
         Check required parameters are in genome_input
@@ -236,7 +212,7 @@ def _save_genome_mass(self, params):
                 data['warnings'] = self.validate_genome(data)
 
             # sort data
-            data = GenomeUtils.sort_dict(data)
+            data = sort_dict(data)
             # dump genome to scratch for upload
             data_path = os.path.join(self.scratch, name + ".json")
             json.dump(data, open(data_path, 'w'))
@@ -311,9 +287,9 @@ def _update_genome(self, genome):
         # NOTE: Metagenome object does not have a 'taxon_assignments' field
         if 'taxon_assignments' in genome and genome['taxon_assignments'].get('ncbi'):
             tax_id = int(genome['taxon_assignments']['ncbi'])
-            GenomeUtils.set_taxon_data(tax_id, self.re_api_url, genome)
+            set_taxon_data(tax_id, self.re_api_url, genome)
         else:
-            GenomeUtils.set_default_taxon_data(genome)
+            set_default_taxon_data(genome)
 
         if any([x not in genome for x in ('dna_size', 'md5', 'gc_content', 'num_contigs')]):
             if 'assembly_ref' in genome:
diff --git a/lib/GenomeFileUtil/core/GenomeUtils.py b/lib/GenomeFileUtil/core/GenomeUtils.py
index 0ffeae29..d1b63935 100644
--- a/lib/GenomeFileUtil/core/GenomeUtils.py
+++ b/lib/GenomeFileUtil/core/GenomeUtils.py
@@ -3,13 +3,18 @@
 import os
 import re
 import time
+from typing import Callable, Dict, Any
 
 from relation_engine_client import REClient
 from relation_engine_client.exceptions import RENotFound
+from GenomeFileUtil.core.MiscUtils import get_int
 
 # Name of the ncbi taxonomy namespace stored in "taxon_assignments"
 _NCBI_TAX = 'ncbi'
 
+_WSID = 'workspace_id'
+_INPUTS = 'inputs'
+
 warnings = {
     "cds_excluded": "SUSPECT: CDS from {} was excluded because the associated "
                     "CDS failed coordinates validation",
@@ -482,4 +487,90 @@ def set_taxon_data(tax_id, re_api_url, genome_dict):
         )
     # Assign the scientific name to the most specific (right-most) taxon in the lineage
     genome_dict['scientific_name'] = sciname
- 
\ No newline at end of file
+
+
+def set_up_single_params(
+    params: Dict[str, Any],
+    ws: str,
+    validate_params_func: Callable[[Dict[str, Any]], None],
+    ws_name_to_id_func: Callable[[str], int]
+) -> Dict[str, Any]:
+    """
+    Converts single-item parameters into the batch ("mass") layout, resolving the workspace to an ID.
+
+    Args:
+        params (Dict[str, Any]): A dictionary where the keys are parameter names (strings) and the values
+            can be of any type. It is shallow-copied first, so the caller's dictionary is not modified.
+        ws (str): The key in `params` under which the workspace name is passed (it differs between callers).
+        validate_params_func (Callable[[Dict[str, Any]], None]): A function that takes a dictionary of parameters
+            and validates them. This function should raise an exception if the parameters are invalid.
+        ws_name_to_id_func (Callable[[str], int]): A function that takes a workspace name (string) and returns
+            its corresponding ID (integer).
+
+    Returns:
+        Dict[str, Any]: A dictionary with exactly two keys, 'workspace_id' and 'inputs' (the module constants
+            `_WSID` and `_INPUTS`). 'workspace_id' is the resolved workspace ID and 'inputs' is a one-element
+            list holding the copied parameters with the workspace ID and workspace name entries removed.
+
+    Raises:
+        ValueError: If neither or both the workspace ID and workspace name are provided in the parameters.
+        Exception: Whatever `validate_params_func` raises for invalid parameters is propagated unchanged.
+
+    Notes:
+        - If a workspace ID is not provided, the function will attempt to convert the workspace name to an ID
+          using `ws_name_to_id_func`.
+        - It is preferable to provide a workspace ID directly to avoid potential race conditions with mutable
+          workspace names.
+    """
+    inputs = dict(params)  # shallow copy: the pops below must not touch the caller's dict
+    validate_params_func(inputs)
+    ws_id = get_int(inputs.pop(_WSID, None), _WSID)
+    ws_name = inputs.pop(ws, None)
+    if bool(ws_id) == bool(ws_name):  # xnor
+        raise ValueError(f"Exactly one of a '{_WSID}' or a '{ws}' parameter must be provided")
+    if not ws_id:
+        print(f"Translating workspace name {ws_name} to a workspace ID. Prefer submitting "
+                + "a workspace ID over a mutable workspace name that may cause race conditions")
+        ws_id = ws_name_to_id_func(ws_name)
+    mass_params = {_WSID: ws_id, _INPUTS: [inputs]}
+    return mass_params
+
+
+def validate_mass_params(
+    params: Dict[str, Any],
+    validate_params_func: Callable[[Dict[str, Any]], None]
+) -> None:
+    """
+    Validates a batch ("mass") parameter dictionary: one workspace ID plus a list of per-item inputs.
+
+    Args:
+        params (Dict[str, Any]): A dictionary containing parameters to validate. Must include:
+            - 'workspace_id' (`_WSID`): A workspace ID; it is read through `get_int` and must be truthy.
+            - 'inputs' (`_INPUTS`): A non-empty list of dicts, each checked by `validate_params_func`.
+
+        validate_params_func (Callable[[Dict[str, Any]], None]): A function that takes a dictionary of parameters
+            and validates it. The function should raise an exception if the parameters are invalid.
+
+    Raises:
+        ValueError: If 'workspace_id' is missing or falsy, if 'inputs' is missing or not a non-empty list, or
+            if any entry in 'inputs' is not a dictionary or fails validation.
+
+    Notes:
+        - Container checks use `isinstance`, so `list` and `dict` subclasses (e.g. `OrderedDict`) are accepted.
+        - Any exception raised by `validate_params_func` is re-raised as a `ValueError` that names the
+          1-based index of the offending entry and chains the original exception.
+        - `params` is only inspected, never modified.
+    """
+    ws_id = get_int(params.get(_WSID), _WSID)
+    if not ws_id:
+        raise ValueError(f"{_WSID} is required")
+    inputs = params.get(_INPUTS)
+    if not inputs or not isinstance(inputs, list):
+        raise ValueError(f"{_INPUTS} field is required and must be a non-empty list")
+    for i, inp in enumerate(inputs, start=1):
+        if not isinstance(inp, dict):
+            raise ValueError(f"Entry #{i} in {_INPUTS} field is not a mapping as required")
+        try:
+            validate_params_func(inp)
+        except Exception as e:
+            raise ValueError(f"Entry #{i} in {_INPUTS} field has invalid params: {e}") from e
\ No newline at end of file
diff --git a/test/problematic_tests/save_genome_test.py b/test/problematic_tests/save_genome_test.py
index 93b2ea31..e09547bb 100644
--- a/test/problematic_tests/save_genome_test.py
+++ b/test/problematic_tests/save_genome_test.py
@@ -234,42 +234,42 @@ def test_genomes_with_hidden(self):
         ret = self.genome_interface.save_genome_mass(params)[0]
         self.check_save_one_genome_output(ret, genome_name)
 
-    def test_bad_genomes_params_missing_wsid(self):
-        self.start_test()
-        invalidate_params = {
-            'missing_workspace_id': 'workspace_id',
-            'name': 'name',
-            'data': 'data',
-        }
-        error_msg = "workspace_id is required"
-        self.fail_save_genome(invalidate_params, error_msg, mass=True)
-
-    def test_bad_genomes_params_empty_inputs(self):
-        self.start_test()
-        invalidate_params = {
-            'workspace_id': self.wsID,
-            'inputs': []
-        }
-        error_msg = "inputs field is required and must be a non-empty list"
-        self.fail_save_genome(invalidate_params, error_msg, mass=True)
-
-    def test_bad_genomes_params_invalidate_entry_type(self):
-        self.start_test()
-        invalidate_params = {
-            'workspace_id': self.wsID,
-            'inputs': [['name', 'data']],
-        }
-        error_msg = "Entry #1 in inputs field is not a mapping as required"
-        self.fail_save_genome(invalidate_params, error_msg, mass=True)
-
-    def test_bad_genomes_params_missing_parameter(self):
-        self.start_test()
-        invalidate_params = {
-            'workspace_id': self.wsID,
-            'inputs': [{'data': 'data'}],
-        }
-        error_msg = "Entry #1 in inputs field has invalid params: name parameter is required, but missing"
-        self.fail_save_genome(invalidate_params, error_msg, mass=True)
+    # def test_bad_genomes_params_missing_wsid(self):
+    #     self.start_test()
+    #     invalidate_params = {
+    #         'missing_workspace_id': 'workspace_id',
+    #         'name': 'name',
+    #         'data': 'data',
+    #     }
+    #     error_msg = "workspace_id is required"
+    #     self.fail_save_genome(invalidate_params, error_msg, mass=True)
+
+    # def test_bad_genomes_params_empty_inputs(self):
+    #     self.start_test()
+    #     invalidate_params = {
+    #         'workspace_id': self.wsID,
+    #         'inputs': []
+    #     }
+    #     error_msg = "inputs field is required and must be a non-empty list"
+    #     self.fail_save_genome(invalidate_params, error_msg, mass=True)
+
+    # def test_bad_genomes_params_invalidate_entry_type(self):
+    #     self.start_test()
+    #     invalidate_params = {
+    #         'workspace_id': self.wsID,
+    #         'inputs': [['name', 'data']],
+    #     }
+    #     error_msg = "Entry #1 in inputs field is not a mapping as required"
+    #     self.fail_save_genome(invalidate_params, error_msg, mass=True)
+
+    # def test_bad_genomes_params_missing_parameter(self):
+    #     self.start_test()
+    #     invalidate_params = {
+    #         'workspace_id': self.wsID,
+    #         'inputs': [{'data': 'data'}],
+    #     }
+    #     error_msg = "Entry #1 in inputs field has invalid params: name parameter is required, but missing"
+    #     self.fail_save_genome(invalidate_params, error_msg, mass=True)
 
     def test_GenomeInterface_check_dna_sequence_in_features(self):
         # no feature in genome

From 4b1b88bc9e389ba6a8fd9a21b5a0903abb2f6578 Mon Sep 17 00:00:00 2001
From: Sijie <sijiex@lbl.gov>
Date: Wed, 28 Aug 2024 16:21:33 -0700
Subject: [PATCH 10/24] remove redundant tests

---
 lib/GenomeFileUtil/core/GenomeInterface.py | 15 +++-----
 test/problematic_tests/save_genome_test.py | 44 ++++------------------
 2 files changed, 14 insertions(+), 45 deletions(-)

diff --git a/lib/GenomeFileUtil/core/GenomeInterface.py b/lib/GenomeFileUtil/core/GenomeInterface.py
index 33b0ff07..7f77218e 100644
--- a/lib/GenomeFileUtil/core/GenomeInterface.py
+++ b/lib/GenomeFileUtil/core/GenomeInterface.py
@@ -12,10 +12,7 @@
 from installed_clients.AssemblySequenceAPIServiceClient import AssemblySequenceAPI
 from installed_clients.DataFileUtilClient import DataFileUtil
 from installed_clients.WSLargeDataIOClient import WsLargeDataIO
-from GenomeFileUtil.core.GenomeUtils import (
-    set_taxon_data, set_default_taxon_data, sort_dict,
-    set_up_single_params, validate_mass_params
-)
+from GenomeFileUtil.core import GenomeUtils
 
 MAX_GENOME_SIZE = 2**30
 
@@ -40,14 +37,14 @@ def __init__(self, config):
         self.ws_large_data = WsLargeDataIO(self.callback_url)
 
     def save_one_genome(self, params):
-        mass_params = set_up_single_params(
+        mass_params = GenomeUtils.set_up_single_params(
             params, _WS, self._validate_genome_input_params, self.dfu.ws_name_to_id
         )
         return self._save_genome_mass(mass_params)[0]
 
     # NOTE If there is more than 1GB of data or more than 10,000 genomes to upload, the workspace will fail.
     def save_genome_mass(self, params):
-        validate_mass_params(params, self._validate_genome_input_params)
+        GenomeUtils.validate_mass_params(params, self._validate_genome_input_params)
         return self._save_genome_mass(params)
 
     def _validate_genome_input_params(self, genome_input):
@@ -212,7 +209,7 @@ def _save_genome_mass(self, params):
                 data['warnings'] = self.validate_genome(data)
 
             # sort data
-            data = sort_dict(data)
+            data = GenomeUtils.sort_dict(data)
             # dump genome to scratch for upload
             data_path = os.path.join(self.scratch, name + ".json")
             json.dump(data, open(data_path, 'w'))
@@ -287,9 +284,9 @@ def _update_genome(self, genome):
         # NOTE: Metagenome object does not have a 'taxon_assignments' field
         if 'taxon_assignments' in genome and genome['taxon_assignments'].get('ncbi'):
             tax_id = int(genome['taxon_assignments']['ncbi'])
-            set_taxon_data(tax_id, self.re_api_url, genome)
+            GenomeUtils.set_taxon_data(tax_id, self.re_api_url, genome)
         else:
-            set_default_taxon_data(genome)
+            GenomeUtils.set_default_taxon_data(genome)
 
         if any([x not in genome for x in ('dna_size', 'md5', 'gc_content', 'num_contigs')]):
             if 'assembly_ref' in genome:
diff --git a/test/problematic_tests/save_genome_test.py b/test/problematic_tests/save_genome_test.py
index e09547bb..c3c0e017 100644
--- a/test/problematic_tests/save_genome_test.py
+++ b/test/problematic_tests/save_genome_test.py
@@ -234,42 +234,14 @@ def test_genomes_with_hidden(self):
         ret = self.genome_interface.save_genome_mass(params)[0]
         self.check_save_one_genome_output(ret, genome_name)
 
-    # def test_bad_genomes_params_missing_wsid(self):
-    #     self.start_test()
-    #     invalidate_params = {
-    #         'missing_workspace_id': 'workspace_id',
-    #         'name': 'name',
-    #         'data': 'data',
-    #     }
-    #     error_msg = "workspace_id is required"
-    #     self.fail_save_genome(invalidate_params, error_msg, mass=True)
-
-    # def test_bad_genomes_params_empty_inputs(self):
-    #     self.start_test()
-    #     invalidate_params = {
-    #         'workspace_id': self.wsID,
-    #         'inputs': []
-    #     }
-    #     error_msg = "inputs field is required and must be a non-empty list"
-    #     self.fail_save_genome(invalidate_params, error_msg, mass=True)
-
-    # def test_bad_genomes_params_invalidate_entry_type(self):
-    #     self.start_test()
-    #     invalidate_params = {
-    #         'workspace_id': self.wsID,
-    #         'inputs': [['name', 'data']],
-    #     }
-    #     error_msg = "Entry #1 in inputs field is not a mapping as required"
-    #     self.fail_save_genome(invalidate_params, error_msg, mass=True)
-
-    # def test_bad_genomes_params_missing_parameter(self):
-    #     self.start_test()
-    #     invalidate_params = {
-    #         'workspace_id': self.wsID,
-    #         'inputs': [{'data': 'data'}],
-    #     }
-    #     error_msg = "Entry #1 in inputs field has invalid params: name parameter is required, but missing"
-    #     self.fail_save_genome(invalidate_params, error_msg, mass=True)
+    def test_bad_genomes_params_missing_parameter(self):
+        self.start_test()
+        invalidate_params = {
+            'workspace_id': self.wsID,
+            'inputs': [{'data': 'data'}],
+        }
+        error_msg = "Entry #1 in inputs field has invalid params: name parameter is required, but missing"
+        self.fail_save_genome(invalidate_params, error_msg, mass=True)
 
     def test_GenomeInterface_check_dna_sequence_in_features(self):
         # no feature in genome

From b56d77fb4b212bb2102e2e4a62371c751ba7edf2 Mon Sep 17 00:00:00 2001
From: Sijie <sijiex@lbl.gov>
Date: Tue, 3 Sep 2024 14:28:38 -0700
Subject: [PATCH 11/24] add test to cover the missing line

---
 test/problematic_tests/save_genome_test.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/test/problematic_tests/save_genome_test.py b/test/problematic_tests/save_genome_test.py
index c3c0e017..29d1754d 100644
--- a/test/problematic_tests/save_genome_test.py
+++ b/test/problematic_tests/save_genome_test.py
@@ -209,6 +209,21 @@ def test_genomes(self):
         ret = self.genome_interface.save_genome_mass(params)[0]
         self.check_save_one_genome_output(ret, genome_name)
 
+    def test_genomes_with_upgrade(self):
+        self.start_test()
+        genome_name = 'test_genome'
+        inputs = [
+            {
+                'name': genome_name,
+                'data': self.test_genome_data,
+                'ws_datatype': "KBaseMetagenomes.AnnotatedMetagenomeAssembly",
+                'upgrade': True,
+            }
+        ]
+        params = {'workspace_id': self.wsID, 'inputs': inputs}
+        ret = self.genome_interface.save_genome_mass(params)[0]
+        self.check_save_one_genome_output(ret, genome_name)
+
     def test_genomes_with_hidden(self):
         self.start_test()
         genome_name = 'test_genome_hidden'

From 788c7ee524ee3e355c41ab89ac50093b8509ced0 Mon Sep 17 00:00:00 2001
From: Sijie <sijiex@lbl.gov>
Date: Tue, 3 Sep 2024 15:19:14 -0700
Subject: [PATCH 12/24] fix params name typo

---
 test/problematic_tests/save_genome_test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/problematic_tests/save_genome_test.py b/test/problematic_tests/save_genome_test.py
index 29d1754d..c453bc87 100644
--- a/test/problematic_tests/save_genome_test.py
+++ b/test/problematic_tests/save_genome_test.py
@@ -216,7 +216,7 @@ def test_genomes_with_upgrade(self):
             {
                 'name': genome_name,
                 'data': self.test_genome_data,
-                'ws_datatype': "KBaseMetagenomes.AnnotatedMetagenomeAssembly",
+                'workspace_datatype': "KBaseMetagenomes.AnnotatedMetagenomeAssembly",
                 'upgrade': True,
             }
         ]

From 65467ea89c918b98d22bf3b55cb4578de0e6b634 Mon Sep 17 00:00:00 2001
From: Sijie <sijiex@lbl.gov>
Date: Thu, 5 Sep 2024 01:00:58 -0700
Subject: [PATCH 13/24] add metagenome json file && cover the missing line

---
 lib/GenomeFileUtil/core/GenomeInterface.py |  1 +
 test/data/metagenomes/toy/metagenome.json  | 53 ++++++++++++++++++++++
 test/problematic_tests/save_genome_test.py | 50 ++++++++++++++------
 3 files changed, 90 insertions(+), 14 deletions(-)
 create mode 100644 test/data/metagenomes/toy/metagenome.json

diff --git a/lib/GenomeFileUtil/core/GenomeInterface.py b/lib/GenomeFileUtil/core/GenomeInterface.py
index 7f77218e..07538bda 100644
--- a/lib/GenomeFileUtil/core/GenomeInterface.py
+++ b/lib/GenomeFileUtil/core/GenomeInterface.py
@@ -267,6 +267,7 @@ def _update_metagenome(self, genome):
         """Checks for missing required fields and fixes breaking changes"""
         if 'molecule_type' not in genome:
             genome['molecule_type'] = 'Unknown'
+        return genome
 
     def _update_genome(self, genome):
         """Checks for missing required fields and fixes breaking changes"""
diff --git a/test/data/metagenomes/toy/metagenome.json b/test/data/metagenomes/toy/metagenome.json
new file mode 100644
index 00000000..a3c8e5ba
--- /dev/null
+++ b/test/data/metagenomes/toy/metagenome.json
@@ -0,0 +1,53 @@
+{
+    "contig_ids": [
+        "Ga0065724_100001"
+    ],
+    "contig_lengths": [
+        538871
+    ],
+    "dna_size": 538871,
+    "domain": "Eukaryota",
+    "environment": null,
+    "external_source_origination_date": null,
+    "feature_counts": {
+        "CDS": 20,
+        "gene": 20,
+        "non_coding_features": 0,
+        "protein_encoding_gene": 20
+    },
+    "features_handle_ref": "KBH_736245",
+    "gc_content": 0.64469,
+    "genetic_code": 1,
+    "genome_type": "Metagenome",
+    "gff_handle_ref": "KBH_736244",
+    "id": "MyMetagenome",
+    "md5": "e2ccbd5a9bed0148015bd6b784e3c1c3",
+    "molecule_type": "SingleLetterAlphabet",
+    "notes": null,
+    "num_contigs": 1,
+    "num_features": 40,
+    "ontologies_present": {},
+    "ontology_events": [
+        {
+            "id": "GO",
+            "method": "GenomeFileUtils Genbank uploader from annotations",
+            "method_version": "0.11.7",
+            "ontology_ref": "KBaseOntology/gene_ontology",
+            "timestamp": "2024_09_05_06_45_31"
+        }
+    ],
+    "original_source_file_name": null,
+    "protein_handle_ref": "KBH_736243",
+    "publications": [],
+    "scientific_name": "Arabidopsis thaliana",
+    "source": "GFF",
+    "source_id": "unknown",
+    "suspect": 1,
+    "taxon_assignments": {
+        "ncbi": "3702"
+    },
+    "taxonomy": "cellular organisms; Eukaryota; Viridiplantae; Streptophyta; Streptophytina; Embryophyta; Tracheophyta; Euphyllophyta; Spermatophyta; Magnoliopsida; Mesangiospermae; eudicotyledons; Gunneridae; Pentapetalae; rosids; malvids; Brassicales; Brassicaceae; Camelineae; Arabidopsis",
+    "warnings": [
+        "SUSPECT: This genome has 20 genes that needed to be spoofed for existing parentless CDS."
+    ]
+}
\ No newline at end of file
diff --git a/test/problematic_tests/save_genome_test.py b/test/problematic_tests/save_genome_test.py
index c453bc87..5380738c 100644
--- a/test/problematic_tests/save_genome_test.py
+++ b/test/problematic_tests/save_genome_test.py
@@ -23,6 +23,9 @@
 from GenomeFileUtil.core.GenomeInterface import GenomeInterface
 from installed_clients.WorkspaceClient import Workspace as workspaceService
 
+KBASE_GENOME = "KBaseGenomes.Genome"
+KBASE_METAGENOME = "KBaseMetagenomes.AnnotatedMetagenomeAssembly"
+
 
 class SaveGenomeTest(unittest.TestCase):
 
@@ -115,17 +118,36 @@ def delete_shock_node(cls, node_id):
 
     @classmethod
     def prepare_data(cls):
-        assembly_file_path = os.path.join(cls.scratch,
-                                          'e_coli_assembly.fasta')
+
+        assembly_file_path = os.path.join(cls.scratch,'e_coli_assembly.fasta')
+        meta_file_path = os.path.join(cls.scratch,'metagenome.fa')
+
         shutil.copy('data/e_coli/e_coli_assembly.fasta', assembly_file_path)
+        shutil.copy('data/metagenomes/toy/metagenome.fa', meta_file_path)
+
         au = AssemblyUtil(os.environ['SDK_CALLBACK_URL'])
-        assembly_ref = au.save_assembly_from_fasta({
-            'workspace_name': cls.wsName,
-            'assembly_name': 'e_coli.assembly',
-            'file': {'path': assembly_file_path}
-        })
+
+        assembly_refs = au.save_assemblies_from_fastas(
+            {
+                'workspace_id': cls.wsID,
+                'inputs': [
+                    {
+                        'assembly_name': 'e_coli.assembly',
+                        'file': assembly_file_path
+                    },
+                    {
+                        'assembly_name': 'metagenome.assembly',
+                        'file': meta_file_path
+                    }
+                ]
+            }
+        )["results"]
+
         cls.test_genome_data = json.load(open('data/e_coli/e_coli.json'))
-        cls.test_genome_data['assembly_ref'] = assembly_ref
+        cls.test_genome_data['assembly_ref'] = assembly_refs[0]["upa"]
+
+        cls.test_metagenome_data = json.load(open('data/metagenomes/toy/metagenome.json'))
+        cls.test_metagenome_data['assembly_ref'] = assembly_refs[1]["upa"]
 
     def getWsClient(self):
         return self.__class__.wsClient
@@ -154,12 +176,12 @@ def fail_save_genome(self, params, error, exception=ValueError, contains=False,
         else:
             self.assertEqual(error, str(context.exception))
 
-    def check_save_one_genome_output(self, ret, genome_name):
+    def check_save_one_genome_output(self, ret, genome_name, data_type=KBASE_GENOME):
         self.assertTrue('info' in ret)
 
         genome_info = ret['info']
         self.assertEqual(genome_info[1], genome_name)
-        self.assertEqual(genome_info[2].split('-')[0], 'KBaseGenomes.Genome')
+        self.assertEqual(genome_info[2].split('-')[0], data_type)
         self.assertEqual(genome_info[5], self.user_id)
 
     def test_bad_one_genome_params(self):
@@ -211,18 +233,18 @@ def test_genomes(self):
 
     def test_genomes_with_upgrade(self):
         self.start_test()
-        genome_name = 'test_genome'
+        genome_name = 'MyMetagenome'
         inputs = [
             {
                 'name': genome_name,
-                'data': self.test_genome_data,
-                'workspace_datatype': "KBaseMetagenomes.AnnotatedMetagenomeAssembly",
+                'data': self.test_metagenome_data,
+                'workspace_datatype': KBASE_METAGENOME,
                 'upgrade': True,
             }
         ]
         params = {'workspace_id': self.wsID, 'inputs': inputs}
         ret = self.genome_interface.save_genome_mass(params)[0]
-        self.check_save_one_genome_output(ret, genome_name)
+        self.check_save_one_genome_output(ret, genome_name, data_type=KBASE_METAGENOME)
 
     def test_genomes_with_hidden(self):
         self.start_test()

From 66af0b756a8c19fe17cca9afdcc024fe3f56bef5 Mon Sep 17 00:00:00 2001
From: Sijie <sijiex@lbl.gov>
Date: Thu, 5 Sep 2024 08:13:12 -0700
Subject: [PATCH 14/24] rm gff_handle_ref

---
 test/data/metagenomes/toy/metagenome.json | 1 -
 1 file changed, 1 deletion(-)

diff --git a/test/data/metagenomes/toy/metagenome.json b/test/data/metagenomes/toy/metagenome.json
index a3c8e5ba..57e42815 100644
--- a/test/data/metagenomes/toy/metagenome.json
+++ b/test/data/metagenomes/toy/metagenome.json
@@ -19,7 +19,6 @@
     "gc_content": 0.64469,
     "genetic_code": 1,
     "genome_type": "Metagenome",
-    "gff_handle_ref": "KBH_736244",
     "id": "MyMetagenome",
     "md5": "e2ccbd5a9bed0148015bd6b784e3c1c3",
     "molecule_type": "SingleLetterAlphabet",

From fcbb510713e34b932c216082ebc04d3d4d1e0118 Mon Sep 17 00:00:00 2001
From: Sijie <sijiex@lbl.gov>
Date: Mon, 9 Sep 2024 17:37:22 -0700
Subject: [PATCH 15/24] add features_handle_ref && protein_handle_ref before
 upload

---
 test/data/metagenomes/toy/features_handle_ref |  1 +
 test/data/metagenomes/toy/metagenome.json     |  2 --
 test/data/metagenomes/toy/protein_handle_ref  |  1 +
 test/problematic_tests/save_genome_test.py    | 23 +++++++++++++++++++
 4 files changed, 25 insertions(+), 2 deletions(-)
 create mode 100644 test/data/metagenomes/toy/features_handle_ref
 create mode 100644 test/data/metagenomes/toy/protein_handle_ref

diff --git a/test/data/metagenomes/toy/features_handle_ref b/test/data/metagenomes/toy/features_handle_ref
new file mode 100644
index 00000000..f67a087d
--- /dev/null
+++ b/test/data/metagenomes/toy/features_handle_ref
@@ -0,0 +1 @@
+test features_handle_ref
\ No newline at end of file
diff --git a/test/data/metagenomes/toy/metagenome.json b/test/data/metagenomes/toy/metagenome.json
index 57e42815..95c23ef8 100644
--- a/test/data/metagenomes/toy/metagenome.json
+++ b/test/data/metagenomes/toy/metagenome.json
@@ -15,7 +15,6 @@
         "non_coding_features": 0,
         "protein_encoding_gene": 20
     },
-    "features_handle_ref": "KBH_736245",
     "gc_content": 0.64469,
     "genetic_code": 1,
     "genome_type": "Metagenome",
@@ -36,7 +35,6 @@
         }
     ],
     "original_source_file_name": null,
-    "protein_handle_ref": "KBH_736243",
     "publications": [],
     "scientific_name": "Arabidopsis thaliana",
     "source": "GFF",
diff --git a/test/data/metagenomes/toy/protein_handle_ref b/test/data/metagenomes/toy/protein_handle_ref
new file mode 100644
index 00000000..4e4a8e1f
--- /dev/null
+++ b/test/data/metagenomes/toy/protein_handle_ref
@@ -0,0 +1 @@
+test protein_handle_ref
\ No newline at end of file
diff --git a/test/problematic_tests/save_genome_test.py b/test/problematic_tests/save_genome_test.py
index 5380738c..70eff5c1 100644
--- a/test/problematic_tests/save_genome_test.py
+++ b/test/problematic_tests/save_genome_test.py
@@ -149,6 +149,29 @@ def prepare_data(cls):
         cls.test_metagenome_data = json.load(open('data/metagenomes/toy/metagenome.json'))
         cls.test_metagenome_data['assembly_ref'] = assembly_refs[1]["upa"]
 
+        # Move files to the share folder
+        fhr_path = os.path.join(cls.scratch,'features_handle_ref')
+        phr_path = os.path.join(cls.scratch,'protein_handle_ref')
+
+        shutil.copy('data/metagenomes/toy/features_handle_ref', fhr_path)
+        shutil.copy('data/metagenomes/toy/protein_handle_ref', phr_path)
+
+        # Upload files to the blobstore
+        handle_service_outputs = cls.dfu.file_to_shock_mass(
+            [
+                {'file_path': fhr_path, 'make_handle': 1, 'pack': 'gzip'},
+                {'file_path': phr_path, 'make_handle': 1, 'pack': 'gzip'}
+            ]
+        )
+
+        # Update metagenome
+        cls.test_metagenome_data["features_handle_ref"] = handle_service_outputs[0]["handle"]["hid"]
+        cls.test_metagenome_data["protein_handle_ref"]= handle_service_outputs[1]["handle"]["hid"]
+
+        # Delete shock_ids
+        cls.nodes_to_delete.append(handle_service_outputs[0]["shock_id"])
+        cls.nodes_to_delete.append(handle_service_outputs[1]["shock_id"])
+
     def getWsClient(self):
         return self.__class__.wsClient
 

From 32f96a04a98f333e79d315b0b302b75d9eab98c7 Mon Sep 17 00:00:00 2001
From: Sijie <sijiex@lbl.gov>
Date: Tue, 10 Sep 2024 16:06:14 -0700
Subject: [PATCH 16/24] add boolean flag for validate_genome

---
 lib/GenomeFileUtil/core/GenbankToGenome.py |  6 ++++++
 lib/GenomeFileUtil/core/GenomeInterface.py | 17 +++++++----------
 test/problematic_tests/save_genome_test.py |  4 ++--
 3 files changed, 15 insertions(+), 12 deletions(-)

diff --git a/lib/GenomeFileUtil/core/GenbankToGenome.py b/lib/GenomeFileUtil/core/GenbankToGenome.py
index ffdd55c3..d5e42b55 100644
--- a/lib/GenomeFileUtil/core/GenbankToGenome.py
+++ b/lib/GenomeFileUtil/core/GenbankToGenome.py
@@ -162,6 +162,12 @@ def _import_genbank_mass(self, params):
             # parse genbank file
             self._parse_genbank(genome_obj)
 
+            # check features
+            self.gi.check_dna_sequence_in_features(genome_obj.genome_data)
+
+            # validate genome
+            genome_obj.genome_data['warnings'] = self.gi.validate_genome(genome_obj.genome_data)
+
             # gather all objects
             genome_objs.append(genome_obj)
 
diff --git a/lib/GenomeFileUtil/core/GenomeInterface.py b/lib/GenomeFileUtil/core/GenomeInterface.py
index 07538bda..cec2b3e3 100644
--- a/lib/GenomeFileUtil/core/GenomeInterface.py
+++ b/lib/GenomeFileUtil/core/GenomeInterface.py
@@ -43,9 +43,9 @@ def save_one_genome(self, params):
         return self._save_genome_mass(mass_params)[0]
 
     # NOTE If there is more than 1GB of data or more than 10,000 genomes to upload, the workspace will fail.
-    def save_genome_mass(self, params):
+    def save_genome_mass(self, params, validate_genome=False):
         GenomeUtils.validate_mass_params(params, self._validate_genome_input_params)
-        return self._save_genome_mass(params)
+        return self._save_genome_mass(params, validate_genome=validate_genome)
 
     def _validate_genome_input_params(self, genome_input):
         """
@@ -128,11 +128,10 @@ def _own_handle(self, genome_data, handle_property):
                 handle_id = dfu_shock['handle']['hid']
                 genome_data[handle_property] = handle_id
 
-    def _check_dna_sequence_in_features(self, genome):
+    def check_dna_sequence_in_features(self, genome):
         """
-        _check_dna_sequence_in_features: check dna sequence in each feature
+        check_dna_sequence_in_features: check dna sequence in each feature
         """
-        logging.info('start checking dna sequence in each feature')
 
         if 'features' in genome:
             features_to_work = {}
@@ -166,7 +165,7 @@ def get_one_genome(self, params):
         return data, res['info']
         # return self.dfu.get_objects(params)['data'][0]
 
-    def _save_genome_mass(self, params):
+    def _save_genome_mass(self, params, validate_genome=True):
 
         workspace_id = params[_WSID]
         inputs = params[_INPUTS]
@@ -204,8 +203,8 @@ def _save_genome_mass(self, params):
             # check all handles point to shock nodes owned by calling user
             self._own_handle(data, 'genbank_handle_ref')
             self._own_handle(data, 'gff_handle_ref')
-            if "AnnotatedMetagenomeAssembly" not in ws_datatype:
-                self._check_dna_sequence_in_features(data)
+            if "AnnotatedMetagenomeAssembly" not in ws_datatype and validate_genome:
+                self.check_dna_sequence_in_features(data)
                 data['warnings'] = self.validate_genome(data)
 
             # sort data
@@ -406,8 +405,6 @@ def validate_genome(g):
         """
 
         allowed_tiers = {'Representative', 'Reference', 'ExternalDB', 'User'}
-
-        logging.info('Validating genome object contents')
         warnings = g.get('warnings', [])
 
         # TODO: Determine whether these checks make any sense for Metagenome
diff --git a/test/problematic_tests/save_genome_test.py b/test/problematic_tests/save_genome_test.py
index 70eff5c1..f45944ae 100644
--- a/test/problematic_tests/save_genome_test.py
+++ b/test/problematic_tests/save_genome_test.py
@@ -307,7 +307,7 @@ def test_GenomeInterface_check_dna_sequence_in_features(self):
         # no feature in genome
         genome = {'missing_features': 'features'}
         copied_genome = genome.copy()
-        self.genome_interface._check_dna_sequence_in_features(copied_genome)
+        self.genome_interface.check_dna_sequence_in_features(copied_genome)
         self.assertEqual(copied_genome, genome)
 
         # with contigs
@@ -315,7 +315,7 @@ def test_GenomeInterface_check_dna_sequence_in_features(self):
         for feat in copied_genome['features']:
             if 'dna_sequence' in feat:
                 del feat['dna_sequence']
-        self.genome_interface._check_dna_sequence_in_features(copied_genome)
+        self.genome_interface.check_dna_sequence_in_features(copied_genome)
 
         feature_dna_sum = 0
         for feature in copied_genome['features']:

From 41eed0b037601d6b33775f6674fb9002eb62be9d Mon Sep 17 00:00:00 2001
From: Sijie <sijiex@lbl.gov>
Date: Tue, 10 Sep 2024 17:10:43 -0700
Subject: [PATCH 17/24] test validate_genome boolean flag

---
 test/problematic_tests/save_genome_test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/problematic_tests/save_genome_test.py b/test/problematic_tests/save_genome_test.py
index f45944ae..e13d415d 100644
--- a/test/problematic_tests/save_genome_test.py
+++ b/test/problematic_tests/save_genome_test.py
@@ -251,7 +251,7 @@ def test_genomes(self):
             }
         ]
         params = {'workspace_id': self.wsID, 'inputs': inputs}
-        ret = self.genome_interface.save_genome_mass(params)[0]
+        ret = self.genome_interface.save_genome_mass(params, validate_genome=True)[0]
         self.check_save_one_genome_output(ret, genome_name)
 
     def test_genomes_with_upgrade(self):

From 71670bce3fd95445172e415c6167a0471cd697f6 Mon Sep 17 00:00:00 2001
From: Sijie <sijiex@lbl.gov>
Date: Wed, 11 Sep 2024 16:09:33 -0700
Subject: [PATCH 18/24] 1. add pydoc for save_genome_mass; 2. make the dicts in
 the _save_genome_mass loop; 3. make the note much more explicit

---
 lib/GenomeFileUtil/core/GenomeInterface.py | 97 ++++++++++++----------
 1 file changed, 51 insertions(+), 46 deletions(-)

diff --git a/lib/GenomeFileUtil/core/GenomeInterface.py b/lib/GenomeFileUtil/core/GenomeInterface.py
index cec2b3e3..8e74c53d 100644
--- a/lib/GenomeFileUtil/core/GenomeInterface.py
+++ b/lib/GenomeFileUtil/core/GenomeInterface.py
@@ -37,13 +37,48 @@ def __init__(self, config):
         self.ws_large_data = WsLargeDataIO(self.callback_url)
 
     def save_one_genome(self, params):
+        """
+        Saves a single genome object to the workspace.
+
+        This method prepares the parameters for saving a single genome and calls the
+        `_save_genome_mass` method to handle the actual saving process. It processes
+        the input parameters and performs necessary validation before saving the genome.
+
+        Args:
+            params (dict): A dictionary containing the parameters for saving the genome.
+                Must include workspace and genome-specific information.
+
+        Returns:
+            dict: The information about the saved genome object, including metadata.
+                The return value is derived from the `_save_genome_mass` method.
+        """
         mass_params = GenomeUtils.set_up_single_params(
             params, _WS, self._validate_genome_input_params, self.dfu.ws_name_to_id
         )
         return self._save_genome_mass(mass_params)[0]
 
-    # NOTE If there is more than 1GB of data or more than 10,000 genomes to upload, the workspace will fail.
     def save_genome_mass(self, params, validate_genome=False):
+        """
+        Saves multiple genome objects to the workspace.
+
+        This method handles the saving of multiple genome objects in bulk. It validates
+        the parameters, processes each genome individually, and performs necessary
+        updates or validations before saving. If requested, it will also validate the
+        genomes before saving.
+
+        # NOTE If there is more than 1GB of data or more than 10,000 genomes to upload,
+        # the workspace will fail.
+
+        Args:
+            params (dict): A dictionary containing the parameters for saving the genomes.
+                Should include workspace ID and a list of genome inputs with their data.
+            validate_genome (bool, optional): A flag indicating whether to validate the
+                genomes before saving. Defaults to False.
+
+        Returns:
+            list: A list of dictionaries, each containing information about a saved
+                genome object and any warnings encountered during the saving process.
+        """
         GenomeUtils.validate_mass_params(params, self._validate_genome_input_params)
         return self._save_genome_mass(params, validate_genome=validate_genome)
 
@@ -56,33 +91,6 @@ def _validate_genome_input_params(self, genome_input):
             if p not in genome_input:
                 raise ValueError(f"{p} parameter is required, but missing")
 
-    def _save_genome_objects(
-        self,
-        workspace_id,
-        ws_datatypes,
-        data_paths,
-        names,
-        meta_data,
-        hidden_data,
-    ):
-        dfu_infos = self.ws_large_data.save_objects(
-            {
-                'id': workspace_id,
-                'objects': [
-                    {
-                        'type': ws_datatype,
-                        'data_json_file': data_path,
-                        'name': name,
-                        'meta': meta,
-                        'hidden': hidden,
-                    } for ws_datatype, data_path, name, meta, hidden in zip(
-                        ws_datatypes, data_paths, names, meta_data, hidden_data
-                    )
-                ]
-            }
-        )
-        return dfu_infos
-
     def _check_shock_response(self, response, errtxt):
         """
         _check_shock_response: check shock node response (Copied from DataFileUtil)
@@ -170,28 +178,28 @@ def _save_genome_mass(self, params, validate_genome=True):
         workspace_id = params[_WSID]
         inputs = params[_INPUTS]
 
-        ws_datatypes = []
-        data_paths = []
-        names = []
-        meta_data = []
-        hidden_data = []
+        objects = []
         warnings = []
 
         for input_params in inputs:
 
+            obj = {}
+
             # retrive required params
             name = input_params['name']
             data = input_params['data']
 
             # XXX there is no `workspace_datatype` param in the spec
-            # NOTE: The method caller should not be able to choose an arbitrary workspace type
+            # NOTE: This allows a user to specify any arbitrary workspace type which could cause,
+            # in the worst case, data corruption. It should be removed from the API
+            # (note it is not currently documented) so users cannot access it.
             ws_datatype = input_params.get('workspace_datatype', "KBaseGenomes.Genome")
             # XXX there is no `meta` param in the spec
             meta = input_params.get('meta', {})
 
-            ws_datatypes.append(ws_datatype)
-            names.append(name)
-            meta_data.append(meta)
+            obj["type"] = ws_datatype
+            obj["name"] = name
+            obj["meta"] = meta
 
             if "AnnotatedMetagenomeAssembly" in ws_datatype:
                 if input_params.get('upgrade') or 'feature_counts' not in data:
@@ -217,17 +225,14 @@ def _save_genome_mass(self, params, validate_genome=True):
             else:
                 hidden = 0
 
-            data_paths.append(data_path)
-            hidden_data.append(hidden)
+            obj["data_json_file"] = data_path
+            obj["hidden"] = hidden
+
+            objects.append(obj)
             warnings.append(data.get('warnings', []))
 
-        dfu_infos = self._save_genome_objects(
-            workspace_id,
-            ws_datatypes,
-            data_paths,
-            names,
-            meta_data,
-            hidden_data,
+        dfu_infos = self.ws_large_data.save_objects(
+            {'id': workspace_id, 'objects': objects}
         )
 
         output = [

From 0cbe1551b827ac860afcccaf02fa134208a45308 Mon Sep 17 00:00:00 2001
From: Sijie <sijiex@lbl.gov>
Date: Thu, 12 Sep 2024 17:22:54 -0700
Subject: [PATCH 19/24] update release notes && remove tiny files

---
 RELEASE_NOTES.md                              |  1 -
 test/data/metagenomes/toy/features_handle_ref |  1 -
 test/data/metagenomes/toy/protein_handle_ref  |  1 -
 test/problematic_tests/save_genome_test.py    | 19 ++++++++++++++++---
 4 files changed, 16 insertions(+), 6 deletions(-)
 delete mode 100644 test/data/metagenomes/toy/features_handle_ref
 delete mode 100644 test/data/metagenomes/toy/protein_handle_ref

diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md
index c0e69fda..c020b8f0 100644
--- a/RELEASE_NOTES.md
+++ b/RELEASE_NOTES.md
@@ -6,7 +6,6 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
 ## [0.11.7] - TBD
-- Genomes are now saved in batches to the workspace
 - Unusable `export_genome_features_protein_to_fasta` function was removed
 - The `genbanks_to_genomes` method was added to allow users to upload multiple
 genome objects at once
diff --git a/test/data/metagenomes/toy/features_handle_ref b/test/data/metagenomes/toy/features_handle_ref
deleted file mode 100644
index f67a087d..00000000
--- a/test/data/metagenomes/toy/features_handle_ref
+++ /dev/null
@@ -1 +0,0 @@
-test features_handle_ref
\ No newline at end of file
diff --git a/test/data/metagenomes/toy/protein_handle_ref b/test/data/metagenomes/toy/protein_handle_ref
deleted file mode 100644
index 4e4a8e1f..00000000
--- a/test/data/metagenomes/toy/protein_handle_ref
+++ /dev/null
@@ -1 +0,0 @@
-test protein_handle_ref
\ No newline at end of file
diff --git a/test/problematic_tests/save_genome_test.py b/test/problematic_tests/save_genome_test.py
index e13d415d..1c4644aa 100644
--- a/test/problematic_tests/save_genome_test.py
+++ b/test/problematic_tests/save_genome_test.py
@@ -4,6 +4,7 @@
 import json  # noqa: F401
 import os  # noqa: F401
 import shutil
+import tempfile
 import time
 import unittest
 import urllib.error
@@ -149,12 +150,20 @@ def prepare_data(cls):
         cls.test_metagenome_data = json.load(open('data/metagenomes/toy/metagenome.json'))
         cls.test_metagenome_data['assembly_ref'] = assembly_refs[1]["upa"]
 
-        # Move files to the share folder
+        # Set taregt paths in the share folder
         fhr_path = os.path.join(cls.scratch,'features_handle_ref')
         phr_path = os.path.join(cls.scratch,'protein_handle_ref')
 
-        shutil.copy('data/metagenomes/toy/features_handle_ref', fhr_path)
-        shutil.copy('data/metagenomes/toy/protein_handle_ref', phr_path)
+        # Create temp files
+        with tempfile.NamedTemporaryFile(delete=False, mode='w', encoding='utf-8') as temp_features_file:
+            temp_features_file.write("test features_handle_ref")
+
+        with tempfile.NamedTemporaryFile(delete=False, mode='w', encoding='utf-8') as temp_protein_file:
+            temp_protein_file.write("test protein_handle_ref")
+
+        # Move files to the share folder
+        shutil.copy(temp_features_file.name, fhr_path)
+        shutil.copy(temp_protein_file.name, phr_path)
 
         # Upload files to the blobstore
         handle_service_outputs = cls.dfu.file_to_shock_mass(
@@ -172,6 +181,10 @@ def prepare_data(cls):
         cls.nodes_to_delete.append(handle_service_outputs[0]["shock_id"])
         cls.nodes_to_delete.append(handle_service_outputs[1]["shock_id"])
 
+        # Remove temp files
+        os.remove(temp_features_file.name)
+        os.remove(temp_protein_file.name)
+
     def getWsClient(self):
         return self.__class__.wsClient
 

From 52e280d6b26116cb2d353f168cd54ff787297376 Mon Sep 17 00:00:00 2001
From: Sijie <sijiex@lbl.gov>
Date: Tue, 17 Sep 2024 15:22:36 -0700
Subject: [PATCH 20/24] add more info and warnings checks in test

---
 lib/GenomeFileUtil/core/GenomeInterface.py |  2 +-
 test/problematic_tests/save_genome_test.py | 40 ++++++++++++++++++----
 2 files changed, 34 insertions(+), 8 deletions(-)

diff --git a/lib/GenomeFileUtil/core/GenomeInterface.py b/lib/GenomeFileUtil/core/GenomeInterface.py
index 8e74c53d..1bcac050 100644
--- a/lib/GenomeFileUtil/core/GenomeInterface.py
+++ b/lib/GenomeFileUtil/core/GenomeInterface.py
@@ -187,7 +187,7 @@ def _save_genome_mass(self, params, validate_genome=True):
 
             # retrive required params
             name = input_params['name']
-            data = input_params['data']
+            data = dict(input_params['data'])
 
             # XXX there is no `workspace_datatype` param in the spec
             # NOTE: This allows a user to specify any arbitrary workspace type which could cause,
diff --git a/test/problematic_tests/save_genome_test.py b/test/problematic_tests/save_genome_test.py
index 1c4644aa..648a36a6 100644
--- a/test/problematic_tests/save_genome_test.py
+++ b/test/problematic_tests/save_genome_test.py
@@ -12,6 +12,7 @@
 import urllib.request
 from configparser import ConfigParser
 from os import environ
+from datetime import datetime
 
 import requests  # noqa: F401
 
@@ -24,8 +25,17 @@
 from GenomeFileUtil.core.GenomeInterface import GenomeInterface
 from installed_clients.WorkspaceClient import Workspace as workspaceService
 
-KBASE_GENOME = "KBaseGenomes.Genome"
-KBASE_METAGENOME = "KBaseMetagenomes.AnnotatedMetagenomeAssembly"
+_KBASE_GENOME = "KBaseGenomes.Genome"
+_KBASE_METAGENOME = "KBaseMetagenomes.AnnotatedMetagenomeAssembly"
+
+_GENOME_FILE_WARNINGS = [
+    'For prokaryotes, CDS array should generally be the same length as the Features array.',
+    'Genome molecule_type Unknown is not expected for domain Bacteria.',
+    'Unable to determine organism taxonomy'
+]
+_METAGENOME_FILE_WARNINGS = [
+    'SUSPECT: This genome has 20 genes that needed to be spoofed for existing parentless CDS.'
+]
 
 
 class SaveGenomeTest(unittest.TestCase):
@@ -212,13 +222,27 @@ def fail_save_genome(self, params, error, exception=ValueError, contains=False,
         else:
             self.assertEqual(error, str(context.exception))
 
-    def check_save_one_genome_output(self, ret, genome_name, data_type=KBASE_GENOME):
+    def check_save_one_genome_output(
+        self,
+        ret,
+        genome_name,
+        data_type=_KBASE_GENOME,
+        warnings=_GENOME_FILE_WARNINGS
+    ):
         self.assertTrue('info' in ret)
+        self.assertTrue('warnings' in ret)
 
+        # Check info
         genome_info = ret['info']
         self.assertEqual(genome_info[1], genome_name)
         self.assertEqual(genome_info[2].split('-')[0], data_type)
+        self.assertTrue(datetime.strptime(genome_info[3], '%Y-%m-%dT%H:%M:%S+%f'))
         self.assertEqual(genome_info[5], self.user_id)
+        self.assertEqual(genome_info[6], self.wsID)
+        self.assertEqual(genome_info[7], self.wsName)
+
+        # Check warnings
+        self.assertEqual(ret['warnings'], warnings)
 
     def test_bad_one_genome_params(self):
         self.start_test()
@@ -274,13 +298,15 @@ def test_genomes_with_upgrade(self):
             {
                 'name': genome_name,
                 'data': self.test_metagenome_data,
-                'workspace_datatype': KBASE_METAGENOME,
+                'workspace_datatype': _KBASE_METAGENOME,
                 'upgrade': True,
             }
         ]
         params = {'workspace_id': self.wsID, 'inputs': inputs}
         ret = self.genome_interface.save_genome_mass(params)[0]
-        self.check_save_one_genome_output(ret, genome_name, data_type=KBASE_METAGENOME)
+        self.check_save_one_genome_output(
+            ret, genome_name, data_type=_KBASE_METAGENOME, warnings=_METAGENOME_FILE_WARNINGS
+        )
 
     def test_genomes_with_hidden(self):
         self.start_test()
@@ -294,7 +320,7 @@ def test_genomes_with_hidden(self):
         ]
         params = {'workspace_id': self.wsID, 'inputs': inputs}
         ret = self.genome_interface.save_genome_mass(params)[0]
-        self.check_save_one_genome_output(ret, genome_name)
+        self.check_save_one_genome_output(ret, genome_name, warnings=[])
 
         inputs = [
             {
@@ -305,7 +331,7 @@ def test_genomes_with_hidden(self):
         ]
         params = {'workspace_id': self.wsID, 'inputs': inputs}
         ret = self.genome_interface.save_genome_mass(params)[0]
-        self.check_save_one_genome_output(ret, genome_name)
+        self.check_save_one_genome_output(ret, genome_name, warnings=[])
 
     def test_bad_genomes_params_missing_parameter(self):
         self.start_test()

From f018f0e7ab09f39b535e54c84e8e5f090d6cee9d Mon Sep 17 00:00:00 2001
From: Sijie <sijiex@lbl.gov>
Date: Wed, 18 Sep 2024 21:40:19 -0700
Subject: [PATCH 21/24] remove metagenome from test

---
 lib/GenomeFileUtil/core/GenomeInterface.py | 11 +--
 test/data/metagenomes/toy/metagenome.json  | 50 ------------
 test/problematic_tests/save_genome_test.py | 89 ++--------------------
 3 files changed, 7 insertions(+), 143 deletions(-)
 delete mode 100644 test/data/metagenomes/toy/metagenome.json

diff --git a/lib/GenomeFileUtil/core/GenomeInterface.py b/lib/GenomeFileUtil/core/GenomeInterface.py
index 1bcac050..0265dd6a 100644
--- a/lib/GenomeFileUtil/core/GenomeInterface.py
+++ b/lib/GenomeFileUtil/core/GenomeInterface.py
@@ -201,10 +201,7 @@ def _save_genome_mass(self, params, validate_genome=True):
             obj["name"] = name
             obj["meta"] = meta
 
-            if "AnnotatedMetagenomeAssembly" in ws_datatype:
-                if input_params.get('upgrade') or 'feature_counts' not in data:
-                    data = self._update_metagenome(data)
-            else:
+            if "AnnotatedMetagenomeAssembly" not in ws_datatype:
                 if input_params.get('upgrade') or 'feature_counts' not in data:
                     data = self._update_genome(data)
 
@@ -267,12 +264,6 @@ def determine_tier(source):
             return "Ensembl", ['Representative', 'ExternalDB']
         return source, ['User']
 
-    def _update_metagenome(self, genome):
-        """Checks for missing required fields and fixes breaking changes"""
-        if 'molecule_type' not in genome:
-            genome['molecule_type'] = 'Unknown'
-        return genome
-
     def _update_genome(self, genome):
         """Checks for missing required fields and fixes breaking changes"""
         # do top level updates
diff --git a/test/data/metagenomes/toy/metagenome.json b/test/data/metagenomes/toy/metagenome.json
deleted file mode 100644
index 95c23ef8..00000000
--- a/test/data/metagenomes/toy/metagenome.json
+++ /dev/null
@@ -1,50 +0,0 @@
-{
-    "contig_ids": [
-        "Ga0065724_100001"
-    ],
-    "contig_lengths": [
-        538871
-    ],
-    "dna_size": 538871,
-    "domain": "Eukaryota",
-    "environment": null,
-    "external_source_origination_date": null,
-    "feature_counts": {
-        "CDS": 20,
-        "gene": 20,
-        "non_coding_features": 0,
-        "protein_encoding_gene": 20
-    },
-    "gc_content": 0.64469,
-    "genetic_code": 1,
-    "genome_type": "Metagenome",
-    "id": "MyMetagenome",
-    "md5": "e2ccbd5a9bed0148015bd6b784e3c1c3",
-    "molecule_type": "SingleLetterAlphabet",
-    "notes": null,
-    "num_contigs": 1,
-    "num_features": 40,
-    "ontologies_present": {},
-    "ontology_events": [
-        {
-            "id": "GO",
-            "method": "GenomeFileUtils Genbank uploader from annotations",
-            "method_version": "0.11.7",
-            "ontology_ref": "KBaseOntology/gene_ontology",
-            "timestamp": "2024_09_05_06_45_31"
-        }
-    ],
-    "original_source_file_name": null,
-    "publications": [],
-    "scientific_name": "Arabidopsis thaliana",
-    "source": "GFF",
-    "source_id": "unknown",
-    "suspect": 1,
-    "taxon_assignments": {
-        "ncbi": "3702"
-    },
-    "taxonomy": "cellular organisms; Eukaryota; Viridiplantae; Streptophyta; Streptophytina; Embryophyta; Tracheophyta; Euphyllophyta; Spermatophyta; Magnoliopsida; Mesangiospermae; eudicotyledons; Gunneridae; Pentapetalae; rosids; malvids; Brassicales; Brassicaceae; Camelineae; Arabidopsis",
-    "warnings": [
-        "SUSPECT: This genome has 20 genes that needed to be spoofed for existing parentless CDS."
-    ]
-}
\ No newline at end of file
diff --git a/test/problematic_tests/save_genome_test.py b/test/problematic_tests/save_genome_test.py
index 648a36a6..a396c5ba 100644
--- a/test/problematic_tests/save_genome_test.py
+++ b/test/problematic_tests/save_genome_test.py
@@ -26,16 +26,11 @@
 from installed_clients.WorkspaceClient import Workspace as workspaceService
 
 _KBASE_GENOME = "KBaseGenomes.Genome"
-_KBASE_METAGENOME = "KBaseMetagenomes.AnnotatedMetagenomeAssembly"
-
 _GENOME_FILE_WARNINGS = [
     'For prokaryotes, CDS array should generally be the same length as the Features array.',
     'Genome molecule_type Unknown is not expected for domain Bacteria.',
     'Unable to determine organism taxonomy'
 ]
-_METAGENOME_FILE_WARNINGS = [
-    'SUSPECT: This genome has 20 genes that needed to be spoofed for existing parentless CDS.'
-]
 
 
 class SaveGenomeTest(unittest.TestCase):
@@ -129,71 +124,16 @@ def delete_shock_node(cls, node_id):
 
     @classmethod
     def prepare_data(cls):
-
         assembly_file_path = os.path.join(cls.scratch,'e_coli_assembly.fasta')
-        meta_file_path = os.path.join(cls.scratch,'metagenome.fa')
-
         shutil.copy('data/e_coli/e_coli_assembly.fasta', assembly_file_path)
-        shutil.copy('data/metagenomes/toy/metagenome.fa', meta_file_path)
-
         au = AssemblyUtil(os.environ['SDK_CALLBACK_URL'])
-
-        assembly_refs = au.save_assemblies_from_fastas(
-            {
-                'workspace_id': cls.wsID,
-                'inputs': [
-                    {
-                        'assembly_name': 'e_coli.assembly',
-                        'file': assembly_file_path
-                    },
-                    {
-                        'assembly_name': 'metagenome.assembly',
-                        'file': meta_file_path
-                    }
-                ]
-            }
-        )["results"]
-
+        assembly_ref = au.save_assembly_from_fasta({
+            'workspace_name': cls.wsName,
+            'assembly_name': 'e_coli.assembly',
+            'file': {'path': assembly_file_path}
+        })
         cls.test_genome_data = json.load(open('data/e_coli/e_coli.json'))
-        cls.test_genome_data['assembly_ref'] = assembly_refs[0]["upa"]
-
-        cls.test_metagenome_data = json.load(open('data/metagenomes/toy/metagenome.json'))
-        cls.test_metagenome_data['assembly_ref'] = assembly_refs[1]["upa"]
-
-        # Set taregt paths in the share folder
-        fhr_path = os.path.join(cls.scratch,'features_handle_ref')
-        phr_path = os.path.join(cls.scratch,'protein_handle_ref')
-
-        # Create temp files
-        with tempfile.NamedTemporaryFile(delete=False, mode='w', encoding='utf-8') as temp_features_file:
-            temp_features_file.write("test features_handle_ref")
-
-        with tempfile.NamedTemporaryFile(delete=False, mode='w', encoding='utf-8') as temp_protein_file:
-            temp_protein_file.write("test protein_handle_ref")
-
-        # Move files to the share folder
-        shutil.copy(temp_features_file.name, fhr_path)
-        shutil.copy(temp_protein_file.name, phr_path)
-
-        # Upload files to the blobstore
-        handle_service_outputs = cls.dfu.file_to_shock_mass(
-            [
-                {'file_path': fhr_path, 'make_handle': 1, 'pack': 'gzip'},
-                {'file_path': phr_path, 'make_handle': 1, 'pack': 'gzip'}
-            ]
-        )
-
-        # Update metagenome
-        cls.test_metagenome_data["features_handle_ref"] = handle_service_outputs[0]["handle"]["hid"]
-        cls.test_metagenome_data["protein_handle_ref"]= handle_service_outputs[1]["handle"]["hid"]
-
-        # Delete shock_ids
-        cls.nodes_to_delete.append(handle_service_outputs[0]["shock_id"])
-        cls.nodes_to_delete.append(handle_service_outputs[1]["shock_id"])
-
-        # Remove temp files
-        os.remove(temp_features_file.name)
-        os.remove(temp_protein_file.name)
+        cls.test_genome_data['assembly_ref'] = assembly_ref
 
     def getWsClient(self):
         return self.__class__.wsClient
@@ -291,23 +231,6 @@ def test_genomes(self):
         ret = self.genome_interface.save_genome_mass(params, validate_genome=True)[0]
         self.check_save_one_genome_output(ret, genome_name)
 
-    def test_genomes_with_upgrade(self):
-        self.start_test()
-        genome_name = 'MyMetagenome'
-        inputs = [
-            {
-                'name': genome_name,
-                'data': self.test_metagenome_data,
-                'workspace_datatype': _KBASE_METAGENOME,
-                'upgrade': True,
-            }
-        ]
-        params = {'workspace_id': self.wsID, 'inputs': inputs}
-        ret = self.genome_interface.save_genome_mass(params)[0]
-        self.check_save_one_genome_output(
-            ret, genome_name, data_type=_KBASE_METAGENOME, warnings=_METAGENOME_FILE_WARNINGS
-        )
-
     def test_genomes_with_hidden(self):
         self.start_test()
         genome_name = 'test_genome_hidden'

From 54fedf0a3620a71458812bdfcd8b07267fdc34fa Mon Sep 17 00:00:00 2001
From: Sijie <sijiex@lbl.gov>
Date: Wed, 18 Sep 2024 21:42:51 -0700
Subject: [PATCH 22/24] remove unused lib

---
 test/problematic_tests/save_genome_test.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/test/problematic_tests/save_genome_test.py b/test/problematic_tests/save_genome_test.py
index a396c5ba..c2e51b46 100644
--- a/test/problematic_tests/save_genome_test.py
+++ b/test/problematic_tests/save_genome_test.py
@@ -4,7 +4,6 @@
 import json  # noqa: F401
 import os  # noqa: F401
 import shutil
-import tempfile
 import time
 import unittest
 import urllib.error

From 861a50b5ce2d0b8b481fe514b41a6d6b89ee0c96 Mon Sep 17 00:00:00 2001
From: Sijie <sijiex@lbl.gov>
Date: Wed, 30 Oct 2024 16:35:38 -0700
Subject: [PATCH 23/24] fix documentation

---
 lib/GenomeFileUtil/core/GenomeInterface.py |  6 ++----
 lib/GenomeFileUtil/core/GenomeUtils.py     | 19 +++++++++----------
 2 files changed, 11 insertions(+), 14 deletions(-)

diff --git a/lib/GenomeFileUtil/core/GenomeInterface.py b/lib/GenomeFileUtil/core/GenomeInterface.py
index 0265dd6a..e16b12e1 100644
--- a/lib/GenomeFileUtil/core/GenomeInterface.py
+++ b/lib/GenomeFileUtil/core/GenomeInterface.py
@@ -40,9 +40,8 @@ def save_one_genome(self, params):
         """
         Saves a single genome object to the workspace.
 
-        This method prepares the parameters for saving a single genome and calls the
-        `_save_genome_mass` method to handle the actual saving process. It processes
-        the input parameters and performs necessary validation before saving the genome.
+        This method prepares and validates the necessary parameters for saving a genome.
+        It then executes the saving process and returns relevant information about the saved genome.
 
         Args:
             params (dict): A dictionary containing the parameters for saving the genome.
@@ -50,7 +49,6 @@ def save_one_genome(self, params):
 
         Returns:
             dict: The information about the saved genome object, including metadata.
-                The return value is derived from the `_save_genome_mass` method.
         """
         mass_params = GenomeUtils.set_up_single_params(
             params, _WS, self._validate_genome_input_params, self.dfu.ws_name_to_id
diff --git a/lib/GenomeFileUtil/core/GenomeUtils.py b/lib/GenomeFileUtil/core/GenomeUtils.py
index d1b63935..491f6d24 100644
--- a/lib/GenomeFileUtil/core/GenomeUtils.py
+++ b/lib/GenomeFileUtil/core/GenomeUtils.py
@@ -495,7 +495,7 @@ def set_up_single_params(
     validate_params_func: Callable[[Dict[str, Any]], None],
     ws_name_to_id_func: Callable[[str], int]
 ) -> Dict[str, Any]:
-    """
+    """
     Sets up parameters by validating them and ensuring that exactly one of workspace ID or name is provided.
 
     Args:
@@ -509,12 +509,11 @@ def set_up_single_params(
 
     Returns:
         Dict[str, Any]: A dictionary containing the workspace ID and the processed parameters. The dictionary
-            has keys '_WSID' and '_INPUTS', where '_WSID' is the workspace ID and '_INPUTS' is a list containing
+            is keyed by the `_WSID` and `_INPUTS` constants, which map to the workspace ID and to a list containing
             the input parameters.
 
     Raises:
         ValueError: If neither or both the workspace ID and workspace name are provided in the parameters.
-        KeyError: If the workspace ID or name is missing or invalid.
 
     Notes:
         - If a workspace ID is not provided, the function will attempt to convert the workspace name to an ID
@@ -540,24 +539,24 @@ def validate_mass_params(
     params: Dict[str, Any],
     validate_params_func: Callable[[Dict[str, Any]], None]
 ) -> None:
-    """
+    """
     Validates the provided parameters according to specific rules.
 
     Args:
         params (Dict[str, Any]): A dictionary containing parameters to validate. Must include:
-            - _WSID: A workspace ID, which must be present and valid.
-            - _INPUTS: A list of parameter dictionaries, each of which must be validated by `validate_params_func`.
+            - the key named by the `_WSID` constant: A workspace ID, which must be present and valid.
+            - the key named by the `_INPUTS` constant: A list of parameter dictionaries, each validated by `validate_params_func`.
 
         validate_params_func (Callable[[Dict[str, Any]], None]): A function that takes a dictionary of parameters
             and validates it. The function should raise an exception if the parameters are invalid.
 
     Raises:
-        ValueError: If `_WSID` is missing or invalid, if `_INPUTS` is missing or not a non-empty list, or if any
-            entry in `_INPUTS` is not a dictionary or fails validation.
+        ValueError: If the `_WSID` key is missing or invalid, if the `_INPUTS` key is missing or not a non-empty list,
+            or if any entry under the `_INPUTS` key is not a dictionary or fails validation.
 
     Notes:
-        - The function checks that `_WSID` is present and converts it to an integer using `get_int`.
-        - The `_INPUTS` field must be a non-empty list of dictionaries. Each dictionary in the list is validated
+        - The function checks that the `_WSID` key is present and converts its value to an integer using `get_int`.
+        - The value under the `_INPUTS` key must be a non-empty list of dictionaries. Each dictionary in the list is validated
           using `validate_params_func`.
         - If any validation fails, a `ValueError` is raised with a message indicating the issue and entry index.
     """

From 01a1dc87a775123e18ed82d7dc2dfdea1fbba148 Mon Sep 17 00:00:00 2001
From: Sijie <sijiex@lbl.gov>
Date: Fri, 1 Nov 2024 10:35:19 -0700
Subject: [PATCH 24/24] add workspace_id in GenomeFileUtil.spec

---
 GenomeFileUtil.spec                      |  1 +
 lib/GenomeFileUtil/GenomeFileUtilImpl.py | 44 ++++++++++++------------
 2 files changed, 23 insertions(+), 22 deletions(-)

diff --git a/GenomeFileUtil.spec b/GenomeFileUtil.spec
index 05cca1f9..5971b43d 100644
--- a/GenomeFileUtil.spec
+++ b/GenomeFileUtil.spec
@@ -318,6 +318,7 @@ module GenomeFileUtil {
                 returns (MetagenomeSaveResult returnVal) authentication required;
 
     typedef structure {
+        int workspace_id;
         string workspace;
         string name;
         KBaseGenomes.Genome data;
diff --git a/lib/GenomeFileUtil/GenomeFileUtilImpl.py b/lib/GenomeFileUtil/GenomeFileUtilImpl.py
index 9ae1311d..b41f4d82 100644
--- a/lib/GenomeFileUtil/GenomeFileUtilImpl.py
+++ b/lib/GenomeFileUtil/GenomeFileUtilImpl.py
@@ -69,7 +69,7 @@ class GenomeFileUtil:
     ######################################### noqa
     VERSION = "0.11.7"
     GIT_URL = "git@github.com:kbaseapps/GenomeFileUtil.git"
-    GIT_COMMIT_HASH = "330d6c2da8a0dc2d57efffe690e8db2928455776"
+    GIT_COMMIT_HASH = "861a50b5ce2d0b8b481fe514b41a6d6b89ee0c96"
 
     #BEGIN_CLASS_HEADER
     #END_CLASS_HEADER
@@ -831,27 +831,27 @@ def fasta_gff_to_metagenome(self, ctx, params):
     def save_one_genome(self, ctx, params):
         """
         :param params: instance of type "SaveOneGenomeParams" -> structure:
-           parameter "workspace" of String, parameter "name" of String,
-           parameter "data" of type "Genome" (Genome type -- annotated and
-           assembled genome data. Field descriptions: id - string - KBase
-           legacy data ID scientific_name - string - human readable species
-           name domain - string - human readable phylogenetic domain name
-           (eg. "Bacteria") warnings - list of string - genome-level warnings
-           generated in the annotation process genome_tiers - list of string
-           - controlled vocabulary (based on app input and checked by
-           GenomeFileUtil) A list of labels describing the data source for
-           this genome. Allowed values - Representative, Reference,
-           ExternalDB, User Tier assignments based on genome source: * All
-           phytozome - Representative and ExternalDB * Phytozome flagship
-           genomes - Reference, Representative and ExternalDB * Ensembl -
-           Representative and ExternalDB * RefSeq Reference - Reference,
-           Representative and ExternalDB * RefSeq Representative -
-           Representative and ExternalDB * RefSeq Latest or All Assemblies
-           folder - ExternalDB * User Data - User tagged feature_counts - map
-           of string to integer - total counts of each type of feature keys
-           are a controlled vocabulary of - "CDS", "gene", "misc_feature",
-           "misc_recomb", "mobile_element", "ncRNA" - 72,
-           "non_coding_features", "non_coding_genes",
+           parameter "workspace_id" of Long, parameter "workspace" of String,
+           parameter "name" of String, parameter "data" of type "Genome"
+           (Genome type -- annotated and assembled genome data. Field
+           descriptions: id - string - KBase legacy data ID scientific_name -
+           string - human readable species name domain - string - human
+           readable phylogenetic domain name (eg. "Bacteria") warnings - list
+           of string - genome-level warnings generated in the annotation
+           process genome_tiers - list of string - controlled vocabulary
+           (based on app input and checked by GenomeFileUtil) A list of
+           labels describing the data source for this genome. Allowed values
+           - Representative, Reference, ExternalDB, User Tier assignments
+           based on genome source: * All phytozome - Representative and
+           ExternalDB * Phytozome flagship genomes - Reference,
+           Representative and ExternalDB * Ensembl - Representative and
+           ExternalDB * RefSeq Reference - Reference, Representative and
+           ExternalDB * RefSeq Representative - Representative and ExternalDB
+           * RefSeq Latest or All Assemblies folder - ExternalDB * User Data
+           - User tagged feature_counts - map of string to integer - total
+           counts of each type of feature keys are a controlled vocabulary of
+           - "CDS", "gene", "misc_feature", "misc_recomb", "mobile_element",
+           "ncRNA" - 72, "non_coding_features", "non_coding_genes",
            "protein_encoding_gene", "rRNA", "rep_origin", "repeat_region",
            "tRNA" genetic_code - int - An NCBI-assigned taxonomic category
            for the organism See here -