From 6208a9a8fa31b6f84345099a158fdd0ea90ec838 Mon Sep 17 00:00:00 2001 From: Sijie Date: Wed, 14 Aug 2024 17:14:23 -0700 Subject: [PATCH 01/24] add save_genomes function --- GenomeFileUtil.spec | 19 + RELEASE_NOTES.md | 2 + lib/GenomeFileUtil/GenomeFileUtilImpl.py | 438 ++++++++++++++++++++- lib/GenomeFileUtil/GenomeFileUtilServer.py | 4 + lib/GenomeFileUtil/core/GenbankToGenome.py | 13 +- lib/GenomeFileUtil/core/GenomeInterface.py | 202 +++++++--- lib/GenomeFileUtil/core/MiscUtils.py | 26 ++ test/problematic_tests/save_genome_test.py | 2 +- 8 files changed, 637 insertions(+), 69 deletions(-) diff --git a/GenomeFileUtil.spec b/GenomeFileUtil.spec index 05cca1f9..5c06cff1 100644 --- a/GenomeFileUtil.spec +++ b/GenomeFileUtil.spec @@ -332,6 +332,25 @@ module GenomeFileUtil { funcdef save_one_genome(SaveOneGenomeParams params) returns (SaveGenomeResult returnVal) authentication required; + typedef structure { + string name; + KBaseGenomes.Genome data; + boolean hidden; + boolean upgrade; + } GenomeInput; + + typedef structure { + int workspace_id; + list<GenomeInput> inputs; + } SaveGenomesParams; + + typedef structure { + list<SaveGenomeResult> results; + } SaveGenomesResults; + + funcdef save_genomes(SaveGenomesParams params) + returns(SaveGenomesResults results) authentication required; + /* gff_file - object containing path to gff_file ws_ref - input Assembly or Genome reference diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md index c020b8f0..260b4aa4 100644 --- a/RELEASE_NOTES.md +++ b/RELEASE_NOTES.md @@ -6,6 +6,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
## [0.11.7] - TBD +- The `save_genomes` method was added to allow users to save genomes in batch +- Parsed and validated genome before upload - Unusable `export_genome_features_protein_to_fasta` function was removed - The `genbanks_to_genomes` method was added to allow users to upload multiple genome objects at once diff --git a/lib/GenomeFileUtil/GenomeFileUtilImpl.py b/lib/GenomeFileUtil/GenomeFileUtilImpl.py index 0d01dd6e..98543d35 100644 --- a/lib/GenomeFileUtil/GenomeFileUtilImpl.py +++ b/lib/GenomeFileUtil/GenomeFileUtilImpl.py @@ -69,7 +69,7 @@ class GenomeFileUtil: ######################################### noqa VERSION = "0.11.7" GIT_URL = "git@github.com:kbaseapps/GenomeFileUtil.git" - GIT_COMMIT_HASH = "591a19ccf4d1b42f01cc06486654b6d3a8ea08e4" + GIT_COMMIT_HASH = "4819598e65eea38d3c38d8c2ab808aca0d9697fd" #BEGIN_CLASS_HEADER #END_CLASS_HEADER @@ -1262,6 +1262,442 @@ def save_one_genome(self, ctx, params): # return the results return [returnVal] + def save_genomes(self, ctx, params): + """ + :param params: instance of type "SaveGenomesParams" -> structure: + parameter "workspace_id" of Long, parameter "inputs" of list of + type "GenomeInput" -> structure: parameter "name" of String, + parameter "data" of type "Genome" (Genome type -- annotated and + assembled genome data. Field descriptions: id - string - KBase + legacy data ID scientific_name - string - human readable species + name domain - string - human readable phylogenetic domain name + (eg. "Bacteria") warnings - list of string - genome-level warnings + generated in the annotation process genome_tiers - list of string + - controlled vocabulary (based on app input and checked by + GenomeFileUtil) A list of labels describing the data source for + this genome. 
Allowed values - Representative, Reference, + ExternalDB, User Tier assignments based on genome source: * All + phytozome - Representative and ExternalDB * Phytozome flagship + genomes - Reference, Representative and ExternalDB * Ensembl - + Representative and ExternalDB * RefSeq Reference - Reference, + Representative and ExternalDB * RefSeq Representative - + Representative and ExternalDB * RefSeq Latest or All Assemblies + folder - ExternalDB * User Data - User tagged feature_counts - map + of string to integer - total counts of each type of feature keys + are a controlled vocabulary of - "CDS", "gene", "misc_feature", + "misc_recomb", "mobile_element", "ncRNA" - 72, + "non_coding_features", "non_coding_genes", + "protein_encoding_gene", "rRNA", "rep_origin", "repeat_region", + "tRNA" genetic_code - int - An NCBI-assigned taxonomic category + for the organism See here - + https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi dna_size + - integer - total number of nucleotides num_contigs - integer - + total number of contigs in the genome molecule_type - string - + controlled vocab - the type of molecule sequenced Possible values + are "Unknown", "DNA", "RNA", "genomic DNA", "genomic RNA", "mRNA", + "tRNA", "rRNA", "other RNA", "other DNA", "transcribed RNA", + "viral cRNA", "unassigned DNA", "unassigned RNA" contig_lengths - + list of int - nucleotide length of each contig in the genome + Indexes in this list correspond to indexes in the `contig_ids` + list. contig_ids - list of str - external database identifiers for + each contig (eg. "NC_000913.3") source - str - controlled vocab - + descriptor of where this data came from (eg. "RefSeq") Allowed + entries RefSeq, Ensembl, Phytozome, RAST, Prokka, User_upload + source_id - string - identifier of this genome from the source + database (eg. 
the RefSeq ID such as "NC_000913") md5 - string - + checksum of the underlying assembly sequence taxonomy - string - + semicolon-delimited taxonomy lineage, in order of parent to child + taxon_assignments - mapping of taxonomy namespace to taxon ID. + example - {"ncbi": "286", "gtdb": "s__staphylococcus_devriesei"} + gc_content - float - ratio of GC count to AT in the genome + publications - tuple of (pubmedid, source, title, web_addr, year, + authors, journal). See typedef above. ontology_events - A record + of the service and method used for a set of ontology assignments + on the genome. ontologies_present - a mapping of ontology source + id (eg. "GO") to a mapping of term IDs (eg "GO:16209") to term + names (eg. "histidine biosynthetic process"). features - array of + Feature - protein coding genes (see the separate Feature spec) + cdss - array of protein-coding sequences mrnas - array of + transcribed messenger RNA sequences (equal to cdss plus 5' and 3' + UTRs) non_coding_features - array of features that does not + include mRNA, CDS, and protein-encoding genes assembly_ref - + workspace reference to an assembly object from which this + annotated genome was derived. taxon_ref - workspace reference to a + taxon object that classifies the species or strain of this genome. + genbank_handle_ref - file server handle reference to the source + genbank file for this genome. gff_handle_ref - file server handle + reference to the source GFF file for this genome. + external_source_origination_date - TODO look at GFU for this + release - string - User-supplied release or version of the source + data. This most likely will come from an input field in the import + app. original_source_file_name - filename from which this genome + was derived (eg. genbank or gff filename). 
notes - TODO + quality_scores - TODO suspect - bool - flag of whether this + annotation is problematic due to some warning genome_type - string + - controlled vocab - One of "draft isolate", "finished isolate", + "mag", "sag", "virus", "plasmid", "construct" Features vs. coding + sequences: a feature is a sequence in the DNA that codes for a + protein, including non-transcribed introns. A coding sequence + (stored as `cdss`) includes **only** the sections of the feature + that codes for a protein, minus introns and UTRs. @optional + warnings contig_lengths contig_ids source_id taxonomy publications + @optional ontology_events ontologies_present non_coding_features + mrnas genome_type @optional genbank_handle_ref gff_handle_ref + external_source_origination_date @optional release + original_source_file_name notes quality_scores suspect + assembly_ref @optional taxon_ref taxon_assignments @metadata ws + gc_content as GC content @metadata ws taxonomy as Taxonomy + @metadata ws md5 as MD5 @metadata ws dna_size as Size @metadata ws + genetic_code as Genetic code @metadata ws domain as Domain + @metadata ws source_id as Source ID @metadata ws source as Source + @metadata ws scientific_name as Name @metadata ws genome_type as + Type @metadata ws length(features) as Number of Protein Encoding + Genes @metadata ws length(cdss) as Number of CDS @metadata ws + assembly_ref as Assembly Object @metadata ws num_contigs as Number + contigs @metadata ws length(warnings) as Number of Genome Level + Warnings @metadata ws suspect as Suspect Genome) -> structure: + parameter "id" of type "Genome_id" (KBase legacy data ID @id kb), + parameter "scientific_name" of String, parameter "domain" of + String, parameter "warnings" of list of String, parameter + "genome_tiers" of list of String, parameter "feature_counts" of + mapping from String to Long, parameter "genetic_code" of Long, + parameter "dna_size" of Long, parameter "num_contigs" of Long, + parameter "molecule_type" of String, 
parameter "contig_lengths" of + list of Long, parameter "contig_ids" of list of String, parameter + "source" of String, parameter "source_id" of type "source_id" + (Reference to a source_id @id external), parameter "md5" of + String, parameter "taxonomy" of String, parameter + "taxon_assignments" of mapping from String to String, parameter + "gc_content" of Double, parameter "publications" of list of type + "publication" (Structure for a publication Elements: (0) pubmedid + - float (1) source - string - (ex. Pubmed) (2) title - string (3) + string web address - string (4) publication year - string (5) + authors - string (6) journal - string) -> tuple of size 7: + parameter "pubmedid" of Double, parameter "source" of String, + parameter "title" of String, parameter "url" of String, parameter + "year" of String, parameter "authors" of String, parameter + "journal" of String, parameter "ontology_events" of list of type + "Ontology_event" (@optional ontology_ref method_version eco) -> + structure: parameter "id" of String, parameter "ontology_ref" of + type "Ontology_ref" (Reference to a ontology object @id ws + KBaseOntology.OntologyDictionary), parameter "method" of String, + parameter "method_version" of String, parameter "timestamp" of + String, parameter "eco" of String, parameter "ontologies_present" + of mapping from String to mapping from String to String, parameter + "features" of list of type "Feature" (Structure for a single CDS + encoding "gene" of a genome ONLY PUT GENES THAT HAVE A + CORRESPONDING CDS IN THIS ARRAY NOTE: Sequence is optional. + Ideally we can keep it in here, but Recognize due to space + constraints another solution may be needed. We may want to add + additional fields for other CDM functions (e.g., atomic regulons, + coexpressed fids, co_occurring fids,...) 
+ protein_translation_length and protein_translation are for longest + coded protein (representative protein for splice variants) NOTE: + New Aliases field definitely breaks compatibility. As Does + Function. flags are flag fields in GenBank format. This will be a + controlled vocabulary. Initially Acceptable values are pseudo, + ribosomal_slippage, and trans_splicing Md5 is the md5 of + dna_sequence. @optional functions ontology_terms note + protein_translation mrnas flags warnings @optional inference_data + dna_sequence aliases db_xrefs children functional_descriptions) -> + structure: parameter "id" of type "Feature_id" (KBase Feature ID + @id external), parameter "location" of list of tuple of size 4: + type "Contig_id" (ContigSet contig ID @id external), Long, String, + Long, parameter "functions" of list of String, parameter + "functional_descriptions" of list of String, parameter + "ontology_terms" of mapping from String to mapping from String to + list of Long, parameter "note" of String, parameter "md5" of + String, parameter "protein_translation" of String, parameter + "protein_translation_length" of Long, parameter "cdss" of list of + String, parameter "mrnas" of list of String, parameter "children" + of list of String, parameter "flags" of list of String, parameter + "warnings" of list of String, parameter "inference_data" of list + of type "InferenceInfo" (Type spec for the "InferenceInfo" object. 
+ TODO docs Found in the `inference_data` fields in mRNAs and CDSs + Fields: category - string - TODO type - string - TODO evidence - + string - TODO) -> structure: parameter "category" of String, + parameter "type" of String, parameter "evidence" of String, + parameter "dna_sequence" of String, parameter + "dna_sequence_length" of Long, parameter "aliases" of list of + tuple of size 2: parameter "fieldname" of String, parameter + "alias" of String, parameter "db_xrefs" of list of tuple of size + 2: parameter "db_source" of String, parameter "db_identifier" of + String, parameter "non_coding_features" of list of type + "NonCodingFeature" (Structure for a single feature that is NOT one + of the following: - Protein encoding gene (gene that has a + corresponding CDS) - mRNA - CDS Note pseudo-genes and Non protein + encoding genes are put into this flags are flag fields in GenBank + format. This will be a controlled vocabulary. Initially Acceptable + values are pseudo, ribosomal_slippage, and trans_splicing Md5 is + the md5 of dna_sequence. @optional functions ontology_terms note + flags warnings functional_descriptions @optional inference_data + dna_sequence aliases db_xrefs children parent_gene) -> structure: + parameter "id" of type "Feature_id" (KBase Feature ID @id + external), parameter "location" of list of tuple of size 4: type + "Contig_id" (ContigSet contig ID @id external), Long, String, + Long, parameter "type" of String, parameter "functions" of list of + String, parameter "functional_descriptions" of list of String, + parameter "ontology_terms" of mapping from String to mapping from + String to list of Long, parameter "note" of String, parameter + "md5" of String, parameter "parent_gene" of String, parameter + "children" of list of String, parameter "flags" of list of String, + parameter "warnings" of list of String, parameter "inference_data" + of list of type "InferenceInfo" (Type spec for the "InferenceInfo" + object. 
TODO docs Found in the `inference_data` fields in mRNAs + and CDSs Fields: category - string - TODO type - string - TODO + evidence - string - TODO) -> structure: parameter "category" of + String, parameter "type" of String, parameter "evidence" of + String, parameter "dna_sequence" of String, parameter + "dna_sequence_length" of Long, parameter "aliases" of list of + tuple of size 2: parameter "fieldname" of String, parameter + "alias" of String, parameter "db_xrefs" of list of tuple of size + 2: parameter "db_source" of String, parameter "db_identifier" of + String, parameter "cdss" of list of type "CDS" (Structure for a + single coding sequence. Coding sequences are the sections of a + feature's sequence that are translated to a protein (minus introns + and UTRs). Fields: id - string - identifier of the coding + sequence, such as "b0001_CDS_1" location - list> - list of locations from where this sequence + originates in the original assembly. Each sub-sequence in the list + constitutes a section of the resulting CDS. The first element in + the tuple corresponds to the "contig_id", such as "NC_000913.3". + The second element in the tuple is an index in the contig of where + the sequence starts. The third element is either a plus or minus + sign indicating whether it is on the 5' to 3' leading strand ("+") + or on the 3' to 5' lagging strand ("-"). The last element is the + length of the sub-sequence. For a location on the leading strand + (denoted by "+"), the index is of the leftmost base, and the + sequence extends to the right. For a location on the lagging + strand (denoted by "-"), the index is of the rightmost base, and + the sequence extends to the left. NOTE: the last element in each + tuple is the *length* of each sub-sequence. If you have a location + such as ("xyz", 100, "+", 50), then your sequence will go from + index 100 to index 149 (this has a length of 50). It *does not* go + from index 100 to index 150, as that would have a length of 51. 
+ Likewise, if you have the location ("xyz", 100, "-", 50), then the + sequence extends from 100 down to 51, which has a length of 50 + bases. It does not go from index 100 to 50, as that would have a + length of 51. md5 - string - md5 of the dna sequence - TODO + clarification protein_md5 - string - hash of the protein sequence + that this CDS encodes parent_gene - string - gene (feature) from + which this CDS comes from, including introns and UTRs that have + been removed to create this CDS. parent_mrna - string - mRNA + sequence from which this sequence is derived, including UTRs but + not introns. note - string - TODO functions - list - list + of protein products or chemical processes that this sequence + creates, facilitates, or influences. functional_descriptions - + list - TODO list of protein products or chemical processes + that sequence creates, facilitates, or influences. ontology_terms + - mapping>> - a mapping of + ontology source id (eg. "GO") to a mapping of term IDs (eg + "GO:16209") to a list of indexes into the ontology_events data + (found in the top level of the genome object). The index into an + ontology event indicates what service and method created this term + assignment. flags - list - (controlled vocab) fields from + the genbank source. A common example is "pseudo" for pseudo-genes + that do not encode proteins, which shows up as "/pseudo" in the + genbank. Values can be: "pseudo", "ribosomal_slippage", + "trans_splicing" warnings - list - TODO inference_data - + list - TODO protein_translation - string - amino + acid sequence that this CDS gets translated into. + protein_translation_length - int - length of the above aliases - + list<(string, string)> - alternative list of names or identifiers + eg: [["gene", "thrA"], ["locus_tag", "b0002"]] db_xrefs - + list<(string, string)> - Identifiers from other databases + (database cross-references) The first string is the database name, + the second is the database identifier. 
eg: [["ASAP", + "ABE-0000006"], ["EcoGene", "EG11277"]] dna_sequence - string - + sequence of exons from the genome that constitute this protein + encoding sequence. dna_sequence_length - int - length of the above + @optional parent_gene parent_mrna functions ontology_terms note + flags warnings @optional inference_data dna_sequence aliases + db_xrefs functional_descriptions) -> structure: parameter "id" of + type "cds_id" (KBase CDS ID @id external), parameter "location" of + list of tuple of size 4: type "Contig_id" (ContigSet contig ID @id + external), Long, String, Long, parameter "md5" of String, + parameter "protein_md5" of String, parameter "parent_gene" of type + "Feature_id" (KBase Feature ID @id external), parameter + "parent_mrna" of type "mrna_id" (KBase mRNA ID @id external), + parameter "note" of String, parameter "functions" of list of + String, parameter "functional_descriptions" of list of String, + parameter "ontology_terms" of mapping from String to mapping from + String to list of Long, parameter "flags" of list of String, + parameter "warnings" of list of String, parameter "inference_data" + of list of type "InferenceInfo" (Type spec for the "InferenceInfo" + object. 
TODO docs Found in the `inference_data` fields in mRNAs + and CDSs Fields: category - string - TODO type - string - TODO + evidence - string - TODO) -> structure: parameter "category" of + String, parameter "type" of String, parameter "evidence" of + String, parameter "protein_translation" of String, parameter + "protein_translation_length" of Long, parameter "aliases" of list + of tuple of size 2: parameter "fieldname" of String, parameter + "alias" of String, parameter "db_xrefs" of list of tuple of size + 2: parameter "db_source" of String, parameter "db_identifier" of + String, parameter "dna_sequence" of String, parameter + "dna_sequence_length" of Long, parameter "mrnas" of list of type + "mRNA" (The mRNA is the transcribed sequence from the original + feature, minus the introns, but including the UTRs. Fields: id - + string - identifying string for the mRNA location - + list> - list of locations from + where this sequence originates in the original assembly. Each + sub-sequence in the list constitutes a section of the resulting + CDS. The first element in the tuple corresponds to the + "contig_id", such as "NC_000913.3". The second element in the + tuple is an index in the contig of where the sequence starts. The + third element is either a plus or minus sign indicating whether it + is on the 5' to 3' leading strand ("+") or on the 3' to 5' lagging + strand ("-"). The last element is the length of the sub-sequence. + For a location on the leading strand (denoted by "+"), the index + is of the leftmost base, and the sequence extends to the right. + For a location on the lagging strand (denoted by "-"), the index + is of the rightmost base, and the sequence extends to the left. + NOTE: the last element in each tuple is the *length* of each + sub-sequence. If you have a location such as ("xyz", 100, "+", + 50), then your sequence will go from index 100 to index 149 (this + has a length of 50). 
It *does not* go from index 100 to index 150, + as that would have a length of 51. Likewise, if you have the + location ("xyz", 100, "-", 50), then the sequence extends from 100 + down to 51, which has a length of 50 bases. It does not go from + index 100 to 50, as that would have a length of 51. md5 - string - + md5 of the dna sequence - TODO clarification parent_gene - + Feature_id - corresponding feature for this sequence, including + introns and UTRs cds - string - corresponding coding sequence for + this mRNA (the sequence minus UTRs) dna_sequence - string - + sequence of UTRs and exons from the genome that constitute this + mRNA dna_sequence_length - int - length of the above note - string + - TODO functions - list - TODO list of protein products or + chemical processes that sequence creates, facilitates, or + influences. functional_descriptions - list - TODO list of + protein products or chemical processes that sequence creates, + facilitates, or influences. ontology_terms - mapping>> - a mapping of ontology source id (eg. + "GO") to a mapping of term IDs (eg "GO:16209") to a list of + indexes into the ontology_events data (found in the top level of + the genome object). The index into an ontology event indicates + what service and method created this term assignment. flags - + list - controlled vocab - fields from the genbank source. + A common example is "pseudo" for pseudo-genes that do not encode + proteins, which shows up as "/pseudo" in the genbank. Values can + be: "pseudo", "ribosomal_slippage", "trans_splicing" warnings - + list - TODO inference_data - list - TODO + aliases - list<(string, string)> - alternative list of names or + identifiers eg: [["gene", "thrA"], ["locus_tag", "b0002"]] + db_xrefs - list<(string, string)> - Identifiers from other + databases (database cross-references). The first string is the + database name, the second is the database identifier. 
eg: + [["ASAP", "ABE-0000006"], ["EcoGene", "EG11277"]] @optional + parent_gene cds functions ontology_terms note flags warnings + @optional inference_data dna_sequence aliases db_xrefs + functional_descriptions) -> structure: parameter "id" of type + "mrna_id" (KBase mRNA ID @id external), parameter "location" of + list of tuple of size 4: type "Contig_id" (ContigSet contig ID @id + external), Long, String, Long, parameter "md5" of String, + parameter "parent_gene" of type "Feature_id" (KBase Feature ID @id + external), parameter "cds" of type "cds_id" (KBase CDS ID @id + external), parameter "dna_sequence" of String, parameter + "dna_sequence_length" of Long, parameter "note" of String, + parameter "functions" of list of String, parameter + "functional_descriptions" of list of String, parameter + "ontology_terms" of mapping from String to mapping from String to + list of Long, parameter "flags" of list of String, parameter + "warnings" of list of String, parameter "inference_data" of list + of type "InferenceInfo" (Type spec for the "InferenceInfo" object. 
+ TODO docs Found in the `inference_data` fields in mRNAs and CDSs + Fields: category - string - TODO type - string - TODO evidence - + string - TODO) -> structure: parameter "category" of String, + parameter "type" of String, parameter "evidence" of String, + parameter "aliases" of list of tuple of size 2: parameter + "fieldname" of String, parameter "alias" of String, parameter + "db_xrefs" of list of tuple of size 2: parameter "db_source" of + String, parameter "db_identifier" of String, parameter + "assembly_ref" of type "Assembly_ref" (Reference to an Assembly + object in the workspace @id ws KBaseGenomeAnnotations.Assembly), + parameter "taxon_ref" of type "Taxon_ref" (Reference to a taxon + object @id ws KBaseGenomeAnnotations.Taxon), parameter + "genbank_handle_ref" of type "genbank_handle_ref" (Reference to a + handle to the Genbank file on shock @id handle), parameter + "gff_handle_ref" of type "gff_handle_ref" (Reference to a handle + to the GFF file on shock @id handle), parameter + "external_source_origination_date" of String, parameter "release" + of String, parameter "original_source_file_name" of String, + parameter "notes" of String, parameter "quality_scores" of list of + type "GenomeQualityScore" (Genome quality score Fields: method - + string - TODO method_report_ref - string - TODO method_version - + string - TODO score: string - TODO score_interpretation - string - + TODO timestamp - string - TODO Score_interpretation - + fraction_complete - controlled vocabulary managed by API @optional + method_report_ref method_version) -> structure: parameter "method" + of String, parameter "method_report_ref" of type + "Method_report_ref" (Reference to a report object @id ws + KBaseReport.Report), parameter "method_version" of String, + parameter "score" of String, parameter "score_interpretation" of + String, parameter "timestamp" of String, parameter "suspect" of + type "Bool", parameter "genome_type" of String, parameter "hidden" + of type "boolean" (A 
boolean - 0 for false, 1 for true. @range (0, + 1)), parameter "upgrade" of type "boolean" (A boolean - 0 for + false, 1 for true. @range (0, 1)) + :returns: instance of type "SaveGenomesResults" -> structure: + parameter "results" of list of type "SaveGenomeResult" -> + structure: parameter "info" of type "object_info" (Information + about an object, including user provided metadata. obj_id objid - + the numerical id of the object. obj_name name - the name of the + object. type_string type - the type of the object. timestamp + save_date - the save date of the object. obj_ver ver - the version + of the object. username saved_by - the user that saved or copied + the object. ws_id wsid - the workspace containing the object. + ws_name workspace - the workspace containing the object. string + chsum - the md5 checksum of the object. int size - the size of the + object in bytes. usermeta meta - arbitrary user-supplied metadata + about the object.) -> tuple of size 11: parameter "objid" of type + "obj_id" (The unique, permanent numerical ID of an object.), + parameter "name" of type "obj_name" (A string used as a name for + an object. Any string consisting of alphanumeric characters and + the characters |._- that is not an integer is acceptable.), + parameter "type" of type "type_string" (A type string. Specifies + the type and its version in a single string in the format + [module].[typename]-[major].[minor]: module - a string. The module + name of the typespec containing the type. typename - a string. The + name of the type as assigned by the typedef statement. major - an + integer. The major version of the type. A change in the major + version implies the type has changed in a non-backwards compatible + way. minor - an integer. The minor version of the type. A change + in the minor version implies that the type has changed in a way + that is backwards compatible with previous type definitions. 
In + many cases, the major and minor versions are optional, and if not + provided the most recent version will be used. Example: + MyModule.MyType-3.1), parameter "save_date" of type "timestamp" (A + time in the format YYYY-MM-DDThh:mm:ssZ, where Z is either the + character Z (representing the UTC timezone) or the difference in + time to UTC in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500 + (EST time) 2013-04-03T08:56:32+0000 (UTC time) + 2013-04-03T08:56:32Z (UTC time)), parameter "version" of Long, + parameter "saved_by" of type "username" (Login name of a KBase + user account.), parameter "wsid" of type "ws_id" (The unique, + permanent numerical ID of a workspace.), parameter "workspace" of + type "ws_name" (A string used as a name for a workspace. Any + string consisting of alphanumeric characters and "_", ".", or "-" + that is not an integer is acceptable. The name may optionally be + prefixed with the workspace owner's user name and a colon, e.g. + kbasetest:my_workspace.), parameter "chsum" of String, parameter + "size" of Long, parameter "meta" of type "usermeta" (User provided + metadata about an object. Arbitrary key-value pairs provided by + the user.) -> mapping from String to String + """ + # ctx is the context object + # return variables are: results + #BEGIN save_genomes + results = { + "results": GenomeInterface(self.cfg).save_genome_mass(params) + } + #END save_genomes + + # At some point might do deeper type checking... + if not isinstance(results, dict): + raise ValueError('Method save_genomes return value ' + + 'results is not type dict as required.') + # return the results + return [results] + def ws_obj_gff_to_genome(self, ctx, params): """ This function takes in a workspace object of type KBaseGenomes.Genome or KBaseGenomeAnnotations.Assembly and a gff file and produces a KBaseGenomes.Genome reanotated according to the the input gff file. 
diff --git a/lib/GenomeFileUtil/GenomeFileUtilServer.py b/lib/GenomeFileUtil/GenomeFileUtilServer.py index 6ae2c611..2c09c2e0 100644 --- a/lib/GenomeFileUtil/GenomeFileUtilServer.py +++ b/lib/GenomeFileUtil/GenomeFileUtilServer.py @@ -394,6 +394,10 @@ def __init__(self): name='GenomeFileUtil.save_one_genome', types=[dict]) self.method_authentication['GenomeFileUtil.save_one_genome'] = 'required' # noqa + self.rpc_service.add(impl_GenomeFileUtil.save_genomes, + name='GenomeFileUtil.save_genomes', + types=[dict]) + self.method_authentication['GenomeFileUtil.save_genomes'] = 'required' # noqa self.rpc_service.add(impl_GenomeFileUtil.ws_obj_gff_to_genome, name='GenomeFileUtil.ws_obj_gff_to_genome', types=[dict]) diff --git a/lib/GenomeFileUtil/core/GenbankToGenome.py b/lib/GenomeFileUtil/core/GenbankToGenome.py index d1851a61..8342e284 100644 --- a/lib/GenomeFileUtil/core/GenbankToGenome.py +++ b/lib/GenomeFileUtil/core/GenbankToGenome.py @@ -19,6 +19,7 @@ from installed_clients.AssemblyUtilClient import AssemblyUtil from installed_clients.DataFileUtilClient import DataFileUtil from GenomeFileUtil.core.GenomeInterface import GenomeInterface +from GenomeFileUtil.core.MiscUtils import get_int from installed_clients.WorkspaceClient import Workspace from GenomeFileUtil.core.GenomeUtils import ( is_parent, propagate_cds_props_to_gene, warnings, parse_inferences, @@ -125,7 +126,7 @@ def _set_up_single_params(self, params): # avoid side effects and keep variables in params unmodfied inputs = dict(params) self._validate_params(inputs) - ws_id = self._get_int(inputs.pop(_WSID, None), _WSID) + ws_id = get_int(inputs.pop(_WSID, None), _WSID) ws_name = inputs.pop(_WSNAME, None) if (bool(ws_id) == bool(ws_name)): # xnor raise ValueError(f"Exactly one of a '{_WSID}' or a '{_WSNAME}' parameter must be provided") @@ -137,7 +138,7 @@ def _set_up_single_params(self, params): return mass_params def _validate_mass_params(self, params): - ws_id = self._get_int(params.get(_WSID), _WSID) + 
ws_id = get_int(params.get(_WSID), _WSID) if not ws_id: raise ValueError(f"{_WSID} is required") inputs = params.get(_INPUTS) @@ -151,14 +152,6 @@ def _validate_mass_params(self, params): except Exception as e: raise ValueError(f"Entry #{i} in {_INPUTS} field has invalid params: {e}") from e - def _get_int(self, putative_int, name, minimum=1): - if putative_int is not None: - if type(putative_int) is not int: - raise ValueError(f"{name} must be an integer, got: {putative_int}") - if putative_int < minimum: - raise ValueError(f"{name} must be an integer >= {minimum}") - return putative_int - def _import_genbank_mass(self, params): workspace_id = params[_WSID] diff --git a/lib/GenomeFileUtil/core/GenomeInterface.py b/lib/GenomeFileUtil/core/GenomeInterface.py index def9a499..57cd8848 100644 --- a/lib/GenomeFileUtil/core/GenomeInterface.py +++ b/lib/GenomeFileUtil/core/GenomeInterface.py @@ -13,9 +13,14 @@ from installed_clients.DataFileUtilClient import DataFileUtil from installed_clients.WSLargeDataIOClient import WsLargeDataIO from GenomeFileUtil.core import GenomeUtils +from GenomeFileUtil.core.MiscUtils import get_int MAX_GENOME_SIZE = 2**30 +_WS = "workspace" +_WSID = "workspace_id" +_INPUTS = "inputs" + class GenomeInterface: def __init__(self, config): @@ -32,18 +37,80 @@ def __init__(self, config): self.scratch = config.raw['scratch'] self.ws_large_data = WsLargeDataIO(self.callback_url) - @staticmethod - def _validate_save_one_genome_params(params): + def save_one_genome(self, params): + print("validating parameters") + mass_params = self._set_up_single_params(params) + return self._save_genome_mass(mass_params)[0] + + def save_genome_mass(self, params): + print("validating parameters") + self._validate_mass_params(params) + return self._save_genome_mass(params) + + def _set_up_single_params(self, params): + inputs = dict(params) + self._validate_genome_input_params(inputs) + ws_id = get_int(inputs.pop(_WSID, None), _WSID) + ws_name = inputs.pop('workspace', 
None) + if bool(ws_id) == bool(ws_name): # xnor + raise ValueError(f"Exactly one of a '{_WSID}' or a '{_WS}' parameter must be provided") + if not ws_id: + print(f"Translating workspace name {ws_name} to a workspace ID. Prefer submitting " + + "a workspace ID over a mutable workspace name that may cause race conditions") + ws_id = self.dfu.ws_name_to_id(ws_name) + mass_params = {_WSID: ws_id, _INPUTS: [inputs]} + return mass_params + + def _validate_mass_params(self, params): + ws_id = get_int(params.get(_WSID), _WSID) + if not ws_id: + raise ValueError(f"{_WSID} is required") + inputs = params.get(_INPUTS) + if not inputs or type(inputs) != list: + raise ValueError(f"{_INPUTS} field is required and must be a non-empty list") + for i, inp in enumerate(inputs, start=1): + if type(inp) != dict: + raise ValueError(f"Entry #{i} in {_INPUTS} field is not a mapping as required") + try: + self._validate_genome_input_params(inp) + except Exception as e: + raise ValueError(f"Entry #{i} in {_INPUTS} field has invalid params: {e}") from e + + def _validate_genome_input_params(self, genome_input): """ - _validate_save_one_genome_params: - validates params passed to save_one_genome method + Check required parameters are in genome_input """ - logging.info('start validating save_one_genome params') + logging.info("start validating genome_input params") # check for required parameters - for p in ['workspace', 'name', 'data']: - if p not in params: - raise ValueError( - '"{}" parameter is required, but missing'.format(p)) + for p in ["name", "data"]: + if p not in genome_input: + raise ValueError(f"{p} parameter is required, but missing") + + def _save_genome_objects( + self, + workspace_id, + ws_datatypes, + data_paths, + names, + meta_data, + hidden_data, + ): + ws_inputs = [] + for ws_datatype, data_path, name, meta, hidden in zip( + ws_datatypes, data_paths, names, meta_data, hidden_data + ): + ws_inputs.append( + { + 'type': ws_datatype, + 'data_json_file': data_path, + 
'name': name, + 'meta': meta, + 'hidden': hidden, + } + ) + return self.ws_large_data.save_objects( + {'id': workspace_id, 'objects': ws_inputs} + ) def _check_shock_response(self, response, errtxt): """ @@ -128,54 +195,75 @@ def get_one_genome(self, params): return data, res['info'] # return self.dfu.get_objects(params)['data'][0] - def save_one_genome(self, params): - logging.info('start saving genome object') - self._validate_save_one_genome_params(params) - workspace = params['workspace'] - name = params['name'] - data = params['data'] - # XXX there is no `workspace_datatype` param in the spec - ws_datatype = params.get('workspace_datatype', "KBaseGenomes.Genome") - # XXX there is no `meta` param in the spec - meta = params.get('meta', {}) - if "AnnotatedMetagenomeAssembly" in ws_datatype: - if params.get('upgrade') or 'feature_counts' not in data: - data = self._update_metagenome(data) - else: - if params.get('upgrade') or 'feature_counts' not in data: - data = self._update_genome(data) - - # check all handles point to shock nodes owned by calling user - self._own_handle(data, 'genbank_handle_ref') - self._own_handle(data, 'gff_handle_ref') - if "AnnotatedMetagenomeAssembly" not in ws_datatype: - self._check_dna_sequence_in_features(data) - data['warnings'] = self.validate_genome(data) - - # sort data - data = GenomeUtils.sort_dict(data) - # dump genome to scratch for upload - data_path = os.path.join(self.scratch, name + ".json") - json.dump(data, open(data_path, 'w')) - if 'hidden' in params and str(params['hidden']).lower() in ('yes', 'true', 't', '1'): - hidden = 1 - else: - hidden = 0 - - if isinstance(workspace, int) or workspace.isdigit(): - workspace_id = workspace - else: - workspace_id = self.dfu.ws_name_to_id(workspace) - - save_params = {'id': workspace_id, - 'objects': [{'type': ws_datatype, - 'data_json_file': data_path, - 'name': name, - 'meta': meta, - 'hidden': hidden}]} - dfu_oi = self.ws_large_data.save_objects(save_params)[0] - returnVal = 
{'info': dfu_oi, 'warnings': data.get('warnings', [])} - return returnVal + def _save_genome_mass(self, params): + + workspace_id = params[_WSID] + inputs = params[_INPUTS] + + ws_datatypes = [] + data_paths = [] + names = [] + meta_data = [] + hidden_data = [] + warnings = [] + + for input_params in inputs: + + # retrive required params + name = input_params['name'] + data = input_params['data'] + + # XXX there is no `workspace_datatype` param in the spec + ws_datatype = input_params.get('workspace_datatype', "KBaseGenomes.Genome") + # XXX there is no `meta` param in the spec + meta = input_params.get('meta', {}) + + ws_datatypes.append(ws_datatype) + names.append(name) + meta_data.append(meta) + + if "AnnotatedMetagenomeAssembly" in ws_datatype: + if input_params.get('upgrade') or 'feature_counts' not in data: + data = self._update_metagenome(data) + else: + if input_params.get('upgrade') or 'feature_counts' not in data: + data = self._update_genome(data) + + # check all handles point to shock nodes owned by calling user + self._own_handle(data, 'genbank_handle_ref') + self._own_handle(data, 'gff_handle_ref') + if "AnnotatedMetagenomeAssembly" not in ws_datatype: + self._check_dna_sequence_in_features(data) + data['warnings'] = self.validate_genome(data) + + # sort data + data = GenomeUtils.sort_dict(data) + # dump genome to scratch for upload + data_path = os.path.join(self.scratch, name + ".json") + json.dump(data, open(data_path, 'w')) + if 'hidden' in params and str(params['hidden']).lower() in ('yes', 'true', 't', '1'): + hidden = 1 + else: + hidden = 0 + + data_paths.append(data_path) + hidden_data.append(hidden) + warnings.append(data.get('warnings', [])) + + dfu_infos = self._save_genome_objects( + workspace_id, + ws_datatypes, + data_paths, + names, + meta_data, + hidden_data, + ) + + output = [ + {'info': dfu_oi, 'warnings': warning} + for dfu_oi, warning in zip(dfu_infos, warnings) + ] + return output @staticmethod def determine_tier(source): diff 
--git a/lib/GenomeFileUtil/core/MiscUtils.py b/lib/GenomeFileUtil/core/MiscUtils.py index a34eb6f7..e99a7b71 100644 --- a/lib/GenomeFileUtil/core/MiscUtils.py +++ b/lib/GenomeFileUtil/core/MiscUtils.py @@ -12,3 +12,29 @@ def validate_lists_have_same_elements(l1, l2): diff = set(l1) ^ (set(l2)) # get the symmetric difference of the sets # check if all ids are shared return len(diff) == 0 + + +def get_int(putative_int, name, minimum=1): + """ + Validates and returns an integer value. + + This function checks whether the provided value is an integer and if it meets the specified minimum value. + If the checks are not passed, it raises a `ValueError` with a descriptive message. + + Args: + putative_int (int or None): The value to be validated and returned. If `None`, it will be returned as is. + name (str): A descriptive name for the value being checked. This is used in error messages. + minimum (int, optional): The minimum acceptable value for `putative_int`. Defaults to 1. + + Returns: + int: The validated integer if all checks are passed. + + Raises: + ValueError: If `putative_int` is not an integer, or if it is less than `minimum`. 
+ """ + if putative_int is not None: + if type(putative_int) is not int: + raise ValueError(f"{name} must be an integer, got: {putative_int}") + if putative_int < minimum: + raise ValueError(f"{name} must be an integer >= {minimum}") + return putative_int diff --git a/test/problematic_tests/save_genome_test.py b/test/problematic_tests/save_genome_test.py index 7a93d816..d2582d28 100644 --- a/test/problematic_tests/save_genome_test.py +++ b/test/problematic_tests/save_genome_test.py @@ -163,7 +163,7 @@ def test_bad_one_genome_params(self): invalidate_params = {'missing_workspace': 'workspace', 'name': 'name', 'data': 'data'} - error_msg = '"workspace" parameter is required, but missing' + error_msg = "Exactly one of a 'workspace_id' or a 'workspace' parameter must be provided" self.fail_save_one_genome(invalidate_params, error_msg) def test_one_genome(self): From a2fabf4f8dd9ecc5ec9be7f545c628f4f6244430 Mon Sep 17 00:00:00 2001 From: Sijie Date: Thu, 15 Aug 2024 18:22:14 -0700 Subject: [PATCH 02/24] fix positional arg #1 is the wrong type bug --- lib/GenomeFileUtil/core/GenbankToGenome.py | 2 +- lib/GenomeFileUtil/core/GenomeInterface.py | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/lib/GenomeFileUtil/core/GenbankToGenome.py b/lib/GenomeFileUtil/core/GenbankToGenome.py index 8342e284..3bf11b0f 100644 --- a/lib/GenomeFileUtil/core/GenbankToGenome.py +++ b/lib/GenomeFileUtil/core/GenbankToGenome.py @@ -223,7 +223,7 @@ def _save_genomes(self, workspace_id, genome_objs): results = [ self.gi.save_one_genome( { - 'workspace': workspace_id, + 'workspace_id': workspace_id, 'name': genome_obj.genome_name, 'data': genome_obj.genome_data, "meta": genome_obj.genome_meta, diff --git a/lib/GenomeFileUtil/core/GenomeInterface.py b/lib/GenomeFileUtil/core/GenomeInterface.py index 57cd8848..14152493 100644 --- a/lib/GenomeFileUtil/core/GenomeInterface.py +++ b/lib/GenomeFileUtil/core/GenomeInterface.py @@ -38,12 +38,10 @@ def __init__(self, config): 
self.ws_large_data = WsLargeDataIO(self.callback_url) def save_one_genome(self, params): - print("validating parameters") mass_params = self._set_up_single_params(params) return self._save_genome_mass(mass_params)[0] def save_genome_mass(self, params): - print("validating parameters") self._validate_mass_params(params) return self._save_genome_mass(params) @@ -51,7 +49,7 @@ def _set_up_single_params(self, params): inputs = dict(params) self._validate_genome_input_params(inputs) ws_id = get_int(inputs.pop(_WSID, None), _WSID) - ws_name = inputs.pop('workspace', None) + ws_name = inputs.pop(_WS, None) if bool(ws_id) == bool(ws_name): # xnor raise ValueError(f"Exactly one of a '{_WSID}' or a '{_WS}' parameter must be provided") if not ws_id: From 330d6c2da8a0dc2d57efffe690e8db2928455776 Mon Sep 17 00:00:00 2001 From: Sijie Date: Thu, 15 Aug 2024 19:58:38 -0700 Subject: [PATCH 03/24] use batch genome save in GenbankToGenome.py --- lib/GenomeFileUtil/core/GenbankToGenome.py | 24 +++++++++++----------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/lib/GenomeFileUtil/core/GenbankToGenome.py b/lib/GenomeFileUtil/core/GenbankToGenome.py index 3bf11b0f..606f83e3 100644 --- a/lib/GenomeFileUtil/core/GenbankToGenome.py +++ b/lib/GenomeFileUtil/core/GenbankToGenome.py @@ -202,7 +202,6 @@ def _import_genbank_mass(self, params): for genome_obj in genome_objs: shutil.rmtree(genome_obj.input_directory) - # TODO make an internal mass function save_genomes results = self._save_genomes(workspace_id, genome_objs) # return the result @@ -220,17 +219,18 @@ def _import_genbank_mass(self, params): return details def _save_genomes(self, workspace_id, genome_objs): - results = [ - self.gi.save_one_genome( - { - 'workspace_id': workspace_id, - 'name': genome_obj.genome_name, - 'data': genome_obj.genome_data, - "meta": genome_obj.genome_meta, - } - ) for genome_obj in genome_objs - ] - + results = self.gi.save_genome_mass( + { + "workspace_id": workspace_id, + "inputs": [ + { 
+ "name": genome_obj.genome_name, + "data": genome_obj.genome_data, + "meta": genome_obj.genome_meta, + } for genome_obj in genome_objs + ], + } + ) return results def _validate_params(self, params): From 27158ec45fcfdfb2883200e6500d2c2e34319979 Mon Sep 17 00:00:00 2001 From: Sijie Date: Fri, 16 Aug 2024 15:08:27 -0700 Subject: [PATCH 04/24] make save_genome_mass function internal --- GenomeFileUtil.spec | 19 - RELEASE_NOTES.md | 3 +- lib/GenomeFileUtil/GenomeFileUtilImpl.py | 438 +-------------------- lib/GenomeFileUtil/GenomeFileUtilServer.py | 4 - 4 files changed, 2 insertions(+), 462 deletions(-) diff --git a/GenomeFileUtil.spec b/GenomeFileUtil.spec index 5c06cff1..05cca1f9 100644 --- a/GenomeFileUtil.spec +++ b/GenomeFileUtil.spec @@ -332,25 +332,6 @@ module GenomeFileUtil { funcdef save_one_genome(SaveOneGenomeParams params) returns (SaveGenomeResult returnVal) authentication required; - typedef structure { - string name; - KBaseGenomes.Genome data; - boolean hidden; - boolean upgrade; - } GenomeInput; - - typedef structure { - int workspace_id; - list inputs; - } SaveGenomesParams; - - typedef structure { - list results; - } SaveGenomesResults; - - funcdef save_genomes(SaveGenomesParams params) - returns(SaveGenomesResults results) authentication required; - /* gff_file - object containing path to gff_file ws_ref - input Assembly or Genome reference diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md index 260b4aa4..91c21b83 100644 --- a/RELEASE_NOTES.md +++ b/RELEASE_NOTES.md @@ -6,8 +6,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
## [0.11.7] - TBD -- The `save_genomes` method was added to allow users to save genmes in batch -- Parsed and validated genome before upload +- The internal method `save_genome_mass` was added to facilitate the batch saving of genomes - Unusable `export_genome_features_protein_to_fasta` function was removed - The `genbanks_to_genomes` method was added to allow users to upload multiple genome objects at once diff --git a/lib/GenomeFileUtil/GenomeFileUtilImpl.py b/lib/GenomeFileUtil/GenomeFileUtilImpl.py index 98543d35..9ae1311d 100644 --- a/lib/GenomeFileUtil/GenomeFileUtilImpl.py +++ b/lib/GenomeFileUtil/GenomeFileUtilImpl.py @@ -69,7 +69,7 @@ class GenomeFileUtil: ######################################### noqa VERSION = "0.11.7" GIT_URL = "git@github.com:kbaseapps/GenomeFileUtil.git" - GIT_COMMIT_HASH = "4819598e65eea38d3c38d8c2ab808aca0d9697fd" + GIT_COMMIT_HASH = "330d6c2da8a0dc2d57efffe690e8db2928455776" #BEGIN_CLASS_HEADER #END_CLASS_HEADER @@ -1262,442 +1262,6 @@ def save_one_genome(self, ctx, params): # return the results return [returnVal] - def save_genomes(self, ctx, params): - """ - :param params: instance of type "SaveGenomesParams" -> structure: - parameter "workspace_id" of Long, parameter "inputs" of list of - type "GenomeInput" -> structure: parameter "name" of String, - parameter "data" of type "Genome" (Genome type -- annotated and - assembled genome data. Field descriptions: id - string - KBase - legacy data ID scientific_name - string - human readable species - name domain - string - human readable phylogenetic domain name - (eg. "Bacteria") warnings - list of string - genome-level warnings - generated in the annotation process genome_tiers - list of string - - controlled vocabulary (based on app input and checked by - GenomeFileUtil) A list of labels describing the data source for - this genome. 
Allowed values - Representative, Reference, - ExternalDB, User Tier assignments based on genome source: * All - phytozome - Representative and ExternalDB * Phytozome flagship - genomes - Reference, Representative and ExternalDB * Ensembl - - Representative and ExternalDB * RefSeq Reference - Reference, - Representative and ExternalDB * RefSeq Representative - - Representative and ExternalDB * RefSeq Latest or All Assemblies - folder - ExternalDB * User Data - User tagged feature_counts - map - of string to integer - total counts of each type of feature keys - are a controlled vocabulary of - "CDS", "gene", "misc_feature", - "misc_recomb", "mobile_element", "ncRNA" - 72, - "non_coding_features", "non_coding_genes", - "protein_encoding_gene", "rRNA", "rep_origin", "repeat_region", - "tRNA" genetic_code - int - An NCBI-assigned taxonomic category - for the organism See here - - https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi dna_size - - integer - total number of nucleotides num_contigs - integer - - total number of contigs in the genome molecule_type - string - - controlled vocab - the type of molecule sequenced Possible values - are "Unknown", "DNA", "RNA", "genomic DNA", "genomic RNA", "mRNA", - "tRNA", "rRNA", "other RNA", "other DNA", "transcribed RNA", - "viral cRNA", "unassigned DNA", "unassigned RNA" contig_lengths - - list of int - nucleotide length of each contig in the genome - Indexes in this list correspond to indexes in the `contig_ids` - list. contig_ids - list of str - external database identifiers for - each contig (eg. "NC_000913.3") source - str - controlled vocab - - descriptor of where this data came from (eg. "RefSeq") Allowed - entries RefSeq, Ensembl, Phytozome, RAST, Prokka, User_upload - source_id - string - identifier of this genome from the source - database (eg. 
the RefSeq ID such as "NC_000913") md5 - string - - checksum of the underlying assembly sequence taxonomy - string - - semicolon-delimited taxonomy lineage, in order of parent to child - taxon_assignments - mapping of taxonomy namespace to taxon ID. - example - {"ncbi": "286", "gtdb": "s__staphylococcus_devriesei"} - gc_content - float - ratio of GC count to AT in the genome - publications - tuple of (pubmedid, source, title, web_addr, year, - authors, journal). See typedef above. ontology_events - A record - of the service and method used for a set of ontology assignments - on the genome. ontologies_present - a mapping of ontology source - id (eg. "GO") to a mapping of term IDs (eg "GO:16209") to term - names (eg. "histidine biosynthetic process"). features - array of - Feature - protein coding genes (see the separate Feature spec) - cdss - array of protein-coding sequences mrnas - array of - transcribed messenger RNA sequences (equal to cdss plus 5' and 3' - UTRs) non_coding_features - array of features that does not - include mRNA, CDS, and protein-encoding genes assembly_ref - - workspace reference to an assembly object from which this - annotated genome was derived. taxon_ref - workspace reference to a - taxon object that classifies the species or strain of this genome. - genbank_handle_ref - file server handle reference to the source - genbank file for this genome. gff_handle_ref - file server handle - reference to the source GFF file for this genome. - external_source_origination_date - TODO look at GFU for this - release - string - User-supplied release or version of the source - data. This most likely will come from an input field in the import - app. original_source_file_name - filename from which this genome - was derived (eg. genbank or gff filename). 
notes - TODO - quality_scores - TODO suspect - bool - flag of whether this - annotation is problematic due to some warning genome_type - string - - controlled vocab - One of "draft isolate", "finished isolate", - "mag", "sag", "virus", "plasmid", "construct" Features vs. coding - sequences: a feature is a sequence in the DNA that codes for a - protein, including non-transcribed introns. A coding sequence - (stored as `cdss`) includes **only** the sections of the feature - that codes for a protein, minus introns and UTRs. @optional - warnings contig_lengths contig_ids source_id taxonomy publications - @optional ontology_events ontologies_present non_coding_features - mrnas genome_type @optional genbank_handle_ref gff_handle_ref - external_source_origination_date @optional release - original_source_file_name notes quality_scores suspect - assembly_ref @optional taxon_ref taxon_assignments @metadata ws - gc_content as GC content @metadata ws taxonomy as Taxonomy - @metadata ws md5 as MD5 @metadata ws dna_size as Size @metadata ws - genetic_code as Genetic code @metadata ws domain as Domain - @metadata ws source_id as Source ID @metadata ws source as Source - @metadata ws scientific_name as Name @metadata ws genome_type as - Type @metadata ws length(features) as Number of Protein Encoding - Genes @metadata ws length(cdss) as Number of CDS @metadata ws - assembly_ref as Assembly Object @metadata ws num_contigs as Number - contigs @metadata ws length(warnings) as Number of Genome Level - Warnings @metadata ws suspect as Suspect Genome) -> structure: - parameter "id" of type "Genome_id" (KBase legacy data ID @id kb), - parameter "scientific_name" of String, parameter "domain" of - String, parameter "warnings" of list of String, parameter - "genome_tiers" of list of String, parameter "feature_counts" of - mapping from String to Long, parameter "genetic_code" of Long, - parameter "dna_size" of Long, parameter "num_contigs" of Long, - parameter "molecule_type" of String, 
parameter "contig_lengths" of - list of Long, parameter "contig_ids" of list of String, parameter - "source" of String, parameter "source_id" of type "source_id" - (Reference to a source_id @id external), parameter "md5" of - String, parameter "taxonomy" of String, parameter - "taxon_assignments" of mapping from String to String, parameter - "gc_content" of Double, parameter "publications" of list of type - "publication" (Structure for a publication Elements: (0) pubmedid - - float (1) source - string - (ex. Pubmed) (2) title - string (3) - string web address - string (4) publication year - string (5) - authors - string (6) journal - string) -> tuple of size 7: - parameter "pubmedid" of Double, parameter "source" of String, - parameter "title" of String, parameter "url" of String, parameter - "year" of String, parameter "authors" of String, parameter - "journal" of String, parameter "ontology_events" of list of type - "Ontology_event" (@optional ontology_ref method_version eco) -> - structure: parameter "id" of String, parameter "ontology_ref" of - type "Ontology_ref" (Reference to a ontology object @id ws - KBaseOntology.OntologyDictionary), parameter "method" of String, - parameter "method_version" of String, parameter "timestamp" of - String, parameter "eco" of String, parameter "ontologies_present" - of mapping from String to mapping from String to String, parameter - "features" of list of type "Feature" (Structure for a single CDS - encoding "gene" of a genome ONLY PUT GENES THAT HAVE A - CORRESPONDING CDS IN THIS ARRAY NOTE: Sequence is optional. - Ideally we can keep it in here, but Recognize due to space - constraints another solution may be needed. We may want to add - additional fields for other CDM functions (e.g., atomic regulons, - coexpressed fids, co_occurring fids,...) 
- protein_translation_length and protein_translation are for longest - coded protein (representative protein for splice variants) NOTE: - New Aliases field definitely breaks compatibility. As Does - Function. flags are flag fields in GenBank format. This will be a - controlled vocabulary. Initially Acceptable values are pseudo, - ribosomal_slippage, and trans_splicing Md5 is the md5 of - dna_sequence. @optional functions ontology_terms note - protein_translation mrnas flags warnings @optional inference_data - dna_sequence aliases db_xrefs children functional_descriptions) -> - structure: parameter "id" of type "Feature_id" (KBase Feature ID - @id external), parameter "location" of list of tuple of size 4: - type "Contig_id" (ContigSet contig ID @id external), Long, String, - Long, parameter "functions" of list of String, parameter - "functional_descriptions" of list of String, parameter - "ontology_terms" of mapping from String to mapping from String to - list of Long, parameter "note" of String, parameter "md5" of - String, parameter "protein_translation" of String, parameter - "protein_translation_length" of Long, parameter "cdss" of list of - String, parameter "mrnas" of list of String, parameter "children" - of list of String, parameter "flags" of list of String, parameter - "warnings" of list of String, parameter "inference_data" of list - of type "InferenceInfo" (Type spec for the "InferenceInfo" object. 
- TODO docs Found in the `inference_data` fields in mRNAs and CDSs - Fields: category - string - TODO type - string - TODO evidence - - string - TODO) -> structure: parameter "category" of String, - parameter "type" of String, parameter "evidence" of String, - parameter "dna_sequence" of String, parameter - "dna_sequence_length" of Long, parameter "aliases" of list of - tuple of size 2: parameter "fieldname" of String, parameter - "alias" of String, parameter "db_xrefs" of list of tuple of size - 2: parameter "db_source" of String, parameter "db_identifier" of - String, parameter "non_coding_features" of list of type - "NonCodingFeature" (Structure for a single feature that is NOT one - of the following: - Protein encoding gene (gene that has a - corresponding CDS) - mRNA - CDS Note pseudo-genes and Non protein - encoding genes are put into this flags are flag fields in GenBank - format. This will be a controlled vocabulary. Initially Acceptable - values are pseudo, ribosomal_slippage, and trans_splicing Md5 is - the md5 of dna_sequence. @optional functions ontology_terms note - flags warnings functional_descriptions @optional inference_data - dna_sequence aliases db_xrefs children parent_gene) -> structure: - parameter "id" of type "Feature_id" (KBase Feature ID @id - external), parameter "location" of list of tuple of size 4: type - "Contig_id" (ContigSet contig ID @id external), Long, String, - Long, parameter "type" of String, parameter "functions" of list of - String, parameter "functional_descriptions" of list of String, - parameter "ontology_terms" of mapping from String to mapping from - String to list of Long, parameter "note" of String, parameter - "md5" of String, parameter "parent_gene" of String, parameter - "children" of list of String, parameter "flags" of list of String, - parameter "warnings" of list of String, parameter "inference_data" - of list of type "InferenceInfo" (Type spec for the "InferenceInfo" - object. 
TODO docs Found in the `inference_data` fields in mRNAs - and CDSs Fields: category - string - TODO type - string - TODO - evidence - string - TODO) -> structure: parameter "category" of - String, parameter "type" of String, parameter "evidence" of - String, parameter "dna_sequence" of String, parameter - "dna_sequence_length" of Long, parameter "aliases" of list of - tuple of size 2: parameter "fieldname" of String, parameter - "alias" of String, parameter "db_xrefs" of list of tuple of size - 2: parameter "db_source" of String, parameter "db_identifier" of - String, parameter "cdss" of list of type "CDS" (Structure for a - single coding sequence. Coding sequences are the sections of a - feature's sequence that are translated to a protein (minus introns - and UTRs). Fields: id - string - identifier of the coding - sequence, such as "b0001_CDS_1" location - list> - list of locations from where this sequence - originates in the original assembly. Each sub-sequence in the list - constitutes a section of the resulting CDS. The first element in - the tuple corresponds to the "contig_id", such as "NC_000913.3". - The second element in the tuple is an index in the contig of where - the sequence starts. The third element is either a plus or minus - sign indicating whether it is on the 5' to 3' leading strand ("+") - or on the 3' to 5' lagging strand ("-"). The last element is the - length of the sub-sequence. For a location on the leading strand - (denoted by "+"), the index is of the leftmost base, and the - sequence extends to the right. For a location on the lagging - strand (denoted by "-"), the index is of the rightmost base, and - the sequence extends to the left. NOTE: the last element in each - tuple is the *length* of each sub-sequence. If you have a location - such as ("xyz", 100, "+", 50), then your sequence will go from - index 100 to index 149 (this has a length of 50). It *does not* go - from index 100 to index 150, as that would have a length of 51. 
- Likewise, if you have the location ("xyz", 100, "-", 50), then the - sequence extends from 100 down to 51, which has a length of 50 - bases. It does not go from index 100 to 50, as that would have a - length of 51. md5 - string - md5 of the dna sequence - TODO - clarification protein_md5 - string - hash of the protein sequence - that this CDS encodes parent_gene - string - gene (feature) from - which this CDS comes from, including introns and UTRs that have - been removed to create this CDS. parent_mrna - string - mRNA - sequence from which this sequence is derived, including UTRs but - not introns. note - string - TODO functions - list - list - of protein products or chemical processes that this sequence - creates, facilitates, or influences. functional_descriptions - - list - TODO list of protein products or chemical processes - that sequence creates, facilitates, or influences. ontology_terms - - mapping>> - a mapping of - ontology source id (eg. "GO") to a mapping of term IDs (eg - "GO:16209") to a list of indexes into the ontology_events data - (found in the top level of the genome object). The index into an - ontology event indicates what service and method created this term - assignment. flags - list - (controlled vocab) fields from - the genbank source. A common example is "pseudo" for pseudo-genes - that do not encode proteins, which shows up as "/pseudo" in the - genbank. Values can be: "pseudo", "ribosomal_slippage", - "trans_splicing" warnings - list - TODO inference_data - - list - TODO protein_translation - string - amino - acid sequence that this CDS gets translated into. - protein_translation_length - int - length of the above aliases - - list<(string, string)> - alternative list of names or identifiers - eg: [["gene", "thrA"], ["locus_tag", "b0002"]] db_xrefs - - list<(string, string)> - Identifiers from other databases - (database cross-references) The first string is the database name, - the second is the database identifier. 
eg: [["ASAP", - "ABE-0000006"], ["EcoGene", "EG11277"]] dna_sequence - string - - sequence of exons from the genome that constitute this protein - encoding sequence. dna_sequence_length - int - length of the above - @optional parent_gene parent_mrna functions ontology_terms note - flags warnings @optional inference_data dna_sequence aliases - db_xrefs functional_descriptions) -> structure: parameter "id" of - type "cds_id" (KBase CDS ID @id external), parameter "location" of - list of tuple of size 4: type "Contig_id" (ContigSet contig ID @id - external), Long, String, Long, parameter "md5" of String, - parameter "protein_md5" of String, parameter "parent_gene" of type - "Feature_id" (KBase Feature ID @id external), parameter - "parent_mrna" of type "mrna_id" (KBase mRNA ID @id external), - parameter "note" of String, parameter "functions" of list of - String, parameter "functional_descriptions" of list of String, - parameter "ontology_terms" of mapping from String to mapping from - String to list of Long, parameter "flags" of list of String, - parameter "warnings" of list of String, parameter "inference_data" - of list of type "InferenceInfo" (Type spec for the "InferenceInfo" - object. 
TODO docs Found in the `inference_data` fields in mRNAs - and CDSs Fields: category - string - TODO type - string - TODO - evidence - string - TODO) -> structure: parameter "category" of - String, parameter "type" of String, parameter "evidence" of - String, parameter "protein_translation" of String, parameter - "protein_translation_length" of Long, parameter "aliases" of list - of tuple of size 2: parameter "fieldname" of String, parameter - "alias" of String, parameter "db_xrefs" of list of tuple of size - 2: parameter "db_source" of String, parameter "db_identifier" of - String, parameter "dna_sequence" of String, parameter - "dna_sequence_length" of Long, parameter "mrnas" of list of type - "mRNA" (The mRNA is the transcribed sequence from the original - feature, minus the introns, but including the UTRs. Fields: id - - string - identifying string for the mRNA location - - list<tuple<Contig_id, int, string, int>> - list of locations from - where this sequence originates in the original assembly. Each - sub-sequence in the list constitutes a section of the resulting - CDS. The first element in the tuple corresponds to the - "contig_id", such as "NC_000913.3". The second element in the - tuple is an index in the contig of where the sequence starts. The - third element is either a plus or minus sign indicating whether it - is on the 5' to 3' leading strand ("+") or on the 3' to 5' lagging - strand ("-"). The last element is the length of the sub-sequence. - For a location on the leading strand (denoted by "+"), the index - is of the leftmost base, and the sequence extends to the right. - For a location on the lagging strand (denoted by "-"), the index - is of the rightmost base, and the sequence extends to the left. - NOTE: the last element in each tuple is the *length* of each - sub-sequence. If you have a location such as ("xyz", 100, "+", - 50), then your sequence will go from index 100 to index 149 (this - has a length of 50). 
It *does not* go from index 100 to index 150, - as that would have a length of 51. Likewise, if you have the - location ("xyz", 100, "-", 50), then the sequence extends from 100 - down to 51, which has a length of 50 bases. It does not go from - index 100 to 50, as that would have a length of 51. md5 - string - - md5 of the dna sequence - TODO clarification parent_gene - - Feature_id - corresponding feature for this sequence, including - introns and UTRs cds - string - corresponding coding sequence for - this mRNA (the sequence minus UTRs) dna_sequence - string - - sequence of UTRs and exons from the genome that constitute this - mRNA dna_sequence_length - int - length of the above note - string - - TODO functions - list<string> - TODO list of protein products or - chemical processes that sequence creates, facilitates, or - influences. functional_descriptions - list<string> - TODO list of - protein products or chemical processes that sequence creates, - facilitates, or influences. ontology_terms - mapping<string, mapping<string, list<int>>> - a mapping of ontology source id (eg. - "GO") to a mapping of term IDs (eg "GO:16209") to a list of - indexes into the ontology_events data (found in the top level of - the genome object). The index into an ontology event indicates - what service and method created this term assignment. flags - - list<string> - controlled vocab - fields from the genbank source. - A common example is "pseudo" for pseudo-genes that do not encode - proteins, which shows up as "/pseudo" in the genbank. Values can - be: "pseudo", "ribosomal_slippage", "trans_splicing" warnings - - list<string> - TODO inference_data - list<InferenceInfo> - TODO - aliases - list<(string, string)> - alternative list of names or - identifiers eg: [["gene", "thrA"], ["locus_tag", "b0002"]] - db_xrefs - list<(string, string)> - Identifiers from other - databases (database cross-references). The first string is the - database name, the second is the database identifier. 
eg: - [["ASAP", "ABE-0000006"], ["EcoGene", "EG11277"]] @optional - parent_gene cds functions ontology_terms note flags warnings - @optional inference_data dna_sequence aliases db_xrefs - functional_descriptions) -> structure: parameter "id" of type - "mrna_id" (KBase mRNA ID @id external), parameter "location" of - list of tuple of size 4: type "Contig_id" (ContigSet contig ID @id - external), Long, String, Long, parameter "md5" of String, - parameter "parent_gene" of type "Feature_id" (KBase Feature ID @id - external), parameter "cds" of type "cds_id" (KBase CDS ID @id - external), parameter "dna_sequence" of String, parameter - "dna_sequence_length" of Long, parameter "note" of String, - parameter "functions" of list of String, parameter - "functional_descriptions" of list of String, parameter - "ontology_terms" of mapping from String to mapping from String to - list of Long, parameter "flags" of list of String, parameter - "warnings" of list of String, parameter "inference_data" of list - of type "InferenceInfo" (Type spec for the "InferenceInfo" object. 
- TODO docs Found in the `inference_data` fields in mRNAs and CDSs - Fields: category - string - TODO type - string - TODO evidence - - string - TODO) -> structure: parameter "category" of String, - parameter "type" of String, parameter "evidence" of String, - parameter "aliases" of list of tuple of size 2: parameter - "fieldname" of String, parameter "alias" of String, parameter - "db_xrefs" of list of tuple of size 2: parameter "db_source" of - String, parameter "db_identifier" of String, parameter - "assembly_ref" of type "Assembly_ref" (Reference to an Assembly - object in the workspace @id ws KBaseGenomeAnnotations.Assembly), - parameter "taxon_ref" of type "Taxon_ref" (Reference to a taxon - object @id ws KBaseGenomeAnnotations.Taxon), parameter - "genbank_handle_ref" of type "genbank_handle_ref" (Reference to a - handle to the Genbank file on shock @id handle), parameter - "gff_handle_ref" of type "gff_handle_ref" (Reference to a handle - to the GFF file on shock @id handle), parameter - "external_source_origination_date" of String, parameter "release" - of String, parameter "original_source_file_name" of String, - parameter "notes" of String, parameter "quality_scores" of list of - type "GenomeQualityScore" (Genome quality score Fields: method - - string - TODO method_report_ref - string - TODO method_version - - string - TODO score: string - TODO score_interpretation - string - - TODO timestamp - string - TODO Score_interpretation - - fraction_complete - controlled vocabulary managed by API @optional - method_report_ref method_version) -> structure: parameter "method" - of String, parameter "method_report_ref" of type - "Method_report_ref" (Reference to a report object @id ws - KBaseReport.Report), parameter "method_version" of String, - parameter "score" of String, parameter "score_interpretation" of - String, parameter "timestamp" of String, parameter "suspect" of - type "Bool", parameter "genome_type" of String, parameter "hidden" - of type "boolean" (A 
boolean - 0 for false, 1 for true. @range (0, - 1)), parameter "upgrade" of type "boolean" (A boolean - 0 for - false, 1 for true. @range (0, 1)) - :returns: instance of type "SaveGenomesResults" -> structure: - parameter "results" of list of type "SaveGenomeResult" -> - structure: parameter "info" of type "object_info" (Information - about an object, including user provided metadata. obj_id objid - - the numerical id of the object. obj_name name - the name of the - object. type_string type - the type of the object. timestamp - save_date - the save date of the object. obj_ver ver - the version - of the object. username saved_by - the user that saved or copied - the object. ws_id wsid - the workspace containing the object. - ws_name workspace - the workspace containing the object. string - chsum - the md5 checksum of the object. int size - the size of the - object in bytes. usermeta meta - arbitrary user-supplied metadata - about the object.) -> tuple of size 11: parameter "objid" of type - "obj_id" (The unique, permanent numerical ID of an object.), - parameter "name" of type "obj_name" (A string used as a name for - an object. Any string consisting of alphanumeric characters and - the characters |._- that is not an integer is acceptable.), - parameter "type" of type "type_string" (A type string. Specifies - the type and its version in a single string in the format - [module].[typename]-[major].[minor]: module - a string. The module - name of the typespec containing the type. typename - a string. The - name of the type as assigned by the typedef statement. major - an - integer. The major version of the type. A change in the major - version implies the type has changed in a non-backwards compatible - way. minor - an integer. The minor version of the type. A change - in the minor version implies that the type has changed in a way - that is backwards compatible with previous type definitions. 
In - many cases, the major and minor versions are optional, and if not - provided the most recent version will be used. Example: - MyModule.MyType-3.1), parameter "save_date" of type "timestamp" (A - time in the format YYYY-MM-DDThh:mm:ssZ, where Z is either the - character Z (representing the UTC timezone) or the difference in - time to UTC in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500 - (EST time) 2013-04-03T08:56:32+0000 (UTC time) - 2013-04-03T08:56:32Z (UTC time)), parameter "version" of Long, - parameter "saved_by" of type "username" (Login name of a KBase - user account.), parameter "wsid" of type "ws_id" (The unique, - permanent numerical ID of a workspace.), parameter "workspace" of - type "ws_name" (A string used as a name for a workspace. Any - string consisting of alphanumeric characters and "_", ".", or "-" - that is not an integer is acceptable. The name may optionally be - prefixed with the workspace owner's user name and a colon, e.g. - kbasetest:my_workspace.), parameter "chsum" of String, parameter - "size" of Long, parameter "meta" of type "usermeta" (User provided - metadata about an object. Arbitrary key-value pairs provided by - the user.) -> mapping from String to String - """ - # ctx is the context object - # return variables are: results - #BEGIN save_genomes - results = { - "results": GenomeInterface(self.cfg).save_genome_mass(params) - } - #END save_genomes - - # At some point might do deeper type checking... - if not isinstance(results, dict): - raise ValueError('Method save_genomes return value ' + - 'results is not type dict as required.') - # return the results - return [results] - def ws_obj_gff_to_genome(self, ctx, params): """ This function takes in a workspace object of type KBaseGenomes.Genome or KBaseGenomeAnnotations.Assembly and a gff file and produces a KBaseGenomes.Genome reanotated according to the the input gff file. 
diff --git a/lib/GenomeFileUtil/GenomeFileUtilServer.py b/lib/GenomeFileUtil/GenomeFileUtilServer.py index 2c09c2e0..6ae2c611 100644 --- a/lib/GenomeFileUtil/GenomeFileUtilServer.py +++ b/lib/GenomeFileUtil/GenomeFileUtilServer.py @@ -394,10 +394,6 @@ def __init__(self): name='GenomeFileUtil.save_one_genome', types=[dict]) self.method_authentication['GenomeFileUtil.save_one_genome'] = 'required' # noqa - self.rpc_service.add(impl_GenomeFileUtil.save_genomes, - name='GenomeFileUtil.save_genomes', - types=[dict]) - self.method_authentication['GenomeFileUtil.save_genomes'] = 'required' # noqa self.rpc_service.add(impl_GenomeFileUtil.ws_obj_gff_to_genome, name='GenomeFileUtil.ws_obj_gff_to_genome', types=[dict]) From 04aa20302e6deee46b1a318330dd884f167a5a6f Mon Sep 17 00:00:00 2001 From: Sijie Date: Fri, 16 Aug 2024 17:33:16 -0700 Subject: [PATCH 05/24] add tests for save_genome_mass function --- test/problematic_tests/save_genome_test.py | 85 +++++++++++++++++++++- 1 file changed, 82 insertions(+), 3 deletions(-) diff --git a/test/problematic_tests/save_genome_test.py b/test/problematic_tests/save_genome_test.py index d2582d28..93b2ea31 100644 --- a/test/problematic_tests/save_genome_test.py +++ b/test/problematic_tests/save_genome_test.py @@ -92,6 +92,7 @@ def setUpClass(cls): suffix = int(time.time() * 1000) cls.wsName = "test_SaveGenomeTest_" + str(suffix) cls.wsClient.create_workspace({'workspace': cls.wsName}) + cls.wsID = cls.dfu.ws_name_to_id(cls.wsName) cls.nodes_to_delete = [] cls.prepare_data() @@ -142,9 +143,12 @@ def start_test(self): testname = inspect.stack()[1][3] print(('\n*** starting test: ' + testname + ' **')) - def fail_save_one_genome(self, params, error, exception=ValueError, contains=False): + def fail_save_genome(self, params, error, exception=ValueError, contains=False, mass=False): with self.assertRaises(exception) as context: - self.getImpl().save_one_genome(self.ctx, params) + if mass: + self.genome_interface.save_genome_mass(params) + 
else: + self.getImpl().save_one_genome(self.ctx, params) if contains: self.assertIn(error, str(context.exception)) else: @@ -164,7 +168,7 @@ def test_bad_one_genome_params(self): 'name': 'name', 'data': 'data'} error_msg = "Exactly one of a 'workspace_id' or a 'workspace' parameter must be provided" - self.fail_save_one_genome(invalidate_params, error_msg) + self.fail_save_genome(invalidate_params, error_msg) def test_one_genome(self): self.start_test() @@ -192,6 +196,81 @@ def test_one_genome_with_hidden(self): ret = self.getImpl().save_one_genome(self.ctx, params)[0] self.check_save_one_genome_output(ret, genome_name) + def test_genomes(self): + self.start_test() + genome_name = 'test_genome' + inputs = [ + { + 'name': genome_name, + 'data': self.test_genome_data, + } + ] + params = {'workspace_id': self.wsID, 'inputs': inputs} + ret = self.genome_interface.save_genome_mass(params)[0] + self.check_save_one_genome_output(ret, genome_name) + + def test_genomes_with_hidden(self): + self.start_test() + genome_name = 'test_genome_hidden' + inputs = [ + { + 'name': genome_name, + 'data': self.test_genome_data, + 'hidden': 1, + } + ] + params = {'workspace_id': self.wsID, 'inputs': inputs} + ret = self.genome_interface.save_genome_mass(params)[0] + self.check_save_one_genome_output(ret, genome_name) + + inputs = [ + { + 'name': genome_name, + 'data': self.test_genome_data, + 'hidden': True, + } + ] + params = {'workspace_id': self.wsID, 'inputs': inputs} + ret = self.genome_interface.save_genome_mass(params)[0] + self.check_save_one_genome_output(ret, genome_name) + + def test_bad_genomes_params_missing_wsid(self): + self.start_test() + invalidate_params = { + 'missing_workspace_id': 'workspace_id', + 'name': 'name', + 'data': 'data', + } + error_msg = "workspace_id is required" + self.fail_save_genome(invalidate_params, error_msg, mass=True) + + def test_bad_genomes_params_empty_inputs(self): + self.start_test() + invalidate_params = { + 'workspace_id': self.wsID, + 
'inputs': [] + } + error_msg = "inputs field is required and must be a non-empty list" + self.fail_save_genome(invalidate_params, error_msg, mass=True) + + def test_bad_genomes_params_invalidate_entry_type(self): + self.start_test() + invalidate_params = { + 'workspace_id': self.wsID, + 'inputs': [['name', 'data']], + } + error_msg = "Entry #1 in inputs field is not a mapping as required" + self.fail_save_genome(invalidate_params, error_msg, mass=True) + + def test_bad_genomes_params_missing_parameter(self): + self.start_test() + invalidate_params = { + 'workspace_id': self.wsID, + 'inputs': [{'data': 'data'}], + } + error_msg = "Entry #1 in inputs field has invalid params: name parameter is required, but missing" + self.fail_save_genome(invalidate_params, error_msg, mass=True) + def test_GenomeInterface_check_dna_sequence_in_features(self): # no feature in genome genome = {'missing_features': 'features'} From 8d672d17d45ff7dfb0150d3c4fc080a5bca43b16 Mon Sep 17 00:00:00 2001 From: Sijie Date: Fri, 16 Aug 2024 22:40:31 -0700 Subject: [PATCH 06/24] fix bug --- lib/GenomeFileUtil/core/GenomeInterface.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/GenomeFileUtil/core/GenomeInterface.py b/lib/GenomeFileUtil/core/GenomeInterface.py index 14152493..990a19a8 100644 --- a/lib/GenomeFileUtil/core/GenomeInterface.py +++ b/lib/GenomeFileUtil/core/GenomeInterface.py @@ -239,7 +239,7 @@ def _save_genome_mass(self, params): # dump genome to scratch for upload data_path = os.path.join(self.scratch, name + ".json") json.dump(data, open(data_path, 'w')) - if 'hidden' in params and str(params['hidden']).lower() in ('yes', 'true', 't', '1'): + if 'hidden' in input_params and str(input_params['hidden']).lower() in ('yes', 'true', 't', '1'): hidden = 1 else: hidden = 0 From d1dd768292bfff29067c45e34e2c6ad0c15f2e21 Mon Sep 17 00:00:00 2001 From: Sijie Date: Mon, 26 Aug 2024 14:42:37 -0700 Subject: [PATCH 07/24] update release notes && make the dicts in the loop 
--- RELEASE_NOTES.md | 2 +- lib/GenomeFileUtil/core/GenomeInterface.py | 31 +++++++++++----------- 2 files changed, 17 insertions(+), 16 deletions(-) diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md index 91c21b83..c0e69fda 100644 --- a/RELEASE_NOTES.md +++ b/RELEASE_NOTES.md @@ -6,7 +6,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [0.11.7] - TBD -- The internal method `save_genome_mass` was added to facilitate the batch saving of genomes +- Genomes are now saved in batches to the workspace - Unusable `export_genome_features_protein_to_fasta` function was removed - The `genbanks_to_genomes` method was added to allow users to upload multiple genome objects at once diff --git a/lib/GenomeFileUtil/core/GenomeInterface.py b/lib/GenomeFileUtil/core/GenomeInterface.py index 990a19a8..66f6bd0a 100644 --- a/lib/GenomeFileUtil/core/GenomeInterface.py +++ b/lib/GenomeFileUtil/core/GenomeInterface.py @@ -93,22 +93,23 @@ def _save_genome_objects( meta_data, hidden_data, ): - ws_inputs = [] - for ws_datatype, data_path, name, meta, hidden in zip( - ws_datatypes, data_paths, names, meta_data, hidden_data - ): - ws_inputs.append( - { - 'type': ws_datatype, - 'data_json_file': data_path, - 'name': name, - 'meta': meta, - 'hidden': hidden, - } - ) - return self.ws_large_data.save_objects( - {'id': workspace_id, 'objects': ws_inputs} + dfu_infos = self.ws_large_data.save_objects( + { + 'id': workspace_id, + 'objects': [ + { + 'type': ws_datatype, + 'data_json_file': data_path, + 'name': name, + 'meta': meta, + 'hidden': hidden, + } for ws_datatype, data_path, name, meta, hidden in zip( + ws_datatypes, data_paths, names, meta_data, hidden_data + ) + ] + } ) + return dfu_infos def _check_shock_response(self, response, errtxt): """ From e79c297ab6522ee00faceb4bfb531d232dbb5b75 Mon Sep 17 00:00:00 2001 From: Sijie Date: Mon, 26 Aug 2024 15:23:53 -0700 
Subject: [PATCH 08/24] remove logging && add NOTE for workspace_datatype --- lib/GenomeFileUtil/core/GenomeInterface.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/GenomeFileUtil/core/GenomeInterface.py b/lib/GenomeFileUtil/core/GenomeInterface.py index 66f6bd0a..321ae71b 100644 --- a/lib/GenomeFileUtil/core/GenomeInterface.py +++ b/lib/GenomeFileUtil/core/GenomeInterface.py @@ -78,7 +78,6 @@ def _validate_genome_input_params(self, genome_input): """ Check required parameters are in genome_input """ - logging.info("start validating genome_input params") # check for required parameters for p in ["name", "data"]: if p not in genome_input: @@ -213,6 +212,7 @@ def _save_genome_mass(self, params): data = input_params['data'] # XXX there is no `workspace_datatype` param in the spec + # NOTE: The method caller should not be able to choose an arbitrary workspace type ws_datatype = input_params.get('workspace_datatype', "KBaseGenomes.Genome") # XXX there is no `meta` param in the spec meta = input_params.get('meta', {}) From 35030a03fe3271681dca2fd9c924ac59ba501354 Mon Sep 17 00:00:00 2001 From: Sijie Date: Wed, 28 Aug 2024 15:24:54 -0700 Subject: [PATCH 09/24] move set_up_single_params && validate_mass_params into GenomeUtils --- lib/GenomeFileUtil/core/GenbankToGenome.py | 40 ++-------- lib/GenomeFileUtil/core/GenomeInterface.py | 48 +++-------- lib/GenomeFileUtil/core/GenomeUtils.py | 93 +++++++++++++++++++++- test/problematic_tests/save_genome_test.py | 72 ++++++++--------- 4 files changed, 146 insertions(+), 107 deletions(-) diff --git a/lib/GenomeFileUtil/core/GenbankToGenome.py b/lib/GenomeFileUtil/core/GenbankToGenome.py index 606f83e3..ffdd55c3 100644 --- a/lib/GenomeFileUtil/core/GenbankToGenome.py +++ b/lib/GenomeFileUtil/core/GenbankToGenome.py @@ -19,11 +19,11 @@ from installed_clients.AssemblyUtilClient import AssemblyUtil from installed_clients.DataFileUtilClient import DataFileUtil from GenomeFileUtil.core.GenomeInterface import 
GenomeInterface -from GenomeFileUtil.core.MiscUtils import get_int from installed_clients.WorkspaceClient import Workspace from GenomeFileUtil.core.GenomeUtils import ( is_parent, propagate_cds_props_to_gene, warnings, parse_inferences, - load_ontology_mappings, set_taxon_data, set_default_taxon_data + load_ontology_mappings, set_taxon_data, set_default_taxon_data, + set_up_single_params, validate_mass_params ) MAX_MISC_FEATURE_SIZE = 10000 @@ -114,44 +114,16 @@ def __init__(self, config): def import_genbank(self, params): print('validating parameters') - mass_params = self._set_up_single_params(params) + mass_params = set_up_single_params( + params, _WSNAME, self._validate_params, self.dfu.ws_name_to_id + ) return self._import_genbank_mass(mass_params)[0] def import_genbank_mass(self, params): print('validating parameters') - self._validate_mass_params(params) + validate_mass_params(params, self._validate_params) return self._import_genbank_mass(params) - def _set_up_single_params(self, params): - # avoid side effects and keep variables in params unmodfied - inputs = dict(params) - self._validate_params(inputs) - ws_id = get_int(inputs.pop(_WSID, None), _WSID) - ws_name = inputs.pop(_WSNAME, None) - if (bool(ws_id) == bool(ws_name)): # xnor - raise ValueError(f"Exactly one of a '{_WSID}' or a '{_WSNAME}' parameter must be provided") - if not ws_id: - print(f"Translating workspace name {ws_name} to a workspace ID. 
Prefer submitting " - + "a workspace ID over a mutable workspace name that may cause race conditions") - ws_id = self.dfu.ws_name_to_id(ws_name) - mass_params = {_WSID: ws_id, _INPUTS: [inputs]} - return mass_params - - def _validate_mass_params(self, params): - ws_id = get_int(params.get(_WSID), _WSID) - if not ws_id: - raise ValueError(f"{_WSID} is required") - inputs = params.get(_INPUTS) - if not inputs or type(inputs) is not list: - raise ValueError(f"{_INPUTS} field is required and must be a non-empty list") - for i, inp in enumerate(inputs, start=1): - if type(inp) is not dict: - raise ValueError(f"Entry #{i} in {_INPUTS} field is not a mapping as required") - try: - self._validate_params(inp) - except Exception as e: - raise ValueError(f"Entry #{i} in {_INPUTS} field has invalid params: {e}") from e - def _import_genbank_mass(self, params): workspace_id = params[_WSID] diff --git a/lib/GenomeFileUtil/core/GenomeInterface.py b/lib/GenomeFileUtil/core/GenomeInterface.py index 321ae71b..33b0ff07 100644 --- a/lib/GenomeFileUtil/core/GenomeInterface.py +++ b/lib/GenomeFileUtil/core/GenomeInterface.py @@ -12,8 +12,10 @@ from installed_clients.AssemblySequenceAPIServiceClient import AssemblySequenceAPI from installed_clients.DataFileUtilClient import DataFileUtil from installed_clients.WSLargeDataIOClient import WsLargeDataIO -from GenomeFileUtil.core import GenomeUtils -from GenomeFileUtil.core.MiscUtils import get_int +from GenomeFileUtil.core.GenomeUtils import ( + set_taxon_data, set_default_taxon_data, sort_dict, + set_up_single_params, validate_mass_params +) MAX_GENOME_SIZE = 2**30 @@ -38,42 +40,16 @@ def __init__(self, config): self.ws_large_data = WsLargeDataIO(self.callback_url) def save_one_genome(self, params): - mass_params = self._set_up_single_params(params) + mass_params = set_up_single_params( + params, _WS, self._validate_genome_input_params, self.dfu.ws_name_to_id + ) return self._save_genome_mass(mass_params)[0] + # NOTE If there is more than 
1GB of data or more than 10,000 genomes to upload, the workspace will fail. def save_genome_mass(self, params): - self._validate_mass_params(params) + validate_mass_params(params, self._validate_genome_input_params) return self._save_genome_mass(params) - def _set_up_single_params(self, params): - inputs = dict(params) - self._validate_genome_input_params(inputs) - ws_id = get_int(inputs.pop(_WSID, None), _WSID) - ws_name = inputs.pop(_WS, None) - if bool(ws_id) == bool(ws_name): # xnor - raise ValueError(f"Exactly one of a '{_WSID}' or a '{_WS}' parameter must be provided") - if not ws_id: - print(f"Translating workspace name {ws_name} to a workspace ID. Prefer submitting " - + "a workspace ID over a mutable workspace name that may cause race conditions") - ws_id = self.dfu.ws_name_to_id(ws_name) - mass_params = {_WSID: ws_id, _INPUTS: [inputs]} - return mass_params - - def _validate_mass_params(self, params): - ws_id = get_int(params.get(_WSID), _WSID) - if not ws_id: - raise ValueError(f"{_WSID} is required") - inputs = params.get(_INPUTS) - if not inputs or type(inputs) != list: - raise ValueError(f"{_INPUTS} field is required and must be a non-empty list") - for i, inp in enumerate(inputs, start=1): - if type(inp) != dict: - raise ValueError(f"Entry #{i} in {_INPUTS} field is not a mapping as required") - try: - self._validate_genome_input_params(inp) - except Exception as e: - raise ValueError(f"Entry #{i} in {_INPUTS} field has invalid params: {e}") from e - def _validate_genome_input_params(self, genome_input): """ Check required parameters are in genome_input @@ -236,7 +212,7 @@ def _save_genome_mass(self, params): data['warnings'] = self.validate_genome(data) # sort data - data = GenomeUtils.sort_dict(data) + data = sort_dict(data) # dump genome to scratch for upload data_path = os.path.join(self.scratch, name + ".json") json.dump(data, open(data_path, 'w')) @@ -311,9 +287,9 @@ def _update_genome(self, genome): # NOTE: Metagenome object does not have a 
'taxon_assignments' field if 'taxon_assignments' in genome and genome['taxon_assignments'].get('ncbi'): tax_id = int(genome['taxon_assignments']['ncbi']) - GenomeUtils.set_taxon_data(tax_id, self.re_api_url, genome) + set_taxon_data(tax_id, self.re_api_url, genome) else: - GenomeUtils.set_default_taxon_data(genome) + set_default_taxon_data(genome) if any([x not in genome for x in ('dna_size', 'md5', 'gc_content', 'num_contigs')]): if 'assembly_ref' in genome: diff --git a/lib/GenomeFileUtil/core/GenomeUtils.py b/lib/GenomeFileUtil/core/GenomeUtils.py index 0ffeae29..d1b63935 100644 --- a/lib/GenomeFileUtil/core/GenomeUtils.py +++ b/lib/GenomeFileUtil/core/GenomeUtils.py @@ -3,13 +3,18 @@ import os import re import time +from typing import Callable, Dict, Any from relation_engine_client import REClient from relation_engine_client.exceptions import RENotFound +from GenomeFileUtil.core.MiscUtils import get_int # Name of the ncbi taxonomy namespace stored in "taxon_assignments" _NCBI_TAX = 'ncbi' +_WSID = 'workspace_id' +_INPUTS = 'inputs' + warnings = { "cds_excluded": "SUSPECT: CDS from {} was excluded because the associated " "CDS failed coordinates validation", @@ -482,4 +487,90 @@ def set_taxon_data(tax_id, re_api_url, genome_dict): ) # Assign the scientific name to the most specific (right-most) taxon in the lineage genome_dict['scientific_name'] = sciname - \ No newline at end of file + + +def set_up_single_params( + params: Dict[str, Any], + ws: str, + validate_params_func: Callable[[Dict[str, Any]], None], + ws_name_to_id_func: Callable[[str], int] +) -> Dict[str, Any]: + """ + Sets up parameters by validating them and ensuring that exactly one of workspace ID or name is provided. + + Args: + params (Dict[str, Any]): A dictionary where the keys are parameter names (strings) and the values + can be of any type. + ws (str): A string representing the key for the workspace name or identifier. 
+ validate_params_func (Callable[[Dict[str, Any]], None]): A function that takes a dictionary of parameters + and validates them. This function should raise an exception if the parameters are invalid. + ws_name_to_id_func (Callable[[str], int]): A function that takes a workspace name (string) and returns + its corresponding ID (integer). + + Returns: + Dict[str, Any]: A dictionary containing the workspace ID and the processed parameters. The dictionary + has keys '_WSID' and '_INPUTS', where '_WSID' is the workspace ID and '_INPUTS' is a list containing + the input parameters. + + Raises: + ValueError: If neither or both the workspace ID and workspace name are provided in the parameters. + KeyError: If the workspace ID or name is missing or invalid. + + Notes: + - If a workspace ID is not provided, the function will attempt to convert the workspace name to an ID + using `ws_name_to_id_func`. + - It is preferable to provide a workspace ID directly to avoid potential race conditions with mutable + workspace names. + """ + inputs = dict(params) + validate_params_func(inputs) + ws_id = get_int(inputs.pop(_WSID, None), _WSID) + ws_name = inputs.pop(ws, None) + if bool(ws_id) == bool(ws_name): # xnor + raise ValueError(f"Exactly one of a '{_WSID}' or a '{ws}' parameter must be provided") + if not ws_id: + print(f"Translating workspace name {ws_name} to a workspace ID. Prefer submitting " + + "a workspace ID over a mutable workspace name that may cause race conditions") + ws_id = ws_name_to_id_func(ws_name) + mass_params = {_WSID: ws_id, _INPUTS: [inputs]} + return mass_params + + +def validate_mass_params( + params: Dict[str, Any], + validate_params_func: Callable[[Dict[str, Any]], None] +) -> None: + """ + Validates the provided parameters according to specific rules. + + Args: + params (Dict[str, Any]): A dictionary containing parameters to validate. Must include: + - _WSID: A workspace ID, which must be present and valid. 
+ - _INPUTS: A list of parameter dictionaries, each of which must be validated by `validate_params_func`. + + validate_params_func (Callable[[Dict[str, Any]], None]): A function that takes a dictionary of parameters + and validates it. The function should raise an exception if the parameters are invalid. + + Raises: + ValueError: If `_WSID` is missing or invalid, if `_INPUTS` is missing or not a non-empty list, or if any + entry in `_INPUTS` is not a dictionary or fails validation. + + Notes: + - The function checks that `_WSID` is present and converts it to an integer using `get_int`. + - The `_INPUTS` field must be a non-empty list of dictionaries. Each dictionary in the list is validated + using `validate_params_func`. + - If any validation fails, a `ValueError` is raised with a message indicating the issue and entry index. + """ + ws_id = get_int(params.get(_WSID), _WSID) + if not ws_id: + raise ValueError(f"{_WSID} is required") + inputs = params.get(_INPUTS) + if not inputs or type(inputs) != list: + raise ValueError(f"{_INPUTS} field is required and must be a non-empty list") + for i, inp in enumerate(inputs, start=1): + if type(inp) != dict: + raise ValueError(f"Entry #{i} in {_INPUTS} field is not a mapping as required") + try: + validate_params_func(inp) + except Exception as e: + raise ValueError(f"Entry #{i} in {_INPUTS} field has invalid params: {e}") from e \ No newline at end of file diff --git a/test/problematic_tests/save_genome_test.py b/test/problematic_tests/save_genome_test.py index 93b2ea31..e09547bb 100644 --- a/test/problematic_tests/save_genome_test.py +++ b/test/problematic_tests/save_genome_test.py @@ -234,42 +234,42 @@ def test_genomes_with_hidden(self): ret = self.genome_interface.save_genome_mass(params)[0] self.check_save_one_genome_output(ret, genome_name) - def test_bad_genomes_params_missing_wsid(self): - self.start_test() - invalidate_params = { - 'missing_workspace_id': 'workspace_id', - 'name': 'name', - 'data': 'data', - } - 
error_msg = "workspace_id is required" - self.fail_save_genome(invalidate_params, error_msg, mass=True) - - def test_bad_genomes_params_empty_inputs(self): - self.start_test() - invalidate_params = { - 'workspace_id': self.wsID, - 'inputs': [] - } - error_msg = "inputs field is required and must be a non-empty list" - self.fail_save_genome(invalidate_params, error_msg, mass=True) - - def test_bad_genomes_params_invalidate_entry_type(self): - self.start_test() - invalidate_params = { - 'workspace_id': self.wsID, - 'inputs': [['name', 'data']], - } - error_msg = "Entry #1 in inputs field is not a mapping as required" - self.fail_save_genome(invalidate_params, error_msg, mass=True) - - def test_bad_genomes_params_missing_parameter(self): - self.start_test() - invalidate_params = { - 'workspace_id': self.wsID, - 'inputs': [{'data': 'data'}], - } - error_msg = "Entry #1 in inputs field has invalid params: name parameter is required, but missing" - self.fail_save_genome(invalidate_params, error_msg, mass=True) + # def test_bad_genomes_params_missing_wsid(self): + # self.start_test() + # invalidate_params = { + # 'missing_workspace_id': 'workspace_id', + # 'name': 'name', + # 'data': 'data', + # } + # error_msg = "workspace_id is required" + # self.fail_save_genome(invalidate_params, error_msg, mass=True) + + # def test_bad_genomes_params_empty_inputs(self): + # self.start_test() + # invalidate_params = { + # 'workspace_id': self.wsID, + # 'inputs': [] + # } + # error_msg = "inputs field is required and must be a non-empty list" + # self.fail_save_genome(invalidate_params, error_msg, mass=True) + + # def test_bad_genomes_params_invalidate_entry_type(self): + # self.start_test() + # invalidate_params = { + # 'workspace_id': self.wsID, + # 'inputs': [['name', 'data']], + # } + # error_msg = "Entry #1 in inputs field is not a mapping as required" + # self.fail_save_genome(invalidate_params, error_msg, mass=True) + + # def test_bad_genomes_params_missing_parameter(self): + # 
self.start_test() + # invalidate_params = { + # 'workspace_id': self.wsID, + # 'inputs': [{'data': 'data'}], + # } + # error_msg = "Entry #1 in inputs field has invalid params: name parameter is required, but missing" + # self.fail_save_genome(invalidate_params, error_msg, mass=True) def test_GenomeInterface_check_dna_sequence_in_features(self): # no feature in genome From 4b1b88bc9e389ba6a8fd9a21b5a0903abb2f6578 Mon Sep 17 00:00:00 2001 From: Sijie Date: Wed, 28 Aug 2024 16:21:33 -0700 Subject: [PATCH 10/24] remove redundant tests --- lib/GenomeFileUtil/core/GenomeInterface.py | 15 +++----- test/problematic_tests/save_genome_test.py | 44 ++++------------------ 2 files changed, 14 insertions(+), 45 deletions(-) diff --git a/lib/GenomeFileUtil/core/GenomeInterface.py b/lib/GenomeFileUtil/core/GenomeInterface.py index 33b0ff07..7f77218e 100644 --- a/lib/GenomeFileUtil/core/GenomeInterface.py +++ b/lib/GenomeFileUtil/core/GenomeInterface.py @@ -12,10 +12,7 @@ from installed_clients.AssemblySequenceAPIServiceClient import AssemblySequenceAPI from installed_clients.DataFileUtilClient import DataFileUtil from installed_clients.WSLargeDataIOClient import WsLargeDataIO -from GenomeFileUtil.core.GenomeUtils import ( - set_taxon_data, set_default_taxon_data, sort_dict, - set_up_single_params, validate_mass_params -) +from GenomeFileUtil.core import GenomeUtils MAX_GENOME_SIZE = 2**30 @@ -40,14 +37,14 @@ def __init__(self, config): self.ws_large_data = WsLargeDataIO(self.callback_url) def save_one_genome(self, params): - mass_params = set_up_single_params( + mass_params = GenomeUtils.set_up_single_params( params, _WS, self._validate_genome_input_params, self.dfu.ws_name_to_id ) return self._save_genome_mass(mass_params)[0] # NOTE If there is more than 1GB of data or more than 10,000 genomes to upload, the workspace will fail. 
def save_genome_mass(self, params): - validate_mass_params(params, self._validate_genome_input_params) + GenomeUtils.validate_mass_params(params, self._validate_genome_input_params) return self._save_genome_mass(params) def _validate_genome_input_params(self, genome_input): @@ -212,7 +209,7 @@ def _save_genome_mass(self, params): data['warnings'] = self.validate_genome(data) # sort data - data = sort_dict(data) + data = GenomeUtils.sort_dict(data) # dump genome to scratch for upload data_path = os.path.join(self.scratch, name + ".json") json.dump(data, open(data_path, 'w')) @@ -287,9 +284,9 @@ def _update_genome(self, genome): # NOTE: Metagenome object does not have a 'taxon_assignments' field if 'taxon_assignments' in genome and genome['taxon_assignments'].get('ncbi'): tax_id = int(genome['taxon_assignments']['ncbi']) - set_taxon_data(tax_id, self.re_api_url, genome) + GenomeUtils.set_taxon_data(tax_id, self.re_api_url, genome) else: - set_default_taxon_data(genome) + GenomeUtils.set_default_taxon_data(genome) if any([x not in genome for x in ('dna_size', 'md5', 'gc_content', 'num_contigs')]): if 'assembly_ref' in genome: diff --git a/test/problematic_tests/save_genome_test.py b/test/problematic_tests/save_genome_test.py index e09547bb..c3c0e017 100644 --- a/test/problematic_tests/save_genome_test.py +++ b/test/problematic_tests/save_genome_test.py @@ -234,42 +234,14 @@ def test_genomes_with_hidden(self): ret = self.genome_interface.save_genome_mass(params)[0] self.check_save_one_genome_output(ret, genome_name) - # def test_bad_genomes_params_missing_wsid(self): - # self.start_test() - # invalidate_params = { - # 'missing_workspace_id': 'workspace_id', - # 'name': 'name', - # 'data': 'data', - # } - # error_msg = "workspace_id is required" - # self.fail_save_genome(invalidate_params, error_msg, mass=True) - - # def test_bad_genomes_params_empty_inputs(self): - # self.start_test() - # invalidate_params = { - # 'workspace_id': self.wsID, - # 'inputs': [] - # } - # 
error_msg = "inputs field is required and must be a non-empty list" - # self.fail_save_genome(invalidate_params, error_msg, mass=True) - - # def test_bad_genomes_params_invalidate_entry_type(self): - # self.start_test() - # invalidate_params = { - # 'workspace_id': self.wsID, - # 'inputs': [['name', 'data']], - # } - # error_msg = "Entry #1 in inputs field is not a mapping as required" - # self.fail_save_genome(invalidate_params, error_msg, mass=True) - - # def test_bad_genomes_params_missing_parameter(self): - # self.start_test() - # invalidate_params = { - # 'workspace_id': self.wsID, - # 'inputs': [{'data': 'data'}], - # } - # error_msg = "Entry #1 in inputs field has invalid params: name parameter is required, but missing" - # self.fail_save_genome(invalidate_params, error_msg, mass=True) + def test_bad_genomes_params_missing_parameter(self): + self.start_test() + invalidate_params = { + 'workspace_id': self.wsID, + 'inputs': [{'data': 'data'}], + } + error_msg = "Entry #1 in inputs field has invalid params: name parameter is required, but missing" + self.fail_save_genome(invalidate_params, error_msg, mass=True) def test_GenomeInterface_check_dna_sequence_in_features(self): # no feature in genome From b56d77fb4b212bb2102e2e4a62371c751ba7edf2 Mon Sep 17 00:00:00 2001 From: Sijie Date: Tue, 3 Sep 2024 14:28:38 -0700 Subject: [PATCH 11/24] add test to cover the missing line --- test/problematic_tests/save_genome_test.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/test/problematic_tests/save_genome_test.py b/test/problematic_tests/save_genome_test.py index c3c0e017..29d1754d 100644 --- a/test/problematic_tests/save_genome_test.py +++ b/test/problematic_tests/save_genome_test.py @@ -209,6 +209,21 @@ def test_genomes(self): ret = self.genome_interface.save_genome_mass(params)[0] self.check_save_one_genome_output(ret, genome_name) + def test_genomes_with_upgrade(self): + self.start_test() + genome_name = 'test_genome' + inputs = [ + { + 'name': 
genome_name, + 'data': self.test_genome_data, + 'ws_datatype': "KBaseMetagenomes.AnnotatedMetagenomeAssembly", + 'upgrade': True, + } + ] + params = {'workspace_id': self.wsID, 'inputs': inputs} + ret = self.genome_interface.save_genome_mass(params)[0] + self.check_save_one_genome_output(ret, genome_name) + def test_genomes_with_hidden(self): self.start_test() genome_name = 'test_genome_hidden' From 788c7ee524ee3e355c41ab89ac50093b8509ced0 Mon Sep 17 00:00:00 2001 From: Sijie Date: Tue, 3 Sep 2024 15:19:14 -0700 Subject: [PATCH 12/24] fix params name typo --- test/problematic_tests/save_genome_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/problematic_tests/save_genome_test.py b/test/problematic_tests/save_genome_test.py index 29d1754d..c453bc87 100644 --- a/test/problematic_tests/save_genome_test.py +++ b/test/problematic_tests/save_genome_test.py @@ -216,7 +216,7 @@ def test_genomes_with_upgrade(self): { 'name': genome_name, 'data': self.test_genome_data, - 'ws_datatype': "KBaseMetagenomes.AnnotatedMetagenomeAssembly", + 'workspace_datatype': "KBaseMetagenomes.AnnotatedMetagenomeAssembly", 'upgrade': True, } ] From 65467ea89c918b98d22bf3b55cb4578de0e6b634 Mon Sep 17 00:00:00 2001 From: Sijie Date: Thu, 5 Sep 2024 01:00:58 -0700 Subject: [PATCH 13/24] add metagenome json file && cover the missing line --- lib/GenomeFileUtil/core/GenomeInterface.py | 1 + test/data/metagenomes/toy/metagenome.json | 53 ++++++++++++++++++++++ test/problematic_tests/save_genome_test.py | 50 ++++++++++++++------ 3 files changed, 90 insertions(+), 14 deletions(-) create mode 100644 test/data/metagenomes/toy/metagenome.json diff --git a/lib/GenomeFileUtil/core/GenomeInterface.py b/lib/GenomeFileUtil/core/GenomeInterface.py index 7f77218e..07538bda 100644 --- a/lib/GenomeFileUtil/core/GenomeInterface.py +++ b/lib/GenomeFileUtil/core/GenomeInterface.py @@ -267,6 +267,7 @@ def _update_metagenome(self, genome): """Checks for missing required fields and fixes 
breaking changes""" if 'molecule_type' not in genome: genome['molecule_type'] = 'Unknown' + return genome def _update_genome(self, genome): """Checks for missing required fields and fixes breaking changes""" diff --git a/test/data/metagenomes/toy/metagenome.json b/test/data/metagenomes/toy/metagenome.json new file mode 100644 index 00000000..a3c8e5ba --- /dev/null +++ b/test/data/metagenomes/toy/metagenome.json @@ -0,0 +1,53 @@ +{ + "contig_ids": [ + "Ga0065724_100001" + ], + "contig_lengths": [ + 538871 + ], + "dna_size": 538871, + "domain": "Eukaryota", + "environment": null, + "external_source_origination_date": null, + "feature_counts": { + "CDS": 20, + "gene": 20, + "non_coding_features": 0, + "protein_encoding_gene": 20 + }, + "features_handle_ref": "KBH_736245", + "gc_content": 0.64469, + "genetic_code": 1, + "genome_type": "Metagenome", + "gff_handle_ref": "KBH_736244", + "id": "MyMetagenome", + "md5": "e2ccbd5a9bed0148015bd6b784e3c1c3", + "molecule_type": "SingleLetterAlphabet", + "notes": null, + "num_contigs": 1, + "num_features": 40, + "ontologies_present": {}, + "ontology_events": [ + { + "id": "GO", + "method": "GenomeFileUtils Genbank uploader from annotations", + "method_version": "0.11.7", + "ontology_ref": "KBaseOntology/gene_ontology", + "timestamp": "2024_09_05_06_45_31" + } + ], + "original_source_file_name": null, + "protein_handle_ref": "KBH_736243", + "publications": [], + "scientific_name": "Arabidopsis thaliana", + "source": "GFF", + "source_id": "unknown", + "suspect": 1, + "taxon_assignments": { + "ncbi": "3702" + }, + "taxonomy": "cellular organisms; Eukaryota; Viridiplantae; Streptophyta; Streptophytina; Embryophyta; Tracheophyta; Euphyllophyta; Spermatophyta; Magnoliopsida; Mesangiospermae; eudicotyledons; Gunneridae; Pentapetalae; rosids; malvids; Brassicales; Brassicaceae; Camelineae; Arabidopsis", + "warnings": [ + "SUSPECT: This genome has 20 genes that needed to be spoofed for existing parentless CDS." 
+ ] +} \ No newline at end of file diff --git a/test/problematic_tests/save_genome_test.py b/test/problematic_tests/save_genome_test.py index c453bc87..5380738c 100644 --- a/test/problematic_tests/save_genome_test.py +++ b/test/problematic_tests/save_genome_test.py @@ -23,6 +23,9 @@ from GenomeFileUtil.core.GenomeInterface import GenomeInterface from installed_clients.WorkspaceClient import Workspace as workspaceService +KBASE_GENOME = "KBaseGenomes.Genome" +KBASE_METAGENOME = "KBaseMetagenomes.AnnotatedMetagenomeAssembly" + class SaveGenomeTest(unittest.TestCase): @@ -115,17 +118,36 @@ def delete_shock_node(cls, node_id): @classmethod def prepare_data(cls): - assembly_file_path = os.path.join(cls.scratch, - 'e_coli_assembly.fasta') + + assembly_file_path = os.path.join(cls.scratch,'e_coli_assembly.fasta') + meta_file_path = os.path.join(cls.scratch,'metagenome.fa') + shutil.copy('data/e_coli/e_coli_assembly.fasta', assembly_file_path) + shutil.copy('data/metagenomes/toy/metagenome.fa', meta_file_path) + au = AssemblyUtil(os.environ['SDK_CALLBACK_URL']) - assembly_ref = au.save_assembly_from_fasta({ - 'workspace_name': cls.wsName, - 'assembly_name': 'e_coli.assembly', - 'file': {'path': assembly_file_path} - }) + + assembly_refs = au.save_assemblies_from_fastas( + { + 'workspace_id': cls.wsID, + 'inputs': [ + { + 'assembly_name': 'e_coli.assembly', + 'file': assembly_file_path + }, + { + 'assembly_name': 'metagenome.assembly', + 'file': meta_file_path + } + ] + } + )["results"] + cls.test_genome_data = json.load(open('data/e_coli/e_coli.json')) - cls.test_genome_data['assembly_ref'] = assembly_ref + cls.test_genome_data['assembly_ref'] = assembly_refs[0]["upa"] + + cls.test_metagenome_data = json.load(open('data/metagenomes/toy/metagenome.json')) + cls.test_metagenome_data['assembly_ref'] = assembly_refs[1]["upa"] def getWsClient(self): return self.__class__.wsClient @@ -154,12 +176,12 @@ def fail_save_genome(self, params, error, exception=ValueError, 
contains=False, else: self.assertEqual(error, str(context.exception)) - def check_save_one_genome_output(self, ret, genome_name): + def check_save_one_genome_output(self, ret, genome_name, data_type=KBASE_GENOME): self.assertTrue('info' in ret) genome_info = ret['info'] self.assertEqual(genome_info[1], genome_name) - self.assertEqual(genome_info[2].split('-')[0], 'KBaseGenomes.Genome') + self.assertEqual(genome_info[2].split('-')[0], data_type) self.assertEqual(genome_info[5], self.user_id) def test_bad_one_genome_params(self): @@ -211,18 +233,18 @@ def test_genomes(self): def test_genomes_with_upgrade(self): self.start_test() - genome_name = 'test_genome' + genome_name = 'MyMetagenome' inputs = [ { 'name': genome_name, - 'data': self.test_genome_data, - 'workspace_datatype': "KBaseMetagenomes.AnnotatedMetagenomeAssembly", + 'data': self.test_metagenome_data, + 'workspace_datatype': KBASE_METAGENOME, 'upgrade': True, } ] params = {'workspace_id': self.wsID, 'inputs': inputs} ret = self.genome_interface.save_genome_mass(params)[0] - self.check_save_one_genome_output(ret, genome_name) + self.check_save_one_genome_output(ret, genome_name, data_type=KBASE_METAGENOME) def test_genomes_with_hidden(self): self.start_test() From 66af0b756a8c19fe17cca9afdcc024fe3f56bef5 Mon Sep 17 00:00:00 2001 From: Sijie Date: Thu, 5 Sep 2024 08:13:12 -0700 Subject: [PATCH 14/24] rm gff_handle_ref --- test/data/metagenomes/toy/metagenome.json | 1 - 1 file changed, 1 deletion(-) diff --git a/test/data/metagenomes/toy/metagenome.json b/test/data/metagenomes/toy/metagenome.json index a3c8e5ba..57e42815 100644 --- a/test/data/metagenomes/toy/metagenome.json +++ b/test/data/metagenomes/toy/metagenome.json @@ -19,7 +19,6 @@ "gc_content": 0.64469, "genetic_code": 1, "genome_type": "Metagenome", - "gff_handle_ref": "KBH_736244", "id": "MyMetagenome", "md5": "e2ccbd5a9bed0148015bd6b784e3c1c3", "molecule_type": "SingleLetterAlphabet", From fcbb510713e34b932c216082ebc04d3d4d1e0118 Mon Sep 17 
00:00:00 2001 From: Sijie Date: Mon, 9 Sep 2024 17:37:22 -0700 Subject: [PATCH 15/24] add features_handle_ref && protein_handle_ref before upload --- test/data/metagenomes/toy/features_handle_ref | 1 + test/data/metagenomes/toy/metagenome.json | 2 -- test/data/metagenomes/toy/protein_handle_ref | 1 + test/problematic_tests/save_genome_test.py | 23 +++++++++++++++++++ 4 files changed, 25 insertions(+), 2 deletions(-) create mode 100644 test/data/metagenomes/toy/features_handle_ref create mode 100644 test/data/metagenomes/toy/protein_handle_ref diff --git a/test/data/metagenomes/toy/features_handle_ref b/test/data/metagenomes/toy/features_handle_ref new file mode 100644 index 00000000..f67a087d --- /dev/null +++ b/test/data/metagenomes/toy/features_handle_ref @@ -0,0 +1 @@ +test features_handle_ref \ No newline at end of file diff --git a/test/data/metagenomes/toy/metagenome.json b/test/data/metagenomes/toy/metagenome.json index 57e42815..95c23ef8 100644 --- a/test/data/metagenomes/toy/metagenome.json +++ b/test/data/metagenomes/toy/metagenome.json @@ -15,7 +15,6 @@ "non_coding_features": 0, "protein_encoding_gene": 20 }, - "features_handle_ref": "KBH_736245", "gc_content": 0.64469, "genetic_code": 1, "genome_type": "Metagenome", @@ -36,7 +35,6 @@ } ], "original_source_file_name": null, - "protein_handle_ref": "KBH_736243", "publications": [], "scientific_name": "Arabidopsis thaliana", "source": "GFF", diff --git a/test/data/metagenomes/toy/protein_handle_ref b/test/data/metagenomes/toy/protein_handle_ref new file mode 100644 index 00000000..4e4a8e1f --- /dev/null +++ b/test/data/metagenomes/toy/protein_handle_ref @@ -0,0 +1 @@ +test protein_handle_ref \ No newline at end of file diff --git a/test/problematic_tests/save_genome_test.py b/test/problematic_tests/save_genome_test.py index 5380738c..70eff5c1 100644 --- a/test/problematic_tests/save_genome_test.py +++ b/test/problematic_tests/save_genome_test.py @@ -149,6 +149,29 @@ def prepare_data(cls): 
cls.test_metagenome_data = json.load(open('data/metagenomes/toy/metagenome.json')) cls.test_metagenome_data['assembly_ref'] = assembly_refs[1]["upa"] + # Move files to the share folder + fhr_path = os.path.join(cls.scratch,'features_handle_ref') + phr_path = os.path.join(cls.scratch,'protein_handle_ref') + + shutil.copy('data/metagenomes/toy/features_handle_ref', fhr_path) + shutil.copy('data/metagenomes/toy/protein_handle_ref', phr_path) + + # Upload files to the blobstore + handle_service_outputs = cls.dfu.file_to_shock_mass( + [ + {'file_path': fhr_path, 'make_handle': 1, 'pack': 'gzip'}, + {'file_path': phr_path, 'make_handle': 1, 'pack': 'gzip'} + ] + ) + + # Update metagenome + cls.test_metagenome_data["features_handle_ref"] = handle_service_outputs[0]["handle"]["hid"] + cls.test_metagenome_data["protein_handle_ref"]= handle_service_outputs[1]["handle"]["hid"] + + # Delete shock_ids + cls.nodes_to_delete.append(handle_service_outputs[0]["shock_id"]) + cls.nodes_to_delete.append(handle_service_outputs[1]["shock_id"]) + def getWsClient(self): return self.__class__.wsClient From 32f96a04a98f333e79d315b0b302b75d9eab98c7 Mon Sep 17 00:00:00 2001 From: Sijie Date: Tue, 10 Sep 2024 16:06:14 -0700 Subject: [PATCH 16/24] add boolean flag for validate_genome --- lib/GenomeFileUtil/core/GenbankToGenome.py | 6 ++++++ lib/GenomeFileUtil/core/GenomeInterface.py | 17 +++++++---------- test/problematic_tests/save_genome_test.py | 4 ++-- 3 files changed, 15 insertions(+), 12 deletions(-) diff --git a/lib/GenomeFileUtil/core/GenbankToGenome.py b/lib/GenomeFileUtil/core/GenbankToGenome.py index ffdd55c3..d5e42b55 100644 --- a/lib/GenomeFileUtil/core/GenbankToGenome.py +++ b/lib/GenomeFileUtil/core/GenbankToGenome.py @@ -162,6 +162,12 @@ def _import_genbank_mass(self, params): # parse genbank file self._parse_genbank(genome_obj) + # check features + self.gi.check_dna_sequence_in_features(genome_obj.genome_data) + + # validate genome + genome_obj.genome_data['warnings'] = 
self.gi.validate_genome(genome_obj.genome_data) + # gather all objects genome_objs.append(genome_obj) diff --git a/lib/GenomeFileUtil/core/GenomeInterface.py b/lib/GenomeFileUtil/core/GenomeInterface.py index 07538bda..cec2b3e3 100644 --- a/lib/GenomeFileUtil/core/GenomeInterface.py +++ b/lib/GenomeFileUtil/core/GenomeInterface.py @@ -43,9 +43,9 @@ def save_one_genome(self, params): return self._save_genome_mass(mass_params)[0] # NOTE If there is more than 1GB of data or more than 10,000 genomes to upload, the workspace will fail. - def save_genome_mass(self, params): + def save_genome_mass(self, params, validate_genome=False): GenomeUtils.validate_mass_params(params, self._validate_genome_input_params) - return self._save_genome_mass(params) + return self._save_genome_mass(params, validate_genome=validate_genome) def _validate_genome_input_params(self, genome_input): """ @@ -128,11 +128,10 @@ def _own_handle(self, genome_data, handle_property): handle_id = dfu_shock['handle']['hid'] genome_data[handle_property] = handle_id - def _check_dna_sequence_in_features(self, genome): + def check_dna_sequence_in_features(self, genome): """ - _check_dna_sequence_in_features: check dna sequence in each feature + check_dna_sequence_in_features: check dna sequence in each feature """ - logging.info('start checking dna sequence in each feature') if 'features' in genome: features_to_work = {} @@ -166,7 +165,7 @@ def get_one_genome(self, params): return data, res['info'] # return self.dfu.get_objects(params)['data'][0] - def _save_genome_mass(self, params): + def _save_genome_mass(self, params, validate_genome=True): workspace_id = params[_WSID] inputs = params[_INPUTS] @@ -204,8 +203,8 @@ def _save_genome_mass(self, params): # check all handles point to shock nodes owned by calling user self._own_handle(data, 'genbank_handle_ref') self._own_handle(data, 'gff_handle_ref') - if "AnnotatedMetagenomeAssembly" not in ws_datatype: - self._check_dna_sequence_in_features(data) + if 
"AnnotatedMetagenomeAssembly" not in ws_datatype and validate_genome: + self.check_dna_sequence_in_features(data) data['warnings'] = self.validate_genome(data) # sort data @@ -406,8 +405,6 @@ def validate_genome(g): """ allowed_tiers = {'Representative', 'Reference', 'ExternalDB', 'User'} - - logging.info('Validating genome object contents') warnings = g.get('warnings', []) # TODO: Determine whether these checks make any sense for Metagenome diff --git a/test/problematic_tests/save_genome_test.py b/test/problematic_tests/save_genome_test.py index 70eff5c1..f45944ae 100644 --- a/test/problematic_tests/save_genome_test.py +++ b/test/problematic_tests/save_genome_test.py @@ -307,7 +307,7 @@ def test_GenomeInterface_check_dna_sequence_in_features(self): # no feature in genome genome = {'missing_features': 'features'} copied_genome = genome.copy() - self.genome_interface._check_dna_sequence_in_features(copied_genome) + self.genome_interface.check_dna_sequence_in_features(copied_genome) self.assertEqual(copied_genome, genome) # with contigs @@ -315,7 +315,7 @@ def test_GenomeInterface_check_dna_sequence_in_features(self): for feat in copied_genome['features']: if 'dna_sequence' in feat: del feat['dna_sequence'] - self.genome_interface._check_dna_sequence_in_features(copied_genome) + self.genome_interface.check_dna_sequence_in_features(copied_genome) feature_dna_sum = 0 for feature in copied_genome['features']: From 41eed0b037601d6b33775f6674fb9002eb62be9d Mon Sep 17 00:00:00 2001 From: Sijie Date: Tue, 10 Sep 2024 17:10:43 -0700 Subject: [PATCH 17/24] test validate_genome boolean flag --- test/problematic_tests/save_genome_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/problematic_tests/save_genome_test.py b/test/problematic_tests/save_genome_test.py index f45944ae..e13d415d 100644 --- a/test/problematic_tests/save_genome_test.py +++ b/test/problematic_tests/save_genome_test.py @@ -251,7 +251,7 @@ def test_genomes(self): } ] params = 
{'workspace_id': self.wsID, 'inputs': inputs} - ret = self.genome_interface.save_genome_mass(params)[0] + ret = self.genome_interface.save_genome_mass(params, validate_genome=True)[0] self.check_save_one_genome_output(ret, genome_name) def test_genomes_with_upgrade(self): From 71670bce3fd95445172e415c6167a0471cd697f6 Mon Sep 17 00:00:00 2001 From: Sijie Date: Wed, 11 Sep 2024 16:09:33 -0700 Subject: [PATCH 18/24] 1. add pydoc for save_genme_mass; 2. make the dicts in the _save_genome_mass loop; 3. make the note much more explicit --- lib/GenomeFileUtil/core/GenomeInterface.py | 97 ++++++++++++---------- 1 file changed, 51 insertions(+), 46 deletions(-) diff --git a/lib/GenomeFileUtil/core/GenomeInterface.py b/lib/GenomeFileUtil/core/GenomeInterface.py index cec2b3e3..8e74c53d 100644 --- a/lib/GenomeFileUtil/core/GenomeInterface.py +++ b/lib/GenomeFileUtil/core/GenomeInterface.py @@ -37,13 +37,48 @@ def __init__(self, config): self.ws_large_data = WsLargeDataIO(self.callback_url) def save_one_genome(self, params): + """ + Saves a single genome object to the workspace. + + This method prepares the parameters for saving a single genome and calls the + `_save_genome_mass` method to handle the actual saving process. It processes + the input parameters and performs necessary validation before saving the genome. + + Args: + params (dict): A dictionary containing the parameters for saving the genome. + Must include workspace and genome-specific information. + + Returns: + dict: The information about the saved genome object, including metadata. + The return value is derived from the `_save_genome_mass` method. + """ mass_params = GenomeUtils.set_up_single_params( params, _WS, self._validate_genome_input_params, self.dfu.ws_name_to_id ) return self._save_genome_mass(mass_params)[0] - # NOTE If there is more than 1GB of data or more than 10,000 genomes to upload, the workspace will fail. 
def save_genome_mass(self, params, validate_genome=False): + """ + Saves multiple genome objects to the workspace. + + This method handles the saving of multiple genome objects in bulk. It validates + the parameters, processes each genome individually, and performs necessary + updates or validations before saving. If requested, it will also validate the + genomes before saving. + + # NOTE If there is more than 1GB of data or more than 10,000 genomes to upload, + # the workspace will fail. + + Args: + params (dict): A dictionary containing the parameters for saving the genomes. + Should include workspace ID and a list of genome inputs with their data. + validate_genome (bool, optional): A flag indicating whether to validate the + genomes before saving. Defaults to False. + + Returns: + list: A list of dictionaries, each containing information about a saved + genome object and any warnings encountered during the saving process. + """ GenomeUtils.validate_mass_params(params, self._validate_genome_input_params) return self._save_genome_mass(params, validate_genome=validate_genome) @@ -56,33 +91,6 @@ def _validate_genome_input_params(self, genome_input): if p not in genome_input: raise ValueError(f"{p} parameter is required, but missing") - def _save_genome_objects( - self, - workspace_id, - ws_datatypes, - data_paths, - names, - meta_data, - hidden_data, - ): - dfu_infos = self.ws_large_data.save_objects( - { - 'id': workspace_id, - 'objects': [ - { - 'type': ws_datatype, - 'data_json_file': data_path, - 'name': name, - 'meta': meta, - 'hidden': hidden, - } for ws_datatype, data_path, name, meta, hidden in zip( - ws_datatypes, data_paths, names, meta_data, hidden_data - ) - ] - } - ) - return dfu_infos - def _check_shock_response(self, response, errtxt): """ _check_shock_response: check shock node response (Copied from DataFileUtil) @@ -170,28 +178,28 @@ def _save_genome_mass(self, params, validate_genome=True): workspace_id = params[_WSID] inputs = params[_INPUTS] - 
ws_datatypes = [] - data_paths = [] - names = [] - meta_data = [] - hidden_data = [] + objects = [] warnings = [] for input_params in inputs: + obj = {} + # retrive required params name = input_params['name'] data = input_params['data'] # XXX there is no `workspace_datatype` param in the spec - # NOTE: The method caller should not be able to choose an arbitrary workspace type + # NOTE: This allows a user to specify any arbitrary workspace type which could cause, + # in the worst case, data corruption. It should be removed from the API + # (note it is not currently documented) so users cannot access it. ws_datatype = input_params.get('workspace_datatype', "KBaseGenomes.Genome") # XXX there is no `meta` param in the spec meta = input_params.get('meta', {}) - ws_datatypes.append(ws_datatype) - names.append(name) - meta_data.append(meta) + obj["type"] = ws_datatype + obj["name"] = name + obj["meta"] = meta if "AnnotatedMetagenomeAssembly" in ws_datatype: if input_params.get('upgrade') or 'feature_counts' not in data: @@ -217,17 +225,14 @@ def _save_genome_mass(self, params, validate_genome=True): else: hidden = 0 - data_paths.append(data_path) - hidden_data.append(hidden) + obj["data_json_file"] = data_path + obj["hidden"] = hidden + + objects.append(obj) warnings.append(data.get('warnings', [])) - dfu_infos = self._save_genome_objects( - workspace_id, - ws_datatypes, - data_paths, - names, - meta_data, - hidden_data, + dfu_infos = self.ws_large_data.save_objects( + {'id': workspace_id, 'objects': objects} ) output = [ From 0cbe1551b827ac860afcccaf02fa134208a45308 Mon Sep 17 00:00:00 2001 From: Sijie Date: Thu, 12 Sep 2024 17:22:54 -0700 Subject: [PATCH 19/24] update release notes && remove tiny files --- RELEASE_NOTES.md | 1 - test/data/metagenomes/toy/features_handle_ref | 1 - test/data/metagenomes/toy/protein_handle_ref | 1 - test/problematic_tests/save_genome_test.py | 19 ++++++++++++++++--- 4 files changed, 16 insertions(+), 6 deletions(-) delete mode 100644 
test/data/metagenomes/toy/features_handle_ref delete mode 100644 test/data/metagenomes/toy/protein_handle_ref diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md index c0e69fda..c020b8f0 100644 --- a/RELEASE_NOTES.md +++ b/RELEASE_NOTES.md @@ -6,7 +6,6 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [0.11.7] - TBD -- Genomes are now saved in batches to the workspace - Unusable `export_genome_features_protein_to_fasta` function was removed - The `genbanks_to_genomes` method was added to allow users to upload multiple genome objects at once diff --git a/test/data/metagenomes/toy/features_handle_ref b/test/data/metagenomes/toy/features_handle_ref deleted file mode 100644 index f67a087d..00000000 --- a/test/data/metagenomes/toy/features_handle_ref +++ /dev/null @@ -1 +0,0 @@ -test features_handle_ref \ No newline at end of file diff --git a/test/data/metagenomes/toy/protein_handle_ref b/test/data/metagenomes/toy/protein_handle_ref deleted file mode 100644 index 4e4a8e1f..00000000 --- a/test/data/metagenomes/toy/protein_handle_ref +++ /dev/null @@ -1 +0,0 @@ -test protein_handle_ref \ No newline at end of file diff --git a/test/problematic_tests/save_genome_test.py b/test/problematic_tests/save_genome_test.py index e13d415d..1c4644aa 100644 --- a/test/problematic_tests/save_genome_test.py +++ b/test/problematic_tests/save_genome_test.py @@ -4,6 +4,7 @@ import json # noqa: F401 import os # noqa: F401 import shutil +import tempfile import time import unittest import urllib.error @@ -149,12 +150,20 @@ def prepare_data(cls): cls.test_metagenome_data = json.load(open('data/metagenomes/toy/metagenome.json')) cls.test_metagenome_data['assembly_ref'] = assembly_refs[1]["upa"] - # Move files to the share folder + # Set taregt paths in the share folder fhr_path = os.path.join(cls.scratch,'features_handle_ref') phr_path = 
os.path.join(cls.scratch,'protein_handle_ref') - shutil.copy('data/metagenomes/toy/features_handle_ref', fhr_path) - shutil.copy('data/metagenomes/toy/protein_handle_ref', phr_path) + # Create temp files + with tempfile.NamedTemporaryFile(delete=False, mode='w', encoding='utf-8') as temp_features_file: + temp_features_file.write("test features_handle_ref") + + with tempfile.NamedTemporaryFile(delete=False, mode='w', encoding='utf-8') as temp_protein_file: + temp_protein_file.write("test protein_handle_ref") + + # Move files to the share folder + shutil.copy(temp_features_file.name, fhr_path) + shutil.copy(temp_protein_file.name, phr_path) # Upload files to the blobstore handle_service_outputs = cls.dfu.file_to_shock_mass( @@ -172,6 +181,10 @@ def prepare_data(cls): cls.nodes_to_delete.append(handle_service_outputs[0]["shock_id"]) cls.nodes_to_delete.append(handle_service_outputs[1]["shock_id"]) + # Remove temp files + os.remove(temp_features_file.name) + os.remove(temp_protein_file.name) + def getWsClient(self): return self.__class__.wsClient From 52e280d6b26116cb2d353f168cd54ff787297376 Mon Sep 17 00:00:00 2001 From: Sijie Date: Tue, 17 Sep 2024 15:22:36 -0700 Subject: [PATCH 20/24] add more info and warnings checks in test --- lib/GenomeFileUtil/core/GenomeInterface.py | 2 +- test/problematic_tests/save_genome_test.py | 40 ++++++++++++++++++---- 2 files changed, 34 insertions(+), 8 deletions(-) diff --git a/lib/GenomeFileUtil/core/GenomeInterface.py b/lib/GenomeFileUtil/core/GenomeInterface.py index 8e74c53d..1bcac050 100644 --- a/lib/GenomeFileUtil/core/GenomeInterface.py +++ b/lib/GenomeFileUtil/core/GenomeInterface.py @@ -187,7 +187,7 @@ def _save_genome_mass(self, params, validate_genome=True): # retrive required params name = input_params['name'] - data = input_params['data'] + data = dict(input_params['data']) # XXX there is no `workspace_datatype` param in the spec # NOTE: This allows a user to specify any arbitrary workspace type which could cause, diff 
--git a/test/problematic_tests/save_genome_test.py b/test/problematic_tests/save_genome_test.py index 1c4644aa..648a36a6 100644 --- a/test/problematic_tests/save_genome_test.py +++ b/test/problematic_tests/save_genome_test.py @@ -12,6 +12,7 @@ import urllib.request from configparser import ConfigParser from os import environ +from datetime import datetime import requests # noqa: F401 @@ -24,8 +25,17 @@ from GenomeFileUtil.core.GenomeInterface import GenomeInterface from installed_clients.WorkspaceClient import Workspace as workspaceService -KBASE_GENOME = "KBaseGenomes.Genome" -KBASE_METAGENOME = "KBaseMetagenomes.AnnotatedMetagenomeAssembly" +_KBASE_GENOME = "KBaseGenomes.Genome" +_KBASE_METAGENOME = "KBaseMetagenomes.AnnotatedMetagenomeAssembly" + +_GENOME_FILE_WARNINGS = [ + 'For prokaryotes, CDS array should generally be the same length as the Features array.', + 'Genome molecule_type Unknown is not expected for domain Bacteria.', + 'Unable to determine organism taxonomy' +] +_METAGENOME_FILE_WARNINGS = [ + 'SUSPECT: This genome has 20 genes that needed to be spoofed for existing parentless CDS.' 
+] class SaveGenomeTest(unittest.TestCase): @@ -212,13 +222,27 @@ def fail_save_genome(self, params, error, exception=ValueError, contains=False, else: self.assertEqual(error, str(context.exception)) - def check_save_one_genome_output(self, ret, genome_name, data_type=KBASE_GENOME): + def check_save_one_genome_output( + self, + ret, + genome_name, + data_type=_KBASE_GENOME, + warnings=_GENOME_FILE_WARNINGS + ): self.assertTrue('info' in ret) + self.assertTrue('warnings' in ret) + # Check info genome_info = ret['info'] self.assertEqual(genome_info[1], genome_name) self.assertEqual(genome_info[2].split('-')[0], data_type) + self.assertTrue(datetime.strptime(genome_info[3], '%Y-%m-%dT%H:%M:%S+%f')) self.assertEqual(genome_info[5], self.user_id) + self.assertEqual(genome_info[6], self.wsID) + self.assertEqual(genome_info[7], self.wsName) + + # Check warnings + self.assertEqual(ret['warnings'], warnings) def test_bad_one_genome_params(self): self.start_test() @@ -274,13 +298,15 @@ def test_genomes_with_upgrade(self): { 'name': genome_name, 'data': self.test_metagenome_data, - 'workspace_datatype': KBASE_METAGENOME, + 'workspace_datatype': _KBASE_METAGENOME, 'upgrade': True, } ] params = {'workspace_id': self.wsID, 'inputs': inputs} ret = self.genome_interface.save_genome_mass(params)[0] - self.check_save_one_genome_output(ret, genome_name, data_type=KBASE_METAGENOME) + self.check_save_one_genome_output( + ret, genome_name, data_type=_KBASE_METAGENOME, warnings=_METAGENOME_FILE_WARNINGS + ) def test_genomes_with_hidden(self): self.start_test() @@ -294,7 +320,7 @@ def test_genomes_with_hidden(self): ] params = {'workspace_id': self.wsID, 'inputs': inputs} ret = self.genome_interface.save_genome_mass(params)[0] - self.check_save_one_genome_output(ret, genome_name) + self.check_save_one_genome_output(ret, genome_name, warnings=[]) inputs = [ { @@ -305,7 +331,7 @@ def test_genomes_with_hidden(self): ] params = {'workspace_id': self.wsID, 'inputs': inputs} ret = 
self.genome_interface.save_genome_mass(params)[0] - self.check_save_one_genome_output(ret, genome_name) + self.check_save_one_genome_output(ret, genome_name, warnings=[]) def test_bad_genomes_params_missing_parameter(self): self.start_test() From f018f0e7ab09f39b535e54c84e8e5f090d6cee9d Mon Sep 17 00:00:00 2001 From: Sijie Date: Wed, 18 Sep 2024 21:40:19 -0700 Subject: [PATCH 21/24] remove metagenome from test --- lib/GenomeFileUtil/core/GenomeInterface.py | 11 +-- test/data/metagenomes/toy/metagenome.json | 50 ------------ test/problematic_tests/save_genome_test.py | 89 ++-------------------- 3 files changed, 7 insertions(+), 143 deletions(-) delete mode 100644 test/data/metagenomes/toy/metagenome.json diff --git a/lib/GenomeFileUtil/core/GenomeInterface.py b/lib/GenomeFileUtil/core/GenomeInterface.py index 1bcac050..0265dd6a 100644 --- a/lib/GenomeFileUtil/core/GenomeInterface.py +++ b/lib/GenomeFileUtil/core/GenomeInterface.py @@ -201,10 +201,7 @@ def _save_genome_mass(self, params, validate_genome=True): obj["name"] = name obj["meta"] = meta - if "AnnotatedMetagenomeAssembly" in ws_datatype: - if input_params.get('upgrade') or 'feature_counts' not in data: - data = self._update_metagenome(data) - else: + if "AnnotatedMetagenomeAssembly" not in ws_datatype: if input_params.get('upgrade') or 'feature_counts' not in data: data = self._update_genome(data) @@ -267,12 +264,6 @@ def determine_tier(source): return "Ensembl", ['Representative', 'ExternalDB'] return source, ['User'] - def _update_metagenome(self, genome): - """Checks for missing required fields and fixes breaking changes""" - if 'molecule_type' not in genome: - genome['molecule_type'] = 'Unknown' - return genome - def _update_genome(self, genome): """Checks for missing required fields and fixes breaking changes""" # do top level updates diff --git a/test/data/metagenomes/toy/metagenome.json b/test/data/metagenomes/toy/metagenome.json deleted file mode 100644 index 95c23ef8..00000000 --- 
a/test/data/metagenomes/toy/metagenome.json +++ /dev/null @@ -1,50 +0,0 @@ -{ - "contig_ids": [ - "Ga0065724_100001" - ], - "contig_lengths": [ - 538871 - ], - "dna_size": 538871, - "domain": "Eukaryota", - "environment": null, - "external_source_origination_date": null, - "feature_counts": { - "CDS": 20, - "gene": 20, - "non_coding_features": 0, - "protein_encoding_gene": 20 - }, - "gc_content": 0.64469, - "genetic_code": 1, - "genome_type": "Metagenome", - "id": "MyMetagenome", - "md5": "e2ccbd5a9bed0148015bd6b784e3c1c3", - "molecule_type": "SingleLetterAlphabet", - "notes": null, - "num_contigs": 1, - "num_features": 40, - "ontologies_present": {}, - "ontology_events": [ - { - "id": "GO", - "method": "GenomeFileUtils Genbank uploader from annotations", - "method_version": "0.11.7", - "ontology_ref": "KBaseOntology/gene_ontology", - "timestamp": "2024_09_05_06_45_31" - } - ], - "original_source_file_name": null, - "publications": [], - "scientific_name": "Arabidopsis thaliana", - "source": "GFF", - "source_id": "unknown", - "suspect": 1, - "taxon_assignments": { - "ncbi": "3702" - }, - "taxonomy": "cellular organisms; Eukaryota; Viridiplantae; Streptophyta; Streptophytina; Embryophyta; Tracheophyta; Euphyllophyta; Spermatophyta; Magnoliopsida; Mesangiospermae; eudicotyledons; Gunneridae; Pentapetalae; rosids; malvids; Brassicales; Brassicaceae; Camelineae; Arabidopsis", - "warnings": [ - "SUSPECT: This genome has 20 genes that needed to be spoofed for existing parentless CDS." 
- ] -} \ No newline at end of file diff --git a/test/problematic_tests/save_genome_test.py b/test/problematic_tests/save_genome_test.py index 648a36a6..a396c5ba 100644 --- a/test/problematic_tests/save_genome_test.py +++ b/test/problematic_tests/save_genome_test.py @@ -26,16 +26,11 @@ from installed_clients.WorkspaceClient import Workspace as workspaceService _KBASE_GENOME = "KBaseGenomes.Genome" -_KBASE_METAGENOME = "KBaseMetagenomes.AnnotatedMetagenomeAssembly" - _GENOME_FILE_WARNINGS = [ 'For prokaryotes, CDS array should generally be the same length as the Features array.', 'Genome molecule_type Unknown is not expected for domain Bacteria.', 'Unable to determine organism taxonomy' ] -_METAGENOME_FILE_WARNINGS = [ - 'SUSPECT: This genome has 20 genes that needed to be spoofed for existing parentless CDS.' -] class SaveGenomeTest(unittest.TestCase): @@ -129,71 +124,16 @@ def delete_shock_node(cls, node_id): @classmethod def prepare_data(cls): - assembly_file_path = os.path.join(cls.scratch,'e_coli_assembly.fasta') - meta_file_path = os.path.join(cls.scratch,'metagenome.fa') - shutil.copy('data/e_coli/e_coli_assembly.fasta', assembly_file_path) - shutil.copy('data/metagenomes/toy/metagenome.fa', meta_file_path) - au = AssemblyUtil(os.environ['SDK_CALLBACK_URL']) - - assembly_refs = au.save_assemblies_from_fastas( - { - 'workspace_id': cls.wsID, - 'inputs': [ - { - 'assembly_name': 'e_coli.assembly', - 'file': assembly_file_path - }, - { - 'assembly_name': 'metagenome.assembly', - 'file': meta_file_path - } - ] - } - )["results"] - + assembly_ref = au.save_assembly_from_fasta({ + 'workspace_name': cls.wsName, + 'assembly_name': 'e_coli.assembly', + 'file': {'path': assembly_file_path} + }) cls.test_genome_data = json.load(open('data/e_coli/e_coli.json')) - cls.test_genome_data['assembly_ref'] = assembly_refs[0]["upa"] - - cls.test_metagenome_data = json.load(open('data/metagenomes/toy/metagenome.json')) - cls.test_metagenome_data['assembly_ref'] = 
assembly_refs[1]["upa"] - - # Set taregt paths in the share folder - fhr_path = os.path.join(cls.scratch,'features_handle_ref') - phr_path = os.path.join(cls.scratch,'protein_handle_ref') - - # Create temp files - with tempfile.NamedTemporaryFile(delete=False, mode='w', encoding='utf-8') as temp_features_file: - temp_features_file.write("test features_handle_ref") - - with tempfile.NamedTemporaryFile(delete=False, mode='w', encoding='utf-8') as temp_protein_file: - temp_protein_file.write("test protein_handle_ref") - - # Move files to the share folder - shutil.copy(temp_features_file.name, fhr_path) - shutil.copy(temp_protein_file.name, phr_path) - - # Upload files to the blobstore - handle_service_outputs = cls.dfu.file_to_shock_mass( - [ - {'file_path': fhr_path, 'make_handle': 1, 'pack': 'gzip'}, - {'file_path': phr_path, 'make_handle': 1, 'pack': 'gzip'} - ] - ) - - # Update metagenome - cls.test_metagenome_data["features_handle_ref"] = handle_service_outputs[0]["handle"]["hid"] - cls.test_metagenome_data["protein_handle_ref"]= handle_service_outputs[1]["handle"]["hid"] - - # Delete shock_ids - cls.nodes_to_delete.append(handle_service_outputs[0]["shock_id"]) - cls.nodes_to_delete.append(handle_service_outputs[1]["shock_id"]) - - # Remove temp files - os.remove(temp_features_file.name) - os.remove(temp_protein_file.name) + cls.test_genome_data['assembly_ref'] = assembly_ref def getWsClient(self): return self.__class__.wsClient @@ -291,23 +231,6 @@ def test_genomes(self): ret = self.genome_interface.save_genome_mass(params, validate_genome=True)[0] self.check_save_one_genome_output(ret, genome_name) - def test_genomes_with_upgrade(self): - self.start_test() - genome_name = 'MyMetagenome' - inputs = [ - { - 'name': genome_name, - 'data': self.test_metagenome_data, - 'workspace_datatype': _KBASE_METAGENOME, - 'upgrade': True, - } - ] - params = {'workspace_id': self.wsID, 'inputs': inputs} - ret = self.genome_interface.save_genome_mass(params)[0] - 
self.check_save_one_genome_output( - ret, genome_name, data_type=_KBASE_METAGENOME, warnings=_METAGENOME_FILE_WARNINGS - ) - def test_genomes_with_hidden(self): self.start_test() genome_name = 'test_genome_hidden' From 54fedf0a3620a71458812bdfcd8b07267fdc34fa Mon Sep 17 00:00:00 2001 From: Sijie Date: Wed, 18 Sep 2024 21:42:51 -0700 Subject: [PATCH 22/24] remove unused lib --- test/problematic_tests/save_genome_test.py | 1 - 1 file changed, 1 deletion(-) diff --git a/test/problematic_tests/save_genome_test.py b/test/problematic_tests/save_genome_test.py index a396c5ba..c2e51b46 100644 --- a/test/problematic_tests/save_genome_test.py +++ b/test/problematic_tests/save_genome_test.py @@ -4,7 +4,6 @@ import json # noqa: F401 import os # noqa: F401 import shutil -import tempfile import time import unittest import urllib.error From 861a50b5ce2d0b8b481fe514b41a6d6b89ee0c96 Mon Sep 17 00:00:00 2001 From: Sijie Date: Wed, 30 Oct 2024 16:35:38 -0700 Subject: [PATCH 23/24] fix documentation --- lib/GenomeFileUtil/core/GenomeInterface.py | 6 ++---- lib/GenomeFileUtil/core/GenomeUtils.py | 19 +++++++++---------- 2 files changed, 11 insertions(+), 14 deletions(-) diff --git a/lib/GenomeFileUtil/core/GenomeInterface.py b/lib/GenomeFileUtil/core/GenomeInterface.py index 0265dd6a..e16b12e1 100644 --- a/lib/GenomeFileUtil/core/GenomeInterface.py +++ b/lib/GenomeFileUtil/core/GenomeInterface.py @@ -40,9 +40,8 @@ def save_one_genome(self, params): """ Saves a single genome object to the workspace. - This method prepares the parameters for saving a single genome and calls the - `_save_genome_mass` method to handle the actual saving process. It processes - the input parameters and performs necessary validation before saving the genome. + This method prepares and validates the necessary parameters for saving a genome. + It then executes the saving process and returns relevant information about the saved genome. 
Args: params (dict): A dictionary containing the parameters for saving the genome. @@ -50,7 +49,6 @@ def save_one_genome(self, params): Returns: dict: The information about the saved genome object, including metadata. - The return value is derived from the `_save_genome_mass` method. """ mass_params = GenomeUtils.set_up_single_params( params, _WS, self._validate_genome_input_params, self.dfu.ws_name_to_id diff --git a/lib/GenomeFileUtil/core/GenomeUtils.py b/lib/GenomeFileUtil/core/GenomeUtils.py index d1b63935..491f6d24 100644 --- a/lib/GenomeFileUtil/core/GenomeUtils.py +++ b/lib/GenomeFileUtil/core/GenomeUtils.py @@ -495,7 +495,7 @@ def set_up_single_params( validate_params_func: Callable[[Dict[str, Any]], None], ws_name_to_id_func: Callable[[str], int] ) -> Dict[str, Any]: - """ + f""" Sets up parameters by validating them and ensuring that exactly one of workspace ID or name is provided. Args: @@ -509,12 +509,11 @@ def set_up_single_params( Returns: Dict[str, Any]: A dictionary containing the workspace ID and the processed parameters. The dictionary - has keys '_WSID' and '_INPUTS', where '_WSID' is the workspace ID and '_INPUTS' is a list containing + has keys {_WSID} and {_INPUTS}, where {_WSID} is the workspace ID and {_INPUTS} is a list containing the input parameters. Raises: ValueError: If neither or both the workspace ID and workspace name are provided in the parameters. - KeyError: If the workspace ID or name is missing or invalid. Notes: - If a workspace ID is not provided, the function will attempt to convert the workspace name to an ID @@ -540,24 +539,24 @@ def validate_mass_params( params: Dict[str, Any], validate_params_func: Callable[[Dict[str, Any]], None] ) -> None: - """ + f""" Validates the provided parameters according to specific rules. Args: params (Dict[str, Any]): A dictionary containing parameters to validate. Must include: - - _WSID: A workspace ID, which must be present and valid. 
- - _INPUTS: A list of parameter dictionaries, each of which must be validated by `validate_params_func`. + - {_WSID}: A workspace ID, which must be present and valid. + - {_INPUTS}: A list of parameter dictionaries, each of which must be validated by `validate_params_func`. validate_params_func (Callable[[Dict[str, Any]], None]): A function that takes a dictionary of parameters and validates it. The function should raise an exception if the parameters are invalid. Raises: - ValueError: If `_WSID` is missing or invalid, if `_INPUTS` is missing or not a non-empty list, or if any - entry in `_INPUTS` is not a dictionary or fails validation. + ValueError: If {_WSID} is missing or invalid, if {_INPUTS} is missing or not a non-empty list, or if any + entry in {_INPUTS} is not a dictionary or fails validation. Notes: - - The function checks that `_WSID` is present and converts it to an integer using `get_int`. - - The `_INPUTS` field must be a non-empty list of dictionaries. Each dictionary in the list is validated + - The function checks that {_WSID} is present and converts it to an integer using `get_int`. + - The {_INPUTS} field must be a non-empty list of dictionaries. Each dictionary in the list is validated using `validate_params_func`. - If any validation fails, a `ValueError` is raised with a message indicating the issue and entry index. 
""" From 01a1dc87a775123e18ed82d7dc2dfdea1fbba148 Mon Sep 17 00:00:00 2001 From: Sijie Date: Fri, 1 Nov 2024 10:35:19 -0700 Subject: [PATCH 24/24] add workspace_id in GenomeFileUtil.spec --- GenomeFileUtil.spec | 1 + lib/GenomeFileUtil/GenomeFileUtilImpl.py | 44 ++++++++++++------------ 2 files changed, 23 insertions(+), 22 deletions(-) diff --git a/GenomeFileUtil.spec b/GenomeFileUtil.spec index 05cca1f9..5971b43d 100644 --- a/GenomeFileUtil.spec +++ b/GenomeFileUtil.spec @@ -318,6 +318,7 @@ module GenomeFileUtil { returns (MetagenomeSaveResult returnVal) authentication required; typedef structure { + int workspace_id; string workspace; string name; KBaseGenomes.Genome data; diff --git a/lib/GenomeFileUtil/GenomeFileUtilImpl.py b/lib/GenomeFileUtil/GenomeFileUtilImpl.py index 9ae1311d..b41f4d82 100644 --- a/lib/GenomeFileUtil/GenomeFileUtilImpl.py +++ b/lib/GenomeFileUtil/GenomeFileUtilImpl.py @@ -69,7 +69,7 @@ class GenomeFileUtil: ######################################### noqa VERSION = "0.11.7" GIT_URL = "git@github.com:kbaseapps/GenomeFileUtil.git" - GIT_COMMIT_HASH = "330d6c2da8a0dc2d57efffe690e8db2928455776" + GIT_COMMIT_HASH = "861a50b5ce2d0b8b481fe514b41a6d6b89ee0c96" #BEGIN_CLASS_HEADER #END_CLASS_HEADER @@ -831,27 +831,27 @@ def fasta_gff_to_metagenome(self, ctx, params): def save_one_genome(self, ctx, params): """ :param params: instance of type "SaveOneGenomeParams" -> structure: - parameter "workspace" of String, parameter "name" of String, - parameter "data" of type "Genome" (Genome type -- annotated and - assembled genome data. Field descriptions: id - string - KBase - legacy data ID scientific_name - string - human readable species - name domain - string - human readable phylogenetic domain name - (eg. 
"Bacteria") warnings - list of string - genome-level warnings - generated in the annotation process genome_tiers - list of string - - controlled vocabulary (based on app input and checked by - GenomeFileUtil) A list of labels describing the data source for - this genome. Allowed values - Representative, Reference, - ExternalDB, User Tier assignments based on genome source: * All - phytozome - Representative and ExternalDB * Phytozome flagship - genomes - Reference, Representative and ExternalDB * Ensembl - - Representative and ExternalDB * RefSeq Reference - Reference, - Representative and ExternalDB * RefSeq Representative - - Representative and ExternalDB * RefSeq Latest or All Assemblies - folder - ExternalDB * User Data - User tagged feature_counts - map - of string to integer - total counts of each type of feature keys - are a controlled vocabulary of - "CDS", "gene", "misc_feature", - "misc_recomb", "mobile_element", "ncRNA" - 72, - "non_coding_features", "non_coding_genes", + parameter "workspace_id" of Long, parameter "workspace" of String, + parameter "name" of String, parameter "data" of type "Genome" + (Genome type -- annotated and assembled genome data. Field + descriptions: id - string - KBase legacy data ID scientific_name - + string - human readable species name domain - string - human + readable phylogenetic domain name (eg. "Bacteria") warnings - list + of string - genome-level warnings generated in the annotation + process genome_tiers - list of string - controlled vocabulary + (based on app input and checked by GenomeFileUtil) A list of + labels describing the data source for this genome. 
Allowed values + - Representative, Reference, ExternalDB, User Tier assignments + based on genome source: * All phytozome - Representative and + ExternalDB * Phytozome flagship genomes - Reference, + Representative and ExternalDB * Ensembl - Representative and + ExternalDB * RefSeq Reference - Reference, Representative and + ExternalDB * RefSeq Representative - Representative and ExternalDB + * RefSeq Latest or All Assemblies folder - ExternalDB * User Data + - User tagged feature_counts - map of string to integer - total + counts of each type of feature keys are a controlled vocabulary of + - "CDS", "gene", "misc_feature", "misc_recomb", "mobile_element", + "ncRNA" - 72, "non_coding_features", "non_coding_genes", "protein_encoding_gene", "rRNA", "rep_origin", "repeat_region", "tRNA" genetic_code - int - An NCBI-assigned taxonomic category for the organism See here -