-
Notifications
You must be signed in to change notification settings - Fork 13
/
Copy pathAssemblyUtil.spec
251 lines (209 loc) · 9.49 KB
/
AssemblyUtil.spec
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
/*
Utilities for converting KBase Assembly data objects (and related object types) into local
FASTA files, and for saving FASTA files as Assembly objects in the Workspace.
*/
module AssemblyUtil {
/* A Unique Permanent Address for a workspace object, which is of the form W/O/V,
where W is the numeric workspace ID, O is the numeric object ID, and V is the object
version.
*/
typedef string upa;
/* A local FASTA file.
    path - the path to the FASTA file.
    assembly_name - the assembly name associated with the file. This field is ignored
        when the structure is supplied as the 'file' input in SaveAssemblyParams.
*/
typedef structure {
string path;
string assembly_name;
} FastaAssemblyFile;
/* Parameters for the get_assembly_as_fasta function.
    ref - a reference to the Assembly (or legacy ContigSet) data object to fetch.
    filename - the file name to which the FASTA data should be saved. If omitted, a
        random name is generated.

@optional filename
*/
typedef structure {
string ref;
string filename;
} GetAssemblyParams;
/*
Given a reference to an Assembly (or legacy ContigSet data object), along with a set of options,
construct a local FASTA file with the sequence data. If filename is set, attempt to save to the
specified filename. Otherwise, a random name will be generated.
*/
funcdef get_assembly_as_fasta(GetAssemblyParams params)
returns (FastaAssemblyFile file) authentication required;
/* A workspace reference. */
typedef string ref;
/* An object-wrapped list of KBase object references.
    ref_lst - the list of KBase object references, each of which can be one of the
        following types:
            - KBaseGenomes.Genome
            - KBaseSets.AssemblySet
            - KBaseMetagenome.BinnedContigs
            - KBaseGenomes.ContigSet
            - KBaseGenomeAnnotations.Assembly
            - KBaseSearch.GenomeSet
            - KBaseSets.GenomeSet
*/
typedef structure {
list<ref> ref_lst;
} KBaseOjbReferences;
/* The FASTA files associated with a workspace object.
    paths - list of paths to FASTA files associated with the workspace object.
    type - the workspace object type.
    parent_refs - (optional) list of associated workspace object references, if
        different from the output key.
*/
typedef structure {
list<string> paths;
list<ref> parent_refs;
string type;
} ref_fastas;
/*
Given a list of references to KBase objects, construct a local FASTA file with the sequence
data for each reference. The output maps a workspace reference to its FASTA file information.
*/
funcdef get_fastas(KBaseOjbReferences params)
returns (mapping<ref, ref_fastas> output) authentication required;
/* Parameters for the export_assembly_as_fasta function.
    input_ref - a reference to the workspace object to export (presumably an Assembly or
        legacy ContigSet, as accepted by get_assembly_as_fasta - TODO confirm).
*/
typedef structure {
string input_ref;
} ExportParams;
/* Output of the export_assembly_as_fasta function.
    shock_id - the ID of the Shock node to which the exported zip file was saved.
*/
typedef structure {
string shock_id;
} ExportOutput;
/*
A method designed especially for download. It calls 'get_assembly_as_fasta' to do
the work, then packages the output with WS provenance and object info into
a zip file and saves it to Shock.
*/
funcdef export_assembly_as_fasta(ExportParams params)
returns (ExportOutput output) authentication required;
/* The ID of a node in the Blobstore (formerly Shock). */
typedef string ShockNodeId;
/*
Structure for setting additional information per contig.
    is_circ - flag indicating whether the contig is circular. 0 is false, 1 is true,
        missing indicates unknown.
    description - if set, sets the description of the contig in the assembly object,
        which may override what was in the FASTA file.
*/
typedef structure {
int is_circ;
string description;
} ExtraContigInfo;
/* Parameters for the save_assembly_from_fasta2 and save_assembly_from_fasta functions.
Required arguments:
    Exactly one of:
        file - a pre-existing FASTA file to import. The 'assembly_name' field in the
            FastaAssemblyFile object is ignored.
        shock_id - an ID of a node in the Blobstore containing the FASTA file.
    Exactly one of:
        workspace_id - the immutable, numeric ID of the target workspace. Always prefer
            providing the ID over the name.
        workspace_name - the name of the target workspace.
    assembly_name - the target object name.
Optional arguments:
    type - should be one of 'isolate', 'metagenome', (maybe 'transcriptome').
        Defaults to 'Unknown'.
    external_source - not described in the original documentation; presumably the source
        of the input data (e.g. JGI, NCBI), as for FASTAInput - TODO confirm.
    external_source_id - not described in the original documentation; presumably the ID
        of the input data at the source, as for FASTAInput - TODO confirm.
    min_contig_length - if set and the value is greater than 1, only sequences with
        length greater than or equal to min_contig_length are included; all other
        sequences are discarded.
    contig_info - map from contig_id to a small structure that can be used to set the
        is_circular and description fields for Assemblies.
*/
typedef structure {
FastaAssemblyFile file;
ShockNodeId shock_id;
int workspace_id;
string workspace_name;
string assembly_name;
string type;
string external_source;
string external_source_id;
int min_contig_length;
mapping<string, ExtraContigInfo> contig_info;
} SaveAssemblyParams;
/* Information about an object, including user provided metadata.
    objid - the numerical id of the object.
    name - the name of the object.
    type - the type of the object.
    save_date - the save date of the object.
    version - the version of the object.
    saved_by - the user that saved or copied the object.
    wsid - the id of the workspace containing the object.
    workspace - the name of the workspace containing the object.
    chsum - the md5 checksum of the object.
    size - the size of the object in bytes.
    meta - arbitrary user-supplied metadata about the object.
*/
typedef tuple<int objid, string name, string type, string save_date,
int version, string saved_by, int wsid, string workspace, string chsum,
int size, mapping<string, string> meta> object_info;
/* Results from saving an assembly.
    upa - the address of the resulting workspace object.
    filtered_input - the filtered input file if the minimum contig length parameter is
        present and > 0. null otherwise.
        NOTE(review): the parameter documentation states that filtering applies only for
        values greater than 1 - confirm which threshold is correct.
    object_info - the object information for the resulting workspace object.
*/
typedef structure {
upa upa;
string filtered_input;
object_info object_info;
} SaveAssemblyResult;
/* Save a KBase Workspace assembly object from a FASTA file. */
funcdef save_assembly_from_fasta2(SaveAssemblyParams params)
returns (SaveAssemblyResult result) authentication required;
/* Save a KBase Workspace assembly object from a FASTA file, returning a reference to the
saved object as a string. Use save_assembly_from_fasta2 instead.

@deprecated AssemblyUtil.save_assembly_from_fasta2
*/
funcdef save_assembly_from_fasta(SaveAssemblyParams params) returns (string ref)
authentication required;
/* An input FASTA file and metadata for import.
Required arguments:
    Exactly one of:
        file - a path to an input FASTA file. Must be accessible inside the AssemblyUtil
            Docker container.
        node - a node ID for a Blobstore (formerly Shock) node containing an input FASTA
            file.
    assembly_name - the workspace name under which to save the Assembly object.
Optional arguments:
    type - should be one of 'isolate', 'metagenome', (maybe 'transcriptome').
        Defaults to 'Unknown'.
    external_source - the source of the input data. E.g. JGI, NCBI, etc.
    external_source_id - the ID of the input data at the source.
    contig_info - map from contig_id to a small structure that can be used to set the
        is_circular and description fields for Assemblies.
    object_metadata - arbitrary key-value pairs intended for addition to the metadata of
        the Assembly object. Saved along with the object in the Workspace. Note that any
        auto metadata keys in the Assembly typespec will take precedence over any
        user-submitted keys.
*/
typedef structure {
string file;
string node;
string assembly_name;
string type;
string external_source;
string external_source_id;
mapping<string, ExtraContigInfo> contig_info;
mapping<string, string> object_metadata;
} FASTAInput;
/* Input for the save_assemblies_from_fastas function.
Required arguments:
    workspace_id - the numerical ID of the workspace in which to save the Assemblies.
    inputs - a list of FASTA files to import. All of the files must be from the same
        source - either all local files or all Blobstore nodes.
Optional arguments:
    min_contig_length - an integer > 1. If present, sequences of lesser length will
        be removed from the input FASTA files.
*/
typedef structure {
int workspace_id;
list<FASTAInput> inputs;
int min_contig_length;
} SaveAssembliesParams;
/* Results for the save_assemblies_from_fastas function.
    results - the results of the save operation in the same order as the input.
*/
typedef structure {
list<SaveAssemblyResult> results;
} SaveAssembliesResults;
/* Save multiple assembly objects from FASTA files.
WARNING: The code currently saves all assembly object data in memory before sending it
to the workspace in a single batch. Since the object data doesn't include sequences,
it is typically small and so in most cases this shouldn't cause issues. However, many
assemblies and / or many contigs could conceivably cause memory issues or could
cause the workspace to reject the data package if the serialized data is > 1GB.
TODO: If this becomes a common issue (not particularly likely?) update the code to
    * Save assembly object data on disk if it becomes too large
    * Batch uploads to the workspace based on data size
*/
funcdef save_assemblies_from_fastas(SaveAssembliesParams params)
returns(SaveAssembliesResults results)
authentication required;
};