Skip to content

Commit

Permalink
add save_genomes function
Browse files Browse the repository at this point in the history
  • Loading branch information
Xiangs18 committed Aug 15, 2024
1 parent 4819598 commit 6208a9a
Show file tree
Hide file tree
Showing 8 changed files with 637 additions and 69 deletions.
19 changes: 19 additions & 0 deletions GenomeFileUtil.spec
Original file line number Diff line number Diff line change
Expand Up @@ -332,6 +332,25 @@ module GenomeFileUtil {
funcdef save_one_genome(SaveOneGenomeParams params)
returns (SaveGenomeResult returnVal) authentication required;

/*
Input describing one genome in a save_genomes batch.
name - name under which the genome object is saved in the workspace (required)
data - the genome object contents to save (required)
hidden - whether the saved object should be flagged as hidden (optional)
upgrade - when true, the genome data is run through the update step before
saving; the update is also applied when the data has no feature_counts field
*/
typedef structure {
string name;
KBaseGenomes.Genome data;
boolean hidden;
boolean upgrade;
} GenomeInput;

/*
Parameters for save_genomes.
workspace_id - integer ID (>= 1) of the workspace the genomes are saved to (required)
inputs - non-empty list of genomes to save
*/
typedef structure {
int workspace_id;
list<GenomeInput> inputs;
} SaveGenomesParams;

/*
Results of save_genomes.
results - one SaveGenomeResult per saved genome (presumably in the same order
as the inputs list - TODO confirm against the implementation)
*/
typedef structure {
list<SaveGenomeResult> results;
} SaveGenomesResults;

/*
Save a batch of genomes to the workspace identified by workspace_id.
*/
funcdef save_genomes(SaveGenomesParams params)
returns(SaveGenomesResults results) authentication required;

/*
gff_file - object containing path to gff_file
ws_ref - input Assembly or Genome reference
Expand Down
2 changes: 2 additions & 0 deletions RELEASE_NOTES.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [0.11.7] - TBD
- The `save_genomes` method was added to allow users to save genomes in batch
- Parsed and validated genome before upload
- Unusable `export_genome_features_protein_to_fasta` function was removed
- The `genbanks_to_genomes` method was added to allow users to upload multiple
genome objects at once
Expand Down
438 changes: 437 additions & 1 deletion lib/GenomeFileUtil/GenomeFileUtilImpl.py

Large diffs are not rendered by default.

4 changes: 4 additions & 0 deletions lib/GenomeFileUtil/GenomeFileUtilServer.py
Original file line number Diff line number Diff line change
Expand Up @@ -394,6 +394,10 @@ def __init__(self):
name='GenomeFileUtil.save_one_genome',
types=[dict])
self.method_authentication['GenomeFileUtil.save_one_genome'] = 'required' # noqa
self.rpc_service.add(impl_GenomeFileUtil.save_genomes,
name='GenomeFileUtil.save_genomes',
types=[dict])
self.method_authentication['GenomeFileUtil.save_genomes'] = 'required' # noqa
self.rpc_service.add(impl_GenomeFileUtil.ws_obj_gff_to_genome,
name='GenomeFileUtil.ws_obj_gff_to_genome',
types=[dict])
Expand Down
13 changes: 3 additions & 10 deletions lib/GenomeFileUtil/core/GenbankToGenome.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from installed_clients.AssemblyUtilClient import AssemblyUtil
from installed_clients.DataFileUtilClient import DataFileUtil
from GenomeFileUtil.core.GenomeInterface import GenomeInterface
from GenomeFileUtil.core.MiscUtils import get_int
from installed_clients.WorkspaceClient import Workspace
from GenomeFileUtil.core.GenomeUtils import (
is_parent, propagate_cds_props_to_gene, warnings, parse_inferences,
Expand Down Expand Up @@ -125,7 +126,7 @@ def _set_up_single_params(self, params):
# avoid side effects and keep variables in params unmodified
inputs = dict(params)
self._validate_params(inputs)
ws_id = self._get_int(inputs.pop(_WSID, None), _WSID)
ws_id = get_int(inputs.pop(_WSID, None), _WSID)
ws_name = inputs.pop(_WSNAME, None)
if (bool(ws_id) == bool(ws_name)): # xnor
raise ValueError(f"Exactly one of a '{_WSID}' or a '{_WSNAME}' parameter must be provided")
Expand All @@ -137,7 +138,7 @@ def _set_up_single_params(self, params):
return mass_params

def _validate_mass_params(self, params):
ws_id = self._get_int(params.get(_WSID), _WSID)
ws_id = get_int(params.get(_WSID), _WSID)
if not ws_id:
raise ValueError(f"{_WSID} is required")
inputs = params.get(_INPUTS)
Expand All @@ -151,14 +152,6 @@ def _validate_mass_params(self, params):
except Exception as e:
raise ValueError(f"Entry #{i} in {_INPUTS} field has invalid params: {e}") from e

def _get_int(self, putative_int, name, minimum=1):
if putative_int is not None:
if type(putative_int) is not int:
raise ValueError(f"{name} must be an integer, got: {putative_int}")
if putative_int < minimum:
raise ValueError(f"{name} must be an integer >= {minimum}")
return putative_int

def _import_genbank_mass(self, params):

workspace_id = params[_WSID]
Expand Down
202 changes: 145 additions & 57 deletions lib/GenomeFileUtil/core/GenomeInterface.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,14 @@
from installed_clients.DataFileUtilClient import DataFileUtil
from installed_clients.WSLargeDataIOClient import WsLargeDataIO
from GenomeFileUtil.core import GenomeUtils
from GenomeFileUtil.core.MiscUtils import get_int

MAX_GENOME_SIZE = 2**30

_WS = "workspace"
_WSID = "workspace_id"
_INPUTS = "inputs"


class GenomeInterface:
def __init__(self, config):
Expand All @@ -32,18 +37,80 @@ def __init__(self, config):
self.scratch = config.raw['scratch']
self.ws_large_data = WsLargeDataIO(self.callback_url)

@staticmethod
def _validate_save_one_genome_params(params):
def save_one_genome(self, params):
    """
    Save a single genome by routing it through the batch save path.

    params - dict describing one genome plus its target workspace; it is
        converted to batch form by _set_up_single_params.

    Returns the first (and only) entry of the batch result list.
    """
    print("validating parameters")
    batch_results = self._save_genome_mass(self._set_up_single_params(params))
    return batch_results[0]

def save_genome_mass(self, params):
    """
    Validate batch parameters, then save every genome they describe.

    params - dict with the target workspace ID and the list of genome inputs;
        checked by _validate_mass_params before anything is saved.

    Returns whatever _save_genome_mass returns for the validated params.
    """
    print("validating parameters")
    self._validate_mass_params(params)
    saved = self._save_genome_mass(params)
    return saved

def _set_up_single_params(self, params):
    """
    Convert single-genome params into the batch form used by _save_genome_mass.

    Works on a shallow copy so the caller's dict is left untouched. Exactly one
    of a workspace ID or a workspace name must be supplied; a name is translated
    to an ID through the DataFileUtil client.

    Returns a dict holding the workspace ID and a one-element inputs list.
    Raises ValueError if required genome fields are missing or if both / neither
    of the workspace identifiers are given.
    """
    single_input = dict(params)
    self._validate_genome_input_params(single_input)

    # Strip the workspace identifiers out of the copy so that only the
    # genome-specific fields remain in the batch entry.
    ws_id = get_int(single_input.pop(_WSID, None), _WSID)
    ws_name = single_input.pop(_WS, None)

    has_id = bool(ws_id)
    has_name = bool(ws_name)
    if has_id == has_name:  # both given or both missing
        raise ValueError(f"Exactly one of a '{_WSID}' or a '{_WS}' parameter must be provided")

    if not has_id:
        print(
            f"Translating workspace name {ws_name} to a workspace ID. Prefer submitting "
            "a workspace ID over a mutable workspace name that may cause race conditions"
        )
        ws_id = self.dfu.ws_name_to_id(ws_name)

    return {_WSID: ws_id, _INPUTS: [single_input]}

def _validate_mass_params(self, params):
    """
    Check that batch save parameters are well formed.

    params - dict expected to carry an integer workspace ID and a non-empty
        list of per-genome input dicts.

    Raises ValueError when the workspace ID is missing or invalid, when the
    inputs field is missing, empty or not a list, when an entry is not a dict,
    or when an entry fails _validate_genome_input_params (the entry's 1-based
    position is included in the message).
    """
    ws_id = get_int(params.get(_WSID), _WSID)
    if not ws_id:
        raise ValueError(f"{_WSID} is required")

    inputs = params.get(_INPUTS)
    # isinstance rather than an exact type comparison: the idiomatic check,
    # and it also accepts list / dict subclasses.
    if not inputs or not isinstance(inputs, list):
        raise ValueError(f"{_INPUTS} field is required and must be a non-empty list")

    for i, inp in enumerate(inputs, start=1):
        if not isinstance(inp, dict):
            raise ValueError(f"Entry #{i} in {_INPUTS} field is not a mapping as required")
        try:
            self._validate_genome_input_params(inp)
        except Exception as e:
            # Re-raise with the entry position so the caller can find the bad input.
            raise ValueError(f"Entry #{i} in {_INPUTS} field has invalid params: {e}") from e

def _validate_genome_input_params(self, genome_input):
"""
_validate_save_one_genome_params:
validates params passed to save_one_genome method
Check required parameters are in genome_input
"""
logging.info('start validating save_one_genome params')
logging.info("start validating genome_input params")
# check for required parameters
for p in ['workspace', 'name', 'data']:
if p not in params:
raise ValueError(
'"{}" parameter is required, but missing'.format(p))
for p in ["name", "data"]:
if p not in genome_input:
raise ValueError(f"{p} parameter is required, but missing")

def _save_genome_objects(
    self,
    workspace_id,
    ws_datatypes,
    data_paths,
    names,
    meta_data,
    hidden_data,
):
    """
    Save a batch of genome JSON files to a workspace in one request.

    The five parallel sequences are combined position by position into one
    object specification each (type, JSON file path, name, metadata, hidden
    flag) and passed to the large-data workspace client.

    Returns the result of the client's save_objects call.
    """
    spec_keys = ('type', 'data_json_file', 'name', 'meta', 'hidden')
    per_object = zip(ws_datatypes, data_paths, names, meta_data, hidden_data)
    ws_inputs = [dict(zip(spec_keys, values)) for values in per_object]
    save_request = {'id': workspace_id, 'objects': ws_inputs}
    return self.ws_large_data.save_objects(save_request)

def _check_shock_response(self, response, errtxt):
"""
Expand Down Expand Up @@ -128,54 +195,75 @@ def get_one_genome(self, params):
return data, res['info']
# return self.dfu.get_objects(params)['data'][0]

def save_one_genome(self, params):
logging.info('start saving genome object')
self._validate_save_one_genome_params(params)
workspace = params['workspace']
name = params['name']
data = params['data']
# XXX there is no `workspace_datatype` param in the spec
ws_datatype = params.get('workspace_datatype', "KBaseGenomes.Genome")
# XXX there is no `meta` param in the spec
meta = params.get('meta', {})
if "AnnotatedMetagenomeAssembly" in ws_datatype:
if params.get('upgrade') or 'feature_counts' not in data:
data = self._update_metagenome(data)
else:
if params.get('upgrade') or 'feature_counts' not in data:
data = self._update_genome(data)

# check all handles point to shock nodes owned by calling user
self._own_handle(data, 'genbank_handle_ref')
self._own_handle(data, 'gff_handle_ref')
if "AnnotatedMetagenomeAssembly" not in ws_datatype:
self._check_dna_sequence_in_features(data)
data['warnings'] = self.validate_genome(data)

# sort data
data = GenomeUtils.sort_dict(data)
# dump genome to scratch for upload
data_path = os.path.join(self.scratch, name + ".json")
json.dump(data, open(data_path, 'w'))
if 'hidden' in params and str(params['hidden']).lower() in ('yes', 'true', 't', '1'):
hidden = 1
else:
hidden = 0

if isinstance(workspace, int) or workspace.isdigit():
workspace_id = workspace
else:
workspace_id = self.dfu.ws_name_to_id(workspace)

save_params = {'id': workspace_id,
'objects': [{'type': ws_datatype,
'data_json_file': data_path,
'name': name,
'meta': meta,
'hidden': hidden}]}
dfu_oi = self.ws_large_data.save_objects(save_params)[0]
returnVal = {'info': dfu_oi, 'warnings': data.get('warnings', [])}
return returnVal
def _save_genome_mass(self, params):
    """
    Prepare every genome in an already-validated batch and save them together.

    For each entry in the inputs list the genome data is upgraded when
    requested (or when it has no 'feature_counts'), its handle references are
    checked, non-metagenome genomes are validated, and the sorted data is
    written as JSON into the scratch directory. The resulting files are then
    passed to _save_genome_objects in a single call.

    params - dict with the workspace ID and the list of inputs. Each input
        provides 'name' and 'data' and may provide 'hidden', 'upgrade',
        'workspace_datatype' and 'meta'.

    Returns a list with one {'info': ..., 'warnings': ...} dict per input,
    paired positionally with the entries returned by _save_genome_objects.
    """
    truthy_hidden_values = ('yes', 'true', 't', '1')

    workspace_id = params[_WSID]
    inputs = params[_INPUTS]

    ws_datatypes = []
    data_paths = []
    names = []
    meta_data = []
    hidden_data = []
    warnings = []

    for input_params in inputs:

        # retrieve required params
        name = input_params['name']
        data = input_params['data']

        # XXX there is no `workspace_datatype` param in the spec
        ws_datatype = input_params.get('workspace_datatype', "KBaseGenomes.Genome")
        # XXX there is no `meta` param in the spec
        meta = input_params.get('meta', {})

        is_metagenome = "AnnotatedMetagenomeAssembly" in ws_datatype

        # Upgrade on request, or when the data lacks 'feature_counts'.
        if input_params.get('upgrade') or 'feature_counts' not in data:
            if is_metagenome:
                data = self._update_metagenome(data)
            else:
                data = self._update_genome(data)

        # check all handles point to shock nodes owned by calling user
        self._own_handle(data, 'genbank_handle_ref')
        self._own_handle(data, 'gff_handle_ref')
        if not is_metagenome:
            self._check_dna_sequence_in_features(data)
            # NOTE(review): validation is assumed to apply to non-metagenome
            # genomes only (the later .get('warnings', []) implies the key can
            # be absent); the original indentation was not recoverable - confirm.
            data['warnings'] = self.validate_genome(data)

        # sort data
        data = GenomeUtils.sort_dict(data)

        # dump genome to scratch for upload; the context manager makes sure
        # the file is flushed and closed before it is handed to the uploader
        data_path = os.path.join(self.scratch, name + ".json")
        with open(data_path, 'w') as data_file:
            json.dump(data, data_file)

        # 'hidden' is a per-genome setting, so it is read from this input
        # entry; the batch-level params only hold the workspace ID and inputs.
        if ('hidden' in input_params
                and str(input_params['hidden']).lower() in truthy_hidden_values):
            hidden = 1
        else:
            hidden = 0

        ws_datatypes.append(ws_datatype)
        names.append(name)
        meta_data.append(meta)
        data_paths.append(data_path)
        hidden_data.append(hidden)
        warnings.append(data.get('warnings', []))

    dfu_infos = self._save_genome_objects(
        workspace_id,
        ws_datatypes,
        data_paths,
        names,
        meta_data,
        hidden_data,
    )

    return [
        {'info': dfu_oi, 'warnings': warning}
        for dfu_oi, warning in zip(dfu_infos, warnings)
    ]

@staticmethod
def determine_tier(source):
Expand Down
26 changes: 26 additions & 0 deletions lib/GenomeFileUtil/core/MiscUtils.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,29 @@ def validate_lists_have_same_elements(l1, l2):
diff = set(l1) ^ (set(l2)) # get the symmetric difference of the sets
# check if all ids are shared
return len(diff) == 0


def get_int(putative_int, name, minimum=1):
    """
    Validate an optional integer and hand it back unchanged.

    A value of None is treated as "not provided" and returned as is. Any other
    value must be exactly of type int (so bool is rejected) and must not be
    smaller than the given minimum.

    Args:
        putative_int: The value to check; may be None.
        name: Label for the value, used in error messages.
        minimum: Smallest acceptable value. Defaults to 1.

    Returns:
        The input value: None if None was given, otherwise the validated int.

    Raises:
        ValueError: If the value is not None and is not an int, or if it is
            less than minimum.
    """
    if putative_int is None:
        return None
    # Exact type check on purpose: isinstance would let bool through.
    if type(putative_int) is not int:
        raise ValueError(f"{name} must be an integer, got: {putative_int}")
    if putative_int < minimum:
        raise ValueError(f"{name} must be an integer >= {minimum}")
    return putative_int
2 changes: 1 addition & 1 deletion test/problematic_tests/save_genome_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ def test_bad_one_genome_params(self):
invalidate_params = {'missing_workspace': 'workspace',
'name': 'name',
'data': 'data'}
error_msg = '"workspace" parameter is required, but missing'
error_msg = "Exactly one of a 'workspace_id' or a 'workspace' parameter must be provided"
self.fail_save_one_genome(invalidate_params, error_msg)

def test_one_genome(self):
Expand Down

0 comments on commit 6208a9a

Please sign in to comment.