Skip to content

Commit

Permalink
move set_up_single_params && validate_mass_params into GenomeUtils
Browse files Browse the repository at this point in the history
  • Loading branch information
Xiangs18 committed Aug 28, 2024
1 parent e79c297 commit 35030a0
Show file tree
Hide file tree
Showing 4 changed files with 146 additions and 107 deletions.
40 changes: 6 additions & 34 deletions lib/GenomeFileUtil/core/GenbankToGenome.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,11 @@
from installed_clients.AssemblyUtilClient import AssemblyUtil
from installed_clients.DataFileUtilClient import DataFileUtil
from GenomeFileUtil.core.GenomeInterface import GenomeInterface
from GenomeFileUtil.core.MiscUtils import get_int
from installed_clients.WorkspaceClient import Workspace
from GenomeFileUtil.core.GenomeUtils import (
is_parent, propagate_cds_props_to_gene, warnings, parse_inferences,
load_ontology_mappings, set_taxon_data, set_default_taxon_data
load_ontology_mappings, set_taxon_data, set_default_taxon_data,
set_up_single_params, validate_mass_params
)

MAX_MISC_FEATURE_SIZE = 10000
Expand Down Expand Up @@ -114,44 +114,16 @@ def __init__(self, config):

def import_genbank(self, params):
print('validating parameters')
mass_params = self._set_up_single_params(params)
mass_params = set_up_single_params(
params, _WSNAME, self._validate_params, self.dfu.ws_name_to_id
)
return self._import_genbank_mass(mass_params)[0]

def import_genbank_mass(self, params):
print('validating parameters')
self._validate_mass_params(params)
validate_mass_params(params, self._validate_params)
return self._import_genbank_mass(params)

def _set_up_single_params(self, params):
# avoid side effects and keep variables in params unmodfied
inputs = dict(params)
self._validate_params(inputs)
ws_id = get_int(inputs.pop(_WSID, None), _WSID)
ws_name = inputs.pop(_WSNAME, None)
if (bool(ws_id) == bool(ws_name)): # xnor
raise ValueError(f"Exactly one of a '{_WSID}' or a '{_WSNAME}' parameter must be provided")
if not ws_id:
print(f"Translating workspace name {ws_name} to a workspace ID. Prefer submitting "
+ "a workspace ID over a mutable workspace name that may cause race conditions")
ws_id = self.dfu.ws_name_to_id(ws_name)
mass_params = {_WSID: ws_id, _INPUTS: [inputs]}
return mass_params

def _validate_mass_params(self, params):
ws_id = get_int(params.get(_WSID), _WSID)
if not ws_id:
raise ValueError(f"{_WSID} is required")
inputs = params.get(_INPUTS)
if not inputs or type(inputs) is not list:
raise ValueError(f"{_INPUTS} field is required and must be a non-empty list")
for i, inp in enumerate(inputs, start=1):
if type(inp) is not dict:
raise ValueError(f"Entry #{i} in {_INPUTS} field is not a mapping as required")
try:
self._validate_params(inp)
except Exception as e:
raise ValueError(f"Entry #{i} in {_INPUTS} field has invalid params: {e}") from e

def _import_genbank_mass(self, params):

workspace_id = params[_WSID]
Expand Down
48 changes: 12 additions & 36 deletions lib/GenomeFileUtil/core/GenomeInterface.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,10 @@
from installed_clients.AssemblySequenceAPIServiceClient import AssemblySequenceAPI
from installed_clients.DataFileUtilClient import DataFileUtil
from installed_clients.WSLargeDataIOClient import WsLargeDataIO
from GenomeFileUtil.core import GenomeUtils
from GenomeFileUtil.core.MiscUtils import get_int
from GenomeFileUtil.core.GenomeUtils import (
set_taxon_data, set_default_taxon_data, sort_dict,
set_up_single_params, validate_mass_params
)

MAX_GENOME_SIZE = 2**30

Expand All @@ -38,42 +40,16 @@ def __init__(self, config):
self.ws_large_data = WsLargeDataIO(self.callback_url)

def save_one_genome(self, params):
mass_params = self._set_up_single_params(params)
mass_params = set_up_single_params(
params, _WS, self._validate_genome_input_params, self.dfu.ws_name_to_id
)
return self._save_genome_mass(mass_params)[0]

# NOTE If there is more than 1GB of data or more than 10,000 genomes to upload, the workspace will fail.
def save_genome_mass(self, params):
self._validate_mass_params(params)
validate_mass_params(params, self._validate_genome_input_params)
return self._save_genome_mass(params)

def _set_up_single_params(self, params):
inputs = dict(params)
self._validate_genome_input_params(inputs)
ws_id = get_int(inputs.pop(_WSID, None), _WSID)
ws_name = inputs.pop(_WS, None)
if bool(ws_id) == bool(ws_name): # xnor
raise ValueError(f"Exactly one of a '{_WSID}' or a '{_WS}' parameter must be provided")
if not ws_id:
print(f"Translating workspace name {ws_name} to a workspace ID. Prefer submitting "
+ "a workspace ID over a mutable workspace name that may cause race conditions")
ws_id = self.dfu.ws_name_to_id(ws_name)
mass_params = {_WSID: ws_id, _INPUTS: [inputs]}
return mass_params

def _validate_mass_params(self, params):
ws_id = get_int(params.get(_WSID), _WSID)
if not ws_id:
raise ValueError(f"{_WSID} is required")
inputs = params.get(_INPUTS)
if not inputs or type(inputs) != list:
raise ValueError(f"{_INPUTS} field is required and must be a non-empty list")
for i, inp in enumerate(inputs, start=1):
if type(inp) != dict:
raise ValueError(f"Entry #{i} in {_INPUTS} field is not a mapping as required")
try:
self._validate_genome_input_params(inp)
except Exception as e:
raise ValueError(f"Entry #{i} in {_INPUTS} field has invalid params: {e}") from e

def _validate_genome_input_params(self, genome_input):
"""
Check required parameters are in genome_input
Expand Down Expand Up @@ -236,7 +212,7 @@ def _save_genome_mass(self, params):
data['warnings'] = self.validate_genome(data)

# sort data
data = GenomeUtils.sort_dict(data)
data = sort_dict(data)
# dump genome to scratch for upload
data_path = os.path.join(self.scratch, name + ".json")
json.dump(data, open(data_path, 'w'))
Expand Down Expand Up @@ -311,9 +287,9 @@ def _update_genome(self, genome):
# NOTE: Metagenome object does not have a 'taxon_assignments' field
if 'taxon_assignments' in genome and genome['taxon_assignments'].get('ncbi'):
tax_id = int(genome['taxon_assignments']['ncbi'])
GenomeUtils.set_taxon_data(tax_id, self.re_api_url, genome)
set_taxon_data(tax_id, self.re_api_url, genome)

Check warning on line 290 in lib/GenomeFileUtil/core/GenomeInterface.py

View check run for this annotation

Codecov / codecov/patch

lib/GenomeFileUtil/core/GenomeInterface.py#L290

Added line #L290 was not covered by tests
else:
GenomeUtils.set_default_taxon_data(genome)
set_default_taxon_data(genome)

if any([x not in genome for x in ('dna_size', 'md5', 'gc_content', 'num_contigs')]):
if 'assembly_ref' in genome:
Expand Down
93 changes: 92 additions & 1 deletion lib/GenomeFileUtil/core/GenomeUtils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,18 @@
import os
import re
import time
from typing import Callable, Dict, Any

from relation_engine_client import REClient
from relation_engine_client.exceptions import RENotFound
from GenomeFileUtil.core.MiscUtils import get_int

# Name of the ncbi taxonomy namespace stored in "taxon_assignments"
_NCBI_TAX = 'ncbi'

_WSID = 'workspace_id'
_INPUTS = 'inputs'

warnings = {
"cds_excluded": "SUSPECT: CDS from {} was excluded because the associated "
"CDS failed coordinates validation",
Expand Down Expand Up @@ -482,4 +487,90 @@ def set_taxon_data(tax_id, re_api_url, genome_dict):
)
# Assign the scientific name to the most specific (right-most) taxon in the lineage
genome_dict['scientific_name'] = sciname



def set_up_single_params(
params: Dict[str, Any],
ws: str,
validate_params_func: Callable[[Dict[str, Any]], None],
ws_name_to_id_func: Callable[[str], int]
) -> Dict[str, Any]:
"""
Sets up parameters by validating them and ensuring that exactly one of workspace ID or name is provided.
Args:
params (Dict[str, Any]): A dictionary where the keys are parameter names (strings) and the values
can be of any type.
ws (str): A string representing the key for the workspace name or identifier.
validate_params_func (Callable[[Dict[str, Any]], None]): A function that takes a dictionary of parameters
and validates them. This function should raise an exception if the parameters are invalid.
ws_name_to_id_func (Callable[[str], int]): A function that takes a workspace name (string) and returns
its corresponding ID (integer).
Returns:
Dict[str, Any]: A dictionary containing the workspace ID and the processed parameters. The dictionary
has keys '_WSID' and '_INPUTS', where '_WSID' is the workspace ID and '_INPUTS' is a list containing
the input parameters.
Raises:
ValueError: If neither or both the workspace ID and workspace name are provided in the parameters.
KeyError: If the workspace ID or name is missing or invalid.
Notes:
- If a workspace ID is not provided, the function will attempt to convert the workspace name to an ID
using `ws_name_to_id_func`.
- It is preferable to provide a workspace ID directly to avoid potential race conditions with mutable
workspace names.
"""
inputs = dict(params)
validate_params_func(inputs)
ws_id = get_int(inputs.pop(_WSID, None), _WSID)
ws_name = inputs.pop(ws, None)
if bool(ws_id) == bool(ws_name): # xnor
raise ValueError(f"Exactly one of a '{_WSID}' or a '{ws}' parameter must be provided")
if not ws_id:
print(f"Translating workspace name {ws_name} to a workspace ID. Prefer submitting "
+ "a workspace ID over a mutable workspace name that may cause race conditions")
ws_id = ws_name_to_id_func(ws_name)
mass_params = {_WSID: ws_id, _INPUTS: [inputs]}
return mass_params


def validate_mass_params(
params: Dict[str, Any],
validate_params_func: Callable[[Dict[str, Any]], None]
) -> None:
"""
Validates the provided parameters according to specific rules.
Args:
params (Dict[str, Any]): A dictionary containing parameters to validate. Must include:
- _WSID: A workspace ID, which must be present and valid.
- _INPUTS: A list of parameter dictionaries, each of which must be validated by `validate_params_func`.
validate_params_func (Callable[[Dict[str, Any]], None]): A function that takes a dictionary of parameters
and validates it. The function should raise an exception if the parameters are invalid.
Raises:
ValueError: If `_WSID` is missing or invalid, if `_INPUTS` is missing or not a non-empty list, or if any
entry in `_INPUTS` is not a dictionary or fails validation.
Notes:
- The function checks that `_WSID` is present and converts it to an integer using `get_int`.
- The `_INPUTS` field must be a non-empty list of dictionaries. Each dictionary in the list is validated
using `validate_params_func`.
- If any validation fails, a `ValueError` is raised with a message indicating the issue and entry index.
"""
ws_id = get_int(params.get(_WSID), _WSID)
if not ws_id:
raise ValueError(f"{_WSID} is required")
inputs = params.get(_INPUTS)
if not inputs or type(inputs) != list:
raise ValueError(f"{_INPUTS} field is required and must be a non-empty list")
for i, inp in enumerate(inputs, start=1):
if type(inp) != dict:
raise ValueError(f"Entry #{i} in {_INPUTS} field is not a mapping as required")
try:
validate_params_func(inp)
except Exception as e:
raise ValueError(f"Entry #{i} in {_INPUTS} field has invalid params: {e}") from e
72 changes: 36 additions & 36 deletions test/problematic_tests/save_genome_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,42 +234,42 @@ def test_genomes_with_hidden(self):
ret = self.genome_interface.save_genome_mass(params)[0]
self.check_save_one_genome_output(ret, genome_name)

def test_bad_genomes_params_missing_wsid(self):
self.start_test()
invalidate_params = {
'missing_workspace_id': 'workspace_id',
'name': 'name',
'data': 'data',
}
error_msg = "workspace_id is required"
self.fail_save_genome(invalidate_params, error_msg, mass=True)

def test_bad_genomes_params_empty_inputs(self):
self.start_test()
invalidate_params = {
'workspace_id': self.wsID,
'inputs': []
}
error_msg = "inputs field is required and must be a non-empty list"
self.fail_save_genome(invalidate_params, error_msg, mass=True)

def test_bad_genomes_params_invalidate_entry_type(self):
self.start_test()
invalidate_params = {
'workspace_id': self.wsID,
'inputs': [['name', 'data']],
}
error_msg = "Entry #1 in inputs field is not a mapping as required"
self.fail_save_genome(invalidate_params, error_msg, mass=True)

def test_bad_genomes_params_missing_parameter(self):
self.start_test()
invalidate_params = {
'workspace_id': self.wsID,
'inputs': [{'data': 'data'}],
}
error_msg = "Entry #1 in inputs field has invalid params: name parameter is required, but missing"
self.fail_save_genome(invalidate_params, error_msg, mass=True)
# def test_bad_genomes_params_missing_wsid(self):
# self.start_test()
# invalidate_params = {
# 'missing_workspace_id': 'workspace_id',
# 'name': 'name',
# 'data': 'data',
# }
# error_msg = "workspace_id is required"
# self.fail_save_genome(invalidate_params, error_msg, mass=True)

# def test_bad_genomes_params_empty_inputs(self):
# self.start_test()
# invalidate_params = {
# 'workspace_id': self.wsID,
# 'inputs': []
# }
# error_msg = "inputs field is required and must be a non-empty list"
# self.fail_save_genome(invalidate_params, error_msg, mass=True)

# def test_bad_genomes_params_invalidate_entry_type(self):
# self.start_test()
# invalidate_params = {
# 'workspace_id': self.wsID,
# 'inputs': [['name', 'data']],
# }
# error_msg = "Entry #1 in inputs field is not a mapping as required"
# self.fail_save_genome(invalidate_params, error_msg, mass=True)

# def test_bad_genomes_params_missing_parameter(self):
# self.start_test()
# invalidate_params = {
# 'workspace_id': self.wsID,
# 'inputs': [{'data': 'data'}],
# }
# error_msg = "Entry #1 in inputs field has invalid params: name parameter is required, but missing"
# self.fail_save_genome(invalidate_params, error_msg, mass=True)

def test_GenomeInterface_check_dna_sequence_in_features(self):
# no feature in genome
Expand Down

0 comments on commit 35030a0

Please sign in to comment.